Commit 13656fb

ltx-video-transformer-setup
1 parent 3776190 commit 13656fb

14 files changed

Lines changed: 2036 additions & 19 deletions

src/maxdiffusion/configs/ltx_video.yml

Lines changed: 15 additions & 0 deletions
```diff
@@ -1,3 +1,18 @@
+# Copyright 2025 Google LLC
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# https://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
 #hardware
 hardware: 'tpu'
 skip_jax_distributed_system: False
```

src/maxdiffusion/generate_ltx_video.py

Lines changed: 18 additions & 11 deletions
```diff
@@ -1,3 +1,20 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+
 from absl import app
 from typing import Sequence
 import jax
@@ -50,17 +67,7 @@ def run(config):
       text_tokens,
       num_tokens,
       features,
-      eval_only=False
-  )
-
-  transformer_state, transformer_state_shardings = setup_initial_state(
-      model=transformer,
-      tx=None,
-      config=config,
-      mesh=mesh,
-      weights_init_fn=weights_init_fn,
-      model_params=None,
-      training=False,
+      eval_only=True
   )
```
src/maxdiffusion/models/__init__.py

Lines changed: 9 additions & 8 deletions
```diff
@@ -25,14 +25,15 @@
 
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
 
-    from .controlnet_flax import FlaxControlNetModel
-    from .unet_2d_condition_flax import FlaxUNet2DConditionModel
-    from .vae_flax import FlaxAutoencoderKL
-    from .lora import *
-    from .flux.transformers.transformer_flux_flax import FluxTransformer2DModel
-    from .ltx_video.transformers.transformer3d import Transformer3DModel
+  from .controlnet_flax import FlaxControlNetModel
+  from .unet_2d_condition_flax import FlaxUNet2DConditionModel
+  from .vae_flax import FlaxAutoencoderKL
+  from .lora import *
+  from .flux.transformers.transformer_flux_flax import FluxTransformer2DModel
+  from .ltx_video.transformers.transformer3d import Transformer3DModel
 
 else:
-    import sys
+  import sys
 
-    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+  sys.modules[__name__] = _LazyModule(
+      __name__, globals()["__file__"], _import_structure, module_spec=__spec__)
```

src/maxdiffusion/models/ltx_video/__init__.py

Whitespace-only changes.
Lines changed: 70 additions & 0 deletions
```python
from enum import Enum, auto
from typing import Optional

import jax
from flax import linen as nn

SKIP_GRADIENT_CHECKPOINT_KEY = "skip"


class GradientCheckpointType(Enum):
  """
  Defines the type of gradient checkpointing to apply.

  NONE - no gradient checkpointing
  FULL - full gradient checkpointing, wherever possible (minimum memory usage)
  MATMUL_WITHOUT_BATCH - gradient checkpointing for every linear/matmul operation,
    except those involving the batch dimension; all attention and projection layers
    are checkpointed, but not the backward pass with respect to the parameters
  """

  NONE = auto()
  FULL = auto()
  MATMUL_WITHOUT_BATCH = auto()

  @classmethod
  def from_str(cls, s: Optional[str] = None) -> "GradientCheckpointType":
    """
    Constructs the gradient checkpoint type from a string.

    Args:
      s (Optional[str], optional): The name of the gradient checkpointing policy. Defaults to None.

    Returns:
      GradientCheckpointType: The policy that corresponds to the string
    """
    if s is None:
      s = "none"
    return GradientCheckpointType[s.upper()]

  def to_jax_policy(self):
    """
    Converts the gradient checkpoint type to a jax policy.
    """
    match self:
      case GradientCheckpointType.NONE:
        return SKIP_GRADIENT_CHECKPOINT_KEY
      case GradientCheckpointType.FULL:
        return None
      case GradientCheckpointType.MATMUL_WITHOUT_BATCH:
        return jax.checkpoint_policies.checkpoint_dots_with_no_batch_dims

  def apply(self, module: nn.Module) -> nn.Module:
    """
    Applies a gradient checkpoint policy to a module;
    if no policy is needed, the module is returned as is.

    Args:
      module (nn.Module): the module to apply the policy to

    Returns:
      nn.Module: the module with the policy applied
    """
    policy = self.to_jax_policy()
    if policy == SKIP_GRADIENT_CHECKPOINT_KEY:
      return module
    return nn.remat(  # pylint: disable=invalid-name
        module,
        prevent_cse=False,
        policy=policy,
    )
```
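The wrapper above only decides which remat policy to hand to `nn.remat`. A minimal usage sketch follows; the `MLPBlock` class, its shapes, and the `"matmul_without_batch"` choice are illustrative assumptions, not part of this commit:

```python
# Hypothetical usage sketch; MLPBlock and all shapes are illustrative, not from this commit.
import jax
import jax.numpy as jnp
from flax import linen as nn


class MLPBlock(nn.Module):  # stand-in for a real transformer block
  features: int = 128

  @nn.compact
  def __call__(self, x):
    h = nn.gelu(nn.Dense(self.features * 4)(x))
    return nn.Dense(self.features)(h)


policy = GradientCheckpointType.from_str("matmul_without_batch")
RematBlock = policy.apply(MLPBlock)  # nn.remat-wrapped class; NONE would return MLPBlock unchanged
block = RematBlock()
x = jnp.ones((2, 16, 128))
params = block.init(jax.random.PRNGKey(0), x)
y = block.apply(params, x)  # recomputation kicks in under jax.grad, trading compute for activation memory
```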
Lines changed: 111 additions & 0 deletions
```python
from typing import Union, Iterable, Tuple, Optional, Callable

import numpy as np
import jax
import jax.numpy as jnp
from flax import linen as nn
from flax.linen.initializers import lecun_normal


Shape = Tuple[int, ...]
Initializer = Callable[[jax.random.PRNGKey, Shape, jax.numpy.dtype], jax.Array]
InitializerAxis = Union[int, Shape]


def _normalize_axes(axes: Iterable[int], ndim: int) -> Tuple[int]:
  # A tuple by convention. len(axes_tuple) then also gives the rank efficiently.
  return tuple(ax if ax >= 0 else ndim + ax for ax in axes)


def _canonicalize_tuple(x):
  if isinstance(x, Iterable):
    return tuple(x)
  else:
    return (x,)


NdInitializer = Callable[[jax.random.PRNGKey, Shape, jnp.dtype, InitializerAxis, InitializerAxis], jax.Array]
KernelInitializer = Callable[[jax.random.PRNGKey, Shape, jnp.dtype, InitializerAxis, InitializerAxis], jax.Array]


class DenseGeneral(nn.Module):
  """A linear transformation with flexible axes.

  Adapted from https://github.com/AI-Hypercomputer/maxtext/blob/4bf3beaa5e721745427bfed09938427e369c2aaf/MaxText/layers/linears.py#L86

  Attributes:
    features: tuple with numbers of output features.
    axis: tuple with axes to apply the transformation on.
    weight_dtype: the dtype of the weights (default: float32).
    dtype: the dtype of the computation (default: float32).
    kernel_init: initializer function for the weight matrix.
    kernel_axes: logical axis names used to partition the kernel.
    use_bias: whether to add a bias in the linear transformation.
    matmul_precision: precision passed to jax.lax.dot_general.
  """

  features: Union[Iterable[int], int]
  axis: Union[Iterable[int], int] = -1
  weight_dtype: jnp.dtype = jnp.float32
  dtype: jnp.dtype = jnp.float32
  kernel_init: KernelInitializer = lecun_normal()
  kernel_axes: Tuple[Optional[str], ...] = ()
  use_bias: bool = False
  matmul_precision: str = "default"

  bias_init: Initializer = jax.nn.initializers.constant(0.0)

  @nn.compact
  def __call__(self, inputs: jax.Array) -> jax.Array:
    """Applies a linear transformation to the inputs along multiple dimensions.

    Args:
      inputs: The nd-array to be transformed.

    Returns:
      The transformed input.
    """

    def compute_dot_general(inputs, kernel, axis, contract_ind):
      """Computes a dot_general operation with the configured matmul precision."""
      dot_general = jax.lax.dot_general
      matmul_precision = jax.lax.Precision(self.matmul_precision)
      return dot_general(inputs, kernel, ((axis, contract_ind), ((), ())), precision=matmul_precision)

    features = _canonicalize_tuple(self.features)
    axis = _canonicalize_tuple(self.axis)

    inputs = jnp.asarray(inputs, self.dtype)
    axis = _normalize_axes(axis, inputs.ndim)

    kernel_shape = tuple(inputs.shape[ax] for ax in axis) + features
    kernel_in_axis = np.arange(len(axis))  # retained from the maxtext version; unused here
    kernel_out_axis = np.arange(len(axis), len(axis) + len(features))  # retained from the maxtext version; unused here
    kernel = self.param(
        "kernel",
        nn.with_logical_partitioning(self.kernel_init, self.kernel_axes),
        kernel_shape,
        self.weight_dtype,
    )
    kernel = jnp.asarray(kernel, self.dtype)

    contract_ind = tuple(range(0, len(axis)))
    output = compute_dot_general(inputs, kernel, axis, contract_ind)

    if self.use_bias:
      bias_axes, bias_shape = (
          self.kernel_axes[-len(features):],
          kernel_shape[-len(features):],
      )
      bias = self.param(
          "bias",
          nn.with_logical_partitioning(self.bias_init, bias_axes),
          bias_shape,
          self.weight_dtype,
      )
      bias = jnp.asarray(bias, self.dtype)

      output += bias
    return output
```
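Since `DenseGeneral` can contract several input axes into one or more output feature axes in a single call, here is a sketch of the typical attention-output projection. All shapes and logical axis names below are illustrative assumptions, not values from this commit:

```python
# Hypothetical usage sketch; shapes and logical axis names are illustrative.
import jax
import jax.numpy as jnp

layer = DenseGeneral(
    features=1024,                         # one output feature axis
    axis=(-2, -1),                         # contract heads and head_dim together
    kernel_axes=("heads", "kv", "embed"),  # one logical name per kernel axis
    dtype=jnp.bfloat16,                    # compute dtype
    weight_dtype=jnp.float32,              # storage dtype of the kernel
)
x = jnp.ones((2, 256, 16, 64))                 # (batch, seq, heads, head_dim)
params = layer.init(jax.random.PRNGKey(0), x)  # kernel shape: (16, 64, 1024)
y = layer.apply(params, x)                     # -> (2, 256, 1024)
```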
Lines changed: 105 additions & 0 deletions
```python
from dataclasses import field
from typing import Any, Callable, Dict, List, Tuple, Optional

import jax
from flax import linen as nn
from flax.linen import partitioning


class RepeatableCarryBlock(nn.Module):
  """
  Integrates an inner module into the jax carry format:
  the module takes the role of a building block and returns
  both the output and the carry across all blocks.
  """

  module: Callable[[Any], nn.Module]
  module_init_args: List[Any]
  module_init_kwargs: Dict[str, Any]

  @nn.compact
  def __call__(self, *args) -> Tuple[jax.Array, None]:
    """
    jax carry-op format of the block.
    Assumes the input contains an input tensor to the block along with kwargs
    that might be sent to the block; kwargs are assumed to be static, while the
    input changes between cycles.

    Returns:
      Tuple[jax.Array, None]: output tensor from the block
    """
    mod = self.module(*self.module_init_args, **self.module_init_kwargs)
    output = mod(*args)
    return output, None


class RepeatableLayer(nn.Module):
  """
  RepeatableLayer assumes a role similar to torch.nn.ModuleList,
  with the condition that each block has the same graph and only the parameters differ.

  Compilation of RepeatableLayer happens only once, in contrast to compiling a repeated graph.
  """

  module: Callable[[Any], nn.Module]
  """A callable that constructs a single block."""

  num_layers: int
  """The number of blocks to build."""

  module_init_args: List[Any] = field(default_factory=list)
  """args passed to the RepeatableLayer.module callable to construct a block."""

  module_init_kwargs: Dict[str, Any] = field(default_factory=dict)
  """kwargs passed to the RepeatableLayer.module callable to construct a block."""

  pspec_name: Optional[str] = None
  """Partition spec metadata."""

  param_scan_axis: int = 0
  """
  The axis along which the layers' parameters are aggregated,
  e.g. if a kernel is shaped (8, 16), N layers will be (N, 8, 16)
  if param_scan_axis=0, and (8, N, 16) if param_scan_axis=1.
  """

  @nn.compact
  def __call__(self, *args):
    scan_kwargs = {}
    if self.pspec_name is not None:
      scan_kwargs["metadata_params"] = {nn.PARTITION_NAME: self.pspec_name}

    initializing = self.is_mutable_collection("params")
    params_spec = self.param_scan_axis if initializing else partitioning.ScanIn(self.param_scan_axis)
    scan_fn = nn.scan(
        RepeatableCarryBlock,
        variable_axes={
            "params": params_spec,
            "cache": 0,
            "intermediates": 0,
            "aqt": 0,
            "_overwrite_with_gradient": 0,
        },  # Separate params per layer
        split_rngs={"params": True},
        in_axes=(nn.broadcast,) * (len(args) - 1),
        length=self.num_layers,
        **scan_kwargs,
    )
    wrapped_function = scan_fn(self.module, self.module_init_args, self.module_init_kwargs)
    x, _ = wrapped_function(*args)
    return x
```
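To make the scan pattern concrete, a sketch that stacks four identical residual blocks; `MLPBlock` and every size here are assumptions for illustration, not part of this commit. With `param_scan_axis=0`, each kernel inside the block gains a leading `num_layers` dimension, and the block body is compiled once rather than `num_layers` times:

```python
# Hypothetical usage sketch; MLPBlock and all sizes are illustrative, not from this commit.
import jax
import jax.numpy as jnp
from flax import linen as nn


class MLPBlock(nn.Module):  # stand-in building block; every layer shares this graph
  features: int

  @nn.compact
  def __call__(self, x):
    return x + nn.Dense(self.features)(nn.gelu(nn.Dense(self.features)(x)))


stack = RepeatableLayer(
    module=MLPBlock,
    num_layers=4,
    module_init_kwargs={"features": 128},
    pspec_name="layers",  # name of the stacked parameter axis, used for partitioning metadata
)
x = jnp.ones((2, 16, 128))
params = stack.init(jax.random.PRNGKey(0), x)  # each Dense kernel is stacked to (4, ...)
y = stack.apply(params, x)                     # one compiled block body, scanned 4 times
```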

src/maxdiffusion/models/ltx_video/transformers/__init__.py

Whitespace-only changes.
