
Commit 66401d7: Centralized sharding for LTX2

1 parent: 3ef0fdd

11 files changed: 366 additions & 88 deletions

src/maxdiffusion/configs/ltx2_video.yml

Lines changed: 6 additions & 0 deletions
@@ -64,6 +64,12 @@ logical_axis_rules: [
 ]
 data_sharding: ['data', 'fsdp', 'context', 'tensor']
 
+sharding:
+  transformer: 'default'
+  vae: 'default'
+  text_encoder: 'default'
+  text_connector: 'default'
+
 dcn_data_parallelism: 1 # recommended DCN axis to be auto-sharded
 dcn_fsdp_parallelism: -1
 
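The new `sharding` block selects a named sharding profile per model component. This excerpt does not show how the profiles are consumed; a hypothetical sketch of the wiring, assuming each YAML key is resolved through the `get_sharding_specs` helper imported in attention_ltx2.py below (only the `("default", "ltx2_dit")` call is confirmed by this diff; using it for every component is an illustrative guess):

from maxdiffusion.models.ltx2.logical_sharding_ltx2 import get_sharding_specs

def resolve_component_specs(sharding_cfg):
  # sharding_cfg is the parsed `sharding:` mapping from ltx2_video.yml,
  # e.g. {"transformer": "default", "vae": "default", ...}.
  # NOTE: the "ltx2_dit" model key for every component is a guess.
  return {name: get_sharding_specs(profile, "ltx2_dit") for name, profile in sharding_cfg.items()}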

src/maxdiffusion/models/attention_flax.py

Lines changed: 12 additions & 7 deletions
@@ -15,7 +15,7 @@
 import contextlib
 import functools
 import math
-from typing import Optional, Callable, Tuple
+from typing import Optional, Callable, Tuple, Any
 import flax.linen as nn
 from flax import nnx
 import jax
@@ -607,8 +607,6 @@ def wrap_ulysses_attention(query, key, value):
       orig_q_seq_len=query_seq_len,
       orig_kv_seq_len=key_seq_len,
       heads_per_tile=heads_per_tile,
-      use_base2_exp=use_base2_exp,
-      use_experimental_scheduler=use_experimental_scheduler,
   )
 
   vmapped_splash = jax.vmap(splash_kernel, in_axes=(0, 0, 0))
@@ -1106,9 +1104,16 @@ def __init__(
       dtype: jnp.dtype = jnp.float32,
       weights_dtype: jnp.dtype = jnp.float32,
       precision: Optional[jax.lax.Precision] = None,
+      sharding_specs: Optional[Any] = None,
   ):
     inner_dim = int(dim * mult)
     dim_out = dim_out if dim_out is not None else dim
+
+    net_0_kernel = getattr(sharding_specs, "net_0_kernel", (None, "mlp"))
+    net_0_bias = getattr(sharding_specs, "net_0_bias", ("mlp",))
+    net_2_kernel = getattr(sharding_specs, "net_2_kernel", ("mlp", None))
+    net_2_bias = getattr(sharding_specs, "net_2_bias", (None,))
+
     self.net_0 = nnx.Linear(
         dim,
         inner_dim,
@@ -1117,8 +1122,8 @@ def __init__(
         dtype=dtype,
         param_dtype=weights_dtype,
         precision=precision,
-        kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), (None, "mlp")),
-        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("mlp",)),
+        kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), net_0_kernel),
+        bias_init=nnx.with_partitioning(nnx.initializers.zeros, net_0_bias),
     )
     self.act = get_activation(activation_fn)
     self.net_2 = nnx.Linear(
@@ -1129,8 +1134,8 @@ def __init__(
         dtype=dtype,
         param_dtype=weights_dtype,
         precision=precision,
-        kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("mlp", None)),
-        bias_init=nnx.with_partitioning(nnx.initializers.zeros, (None,)),
+        kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), net_2_kernel),
+        bias_init=nnx.with_partitioning(nnx.initializers.zeros, net_2_bias),
     )
 
   def __call__(self, hidden_states: Array) -> Array:
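The `getattr` lookups above are what make this change backward compatible: when `sharding_specs` is None, or an object that lacks a given field, the third argument supplies the previous hard-coded logical axes. A minimal, self-contained illustration of that fallback (the `SimpleNamespace` stand-in is illustrative, not the real specs type):

from types import SimpleNamespace

# No specs object: getattr on None returns the default, so the old
# hard-coded axes are used and behavior is unchanged.
assert getattr(None, "net_0_kernel", (None, "mlp")) == (None, "mlp")

# A specs object that defines the field overrides the default.
specs = SimpleNamespace(net_0_kernel=("embed", "mlp"))
assert getattr(specs, "net_0_kernel", (None, "mlp")) == ("embed", "mlp")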

src/maxdiffusion/models/embeddings_flax.py

Lines changed: 38 additions & 24 deletions
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
-from typing import Optional
+from typing import Optional, Any
 import flax.linen as nn
 from flax import nnx
 import jax.numpy as jnp
@@ -84,7 +84,12 @@ def __init__(
       dtype: jnp.dtype = jnp.float32,
       weights_dtype: jnp.dtype = jnp.float32,
       precision: jax.lax.Precision = None,
+      sharding_specs: Optional[Any] = None,
   ):
+    linear_1_kernel = getattr(sharding_specs, "emb_linear_1_kernel", ("embed", "mlp"))
+    linear_1_bias = getattr(sharding_specs, "emb_linear_1_bias", ("mlp",))
+    linear_2_kernel = getattr(sharding_specs, "emb_linear_2_kernel", ("mlp", "embed"))
+    linear_2_bias = getattr(sharding_specs, "emb_linear_2_bias", ("embed",))
     self.linear_1 = nnx.Linear(
         rngs=rngs,
         in_features=in_channels,
@@ -95,12 +100,9 @@ def __init__(
         precision=precision,
         kernel_init=nnx.with_partitioning(
             nnx.initializers.xavier_uniform(),
-            (
-                "embed",
-                "mlp",
-            ),
+            linear_1_kernel,
         ),
-        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("mlp",)),
+        bias_init=nnx.with_partitioning(nnx.initializers.zeros, linear_1_bias),
     )
 
     if cond_proj_dim is not None:
@@ -127,12 +129,9 @@ def __init__(
         precision=precision,
         kernel_init=nnx.with_partitioning(
             nnx.initializers.xavier_uniform(),
-            (
-                "mlp",
-                "embed",
-            ),
+            linear_2_kernel,
         ),
-        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
+        bias_init=nnx.with_partitioning(nnx.initializers.zeros, linear_2_bias),
     )
 
     if post_act_fn is None:
@@ -336,7 +335,12 @@ def __init__(
       dtype: jnp.dtype = jnp.float32,
       weights_dtype: jnp.dtype = jnp.float32,
       precision: jax.lax.Precision = None,
+      sharding_specs: Optional[Any] = None,
   ):
+    linear_1_kernel = getattr(sharding_specs, "emb_linear_1_kernel", ("embed", "mlp"))
+    linear_1_bias = getattr(sharding_specs, "emb_linear_1_bias", ("mlp",))
+    linear_2_kernel = getattr(sharding_specs, "emb_linear_2_kernel", ("mlp", "embed"))
+    linear_2_bias = getattr(sharding_specs, "emb_linear_2_bias", ("embed",))
     if out_features is None:
       out_features = hidden_size
 
@@ -350,12 +354,9 @@ def __init__(
         precision=precision,
         kernel_init=nnx.with_partitioning(
             nnx.initializers.xavier_uniform(),
-            (
-                "embed",
-                "mlp",
-            ),
+            linear_1_kernel,
         ),
-        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("mlp",)),
+        bias_init=nnx.with_partitioning(nnx.initializers.zeros, linear_1_bias),
     )
     self.act_1 = get_activation(act_fn)
 
@@ -369,12 +370,9 @@ def __init__(
         precision=precision,
         kernel_init=nnx.with_partitioning(
             nnx.initializers.xavier_uniform(),
-            (
-                "mlp",
-                "embed",
-            ),
+            linear_2_kernel,
         ),
-        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
+        bias_init=nnx.with_partitioning(nnx.initializers.zeros, linear_2_bias),
     )
 
   def __call__(self, caption):
@@ -530,22 +528,38 @@ def __init__(
       use_additional_conditions: bool = False,
      dtype: jnp.dtype = jnp.float32,
       weights_dtype: jnp.dtype = jnp.float32,
+      sharding_specs: Optional[Any] = None,
   ):
     self.outdim = size_emb_dim
     self.use_additional_conditions = use_additional_conditions
 
     self.time_proj = NNXTimesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
     self.timestep_embedder = NNXTimestepEmbedding(
-        rngs=rngs, in_channels=256, time_embed_dim=embedding_dim, dtype=dtype, weights_dtype=weights_dtype
+        rngs=rngs,
+        in_channels=256,
+        time_embed_dim=embedding_dim,
+        dtype=dtype,
+        weights_dtype=weights_dtype,
+        sharding_specs=sharding_specs,
     )
 
     if use_additional_conditions:
       self.additional_condition_proj = NNXTimesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
       self.resolution_embedder = NNXTimestepEmbedding(
-          rngs=rngs, in_channels=256, time_embed_dim=size_emb_dim, dtype=dtype, weights_dtype=weights_dtype
+          rngs=rngs,
+          in_channels=256,
+          time_embed_dim=size_emb_dim,
+          dtype=dtype,
+          weights_dtype=weights_dtype,
+          sharding_specs=sharding_specs,
      )
       self.aspect_ratio_embedder = NNXTimestepEmbedding(
-          rngs=rngs, in_channels=256, time_embed_dim=size_emb_dim, dtype=dtype, weights_dtype=weights_dtype
+          rngs=rngs,
+          in_channels=256,
+          time_embed_dim=size_emb_dim,
+          dtype=dtype,
+          weights_dtype=weights_dtype,
+          sharding_specs=sharding_specs,
      )
 
   def __call__(
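For context, `nnx.with_partitioning` attaches the logical axis names as metadata on the parameter it initializes; the `logical_axis_rules` list in ltx2_video.yml then maps those names onto physical mesh axes. A small sketch of this standard Flax NNX behavior (not code from this commit):

from flax import nnx

kernel_init = nnx.with_partitioning(nnx.initializers.xavier_uniform(), ("embed", "mlp"))
layer = nnx.Linear(256, 1024, kernel_init=kernel_init, rngs=nnx.Rngs(0))

# The logical axis names travel with the parameter as sharding metadata:
print(layer.kernel.sharding)  # ('embed', 'mlp')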

src/maxdiffusion/models/ltx2/attention_ltx2.py

Lines changed: 11 additions & 21 deletions
@@ -21,6 +21,7 @@
 from ... import common_types
 from ..attention_flax import NNXAttentionOp
 from maxdiffusion.tpu_utils import get_tpu_type, TpuType
+from .logical_sharding_ltx2 import get_sharding_specs, LTX2DiTShardingSpecs
 
 Array = common_types.Array
 Mesh = common_types.Mesh
@@ -350,43 +351,32 @@ def __init__(
       rope_type: str = "interleaved",
       flash_block_sizes: BlockSizes = None,
       flash_min_seq_length: int = 4096,
-      qkv_sharding_spec: Optional[tuple] = None,
-      out_sharding_spec: Optional[tuple] = None,
-      out_bias_sharding_spec: Optional[tuple] = None,
+      sharding_specs: Optional[LTX2DiTShardingSpecs] = None,
   ):
     self.heads = heads
     self.rope_type = rope_type
     self.dim_head = dim_head
     self.inner_dim = dim_head * heads
     self.dropout_rate = dropout
 
-    # Auto-detect hardware for sharding specs if not overridden
-    tpu_type = get_tpu_type()
-    is_ironwood = tpu_type == TpuType.TPU_7X
-
-    # Hardware-aware sharding: Ironwood (v7x) uses 1D sharding along the heads dimension (leaving the embedding dimension replicated)
-    # to minimize cross-device communication, while other hardware defaults to 2D sharding along both heads and embed dimensions.
-    # This has currently only been tested on Trillium (v6e) and Ironwood (v7x).
-    if qkv_sharding_spec is None:
-      qkv_sharding_spec = (None, "heads") if is_ironwood else ("embed", "heads")
-    if out_sharding_spec is None:
-      out_sharding_spec = ("heads", None) if is_ironwood else ("heads", "embed")
-    if out_bias_sharding_spec is None:
-      out_bias_sharding_spec = (None,) if is_ironwood else ("embed",)
+    if sharding_specs is None:
+      specs = get_sharding_specs("default", "ltx2_dit")
+    else:
+      specs = sharding_specs
 
     # 1. Define Partitioned Initializers (Logical Axes)
     # Q, K, V kernels: [in_features (embed), out_features (heads)]
-    qkv_kernel_init = nnx.with_partitioning(nnx.initializers.lecun_normal(), qkv_sharding_spec)
+    qkv_kernel_init = nnx.with_partitioning(nnx.initializers.lecun_normal(), specs.qkv_kernel)
     # Q, K, V biases: [out_features (heads)]
-    qkv_bias_init = nnx.with_partitioning(nnx.initializers.zeros_init(), ("heads",))
+    qkv_bias_init = nnx.with_partitioning(nnx.initializers.zeros_init(), specs.qkv_bias)
 
     # Out kernel: [in_features (heads), out_features (embed)]
-    out_kernel_init = nnx.with_partitioning(nnx.initializers.lecun_normal(), out_sharding_spec)
+    out_kernel_init = nnx.with_partitioning(nnx.initializers.lecun_normal(), specs.out_kernel)
     # Out bias: [out_features (embed)]
-    out_bias_init = nnx.with_partitioning(nnx.initializers.zeros_init(), out_bias_sharding_spec)
+    out_bias_init = nnx.with_partitioning(nnx.initializers.zeros_init(), specs.out_bias)
 
     # Norm scales
-    norm_scale_init = nnx.with_partitioning(nnx.initializers.ones_init(), ("norm",))
+    norm_scale_init = nnx.with_partitioning(nnx.initializers.ones_init(), specs.norm_scale)
 
     # 2. Projections
     self.to_q = nnx.Linear(
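The new `logical_sharding_ltx2` module itself is not included in this excerpt. A minimal sketch of the interface implied by its call sites: the field names come from the `specs.*` accesses above, the `"default"` axes from the hard-coded values this hunk deletes, and the 1D variant from the removed Ironwood branch; the profile registry and the `"ironwood"` profile name are guesses.

from dataclasses import dataclass
from typing import Optional, Tuple

Axes = Tuple[Optional[str], ...]

@dataclass(frozen=True)
class LTX2DiTShardingSpecs:
  qkv_kernel: Axes = ("embed", "heads")  # old non-Ironwood default
  qkv_bias: Axes = ("heads",)
  out_kernel: Axes = ("heads", "embed")
  out_bias: Axes = ("embed",)
  norm_scale: Axes = ("norm",)

_PROFILES = {
    ("default", "ltx2_dit"): LTX2DiTShardingSpecs(),
    # 1D sharding that the deleted is_ironwood branch computed inline:
    ("ironwood", "ltx2_dit"): LTX2DiTShardingSpecs(
        qkv_kernel=(None, "heads"), out_kernel=("heads", None), out_bias=(None,)
    ),
}

def get_sharding_specs(profile: str, model: str) -> LTX2DiTShardingSpecs:
  return _PROFILES[(profile, model)]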

src/maxdiffusion/models/ltx2/autoencoder_kl_ltx2.py

Lines changed: 11 additions & 1 deletion
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Tuple, Union, Optional, Sequence
+from typing import Tuple, Union, Optional, Sequence, Any
 
 import jax
 import jax.numpy as jnp
@@ -584,6 +584,7 @@ def __init__(
       dtype: jnp.dtype = jnp.float32,
       weights_dtype: jnp.dtype = jnp.float32,
       precision: jax.lax.Precision = None,
+      sharding_specs: Optional[Any] = None,
   ):
     if timestep_conditioning:
       self.time_embedder = nnx.data(
@@ -594,6 +595,7 @@ def __init__(
              use_additional_conditions=False,
              dtype=dtype,
              weights_dtype=weights_dtype,
+             sharding_specs=sharding_specs,
          )
       )
     else:
@@ -674,6 +676,7 @@ def __init__(
       dtype: jnp.dtype = jnp.float32,
       weights_dtype: jnp.dtype = jnp.float32,
       precision: jax.lax.Precision = None,
+      sharding_specs: Optional[Any] = None,
   ):
     out_channels = out_channels or in_channels
 
@@ -687,6 +690,7 @@ def __init__(
              use_additional_conditions=False,
              dtype=dtype,
              weights_dtype=weights_dtype,
+             sharding_specs=sharding_specs,
          )
       )
 
@@ -960,6 +964,7 @@ def __init__(
       dtype: jnp.dtype = jnp.float32,
       weights_dtype: jnp.dtype = jnp.float32,
       precision: jax.lax.Precision = None,
+      sharding_specs: Optional[Any] = None,
   ):
     self.patch_size = patch_size
     self.patch_size_t = patch_size_t
@@ -999,6 +1004,7 @@ def __init__(
         dtype=dtype,
         weights_dtype=weights_dtype,
         precision=precision,
+        sharding_specs=sharding_specs,
     )
 
     # up blocks
@@ -1026,6 +1032,7 @@ def __init__(
            dtype=dtype,
            weights_dtype=weights_dtype,
            precision=precision,
+           sharding_specs=sharding_specs,
        )
     )
 
@@ -1059,6 +1066,7 @@ def __init__(
              use_additional_conditions=False,
              dtype=dtype,
              weights_dtype=weights_dtype,
+             sharding_specs=sharding_specs,
          )
       )
     else:
@@ -1155,6 +1163,7 @@ def __init__(
       dtype: jnp.dtype = jnp.float32,
       weights_dtype: jnp.dtype = jnp.float32,
       precision: jax.lax.Precision = None,
+      sharding_specs: Optional[Any] = None,
   ):
     self.encoder = LTX2VideoEncoder3d(
         in_channels=in_channels,
@@ -1196,6 +1205,7 @@ def __init__(
         dtype=dtype,
         weights_dtype=weights_dtype,
         precision=precision,
+        sharding_specs=sharding_specs,
     )
 
     self.scaling_factor = scaling_factor
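Every hunk in this file follows the same plumbing pattern: a container class accepts an optional `sharding_specs` and forwards it unchanged, so only the leaf embedding layers ever interpret it. The pattern in miniature (class names illustrative, not from the diff):

from typing import Any, Optional

class LeafEmbedding:
  def __init__(self, sharding_specs: Optional[Any] = None):
    # Leaves read the specs with a backward-compatible default, as the
    # getattr calls in embeddings_flax.py above do.
    self.kernel_axes = getattr(sharding_specs, "emb_linear_1_kernel", ("embed", "mlp"))

class Container:
  def __init__(self, sharding_specs: Optional[Any] = None):
    # Containers never inspect the specs; they only pass them down.
    self.embedder = LeafEmbedding(sharding_specs=sharding_specs)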
