Commit c4d072b

feat(ltx2): implement centralized and configuration-driven sharding strategy
1 parent 3ef0fdd commit c4d072b

7 files changed

Lines changed: 307 additions & 84 deletions

src/maxdiffusion/models/attention_flax.py

Lines changed: 12 additions & 7 deletions
@@ -15,7 +15,7 @@
 import contextlib
 import functools
 import math
-from typing import Optional, Callable, Tuple
+from typing import Optional, Callable, Tuple, Any
 import flax.linen as nn
 from flax import nnx
 import jax
@@ -607,8 +607,6 @@ def wrap_ulysses_attention(query, key, value):
         orig_q_seq_len=query_seq_len,
         orig_kv_seq_len=key_seq_len,
         heads_per_tile=heads_per_tile,
-        use_base2_exp=use_base2_exp,
-        use_experimental_scheduler=use_experimental_scheduler,
     )

     vmapped_splash = jax.vmap(splash_kernel, in_axes=(0, 0, 0))
@@ -1106,9 +1104,16 @@ def __init__(
       dtype: jnp.dtype = jnp.float32,
       weights_dtype: jnp.dtype = jnp.float32,
       precision: Optional[jax.lax.Precision] = None,
+      sharding_specs: Optional[Any] = None,
   ):
     inner_dim = int(dim * mult)
     dim_out = dim_out if dim_out is not None else dim
+
+    net_0_kernel = getattr(sharding_specs, "net_0_kernel", (None, "mlp"))
+    net_0_bias = getattr(sharding_specs, "net_0_bias", ("mlp",))
+    net_2_kernel = getattr(sharding_specs, "net_2_kernel", ("mlp", None))
+    net_2_bias = getattr(sharding_specs, "net_2_bias", (None,))
+
     self.net_0 = nnx.Linear(
         dim,
         inner_dim,
@@ -1117,8 +1122,8 @@ def __init__(
         dtype=dtype,
         param_dtype=weights_dtype,
         precision=precision,
-        kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), (None, "mlp")),
-        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("mlp",)),
+        kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), net_0_kernel),
+        bias_init=nnx.with_partitioning(nnx.initializers.zeros, net_0_bias),
     )
     self.act = get_activation(activation_fn)
     self.net_2 = nnx.Linear(
@@ -1129,8 +1134,8 @@ def __init__(
         dtype=dtype,
         param_dtype=weights_dtype,
         precision=precision,
-        kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("mlp", None)),
-        bias_init=nnx.with_partitioning(nnx.initializers.zeros, (None,)),
+        kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), net_2_kernel),
+        bias_init=nnx.with_partitioning(nnx.initializers.zeros, net_2_bias),
     )

   def __call__(self, hidden_states: Array) -> Array:

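Note on the feed-forward change above: the new sharding_specs argument is duck-typed (Optional[Any]), so any object exposing the net_* attributes works, and None silently keeps the previous hard-coded axes. Below is a minimal sketch of that getattr fallback pattern; the _FFNSpecs dataclass is an illustrative stand-in, not the real LTX2DiTShardingSpecs.

from dataclasses import dataclass
from typing import Any, Optional


@dataclass
class _FFNSpecs:  # illustrative stand-in for LTX2DiTShardingSpecs
  net_0_kernel: tuple = (None, "mlp")
  net_2_kernel: tuple = ("mlp", None)


def resolve_ffn_specs(sharding_specs: Optional[Any]):
  # getattr falls back to the original hard-coded defaults when sharding_specs
  # is None or lacks the attribute, so call sites that never pass specs keep
  # their previous partitioning.
  net_0_kernel = getattr(sharding_specs, "net_0_kernel", (None, "mlp"))
  net_2_kernel = getattr(sharding_specs, "net_2_kernel", ("mlp", None))
  return net_0_kernel, net_2_kernel


print(resolve_ffn_specs(None))                                      # ((None, 'mlp'), ('mlp', None))
print(resolve_ffn_specs(_FFNSpecs(net_0_kernel=("fsdp", "mlp"))))   # (('fsdp', 'mlp'), ('mlp', None))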
src/maxdiffusion/models/embeddings_flax.py

Lines changed: 41 additions & 24 deletions
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
-from typing import Optional
+from typing import Optional, Any
 import flax.linen as nn
 from flax import nnx
 import jax.numpy as jnp
@@ -60,6 +60,9 @@ def get_sinusoidal_embeddings(
   return signal


+
+
+
 class NNXTimestepEmbedding(nnx.Module):
   r"""
   Time step Embedding Module. Learns embeddings for input time steps.
@@ -84,7 +87,12 @@ def __init__(
       dtype: jnp.dtype = jnp.float32,
       weights_dtype: jnp.dtype = jnp.float32,
       precision: jax.lax.Precision = None,
+      sharding_specs: Optional[Any] = None,
   ):
+    linear_1_kernel = getattr(sharding_specs, "emb_linear_1_kernel", ("embed", "mlp"))
+    linear_1_bias = getattr(sharding_specs, "emb_linear_1_bias", ("mlp",))
+    linear_2_kernel = getattr(sharding_specs, "emb_linear_2_kernel", ("mlp", "embed"))
+    linear_2_bias = getattr(sharding_specs, "emb_linear_2_bias", ("embed",))
     self.linear_1 = nnx.Linear(
         rngs=rngs,
         in_features=in_channels,
@@ -95,12 +103,9 @@ def __init__(
         precision=precision,
         kernel_init=nnx.with_partitioning(
             nnx.initializers.xavier_uniform(),
-            (
-                "embed",
-                "mlp",
-            ),
+            linear_1_kernel,
         ),
-        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("mlp",)),
+        bias_init=nnx.with_partitioning(nnx.initializers.zeros, linear_1_bias),
     )

     if cond_proj_dim is not None:
@@ -127,12 +132,9 @@ def __init__(
         precision=precision,
         kernel_init=nnx.with_partitioning(
             nnx.initializers.xavier_uniform(),
-            (
-                "mlp",
-                "embed",
-            ),
+            linear_2_kernel,
         ),
-        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
+        bias_init=nnx.with_partitioning(nnx.initializers.zeros, linear_2_bias),
     )

     if post_act_fn is None:
@@ -336,7 +338,12 @@ def __init__(
       dtype: jnp.dtype = jnp.float32,
       weights_dtype: jnp.dtype = jnp.float32,
       precision: jax.lax.Precision = None,
+      sharding_specs: Optional[Any] = None,
   ):
+    linear_1_kernel = getattr(sharding_specs, "emb_linear_1_kernel", ("embed", "mlp"))
+    linear_1_bias = getattr(sharding_specs, "emb_linear_1_bias", ("mlp",))
+    linear_2_kernel = getattr(sharding_specs, "emb_linear_2_kernel", ("mlp", "embed"))
+    linear_2_bias = getattr(sharding_specs, "emb_linear_2_bias", ("embed",))
     if out_features is None:
       out_features = hidden_size

@@ -350,12 +357,9 @@ def __init__(
         precision=precision,
         kernel_init=nnx.with_partitioning(
             nnx.initializers.xavier_uniform(),
-            (
-                "embed",
-                "mlp",
-            ),
+            linear_1_kernel,
         ),
-        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("mlp",)),
+        bias_init=nnx.with_partitioning(nnx.initializers.zeros, linear_1_bias),
     )
     self.act_1 = get_activation(act_fn)

@@ -369,12 +373,9 @@ def __init__(
         precision=precision,
         kernel_init=nnx.with_partitioning(
             nnx.initializers.xavier_uniform(),
-            (
-                "mlp",
-                "embed",
-            ),
+            linear_2_kernel,
         ),
-        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
+        bias_init=nnx.with_partitioning(nnx.initializers.zeros, linear_2_bias),
     )

   def __call__(self, caption):
@@ -530,22 +531,38 @@ def __init__(
       use_additional_conditions: bool = False,
       dtype: jnp.dtype = jnp.float32,
       weights_dtype: jnp.dtype = jnp.float32,
+      sharding_specs: Optional[Any] = None,
   ):
     self.outdim = size_emb_dim
     self.use_additional_conditions = use_additional_conditions

     self.time_proj = NNXTimesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
     self.timestep_embedder = NNXTimestepEmbedding(
-        rngs=rngs, in_channels=256, time_embed_dim=embedding_dim, dtype=dtype, weights_dtype=weights_dtype
+        rngs=rngs,
+        in_channels=256,
+        time_embed_dim=embedding_dim,
+        dtype=dtype,
+        weights_dtype=weights_dtype,
+        sharding_specs=sharding_specs,
     )

     if use_additional_conditions:
       self.additional_condition_proj = NNXTimesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
       self.resolution_embedder = NNXTimestepEmbedding(
-          rngs=rngs, in_channels=256, time_embed_dim=size_emb_dim, dtype=dtype, weights_dtype=weights_dtype
+          rngs=rngs,
+          in_channels=256,
+          time_embed_dim=size_emb_dim,
+          dtype=dtype,
+          weights_dtype=weights_dtype,
+          sharding_specs=sharding_specs,
       )
       self.aspect_ratio_embedder = NNXTimestepEmbedding(
-          rngs=rngs, in_channels=256, time_embed_dim=size_emb_dim, dtype=dtype, weights_dtype=weights_dtype
+          rngs=rngs,
+          in_channels=256,
+          time_embed_dim=size_emb_dim,
+          dtype=dtype,
+          weights_dtype=weights_dtype,
+          sharding_specs=sharding_specs,
      )

   def __call__(

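The embedding modules read their specs the same way. Because NNXTimestepEmbedding and NNXPixArtAlphaTextProjection share the emb_linear_* attribute names (see the new specs file below), a single spec object can drive both. A rough sketch, not the repository's code, using an illustrative stand-in dataclass:

from dataclasses import dataclass


@dataclass
class _EmbSpecs:  # illustrative stand-in for LTX2DiTShardingSpecs
  emb_linear_1_kernel: tuple = ("embed", "mlp")
  emb_linear_1_bias: tuple = ("mlp",)
  emb_linear_2_kernel: tuple = ("mlp", "embed")
  emb_linear_2_bias: tuple = ("embed",)


specs = _EmbSpecs(emb_linear_1_kernel=(None, "mlp"), emb_linear_2_kernel=("mlp", None))

# Both constructors look the fields up the same way, and None keeps the legacy defaults:
assert getattr(specs, "emb_linear_1_kernel", ("embed", "mlp")) == (None, "mlp")
assert getattr(None, "emb_linear_1_kernel", ("embed", "mlp")) == ("embed", "mlp")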
src/maxdiffusion/models/ltx2/attention_ltx2.py

Lines changed: 11 additions & 21 deletions
@@ -21,6 +21,7 @@
 from ... import common_types
 from ..attention_flax import NNXAttentionOp
 from maxdiffusion.tpu_utils import get_tpu_type, TpuType
+from .logical_sharding_ltx2 import get_sharding_specs, LTX2DiTShardingSpecs

 Array = common_types.Array
 Mesh = common_types.Mesh
@@ -350,43 +351,32 @@ def __init__(
       rope_type: str = "interleaved",
       flash_block_sizes: BlockSizes = None,
       flash_min_seq_length: int = 4096,
-      qkv_sharding_spec: Optional[tuple] = None,
-      out_sharding_spec: Optional[tuple] = None,
-      out_bias_sharding_spec: Optional[tuple] = None,
+      sharding_specs: Optional[LTX2DiTShardingSpecs] = None,
   ):
     self.heads = heads
     self.rope_type = rope_type
     self.dim_head = dim_head
     self.inner_dim = dim_head * heads
     self.dropout_rate = dropout

-    # Auto-detect hardware for sharding specs if not overridden
-    tpu_type = get_tpu_type()
-    is_ironwood = tpu_type == TpuType.TPU_7X
-
-    # Hardware-aware sharding: Ironwood (v7x) uses 1D sharding along the heads dimension (leaving the embedding dimension replicated)
-    # to minimize cross-device communication, while other hardware defaults to 2D sharding along both heads and embed dimensions.
-    # This has currently only been tested on Trillium (v6e) and Ironwood (v7x).
-    if qkv_sharding_spec is None:
-      qkv_sharding_spec = (None, "heads") if is_ironwood else ("embed", "heads")
-    if out_sharding_spec is None:
-      out_sharding_spec = ("heads", None) if is_ironwood else ("heads", "embed")
-    if out_bias_sharding_spec is None:
-      out_bias_sharding_spec = (None,) if is_ironwood else ("embed",)
+    if sharding_specs is None:
+      specs = get_sharding_specs("default", "ltx2_dit")
+    else:
+      specs = sharding_specs

     # 1. Define Partitioned Initializers (Logical Axes)
     # Q, K, V kernels: [in_features (embed), out_features (heads)]
-    qkv_kernel_init = nnx.with_partitioning(nnx.initializers.lecun_normal(), qkv_sharding_spec)
+    qkv_kernel_init = nnx.with_partitioning(nnx.initializers.lecun_normal(), specs.qkv_kernel)
     # Q, K, V biases: [out_features (heads)]
-    qkv_bias_init = nnx.with_partitioning(nnx.initializers.zeros_init(), ("heads",))
+    qkv_bias_init = nnx.with_partitioning(nnx.initializers.zeros_init(), specs.qkv_bias)

     # Out kernel: [in_features (heads), out_features (embed)]
-    out_kernel_init = nnx.with_partitioning(nnx.initializers.lecun_normal(), out_sharding_spec)
+    out_kernel_init = nnx.with_partitioning(nnx.initializers.lecun_normal(), specs.out_kernel)
     # Out bias: [out_features (embed)]
-    out_bias_init = nnx.with_partitioning(nnx.initializers.zeros_init(), out_bias_sharding_spec)
+    out_bias_init = nnx.with_partitioning(nnx.initializers.zeros_init(), specs.out_bias)

     # Norm scales
-    norm_scale_init = nnx.with_partitioning(nnx.initializers.ones_init(), ("norm",))
+    norm_scale_init = nnx.with_partitioning(nnx.initializers.ones_init(), specs.norm_scale)

     # 2. Projections
     self.to_q = nnx.Linear(
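With this change the attention constructor no longer inspects the TPU type itself; it defers to the "default" strategy of the new registry. A hedged sketch of the resolution path follows (the import path assumes the new module sits next to attention_ltx2.py, as the relative import above implies; the surrounding module construction is omitted):

from typing import Optional

from flax import nnx

from maxdiffusion.models.ltx2.logical_sharding_ltx2 import LTX2DiTShardingSpecs, get_sharding_specs


def resolve_attention_inits(sharding_specs: Optional[LTX2DiTShardingSpecs] = None):
  # Mirrors the constructor above: an explicit spec object wins, otherwise the
  # "default" strategy auto-detects the TPU generation inside get_sharding_specs.
  specs = get_sharding_specs("default", "ltx2_dit") if sharding_specs is None else sharding_specs
  qkv_kernel_init = nnx.with_partitioning(nnx.initializers.lecun_normal(), specs.qkv_kernel)
  out_kernel_init = nnx.with_partitioning(nnx.initializers.lecun_normal(), specs.out_kernel)
  return qkv_kernel_init, out_kernel_init


# Forcing the Ironwood layout regardless of the hardware the process runs on:
ironwood_inits = resolve_attention_inits(get_sharding_specs("ironwood", "ltx2_dit"))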
src/maxdiffusion/models/ltx2/logical_sharding_ltx2.py

Lines changed: 116 additions & 0 deletions (new file)
"""
Copyright 2026 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from dataclasses import dataclass
from typing import Any, Optional
from maxdiffusion.tpu_utils import get_tpu_type, TpuType


# --- Discrete Specs ---
@dataclass
class LTX2DiTShardingSpecs:
  """Sharding specs for the LTX2 Diffusion Transformer."""

  # --- Attention Layers (LTX2Attention) ---
  qkv_kernel: tuple
  out_kernel: tuple
  out_bias: tuple
  qkv_bias: tuple = ("heads",)

  # --- Feed Forward Network (NNXSimpleFeedForward) ---
  net_0_kernel: tuple = (None, "mlp")
  net_0_bias: tuple = ("mlp",)
  net_2_kernel: tuple = ("mlp", None)
  net_2_bias: tuple = (None,)

  # --- Input/Output Projections and Tables ---
  embed_kernel: tuple = (None, "embed")
  embed_bias: tuple = ("embed",)
  out_embed_kernel: tuple = ("embed", None)
  out_embed_bias: tuple = (None,)

  # --- Shared Embeddings (NNXTimestepEmbedding, NNXPixArtAlphaTextProjection) ---
  emb_linear_1_kernel: tuple = ("embed", "mlp")
  emb_linear_1_bias: tuple = ("mlp",)
  emb_linear_2_kernel: tuple = ("mlp", "embed")
  emb_linear_2_bias: tuple = ("embed",)

  # --- Normalization ---
  norm_scale: tuple = ("norm",)


@dataclass
class TextEncoderShardingSpecs:
  """Specs for the Text Encoder execution."""

  use_batched_text_encoder: bool = False
  text_encoder_kernel: Optional[tuple] = None


@dataclass
class VAEShardingSpecs:
  """Sharding specs for the VAE."""

  vae_conv_kernel: Optional[tuple] = None
  force_replication: bool = True


# --- Unified Registry for LTX2 ---
STRATEGIES = {
    "ironwood": {
        "ltx2_dit": LTX2DiTShardingSpecs(
            qkv_kernel=(None, "heads"),
            out_kernel=("heads", None),
            out_bias=(None,),
        ),
        "text_encoder": TextEncoderShardingSpecs(
            use_batched_text_encoder=True,
            text_encoder_kernel=(None, "embed"),
        ),
        "vae": VAEShardingSpecs(vae_conv_kernel=("batch", None, None, None)),
    },
    "trillium": {
        "ltx2_dit": LTX2DiTShardingSpecs(
            qkv_kernel=("embed", "heads"),
            out_kernel=("heads", "embed"),
            out_bias=("embed",),
        ),
        "text_encoder": TextEncoderShardingSpecs(
            use_batched_text_encoder=False,
            text_encoder_kernel=(None, "embed"),
        ),
        "vae": VAEShardingSpecs(vae_conv_kernel=(None, None, None, None)),
    },
}


def get_sharding_specs(strategy_name: str, component_name: str) -> Any:
  """Unified factory to get specs for any component.

  If strategy_name is 'default', it auto-detects the hardware.
  """
  if strategy_name == "default":
    tpu_type = get_tpu_type()
    if tpu_type == TpuType.TPU_7X:
      strategy_name = "ironwood"
    else:
      strategy_name = "trillium"

  hardware_profile = STRATEGIES.get(strategy_name, STRATEGIES["trillium"])
  specs = hardware_profile.get(component_name)
  if specs is None:
    raise ValueError(f"Component {component_name} not found in strategy {strategy_name}")
  return specs
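A minimal usage sketch of the factory above; reading the strategy name from a config value is an assumption on my part, only the lookups themselves come from this file:

from maxdiffusion.models.ltx2.logical_sharding_ltx2 import get_sharding_specs

strategy = "default"  # e.g. taken from config; "default" resolves via get_tpu_type()
dit_specs = get_sharding_specs(strategy, "ltx2_dit")
text_specs = get_sharding_specs(strategy, "text_encoder")
vae_specs = get_sharding_specs(strategy, "vae")

print(dit_specs.qkv_kernel)             # (None, "heads") on v7x, ("embed", "heads") otherwise
print(text_specs.use_batched_text_encoder)
print(vae_specs.force_replication)      # True in both built-in strategies

# An unknown strategy name silently falls back to the "trillium" profile,
# while an unknown component name raises ValueError.
assert get_sharding_specs("unknown", "ltx2_dit").qkv_kernel == ("embed", "heads")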
