
Commit 5f32283
Fix LTX2 sharding: NNXSimpleFeedForward kernel axes and LTX2Attention bias axes
1. NNXSimpleFeedForward (used by LTX2 transformer blocks):
   - net_0 (up-projection): kernel sharding fixed from ('embed', None) to ('embed', 'mlp'). The output dimension should be sharded across the tensor axis to parallelize the computation; the previous sharding left the output fully replicated, causing unnecessary all-gathers.
   - net_2 (down-projection): kernel sharding fixed from ('embed', 'mlp') to ('mlp', 'embed'). The input dimension must match net_0's output sharding, and the output dimension should use the embed sharding; the previous sharding had the axes reversed, creating resharding overhead.
   - Bias axes updated to match their respective output dimensions.
2. LTX2Attention:
   - QKV bias: fixed from ('embed',) to ('heads',) to match the QKV kernel's output-dimension sharding.
   - Output projection bias: fixed from ('heads',) to ('embed',) to match the output kernel's output-dimension sharding.

In both cases the rule is the same: shard each bias along the logical axis of its kernel's output dimension (a sketch follows below).
1 parent ceca471 commit 5f32283

2 files changed

Lines changed: 8 additions & 8 deletions
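
The rule this commit applies is mechanical: a bias shares the logical axis of its kernel's output dimension, and consecutive layers chain so one layer's output axis is the next layer's input axis. A minimal, hypothetical check of the annotation (the dimensions, variable name, and the nnx.get_partition_spec inspection are illustrative, not part of this change):

from flax import nnx

# Illustrative only: a single up-projection annotated the way this commit does it.
# The kernel maps embed -> mlp, so the bias (shaped like the output) goes on 'mlp'.
layer = nnx.Linear(
    2048, 8192,  # hypothetical dims
    kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("embed", "mlp")),
    bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("mlp",)),
    rngs=nnx.Rngs(0),
)

# Reads the logical axes back off the params; the kernel should resolve to
# PartitionSpec('embed', 'mlp') and the bias to PartitionSpec('mlp').
print(nnx.get_partition_spec(nnx.state(layer)))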


src/maxdiffusion/models/attention_flax.py

Lines changed: 4 additions & 4 deletions
@@ -717,8 +717,8 @@ def __init__(
        dtype=dtype,
        param_dtype=weights_dtype,
        precision=precision,
-       kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("embed", None)),
-       bias_init=nnx.with_partitioning(nnx.initializers.zeros, (None,)),
+       kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("embed", "mlp")),
+       bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("mlp",)),
    )
    self.act = get_activation(activation_fn)
    self.net_2 = nnx.Linear(
@@ -729,8 +729,8 @@ def __init__(
        dtype=dtype,
        param_dtype=weights_dtype,
        precision=precision,
-       kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("embed", "mlp")),
-       bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("mlp",)),
+       kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("mlp", "embed")),
+       bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
    )

  def __call__(self, hidden_states: Array) -> Array:
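
For context, a trimmed-down sketch of the feed-forward wiring this diff ends up with. The class name, dimensions, and the hard-coded GELU are placeholders (the real NNXSimpleFeedForward takes activation_fn, dtype, param_dtype, precision, etc.); only the partitioning annotations mirror the change above.

import jax
from flax import nnx

class SimpleFeedForwardSketch(nnx.Module):
  def __init__(self, dim: int, inner_dim: int, *, rngs: nnx.Rngs):
    # Up-projection: the output ('mlp') dimension is sharded across the tensor axis.
    self.net_0 = nnx.Linear(
        dim, inner_dim,
        kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("embed", "mlp")),
        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("mlp",)),
        rngs=rngs,
    )
    # Down-projection: input axis matches net_0's 'mlp' output, output returns to 'embed',
    # so no resharding is needed between the two layers.
    self.net_2 = nnx.Linear(
        inner_dim, dim,
        kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("mlp", "embed")),
        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
        rngs=rngs,
    )

  def __call__(self, hidden_states):
    return self.net_2(jax.nn.gelu(self.net_0(hidden_states)))  # placeholder activation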

src/maxdiffusion/models/ltx2/attention_ltx2.py

Lines changed: 4 additions & 4 deletions
@@ -359,13 +359,13 @@ def __init__(
     # 1. Define Partitioned Initializers (Logical Axes)
     # Q, K, V kernels: [in_features (embed), out_features (heads)]
     qkv_kernel_init = nnx.with_partitioning(nnx.initializers.lecun_normal(), ("embed", "heads"))
-    # Q, K, V biases: [out_features (embed)]
-    qkv_bias_init = nnx.with_partitioning(nnx.initializers.zeros_init(), ("embed",))
+    # Q, K, V biases: [out_features (heads)]
+    qkv_bias_init = nnx.with_partitioning(nnx.initializers.zeros_init(), ("heads",))

     # Out kernel: [in_features (heads), out_features (embed)]
     out_kernel_init = nnx.with_partitioning(nnx.initializers.lecun_normal(), ("heads", "embed"))
-    # Out bias: [out_features (heads)]
-    out_bias_init = nnx.with_partitioning(nnx.initializers.zeros_init(), ("heads",))
+    # Out bias: [out_features (embed)]
+    out_bias_init = nnx.with_partitioning(nnx.initializers.zeros_init(), ("embed",))

     # Norm scales
     norm_scale_init = nnx.with_partitioning(nnx.initializers.ones_init(), ("norm",))
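
Likewise, a hypothetical reduction of the LTX2Attention projections after this change. The attribute names (to_q/to_k/to_v/to_out) and dimensions are assumptions; only the kernel/bias axis pairs mirror the diff above.

from flax import nnx

class LTX2ProjectionsSketch(nnx.Module):
  def __init__(self, dim: int, inner_dim: int, *, rngs: nnx.Rngs):
    # QKV kernels map embed -> heads, so their biases sit on 'heads'.
    qkv_kernel_init = nnx.with_partitioning(nnx.initializers.lecun_normal(), ("embed", "heads"))
    qkv_bias_init = nnx.with_partitioning(nnx.initializers.zeros_init(), ("heads",))
    # The output kernel maps heads -> embed, so its bias sits on 'embed'.
    out_kernel_init = nnx.with_partitioning(nnx.initializers.lecun_normal(), ("heads", "embed"))
    out_bias_init = nnx.with_partitioning(nnx.initializers.zeros_init(), ("embed",))

    self.to_q = nnx.Linear(dim, inner_dim, kernel_init=qkv_kernel_init, bias_init=qkv_bias_init, rngs=rngs)
    self.to_k = nnx.Linear(dim, inner_dim, kernel_init=qkv_kernel_init, bias_init=qkv_bias_init, rngs=rngs)
    self.to_v = nnx.Linear(dim, inner_dim, kernel_init=qkv_kernel_init, bias_init=qkv_bias_init, rngs=rngs)
    self.to_out = nnx.Linear(inner_dim, dim, kernel_init=out_kernel_init, bias_init=out_bias_init, rngs=rngs)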
