Explicit Activation Sharding in forward pass

prishajain1 · Perseus14 · commit 0f6ef5088a19 · 2026-02-12T07:12:57.000Z
Fix the parity check test: run the model forward pass inside the mesh and logical axis rules contexts.
diff --git a/src/maxdiffusion/models/ltx2/transformer_ltx2.py b/src/maxdiffusion/models/ltx2/transformer_ltx2.py
@@ -17,6 +17,7 @@
 import jax
 import jax.numpy as jnp
 from flax import nnx
+import flax.linen as nn
 
 from maxdiffusion.models.ltx2.attention_ltx2 import LTX2Attention, LTX2RotaryPosEmbed
 from maxdiffusion.models.attention_flax import NNXSimpleFeedForward
@@ -321,6 +322,15 @@ def __call__(
   ) -> Tuple[jax.Array, jax.Array]:
     batch_size = hidden_states.shape[0]
 
+    axis_names = nn.logical_to_mesh_axes(("activation_batch", "activation_length", "activation_embed"))
+    hidden_states = jax.lax.with_sharding_constraint(hidden_states, axis_names)
+    audio_hidden_states = jax.lax.with_sharding_constraint(audio_hidden_states, axis_names)
+
+    if encoder_hidden_states is not None:
+      encoder_hidden_states = jax.lax.with_sharding_constraint(encoder_hidden_states, axis_names)
+    if audio_encoder_hidden_states is not None:
+      audio_encoder_hidden_states = jax.lax.with_sharding_constraint(audio_encoder_hidden_states, axis_names)
+
     # 1. Video and Audio Self-Attention
     norm_hidden_states = self.norm1(hidden_states)
 
diff --git a/src/maxdiffusion/tests/ltx2_parity_test.py b/src/maxdiffusion/tests/ltx2_parity_test.py
@@ -570,21 +570,22 @@ def convert_weight(pt_key_base, jax_key):
 
     # 4. Run Forward
     print("Running MaxDiffusion forward pass...")
-    output = model(
-        hidden_states=jax_inputs["hidden_states"],
-        audio_hidden_states=jax_inputs["audio_hidden_states"],
-        encoder_hidden_states=jax_inputs["encoder_hidden_states"],
-        audio_encoder_hidden_states=jax_inputs["audio_encoder_hidden_states"],
-        timestep=jax_inputs["timestep"],
-        encoder_attention_mask=jax_inputs["encoder_attention_mask"],
-        audio_encoder_attention_mask=jax_inputs["audio_encoder_attention_mask"],
-        num_frames=config["num_frames"] if "num_frames" in config else 4,
-        height=config["height"] if "height" in config else 32,
-        width=config["width"] if "width" in config else 32,
-        audio_num_frames=128,
-        fps=24.0,
-        return_dict=True,
-    )
+    with self.mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules):
+      output = model(
+          hidden_states=jax_inputs["hidden_states"],
+          audio_hidden_states=jax_inputs["audio_hidden_states"],
+          encoder_hidden_states=jax_inputs["encoder_hidden_states"],
+          audio_encoder_hidden_states=jax_inputs["audio_encoder_hidden_states"],
+          timestep=jax_inputs["timestep"],
+          encoder_attention_mask=jax_inputs["encoder_attention_mask"],
+          audio_encoder_attention_mask=jax_inputs["audio_encoder_attention_mask"],
+          num_frames=config["num_frames"] if "num_frames" in config else 4,
+          height=config["height"] if "height" in config else 32,
+          width=config["width"] if "width" in config else 32,
+          audio_num_frames=128,
+          fps=24.0,
+          return_dict=True,
+      )
 
     max_sample = output["sample"]
     max_audio_sample = output["audio_sample"]