Commit 073f703

force sharding on FFN intermediate activation

1 parent 2c361e4 commit 073f703

1 file changed: 2 additions & 0 deletions

File changed: src/maxdiffusion/models/wan/transformers/transformer_wan.py
@@ -250,6 +250,8 @@ def __call__(self, hidden_states: jax.Array, deterministic: bool = True, rngs: n
     jax.debug.print(f"MLP input shape: {{shape}}", shape=hidden_states.shape)
     jax.debug.inspect_array_sharding(hidden_states, callback=print)
     hidden_states = self.act_fn(hidden_states)  # Output is (4, 75600, 13824)
+    # Add logical constraint so the FFN intermediate activation is sharded along the mlp axis
+    hidden_states = nn.with_logical_constraint(hidden_states, ("activation_batch", "activation_length", "mlp"))
     jax.debug.print(f"MLP intermediate activation shape: {{shape}}", shape=hidden_states.shape)
     jax.debug.inspect_array_sharding(hidden_states, callback=print)
     hidden_states = checkpoint_name(hidden_states, "ffn_activation")
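For context, nn.with_logical_constraint only takes effect when the logical axis names ("activation_batch", "activation_length", "mlp") are mapped onto a physical device mesh via logical axis rules. Below is a minimal, self-contained sketch of that mechanism; the mesh layout, the rules, and the TinyFFN module are illustrative assumptions for this sketch, not the actual maxdiffusion configuration. Only the with_logical_constraint call mirrors the line added in this commit.

    # Minimal sketch: mapping Flax logical axis names to mesh axes.
    # The mesh, rules, and TinyFFN module here are hypothetical.
    import jax
    import jax.numpy as jnp
    import flax.linen as nn
    from jax.sharding import Mesh
    from jax.experimental import mesh_utils

    # Hypothetical 2-axis mesh: "data" for batch, "tensor" for model parallelism.
    devices = mesh_utils.create_device_mesh((1, jax.device_count()))
    mesh = Mesh(devices, axis_names=("data", "tensor"))

    # Hypothetical rules: batch on "data", the FFN hidden (mlp) axis on
    # "tensor", and the sequence axis replicated.
    rules = (
        ("activation_batch", "data"),
        ("activation_length", None),
        ("mlp", "tensor"),
    )

    class TinyFFN(nn.Module):
      hidden: int

      @nn.compact
      def __call__(self, x):
        h = nn.Dense(self.hidden)(x)
        h = nn.gelu(h)
        # Same constraint as the commit: force the intermediate activation
        # into the ("activation_batch", "activation_length", "mlp") layout.
        h = nn.with_logical_constraint(h, ("activation_batch", "activation_length", "mlp"))
        return nn.Dense(x.shape[-1])(h)

    x = jnp.ones((4, 16, 32))   # (batch, length, model_dim)
    model = TinyFFN(hidden=64)  # hidden must divide evenly over the "tensor" axis

    with mesh, nn.logical_axis_rules(rules):
      params = model.init(jax.random.key(0), x)
      y = jax.jit(model.apply)(params, x)
      print(y.sharding)  # shows how the compiler laid out the output

Under these assumed rules, the constraint tells the compiler to split the (4, 75600, 13824)-shaped intermediate along its last dimension across the "tensor" mesh axis instead of replicating it, which is what keeps the FFN activation from blowing up per-device memory.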
