 from typing import Optional, Tuple
 from flax import nnx
+from flax import linen as nn
 import jax
 import jax.numpy as jnp
 from ... import common_types
@@ -485,8 +486,24 @@ def __call__(
     key = apply_rotary_emb(key, rotary_emb)

     with jax.named_scope("Attention and Output Project"):
-      # 4. Attention
-      # NNXAttentionOp expects flattened input [B, S, InnerDim] for flash kernel
+      # Reshape to 4D [B, H, S, D] before passing to avoid All-Gather during transpose
+      b, s, _ = query.shape
+      h = self.heads
+      d = self.dim_head
+
+      query = query.reshape(b, s, h, d).transpose(0, 2, 1, 3)
+      key = key.reshape(b, s, h, d).transpose(0, 2, 1, 3)
+      value = value.reshape(b, s, h, d).transpose(0, 2, 1, 3)
+
+      # Apply explicit sharding constraints on the 4D tensors
+      q_axis_names = nn.logical_to_mesh_axes((common_types.BATCH, common_types.SELF_ATTN_HEAD, common_types.SELF_ATTN_Q_LENGTH, common_types.D_KV))
+      kv_axis_names = nn.logical_to_mesh_axes((common_types.BATCH, common_types.SELF_ATTN_HEAD, common_types.SELF_ATTN_KV_LENGTH, common_types.D_KV))
+
+      query = jax.lax.with_sharding_constraint(query, q_axis_names)
+      key = jax.lax.with_sharding_constraint(key, kv_axis_names)
+      value = jax.lax.with_sharding_constraint(value, kv_axis_names)
+
+      # 4. Attention (passing 4D tensors now)
       attn_output = self.attention_op.apply_attention(query=query, key=key, value=value, attention_mask=attention_mask)

       # 7. Output Projection
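For context on what the added lines do, below is a minimal, self-contained sketch of the reshape-and-constrain pattern: per-head activations are reshaped from [B, S, H*D] to [B, H, S, D], the logical axis names are resolved to a PartitionSpec with `nn.logical_to_mesh_axes`, and the layout is pinned with `jax.lax.with_sharding_constraint` so the compiler is not free to pick a layout that requires an all-gather around the transpose. The mesh shape, the logical axis names (`activation_batch`, etc.), and the logical-to-mesh rules in the sketch are illustrative assumptions; they stand in for the repository's `common_types` constants and its config-driven sharding rules.

```python
import numpy as np
import jax
import jax.numpy as jnp
from flax import linen as nn
from jax.sharding import Mesh

# Toy mesh built from whatever devices are available; axis names are assumptions.
devices = np.array(jax.devices()).reshape(-1, 1)
mesh = Mesh(devices, axis_names=("data", "tensor"))

# Hypothetical logical-to-mesh rules standing in for the repo's configured rules.
rules = (
    ("activation_batch", "data"),
    ("activation_heads", "tensor"),
    ("activation_length", None),
    ("activation_kv", None),
)

def reshape_and_constrain(x, heads, dim_head, logical_names):
  b, s, _ = x.shape
  # [B, S, H*D] -> [B, H, S, D], the same layout change as in the diff above.
  x = x.reshape(b, s, heads, dim_head).transpose(0, 2, 1, 3)
  # Resolve logical names to a PartitionSpec and pin the sharding explicitly.
  spec = nn.logical_to_mesh_axes(logical_names)
  return jax.lax.with_sharding_constraint(x, spec)

with mesh, nn.logical_axis_rules(rules):
  q = jnp.ones((2, 128, 8 * 64))  # [B, S, H*D]
  fn = jax.jit(lambda t: reshape_and_constrain(
      t, heads=8, dim_head=64,
      logical_names=("activation_batch", "activation_heads",
                     "activation_length", "activation_kv")))
  print(fn(q).shape)  # (2, 8, 128, 64)
```

Placing the constraint immediately after the transpose, and before `apply_attention`, is what keeps the query/key/value tensors in their sharded per-head layout when they enter the attention kernel.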