Commit 50160c6

committed
Fix sharding of k, v, q across context in attention_ltx2.py
1 parent 91f29c4 commit 50160c6

1 file changed


File tree

src/maxdiffusion/models/ltx2/attention_ltx2.py

Lines changed: 5 additions & 4 deletions
@@ -487,13 +487,14 @@ def __call__(
 
     with jax.named_scope("Attention and Output Project"):
       # Reshape to 4D [B, H, S, D] before passing to avoid All-Gather during transpose
-      b, s, _ = query.shape
+      b, s_q, _ = query.shape
+      _, s_kv, _ = key.shape
       h = self.heads
       d = self.dim_head
 
-      query = query.reshape(b, s, h, d).transpose(0, 2, 1, 3)
-      key = key.reshape(b, s, h, d).transpose(0, 2, 1, 3)
-      value = value.reshape(b, s, h, d).transpose(0, 2, 1, 3)
+      query = query.reshape(b, s_q, h, d).transpose(0, 2, 1, 3)
+      key = key.reshape(b, s_kv, h, d).transpose(0, 2, 1, 3)
+      value = value.reshape(b, s_kv, h, d).transpose(0, 2, 1, 3)
 
       # Apply explicit sharding constraints on the 4D tensors
       q_axis_names = nn.logical_to_mesh_axes((common_types.BATCH, common_types.SELF_ATTN_HEAD, common_types.SELF_ATTN_Q_LENGTH, common_types.D_KV))
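
The substance of the change: the old code read a single sequence length s from query.shape and reused it for key and value, which breaks whenever the key/value length differs from the query length (cross-attention, or q and k/v sharded differently along the context axis). Below is a minimal, runnable sketch of the fixed reshape; split_heads and the example shapes are illustrative, not names from the repository.

import jax.numpy as jnp

def split_heads(query, key, value, heads, dim_head):
  """Reshape [B, S, H*D] projections to [B, H, S, D].

  The query and key/value sequence lengths are read separately, so the
  reshape stays correct when s_q != s_kv.
  """
  b, s_q, _ = query.shape
  _, s_kv, _ = key.shape  # the fix: do not assume key shares query's length

  def to_bhsd(x, s):
    return x.reshape(b, s, heads, dim_head).transpose(0, 2, 1, 3)

  return to_bhsd(query, s_q), to_bhsd(key, s_kv), to_bhsd(value, s_kv)

# Cross-attention-style shapes: 128 query tokens attending to 77 k/v tokens.
q = jnp.zeros((2, 128, 8 * 64))
k = jnp.zeros((2, 77, 8 * 64))
v = jnp.zeros((2, 77, 8 * 64))
q4, k4, v4 = split_heads(q, k, v, heads=8, dim_head=64)
print(q4.shape, k4.shape)  # (2, 8, 128, 64) (2, 8, 77, 64)

The last line of the diff then resolves logical axis names (common_types.BATCH, SELF_ATTN_HEAD, SELF_ATTN_Q_LENGTH, D_KV) to mesh axes before the sharding constraint is applied. Here is a hedged sketch of that resolution step using Flax's logical-axis API; the rule table and mesh axis names are made up for illustration and are not MaxDiffusion's actual configuration.

import flax.linen as nn

# Hypothetical rules mapping logical names to mesh axes: batch is
# data-parallel, the query length is context-sharded, heads and head_dim
# stay replicated.
rules = (
    ("activation_batch", "data"),
    ("activation_heads", None),
    ("activation_q_length", "context"),
    ("activation_kv", None),
)
with nn.logical_axis_rules(rules):
  spec = nn.logical_to_mesh_axes(
      ("activation_batch", "activation_heads", "activation_q_length", "activation_kv"))
print(spec)  # PartitionSpec('data', None, 'context', None)

Tracking s_q and s_kv separately is what makes a dedicated SELF_ATTN_Q_LENGTH axis usable here: the query's sequence axis can be named for context sharding without forcing k and v onto a length they may not have.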
