@@ -267,21 +267,38 @@ def _tpu_flash_attention(
       use_fused_bwd_kernel=True if attention_kernel == "tokamax_flash" else False,
   )
   num_context_shards = mesh.shape["context"]
-  query, orig_q_seq_len = _reshape_data_for_flash(query, heads, num_context_shards)
-  key, _ = _reshape_data_for_flash(key, heads, num_context_shards)
-  value, _ = _reshape_data_for_flash(value, heads, num_context_shards)
-
-  q_axis_names = nn.logical_to_mesh_axes(axis_names_q)
-  kv_axis_names = nn.logical_to_mesh_axes(axis_names_kv)
+  def _pad_3d(tensor, num_shards):
+    org_len = tensor.shape[1]
+    rem = org_len % num_shards
+    if rem == 0:
+      return tensor, org_len
+    pad_width = [(0, 0)] * tensor.ndim
+    pad_width[1] = (0, num_shards - rem)
+    return jnp.pad(tensor, pad_width), org_len
+
+  query, orig_q_seq_len = _pad_3d(query, num_context_shards)
+  key, _ = _pad_3d(key, num_context_shards)
+  value, _ = _pad_3d(value, num_context_shards)
+
+  # Define 3D sharding specs (Batch, Seq, None)
+  q_axis_names_3d = nn.logical_to_mesh_axes((axis_names_q[0], axis_names_q[2], None))
+  kv_axis_names_3d = nn.logical_to_mesh_axes((axis_names_kv[0], axis_names_kv[2], None))
+
+  # Output spec is still 4D [Batch, Heads, Seq, HeadDim]
+  q_axis_names_4d = nn.logical_to_mesh_axes(axis_names_q)
 
   @functools.partial(
       shard_map.shard_map,
       mesh=mesh,
-      in_specs=(q_axis_names, kv_axis_names, kv_axis_names),
-      out_specs=q_axis_names,
+      in_specs=(q_axis_names_3d, kv_axis_names_3d, kv_axis_names_3d),
+      out_specs=q_axis_names_4d,
       check_rep=False,
   )
   def wrap_flash_attention(query, key, value):
+    # Reshape to 4D inside shard_map to avoid All-Gather during transpose
+    query = _unflatten_heads(query, heads)
+    key = _unflatten_heads(key, heads)
+    value = _unflatten_heads(value, heads)
     uses_fused_kernel = block_sizes.use_fused_bwd_kernel
     block_q_sizes = (
         block_sizes.block_q,
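
The `_pad_3d` helper pads the sequence axis because it is sharded across the "context" mesh dimension, so its length must divide evenly by the shard count; the original length is returned alongside, presumably so callers can slice the output back afterwards. A minimal standalone sketch of the same trick, assuming plain jnp arrays (the `pad_to_multiple` name and the example shapes are illustrative, not part of this change):

import jax.numpy as jnp

def pad_to_multiple(tensor, num_shards):
  # Zero-pad the sequence axis (axis 1) up to a multiple of num_shards.
  orig_len = tensor.shape[1]
  rem = orig_len % num_shards
  if rem == 0:
    return tensor, orig_len
  pad_width = [(0, 0)] * tensor.ndim
  pad_width[1] = (0, num_shards - rem)  # pad only at the tail of the sequence
  return jnp.pad(tensor, pad_width), orig_len

x = jnp.ones((2, 10, 512))                # [batch, seq, heads * head_dim]
padded, orig_len = pad_to_multiple(x, 4)  # seq 10 -> 12, divisible by 4 shards
# After the sharded attention call, trim back: out = out[..., :orig_len, :]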
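
The spec change follows a pattern worth spelling out: inputs enter shard_map as flattened 3D tensors with only batch and sequence sharded, and the [batch, heads, seq, head_dim] layout is recovered per shard, so the head-axis reshape/transpose never touches globally sharded data and XLA has no reason to insert an All-Gather. A toy sketch of that pattern under assumed names (the mesh setup, `unflatten_heads`, and the raw PartitionSpecs are illustrative stand-ins for the logical-axis machinery in this file):

import functools
import jax
import jax.numpy as jnp
from jax.experimental import shard_map
from jax.sharding import Mesh, PartitionSpec as P

mesh = Mesh(jax.devices()[:1], axis_names=("context",))
heads, head_dim = 8, 64

def unflatten_heads(x, heads):
  # [batch, seq, heads * head_dim] -> [batch, heads, seq, head_dim]
  b, s, _ = x.shape
  return x.reshape(b, s, heads, -1).transpose(0, 2, 1, 3)

@functools.partial(
    shard_map.shard_map,
    mesh=mesh,
    in_specs=P(None, "context", None),         # 3D in: shard the sequence axis
    out_specs=P(None, None, "context", None),  # 4D out: seq is now axis 2
    check_rep=False,
)
def per_shard_attention(q):
  q = unflatten_heads(q, heads)  # reshape/transpose runs on local shards only
  return q                       # a real kernel would call flash attention here

q = jnp.ones((2, 16, heads * head_dim))
print(per_shard_attention(q).shape)  # (2, 8, 16, 64)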