@@ -258,6 +258,24 @@ def _tpu_flash_attention(
   query = _reshape_data_for_flash(query, heads)
   key = _reshape_data_for_flash(key, heads)
   value = _reshape_data_for_flash(value, heads)
+
+  # Pad the sequence dimension so it is evenly divisible by the context mesh axis,
+  # which shard_map requires. The output is trimmed back afterwards, and the
+  # existing segment-ID masking inside wrap_flash_attention ensures padded
+  # positions do not affect the result.
+  orig_q_seq_len = query.shape[2]
+  if num_context_shards > 1:
+    def _pad_seq_to_context(arr, axis=2):
+      rem = arr.shape[axis] % num_context_shards
+      if rem == 0:
+        return arr
+      pad_width = [(0, 0)] * arr.ndim
+      pad_width[axis] = (0, num_context_shards - rem)
+      return jnp.pad(arr, pad_width)
+    query = _pad_seq_to_context(query)
+    key = _pad_seq_to_context(key)
+    value = _pad_seq_to_context(value)
+
   q_axis_names = nn.logical_to_mesh_axes(axis_names_q)
   kv_axis_names = nn.logical_to_mesh_axes(axis_names_kv)
 
@@ -401,6 +419,8 @@ def ring_scan_body(carry, _):
         f" axis, batch dimension: {query.shape[0]}, devices_in_data_context: {devices_in_data_context}"
     )
   x = wrap_flash_attention(query, key, value)
+  # Trim back to original sequence length after context-axis padding.
+  x = x[:, :, :orig_q_seq_len, :]
   x = _reshape_heads_to_head_dim(x)
 
   return x
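For context, here is a minimal standalone sketch of the pad-then-trim round trip this diff introduces. The helper name pad_seq_to_multiple, the toy (batch, heads, seq, head_dim) shapes, and the identity stand-in for wrap_flash_attention are illustrative assumptions; only the shape bookkeeping is exercised.

import jax.numpy as jnp

def pad_seq_to_multiple(arr, multiple, axis=2):
  """Zero-pad `axis` so its length is evenly divisible by `multiple`."""
  rem = arr.shape[axis] % multiple
  if rem == 0:
    return arr
  pad_width = [(0, 0)] * arr.ndim
  pad_width[axis] = (0, multiple - rem)
  return jnp.pad(arr, pad_width)

# Toy shapes: (batch, heads, seq, head_dim) with seq=10 and 4 context shards.
num_context_shards = 4
q = jnp.ones((2, 8, 10, 64))
orig_q_seq_len = q.shape[2]

q_padded = pad_seq_to_multiple(q, num_context_shards)  # seq padded 10 -> 12
out = q_padded                                          # identity stand-in for wrap_flash_attention
out = out[:, :, :orig_q_seq_len, :]                     # trim back to seq=10
assert out.shape == q.shape

Because the padding is appended only at the end of the sequence axis and the padded positions are masked out via segment IDs, slicing the output back to orig_q_seq_len recovers exactly the unpadded result.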