@@ -227,6 +227,38 @@ def convert_to_tokamax_splash_config(
   )
 
 
+def _resolve_tpu_attention_block_sizes(
+    query_seq_len: int,
+    kv_seq_len: int,
+    flash_block_sizes: BlockSizes,
+    dtype: jnp.dtype,
+    attention_kernel: str = "flash",
+) -> BlockSizes:
+  """Resolve TPU splash attention block sizes for self- and cross-attention."""
+  q_max_block_size = 1024 if dtype == jnp.bfloat16 else 512
+  is_cross_attention = kv_seq_len != query_seq_len
+  if is_cross_attention:
+    kv_max_block_size = ((kv_seq_len + 127) // 128) * 128
+  else:
+    kv_max_block_size = q_max_block_size
+
+  if flash_block_sizes and not is_cross_attention:
+    return flash_block_sizes
+
+  block_size_q = flash_block_sizes.block_q if flash_block_sizes else q_max_block_size
+  return splash_attention_kernel.BlockSizes(
+      block_q=block_size_q,
+      block_kv_compute=min(kv_max_block_size, kv_seq_len),
+      block_kv=min(kv_max_block_size, kv_seq_len),
+      block_q_dkv=block_size_q,
+      block_kv_dkv=min(kv_max_block_size, kv_seq_len),
+      block_kv_dkv_compute=min(kv_max_block_size, query_seq_len),
+      block_q_dq=None if attention_kernel == "tokamax_flash" else block_size_q,
+      block_kv_dq=None if attention_kernel == "tokamax_flash" else min(kv_max_block_size, query_seq_len),
+      use_fused_bwd_kernel=True if attention_kernel == "tokamax_flash" else False,
+  )
+
+
 def _tpu_flash_attention(
     query: jax.Array,
     key: jax.Array,
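For reference, here is a minimal, standalone sketch of the block-size arithmetic that the new helper encodes, using made-up sequence lengths. It deliberately avoids the splash kernel API so it can run anywhere as a sanity check of the rounding and capping behaviour; `sketch_block_sizes` is an illustrative name, not part of the PR.

```python
# Standalone sketch of the block-size arithmetic in _resolve_tpu_attention_block_sizes.
# Sequence lengths below are hypothetical; only the arithmetic mirrors the helper.
import jax.numpy as jnp


def sketch_block_sizes(query_seq_len, kv_seq_len, dtype=jnp.bfloat16):
  q_max_block_size = 1024 if dtype == jnp.bfloat16 else 512
  if kv_seq_len != query_seq_len:
    # Cross-attention: round the KV length up to a multiple of 128.
    kv_max_block_size = ((kv_seq_len + 127) // 128) * 128
  else:
    kv_max_block_size = q_max_block_size
  # Blocks never exceed the actual KV length.
  return q_max_block_size, min(kv_max_block_size, kv_seq_len)


# Self-attention over 4096 tokens: both block caps land at 1024.
assert sketch_block_sizes(4096, 4096) == (1024, 1024)
# Cross-attention against 300 KV tokens: rounded up to 384, then clamped back to 300.
assert sketch_block_sizes(4096, 300) == (1024, 300)
```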
@@ -244,32 +276,17 @@ def _tpu_flash_attention(
 ) -> jax.Array:
   """TPU Flash Attention"""
 
-  q_max_block_size = 1024 if dtype == jnp.bfloat16 else 512
-  # This is the case for cross-attn.
-  if key.shape[1] != query.shape[1]:
-    kv_max_block_size = ((key.shape[1] + 127) // 128) * 128
-  else:
-    kv_max_block_size = q_max_block_size
-  # ensure that for cross attention we override the block sizes.
-  if flash_block_sizes and key.shape[1] == query.shape[1]:
-    block_sizes = flash_block_sizes
-  else:
-    block_size_q = flash_block_sizes.block_q if flash_block_sizes else q_max_block_size
-    block_sizes = splash_attention_kernel.BlockSizes(
-        block_q=block_size_q,
-        block_kv_compute=min(kv_max_block_size, key.shape[2]),
-        block_kv=min(kv_max_block_size, key.shape[2]),
-        block_q_dkv=block_size_q,
-        block_kv_dkv=min(kv_max_block_size, key.shape[2]),
-        block_kv_dkv_compute=min(kv_max_block_size, query.shape[2]),
-        block_q_dq=None if attention_kernel == "tokamax_flash" else block_size_q,
-        block_kv_dq=None if attention_kernel == "tokamax_flash" else min(kv_max_block_size, query.shape[2]),
-        use_fused_bwd_kernel=True if attention_kernel == "tokamax_flash" else False,
-    )
   num_context_shards = mesh.shape["context"]
   query, orig_q_seq_len = _reshape_data_for_flash(query, heads, num_context_shards)
   key, _ = _reshape_data_for_flash(key, heads, num_context_shards)
   value, _ = _reshape_data_for_flash(value, heads, num_context_shards)
+  block_sizes = _resolve_tpu_attention_block_sizes(
+      query_seq_len=query.shape[2],
+      kv_seq_len=key.shape[2],
+      flash_block_sizes=flash_block_sizes,
+      dtype=dtype,
+      attention_kernel=attention_kernel,
+  )
 
   q_axis_names = nn.logical_to_mesh_axes(axis_names_q)
   kv_axis_names = nn.logical_to_mesh_axes(axis_names_kv)
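One point worth noting: the helper is now called after `_reshape_data_for_flash`, whose output the surrounding code treats as `(batch, heads, seq, head_dim)` (the Ulysses hunk below reads `query.shape[1]` as the head count), so the sequence lengths come from axis 2. A toy check with hypothetical shapes:

```python
# Hypothetical post-reshape shapes, assuming the (batch, heads, seq, head_dim)
# layout implied by the surrounding code.
import jax.numpy as jnp

query = jnp.zeros((2, 16, 4096, 128), dtype=jnp.bfloat16)  # self-attention query
key = jnp.zeros((2, 16, 512, 128), dtype=jnp.bfloat16)     # shorter cross-attention KV

query_seq_len, kv_seq_len = query.shape[2], key.shape[2]
assert (query_seq_len, kv_seq_len) == (4096, 512)
```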
@@ -425,6 +442,7 @@ def ring_scan_body(carry, _):
 # Ulysses sequence-parallel attention
 # ---------------------------------------------------------------------------
 
+
 def _ulysses_attention(
     query: jax.Array,
     key: jax.Array,
@@ -456,53 +474,41 @@ def _ulysses_attention(
   query, orig_q_seq_len = _reshape_data_for_flash(query, heads, num_shards)
   key, _ = _reshape_data_for_flash(key, heads, num_shards)
   value, _ = _reshape_data_for_flash(value, heads, num_shards)
+  num_heads = query.shape[1]
+  # Ulysses only redistributes existing heads across the context mesh; unlike
+  # the earlier draft, we fail fast instead of padding synthetic heads.
+  if num_heads % num_shards != 0:
+    raise ValueError(
+        "Ulysses attention requires the number of heads to be divisible by the context shard count, "
+        f"got heads={num_heads} and context_shards={num_shards}."
+    )
 
   q_axis_names = nn.logical_to_mesh_axes(axis_names_q)
   kv_axis_names = nn.logical_to_mesh_axes(axis_names_kv)
 
-  # Pre-compute block sizes outside shard_map (uses global shapes).
-  q_max_block_size = 1024 if dtype == jnp.bfloat16 else 512
-  if key.shape[2] != query.shape[2]:
-    kv_max_block_size = ((key.shape[2] + 127) // 128) * 128
-  else:
-    kv_max_block_size = q_max_block_size
-  if flash_block_sizes and key.shape[2] == query.shape[2]:
-    block_sizes = flash_block_sizes
-  else:
-    block_size_q = flash_block_sizes.block_q if flash_block_sizes else q_max_block_size
-    block_sizes = splash_attention_kernel.BlockSizes(
-        block_q=block_size_q,
-        block_kv_compute=min(kv_max_block_size, key.shape[2]),
-        block_kv=min(kv_max_block_size, key.shape[2]),
-        block_q_dkv=block_size_q,
-        block_kv_dkv=min(kv_max_block_size, key.shape[2]),
-        block_kv_dkv_compute=min(kv_max_block_size, query.shape[2]),
-        block_q_dq=block_size_q,
-        block_kv_dq=min(kv_max_block_size, query.shape[2]),
-        use_fused_bwd_kernel=False,
-    )
+  block_sizes = _resolve_tpu_attention_block_sizes(
+      query_seq_len=query.shape[2],
+      kv_seq_len=key.shape[2],
+      flash_block_sizes=flash_block_sizes,
+      dtype=dtype,
+  )
 
   @functools.partial(
-      shard_map.shard_map,
+      jax.shard_map,
       mesh=mesh,
       in_specs=(q_axis_names, kv_axis_names, kv_axis_names),
       out_specs=q_axis_names,
-      check_rep=False,
+      check_vma=False,
   )
   def wrap_ulysses_attention(query, key, value):
-    # --- Step 1: all-to-all sequence-sharded -> head-sharded ---
-    original_q_heads = query.shape[1]
-    head_pad = (-original_q_heads) % num_shards
-    if head_pad:
-      query = jnp.pad(query, ((0, 0), (0, head_pad), (0, 0), (0, 0)))
-      key = jnp.pad(key, ((0, 0), (0, head_pad), (0, 0), (0, 0)))
-      value = jnp.pad(value, ((0, 0), (0, head_pad), (0, 0), (0, 0)))
-
+    # Swap sharding modes: each device gives up a slice of sequence and gathers
+    # a slice of heads, so the local splash kernel sees the full sequence.
     query = jax.lax.all_to_all(query, axis_name=axis_name, split_axis=1, concat_axis=2, tiled=True)
     key = jax.lax.all_to_all(key, axis_name=axis_name, split_axis=1, concat_axis=2, tiled=True)
     value = jax.lax.all_to_all(value, axis_name=axis_name, split_axis=1, concat_axis=2, tiled=True)
 
-    # --- Step 2: local flash attention (full sequence, subset of heads) ---
+    # Run the same local splash kernel as standard TPU flash attention, but now
+    # on full-sequence / fewer-heads tensors produced by the all-to-all above.
     uses_fused_kernel = block_sizes.use_fused_bwd_kernel
     block_q_sizes = (block_sizes.block_q, block_sizes.block_q_dkv)
     block_kv_sizes = (block_sizes.block_kv, block_sizes.block_kv_dkv)
@@ -530,6 +536,8 @@ def wrap_ulysses_attention(query, key, value):
     kv_indices = jax.lax.broadcasted_iota(jnp.int32, (kv_padded_len,), 0)
     kv_segment_ids = (kv_indices < key_seq_len).astype(jnp.int32)
 
+    # Reuse the standard flash-attention masking convention by zeroing invalid
+    # KV positions in the segment ids passed down to splash.
     if attention_mask is not None:
       mask_len = min(key_seq_len, attention_mask.shape[1])
       kv_mask_for_batch = attention_mask[0, :mask_len]
@@ -559,11 +567,9 @@ def wrap_ulysses_attention(query, key, value):
     attention_output = vmapped_splash(query, key, value, segment_ids)
     attention_output = attention_output[:, :, :query_seq_len, :kv_size].astype(query.dtype)
 
-    # --- Step 3: all-to-all head-sharded -> sequence-sharded ---
-    attention_output = jax.lax.all_to_all(
-        attention_output, axis_name=axis_name, split_axis=2, concat_axis=1, tiled=True
-    )
-    attention_output = attention_output[:, :original_q_heads, :, :]
+    # Restore the original layout expected by the rest of the model:
+    # head-sharded / full-sequence -> sequence-sharded / full-heads.
+    attention_output = jax.lax.all_to_all(attention_output, axis_name=axis_name, split_axis=2, concat_axis=1, tiled=True)
     return attention_output
 
   devices_in_data_context = mesh.shape["data"] * num_shards
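As a final reference, here is a minimal sketch of the sequence-to-head layout swap that `wrap_ulysses_attention` performs with `jax.lax.all_to_all`, assuming a hypothetical two-device "context" mesh and toy shapes (on CPU you can fake the devices with `XLA_FLAGS=--xla_force_host_platform_device_count=2`). It is only an illustration of the collective, not the PR's kernel path.

```python
# Sketch of the Ulysses layout swap: sequence-sharded in, head-sharded out.
# Assumes >= 2 devices on a "context" mesh axis; all shapes are hypothetical.
import functools
import jax
import jax.numpy as jnp
from jax.sharding import Mesh, PartitionSpec as P

mesh = Mesh(jax.devices()[:2], ("context",))
batch, heads, seq, head_dim = 1, 4, 8, 2
x = jnp.arange(batch * heads * seq * head_dim, dtype=jnp.float32).reshape(batch, heads, seq, head_dim)


@functools.partial(
    jax.shard_map,
    mesh=mesh,
    in_specs=P(None, None, "context", None),   # sequence-sharded input
    out_specs=P(None, "context", None, None),  # head-sharded output
    check_vma=False,                           # as in the diff above
)
def seq_to_head_sharded(x):
  # Locally (batch, heads, seq/2, head_dim) -> (batch, heads/2, seq, head_dim):
  # each device trades half of its heads for the other device's sequence slice.
  return jax.lax.all_to_all(x, axis_name="context", split_axis=1, concat_axis=2, tiled=True)


y = seq_to_head_sharded(x)
assert y.shape == x.shape  # global shape is unchanged; only the sharded axis moves
```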