@@ -522,6 +522,7 @@ def _ulysses_attention(
     mask_padding_tokens: bool = True,
     residual_checkpoint_name: str | None = None,
     attention_mask: jax.Array = None,
+    use_custom_kernel: bool = False,
 ) -> jax.Array:
   """Ulysses sequence-parallel attention.

@@ -545,7 +546,9 @@ def _ulysses_attention(
545546 "Ulysses attention requires the number of heads to be divisible by the context shard count, "
546547 f"got heads={ num_heads } and context_shards={ num_shards } ."
547548 )
548- block_sizes = _select_flash_block_sizes (query , key , flash_block_sizes , dtype , "flash" )
549+
550+ if not use_custom_kernel :
551+ block_sizes = _select_flash_block_sizes (query , key , flash_block_sizes , dtype , "flash" )
549552
550553 q_axis_names = nn .logical_to_mesh_axes (axis_names_q )
551554 kv_axis_names = nn .logical_to_mesh_axes (axis_names_kv )
@@ -564,65 +567,93 @@ def wrap_ulysses_attention(query, key, value):
     key = jax.lax.all_to_all(key, axis_name=axis_name, split_axis=1, concat_axis=2, tiled=True)
     value = jax.lax.all_to_all(value, axis_name=axis_name, split_axis=1, concat_axis=2, tiled=True)

-    # Run the same local splash kernel as standard TPU flash attention, but now
-    # on full-sequence / fewer-heads tensors produced by the all-to-all above.
-    uses_fused_kernel = block_sizes.use_fused_bwd_kernel
-    block_q_sizes = (block_sizes.block_q, block_sizes.block_q_dkv)
-    block_kv_sizes = (block_sizes.block_kv, block_sizes.block_kv_dkv)
-    if uses_fused_kernel:
-      block_q_sizes += (block_sizes.block_q_dkv,)
-      block_kv_sizes += (block_sizes.block_kv_dkv,)
-    else:
-      block_q_sizes += (block_sizes.block_q_dq,)
-      block_kv_sizes += (block_sizes.block_kv_dq,)
+    if use_custom_kernel:
+      bq = 4864
+      bkv = 1024
+      bkv_compute = 1024
+      bkv_compute_in = 1024
+      heads_per_tile = 1

-    block_q = max(*block_q_sizes)
-    query, kv_size, query_seq_len = _pad_data_for_flash(query, heads, block_q)
-    block_kv = max(*block_kv_sizes)
-    key, _, key_seq_len = _pad_data_for_flash(key, heads, block_kv)
-    value, _, _ = _pad_data_for_flash(value, heads, block_kv)
+      query_scaled = query * 1.44269504

-    mask = splash_attention_mask.FullMask(_shape=(query.shape[2], key.shape[2]))
-    multi_head_mask = splash_attention_mask.MultiHeadMask(masks=(mask,) * query.shape[1])
+      query, kv_size, query_seq_len = _pad_data_for_flash(query, heads, bq)
+      key, _, key_seq_len = _pad_data_for_flash(key, heads, bkv)
+      value, _, _ = _pad_data_for_flash(value, heads, bkv)

-    q_padded_len = query.shape[2]
-    q_indices = jax.lax.broadcasted_iota(jnp.int32, (q_padded_len,), 0)
-    q_segment_ids = (q_indices < query_seq_len).astype(jnp.int32)
+      bsizes = custom_splash._BlockSizes(block_q=bq, block_kv=bkv, block_kv_compute=bkv_compute)

-    kv_padded_len = key.shape[2]
-    kv_indices = jax.lax.broadcasted_iota(jnp.int32, (kv_padded_len,), 0)
-    kv_segment_ids = (kv_indices < key_seq_len).astype(jnp.int32)
+      splash_kernel = custom_splash.make_splash_mha(
+          block_sizes=bsizes,
+          bkv_compute_in=bkv_compute_in,
+          orig_q_seq_len=query_seq_len,
+          orig_kv_seq_len=key_seq_len,
+          heads_per_tile=heads_per_tile,
+      )

-    # Reuse the standard flash-attention masking convention by zeroing invalid
-    # KV positions in the segment ids passed down to splash.
-    if attention_mask is not None:
-      mask_len = min(key_seq_len, attention_mask.shape[1])
-      kv_mask_for_batch = attention_mask[0, :mask_len]
-      if key_seq_len > mask_len:
-        extra_valid = jnp.ones((key_seq_len - mask_len,), dtype=jnp.int32)
-        kv_mask_for_batch = jnp.concatenate([kv_mask_for_batch, extra_valid], axis=0)
-      if kv_padded_len > key_seq_len:
-        padding = jnp.zeros((kv_padded_len - key_seq_len,), dtype=jnp.int32)
-        kv_mask_padded = jnp.concatenate([kv_mask_for_batch, padding], axis=0)
+      vmapped_splash = jax.vmap(splash_kernel, in_axes=(0, 0, 0))
+      attention_output = vmapped_splash(query_scaled, key, value)
+      attention_output = jnp.swapaxes(attention_output, 2, 3)
+      attention_output = attention_output[:, :, :query_seq_len, :kv_size].astype(query.dtype)
+    else:
+      # Run the same local splash kernel as standard TPU flash attention, but now
+      # on full-sequence / fewer-heads tensors produced by the all-to-all above.
+      uses_fused_kernel = block_sizes.use_fused_bwd_kernel
+      block_q_sizes = (block_sizes.block_q, block_sizes.block_q_dkv)
+      block_kv_sizes = (block_sizes.block_kv, block_sizes.block_kv_dkv)
+      if uses_fused_kernel:
+        block_q_sizes += (block_sizes.block_q_dkv,)
+        block_kv_sizes += (block_sizes.block_kv_dkv,)
       else:
-        kv_mask_padded = kv_mask_for_batch
-      kv_segment_ids = (kv_segment_ids * kv_mask_padded).astype(jnp.int32)
+        block_q_sizes += (block_sizes.block_q_dq,)
+        block_kv_sizes += (block_sizes.block_kv_dq,)
+
+      block_q = max(*block_q_sizes)
+      query, kv_size, query_seq_len = _pad_data_for_flash(query, heads, block_q)
+      block_kv = max(*block_kv_sizes)
+      key, _, key_seq_len = _pad_data_for_flash(key, heads, block_kv)
+      value, _, _ = _pad_data_for_flash(value, heads, block_kv)
+
+      mask = splash_attention_mask.FullMask(_shape=(query.shape[2], key.shape[2]))
+      multi_head_mask = splash_attention_mask.MultiHeadMask(masks=(mask,) * query.shape[1])
+
+      q_padded_len = query.shape[2]
+      q_indices = jax.lax.broadcasted_iota(jnp.int32, (q_padded_len,), 0)
+      q_segment_ids = (q_indices < query_seq_len).astype(jnp.int32)
+
+      kv_padded_len = key.shape[2]
+      kv_indices = jax.lax.broadcasted_iota(jnp.int32, (kv_padded_len,), 0)
+      kv_segment_ids = (kv_indices < key_seq_len).astype(jnp.int32)
+
+      # Reuse the standard flash-attention masking convention by zeroing invalid
+      # KV positions in the segment ids passed down to splash.
+      if attention_mask is not None:
+        mask_len = min(key_seq_len, attention_mask.shape[1])
+        kv_mask_for_batch = attention_mask[0, :mask_len]
+        if key_seq_len > mask_len:
+          extra_valid = jnp.ones((key_seq_len - mask_len,), dtype=jnp.int32)
+          kv_mask_for_batch = jnp.concatenate([kv_mask_for_batch, extra_valid], axis=0)
+        if kv_padded_len > key_seq_len:
+          padding = jnp.zeros((kv_padded_len - key_seq_len,), dtype=jnp.int32)
+          kv_mask_padded = jnp.concatenate([kv_mask_for_batch, padding], axis=0)
+        else:
+          kv_mask_padded = kv_mask_for_batch
+        kv_segment_ids = (kv_segment_ids * kv_mask_padded).astype(jnp.int32)

-    segment_ids = splash_attention_kernel.SegmentIds(q=q_segment_ids, kv=kv_segment_ids)
-    if not mask_padding_tokens:
-      segment_ids = None
+      segment_ids = splash_attention_kernel.SegmentIds(q=q_segment_ids, kv=kv_segment_ids)
+      if not mask_padding_tokens:
+        segment_ids = None

-    splash_kernel = splash_attention_kernel.make_splash_mha(
-        mask=multi_head_mask,
-        head_shards=1,
-        q_seq_shards=1,
-        block_sizes=block_sizes,
-        save_residuals=False,
-        residual_checkpoint_name=residual_checkpoint_name,
-    )
-    vmapped_splash = jax.vmap(splash_kernel, in_axes=(0, 0, 0, None))
-    attention_output = vmapped_splash(query, key, value, segment_ids)
-    attention_output = attention_output[:, :, :query_seq_len, :kv_size].astype(query.dtype)
+      splash_kernel = splash_attention_kernel.make_splash_mha(
+          mask=multi_head_mask,
+          head_shards=1,
+          q_seq_shards=1,
+          block_sizes=block_sizes,
+          save_residuals=False,
+          residual_checkpoint_name=residual_checkpoint_name,
+      )
+      vmapped_splash = jax.vmap(splash_kernel, in_axes=(0, 0, 0, None))
+      attention_output = vmapped_splash(query, key, value, segment_ids)
+      attention_output = attention_output[:, :, :query_seq_len, :kv_size].astype(query.dtype)

     # Restore the original layout expected by the rest of the model:
     # head-sharded / full-sequence -> sequence-sharded / full-heads.
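
For reference, the data movement both branches depend on is the tiled all_to_all at the top of this hunk: each context shard starts with all heads and a slice of the sequence, and after the exchange holds one head group and the full sequence (the layout the closing comment above describes). A minimal sketch of that exchange in plain jnp, with toy shapes and no mesh or shard_map (everything here is illustrative, not taken from this file):

import jax.numpy as jnp

N, b, h, s, d = 2, 1, 4, 8, 3  # assumed toy sizes: N context shards
full = jnp.arange(b * h * s * d, dtype=jnp.float32).reshape(b, h, s, d)
shards_in = jnp.split(full, N, axis=2)  # per-shard inputs: all heads, a sequence slice
hg = h // N
shards_out = [
    # shard j keeps head group j and gathers the full sequence from every peer
    jnp.concatenate([x[:, j * hg:(j + 1) * hg] for x in shards_in], axis=2)
    for j in range(N)
]
assert all(o.shape == (b, hg, s, d) for o in shards_out)
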
@@ -642,94 +673,6 @@ def wrap_ulysses_attention(query, key, value):
     return x


-def _ulysses_custom_attention(
-    query: jax.Array,
-    key: jax.Array,
-    value: jax.Array,
-    heads: int,
-    mesh: Mesh,
-    axis_names_q: AxisNames,
-    axis_names_kv: AxisNames,
-    flash_block_sizes: BlockSizes,
-    dtype: jnp.dtype = jnp.float32,
-    mask_padding_tokens: bool = False,
-    residual_checkpoint_name: str | None = None,
-    attention_mask: jax.Array = None,
-) -> jax.Array:
-  """Ulysses sequence-parallel attention with custom fast kernel."""
-  axis_name = "context"
-  num_shards = mesh.shape[axis_name]
-
-  # Reshape to [b, h, s, d] and pad sequence for even context-axis splitting.
-  query, orig_q_seq_len = _reshape_data_for_flash(query, heads, num_shards)
-  key, _ = _reshape_data_for_flash(key, heads, num_shards)
-  value, _ = _reshape_data_for_flash(value, heads, num_shards)
-  num_heads = query.shape[1]
-  if num_heads % num_shards != 0:
-    raise ValueError(
-        "Ulysses attention requires the number of heads to be divisible by the context shard count, "
-        f"got heads={num_heads} and context_shards={num_shards}."
-    )
-
-  q_axis_names = nn.logical_to_mesh_axes(axis_names_q)
-  kv_axis_names = nn.logical_to_mesh_axes(axis_names_kv)
-
-  @functools.partial(
-      jax.shard_map,
-      mesh=mesh,
-      in_specs=(q_axis_names, kv_axis_names, kv_axis_names),
-      out_specs=q_axis_names,
-      check_vma=False,
-  )
-  def wrap_ulysses_attention(query, key, value):
-    query = jax.lax.all_to_all(query, axis_name=axis_name, split_axis=1, concat_axis=2, tiled=True)
-    key = jax.lax.all_to_all(key, axis_name=axis_name, split_axis=1, concat_axis=2, tiled=True)
-    value = jax.lax.all_to_all(value, axis_name=axis_name, split_axis=1, concat_axis=2, tiled=True)
-
-    bq = 2048
-    bkv = 2048
-    bkv_compute = 1024
-    bkv_compute_in = 256
-    heads_per_tile = 1
-
-    query_scaled = query * 1.44269504
-
-    query, kv_size, query_seq_len = _pad_data_for_flash(query, heads, bq)
-    key, _, key_seq_len = _pad_data_for_flash(key, heads, bkv)
-    value, _, _ = _pad_data_for_flash(value, heads, bkv)
-
-    bsizes = custom_splash._BlockSizes(block_q=bq, block_kv=bkv, block_kv_compute=bkv_compute)
-
-    splash_kernel = custom_splash.make_splash_mha(
-        block_sizes=bsizes,
-        bkv_compute_in=bkv_compute_in,
-        orig_q_seq_len=query_seq_len,
-        orig_kv_seq_len=key_seq_len,
-        heads_per_tile=heads_per_tile,
-    )
-
-    vmapped_splash = jax.vmap(splash_kernel, in_axes=(0, 0, 0))
-    attention_output = vmapped_splash(query_scaled, key, value)
-    attention_output = jnp.swapaxes(attention_output, 2, 3)
-
-    attention_output = attention_output[:, :, :query_seq_len, :kv_size].astype(query.dtype)
-
-    attention_output = jax.lax.all_to_all(attention_output, axis_name=axis_name, split_axis=2, concat_axis=1, tiled=True)
-    return attention_output
-
-  devices_in_batch_sharding = mesh.shape["data"] * (mesh.shape["fsdp"] if "fsdp" in mesh.shape else 1)
-  if not (query.shape[0] / devices_in_batch_sharding).is_integer():
-    max_logging.log(
-        "Warning, batch dimension should be shardable among the devices in data and fsdp"
-        f" axis, batch dimension: {query.shape[0]}, devices_in_batch_sharding: {devices_in_batch_sharding}"
-    )
-  x = wrap_ulysses_attention(query, key, value)
-  x = x[:, :, :orig_q_seq_len, :]
-  x = _reshape_heads_to_head_dim(x)
-
-  return x
-
-
 def _apply_attention_dot(
     query: Array,
     key: Array,
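
A note on the 1.44269504 factor that both the removed _ulysses_custom_attention and the new use_custom_kernel branch apply to the query: that constant is log2(e), which suggests the custom splash kernel evaluates its softmax with exp2 rather than exp (a common TPU optimization), so pre-scaling the query by log2(e) leaves the result unchanged. That reading is an inference from the constant, not something stated in the diff. A quick check of the underlying identity:

import jax.numpy as jnp

LOG2_E = 1.44269504  # log2(e)
x = jnp.array([0.3, -1.2, 2.0], dtype=jnp.float32)
# exp2(x * log2(e)) == e**x, so a kernel can take the cheaper exp2 path
assert jnp.allclose(jnp.exp(x), jnp.exp2(x * LOG2_E))
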
@@ -865,7 +808,7 @@ def _apply_attention(
         query, key, value, dtype, heads, dim_head, scale, split_head_dim, float32_qk_product, use_memory_efficient_attention
     )
   elif attention_kernel == "ulysses_custom":
-    return _ulysses_custom_attention(
+    return _ulysses_attention(
         query,
         key * scale,
         value,
@@ -878,6 +821,7 @@ def _apply_attention(
         mask_padding_tokens=mask_padding_tokens,
         residual_checkpoint_name=residual_checkpoint_name,
         attention_mask=attention_mask,
+        use_custom_kernel=True,
     )
   elif attention_kernel == "ulysses":
     return _ulysses_attention(
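
One detail visible at this call site: the attention scale is folded into the keys (key * scale) before _ulysses_attention runs, which is equivalent to scaling the q @ k.T logits, since a scalar factor commutes through the matmul. A small sketch of that equivalence with illustrative shapes:

import jax.numpy as jnp

q = jnp.arange(8.0).reshape(2, 4)
k = jnp.arange(12.0).reshape(3, 4)
scale = 0.125
# scaling the keys first gives the same logits as scaling the product
assert jnp.allclose(q @ (k * scale).T, scale * (q @ k.T))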