@@ -507,6 +507,7 @@ def _ulysses_attention(
     mask_padding_tokens: bool = True,
     residual_checkpoint_name: str | None = None,
     attention_mask: jax.Array = None,
+    attention_kernel: str = "ulysses",
 ) -> jax.Array:
   """Ulysses sequence-parallel attention.
 
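For orientation (not part of the diff): Ulysses attention trades a sequence-sharded layout for a head-sharded one via an all-to-all, so each context shard can run a dense flash kernel over the full sequence for its slice of heads. Below is a minimal single-host sketch of that exchange, assuming the `[batch, heads, seq, head_dim]` layout used in this file; `ulysses_exchange` is a made-up helper that only emulates the collective with plain array ops, not the sharded implementation in this function.

```python
# Illustrative only -- not part of this commit. Emulates, on one host, the
# all-to-all that Ulysses attention performs across `ns` context shards:
# each shard trades its full set of heads over a local sequence chunk for a
# slice of heads over the full sequence.
import jax.numpy as jnp


def ulysses_exchange(chunks):
  """chunks: one array per context shard, each [batch, heads, seq/ns, dim]."""
  ns = len(chunks)
  h_local = chunks[0].shape[1] // ns
  out = []
  for i in range(ns):
    # Shard i keeps head slice i from every shard's sequence chunk and
    # concatenates the chunks back into the full sequence.
    pieces = [c[:, i * h_local : (i + 1) * h_local] for c in chunks]
    out.append(jnp.concatenate(pieces, axis=2))  # [batch, heads/ns, seq, dim]
  return out


x = jnp.arange(2 * 8 * 16 * 4, dtype=jnp.float32).reshape(2, 8, 16, 4)
shards = ulysses_exchange(jnp.split(x, 4, axis=2))  # 4 context shards
assert shards[0].shape == (2, 2, 16, 4)  # heads/4, full sequence
```

This is also why the divisibility check below exists: the head dimension must split evenly across the context shards.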
@@ -530,7 +531,9 @@ def _ulysses_attention(
         "Ulysses attention requires the number of heads to be divisible by the context shard count, "
         f"got heads={num_heads} and context_shards={num_shards}."
     )
-  block_sizes = _select_flash_block_sizes(query, key, flash_block_sizes, dtype, "flash")
+
+  inner_kernel = "tokamax_flash" if attention_kernel == "tokamax_ulysses" else "flash"
+  block_sizes = _select_flash_block_sizes(query, key, flash_block_sizes, dtype, inner_kernel)
 
   q_axis_names = nn.logical_to_mesh_axes(axis_names_q)
   kv_axis_names = nn.logical_to_mesh_axes(axis_names_kv)
@@ -597,14 +600,26 @@ def wrap_ulysses_attention(query, key, value):
     if not mask_padding_tokens:
       segment_ids = None
 
-    splash_kernel = splash_attention_kernel.make_splash_mha(
-        mask=multi_head_mask,
-        head_shards=1,
-        q_seq_shards=1,
-        block_sizes=block_sizes,
-        save_residuals=False,
-        residual_checkpoint_name=residual_checkpoint_name,
-    )
+    if attention_kernel == "tokamax_ulysses":
+      mask = tokamax_splash_attention_mask.FullMask(
+          _shape=(query.shape[2], key.shape[2]),
+      )
+      splash_kernel = tokamax_splash_attention_kernel.make_splash_mha(
+          mask=mask,
+          q_seq_shards=1,
+          config=convert_to_tokamax_splash_config(block_sizes, residual_checkpoint_name=residual_checkpoint_name),
+          save_residuals=False,
+      )
+    else:
+      splash_kernel = splash_attention_kernel.make_splash_mha(
+          mask=multi_head_mask,
+          head_shards=1,
+          q_seq_shards=1,
+          block_sizes=block_sizes,
+          save_residuals=False,
+          residual_checkpoint_name=residual_checkpoint_name,
+      )
+
     vmapped_splash = jax.vmap(splash_kernel, in_axes=(0, 0, 0, None))
     attention_output = vmapped_splash(query, key, value, segment_ids)
     attention_output = attention_output[:, :, :query_seq_len, :kv_size].astype(query.dtype)
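A note on the `jax.vmap(..., in_axes=(0, 0, 0, None))` call that both branches feed into: the kernel is mapped over the batch axis of query/key/value while `segment_ids` is broadcast to every batch element. Below is a toy stand-in with the same calling convention, for shape checking only; `toy_mha` is plain softmax attention, not the Pallas splash kernel, and it takes a flat per-token array where the real kernels take their own segment-ids structure.

```python
# Toy stand-in for the vmapped splash kernel above -- shape contract only.
import jax
import jax.numpy as jnp


def toy_mha(q, k, v, segment_ids):
  # Per-example shapes: [heads, seq, head_dim].
  logits = jnp.einsum("hqd,hkd->hqk", q, k)
  if segment_ids is not None:
    # Block attention across segment boundaries (e.g. padding vs. real tokens).
    same = segment_ids[:, None] == segment_ids[None, :]
    logits = jnp.where(same[None], logits, jnp.finfo(logits.dtype).min)
  return jnp.einsum("hqk,hkd->hqd", jax.nn.softmax(logits, axis=-1), v)


vmapped = jax.vmap(toy_mha, in_axes=(0, 0, 0, None))  # batch only on q, k, v
q = k = v = jnp.ones((2, 4, 8, 16))  # [batch, heads, seq, head_dim]
out = vmapped(q, k, v, None)
assert out.shape == (2, 4, 8, 16)
```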
@@ -747,7 +762,7 @@ def _apply_attention(
   seq_len_idx = 1
   if query.ndim == 4:
     seq_len_idx = 2
-  if attention_kernel in ["flash", "tokamax_flash", "ulysses"]:
+  if attention_kernel in ["flash", "tokamax_flash", "ulysses", "tokamax_ulysses"]:
     can_use_flash_attention = (
         query.shape[seq_len_idx] >= flash_min_seq_length
         and key.shape[seq_len_idx] >= flash_min_seq_length
@@ -759,7 +774,7 @@ def _apply_attention(
     return _apply_attention_dot(
         query, key, value, dtype, heads, dim_head, scale, split_head_dim, float32_qk_product, use_memory_efficient_attention
     )
-  elif attention_kernel == "ulysses":
+  elif attention_kernel in ["ulysses", "tokamax_ulysses"]:
     return _ulysses_attention(
         query,
         key * scale,
@@ -773,6 +788,7 @@ def _apply_attention(
         mask_padding_tokens=mask_padding_tokens,
         residual_checkpoint_name=residual_checkpoint_name,
         attention_mask=attention_mask,
+        attention_kernel=attention_kernel,
     )
   elif attention_kernel in ["flash", "tokamax_flash"]:
     return _tpu_flash_attention(
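Putting the hunks together, the routing added by this commit can be condensed as below. This is a hypothetical restatement for readability, not code from the repo: `pick_attention_path` and its return convention are invented here, and the real `_apply_attention` also gates the flash paths on `flash_min_seq_length` before dispatching.

```python
# Hypothetical condensation of the dispatch added in this commit; the real
# _apply_attention takes many more arguments and falls back to dot attention
# when the sequence is too short for a flash kernel.
def pick_attention_path(attention_kernel: str) -> tuple[str, str]:
  """Returns (outer attention path, inner flash kernel) for a config string."""
  if attention_kernel in ("ulysses", "tokamax_ulysses"):
    inner = "tokamax_flash" if attention_kernel == "tokamax_ulysses" else "flash"
    return ("_ulysses_attention", inner)
  if attention_kernel in ("flash", "tokamax_flash"):
    return ("_tpu_flash_attention", attention_kernel)
  return ("_apply_attention_dot", "none")


assert pick_attention_path("tokamax_ulysses") == ("_ulysses_attention", "tokamax_flash")
assert pick_attention_path("ulysses") == ("_ulysses_attention", "flash")
```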