
Commit 69d2a30

Merge main
Signed-off-by: Kunjan Patel <kunjanp@google.com>
1 parent 1d21a53 commit 69d2a30

2 files changed: 4 additions & 3 deletions

src/maxdiffusion/models/attention_flax.py

Lines changed: 2 additions & 1 deletion

@@ -302,7 +302,7 @@ def wrap_flash_attention(query, key, value):
      splash_kernel = tokamax_splash_attention_kernel.make_splash_mha(
          mask=mask,
          q_seq_shards=1,  # the size of the axis sharded over seq_len
-         config=convert_to_tokamax_splash_config(block_sizes),
+         config=convert_to_tokamax_splash_config(block_sizes, residual_checkpoint_name=residual_checkpoint_name),
          save_residuals=True if attention_kernel == "ring" else False,
      )
  else:
@@ -312,6 +312,7 @@ def wrap_flash_attention(query, key, value):
          q_seq_shards=1,  # the size of the axis sharded over seq_len
          block_sizes=block_sizes,
          save_residuals=True if attention_kernel == "ring" else False,
+         residual_checkpoint_name=residual_checkpoint_name
      )
      vmapped_splash = jax.vmap(splash_kernel, in_axes=(0, 0, 0, None))
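
For context: residual_checkpoint_name feeds JAX's named-checkpoint machinery, which lets a rematerialization policy save specific intermediates by name instead of recomputing them in the backward pass. Below is a minimal, hypothetical sketch of the idea; toy_splash_attention is invented for illustration and is not the tokamax/splash kernel, whose internals this diff does not show.

    import jax
    import jax.numpy as jnp
    from jax.ad_checkpoint import checkpoint_name

    def toy_splash_attention(q, k, v, residual_checkpoint_name):
        # Stand-in for a flash-attention kernel that saves residuals
        # (e.g. attention probabilities) for the backward pass.
        scores = q @ k.T / jnp.sqrt(q.shape[-1])
        probs = jax.nn.softmax(scores, axis=-1)
        # Tag the residual so a remat policy can refer to it by name.
        probs = checkpoint_name(probs, residual_checkpoint_name)
        return probs @ v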

src/maxdiffusion/models/wan/transformers/transformer_wan.py

Lines changed: 2 additions & 2 deletions

@@ -285,7 +285,7 @@ def __init__(
          attention_kernel=attention,
          dropout=dropout,
          is_self_attention=True,
-         mask_padding_tokens=mask_padding_tokens
+         mask_padding_tokens=mask_padding_tokens,
          residual_checkpoint_name="self_attn",
      )

@@ -306,7 +306,7 @@ def __init__(
          attention_kernel=attention,
          dropout=dropout,
          is_self_attention=False,
-         mask_padding_tokens=mask_padding_tokens
+         mask_padding_tokens=mask_padding_tokens,
          residual_checkpoint_name="cross_attn",
      )
      assert cross_attn_norm is True
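
The "self_attn" and "cross_attn" names passed above pay off when a remat policy selects residuals by name. Here is a usage sketch under the assumption that each layer tags its residuals with these names; block and its toy attention math are invented for illustration.

    import jax
    import jax.numpy as jnp
    from functools import partial
    from jax.ad_checkpoint import checkpoint_name

    # Save only tensors tagged "self_attn" / "cross_attn"; recompute the rest.
    policy = jax.checkpoint_policies.save_only_these_names("self_attn", "cross_attn")

    @partial(jax.checkpoint, policy=policy)
    def block(x):
        h = checkpoint_name(jnp.tanh(x @ x.T) @ x, "self_attn")   # toy self-attention
        h = checkpoint_name(jnp.tanh(h @ h.T) @ h, "cross_attn")  # toy cross-attention
        return x + h

    grads = jax.grad(lambda x: block(x).sum())(jnp.ones((4, 4)))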
