@@ -614,6 +614,7 @@ def _apply_attention(
         attention_kernel,
         mask_padding_tokens=mask_padding_tokens,
         residual_checkpoint_name=residual_checkpoint_name,
+        attention_mask=attention_mask,
     )
   elif "ring" in attention_kernel:
     return _tpu_flash_attention(
@@ -628,6 +629,7 @@ def _apply_attention(
         dtype,
         attention_kernel,
         mask_padding_tokens=mask_padding_tokens,
+        attention_mask=attention_mask,
     )
   elif attention_kernel == "cudnn_flash_te":
     return _cudnn_flash_attention(query, key, value, heads, mesh, dpa_layer)
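For context, a minimal sketch of how an optional `attention_mask` is typically consumed once it reaches an attention kernel: folded into the logits as a large negative bias before the softmax. This is a hedged illustration, not the module's actual `_tpu_flash_attention` internals; the helper name and shapes are assumptions.

```python
import jax
import jax.numpy as jnp

def _masked_dot_product_attention(query, key, value, attention_mask=None):
  """Hypothetical reference path showing how an optional mask is applied.

  query/key/value: [batch, heads, seq, head_dim].
  attention_mask: boolean, broadcastable to [batch, heads, q_seq, kv_seq]
  (e.g. [batch, 1, 1, kv_seq]), True where attending is allowed.
  """
  scale = query.shape[-1] ** -0.5
  logits = jnp.einsum("bhqd,bhkd->bhqk", query, key) * scale
  if attention_mask is not None:
    # Masked positions get a large negative bias so softmax drives them to ~0.
    logits = jnp.where(attention_mask, logits, jnp.finfo(logits.dtype).min)
  weights = jax.nn.softmax(logits, axis=-1)
  return jnp.einsum("bhqk,bhkd->bhqd", weights, value)
```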
@@ -1218,7 +1220,9 @@ def __call__(
       value_proj = checkpoint_name(value_proj, "value_proj")

     with jax.named_scope("apply_attention"):
-      attn_output = self.attention_op.apply_attention(query_proj, key_proj, value_proj)
+      attn_output = self.attention_op.apply_attention(
+          query_proj, key_proj, value_proj, attention_mask=encoder_attention_mask
+      )

   else:
     # NEW PATH for I2V CROSS-ATTENTION
@@ -1462,7 +1466,7 @@ def __call__(self, hidden_states, encoder_hidden_states=None, attention_mask=None
       key_proj = key_proj.transpose(0, 2, 1, 3).reshape(key_proj.shape[0], key_proj.shape[2], -1)
       value_proj = value_proj.transpose(0, 2, 1, 3).reshape(value_proj.shape[0], value_proj.shape[2], -1)

-      attn_output = self.attention_op.apply_attention(query_proj, key_proj, value_proj)
+      attn_output = self.attention_op.apply_attention(query_proj, key_proj, value_proj, attention_mask=attention_mask)
       context_attn_output = None

     if encoder_hidden_states is not None:
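At call sites like the ones above, the mask for cross-attention usually derives from the text encoder's padding. A hedged sketch of building a broadcastable key-side mask from padded token ids; the `pad_token_id` default and the `[batch, 1, 1, kv_seq]` layout are assumptions, not this repository's confirmed convention.

```python
import jax.numpy as jnp

def make_encoder_attention_mask(token_ids, pad_token_id=0):
  """Build a [batch, 1, 1, kv_seq] boolean mask from padded token ids.

  True marks real tokens; the extra singleton axes let the mask broadcast
  against [batch, heads, q_seq, kv_seq] attention logits, so every query
  attends only to non-padding encoder positions.
  """
  keep = token_ids != pad_token_id  # [batch, kv_seq]
  return keep[:, None, None, :]

# Hypothetical usage with the call pattern from the diff:
# attn_output = attention_op.apply_attention(
#     query_proj, key_proj, value_proj,
#     attention_mask=make_encoder_attention_mask(encoder_token_ids),
# )
```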