Trying text_mask 2/3

prishajain1 · prishajain1 · commit 4907d0855974 · 2026-01-11T20:23:06.000+05:30
diff --git a/src/maxdiffusion/models/attention_flax.py b/src/maxdiffusion/models/attention_flax.py
@@ -1110,7 +1110,10 @@ def __call__(
       value_proj = checkpoint_name(value_proj, "value_proj")
 
       with jax.named_scope("apply_attention"):
-        attn_output = self.attention_op.apply_attention(query_proj, key_proj, value_proj)
+        if is_self_attention:
+          attn_output = self.attention_op.apply_attention(query_proj, key_proj, value_proj)
+        else:
+          attn_output = self.attention_op.apply_attention(query_proj, key_proj, value_proj, attention_mask=encoder_attention_mask)
 
     else:
       # NEW PATH for I2V CROSS-ATTENTION
diff --git a/src/maxdiffusion/models/wan/transformers/transformer_wan.py b/src/maxdiffusion/models/wan/transformers/transformer_wan.py
@@ -587,6 +587,7 @@ def __call__(
       timestep: jax.Array,
       encoder_hidden_states: jax.Array,
       encoder_hidden_states_image: Optional[jax.Array] = None,
+      encoder_attention_mask: Optional[jax.Array] = None,
       return_dict: bool = True,
       attention_kwargs: Optional[Dict[str, Any]] = None,
       deterministic: bool = True,
@@ -606,17 +607,30 @@ def __call__(
       hidden_states = self.patch_embedding(hidden_states)
       hidden_states = jax.lax.collapse(hidden_states, 1, -1)
     with self.conditional_named_scope("condition_embedder"):
-      temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image, encoder_attention_mask = self.condition_embedder(
+      temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image, image_attention_mask = self.condition_embedder(
           timestep, encoder_hidden_states, encoder_hidden_states_image
       )
     timestep_proj = timestep_proj.reshape(timestep_proj.shape[0], 6, -1)
 
+    # Handle masks for T2V vs I2V
     if encoder_hidden_states_image is not None:
+        # I2V case: concatenate image and text embeddings
         encoder_hidden_states = jnp.concatenate([encoder_hidden_states_image, encoder_hidden_states], axis=1)
-        if encoder_attention_mask is not None:
-            text_mask = jnp.ones((encoder_hidden_states.shape[0], encoder_hidden_states.shape[1] - encoder_hidden_states_image.shape[1]), dtype=jnp.int32)
-            encoder_attention_mask = jnp.concatenate([encoder_attention_mask, text_mask], axis=1)
+        
+        # Build combined mask: [image_mask | text_mask]
+        if image_attention_mask is not None:
+            # We have image mask from embedder
+            if encoder_attention_mask is not None:
+                # Use passed text mask (from pipeline)
+                combined_mask = jnp.concatenate([image_attention_mask, encoder_attention_mask], axis=1)
+            else:
+                # No text mask passed, use all-ones (old behavior for backward compat)
+                text_len = encoder_hidden_states.shape[1] - image_attention_mask.shape[1]
+                text_mask = jnp.ones((encoder_hidden_states.shape[0], text_len), dtype=jnp.int32)
+                combined_mask = jnp.concatenate([image_attention_mask, text_mask], axis=1)
+            encoder_attention_mask = combined_mask
         encoder_hidden_states = encoder_hidden_states.astype(hidden_states.dtype)
+    # For T2V: encoder_attention_mask is already the text mask passed from pipeline
 
     if self.scan_layers:
 
diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline.py b/src/maxdiffusion/pipelines/wan/wan_pipeline.py
@@ -777,13 +777,13 @@ def transformer_forward_pass(
   # jax.debug.print("[DEBUG transformer_forward_pass] encoder_attention_mask shape: {}", 
   #                 encoder_attention_mask.shape if encoder_attention_mask is not None else "None")
   
-  # For now, DON'T pass the mask - just accept it
+  # Now actually pass the mask to the transformer
   noise_pred = wan_transformer(
       hidden_states=latents, 
       timestep=timestep, 
       encoder_hidden_states=prompt_embeds, 
-      encoder_hidden_states_image=encoder_hidden_states_image
-      # encoder_attention_mask=encoder_attention_mask  # TODO: Add this next
+      encoder_hidden_states_image=encoder_hidden_states_image,
+      encoder_attention_mask=encoder_attention_mask
   )
   if do_classifier_free_guidance:
     bsz = latents.shape[0] // 2