
Commit bfec1e4

text attn fix
1 parent 7543d00 commit bfec1e4

4 files changed: 67 additions & 15 deletions


src/maxdiffusion/models/attention_flax.py

Lines changed: 6 additions & 3 deletions
@@ -1110,7 +1110,7 @@ def __call__(
         value_proj = checkpoint_name(value_proj, "value_proj")
 
         with jax.named_scope("apply_attention"):
-          attn_output = self.attention_op.apply_attention(query_proj, key_proj, value_proj)
+          attn_output = self.attention_op.apply_attention(query_proj, key_proj, value_proj, attention_mask=attention_mask)
 
       else:
         # NEW PATH for I2V CROSS-ATTENTION
@@ -1134,14 +1134,17 @@ def __call__(
         # It contains the image mask: [1]*257 + [0]*127 for 257 real image tokens padded to 384
         if encoder_attention_mask is not None:
           encoder_attention_mask_img = encoder_attention_mask[:, :padded_img_len]
+          encoder_attention_mask_text = encoder_attention_mask[:, padded_img_len:]
         else:
           # Fallback: no mask means treat all as valid
           encoder_attention_mask_img = None
+          encoder_attention_mask_text = None
       else:
         # If no image_seq_len is specified, treat all as text
         encoder_hidden_states_img = None
         encoder_hidden_states_text = encoder_hidden_states
         encoder_attention_mask_img = None
+        encoder_attention_mask_text = encoder_attention_mask
 
       if self.qk_norm:
         with self.conditional_named_scope("attn_q_norm"):
@@ -1179,7 +1182,7 @@ def __call__(
 
       # Attention - tensors are (B, S, D)
       with self.conditional_named_scope("cross_attn_text_apply"):
-        attn_output_text = self.attention_op.apply_attention(query_proj_text, key_proj_text, value_proj_text)
+        attn_output_text = self.attention_op.apply_attention(query_proj_text, key_proj_text, value_proj_text, attention_mask=encoder_attention_mask_text)
       with self.conditional_named_scope("cross_attn_img_apply"):
         # Pass encoder_attention_mask_img for image cross-attention to mask padded tokens
         attn_output_img = self.attention_op.apply_attention(query_proj_img, key_proj_img, value_proj_img, attention_mask=encoder_attention_mask_img)
@@ -1192,7 +1195,7 @@ def __call__(
       value_proj_text = checkpoint_name(value_proj_text, "value_proj_text")
 
       with self.conditional_named_scope("cross_attn_text_apply"):
-        attn_output = self.attention_op.apply_attention(query_proj_text, key_proj_text, value_proj_text)
+        attn_output = self.attention_op.apply_attention(query_proj_text, key_proj_text, value_proj_text, attention_mask=encoder_attention_mask_text)
 
     attn_output = attn_output.astype(dtype=dtype)
     attn_output = checkpoint_name(attn_output, "attn_output")
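The single-stream path now threads attention_mask into apply_attention, and both text cross-attention call sites pick up encoder_attention_mask_text. The operator's internals are outside this diff; as a rough sketch of the standard technique (illustrative names and shapes, not maxdiffusion's actual apply_attention), a 0/1 key mask is folded into the attention logits before the softmax:

    import jax
    import jax.numpy as jnp

    def masked_attention(q, k, v, mask=None):
      # q, k, v: (batch, seq, heads, head_dim); mask: (batch, kv_seq) with 1 = real token
      logits = jnp.einsum("bqhd,bkhd->bhqk", q, k) * (q.shape[-1] ** -0.5)
      if mask is not None:
        # Broadcast the key mask over heads and query positions; masked keys are
        # pushed to a large negative value so softmax gives them ~zero weight.
        logits = jnp.where(mask[:, None, None, :] == 1, logits, jnp.finfo(logits.dtype).min)
      weights = jax.nn.softmax(logits, axis=-1)
      return jnp.einsum("bhqk,bkhd->bqhd", weights, v)

    q = k = v = jnp.ones((1, 4, 2, 8))
    mask = jnp.array([[1, 1, 1, 0]])  # last key position is padding
    out = masked_attention(q, k, v, mask)  # (1, 4, 2, 8)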

src/maxdiffusion/models/wan/transformers/transformer_wan.py

Lines changed: 18 additions & 4 deletions
@@ -587,6 +587,7 @@ def __call__(
       timestep: jax.Array,
       encoder_hidden_states: jax.Array,
       encoder_hidden_states_image: Optional[jax.Array] = None,
+      encoder_attention_mask: Optional[jax.Array] = None,
       return_dict: bool = True,
       attention_kwargs: Optional[Dict[str, Any]] = None,
       deterministic: bool = True,
@@ -606,17 +607,30 @@ def __call__(
     hidden_states = self.patch_embedding(hidden_states)
     hidden_states = jax.lax.collapse(hidden_states, 1, -1)
     with self.conditional_named_scope("condition_embedder"):
-      temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image, encoder_attention_mask = self.condition_embedder(
+      temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image, image_attention_mask = self.condition_embedder(
           timestep, encoder_hidden_states, encoder_hidden_states_image
       )
     timestep_proj = timestep_proj.reshape(timestep_proj.shape[0], 6, -1)
 
+    # Handle attention mask for I2V vs T2V
     if encoder_hidden_states_image is not None:
+      # I2V case: concatenate [image | text] embeddings
       encoder_hidden_states = jnp.concatenate([encoder_hidden_states_image, encoder_hidden_states], axis=1)
-      if encoder_attention_mask is not None:
-        text_mask = jnp.ones((encoder_hidden_states.shape[0], encoder_hidden_states.shape[1] - encoder_hidden_states_image.shape[1]), dtype=jnp.int32)
-        encoder_attention_mask = jnp.concatenate([encoder_attention_mask, text_mask], axis=1)
+
+      # Build combined attention mask: [image_mask | text_mask]
+      if image_attention_mask is not None:
+        # Image mask from embedder (e.g., [1]*257 + [0]*127 for padded image)
+        if encoder_attention_mask is not None:
+          # Use the text mask passed from the pipeline
+          combined_mask = jnp.concatenate([image_attention_mask, encoder_attention_mask], axis=1)
+        else:
+          # No text mask provided, assume all text tokens are valid (old behavior)
+          text_len = encoder_hidden_states.shape[1] - image_attention_mask.shape[1]
+          text_mask = jnp.ones((encoder_hidden_states.shape[0], text_len), dtype=jnp.int32)
+          combined_mask = jnp.concatenate([image_attention_mask, text_mask], axis=1)
+        encoder_attention_mask = combined_mask
     encoder_hidden_states = encoder_hidden_states.astype(hidden_states.dtype)
+    # For T2V: encoder_attention_mask is already the text mask passed from the pipeline
 
     if self.scan_layers:
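A quick standalone check of the combined-mask construction, using the shapes the diff comments mention (257 real image tokens padded to 384); the 512-token all-valid text mask is a hypothetical stand-in for the mask the pipeline passes in:

    import jax.numpy as jnp

    batch, padded_img_len, real_img_len, text_len = 1, 384, 257, 512
    image_attention_mask = jnp.concatenate(
        [jnp.ones((batch, real_img_len), dtype=jnp.int32),
         jnp.zeros((batch, padded_img_len - real_img_len), dtype=jnp.int32)],
        axis=1,
    )
    encoder_attention_mask = jnp.ones((batch, text_len), dtype=jnp.int32)  # text mask from the pipeline

    combined_mask = jnp.concatenate([image_attention_mask, encoder_attention_mask], axis=1)
    assert combined_mask.shape == (batch, padded_img_len + text_len)  # (1, 896)
    assert int(combined_mask.sum()) == real_img_len + text_len        # only real tokens attended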
src/maxdiffusion/pipelines/wan/wan_pipeline.py

Lines changed: 34 additions & 7 deletions
@@ -440,12 +440,24 @@ def _get_t5_prompt_embeds(
         [torch.cat([u, u.new_zeros(max_sequence_length - u.size(0), u.size(1))]) for u in prompt_embeds], dim=0
     )
 
+    # Create attention mask: 1 for real tokens, 0 for padded tokens
+    text_attention_mask = torch.zeros((batch_size, max_sequence_length), dtype=torch.long)
+    for i, seq_len_i in enumerate(seq_lens):
+      text_attention_mask[i, :seq_len_i] = 1
+
     # duplicate text embeddings for each generation per prompt, using mps friendly method
     _, seq_len, _ = prompt_embeds.shape
     prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
     prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
+
+    # Duplicate attention mask for each generation per prompt
+    text_attention_mask = text_attention_mask.repeat(1, num_videos_per_prompt)
+    text_attention_mask = text_attention_mask.view(batch_size * num_videos_per_prompt, max_sequence_length)
+
+    # Convert to JAX array
+    text_attention_mask = jnp.array(text_attention_mask.numpy())
 
-    return prompt_embeds
+    return prompt_embeds, text_attention_mask
 
   def encode_prompt(
       self,
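The per-row loop above is clear and runs once per batch; for reference, the same mask can be built without the Python loop. A minimal sketch with hypothetical seq_lens values, using only the torch and jax.numpy imports the file already relies on:

    import torch
    import jax.numpy as jnp

    seq_lens = torch.tensor([7, 12])  # hypothetical per-prompt real token counts
    max_sequence_length = 16
    positions = torch.arange(max_sequence_length).unsqueeze(0)        # (1, L)
    text_attention_mask = (positions < seq_lens.unsqueeze(1)).long()  # (B, L), 1 = real token
    text_attention_mask = jnp.array(text_attention_mask.numpy())      # hand off to JAX, as above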
@@ -459,24 +471,28 @@ def encode_prompt(
     prompt = [prompt] if isinstance(prompt, str) else prompt
     batch_size = len(prompt)
     if prompt_embeds is None:
-      prompt_embeds = self._get_t5_prompt_embeds(
+      prompt_embeds, text_attention_mask = self._get_t5_prompt_embeds(
           prompt=prompt,
           num_videos_per_prompt=num_videos_per_prompt,
           max_sequence_length=max_sequence_length,
       )
       prompt_embeds = jnp.array(prompt_embeds.detach().numpy(), dtype=jnp.float32)
+    else:
+      text_attention_mask = None
 
     if negative_prompt_embeds is None:
       negative_prompt = negative_prompt or ""
       negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
-      negative_prompt_embeds = self._get_t5_prompt_embeds(
+      negative_prompt_embeds, negative_text_attention_mask = self._get_t5_prompt_embeds(
           prompt=negative_prompt,
           num_videos_per_prompt=num_videos_per_prompt,
           max_sequence_length=max_sequence_length,
       )
       negative_prompt_embeds = jnp.array(negative_prompt_embeds.detach().numpy(), dtype=jnp.float32)
+    else:
+      negative_text_attention_mask = None
 
-    return prompt_embeds, negative_prompt_embeds
+    return prompt_embeds, negative_prompt_embeds, text_attention_mask, negative_text_attention_mask
 
   def prepare_latents(
       self,
@@ -687,7 +703,7 @@ def _prepare_model_inputs(
     batch_size = len(prompt)
 
     with jax.named_scope("Encode-Prompt"):
-      prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+      prompt_embeds, negative_prompt_embeds, text_attention_mask, negative_text_attention_mask = self.encode_prompt(
           prompt=prompt,
           negative_prompt=negative_prompt,
           max_sequence_length=max_sequence_length,
@@ -715,12 +731,16 @@ def _prepare_model_inputs(
     latents = jax.device_put(latents, data_sharding)
     prompt_embeds = jax.device_put(prompt_embeds, data_sharding)
     negative_prompt_embeds = jax.device_put(negative_prompt_embeds, data_sharding)
+    if text_attention_mask is not None:
+      text_attention_mask = jax.device_put(text_attention_mask, data_sharding)
+    if negative_text_attention_mask is not None:
+      negative_text_attention_mask = jax.device_put(negative_text_attention_mask, data_sharding)
 
     scheduler_state = self.scheduler.set_timesteps(
         self.scheduler_state, num_inference_steps=num_inference_steps, shape=latents.shape
     )
 
-    return latents, prompt_embeds, negative_prompt_embeds, scheduler_state, num_frames
+    return latents, prompt_embeds, negative_prompt_embeds, text_attention_mask, negative_text_attention_mask, scheduler_state, num_frames
 
   @abstractmethod
   def __call__(self, **kwargs):
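The masks follow the same jax.device_put path as the embeddings. For context, a minimal sketch of the device_put-with-NamedSharding pattern; the mesh construction and axis name here are illustrative, not the repository's actual data_sharding config:

    import numpy as np
    import jax
    import jax.numpy as jnp
    from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

    mesh = Mesh(np.array(jax.devices()), axis_names=("data",))
    data_sharding = NamedSharding(mesh, P("data"))  # shard the batch dimension
    mask = jnp.ones((8, 512), dtype=jnp.int32)      # e.g. (batch, max_sequence_length)
    mask = jax.device_put(mask, data_sharding)      # now laid out across the mesh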
@@ -738,9 +758,16 @@ def transformer_forward_pass(
     do_classifier_free_guidance,
     guidance_scale,
     encoder_hidden_states_image=None,
+    encoder_attention_mask=None,
 ):
   wan_transformer = nnx.merge(graphdef, sharded_state, rest_of_state)
-  noise_pred = wan_transformer(hidden_states=latents, timestep=timestep, encoder_hidden_states=prompt_embeds, encoder_hidden_states_image=encoder_hidden_states_image)
+  noise_pred = wan_transformer(
+      hidden_states=latents,
+      timestep=timestep,
+      encoder_hidden_states=prompt_embeds,
+      encoder_hidden_states_image=encoder_hidden_states_image,
+      encoder_attention_mask=encoder_attention_mask,
+  )
   if do_classifier_free_guidance:
     bsz = latents.shape[0] // 2
     noise_cond = noise_pred[:bsz]  # First half = conditional
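transformer_forward_pass runs the conditional and unconditional batches stacked along axis 0 (noise_pred[:bsz] is the conditional half, per the comment). The blend itself sits outside this hunk; a sketch of the standard classifier-free-guidance combination that typically follows such a split, under the assumption that is what the elided lines do:

    import jax.numpy as jnp

    def apply_cfg(noise_pred, guidance_scale):
      # Split the stacked prediction back into its halves and blend them.
      bsz = noise_pred.shape[0] // 2
      noise_cond, noise_uncond = noise_pred[:bsz], noise_pred[bsz:]
      return noise_uncond + guidance_scale * (noise_cond - noise_uncond)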

src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py

Lines changed: 9 additions & 1 deletion
@@ -89,7 +89,7 @@ def __call__(
       negative_prompt_embeds: Optional[jax.Array] = None,
       vae_only: bool = False,
   ):
-    latents, prompt_embeds, negative_prompt_embeds, scheduler_state, num_frames = self._prepare_model_inputs(
+    latents, prompt_embeds, negative_prompt_embeds, text_attention_mask, negative_text_attention_mask, scheduler_state, num_frames = self._prepare_model_inputs(
         prompt,
         negative_prompt,
         height,
@@ -122,6 +122,8 @@ def __call__(
         latents=latents,
         prompt_embeds=prompt_embeds,
         negative_prompt_embeds=negative_prompt_embeds,
+        text_attention_mask=text_attention_mask,
+        negative_text_attention_mask=negative_text_attention_mask,
     )
     latents = self._denormalize_latents(latents)
     return self._decode_latents_to_video(latents)
@@ -137,10 +139,15 @@ def run_inference_2_1(
     num_inference_steps: int,
     scheduler: FlaxUniPCMultistepScheduler,
     scheduler_state,
+    text_attention_mask: Optional[jax.Array] = None,
+    negative_text_attention_mask: Optional[jax.Array] = None,
 ):
   do_classifier_free_guidance = guidance_scale > 1.0
   if do_classifier_free_guidance:
     prompt_embeds = jnp.concatenate([prompt_embeds, negative_prompt_embeds], axis=0)
+    # Concatenate text attention masks for CFG
+    if text_attention_mask is not None and negative_text_attention_mask is not None:
+      text_attention_mask = jnp.concatenate([text_attention_mask, negative_text_attention_mask], axis=0)
   for step in range(num_inference_steps):
     t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step]
     if do_classifier_free_guidance:
@@ -156,6 +163,7 @@ def run_inference_2_1(
         prompt_embeds,
         do_classifier_free_guidance=do_classifier_free_guidance,
         guidance_scale=guidance_scale,
+        encoder_attention_mask=text_attention_mask,
     )
 
     latents, scheduler_state = scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple()
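A small shape check (hypothetical sizes) that the mask concatenation above keeps the masks row-aligned with the already-doubled prompt_embeds batch under CFG:

    import jax.numpy as jnp

    bsz, seq = 2, 512
    text_attention_mask = jnp.ones((bsz, seq), dtype=jnp.int32)
    negative_text_attention_mask = jnp.ones((bsz, seq), dtype=jnp.int32)
    prompt_embeds = jnp.zeros((2 * bsz, seq, 4096))  # already [cond | uncond] stacked

    text_attention_mask = jnp.concatenate([text_attention_mask, negative_text_attention_mask], axis=0)
    assert text_attention_mask.shape[0] == prompt_embeds.shape[0]  # masks track the doubled batch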