Fix text attention mask: propagate the prompt padding mask from the WAN pipelines into the transformer

prishajain1 · prishajain1 · commit 3538f1a3041e · 2026-01-11T13:49:58.000+05:30
diff --git a/src/maxdiffusion/models/wan/transformers/transformer_wan.py b/src/maxdiffusion/models/wan/transformers/transformer_wan.py
@@ -151,18 +151,24 @@ def __init__(
       )
 
   def __call__(
-      self, timestep: jax.Array, encoder_hidden_states: jax.Array, encoder_hidden_states_image: Optional[jax.Array] = None
+      self, timestep: jax.Array, encoder_hidden_states: jax.Array, encoder_hidden_states_image: Optional[jax.Array] = None, text_attention_mask: Optional[jax.Array] = None
   ):
     timestep = self.timesteps_proj(timestep)
     temb = self.time_embedder(timestep)
     with jax.named_scope("time_proj"):
       timestep_proj = self.time_proj(self.act_fn(temb))
 
     encoder_hidden_states = self.text_embedder(encoder_hidden_states)
-    encoder_attention_mask = None
+    # Start with text attention mask (can be None for backward compatibility)
+    encoder_attention_mask = text_attention_mask
+    
     if encoder_hidden_states_image is not None:
-      encoder_hidden_states_image, encoder_attention_mask = self.image_embedder(encoder_hidden_states_image)
-    return temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image, encoder_attention_mask
+      # For I2V: image embedder returns image embeddings and image mask
+      encoder_hidden_states_image, image_attention_mask = self.image_embedder(encoder_hidden_states_image)
+      # Store image mask separately - will be concatenated with text mask in WanModel
+      encoder_attention_mask = image_attention_mask
+    
+    return temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image, encoder_attention_mask, text_attention_mask
 
 
 class ApproximateGELU(nnx.Module):
@@ -373,6 +379,7 @@ def __call__(
       rotary_emb: jax.Array,
       deterministic: bool = True,
       rngs: nnx.Rngs = None,
+      encoder_attention_mask: Optional[jax.Array] = None,
   ):
     with self.conditional_named_scope("transformer_block"):
       shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = jnp.split(
@@ -409,6 +416,7 @@ def __call__(
               encoder_hidden_states=encoder_hidden_states,
               deterministic=deterministic,
               rngs=rngs,
+              encoder_attention_mask=encoder_attention_mask,
           )
         with self.conditional_named_scope("cross_attn_residual"):
           hidden_states = hidden_states + attn_output
@@ -585,6 +593,7 @@ def __call__(
       timestep: jax.Array,
       encoder_hidden_states: jax.Array,
       encoder_hidden_states_image: Optional[jax.Array] = None,
+      text_attention_mask: Optional[jax.Array] = None,
       return_dict: bool = True,
       attention_kwargs: Optional[Dict[str, Any]] = None,
       deterministic: bool = True,
@@ -604,24 +613,36 @@ def __call__(
       hidden_states = self.patch_embedding(hidden_states)
       hidden_states = jax.lax.collapse(hidden_states, 1, -1)
     with self.conditional_named_scope("condition_embedder"):
-      temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image, encoder_attention_mask = self.condition_embedder(
-          timestep, encoder_hidden_states, encoder_hidden_states_image
+      temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image, encoder_attention_mask, text_mask_from_embedder = self.condition_embedder(
+          timestep, encoder_hidden_states, encoder_hidden_states_image, text_attention_mask
       )
     timestep_proj = timestep_proj.reshape(timestep_proj.shape[0], 6, -1)
 
+    # Use text_attention_mask if provided, otherwise fall back to text_mask_from_embedder
+    if text_attention_mask is None:
+        text_attention_mask = text_mask_from_embedder
+
     if encoder_hidden_states_image is not None:
+        # I2V case: concatenate image + text embeddings and their masks
         encoder_hidden_states = jnp.concatenate([encoder_hidden_states_image, encoder_hidden_states], axis=1)
-        if encoder_attention_mask is not None:
-            text_mask = jnp.ones((encoder_hidden_states.shape[0], encoder_hidden_states.shape[1] - encoder_hidden_states_image.shape[1]), dtype=jnp.int32)
-            encoder_attention_mask = jnp.concatenate([encoder_attention_mask, text_mask], axis=1)
+        if encoder_attention_mask is not None or text_attention_mask is not None:
+            # The mask must cover [image tokens, text tokens] like the states concatenated above; a missing side defaults to all ones.
+            batch, image_len = encoder_hidden_states.shape[0], encoder_hidden_states_image.shape[1]
+            text_len = encoder_hidden_states.shape[1] - image_len
+            image_mask = encoder_attention_mask if encoder_attention_mask is not None else jnp.ones((batch, image_len), dtype=jnp.int32)
+            text_mask = text_attention_mask if text_attention_mask is not None else jnp.ones((batch, text_len), dtype=jnp.int32)
+            encoder_attention_mask = jnp.concatenate([image_mask, text_mask], axis=1)
         encoder_hidden_states = encoder_hidden_states.astype(hidden_states.dtype)
+    elif text_attention_mask is not None:
+        # T2V case: only text, use text mask directly
+        encoder_attention_mask = text_attention_mask
 
     if self.scan_layers:
 
       def scan_fn(carry, block):
         hidden_states_carry, rngs_carry = carry
         hidden_states = block(
-            hidden_states_carry, encoder_hidden_states, timestep_proj, rotary_emb, deterministic, rngs_carry
+            hidden_states_carry, encoder_hidden_states, timestep_proj, rotary_emb, deterministic, rngs_carry, encoder_attention_mask
         )
         new_carry = (hidden_states, rngs_carry)
         return new_carry, None
diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline.py b/src/maxdiffusion/pipelines/wan/wan_pipeline.py
@@ -419,9 +419,6 @@ def _get_t5_prompt_embeds(
       num_videos_per_prompt: int = 1,
       max_sequence_length: int = 226,
   ):
-    jax.debug.print("Prompt Shape")
-    jax.debug.print(f"Length of prompt list: {len(prompt)}")
-
     prompt = [prompt] if isinstance(prompt, str) else prompt
     prompt = [prompt_clean(u) for u in prompt]
     batch_size = len(prompt)
@@ -436,25 +433,29 @@ def _get_t5_prompt_embeds(
         return_tensors="pt",
     )
     text_input_ids, mask = text_inputs.input_ids, text_inputs.attention_mask
-    print("Text Input IDS")
-    print(text_input_ids)
-    print(text_input_ids.shape)
-    print("Mask")
-    print(mask)
-    print(mask.shape)
     seq_lens = mask.gt(0).sum(dim=1).long()
     prompt_embeds = self.text_encoder(text_input_ids, mask).last_hidden_state
     prompt_embeds = [u[:v] for u, v in zip(prompt_embeds, seq_lens)]
     prompt_embeds = torch.stack(
         [torch.cat([u, u.new_zeros(max_sequence_length - u.size(0), u.size(1))]) for u in prompt_embeds], dim=0
     )
 
+    # Create attention mask: 1 for real tokens, 0 for padded tokens.
+    # Row i has ones in its first seq_lens[i] positions, matching the trim-and-zero-pad applied to prompt_embeds above.
+    positions = torch.arange(max_sequence_length, device=mask.device)
+    # Broadcast (1, max_sequence_length) against (batch, 1) to build every row at once instead of looping over the batch.
+    text_attention_mask = (positions.unsqueeze(0) < seq_lens.unsqueeze(1)).to(torch.int32)
+
     # duplicate text embeddings for each generation per prompt, using mps friendly method
     _, seq_len, _ = prompt_embeds.shape
     prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
     prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
+    
+    # duplicate attention mask for each generation per prompt
+    text_attention_mask = text_attention_mask.repeat(1, num_videos_per_prompt)
+    text_attention_mask = text_attention_mask.view(batch_size * num_videos_per_prompt, seq_len)
 
-    return prompt_embeds
+    return prompt_embeds, text_attention_mask
 
   def encode_prompt(
       self,
@@ -467,25 +468,30 @@ def encode_prompt(
   ):
     prompt = [prompt] if isinstance(prompt, str) else prompt
     batch_size = len(prompt)
+    text_attention_mask = None
+    negative_text_attention_mask = None
+    
     if prompt_embeds is None:
-      prompt_embeds = self._get_t5_prompt_embeds(
+      prompt_embeds, text_attention_mask = self._get_t5_prompt_embeds(
           prompt=prompt,
           num_videos_per_prompt=num_videos_per_prompt,
           max_sequence_length=max_sequence_length,
       )
       prompt_embeds = jnp.array(prompt_embeds.detach().numpy(), dtype=jnp.float32)
+      text_attention_mask = jnp.array(text_attention_mask.detach().numpy(), dtype=jnp.int32)
 
     if negative_prompt_embeds is None:
       negative_prompt = negative_prompt or ""
       negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
-      negative_prompt_embeds = self._get_t5_prompt_embeds(
+      negative_prompt_embeds, negative_text_attention_mask = self._get_t5_prompt_embeds(
           prompt=negative_prompt,
           num_videos_per_prompt=num_videos_per_prompt,
           max_sequence_length=max_sequence_length,
       )
       negative_prompt_embeds = jnp.array(negative_prompt_embeds.detach().numpy(), dtype=jnp.float32)
+      negative_text_attention_mask = jnp.array(negative_text_attention_mask.detach().numpy(), dtype=jnp.int32)
 
-    return prompt_embeds, negative_prompt_embeds
+    return prompt_embeds, negative_prompt_embeds, text_attention_mask, negative_text_attention_mask
 
   def prepare_latents(
       self,
@@ -617,7 +623,7 @@ def _prepare_model_inputs_i2v(
     effective_batch_size = batch_size * num_videos_per_prompt
 
     # 1. Encode Prompts
-    prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+    prompt_embeds, negative_prompt_embeds, text_attention_mask, negative_text_attention_mask = self.encode_prompt(
         prompt=prompt,
         negative_prompt=negative_prompt,
         num_videos_per_prompt=num_videos_per_prompt,
@@ -662,8 +668,10 @@ def _prepare_model_inputs_i2v(
     prompt_embeds = jax.device_put(prompt_embeds, data_sharding)
     negative_prompt_embeds = jax.device_put(negative_prompt_embeds, data_sharding)
     image_embeds = jax.device_put(image_embeds, data_sharding)
+    text_attention_mask = jax.device_put(text_attention_mask, data_sharding)
+    negative_text_attention_mask = jax.device_put(negative_text_attention_mask, data_sharding)
 
-    return prompt_embeds, negative_prompt_embeds, image_embeds, effective_batch_size
+    return prompt_embeds, negative_prompt_embeds, image_embeds, text_attention_mask, negative_text_attention_mask, effective_batch_size
 
 
   def _prepare_model_inputs(
@@ -696,7 +704,7 @@ def _prepare_model_inputs(
       batch_size = len(prompt)
 
       with jax.named_scope("Encode-Prompt"):
-        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+        prompt_embeds, negative_prompt_embeds, text_attention_mask, negative_text_attention_mask = self.encode_prompt(
             prompt=prompt,
             negative_prompt=negative_prompt,
             max_sequence_length=max_sequence_length,
@@ -724,12 +732,14 @@ def _prepare_model_inputs(
       latents = jax.device_put(latents, data_sharding)
       prompt_embeds = jax.device_put(prompt_embeds, data_sharding)
       negative_prompt_embeds = jax.device_put(negative_prompt_embeds, data_sharding)
+      text_attention_mask = jax.device_put(text_attention_mask, data_sharding)
+      negative_text_attention_mask = jax.device_put(negative_text_attention_mask, data_sharding)
 
       scheduler_state = self.scheduler.set_timesteps(
           self.scheduler_state, num_inference_steps=num_inference_steps, shape=latents.shape
       )
 
-      return latents, prompt_embeds, negative_prompt_embeds, scheduler_state, num_frames
+      return latents, prompt_embeds, negative_prompt_embeds, text_attention_mask, negative_text_attention_mask, scheduler_state, num_frames
 
   @abstractmethod
   def __call__(self, **kwargs):
@@ -747,9 +757,10 @@ def transformer_forward_pass(
     do_classifier_free_guidance,
     guidance_scale,
     encoder_hidden_states_image=None,
+    text_attention_mask=None,
 ):
   wan_transformer = nnx.merge(graphdef, sharded_state, rest_of_state)
-  noise_pred = wan_transformer(hidden_states=latents, timestep=timestep, encoder_hidden_states=prompt_embeds, encoder_hidden_states_image=encoder_hidden_states_image)
+  noise_pred = wan_transformer(hidden_states=latents, timestep=timestep, encoder_hidden_states=prompt_embeds, encoder_hidden_states_image=encoder_hidden_states_image, text_attention_mask=text_attention_mask)
   if do_classifier_free_guidance:
     bsz = latents.shape[0] // 2
     noise_cond = noise_pred[:bsz]  # First half = conditional
diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py b/src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py
@@ -89,7 +89,7 @@ def __call__(
     negative_prompt_embeds: Optional[jax.Array] = None,
     vae_only: bool = False,
   ):
-    latents, prompt_embeds, negative_prompt_embeds, scheduler_state, num_frames = self._prepare_model_inputs(
+    latents, prompt_embeds, negative_prompt_embeds, text_attention_mask, negative_text_attention_mask, scheduler_state, num_frames = self._prepare_model_inputs(
         prompt,
         negative_prompt,
         height,
@@ -122,6 +122,8 @@ def __call__(
           latents=latents,
           prompt_embeds=prompt_embeds,
           negative_prompt_embeds=negative_prompt_embeds,
+          text_attention_mask=text_attention_mask,
+          negative_text_attention_mask=negative_text_attention_mask,
       )
       latents = self._denormalize_latents(latents)
     return self._decode_latents_to_video(latents)
@@ -133,6 +135,8 @@ def run_inference_2_1(
     latents: jnp.array,
     prompt_embeds: jnp.array,
     negative_prompt_embeds: jnp.array,
+    text_attention_mask: jnp.array,
+    negative_text_attention_mask: jnp.array,
     guidance_scale: float,
     num_inference_steps: int,
     scheduler: FlaxUniPCMultistepScheduler,
@@ -141,6 +145,7 @@ def run_inference_2_1(
   do_classifier_free_guidance = guidance_scale > 1.0
   if do_classifier_free_guidance:
     prompt_embeds = jnp.concatenate([prompt_embeds, negative_prompt_embeds], axis=0)
+    text_attention_mask = None if text_attention_mask is None or negative_text_attention_mask is None else jnp.concatenate([text_attention_mask, negative_text_attention_mask], axis=0)  # masks are None when precomputed embeds were supplied; fall back to no masking
   for step in range(num_inference_steps):
     t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step]
     if do_classifier_free_guidance:
@@ -156,6 +161,7 @@ def run_inference_2_1(
         prompt_embeds,
         do_classifier_free_guidance=do_classifier_free_guidance,
         guidance_scale=guidance_scale,
+        text_attention_mask=text_attention_mask,
     )
 
     latents, scheduler_state = scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple()
diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline_2_2.py b/src/maxdiffusion/pipelines/wan/wan_pipeline_2_2.py
@@ -100,7 +100,7 @@ def __call__(
     negative_prompt_embeds: jax.Array = None,
     vae_only: bool = False,
   ):
-    latents, prompt_embeds, negative_prompt_embeds, scheduler_state, num_frames = self._prepare_model_inputs(
+    latents, prompt_embeds, negative_prompt_embeds, text_attention_mask, negative_text_attention_mask, scheduler_state, num_frames = self._prepare_model_inputs(
         prompt,
         negative_prompt,
         height,
@@ -139,6 +139,8 @@ def __call__(
           latents=latents,
           prompt_embeds=prompt_embeds,
           negative_prompt_embeds=negative_prompt_embeds,
+          text_attention_mask=text_attention_mask,
+          negative_text_attention_mask=negative_text_attention_mask,
       )
       latents = self._denormalize_latents(latents)
     return self._decode_latents_to_video(latents)
@@ -153,6 +155,8 @@ def run_inference_2_2(
     latents: jnp.array,
     prompt_embeds: jnp.array,
     negative_prompt_embeds: jnp.array,
+    text_attention_mask: jnp.array,
+    negative_text_attention_mask: jnp.array,
     guidance_scale_low: float,
     guidance_scale_high: float,
     boundary: int,
@@ -163,21 +167,24 @@ def run_inference_2_2(
   do_classifier_free_guidance = guidance_scale_low > 1.0 or guidance_scale_high > 1.0
   if do_classifier_free_guidance:
     prompt_embeds = jnp.concatenate([prompt_embeds, negative_prompt_embeds], axis=0)
+    text_attention_mask = None if text_attention_mask is None or negative_text_attention_mask is None else jnp.concatenate([text_attention_mask, negative_text_attention_mask], axis=0)  # masks are None when precomputed embeds were supplied; fall back to no masking
 
   def low_noise_branch(operands):
-    latents, timestep, prompt_embeds = operands
+    latents, timestep, prompt_embeds, text_attention_mask = operands
     return transformer_forward_pass(
         low_noise_graphdef, low_noise_state, low_noise_rest,
         latents, timestep, prompt_embeds,
-        do_classifier_free_guidance, guidance_scale_low
+        do_classifier_free_guidance, guidance_scale_low,
+        text_attention_mask=text_attention_mask
     )
 
   def high_noise_branch(operands):
-    latents, timestep, prompt_embeds = operands
+    latents, timestep, prompt_embeds, text_attention_mask = operands
     return transformer_forward_pass(
         high_noise_graphdef, high_noise_state, high_noise_rest,
         latents, timestep, prompt_embeds,
-        do_classifier_free_guidance, guidance_scale_high
+        do_classifier_free_guidance, guidance_scale_high,
+        text_attention_mask=text_attention_mask
     )
 
   for step in range(num_inference_steps):
@@ -195,7 +202,7 @@ def high_noise_branch(operands):
         use_high_noise,
         high_noise_branch,
         low_noise_branch,
-        (latents, timestep, prompt_embeds)
+        (latents, timestep, prompt_embeds, text_attention_mask)
     )
 
     latents, scheduler_state = scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple()
diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline_i2v_2p1.py b/src/maxdiffusion/pipelines/wan/wan_pipeline_i2v_2p1.py
@@ -171,7 +171,7 @@ def __call__(
         max_logging.log(f"Adjusted num_frames to: {num_frames}")
     num_frames = max(num_frames, 1)
 
-    prompt_embeds, negative_prompt_embeds, image_embeds, effective_batch_size = self._prepare_model_inputs_i2v(
+    prompt_embeds, negative_prompt_embeds, image_embeds, text_attention_mask, negative_text_attention_mask, effective_batch_size = self._prepare_model_inputs_i2v(
         prompt, image, negative_prompt, num_videos_per_prompt, max_sequence_length,
         prompt_embeds, negative_prompt_embeds, image_embeds, last_image
     )
@@ -212,6 +212,8 @@ def __call__(
     prompt_embeds = jax.device_put(prompt_embeds, data_sharding)
     negative_prompt_embeds = jax.device_put(negative_prompt_embeds, data_sharding)
     image_embeds = jax.device_put(image_embeds, data_sharding)
+    text_attention_mask = jax.device_put(text_attention_mask, data_sharding)
+    negative_text_attention_mask = jax.device_put(negative_text_attention_mask, data_sharding)
     if first_frame_mask is not None:
         first_frame_mask = jax.device_put(first_frame_mask, data_sharding)
 
@@ -233,6 +235,8 @@ def __call__(
           prompt_embeds=prompt_embeds,
           negative_prompt_embeds=negative_prompt_embeds,
           image_embeds=image_embeds,
+          text_attention_mask=text_attention_mask,
+          negative_text_attention_mask=negative_text_attention_mask,
           first_frame_mask=first_frame_mask,
           scheduler_state=scheduler_state,
           rng=inference_rng,
@@ -252,6 +256,8 @@ def run_inference_2_1_i2v(
     prompt_embeds: jnp.array,
     negative_prompt_embeds: jnp.array,
     image_embeds: jnp.array,
+    text_attention_mask: jnp.array,
+    negative_text_attention_mask: jnp.array,
     guidance_scale: float,
     num_inference_steps: int,
     scheduler: FlaxUniPCMultistepScheduler,
@@ -265,6 +271,7 @@ def run_inference_2_1_i2v(
     prompt_embeds = jnp.concatenate([prompt_embeds, negative_prompt_embeds], axis=0)
     image_embeds = jnp.concatenate([image_embeds, image_embeds], axis=0)
     condition = jnp.concatenate([condition] * 2)
+    text_attention_mask = None if text_attention_mask is None or negative_text_attention_mask is None else jnp.concatenate([text_attention_mask, negative_text_attention_mask], axis=0)  # masks are None when precomputed embeds were supplied; fall back to no masking
   for step in range(num_inference_steps):
     t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step]
     latents_input = latents
@@ -280,6 +287,7 @@ def run_inference_2_1_i2v(
         do_classifier_free_guidance=do_classifier_free_guidance,
         guidance_scale=guidance_scale,
         encoder_hidden_states_image=image_embeds,
+        text_attention_mask=text_attention_mask,
     )
     noise_pred = jnp.transpose(noise_pred, (0, 2, 3, 4, 1))
     latents, scheduler_state = scheduler.step(scheduler_state, noise_pred, t, latents, return_dict=False)  
diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline_i2v_2p2.py b/src/maxdiffusion/pipelines/wan/wan_pipeline_i2v_2p2.py