@@ -434,18 +434,36 @@ def _get_t5_prompt_embeds(
         )
         text_input_ids, mask = text_inputs.input_ids, text_inputs.attention_mask
         seq_lens = mask.gt(0).sum(dim=1).long()
+
+        # DEBUG
+        print(f"[DEBUG _get_t5_prompt_embeds] seq_lens: {seq_lens.tolist()}, mask shape: {mask.shape}")
+
         prompt_embeds = self.text_encoder(text_input_ids, mask).last_hidden_state
         prompt_embeds = [u[:v] for u, v in zip(prompt_embeds, seq_lens)]
         prompt_embeds = torch.stack(
             [torch.cat([u, u.new_zeros(max_sequence_length - u.size(0), u.size(1))]) for u in prompt_embeds], dim=0
         )
 
+        # Create text attention mask (1 for real tokens, 0 for padding)
+        text_attention_mask = torch.zeros((batch_size, max_sequence_length), dtype=torch.long)
+        for i, seq_len_i in enumerate(seq_lens):
+            text_attention_mask[i, :seq_len_i] = 1
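+        # (Equivalently, without the Python loop:
+        #  (torch.arange(max_sequence_length)[None, :] < seq_lens[:, None]).long())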
+
+        print(f"[DEBUG _get_t5_prompt_embeds] text_attention_mask shape: {text_attention_mask.shape}, sum: {text_attention_mask.sum(dim=1).tolist()}")
+
         # duplicate text embeddings for each generation per prompt, using mps friendly method
         _, seq_len, _ = prompt_embeds.shape
         prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
         prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
+
+        # Duplicate mask
+        text_attention_mask = text_attention_mask.repeat(1, num_videos_per_prompt)
+        text_attention_mask = text_attention_mask.view(batch_size * num_videos_per_prompt, max_sequence_length)
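+        # Hand the mask off to JAX here; the embeddings themselves are converted to jnp arrays in encode_prompt.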
+        text_attention_mask_jax = jnp.array(text_attention_mask.numpy())
+
+        print(f"[DEBUG _get_t5_prompt_embeds] After duplication - mask shape: {text_attention_mask_jax.shape}")
 
-        return prompt_embeds
+        return prompt_embeds, text_attention_mask_jax
 
     def encode_prompt(
         self,
@@ -459,24 +477,31 @@ def encode_prompt(
         prompt = [prompt] if isinstance(prompt, str) else prompt
         batch_size = len(prompt)
         if prompt_embeds is None:
-            prompt_embeds = self._get_t5_prompt_embeds(
+            prompt_embeds, text_attention_mask = self._get_t5_prompt_embeds(
                 prompt=prompt,
                 num_videos_per_prompt=num_videos_per_prompt,
                 max_sequence_length=max_sequence_length,
             )
             prompt_embeds = jnp.array(prompt_embeds.detach().numpy(), dtype=jnp.float32)
+        else:
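+            # Precomputed embeds skip the tokenizer pass, so no mask is available for them.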
+            text_attention_mask = None
 
         if negative_prompt_embeds is None:
             negative_prompt = negative_prompt or ""
             negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
-            negative_prompt_embeds = self._get_t5_prompt_embeds(
+            negative_prompt_embeds, negative_text_attention_mask = self._get_t5_prompt_embeds(
                 prompt=negative_prompt,
                 num_videos_per_prompt=num_videos_per_prompt,
                 max_sequence_length=max_sequence_length,
             )
             negative_prompt_embeds = jnp.array(negative_prompt_embeds.detach().numpy(), dtype=jnp.float32)
+        else:
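+            # Likewise, no mask when the negative embeds are precomputed.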
+            negative_text_attention_mask = None
+
+        print(f"[DEBUG encode_prompt] text_attention_mask: {text_attention_mask.shape if text_attention_mask is not None else None}")
+        print(f"[DEBUG encode_prompt] negative_text_attention_mask: {negative_text_attention_mask.shape if negative_text_attention_mask is not None else None}")
 
-        return prompt_embeds, negative_prompt_embeds
+        return prompt_embeds, negative_prompt_embeds, text_attention_mask, negative_text_attention_mask
 
     def prepare_latents(
         self,
@@ -687,13 +712,15 @@ def _prepare_model_inputs(
         batch_size = len(prompt)
 
         with jax.named_scope("Encode-Prompt"):
-            prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt_embeds, negative_prompt_embeds, text_attention_mask, negative_text_attention_mask = self.encode_prompt(
                 prompt=prompt,
                 negative_prompt=negative_prompt,
                 max_sequence_length=max_sequence_length,
                 prompt_embeds=prompt_embeds,
                 negative_prompt_embeds=negative_prompt_embeds,
             )
+
+        print(f"[DEBUG _prepare_model_inputs] Got masks - text: {text_attention_mask.shape if text_attention_mask is not None else None}, neg: {negative_text_attention_mask.shape if negative_text_attention_mask is not None else None}")
 
         num_channel_latents = self._get_num_channel_latents()
         if latents is None:
@@ -715,12 +742,16 @@ def _prepare_model_inputs(
         latents = jax.device_put(latents, data_sharding)
         prompt_embeds = jax.device_put(prompt_embeds, data_sharding)
         negative_prompt_embeds = jax.device_put(negative_prompt_embeds, data_sharding)
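+        # Shard the masks the same way as the embeddings so per-device batch shapes agree.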
+        if text_attention_mask is not None:
+            text_attention_mask = jax.device_put(text_attention_mask, data_sharding)
+        if negative_text_attention_mask is not None:
+            negative_text_attention_mask = jax.device_put(negative_text_attention_mask, data_sharding)
 
         scheduler_state = self.scheduler.set_timesteps(
             self.scheduler_state, num_inference_steps=num_inference_steps, shape=latents.shape
         )
 
-        return latents, prompt_embeds, negative_prompt_embeds, scheduler_state, num_frames
+        return latents, prompt_embeds, negative_prompt_embeds, text_attention_mask, negative_text_attention_mask, scheduler_state, num_frames
 
     @abstractmethod
     def __call__(self, **kwargs):