Skip to content

Commit 00077c1

Browse files
committed
Trying text_mask 2
1 parent acc7452 commit 00077c1

2 files changed

Lines changed: 32 additions & 1 deletion

File tree

src/maxdiffusion/pipelines/wan/wan_pipeline.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -769,9 +769,22 @@ def transformer_forward_pass(
769769
do_classifier_free_guidance,
770770
guidance_scale,
771771
encoder_hidden_states_image=None,
772+
encoder_attention_mask=None,
772773
):
773774
wan_transformer = nnx.merge(graphdef, sharded_state, rest_of_state)
774-
noise_pred = wan_transformer(hidden_states=latents, timestep=timestep, encoder_hidden_states=prompt_embeds, encoder_hidden_states_image=encoder_hidden_states_image)
775+
776+
# DEBUG: Print mask info (only compiles once due to jit)
777+
# jax.debug.print("[DEBUG transformer_forward_pass] encoder_attention_mask shape: {}",
778+
# encoder_attention_mask.shape if encoder_attention_mask is not None else "None")
779+
780+
# For now, DON'T pass the mask - just accept it
781+
noise_pred = wan_transformer(
782+
hidden_states=latents,
783+
timestep=timestep,
784+
encoder_hidden_states=prompt_embeds,
785+
encoder_hidden_states_image=encoder_hidden_states_image
786+
# encoder_attention_mask=encoder_attention_mask # TODO: Add this next
787+
)
775788
if do_classifier_free_guidance:
776789
bsz = latents.shape[0] // 2
777790
noise_cond = noise_pred[:bsz] # First half = conditional

src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,8 @@ def __call__(
125125
latents=latents,
126126
prompt_embeds=prompt_embeds,
127127
negative_prompt_embeds=negative_prompt_embeds,
128+
text_attention_mask=text_attention_mask,
129+
negative_text_attention_mask=negative_text_attention_mask,
128130
)
129131
latents = self._denormalize_latents(latents)
130132
return self._decode_latents_to_video(latents)
@@ -140,10 +142,22 @@ def run_inference_2_1(
140142
num_inference_steps: int,
141143
scheduler: FlaxUniPCMultistepScheduler,
142144
scheduler_state,
145+
text_attention_mask: Optional[jnp.array] = None,
146+
negative_text_attention_mask: Optional[jnp.array] = None,
143147
):
144148
do_classifier_free_guidance = guidance_scale > 1.0
145149
if do_classifier_free_guidance:
146150
prompt_embeds = jnp.concatenate([prompt_embeds, negative_prompt_embeds], axis=0)
151+
# Concatenate masks for CFG: [positive_mask | negative_mask]
152+
if text_attention_mask is not None and negative_text_attention_mask is not None:
153+
encoder_attention_mask = jnp.concatenate([text_attention_mask, negative_text_attention_mask], axis=0)
154+
print(f"[DEBUG run_inference_2_1] Concatenated mask shape: {encoder_attention_mask.shape}")
155+
print(f"[DEBUG run_inference_2_1] Mask sums - pos: {text_attention_mask.sum()}, neg: {negative_text_attention_mask.sum()}, combined: {encoder_attention_mask.sum()}")
156+
else:
157+
encoder_attention_mask = None
158+
else:
159+
encoder_attention_mask = text_attention_mask
160+
147161
for step in range(num_inference_steps):
148162
t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step]
149163
if do_classifier_free_guidance:
@@ -159,7 +173,11 @@ def run_inference_2_1(
159173
prompt_embeds,
160174
do_classifier_free_guidance=do_classifier_free_guidance,
161175
guidance_scale=guidance_scale,
176+
encoder_attention_mask=encoder_attention_mask if step == 0 else encoder_attention_mask, # Pass mask
162177
)
178+
179+
if step == 0:
180+
print(f"[DEBUG run_inference_2_1] Step 0 - passed encoder_attention_mask shape: {encoder_attention_mask.shape if encoder_attention_mask is not None else None}")
163181

164182
latents, scheduler_state = scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple()
165183
return latents

0 commit comments

Comments (0)