Skip to content

Commit e7bd680

Browse files
committed
Trying text_mask 6
1 parent d88dd43 commit e7bd680

3 files changed

Lines changed: 48 additions & 10 deletions

File tree

src/maxdiffusion/pipelines/wan/wan_pipeline_2_2.py

Lines changed: 14 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -100,7 +100,8 @@ def __call__(
100100
negative_prompt_embeds: jax.Array = None,
101101
vae_only: bool = False,
102102
):
103-
latents, prompt_embeds, negative_prompt_embeds, scheduler_state, num_frames = self._prepare_model_inputs(
103+
(latents, prompt_embeds, negative_prompt_embeds, text_attention_mask,
104+
negative_text_attention_mask, scheduler_state, num_frames) = self._prepare_model_inputs(
104105
prompt,
105106
negative_prompt,
106107
height,
@@ -139,6 +140,8 @@ def __call__(
139140
latents=latents,
140141
prompt_embeds=prompt_embeds,
141142
negative_prompt_embeds=negative_prompt_embeds,
143+
text_attention_mask=text_attention_mask,
144+
negative_text_attention_mask=negative_text_attention_mask,
142145
)
143146
latents = self._denormalize_latents(latents)
144147
return self._decode_latents_to_video(latents)
@@ -153,6 +156,8 @@ def run_inference_2_2(
153156
latents: jnp.array,
154157
prompt_embeds: jnp.array,
155158
negative_prompt_embeds: jnp.array,
159+
text_attention_mask: Optional[jax.Array],
160+
negative_text_attention_mask: Optional[jax.Array],
156161
guidance_scale_low: float,
157162
guidance_scale_high: float,
158163
boundary: int,
@@ -163,21 +168,27 @@ def run_inference_2_2(
163168
do_classifier_free_guidance = guidance_scale_low > 1.0 or guidance_scale_high > 1.0
164169
if do_classifier_free_guidance:
165170
prompt_embeds = jnp.concatenate([prompt_embeds, negative_prompt_embeds], axis=0)
171+
if text_attention_mask is not None and negative_text_attention_mask is not None:
172+
encoder_attention_mask = jnp.concatenate([text_attention_mask, negative_text_attention_mask], axis=0)
173+
else:
174+
encoder_attention_mask = None
166175

167176
def low_noise_branch(operands):
168177
latents, timestep, prompt_embeds = operands
169178
return transformer_forward_pass(
170179
low_noise_graphdef, low_noise_state, low_noise_rest,
171180
latents, timestep, prompt_embeds,
172-
do_classifier_free_guidance, guidance_scale_low
181+
do_classifier_free_guidance, guidance_scale_low,
182+
encoder_attention_mask=encoder_attention_mask,
173183
)
174184

175185
def high_noise_branch(operands):
176186
latents, timestep, prompt_embeds = operands
177187
return transformer_forward_pass(
178188
high_noise_graphdef, high_noise_state, high_noise_rest,
179189
latents, timestep, prompt_embeds,
180-
do_classifier_free_guidance, guidance_scale_high
190+
do_classifier_free_guidance, guidance_scale_high,
191+
encoder_attention_mask=encoder_attention_mask,
181192
)
182193

183194
for step in range(num_inference_steps):

src/maxdiffusion/pipelines/wan/wan_pipeline_i2v_2p1.py

Lines changed: 14 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -171,7 +171,8 @@ def __call__(
171171
max_logging.log(f"Adjusted num_frames to: {num_frames}")
172172
num_frames = max(num_frames, 1)
173173

174-
prompt_embeds, negative_prompt_embeds, image_embeds, effective_batch_size = self._prepare_model_inputs_i2v(
174+
(prompt_embeds, negative_prompt_embeds, text_attention_mask, negative_text_attention_mask,
175+
image_embeds, effective_batch_size) = self._prepare_model_inputs_i2v(
175176
prompt, image, negative_prompt, num_videos_per_prompt, max_sequence_length,
176177
prompt_embeds, negative_prompt_embeds, image_embeds, last_image
177178
)
@@ -230,9 +231,11 @@ def __call__(
230231
latents = p_run_inference(
231232
latents=latents,
232233
condition=condition,
233-
prompt_embeds=prompt_embeds,
234-
negative_prompt_embeds=negative_prompt_embeds,
235-
image_embeds=image_embeds,
234+
prompt_embeds=prompt_embeds,
235+
negative_prompt_embeds=negative_prompt_embeds,
236+
text_attention_mask=text_attention_mask,
237+
negative_text_attention_mask=negative_text_attention_mask,
238+
image_embeds=image_embeds,
236239
first_frame_mask=first_frame_mask,
237240
scheduler_state=scheduler_state,
238241
rng=inference_rng,
@@ -251,6 +254,8 @@ def run_inference_2_1_i2v(
251254
condition: jnp.array,
252255
prompt_embeds: jnp.array,
253256
negative_prompt_embeds: jnp.array,
257+
text_attention_mask: Optional[jax.Array],
258+
negative_text_attention_mask: Optional[jax.Array],
254259
image_embeds: jnp.array,
255260
guidance_scale: float,
256261
num_inference_steps: int,
@@ -263,6 +268,10 @@ def run_inference_2_1_i2v(
263268

264269
if do_classifier_free_guidance:
265270
prompt_embeds = jnp.concatenate([prompt_embeds, negative_prompt_embeds], axis=0)
271+
if text_attention_mask is not None and negative_text_attention_mask is not None:
272+
encoder_attention_mask = jnp.concatenate([text_attention_mask, negative_text_attention_mask], axis=0)
273+
else:
274+
encoder_attention_mask = None
266275
image_embeds = jnp.concatenate([image_embeds, image_embeds], axis=0)
267276
condition = jnp.concatenate([condition] * 2)
268277
for step in range(num_inference_steps):
@@ -280,6 +289,7 @@ def run_inference_2_1_i2v(
280289
do_classifier_free_guidance=do_classifier_free_guidance,
281290
guidance_scale=guidance_scale,
282291
encoder_hidden_states_image=image_embeds,
292+
encoder_attention_mask=encoder_attention_mask,
283293
)
284294
noise_pred = jnp.transpose(noise_pred, (0, 2, 3, 4, 1))
285295
latents, scheduler_state = scheduler.step(scheduler_state, noise_pred, t, latents, return_dict=False)

src/maxdiffusion/pipelines/wan/wan_pipeline_i2v_2p2.py

Lines changed: 20 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -165,7 +165,8 @@ def __call__(
165165
max_logging.log(f"Adjusted num_frames to: {num_frames}")
166166
num_frames = max(num_frames, 1)
167167

168-
prompt_embeds, negative_prompt_embeds, image_embeds, effective_batch_size = self._prepare_model_inputs_i2v(
168+
(prompt_embeds, negative_prompt_embeds, text_attention_mask, negative_text_attention_mask,
169+
image_embeds, effective_batch_size) = self._prepare_model_inputs_i2v(
169170
prompt, image, negative_prompt, num_videos_per_prompt, max_sequence_length,
170171
prompt_embeds, negative_prompt_embeds, image_embeds, last_image
171172
)
@@ -224,6 +225,8 @@ def __call__(
224225
boundary=boundary_timestep,
225226
num_inference_steps=num_inference_steps,
226227
scheduler=self.scheduler,
228+
text_attention_mask=text_attention_mask,
229+
negative_text_attention_mask=negative_text_attention_mask,
227230
image_embeds=image_embeds,
228231
first_frame_mask=first_frame_mask,
229232
)
@@ -250,6 +253,8 @@ def run_inference_2_2_i2v(
250253
condition: jnp.array,
251254
prompt_embeds: jnp.array,
252255
negative_prompt_embeds: jnp.array,
256+
text_attention_mask: Optional[jax.Array],
257+
negative_text_attention_mask: Optional[jax.Array],
253258
image_embeds: jnp.array,
254259
first_frame_mask: Optional[jnp.array],
255260
guidance_scale_low: float,
@@ -261,14 +266,25 @@ def run_inference_2_2_i2v(
261266
rng: jax.Array,
262267
):
263268
do_classifier_free_guidance = guidance_scale_low > 1.0 or guidance_scale_high > 1.0
269+
270+
if do_classifier_free_guidance:
271+
prompt_embeds = jnp.concatenate([prompt_embeds, negative_prompt_embeds], axis=0)
272+
if text_attention_mask is not None and negative_text_attention_mask is not None:
273+
encoder_attention_mask = jnp.concatenate([text_attention_mask, negative_text_attention_mask], axis=0)
274+
else:
275+
encoder_attention_mask = None
276+
image_embeds = jnp.concatenate([image_embeds, image_embeds], axis=0)
277+
condition = jnp.concatenate([condition] * 2)
278+
264279
def high_noise_branch(operands):
265280
latents_input, ts_input, pe_input, ie_input = operands
266281
latents_input = jnp.transpose(latents_input, (0, 4, 1, 2, 3))
267282
noise_pred, latents_out = transformer_forward_pass(
268283
high_noise_graphdef, high_noise_state, high_noise_rest,
269284
latents_input, ts_input, pe_input,
270285
do_classifier_free_guidance=do_classifier_free_guidance, guidance_scale=guidance_scale_high,
271-
encoder_hidden_states_image=ie_input
286+
encoder_hidden_states_image=ie_input,
287+
encoder_attention_mask=encoder_attention_mask,
272288
)
273289
return noise_pred, latents_out
274290

@@ -279,7 +295,8 @@ def low_noise_branch(operands):
279295
low_noise_graphdef, low_noise_state, low_noise_rest,
280296
latents_input, ts_input, pe_input,
281297
do_classifier_free_guidance=do_classifier_free_guidance, guidance_scale=guidance_scale_low,
282-
encoder_hidden_states_image=ie_input
298+
encoder_hidden_states_image=ie_input,
299+
encoder_attention_mask=encoder_attention_mask,
283300
)
284301
return noise_pred, latents_out
285302

0 commit comments

Comments (0)