Skip to content

Commit 1590039

Browse files
committed
Add attention mask support for Wan model
1 parent 1d93c63 commit 1590039

7 files changed

Lines changed: 109 additions & 32 deletions

File tree

src/maxdiffusion/models/attention_flax.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1179,7 +1179,7 @@ def __call__(
11791179
value_proj = checkpoint_name(value_proj, "value_proj")
11801180

11811181
with jax.named_scope("apply_attention"):
1182-
attn_output = self.attention_op.apply_attention(query_proj, key_proj, value_proj)
1182+
attn_output = self.attention_op.apply_attention(query_proj, key_proj, value_proj, attention_mask=encoder_attention_mask)
11831183

11841184
else:
11851185
# NEW PATH for I2V CROSS-ATTENTION
@@ -1206,9 +1206,11 @@ def __call__(
12061206
# It contains the image mask: [1]*257 + [0]*127 for 257 real image tokens padded to 384
12071207
if encoder_attention_mask is not None:
12081208
encoder_attention_mask_img = encoder_attention_mask[:, :padded_img_len]
1209+
encoder_attention_mask_text = encoder_attention_mask[:, padded_img_len:]
12091210
else:
12101211
# Fallback: no mask means treat all as valid (for dot product attention)
12111212
encoder_attention_mask_img = None
1213+
encoder_attention_mask_text = None
12121214
else:
12131215
# If no image_seq_len is specified, treat all as text
12141216
encoder_hidden_states_img = None
@@ -1257,7 +1259,7 @@ def __call__(
12571259

12581260
# Attention - tensors are (B, S, D)
12591261
with self.conditional_named_scope("cross_attn_text_apply"):
1260-
attn_output_text = self.attention_op.apply_attention(query_proj_text, key_proj_text, value_proj_text)
1262+
attn_output_text = self.attention_op.apply_attention(query_proj_text, key_proj_text, value_proj_text, attention_mask=encoder_attention_mask_text)
12611263
with self.conditional_named_scope("cross_attn_img_apply"):
12621264
# Pass encoder_attention_mask_img for image cross-attention to mask padded tokens
12631265
attn_output_img = self.attention_op.apply_attention(
@@ -1272,7 +1274,7 @@ def __call__(
12721274
value_proj_text = checkpoint_name(value_proj_text, "value_proj_text")
12731275

12741276
with self.conditional_named_scope("cross_attn_text_apply"):
1275-
attn_output = self.attention_op.apply_attention(query_proj_text, key_proj_text, value_proj_text)
1277+
attn_output = self.attention_op.apply_attention(query_proj_text, key_proj_text, value_proj_text, attention_mask=encoder_attention_mask)
12761278

12771279
attn_output = attn_output.astype(dtype=dtype)
12781280
attn_output = checkpoint_name(attn_output, "attn_output")

src/maxdiffusion/models/wan/transformers/transformer_wan.py

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,7 @@ def compute_kv_cache(
606606
encoder_hidden_states: jax.Array,
607607
encoder_hidden_states_image: Optional[jax.Array] = None,
608608
timestep: Optional[jax.Array] = None,
609+
text_mask: Optional[jax.Array] = None,
609610
) -> Tuple[Dict[str, Tuple[jax.Array, jax.Array]], Optional[jax.Array]]:
610611
if timestep is None:
611612
batch_size = encoder_hidden_states.shape[0]
@@ -623,11 +624,15 @@ def compute_kv_cache(
623624
if encoder_hidden_states_image is not None:
624625
encoder_hidden_states = jnp.concatenate([encoder_hidden_states_image, encoder_hidden_states], axis=1)
625626
if encoder_attention_mask is not None:
626-
text_mask = jnp.ones(
627-
(encoder_hidden_states.shape[0], encoder_hidden_states.shape[1] - encoder_hidden_states_image.shape[1]),
628-
dtype=jnp.int32,
629-
)
627+
if text_mask is None:
628+
text_mask = jnp.ones(
629+
(encoder_hidden_states.shape[0], encoder_hidden_states.shape[1] - encoder_hidden_states_image.shape[1]),
630+
dtype=jnp.int32,
631+
)
630632
encoder_attention_mask = jnp.concatenate([encoder_attention_mask, text_mask], axis=1)
633+
else:
634+
if encoder_attention_mask is None:
635+
encoder_attention_mask = text_mask
631636

632637
if self.scan_layers:
633638
@nnx.vmap(in_axes=(0, None, None), out_axes=0, transform_metadata={nnx.PARTITION_NAME: "layers_per_stage"})
@@ -665,6 +670,7 @@ def __call__(
665670
kv_cache: Optional[Dict[str, Tuple[jax.Array, jax.Array]]] = None,
666671
rotary_emb: Optional[jax.Array] = None,
667672
encoder_attention_mask: Optional[jax.Array] = None,
673+
text_mask: Optional[jax.Array] = None,
668674
) -> Union[jax.Array, Tuple[jax.Array, jax.Array], Dict[str, jax.Array]]:
669675
hidden_states = nn.with_logical_constraint(hidden_states, ("batch", None, None, None, None))
670676
batch_size, _, num_frames, height, width = hidden_states.shape
@@ -694,14 +700,17 @@ def __call__(
694700
encoder_attention_mask = encoder_attention_mask
695701
else:
696702
encoder_attention_mask = encoder_attention_mask_out
703+
if encoder_attention_mask is None:
704+
encoder_attention_mask = text_mask
697705

698706
if encoder_hidden_states_image is not None:
699707
encoder_hidden_states = jnp.concatenate([encoder_hidden_states_image, encoder_hidden_states_out], axis=1)
700708
if kv_cache is None and encoder_attention_mask is not None:
701-
text_mask = jnp.ones(
702-
(encoder_hidden_states.shape[0], encoder_hidden_states.shape[1] - encoder_hidden_states_image.shape[1]),
703-
dtype=jnp.int32,
704-
)
709+
if text_mask is None:
710+
text_mask = jnp.ones(
711+
(encoder_hidden_states.shape[0], encoder_hidden_states.shape[1] - encoder_hidden_states_image.shape[1]),
712+
dtype=jnp.int32,
713+
)
705714
encoder_attention_mask = jnp.concatenate([encoder_attention_mask, text_mask], axis=1)
706715
encoder_hidden_states = encoder_hidden_states.astype(hidden_states.dtype)
707716
else:

src/maxdiffusion/pipelines/wan/wan_pipeline.py

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -473,7 +473,11 @@ def _get_t5_prompt_embeds(
473473
prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
474474
prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
475475

476-
return prompt_embeds
476+
mask = mask.repeat(1, num_videos_per_prompt)
477+
mask = mask.view(batch_size * num_videos_per_prompt, seq_len)
478+
mask = jnp.array(mask.detach().numpy(), dtype=jnp.int32)
479+
480+
return prompt_embeds, mask
477481

478482
def encode_prompt(
479483
self,
@@ -483,28 +487,36 @@ def encode_prompt(
483487
max_sequence_length: int = 226,
484488
prompt_embeds: jax.Array = None,
485489
negative_prompt_embeds: jax.Array = None,
490+
prompt_mask: jax.Array = None,
491+
negative_prompt_mask: jax.Array = None,
486492
):
487493
prompt = [prompt] if isinstance(prompt, str) else prompt
488494
if prompt_embeds is None:
489-
prompt_embeds = self._get_t5_prompt_embeds(
495+
prompt_embeds, prompt_mask = self._get_t5_prompt_embeds(
490496
prompt=prompt,
491497
num_videos_per_prompt=num_videos_per_prompt,
492498
max_sequence_length=max_sequence_length,
493499
)
494500
prompt_embeds = jnp.array(prompt_embeds.detach().numpy(), dtype=jnp.float32)
501+
else:
502+
if prompt_mask is None:
503+
prompt_mask = jnp.ones((prompt_embeds.shape[0], prompt_embeds.shape[1]), dtype=jnp.int32)
495504

496505
if negative_prompt_embeds is None:
497506
batch_size = len(prompt_embeds)
498507
negative_prompt = negative_prompt or ""
499508
negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
500-
negative_prompt_embeds = self._get_t5_prompt_embeds(
509+
negative_prompt_embeds, negative_prompt_mask = self._get_t5_prompt_embeds(
501510
prompt=negative_prompt,
502511
num_videos_per_prompt=num_videos_per_prompt,
503512
max_sequence_length=max_sequence_length,
504513
)
505514
negative_prompt_embeds = jnp.array(negative_prompt_embeds.detach().numpy(), dtype=jnp.float32)
515+
else:
516+
if negative_prompt_mask is None:
517+
negative_prompt_mask = jnp.ones((negative_prompt_embeds.shape[0], negative_prompt_embeds.shape[1]), dtype=jnp.int32)
506518

507-
return prompt_embeds, negative_prompt_embeds
519+
return prompt_embeds, prompt_mask, negative_prompt_embeds, negative_prompt_mask
508520

509521
def prepare_latents(
510522
self,
@@ -647,7 +659,7 @@ def _prepare_model_inputs_i2v(
647659
effective_batch_size = batch_size * num_videos_per_prompt
648660

649661
# 1. Encode Prompts
650-
prompt_embeds, negative_prompt_embeds = self.encode_prompt(
662+
prompt_embeds, prompt_mask, negative_prompt_embeds, negative_prompt_mask = self.encode_prompt(
651663
prompt=prompt,
652664
negative_prompt=negative_prompt,
653665
num_videos_per_prompt=num_videos_per_prompt,
@@ -691,8 +703,10 @@ def _prepare_model_inputs_i2v(
691703
prompt_embeds = jax.device_put(prompt_embeds, data_sharding)
692704
negative_prompt_embeds = jax.device_put(negative_prompt_embeds, data_sharding)
693705
image_embeds = jax.device_put(image_embeds, data_sharding)
706+
prompt_mask = jax.device_put(prompt_mask, data_sharding)
707+
negative_prompt_mask = jax.device_put(negative_prompt_mask, data_sharding)
694708

695-
return prompt_embeds, negative_prompt_embeds, image_embeds, effective_batch_size
709+
return prompt_embeds, negative_prompt_embeds, image_embeds, effective_batch_size, prompt_mask, negative_prompt_mask
696710

697711
def _prepare_model_inputs(
698712
self,
@@ -724,7 +738,7 @@ def _prepare_model_inputs(
724738
batch_size = len(prompt)
725739

726740
with jax.named_scope("Encode-Prompt"):
727-
prompt_embeds, negative_prompt_embeds = self.encode_prompt(
741+
prompt_embeds, prompt_mask, negative_prompt_embeds, negative_prompt_mask = self.encode_prompt(
728742
prompt=prompt,
729743
negative_prompt=negative_prompt,
730744
max_sequence_length=max_sequence_length,
@@ -752,12 +766,14 @@ def _prepare_model_inputs(
752766
latents = jax.device_put(latents, data_sharding)
753767
prompt_embeds = jax.device_put(prompt_embeds, data_sharding)
754768
negative_prompt_embeds = jax.device_put(negative_prompt_embeds, data_sharding)
769+
prompt_mask = jax.device_put(prompt_mask, data_sharding)
770+
negative_prompt_mask = jax.device_put(negative_prompt_mask, data_sharding)
755771

756772
scheduler_state = self.scheduler.set_timesteps(
757773
self.scheduler_state, num_inference_steps=num_inference_steps, shape=latents.shape
758774
)
759775

760-
return latents, prompt_embeds, negative_prompt_embeds, scheduler_state, num_frames
776+
return latents, prompt_embeds, negative_prompt_embeds, prompt_mask, negative_prompt_mask, scheduler_state, num_frames
761777

762778
@abstractmethod
763779
def __call__(self, **kwargs):
@@ -782,6 +798,7 @@ def transformer_forward_pass(
782798
kv_cache=None,
783799
rotary_emb=None,
784800
encoder_attention_mask=None,
801+
text_mask=None,
785802
):
786803
wan_transformer = nnx.merge(graphdef, sharded_state, rest_of_state)
787804
outputs = wan_transformer(
@@ -795,6 +812,7 @@ def transformer_forward_pass(
795812
kv_cache=kv_cache,
796813
rotary_emb=rotary_emb,
797814
encoder_attention_mask=encoder_attention_mask,
815+
text_mask=text_mask,
798816
)
799817

800818
if return_residual:
@@ -828,6 +846,7 @@ def transformer_forward_pass_full_cfg(
828846
kv_cache=None,
829847
rotary_emb=None,
830848
encoder_attention_mask=None,
849+
text_mask=None,
831850
):
832851
"""Full CFG forward pass.
833852
@@ -849,6 +868,7 @@ def transformer_forward_pass_full_cfg(
849868
kv_cache=kv_cache,
850869
rotary_emb=rotary_emb,
851870
encoder_attention_mask=encoder_attention_mask,
871+
text_mask=text_mask,
852872
)
853873
noise_cond = noise_pred[:bsz]
854874
noise_uncond = noise_pred[bsz:]
@@ -873,6 +893,7 @@ def transformer_forward_pass_cfg_cache(
873893
kv_cache=None,
874894
rotary_emb=None,
875895
encoder_attention_mask=None,
896+
text_mask=None,
876897
):
877898
"""CFG-Cache forward pass with FFT frequency-domain compensation.
878899
@@ -901,6 +922,7 @@ def transformer_forward_pass_cfg_cache(
901922
kv_cache=kv_cache,
902923
rotary_emb=rotary_emb,
903924
encoder_attention_mask=encoder_attention_mask,
925+
text_mask=text_mask,
904926
)
905927

906928
# FFT over spatial dims (H, W) — last 2 dims of [B, C, F, H, W]

src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ def __call__(
111111
"CFG cache accelerates classifier-free guidance, which is disabled when guidance_scale <= 1.0."
112112
)
113113

114-
latents, prompt_embeds, negative_prompt_embeds, scheduler_state, num_frames = self._prepare_model_inputs(
114+
latents, prompt_embeds, negative_prompt_embeds, prompt_mask, negative_prompt_mask, scheduler_state, num_frames = self._prepare_model_inputs(
115115
prompt,
116116
negative_prompt,
117117
height,
@@ -152,6 +152,8 @@ def __call__(
152152
latents=latents,
153153
prompt_embeds=prompt_embeds,
154154
negative_prompt_embeds=negative_prompt_embeds,
155+
prompt_mask=prompt_mask,
156+
negative_prompt_mask=negative_prompt_mask,
155157
)
156158
latents = self._denormalize_latents(latents)
157159
return self._decode_latents_to_video(latents)
@@ -164,6 +166,8 @@ def run_inference_2_1(
164166
latents: jnp.array,
165167
prompt_embeds: jnp.array,
166168
negative_prompt_embeds: jnp.array,
169+
prompt_mask: jnp.array,
170+
negative_prompt_mask: jnp.array,
167171
guidance_scale: float,
168172
num_inference_steps: int,
169173
scheduler: FlaxUniPCMultistepScheduler,
@@ -216,8 +220,12 @@ def run_inference_2_1(
216220
# Pre-split embeds once, outside the loop.
217221
prompt_cond_embeds = prompt_embeds
218222
prompt_embeds_combined = None
223+
prompt_mask_combined = None
219224
if do_cfg:
220225
prompt_embeds_combined = jnp.concatenate([prompt_embeds, negative_prompt_embeds], axis=0)
226+
prompt_mask_combined = jnp.concatenate([prompt_mask, negative_prompt_mask], axis=0)
227+
else:
228+
prompt_mask_combined = prompt_mask
221229

222230
# Pre-compute cache schedule and phase-dependent weights.
223231
# t₀ = midpoint step; before t₀ boost low-freq, after boost high-freq.
@@ -257,7 +265,9 @@ def run_inference_2_1(
257265
encoder_attention_mask = None
258266

259267
if use_kv_cache:
260-
kv_cache, encoder_attention_mask = transformer_obj.compute_kv_cache(prompt_embeds_combined if do_cfg else prompt_cond_embeds)
268+
kv_cache, encoder_attention_mask = transformer_obj.compute_kv_cache(prompt_embeds_combined if do_cfg else prompt_cond_embeds, text_mask=prompt_mask_combined)
269+
else:
270+
encoder_attention_mask = prompt_mask_combined
261271

262272
if use_magcache and do_cfg:
263273
magcache_init = init_magcache(num_inference_steps, retention_ratio, mag_ratios_base)

src/maxdiffusion/pipelines/wan/wan_pipeline_2_2.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ def __call__(
131131
"SenCache requires classifier-free guidance to be enabled for both transformer phases."
132132
)
133133

134-
latents, prompt_embeds, negative_prompt_embeds, scheduler_state, num_frames = self._prepare_model_inputs(
134+
latents, prompt_embeds, negative_prompt_embeds, prompt_mask, negative_prompt_mask, scheduler_state, num_frames = self._prepare_model_inputs(
135135
prompt,
136136
negative_prompt,
137137
height,
@@ -176,6 +176,8 @@ def __call__(
176176
latents=latents,
177177
prompt_embeds=prompt_embeds,
178178
negative_prompt_embeds=negative_prompt_embeds,
179+
prompt_mask=prompt_mask,
180+
negative_prompt_mask=negative_prompt_mask,
179181
)
180182
latents = self._denormalize_latents(latents)
181183
return self._decode_latents_to_video(latents)
@@ -191,6 +193,8 @@ def run_inference_2_2(
191193
latents: jnp.array,
192194
prompt_embeds: jnp.array,
193195
negative_prompt_embeds: jnp.array,
196+
prompt_mask: jnp.array,
197+
negative_prompt_mask: jnp.array,
194198
guidance_scale_low: float,
195199
guidance_scale_high: float,
196200
boundary: int,
@@ -223,6 +227,9 @@ def run_inference_2_2(
223227
prompt_embeds_combined = (
224228
jnp.concatenate([prompt_embeds, negative_prompt_embeds], axis=0) if do_classifier_free_guidance else prompt_embeds
225229
)
230+
prompt_mask_combined = (
231+
jnp.concatenate([prompt_mask, negative_prompt_mask], axis=0) if do_classifier_free_guidance else prompt_mask
232+
)
226233

227234
low_transformer = nnx.merge(low_noise_graphdef, low_noise_state, low_noise_rest)
228235

@@ -236,10 +243,13 @@ def run_inference_2_2(
236243
encoder_attention_mask_high = None
237244

238245
if use_kv_cache:
239-
kv_cache_low, encoder_attention_mask_low = low_transformer.compute_kv_cache(prompt_embeds_combined)
246+
kv_cache_low, encoder_attention_mask_low = low_transformer.compute_kv_cache(prompt_embeds_combined, text_mask=prompt_mask_combined)
240247

241248
high_transformer = nnx.merge(high_noise_graphdef, high_noise_state, high_noise_rest)
242-
kv_cache_high, encoder_attention_mask_high = high_transformer.compute_kv_cache(prompt_embeds_combined)
249+
kv_cache_high, encoder_attention_mask_high = high_transformer.compute_kv_cache(prompt_embeds_combined, text_mask=prompt_mask_combined)
250+
else:
251+
encoder_attention_mask_low = prompt_mask_combined
252+
encoder_attention_mask_high = prompt_mask_combined
243253

244254
# ── SenCache path (arXiv:2602.24208) ──
245255
if use_sen_cache and do_classifier_free_guidance:

0 commit comments

Comments (0)