@@ -181,6 +181,7 @@ def _tpu_flash_attention(
     flash_block_sizes: BlockSizes,
     dtype: jnp.dtype = jnp.float32,
     attention_kernel: str = "flash",
+    mask_padding_tokens: bool = True,
 ) -> jax.Array:
   """TPU Flash Attention"""
 
@@ -248,6 +249,8 @@ def wrap_flash_attention(query, key, value):
     )
     vmapped_splash = jax.vmap(splash_kernel, in_axes=(0, 0, 0, None))
 
+    if not mask_padding_tokens:
+      segment_ids = None
     if attention_kernel == "flash":
       attention_output = vmapped_splash(query, key, value, segment_ids)
     else:
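
Note on what the new flag toggles: when mask_padding_tokens is False the splash kernel above is invoked with segment_ids=None, so keys and values at padded positions are no longer excluded from the softmax. A minimal, self-contained sketch of that difference using plain softmax attention (illustrative only; the code in this diff uses the Pallas splash kernel, not this toy function):

import jax
import jax.numpy as jnp

def toy_attention(q, k, v, padding_mask=None):
  # q, k, v: [seq, dim]; padding_mask: [seq], 1 for real tokens, 0 for padding.
  logits = q @ k.T / jnp.sqrt(q.shape[-1])
  if padding_mask is not None:
    logits = jnp.where(padding_mask[None, :] == 1, logits, -1e30)
  return jax.nn.softmax(logits, axis=-1) @ v

q = k = jnp.ones((4, 8))
v = jnp.arange(4, dtype=jnp.float32)[:, None] * jnp.ones((4, 8))
mask = jnp.array([1, 1, 1, 0])            # last position is padding
masked = toy_attention(q, k, v, mask)     # padding key/value contributes nothing
unmasked = toy_attention(q, k, v, None)   # padding key/value is averaged in
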
@@ -287,6 +290,8 @@ def ring_scan_body(carry, _):
       (m_final, l_final, o_final, _, _), _ = jax.lax.scan(ring_scan_body, initial_carry, None, length=num_fsdp_shards - 1)
 
       attention_output = o_final / l_final[..., None]
+    else:
+      raise ValueError("ring attention requires fsdp > 1")
 
     return attention_output[:, :, :query_seq_len, :kv_size].astype(query.dtype)
 
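
Why the new ValueError: the ring pass above runs jax.lax.scan for num_fsdp_shards - 1 steps, so with a single FSDP shard there are no ring steps to run and nothing is exchanged between shards; the new branch turns that case into an explicit error. A tiny sketch of the underlying scan behavior (a zero-length scan simply returns its initial carry):

import jax
import jax.numpy as jnp

def ring_step(carry, _):
  return carry + 1, carry  # stand-in for one hop around the ring

final, hops = jax.lax.scan(ring_step, jnp.array(0), None, length=0)
# final == 0 (carry untouched), hops.shape == (0,): no ring steps happened
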
@@ -427,6 +432,7 @@ def _apply_attention(
     axis_names_kv: AxisNames,
     flash_block_sizes: BlockSizes,
     dpa_layer: Callable,
+    mask_padding_tokens: bool = True,
 ):
   """Routes to different attention kernels."""
   _check_attention_inputs(query, key, value)
@@ -457,10 +463,12 @@ def _apply_attention(
         flash_block_sizes,
         dtype,
         attention_kernel,
+        mask_padding_tokens=mask_padding_tokens,
     )
   elif attention_kernel == "ring":
     return _tpu_flash_attention(
-        query, key * scale, value, heads, mesh, axis_names_q, axis_names_kv, flash_block_sizes, dtype, attention_kernel
+        query, key * scale, value, heads, mesh, axis_names_q, axis_names_kv, flash_block_sizes, dtype, attention_kernel,
+        mask_padding_tokens=mask_padding_tokens,
     )
   elif attention_kernel == "cudnn_flash_te":
     return _cudnn_flash_attention(query, key, value, heads, mesh, dpa_layer)
@@ -591,6 +599,7 @@ def __init__(
       flash_block_sizes: BlockSizes = None,
       dtype: DType = jnp.float32,
       quant: Quant = None,
+      mask_padding_tokens: bool = True,
   ):
     self.dpa_layer = None
     if attention_kernel == "cudnn_flash_te":
@@ -610,6 +619,7 @@ def __init__(
     self.flash_block_sizes = flash_block_sizes
     self.dtype = dtype
     self.quant = quant
+    self.mask_padding_tokens = mask_padding_tokens
 
   def apply_attention(self, query: Array, key: Array, value: Array):
     return _apply_attention(
@@ -630,6 +640,7 @@ def apply_attention(self, query: Array, key: Array, value: Array):
         axis_names_kv=self.axis_names_kv,
         flash_block_sizes=self.flash_block_sizes,
         dpa_layer=self.dpa_layer,
+        mask_padding_tokens=self.mask_padding_tokens,
     )
 
 
@@ -719,6 +730,7 @@ def __init__(
       qkv_bias: bool = False,
       quant: Quant = None,
       is_self_attention: bool = True,
+      mask_padding_tokens: bool = True,
   ):
     if attention_kernel == "cudnn_flash_te":
       raise NotImplementedError(f"Wan 2.1 has not been tested with {attention_kernel}")
@@ -757,6 +769,7 @@ def __init__(
         flash_block_sizes=flash_block_sizes,
         dtype=dtype,
         quant=quant,
+        mask_padding_tokens=mask_padding_tokens,
     )
     # None axes corresponds to the stacked weights across all blocks
     # because of the use of nnx.vmap and nnx.scan.
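
The diff does not show how segment_ids are produced; they are typically derived from each example's count of real tokens so the kernel can tell real positions from padding. A generic construction sketch (the helper name and shapes below are assumptions for illustration, not this repository's code):

import jax.numpy as jnp

def make_padding_segment_ids(lengths, seq_len):
  # lengths: [batch] number of real tokens per example.
  # Returns [batch, seq_len]: 1 for real tokens, 0 for padding.
  positions = jnp.arange(seq_len)[None, :]
  return (positions < lengths[:, None]).astype(jnp.int32)

ids = make_padding_segment_ids(jnp.array([5, 3]), seq_len=8)
# With mask_padding_tokens=True, ids like these (for both q and kv) drive the kernel's
# segment-id masking; with False, the kernel simply receives segment_ids=None.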