Skip to content

Commit 6f32622

Browse files
eltsaiPerseus14
authored and committed
Revert "Integrate tokamax ring attention as optional attention kernel for WAN 2.1" (#305)
This reverts commit f68c7b0. Co-authored-by: Elisa Tsai <elisatsai@google.com>
1 parent ad0fd7c commit 6f32622

7 files changed

Lines changed: 172 additions & 203 deletions

File tree

src/maxdiffusion/configs/base_wan_lora_14b.yml

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,16 @@ from_pt: True
6161
split_head_dim: True
6262
attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te, ring
6363
flash_min_seq_length: 4096
64+
65+
# If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
66+
# Else we do not pass in segment ids and on vpu bound hardware like trillium this is faster.
67+
# However, when padding tokens are significant, this will lead to worse quality and should be set to True.
68+
mask_padding_tokens: True
69+
# Maxdiffusion has 2 types of attention sharding strategies:
70+
# 1. attention_sharding_uniform = True : same sequence sharding rules applied for q in both (self and cross attention)
71+
# 2. attention_sharding_uniform = False : Heads are sharded uniformly across devices for self attention while sequence is sharded
72+
# in cross attention q.
73+
attention_sharding_uniform: True
6474
dropout: 0.1
6575

6676
#flash_block_sizes: {
@@ -145,8 +155,9 @@ mesh_axes: ['data', 'fsdp', 'tensor']
145155
logical_axis_rules: [
146156
['batch', 'data'],
147157
['activation_batch', 'data'],
158+
['activation_self_attn_heads', ['fsdp', 'tensor']],
159+
['activation_cross_attn_q_length', ['fsdp', 'tensor']],
148160
['activation_length', 'fsdp'],
149-
150161
['activation_heads', 'tensor'],
151162
['mlp','tensor'],
152163
['embed','fsdp'],
@@ -321,8 +332,10 @@ quantization: ''
321332
quantization_local_shard_count: -1
322333
compile_topology_num_slices: -1 # Number of target slices, set to a positive integer.
323334
use_qwix_quantization: False # Whether to use qwix for quantization. If set to True, the transformer of WAN will be quantized using qwix.
324-
# Quantization calibration method used for weights and activations. Supported methods can be found in https://github.com/google/qwix/blob/dc2a0770351c740e5ab3cce7c0efe9f7beacce9e/qwix/qconfig.py#L70-L80
325-
quantization_calibration_method: "absmax"
335+
# Quantization calibration method used for weights, activations and bwd. Supported methods can be found in https://github.com/google/qwix/blob/dc2a0770351c740e5ab3cce7c0efe9f7beacce9e/qwix/qconfig.py#L70-L80
336+
weight_quantization_calibration_method: "absmax"
337+
act_quantization_calibration_method: "absmax"
338+
bwd_quantization_calibration_method: "absmax"
326339
qwix_module_path: ".*"
327340

328341
# Eval model on per eval_every steps. -1 means don't eval.

src/maxdiffusion/configs/base_wan_lora_27b.yml

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,15 @@ from_pt: True
6161
split_head_dim: True
6262
attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te, ring
6363
flash_min_seq_length: 4096
64+
# If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
65+
# Else we do not pass in segment ids and on vpu bound hardware like trillium this is faster.
66+
# However, when padding tokens are significant, this will lead to worse quality and should be set to True.
67+
mask_padding_tokens: True
68+
# Maxdiffusion has 2 types of attention sharding strategies:
69+
# 1. attention_sharding_uniform = True : same sequence sharding rules applied for q in both (self and cross attention)
70+
# 2. attention_sharding_uniform = False : Heads are sharded uniformly across devices for self attention while sequence is sharded
71+
# in cross attention q.
72+
attention_sharding_uniform: True
6473
dropout: 0.1
6574

6675
#flash_block_sizes: {
@@ -145,8 +154,9 @@ mesh_axes: ['data', 'fsdp', 'tensor']
145154
logical_axis_rules: [
146155
['batch', 'data'],
147156
['activation_batch', 'data'],
157+
['activation_self_attn_heads', ['fsdp', 'tensor']],
158+
['activation_cross_attn_q_length', ['fsdp', 'tensor']],
148159
['activation_length', 'fsdp'],
149-
150160
['activation_heads', 'tensor'],
151161
['mlp','tensor'],
152162
['embed','fsdp'],
@@ -333,8 +343,10 @@ quantization: ''
333343
quantization_local_shard_count: -1
334344
compile_topology_num_slices: -1 # Number of target slices, set to a positive integer.
335345
use_qwix_quantization: False # Whether to use qwix for quantization. If set to True, the transformer of WAN will be quantized using qwix.
336-
# Quantization calibration method used for weights and activations. Supported methods can be found in https://github.com/google/qwix/blob/dc2a0770351c740e5ab3cce7c0efe9f7beacce9e/qwix/qconfig.py#L70-L80
337-
quantization_calibration_method: "absmax"
346+
# Quantization calibration method used for weights, activations and bwd. Supported methods can be found in https://github.com/google/qwix/blob/dc2a0770351c740e5ab3cce7c0efe9f7beacce9e/qwix/qconfig.py#L70-L80
347+
weight_quantization_calibration_method: "absmax"
348+
act_quantization_calibration_method: "absmax"
349+
bwd_quantization_calibration_method: "absmax"
338350
qwix_module_path: ".*"
339351

340352
# Eval model on per eval_every steps. -1 means don't eval.

src/maxdiffusion/generate_wan.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
from google.cloud import storage
2626
import flax
2727
from maxdiffusion.common_types import WAN2_1, WAN2_2
28-
from flax import nnx
2928
from maxdiffusion.loaders.wan_lora_nnx_loader import Wan2_1NnxLoraLoader, Wan2_2NnxLoraLoader
3029

3130

src/maxdiffusion/loaders/wan_lora_nnx_loader.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@
2525

2626
class Wan2_1NnxLoraLoader(LoRABaseMixin):
2727
"""
28-
Handles loading LoRA weights into NNX-based WAN models.
29-
Assumes WAN pipeline contains 'high_noise_transformer' and 'low_noise_transformer'
28+
Handles loading LoRA weights into NNX-based WAN 2.1 model.
29+
Assumes WAN pipeline contains 'transformer'
3030
attributes that are NNX Modules.
3131
"""
3232

@@ -62,7 +62,7 @@ def load_lora_weights(
6262

6363
class Wan2_2NnxLoraLoader(LoRABaseMixin):
6464
"""
65-
Handles loading LoRA weights into NNX-based WAN models.
65+
Handles loading LoRA weights into NNX-based WAN 2.2 model.
6666
Assumes WAN pipeline contains 'high_noise_transformer' and 'low_noise_transformer'
6767
attributes that are NNX Modules.
6868
"""

src/maxdiffusion/models/attention_flax.py

Lines changed: 36 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
from jax.experimental.pallas.ops.tpu.splash_attention import splash_attention_kernel
2828
from tokamax._src.ops.experimental.tpu.splash_attention import splash_attention_mask as tokamax_splash_attention_mask
2929
from tokamax._src.ops.experimental.tpu.splash_attention import splash_attention_kernel as tokamax_splash_attention_kernel
30-
from tokamax._src.ops.experimental.tpu.splash_attention import ring_attention_kernel as tokamax_ring_attention_kernel
3130
from einops import rearrange
3231
from .. import common_types, max_logging
3332

@@ -305,92 +304,62 @@ def wrap_flash_attention(query, key, value):
305304
mask=mask,
306305
q_seq_shards=1, # the sizes of the axis is sharding over seq_len
307306
config=convert_to_tokamax_splash_config(block_sizes, residual_checkpoint_name=residual_checkpoint_name),
308-
save_residuals=True if "ring" in attention_kernel else False,
309-
)
310-
elif attention_kernel == "tokamax_ring":
311-
mask = tokamax_splash_attention_mask.FullMask(_shape=(query.shape[2], key.shape[2]),)
312-
splash_kernel = tokamax_ring_attention_kernel.make_ring_attention(
313-
mask=mask,
314-
is_mqa=False,
315-
config=convert_to_tokamax_splash_config(block_sizes, residual_checkpoint_name=residual_checkpoint_name),
316-
save_residuals=True,
317-
ring_axis="fsdp",
307+
save_residuals=True if attention_kernel == "ring" else False,
318308
)
319309
else:
320310
splash_kernel = splash_attention_kernel.make_splash_mha(
321311
mask=multi_head_mask,
322312
head_shards=1, # the sizes of the axis is sharding over heads
323313
q_seq_shards=1, # the sizes of the axis is sharding over seq_len
324314
block_sizes=block_sizes,
325-
save_residuals=True if "ring" in attention_kernel else False,
315+
save_residuals=True if attention_kernel == "ring" else False,
326316
residual_checkpoint_name=residual_checkpoint_name
327317
)
318+
vmapped_splash = jax.vmap(splash_kernel, in_axes=(0, 0, 0, None))
328319

329-
if attention_kernel == "tokamax_ring":
330-
# For tokamax_ring, use the kernel directly without vmap
331-
# The ring attention kernel handles the ring topology internally
332-
if not mask_padding_tokens:
333-
segment_ids = None
334-
attention_output = splash_kernel(
335-
fwd_mask_info=None,
336-
dkv_mask_info=None,
337-
q=query,
338-
k=key,
339-
v=value,
340-
segment_ids=segment_ids,
341-
is_mqa=False,
342-
config=convert_to_tokamax_splash_config(block_sizes, residual_checkpoint_name=residual_checkpoint_name),
343-
mask_value=-jnp.inf,
344-
mask_function=None,
345-
fwd_mask_sparsity=1.0,
346-
save_residuals=True,
347-
)
320+
if not mask_padding_tokens:
321+
segment_ids = None
322+
if attention_kernel in ["flash", "tokamax_flash"]:
323+
attention_output = vmapped_splash(query, key, value, segment_ids)
348324
else:
349-
vmapped_splash = jax.vmap(splash_kernel, in_axes=(0, 0, 0, None))
350-
351-
if not mask_padding_tokens:
352-
segment_ids = None
353-
if attention_kernel in ["flash", "tokamax_flash"]:
354-
attention_output = vmapped_splash(query, key, value, segment_ids)
355-
else:
356-
if num_fsdp_shards > 1:
357-
out, (lse,) = vmapped_splash(query, key, value, segment_ids)
358-
m = lse.astype(jnp.float32)
359-
l = jnp.exp(lse - m)
360-
o = out.astype(jnp.float32) * l[..., None]
325+
if num_fsdp_shards > 1:
326+
out, (lse,) = vmapped_splash(query, key, value, segment_ids)
327+
m = lse.astype(jnp.float32)
328+
l = jnp.exp(lse - m)
329+
o = out.astype(jnp.float32) * l[..., None]
361330

362-
perm = [(j, (j + 1) % num_fsdp_shards) for j in range(num_fsdp_shards)]
331+
perm = [(j, (j + 1) % num_fsdp_shards) for j in range(num_fsdp_shards)]
363332

364-
k1 = jax.lax.ppermute(key, axis_name="fsdp", perm=perm)
365-
v1 = jax.lax.ppermute(value, axis_name="fsdp", perm=perm)
333+
k1 = jax.lax.ppermute(key, axis_name="fsdp", perm=perm)
334+
v1 = jax.lax.ppermute(value, axis_name="fsdp", perm=perm)
366335

367-
def ring_scan_body(carry, _):
368-
m, l, o, k_current, v_current = carry
369-
k_next = jax.lax.ppermute(k_current, axis_name="fsdp", perm=perm)
370-
v_next = jax.lax.ppermute(v_current, axis_name="fsdp", perm=perm)
336+
def ring_scan_body(carry, _):
337+
m, l, o, k_current, v_current = carry
338+
k_next = jax.lax.ppermute(k_current, axis_name="fsdp", perm=perm)
339+
v_next = jax.lax.ppermute(v_current, axis_name="fsdp", perm=perm)
371340

372-
out_chunk, (lse_chunk,) = vmapped_splash(query, k_current, v_current, segment_ids)
341+
out_chunk, (lse_chunk,) = vmapped_splash(query, k_current, v_current, segment_ids)
373342

374-
m_chunk = lse_chunk.astype(jnp.float32)
375-
m_old = m
376-
m = jnp.maximum(m_old, m_chunk)
343+
m_chunk = lse_chunk.astype(jnp.float32)
344+
m_old = m
345+
m = jnp.maximum(m_old, m_chunk)
377346

378-
exp_m_diff = jnp.exp(m_old - m)
379-
exp_m_chunk_diff = jnp.exp(m_chunk - m)
347+
exp_m_diff = jnp.exp(m_old - m)
348+
exp_m_chunk_diff = jnp.exp(m_chunk - m)
380349

381-
l = l * exp_m_diff + jnp.exp(lse_chunk - m)
382-
o = o * exp_m_diff[..., None]
383-
o += exp_m_chunk_diff[..., None] * out_chunk.astype(jnp.float32)
350+
l = l * exp_m_diff + jnp.exp(lse_chunk - m)
351+
o = o * exp_m_diff[..., None]
352+
o += exp_m_chunk_diff[..., None] * out_chunk.astype(jnp.float32)
384353

385-
# Return the updated state for the next iteration
386-
return (m, l, o, k_next, v_next), None
354+
# Return the updated state for the next iteration
355+
return (m, l, o, k_next, v_next), None
387356

388-
initial_carry = (m, l, o, k1, v1)
389-
(m_final, l_final, o_final, _, _), _ = jax.lax.scan(ring_scan_body, initial_carry, None, length=num_fsdp_shards - 1)
357+
initial_carry = (m, l, o, k1, v1)
358+
(m_final, l_final, o_final, _, _), _ = jax.lax.scan(ring_scan_body, initial_carry, None, length=num_fsdp_shards - 1)
390359

391-
attention_output = o_final / l_final[..., None]
392-
else:
393-
raise ValueError("ring attention requires fsdp > 1")
360+
attention_output = o_final / l_final[..., None]
361+
else:
362+
raise ValueError("ring attention requires fsdp > 1")
394363

395364
return attention_output[:, :, :query_seq_len, :kv_size].astype(query.dtype)
396365

@@ -566,7 +535,7 @@ def _apply_attention(
566535
mask_padding_tokens=mask_padding_tokens,
567536
residual_checkpoint_name=residual_checkpoint_name,
568537
)
569-
elif "ring" in attention_kernel:
538+
elif attention_kernel == "ring":
570539
return _tpu_flash_attention(
571540
query, key * scale, value, heads, mesh, axis_names_q, axis_names_kv, flash_block_sizes, dtype, attention_kernel,
572541
mask_padding_tokens=mask_padding_tokens,
@@ -577,7 +546,6 @@ def _apply_attention(
577546
raise ValueError(f"Unexpected attention kernel {attention_kernel=}.")
578547

579548

580-
581549
def _query_chunk_attention(query, key, value, precision, key_chunk_size: int = 4096):
582550
"""Multi-head dot product attention with a limited number of queries."""
583551
num_kv, num_heads, k_features = key.shape[-3:]

0 commit comments

Comments (0)