
Commit b0bc3a3

Flag for using the same sequence sharding for self and cross attention
Signed-off-by: Kunjan Patel <kunjanp@google.com>
1 parent 0abc904 commit b0bc3a3

2 files changed: 15 additions & 8 deletions


src/maxdiffusion/configs/base_wan_14b.yml (2 additions & 1 deletion)
@@ -58,7 +58,8 @@ from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te, ring
 flash_min_seq_length: 0
-mask_padding_tokens: True
+mask_padding_tokens: True # Whether to mask padding tokens in attention computation.
+attention_sharding_uniform: True # Apply the same sequence sharding rules to q in both self and cross attention.
 dropout: 0.1
 
 flash_block_sizes: {
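
For context, the axis-rule constants consumed below in pyconfig.py are not shown in this diff. A minimal sketch of what they might look like in maxdiffusion/common_types.py; the names are real imports from the diff, but the values here are hypothetical, purely for illustration:

# Hypothetical sketch of the constants imported from maxdiffusion.common_types;
# the names exist in the repo, but these values are illustrative assumptions.
LENGTH = "activation_length"        # assumed logical name of the q sequence axis
KV_LENGTH = "activation_kv_length"  # assumed logical name of the kv sequence axis

# Ring attention shards both the q and kv sequence axes over the mesh (assumed shape).
RING_ATTENTION_AXIS_RULES = [(LENGTH, ("fsdp",)), (KV_LENGTH, ("fsdp",))]

# Uniform sequence-parallel sharding for flash attention: shard the q sequence
# axis the same way in self and cross attention (assumed shape).
SEQUENCE_PARALLEL_AXIS_RULES = [(LENGTH, ("fsdp",))]

With attention: 'flash' and attention_sharding_uniform: True in the config above, pyconfig prepends SEQUENCE_PARALLEL_AXIS_RULES instead of the ring rules, as the second file shows.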

src/maxdiffusion/pyconfig.py (13 additions & 7 deletions)
@@ -27,7 +27,7 @@
 from . import max_logging
 from . import max_utils
 from .models.wan.wan_utils import CAUSVID_TRANSFORMER_MODEL_NAME_OR_PATH, WAN_21_FUSION_X_MODEL_NAME_OR_PATH
-from maxdiffusion.common_types import LENGTH, KV_LENGTH, RING_ATTENTION_AXIS_RULES
+from maxdiffusion.common_types import LENGTH, KV_LENGTH, RING_ATTENTION_AXIS_RULES, SEQUENCE_PARALLEL_AXIS_RULES
 
 
 def string_to_bool(s: str) -> bool:
@@ -179,8 +179,8 @@ def user_init(raw_keys):
 
   raw_keys["logical_axis_rules"] = _lists_to_tuples(raw_keys["logical_axis_rules"])
   # Verify qkv is sharded across sequence.
-  if raw_keys["attention"] == "ring":
-    max_logging.log("Using ring attention, adding sequence sharding to q and kv if not already present.")
+  if raw_keys["attention"] == "ring" or raw_keys["attention_sharding_uniform"]:
+    max_logging.log(f"Adding sequence sharding to q and kv if not already present because attention=='{raw_keys['attention']}' or attention_sharding_uniform={raw_keys['attention_sharding_uniform']} is set.")
     logical_axis_rules = list(raw_keys["logical_axis_rules"])
     max_logging.log(f"Initial logical axis rules: {logical_axis_rules}")
     new_rules = []
@@ -190,10 +190,16 @@ def user_init(raw_keys):
       logical_axis_rules.append(q_seq_sharding)
     if kv_seq_sharding not in logical_axis_rules:
       logical_axis_rules.append(kv_seq_sharding)
-    for ring_attention_axis_rule in RING_ATTENTION_AXIS_RULES:
-      if ring_attention_axis_rule not in logical_axis_rules:
-        max_logging.log(f"Adding ring attention axis rule {ring_attention_axis_rule}")
-        new_rules.append(ring_attention_axis_rule)
+    if raw_keys["attention"] == "ring":
+      for ring_attention_axis_rule in RING_ATTENTION_AXIS_RULES:
+        if ring_attention_axis_rule not in logical_axis_rules:
+          max_logging.log(f"Adding ring attention axis rule {ring_attention_axis_rule}")
+          new_rules.append(ring_attention_axis_rule)
+    else:  # attention == 'flash' but sequence-parallel sharding requested for both self and cross attention
+      for seq_parallel_axis_rule in SEQUENCE_PARALLEL_AXIS_RULES:
+        if seq_parallel_axis_rule not in logical_axis_rules:
+          max_logging.log(f"Adding sequence parallel attention axis rule {seq_parallel_axis_rule}")
+          new_rules.append(seq_parallel_axis_rule)
     raw_keys["logical_axis_rules"] = tuple(new_rules) + tuple(logical_axis_rules)
     max_logging.log(f"Final logical axis rules: {raw_keys['logical_axis_rules']}")
 

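To make the precedence of the added rules concrete, here is a small self-contained sketch of the merging logic in user_init (simplified: the q_seq_sharding/kv_seq_sharding handling is omitted, and the rule values are the hypothetical ones from the sketch above, not the real maxdiffusion constants):

# Standalone, simplified sketch of the rule merging in user_init;
# rule values are illustrative assumptions, not the repo's constants.
RING_ATTENTION_AXIS_RULES = [("activation_length", ("fsdp",)),
                             ("activation_kv_length", ("fsdp",))]
SEQUENCE_PARALLEL_AXIS_RULES = [("activation_length", ("fsdp",))]

def merge_axis_rules(logical_axis_rules, attention, attention_sharding_uniform):
    if attention != "ring" and not attention_sharding_uniform:
        return tuple(logical_axis_rules)
    rules = list(logical_axis_rules)
    source = RING_ATTENTION_AXIS_RULES if attention == "ring" else SEQUENCE_PARALLEL_AXIS_RULES
    # Only rules not already present are added, and they are prepended so they
    # take precedence over user-supplied rules when the first match wins.
    new_rules = [r for r in source if r not in rules]
    return tuple(new_rules) + tuple(rules)

user_rules = [("activation_batch", ("data",))]
print(merge_axis_rules(user_rules, "flash", True))
# (('activation_length', ('fsdp',)), ('activation_batch', ('data',)))

Prepending (rather than appending) the new sequence-sharding rules is what lets the flag override a conflicting rule a user may already have for the same logical axis.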