
Commit 015ccc2 (parent: 8f1ffde)

add split self/cross attention sharding rules and configuration

12 files changed: 164 additions, 15 deletions

src/maxdiffusion/common_types.py (32 additions, 1 deletion)

@@ -33,7 +33,11 @@
 BlockSizes = splash_attention_kernel.BlockSizes

 AxisNames = tuple[str, ...]
-
+# Physical axis names for device meshes.
+DATA = "data"
+FSDP = "fsdp"
+TENSOR = "tensor"
+# Logical axis names for model parameters and activations.
 BATCH = "activation_batch"
 LENGTH = "activation_length"
 KV_LENGTH = "activation_kv_length"
@@ -44,4 +48,31 @@
 KEEP_2 = "activation_keep_2"
 CONV_OUT = "activation_conv_out_channels"

+# For setting self/cross attention independently in splash kernel
+SELF_ATTN_HEAD = "activation_self_attn_heads"
+SELF_ATTN_Q_LENGTH = "activation_self_attn_q_length"
+SELF_ATTN_KV_LENGTH = "activation_self_attn_kv_length"
+CROSS_ATTN_HEAD = "activation_cross_attn_heads"
+CROSS_ATTN_Q_LENGTH = "activation_cross_attn_q_length"
+CROSS_ATTN_KV_LENGTH = "activation_cross_attn_kv_length"
+
 WAN_MODEL = "Wan2.1"
+
+### Common axis rules for ring attention ###
+RING_ATTENTION_AXIS_RULES = [
+    [SELF_ATTN_HEAD, None],
+    [SELF_ATTN_Q_LENGTH, FSDP],
+    [SELF_ATTN_KV_LENGTH, FSDP],
+    [CROSS_ATTN_HEAD, None],
+    [CROSS_ATTN_Q_LENGTH, FSDP],
+    [CROSS_ATTN_KV_LENGTH, FSDP],
+]
+
+SEQUENCE_PARALLEL_AXIS_RULES = [
+    [SELF_ATTN_HEAD, None],
+    [SELF_ATTN_Q_LENGTH, FSDP],
+    [SELF_ATTN_KV_LENGTH, None],
+    [CROSS_ATTN_HEAD, None],
+    [CROSS_ATTN_Q_LENGTH, FSDP],
+    [CROSS_ATTN_KV_LENGTH, None],
+]
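
The two rule lists above map the new self/cross-attention logical axis names onto physical mesh axes (sharded over `fsdp` or left replicated). As a minimal sketch of how such rules are consumed, the snippet below resolves a logical spec for self-attention queries into a concrete sharding with Flax's logical-axis helpers; the tiny mesh, the query layout, and the trimmed-down rule list are assumptions for illustration, not MaxDiffusion's actual wiring.

```python
# Minimal sketch (assumed setup, not MaxDiffusion's own code): resolve logical
# axis names like those defined above into a device sharding via Flax rules.
import jax
import numpy as np
import flax.linen as nn
from jax.sharding import Mesh, PartitionSpec

DATA, FSDP, TENSOR = "data", "fsdp", "tensor"
BATCH = "activation_batch"
SELF_ATTN_HEAD = "activation_self_attn_heads"
SELF_ATTN_Q_LENGTH = "activation_self_attn_q_length"

# Subset of RING_ATTENTION_AXIS_RULES from the diff, plus a batch rule.
rules = [
    [BATCH, DATA],
    [SELF_ATTN_HEAD, None],        # heads replicated
    [SELF_ATTN_Q_LENGTH, FSDP],    # query sequence sharded over the fsdp axis
]

# Assumed 1 x N x 1 mesh over whatever devices are available.
devices = np.array(jax.devices()).reshape(1, -1, 1)
mesh = Mesh(devices, (DATA, FSDP, TENSOR))

# Logical layout of self-attention queries: (batch, heads, q_length).
logical_spec = PartitionSpec(BATCH, SELF_ATTN_HEAD, SELF_ATTN_Q_LENGTH)
sharding = nn.logical_to_mesh_sharding(logical_spec, mesh, rules)
print(sharding)  # NamedSharding: q_length split over 'fsdp', heads unsharded
```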

src/maxdiffusion/configs/base14.yml (9 additions)

@@ -50,6 +50,15 @@ jit_initializers: True
 from_pt: False
 split_head_dim: True
 attention: 'dot_product' # Supported attention: dot_product, flash
+# If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
+# Else we do not pass in segment ids and on vpu bound hardware like trillium this is faster.
+# However, when padding tokens are significant, this will lead to worse quality and should be set to True.
+mask_padding_tokens: True
+# Maxdiffusion has 2 types of attention sharding strategies:
+# 1. attention_sharding_uniform = True : same sequence sharding rules applied for q in both (self and cross attention)
+# 2. attention_sharding_uniform = False : Heads are sharded uniformly across devices for self attention while sequence is sharded
+#    in cross attention q.
+attention_sharding_uniform: True
 flash_block_sizes: {}
 # GroupNorm groups
 norm_num_groups: 32
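
The mask_padding_tokens comment refers to the splash-attention segment-id mechanism: queries and keys in different segments never attend to each other, so giving padding a segment id distinct from real tokens masks it out. A sketch of that idea for a single sequence follows; the helper name and shapes are illustrative assumptions rather than the repo's actual plumbing.

```python
# Illustrative sketch: build splash-attention SegmentIds that separate real
# tokens (segment 1) from padding (segment 0) for one sequence. This is an
# assumed helper, not MaxDiffusion's code path behind mask_padding_tokens.
import jax.numpy as jnp
from jax.experimental.pallas.ops.tpu.splash_attention import splash_attention_kernel

def make_segment_ids(valid_length: int, seq_len: int):
  positions = jnp.arange(seq_len)
  ids = (positions < valid_length).astype(jnp.int32)  # 1 = real token, 0 = padding
  # q and kv share the same ids in self attention; cross-segment pairs are masked.
  return splash_attention_kernel.SegmentIds(q=ids, kv=ids)

segment_ids = make_segment_ids(valid_length=100, seq_len=128)
```

Skipping the segment ids entirely (mask_padding_tokens: False) saves that masking work, which is what makes it faster on VPU-bound hardware such as Trillium, at the cost of attending to padding tokens.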

src/maxdiffusion/configs/base21.yml (9 additions)

@@ -49,6 +49,15 @@ jit_initializers: True
 from_pt: False
 split_head_dim: True
 attention: 'dot_product' # Supported attention: dot_product, flash
+# If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
+# Else we do not pass in segment ids and on vpu bound hardware like trillium this is faster.
+# However, when padding tokens are significant, this will lead to worse quality and should be set to True.
+mask_padding_tokens: True
+# Maxdiffusion has 2 types of attention sharding strategies:
+# 1. attention_sharding_uniform = True : same sequence sharding rules applied for q in both (self and cross attention)
+# 2. attention_sharding_uniform = False : Heads are sharded uniformly across devices for self attention while sequence is sharded
+#    in cross attention q.
+attention_sharding_uniform: True
 flash_block_sizes: {}
 # GroupNorm groups
 norm_num_groups: 32

src/maxdiffusion/configs/base_2_base.yml (9 additions)

@@ -50,6 +50,15 @@ jit_initializers: True
 from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash
+# If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
+# Else we do not pass in segment ids and on vpu bound hardware like trillium this is faster.
+# However, when padding tokens are significant, this will lead to worse quality and should be set to True.
+mask_padding_tokens: True
+# Maxdiffusion has 2 types of attention sharding strategies:
+# 1. attention_sharding_uniform = True : same sequence sharding rules applied for q in both (self and cross attention)
+# 2. attention_sharding_uniform = False : Heads are sharded uniformly across devices for self attention while sequence is sharded
+#    in cross attention q.
+attention_sharding_uniform: True
 flash_block_sizes: {}
 # to override default block sizes for flash attention
 # flash_block_sizes:

src/maxdiffusion/configs/base_flux_dev.yml (9 additions)

@@ -63,6 +63,15 @@ jit_initializers: True
 from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te
+# If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
+# Else we do not pass in segment ids and on vpu bound hardware like trillium this is faster.
+# However, when padding tokens are significant, this will lead to worse quality and should be set to True.
+mask_padding_tokens: True
+# Maxdiffusion has 2 types of attention sharding strategies:
+# 1. attention_sharding_uniform = True : same sequence sharding rules applied for q in both (self and cross attention)
+# 2. attention_sharding_uniform = False : Heads are sharded uniformly across devices for self attention while sequence is sharded
+#    in cross attention q.
+attention_sharding_uniform: True

 flash_block_sizes: {}
 # Use the following flash_block_sizes on v6e (Trillium) due to larger vmem.

src/maxdiffusion/configs/base_flux_dev_multi_res.yml (9 additions)

@@ -63,6 +63,15 @@ jit_initializers: True
 from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te
+# If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
+# Else we do not pass in segment ids and on vpu bound hardware like trillium this is faster.
+# However, when padding tokens are significant, this will lead to worse quality and should be set to True.
+mask_padding_tokens: True
+# Maxdiffusion has 2 types of attention sharding strategies:
+# 1. attention_sharding_uniform = True : same sequence sharding rules applied for q in both (self and cross attention)
+# 2. attention_sharding_uniform = False : Heads are sharded uniformly across devices for self attention while sequence is sharded
+#    in cross attention q.
+attention_sharding_uniform: True

 #flash_block_sizes: {}
 # Use the following flash_block_sizes on v6e (Trillium) due to larger vmem.

src/maxdiffusion/configs/base_flux_schnell.yml (9 additions)

@@ -62,6 +62,15 @@ jit_initializers: True
 from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te
+# If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
+# Else we do not pass in segment ids and on vpu bound hardware like trillium this is faster.
+# However, when padding tokens are significant, this will lead to worse quality and should be set to True.
+mask_padding_tokens: True
+# Maxdiffusion has 2 types of attention sharding strategies:
+# 1. attention_sharding_uniform = True : same sequence sharding rules applied for q in both (self and cross attention)
+# 2. attention_sharding_uniform = False : Heads are sharded uniformly across devices for self attention while sequence is sharded
+#    in cross attention q.
+attention_sharding_uniform: True
 flash_block_sizes: {
   "block_q" : 256,
   "block_kv_compute" : 256,

src/maxdiffusion/configs/base_wan_14b.yml (33 additions, 12 deletions)

@@ -60,18 +60,27 @@ jit_initializers: True
 from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te, ring
-flash_min_seq_length: 4096
+flash_min_seq_length: 0
+
+# If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
+# Else we do not pass in segment ids and on vpu bound hardware like trillium this is faster.
+# However, when padding tokens are significant, this will lead to worse quality and should be set to True.
+mask_padding_tokens: True
+# Maxdiffusion has 2 types of attention sharding strategies:
+# 1. attention_sharding_uniform = True : same sequence sharding rules applied for q in both (self and cross attention)
+# 2. attention_sharding_uniform = False : Heads are sharded uniformly across devices for self attention while sequence is sharded
+#    in cross attention q.
+attention_sharding_uniform: True
 dropout: 0.1

 flash_block_sizes: {
-  "block_q" : 1024,
-  "block_kv_compute" : 256,
-  "block_kv" : 1024,
-  "block_q_dkv" : 1024,
-  "block_kv_dkv" : 1024,
-  "block_kv_dkv_compute" : 256,
-  "block_q_dq" : 1024,
-  "block_kv_dq" : 1024
+  "block_q" : 2048,
+  "block_kv_compute" : 512,
+  "block_kv" : 2048,
+  "block_q_dkv" : 2048,
+  "block_kv_dkv" : 2048,
+  "block_kv_dkv_compute" : 512,
+  "use_fused_bwd_kernel": True
 }
 # Use on v6e
 # flash_block_sizes: {
@@ -80,11 +89,22 @@ flash_block_sizes: {
 # "block_kv" : 2048,
 # "block_q_dkv" : 3024,
 # "block_kv_dkv" : 2048,
-# "block_kv_dkv_compute" : 2048,
+# "block_kv_dkv_compute" : 1024,
 # "block_q_dq" : 3024,
 # "block_kv_dq" : 2048,
 # "use_fused_bwd_kernel": False,
 # }
+# Use on v5p
+# flash_block_sizes: {
+# "block_q" : 3024,
+# "block_kv_compute" : 1024,
+# "block_kv" : 2048,
+# "block_q_dkv" : 1024,
+# "block_kv_dkv" : 3072,
+# "block_kv_dkv_compute" : 256,
+# "block_q_dq" : 1024,
+# "block_kv_dq" : 3072
+# }
 # GroupNorm groups
 norm_num_groups: 32

@@ -145,8 +165,9 @@ mesh_axes: ['data', 'fsdp', 'tensor']
 logical_axis_rules: [
   ['batch', 'data'],
   ['activation_batch', 'data'],
+  ['activation_self_attn_heads', ['fsdp', 'tensor']],
+  ['activation_cross_attn_q_length', ['fsdp', 'tensor']],
   ['activation_length', 'fsdp'],
-
   ['activation_heads', 'tensor'],
   ['mlp','tensor'],
   ['embed','fsdp'],
@@ -280,7 +301,7 @@ flow_shift: 3.0
 # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
 guidance_rescale: 0.0
 num_inference_steps: 30
-fps: 24
+fps: 16
 save_final_checkpoint: False

 # SDXL Lightning parameters
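
The flash_block_sizes keys above mirror the fields of the Pallas splash-attention BlockSizes dataclass that common_types.py re-exports. Below is a hedged sketch of how such a config mapping could be turned into that object, using the new default values from this diff; the unpacking shown is illustrative and not necessarily the repo's exact helper.

```python
# Sketch: build the Pallas splash-attention BlockSizes (aliased in
# common_types.py) from a flash_block_sizes-style mapping. Values are the new
# defaults above; the construction itself is an assumption for illustration.
from jax.experimental.pallas.ops.tpu.splash_attention import splash_attention_kernel

flash_block_sizes = {
    "block_q": 2048,
    "block_kv_compute": 512,
    "block_kv": 2048,
    "block_q_dkv": 2048,
    "block_kv_dkv": 2048,
    "block_kv_dkv_compute": 512,
    "use_fused_bwd_kernel": True,  # fused backward pass: no separate dq blocks
}

block_sizes = splash_attention_kernel.BlockSizes(**flash_block_sizes)
```

With use_fused_bwd_kernel enabled the separate block_q_dq/block_kv_dq sizes are not needed, which is presumably why this diff drops them from the default mapping while the commented v6e variant (use_fused_bwd_kernel: False) keeps them.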

src/maxdiffusion/configs/base_wan_27b.yml (9 additions)

@@ -61,6 +61,15 @@ from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te, ring
 flash_min_seq_length: 4096
+# If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
+# Else we do not pass in segment ids and on vpu bound hardware like trillium this is faster.
+# However, when padding tokens are significant, this will lead to worse quality and should be set to True.
+mask_padding_tokens: True
+# Maxdiffusion has 2 types of attention sharding strategies:
+# 1. attention_sharding_uniform = True : same sequence sharding rules applied for q in both (self and cross attention)
+# 2. attention_sharding_uniform = False : Heads are sharded uniformly across devices for self attention while sequence is sharded
+#    in cross attention q.
+attention_sharding_uniform: True
 dropout: 0.1

 flash_block_sizes: {

src/maxdiffusion/configs/base_xl.yml (9 additions)

@@ -50,6 +50,15 @@ jit_initializers: True
 from_pt: False
 split_head_dim: True
 attention: 'dot_product' # Supported attention: dot_product, flash
+# If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
+# Else we do not pass in segment ids and on vpu bound hardware like trillium this is faster.
+# However, when padding tokens are significant, this will lead to worse quality and should be set to True.
+mask_padding_tokens: True
+# Maxdiffusion has 2 types of attention sharding strategies:
+# 1. attention_sharding_uniform = True : same sequence sharding rules applied for q in both (self and cross attention)
+# 2. attention_sharding_uniform = False : Heads are sharded uniformly across devices for self attention while sequence is sharded
+#    in cross attention q.
+attention_sharding_uniform: True
 flash_block_sizes: {}
 # GroupNorm groups
 norm_num_groups: 32
