
Commit d6542fb (1 parent: 097f4c3)

Add Wan ulysses_fsdp attention path

12 files changed: 1057 additions & 143 deletions

docs/wan_animate_ulysses_fsdp_walkthrough.md

Lines changed: 515 additions & 0 deletions
Large diffs are not rendered by default.

src/maxdiffusion/common_types.py

Lines changed: 31 additions & 13 deletions

@@ -66,21 +66,39 @@
 
 WAN_MODEL = "Wan2.1"
 
-### Common axis rules for ring attention ###
+### Common axis rules for attention sharding ###
 RING_ATTENTION_AXIS_RULES = [
-    [SELF_ATTN_HEAD, None],
-    [SELF_ATTN_Q_LENGTH, CONTEXT],
-    [SELF_ATTN_KV_LENGTH, CONTEXT],
-    [CROSS_ATTN_HEAD, None],
-    [CROSS_ATTN_Q_LENGTH, CONTEXT],
-    [CROSS_ATTN_KV_LENGTH, CONTEXT],
+    (SELF_ATTN_HEAD, None),
+    (SELF_ATTN_Q_LENGTH, CONTEXT),
+    (SELF_ATTN_KV_LENGTH, CONTEXT),
+    (CROSS_ATTN_HEAD, None),
+    (CROSS_ATTN_Q_LENGTH, CONTEXT),
+    (CROSS_ATTN_KV_LENGTH, CONTEXT),
 ]
 
 SEQUENCE_PARALLEL_AXIS_RULES = [
-    [SELF_ATTN_HEAD, None],
-    [SELF_ATTN_Q_LENGTH, CONTEXT],
-    [SELF_ATTN_KV_LENGTH, None],
-    [CROSS_ATTN_HEAD, None],
-    [CROSS_ATTN_Q_LENGTH, CONTEXT],
-    [CROSS_ATTN_KV_LENGTH, None],
+    (SELF_ATTN_HEAD, None),
+    (SELF_ATTN_Q_LENGTH, CONTEXT),
+    (SELF_ATTN_KV_LENGTH, None),
+    (CROSS_ATTN_HEAD, None),
+    (CROSS_ATTN_Q_LENGTH, CONTEXT),
+    (CROSS_ATTN_KV_LENGTH, None),
+]
+
+ULYSSES_ATTENTION_AXIS_RULES = [
+    (SELF_ATTN_HEAD, None),
+    (SELF_ATTN_Q_LENGTH, CONTEXT),
+    (SELF_ATTN_KV_LENGTH, CONTEXT),
+    (CROSS_ATTN_HEAD, None),
+    (CROSS_ATTN_Q_LENGTH, CONTEXT),
+    (CROSS_ATTN_KV_LENGTH, CONTEXT),
+]
+
+ULYSSES_FSDP_ATTENTION_AXIS_RULES = [
+    (SELF_ATTN_HEAD, None),
+    (SELF_ATTN_Q_LENGTH, FSDP),
+    (SELF_ATTN_KV_LENGTH, FSDP),
+    (CROSS_ATTN_HEAD, None),
+    (CROSS_ATTN_Q_LENGTH, FSDP),
+    (CROSS_ATTN_KV_LENGTH, FSDP),
 ]
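Each pair in these tables maps a logical activation axis to a physical mesh axis (None meaning replicated), so the ulysses table shards attention lengths over the context axis while ulysses_fsdp reuses the fsdp axis instead. Below is a minimal sketch of how such a table resolves to a PartitionSpec via Flax's logical partitioning helper; the axis-name strings are illustrative stand-ins for the repo's SELF_ATTN_* / FSDP constants, not copies of their actual values.

```python
# Sketch only: how a rule table like ULYSSES_FSDP_ATTENTION_AXIS_RULES is
# consumed. Axis-name strings below are assumed, not maxdiffusion's values.
from flax import linen as nn

FSDP = "fsdp"
SELF_ATTN_HEAD = "activation_self_attn_heads"
SELF_ATTN_Q_LENGTH = "activation_self_attn_q_length"

rules = [
    (SELF_ATTN_HEAD, None),      # heads replicated: every device keeps all heads
    (SELF_ATTN_Q_LENGTH, FSDP),  # query tokens sharded over the fsdp mesh axis
]

# Resolve a (batch, q_length, heads, head_dim) activation to a PartitionSpec.
spec = nn.logical_to_mesh_axes(
    ("activation_batch", SELF_ATTN_Q_LENGTH, SELF_ATTN_HEAD, None),
    rules=rules + [("activation_batch", "data")],
)
print(spec)  # PartitionSpec('data', 'fsdp', None, None)
```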

src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 7 additions & 7 deletions

@@ -60,7 +60,7 @@ jit_initializers: True
 # Set true to load weights from pytorch
 from_pt: True
 split_head_dim: True
-attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te, ring
+attention: 'flash' # Supported attention: dot_product, flash, ulysses, ulysses_fsdp, cudnn_flash_te, ring
 flash_min_seq_length: 0
 
 # If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.

@@ -166,19 +166,19 @@ mesh_axes: ['data', 'fsdp', 'context', 'tensor']
 # conv_in : conv.shape[2] weight
 # conv_out : conv.shape[-1] weight
 logical_axis_rules: [
-  ['batch', ['data', 'fsdp']],
-  ['activation_batch', ['data', 'fsdp']],
+  ['batch', 'data'],
+  ['activation_batch', 'data'],
   ['activation_self_attn_heads', ['context', 'tensor']],
   ['activation_cross_attn_q_length', ['context', 'tensor']],
   ['activation_length', 'context'],
   ['activation_heads', 'tensor'],
   ['mlp','tensor'],
-  ['embed', ['context', 'fsdp']],
+  ['embed', 'fsdp'],
   ['heads', 'tensor'],
   ['norm', 'tensor'],
-  ['conv_batch', ['data', 'context', 'fsdp']],
+  ['conv_batch', 'data'],
   ['out_channels', 'tensor'],
-  ['conv_out', 'context'],
+  ['conv_out', 'fsdp'],
 ]
 data_sharding: [['data', 'fsdp', 'context', 'tensor']]

@@ -386,4 +386,4 @@ eval_data_dir: ""
 enable_generate_video_for_eval: False # This will increase the used TPU memory.
 eval_max_number_of_samples_in_bucket: 60 # The number of samples per bucket for evaluation. This is calculated by num_eval_samples / len(timesteps_list).
 
-enable_ssim: False
+enable_ssim: False
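For context on the new 'ulysses' and 'ulysses_fsdp' options in the comment above: Ulysses-style attention keeps activations sequence-sharded outside attention, then uses an all-to-all to trade sequence shards for head shards, so each device computes full-sequence attention for a subset of heads before a second all-to-all restores sequence sharding. A sketch of the idea under stated assumptions (this is not the repo's implementation; it assumes both sequence length and head count divide the mesh size):

```python
# Sketch only: the Ulysses all-to-all pattern behind 'ulysses'/'ulysses_fsdp'.
# Not this repo's code; seq and heads are assumed divisible by the mesh size.
import jax
import jax.numpy as jnp
from jax.sharding import Mesh, PartitionSpec as P
from jax.experimental.shard_map import shard_map

mesh = Mesh(jax.devices(), ("fsdp",))  # 'ulysses_fsdp' shards tokens on fsdp

def ulysses_attention(q, k, v):
  # Local shapes in: (batch, seq/n, heads, head_dim).
  # First all-to-all trades sequence shards for head shards.
  q, k, v = (jax.lax.all_to_all(x, "fsdp", split_axis=2, concat_axis=1, tiled=True)
             for x in (q, k, v))
  # Local shapes now: (batch, seq, heads/n, head_dim): full sequence, few heads.
  scores = jnp.einsum("bqhd,bkhd->bhqk", q, k) / jnp.sqrt(q.shape[-1])
  out = jnp.einsum("bhqk,bkhd->bqhd", jax.nn.softmax(scores, axis=-1), v)
  # Second all-to-all restores (batch, seq/n, heads, head_dim).
  return jax.lax.all_to_all(out, "fsdp", split_axis=1, concat_axis=2, tiled=True)

sharded_attn = shard_map(ulysses_attention, mesh=mesh,
                         in_specs=(P(None, "fsdp"),) * 3,
                         out_specs=P(None, "fsdp"))
```

This pattern matches the new rule tables in common_types.py: the head axes stay unsharded (None) while the q/kv length axes shard over CONTEXT ('ulysses') or FSDP ('ulysses_fsdp').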

src/maxdiffusion/configs/base_wan_1_3b.yml

Lines changed: 6 additions & 6 deletions

@@ -60,7 +60,7 @@ jit_initializers: True
 # Set true to load weights from pytorch
 from_pt: True
 split_head_dim: True
-attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te, ring
+attention: 'flash' # Supported attention: dot_product, flash, ulysses, ulysses_fsdp, cudnn_flash_te, ring
 flash_min_seq_length: 0
 
 # If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.

@@ -143,19 +143,19 @@ mesh_axes: ['data', 'fsdp', 'context', 'tensor']
 # conv_in : conv.shape[2] weight
 # conv_out : conv.shape[-1] weight
 logical_axis_rules: [
-  ['batch', ['data', 'fsdp']],
-  ['activation_batch', ['data', 'fsdp']],
+  ['batch', 'data'],
+  ['activation_batch', 'data'],
   ['activation_self_attn_heads', ['context', 'tensor']],
   ['activation_cross_attn_q_length', ['context', 'tensor']],
   ['activation_length', 'context'],
   ['activation_heads', 'tensor'],
   ['mlp','tensor'],
-  ['embed', ['context', 'fsdp']],
+  ['embed', 'fsdp'],
   ['heads', 'tensor'],
   ['norm', 'tensor'],
-  ['conv_batch', ['data', 'context', 'fsdp']],
+  ['conv_batch', 'data'],
   ['out_channels', 'tensor'],
-  ['conv_out', 'context'],
+  ['conv_out', 'fsdp'],
 ]
 data_sharding: [['data', 'fsdp', 'context', 'tensor']]
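The logical_axis_rules edits above move weights as well as activations: a rule like ['embed', 'fsdp'] means any parameter dimension annotated with the logical name 'embed' now shards along the fsdp mesh axis alone. A hedged sketch of how such an annotation is typically attached in Flax (the module and axis names below are illustrative, not from this repo):

```python
# Illustrative only: how a logical-axis annotation on a weight picks up a rule
# like ['embed', 'fsdp']. This module is not from maxdiffusion.
import jax.numpy as jnp
from flax import linen as nn

class Proj(nn.Module):
  features: int

  @nn.compact
  def __call__(self, x):
    kernel = self.param(
        "kernel",
        # Tag kernel dims as ('embed', 'mlp'); the rule table then maps
        # 'embed' -> fsdp and 'mlp' -> tensor when shardings are resolved.
        nn.with_logical_partitioning(nn.initializers.lecun_normal(),
                                     ("embed", "mlp")),
        (x.shape[-1], self.features),
        jnp.float32,
    )
    return x @ kernel
```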

src/maxdiffusion/configs/base_wan_27b.yml

Lines changed: 7 additions & 7 deletions

@@ -60,7 +60,7 @@ jit_initializers: True
 # Set true to load weights from pytorch
 from_pt: True
 split_head_dim: True
-attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te, ring
+attention: 'flash' # Supported attention: dot_product, flash, ulysses, ulysses_fsdp, cudnn_flash_te, ring
 flash_min_seq_length: 4096
 # If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
 # Else we do not pass in segment ids and on vpu bound hardware like trillium this is faster.

@@ -154,19 +154,19 @@ mesh_axes: ['data', 'fsdp', 'context', 'tensor']
 # conv_in : conv.shape[2] weight
 # conv_out : conv.shape[-1] weight
 logical_axis_rules: [
-  ['batch', ['data', 'fsdp']],
-  ['activation_batch', ['data', 'fsdp']],
+  ['batch', 'data'],
+  ['activation_batch', 'data'],
   ['activation_self_attn_heads', ['context', 'tensor']],
   ['activation_cross_attn_q_length', ['context', 'tensor']],
   ['activation_length', 'context'],
   ['activation_heads', 'tensor'],
   ['mlp','tensor'],
-  ['embed', ['context', 'fsdp']],
+  ['embed', 'fsdp'],
   ['heads', 'tensor'],
   ['norm', 'tensor'],
-  ['conv_batch', ['data', 'context', 'fsdp']],
+  ['conv_batch', 'data'],
   ['out_channels', 'tensor'],
-  ['conv_out', 'context'],
+  ['conv_out', 'fsdp'],
 ]
 data_sharding: [['data', 'fsdp', 'context', 'tensor']]

@@ -364,4 +364,4 @@ eval_data_dir: ""
 enable_generate_video_for_eval: False # This will increase the used TPU memory.
 eval_max_number_of_samples_in_bucket: 60 # The number of samples per bucket for evaluation. This is calculated by num_eval_samples / len(timesteps_list).
 
-enable_ssim: False
+enable_ssim: False

src/maxdiffusion/configs/base_wan_animate_27b.yml

Lines changed: 6 additions & 6 deletions

@@ -62,7 +62,7 @@ jit_initializers: True
 # Set true to load weights from pytorch
 from_pt: True
 split_head_dim: True
-attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te, ring
+attention: 'flash' # Supported attention: dot_product, flash, ulysses, ulysses_fsdp, cudnn_flash_te, ring
 flash_min_seq_length: 4096
 # If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
 # Else we do not pass in segment ids and on vpu bound hardware like trillium this is faster.

@@ -156,19 +156,19 @@ mesh_axes: ['data', 'fsdp', 'context', 'tensor']
 # conv_in : conv.shape[2] weight
 # conv_out : conv.shape[-1] weight
 logical_axis_rules: [
-  ['batch', ['data', 'fsdp']],
-  ['activation_batch', ['data', 'fsdp']],
+  ['batch', 'data'],
+  ['activation_batch', 'data'],
   ['activation_self_attn_heads', ['context', 'tensor']],
   ['activation_cross_attn_q_length', ['context', 'tensor']],
   ['activation_length', 'context'],
   ['activation_heads', 'tensor'],
   ['mlp','tensor'],
-  ['embed', ['context', 'fsdp']],
+  ['embed', 'fsdp'],
   ['heads', 'tensor'],
   ['norm', 'tensor'],
-  ['conv_batch', ['data', 'context', 'fsdp']],
+  ['conv_batch', 'data'],
   ['out_channels', 'tensor'],
-  ['conv_out', 'context'],
+  ['conv_out', 'fsdp'],
 ]
 data_sharding: [['data', 'fsdp', 'context', 'tensor']]
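All four configs now advertise the same six attention options, which suggests the selection happens in one place. A speculative sketch of that dispatch follows; the helper name and the empty-list fallback are assumptions, not code from this commit:

```python
# Speculative sketch: mapping the config's `attention` string to the rule
# tables added in common_types.py. Helper name and fallback are assumed.
from maxdiffusion import common_types

_ATTENTION_AXIS_RULES = {
    "ring": common_types.RING_ATTENTION_AXIS_RULES,
    "ulysses": common_types.ULYSSES_ATTENTION_AXIS_RULES,
    "ulysses_fsdp": common_types.ULYSSES_FSDP_ATTENTION_AXIS_RULES,
}

def attention_axis_rules(attention_kind: str):
  """Extra logical-axis rules for the chosen attention path ([] for flash etc.)."""
  return _ATTENTION_AXIS_RULES.get(attention_kind, [])
```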
