Commit 2c361e4 (parent: b4f9507)

Fix sharding attempt #1 -> 128 TPU test needed

3 files changed: 10 additions & 3 deletions


src/maxdiffusion/configs/base14.yml (1 addition & 1 deletion)

@@ -58,7 +58,7 @@ mask_padding_tokens: True
 # 1. attention_sharding_uniform = True : same sequence sharding rules applied for q in both (self and cross attention)
 # 2. attention_sharding_uniform = False : Heads are sharded uniformly across devices for self attention while sequence is sharded
 #    in cross attention q.
-attention_sharding_uniform: True
+attention_sharding_uniform: False
 flash_block_sizes: {}
 # GroupNorm groups
 norm_num_groups: 32
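Flipping the default to False means self-attention q no longer follows the same sequence-sharding rule as cross-attention q. A minimal sketch of the branch this flag implies, using the logical axis names from base_wan_14b.yml below; the helper itself is hypothetical, not maxdiffusion's actual attention code:

# Hypothetical helper (not maxdiffusion's actual code): the flag selects which
# logical axis names annotate the attention query activation.
def q_logical_axes(attention_sharding_uniform: bool, is_self_attention: bool):
  # Logical axes for a q tensor of shape (batch, q_length, heads, head_dim).
  if attention_sharding_uniform or not is_self_attention:
    # Uniform: q uses the same sequence-sharding rule in self and cross attention.
    return ('activation_batch', 'activation_cross_attn_q_length', 'activation_heads', None)
  # Non-uniform: self-attention q gets its own sequence axis, mapped onto the
  # mesh by the logical_axis_rules added in base_wan_14b.yml below.
  return ('activation_batch', 'activation_self_attn_q_length', 'activation_heads', None)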

src/maxdiffusion/configs/base_wan_14b.yml (3 additions & 2 deletions)

@@ -70,7 +70,7 @@ mask_padding_tokens: True
 # 1. attention_sharding_uniform = True : same sequence sharding rules applied for q in both (self and cross attention)
 # 2. attention_sharding_uniform = False : Heads are sharded uniformly across devices for self attention while sequence is sharded
 #    in cross attention q.
-attention_sharding_uniform: True
+attention_sharding_uniform: False
 dropout: 0.1
 
 flash_block_sizes: {

@@ -165,7 +165,8 @@ mesh_axes: ['data', 'fsdp', 'tensor']
 logical_axis_rules: [
   ['batch', 'data'],
   ['activation_batch', 'data'],
-  ['activation_self_attn_heads', ['fsdp', 'tensor']],
+  ['activation_self_attn_q_length', 'fsdp'],
+  ['activation_self_attn_kv_length', 'fsdp'],
   ['activation_cross_attn_q_length', ['fsdp', 'tensor']],
   ['activation_length', 'fsdp'],
   ['activation_heads', 'tensor'],
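The new rules shard the self-attention q/kv sequence length over fsdp only, replacing the old rule that sharded self-attention heads over both fsdp and tensor. A minimal sketch of how such rules resolve to a mesh sharding, using flax's logical-partitioning helper; the 8-device mesh shape is illustrative (the commit message targets a 128-chip TPU test):

# Sketch only: rule names mirror the config, mesh shape is illustrative.
import jax
import numpy as np
import flax.linen as nn
from jax.sharding import Mesh, PartitionSpec

devices = np.array(jax.devices()).reshape(1, 4, 2)  # (data, fsdp, tensor); assumes 8 devices
mesh = Mesh(devices, ('data', 'fsdp', 'tensor'))

rules = (
    ('activation_batch', 'data'),
    ('activation_self_attn_q_length', 'fsdp'),   # rule added by this commit
    ('activation_self_attn_kv_length', 'fsdp'),  # rule added by this commit
    ('activation_heads', 'tensor'),
)

# Logical spec for a self-attention q activation: (batch, q_length, heads, head_dim).
logical = PartitionSpec('activation_batch', 'activation_self_attn_q_length', 'activation_heads', None)
sharding = nn.logical_to_mesh_sharding(logical, mesh, rules)
print(sharding.spec)  # expected: batch on 'data', sequence on 'fsdp', heads on 'tensor'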

src/maxdiffusion/models/wan/transformers/transformer_wan.py (6 additions & 0 deletions)

@@ -187,6 +187,8 @@ def __init__(
 
   def __call__(self, x: jax.Array) -> jax.Array:
     x = self.proj(x)
+    jax.debug.print("ApproximateGELU activation shape: {shape}", shape=x.shape)
+    jax.debug.inspect_array_sharding(x, callback=print)
     return nnx.gelu(x)
 

@@ -245,7 +247,11 @@ def conditional_named_scope(self, name: str):
 
   def __call__(self, hidden_states: jax.Array, deterministic: bool = True, rngs: nnx.Rngs = None) -> jax.Array:
     with self.conditional_named_scope("mlp_up_proj_and_gelu"):
+      jax.debug.print("MLP input shape: {shape}", shape=hidden_states.shape)
+      jax.debug.inspect_array_sharding(hidden_states, callback=print)
       hidden_states = self.act_fn(hidden_states)  # Output is (4, 75600, 13824)
+      jax.debug.print("MLP intermediate activation shape: {shape}", shape=hidden_states.shape)
+      jax.debug.inspect_array_sharding(hidden_states, callback=print)
       hidden_states = checkpoint_name(hidden_states, "ffn_activation")
       hidden_states = self.drop_out(hidden_states, deterministic=deterministic, rngs=rngs)
     with self.conditional_named_scope("mlp_down_proj"):
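jax.debug.print and jax.debug.inspect_array_sharding are standard JAX debugging hooks that work inside jitted code. A minimal self-contained sketch of the pattern these additions use; the mesh shape and array sizes here are illustrative, not the 14B model's:

# Sketch of the debug-print + sharding-inspection pattern under jit.
import jax
import jax.numpy as jnp
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec

# Illustrative 1-D mesh over whatever devices are available.
mesh = Mesh(np.array(jax.devices()).reshape(-1), ('fsdp',))
# Assumes the device count divides the leading dimension (8).
x = jax.device_put(jnp.ones((8, 16)), NamedSharding(mesh, PartitionSpec('fsdp', None)))

@jax.jit
def f(x):
  # Log the activation shape from inside a jitted function.
  jax.debug.print("activation shape: {shape}", shape=x.shape)
  # Report the sharding the compiler chose for x at this point in the program.
  jax.debug.inspect_array_sharding(x, callback=print)
  return jax.nn.gelu(x)

f(x)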
