Commit 2bbf289

add fsdp/tp sharding of weights.
1 parent bee57ba commit 2bbf289

4 files changed: 27 additions & 18 deletions

src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 5 additions & 3 deletions
```diff
@@ -126,11 +126,13 @@ mesh_axes: ['data', 'fsdp', 'tensor']
 # conv_out : conv.shape[-1] weight
 logical_axis_rules: [
   ['batch', 'data'],
-  ['activation_length', 'fsdp'],
+  ['activation_length', ['fsdp','tensor']],
+  ['activation_kv_length', 'fsdp'],
   ['activation_heads', 'tensor'],
   ['activation_batch', 'data'],
-  ['mlp','tensor'],
-  ['embed','fsdp'],
+  #['mlp','tensor'],
+  ['embed',['fsdp', 'tensor']],
+  ['heads', ['tensor', 'fsdp']],
   ['norm', 'tensor'],
   ['conv_batch', ['data','fsdp']],
   ['out_channels', 'tensor'],
```
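The net effect of this hunk is that the `embed` and `heads` weight axes and the query sequence axis now shard over both the `fsdp` and `tensor` mesh axes, while the `mlp` rule is disabled. Below is a minimal sketch, not from this repository, of how Flax resolves such rules into a `PartitionSpec`; the rule values mirror the updated config, and the lookup is illustrative.

```python
# Minimal sketch: resolving logical axis rules to mesh axes with Flax.
# Names are logical axes from the config, not mesh axes themselves.
import flax.linen as nn

rules = (
    ("batch", "data"),
    ("activation_length", ("fsdp", "tensor")),
    ("embed", ("fsdp", "tensor")),
    ("heads", ("tensor", "fsdp")),
)

# For an ('embed', 'mlp') kernel: 'embed' now shards over both fsdp and
# tensor, while 'mlp' matches no rule and is left unsharded (None).
spec = nn.logical_to_mesh_axes(("embed", "mlp"), rules)
print(spec)  # PartitionSpec(('fsdp', 'tensor'), None)
```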

src/maxdiffusion/models/attention_flax.py

Lines changed: 8 additions & 8 deletions
```diff
@@ -187,9 +187,9 @@ def _tpu_flash_attention(
   value, _, _ = _reshape_data_for_flash(value, heads, block_sizes.block_kv_compute, num_fsdp_shards)
   q_axis_names = nn.logical_to_mesh_axes(axis_names_q)
   kv_axis_names = nn.logical_to_mesh_axes(axis_names_kv)
-  flash_axis_names_splash_kernel: AxisNames = (HEAD, LENGTH)
-  axis_names_splash_kernel = nn.logical_to_mesh_axes(flash_axis_names_splash_kernel)
-  named_sharding = jax.sharding.NamedSharding(mesh, axis_names_splash_kernel)
+  #flash_axis_names_splash_kernel: AxisNames = (HEAD, LENGTH)
+  #axis_names_splash_kernel = nn.logical_to_mesh_axes(flash_axis_names_splash_kernel)
+  #named_sharding = jax.sharding.NamedSharding(mesh, axis_names_splash_kernel)

   shard_head_size = mesh.shape["tensor"]

@@ -210,7 +210,7 @@ def wrap_splash_kernel(multi_head_mask, shard_head_size=1):

   multi_head_mask = splash_attention_mask.MultiHeadMask(masks=(mask,) * query.shape[1])
   splash_kernel = wrap_splash_kernel(multi_head_mask, int(shard_head_size))
-  segment_axis_names_splash_kernel = splash_kernel.manual_sharding_spec(named_sharding)
+  #segment_axis_names_splash_kernel = splash_kernel.manual_sharding_spec(named_sharding)

   @functools.partial(
       shard_map.shard_map,
@@ -219,7 +219,7 @@ def wrap_splash_kernel(multi_head_mask, shard_head_size=1):
           q_axis_names,
           kv_axis_names,
           kv_axis_names,
-          segment_axis_names_splash_kernel,
+          None,
       ),
       out_specs=q_axis_names,
       check_rep=False,
@@ -686,7 +686,7 @@ def __init__(
         dtype=dtype,
         param_dtype=weights_dtype,
         precision=precision,
-        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
+        #bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
     )

     self.key = nnx.Linear(
@@ -697,7 +697,7 @@ def __init__(
         dtype=dtype,
         param_dtype=weights_dtype,
         precision=precision,
-        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
+        #bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
     )

     self.value = nnx.Linear(
@@ -708,7 +708,7 @@ def __init__(
         dtype=dtype,
         param_dtype=weights_dtype,
         precision=precision,
-        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
+        #bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
     )

     self.proj_attn = nnx.Linear(
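```
With the splash kernel's `manual_sharding_spec` commented out (its `in_specs` entry becomes `None`), head sharding is instead derived from the size of the `tensor` mesh axis. Below is a minimal sketch, assuming a hypothetical 4-device host, of the `mesh.shape` lookup the code now relies on.

```python
# Minimal sketch (assumes 4 devices; reshape to your topology) of reading
# an axis size off a named mesh, as _tpu_flash_attention now does.
import jax
import numpy as np
from jax.sharding import Mesh

devices = np.array(jax.devices()).reshape(1, 2, 2)  # hypothetical 1x2x2 layout
mesh = Mesh(devices, axis_names=("data", "fsdp", "tensor"))

# Mesh.shape maps axis name -> size; here it determines how many ways the
# attention heads are split for the splash kernel.
shard_head_size = mesh.shape["tensor"]
print(shard_head_size)  # 2 on the assumed layout
```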

src/maxdiffusion/models/embeddings_flax.py

Lines changed: 6 additions & 6 deletions
```diff
@@ -99,7 +99,7 @@ def __init__(
             "mlp",
           ),
         ),
-        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("mlp",)),
+        #bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("mlp",)),
     )

     if cond_proj_dim is not None:
@@ -131,7 +131,7 @@ def __init__(
             "embed",
           ),
         ),
-        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
+        #bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
     )

     if post_act_fn is None:
@@ -275,11 +275,11 @@ def __init__(
         kernel_init=nnx.with_partitioning(
             nnx.initializers.xavier_uniform(),
             (
-                "embed",
                 "mlp",
+                "embed",
             ),
         ),
-        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("mlp",)),
+        #bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("mlp",)),
     )
     self.act_1 = get_activation(act_fn)

@@ -294,11 +294,11 @@ def __init__(
         kernel_init=nnx.with_partitioning(
             nnx.initializers.xavier_uniform(),
             (
-                "mlp",
                 "embed",
+                "mlp",
             ),
         ),
-        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
+        #bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
     )

   def __call__(self, caption):
```
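The pattern in both projections is the same: the kernel keeps a logical-axis annotation (with the two axis names swapped relative to the old code), while the bias annotation is commented out, leaving the bias with no sharding metadata. A minimal sketch of that pattern with hypothetical dimensions, not taken from the repository:

```python
# Minimal sketch (hypothetical dims) of the annotation pattern after this
# commit: annotated kernel, unannotated (and therefore replicated) bias.
from flax import nnx

linear_1 = nnx.Linear(
    in_features=256,
    out_features=1024,
    kernel_init=nnx.with_partitioning(
        nnx.initializers.xavier_uniform(),
        ("mlp", "embed"),  # logical names, resolved via logical_axis_rules
    ),
    # bias_init left at its default: the bias carries no sharding metadata
    rngs=nnx.Rngs(0),
)
print(linear_1.kernel.sharding)  # ('mlp', 'embed')
```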

src/maxdiffusion/models/wan/transformers/transformer_wan.py

Lines changed: 8 additions & 1 deletion
```diff
@@ -123,7 +123,7 @@ def __init__(
             "mlp",
           ),
         ),
-        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("mlp",)),
+        #bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("mlp",)),
     )
     self.text_embedder = NNXPixArtAlphaTextProjection(
         rngs=rngs,
@@ -170,6 +170,13 @@ def __init__(
         dtype=dtype,
         param_dtype=weights_dtype,
         precision=precision,
+        kernel_init=nnx.with_partitioning(
+            nnx.initializers.xavier_uniform(),
+            (
+                "embed",
+                "mlp",
+            ),
+        ),
     )

   def __call__(self, x: jax.Array) -> jax.Array:
```
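This hunk is the inverse of the bias edits above: a previously unannotated linear layer gains an `("embed", "mlp")` kernel annotation. Below is a minimal sketch, with made-up layer sizes, of recovering the resulting `PartitionSpec` from module state via `nnx.get_partition_spec`; the logical names map to mesh axes through the `logical_axis_rules` in base_wan_14b.yml.

```python
# Minimal sketch (made-up sizes): extracting the PartitionSpec produced by
# the new kernel_init annotation.
from flax import nnx

proj_out = nnx.Linear(
    in_features=512,
    out_features=128,
    kernel_init=nnx.with_partitioning(
        nnx.initializers.xavier_uniform(), ("embed", "mlp")),
    rngs=nnx.Rngs(0),
)

# The kernel entry carries PartitionSpec('embed', 'mlp'); the bias entry has
# no sharding metadata and would be treated as replicated.
print(nnx.get_partition_spec(nnx.state(proj_out)))
```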
