@@ -734,7 +734,7 @@ def __init__(
         # None axes corresponds to the stacked weights across all blocks
         # because of the use of nnx.vmap and nnx.scan.
         # Dims are [num_blocks, embed, heads]
-        kernel_axes = ("embed", "heads")
+        kernel_axes = (None, "qkv")
         qkv_init_kernel = nnx.with_partitioning(nnx.initializers.lecun_normal(), kernel_axes)

         self.query = nnx.Linear(
@@ -747,7 +747,7 @@ def __init__(
             precision=precision,
             bias_init=nnx.with_partitioning(
                 nnx.initializers.zeros,
-                ("embed",),
+                ("qkv",),
             ),
         )

@@ -761,7 +761,7 @@ def __init__(
             precision=precision,
             bias_init=nnx.with_partitioning(
                 nnx.initializers.zeros,
-                ("embed",),
+                ("qkv",),
             ),
         )

@@ -775,22 +775,19 @@ def __init__(
             precision=precision,
             bias_init=nnx.with_partitioning(
                 nnx.initializers.zeros,
-                ("embed",),
+                ("qkv",),
             ),
         )

         self.proj_attn = nnx.Linear(
             rngs=rngs,
             in_features=self.inner_dim,
             out_features=self.inner_dim,
-            kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("heads", "embed")),
+            kernel_init=nnx.with_partitioning(
+                nnx.initializers.lecun_normal(), ("proj_out", None)),
             dtype=dtype,
             param_dtype=weights_dtype,
             precision=precision,
-            bias_init=nnx.with_partitioning(
-                nnx.initializers.zeros,
-                ("heads",),
-            ),
         )

         self.drop_out = nnx.Dropout(dropout)
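For reference, here is a minimal runnable sketch (not part of this commit; the layer sizes and RNG seed are invented) of what the nnx.with_partitioning annotations above do: they attach logical axis names to the initialized parameters as metadata, which nnx.get_partition_spec can later materialize as jax.sharding.PartitionSpec entries. The "qkv" and "proj_out" logical names are assumed to be mapped to physical mesh axes elsewhere in the repo.

from flax import nnx

# Tag kernel and bias with logical axis names; None leaves that
# dimension unannotated (here, the block-stacked dimension).
layer = nnx.Linear(
    in_features=128,
    out_features=384,
    kernel_init=nnx.with_partitioning(
        nnx.initializers.lecun_normal(), (None, "qkv")
    ),
    bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("qkv",)),
    rngs=nnx.Rngs(0),
)

# The names ride along on the params and can be turned into
# PartitionSpecs for the whole state tree:
specs = nnx.get_partition_spec(nnx.state(layer))
# kernel -> PartitionSpec(None, 'qkv'); bias -> PartitionSpec('qkv')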
@@ -803,21 +800,13 @@ def __init__(
             rngs=rngs,
             epsilon=eps,
             dtype=dtype,
-            scale_init=nnx.with_partitioning(
-                nnx.initializers.ones,
-                ("norm",),
-            ),
             param_dtype=weights_dtype,
         )

         self.norm_k = nnx.RMSNorm(
             num_features=self.inner_dim,
             rngs=rngs,
             dtype=dtype,
-            scale_init=nnx.with_partitioning(
-                nnx.initializers.ones,
-                ("norm",),
-            ),
             param_dtype=weights_dtype,
         )

@@ -845,8 +834,6 @@ def __call__(
         deterministic: bool = True,
         rngs: nnx.Rngs = None,
     ) -> jax.Array:
-        hidden_states = jax.lax.with_sharding_constraint(hidden_states, PartitionSpec("data", "fsdp", "tensor"))
-        encoder_hidden_states = jax.lax.with_sharding_constraint(encoder_hidden_states, PartitionSpec("data", "fsdp", "tensor"))
         dtype = hidden_states.dtype
         if encoder_hidden_states is None:
             encoder_hidden_states = hidden_states
@@ -855,6 +842,11 @@ def __call__(
         key_proj = self.key(encoder_hidden_states)
         value_proj = self.value(encoder_hidden_states)

+        query_proj = jax.lax.with_sharding_constraint(query_proj, PartitionSpec("data", ("tensor", "fsdp"), None))
+        key_proj = jax.lax.with_sharding_constraint(key_proj, PartitionSpec("data", ("tensor", "fsdp"), None))
+        value_proj = jax.lax.with_sharding_constraint(value_proj, PartitionSpec("data", ("tensor", "fsdp"), None))
+
+
         if self.qk_norm:
             query_proj = self.norm_q(query_proj)
             key_proj = self.norm_k(key_proj)
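Similarly, a self-contained sketch of the relocated activation constraints (assumptions: exactly 4 devices for the toy 1x2x2 mesh, invented tensor shapes, and a hypothetical project_q stand-in for self.query; only the axis names and the PartitionSpec mirror the commit). The constraint now lands on the Q/K/V projection outputs rather than on the raw attention inputs, splitting batch on "data" and sequence across the combined ("tensor", "fsdp") axes while replicating features.

import numpy as np
import jax
import jax.numpy as jnp
from jax.sharding import Mesh, NamedSharding, PartitionSpec

# Illustrative mesh; axis names match the specs used in __call__.
mesh = Mesh(np.array(jax.devices()).reshape(1, 2, 2),
            ("data", "fsdp", "tensor"))

@jax.jit
def project_q(x, w_q):  # hypothetical stand-in for self.query(...)
    q = x @ w_q  # [batch, seq, embed] @ [embed, inner] -> [batch, seq, inner]
    # Constrain the projection output, as the commit now does for
    # query_proj / key_proj / value_proj.
    return jax.lax.with_sharding_constraint(
        q, NamedSharding(mesh, PartitionSpec("data", ("tensor", "fsdp"), None)))

x = jnp.ones((8, 256, 128))
w_q = jnp.ones((128, 384))
print(project_q(x, w_q).sharding)  # seq dim split 4 ways over tensor*fsdp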