Commit d5b6da3

update shardings in attn.
Parent: a8f80b7

2 files changed

Lines changed: 5 additions & 3 deletions

src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 2 additions & 3 deletions
@@ -132,11 +132,10 @@ logical_axis_rules: [
   ['mlp','tensor'],
   ['embed','fsdp'],
   ['heads', 'tensor'],
-  ['norm', 'fsdp'],
+  ['norm', 'tensor'],
   ['conv_batch', ['data','fsdp']],
   ['out_channels', 'tensor'],
-  ['conv_out', 'fsdp'],
-  ['conv_in', 'fsdp']
+  ['conv_in', 'fsdp'],
 ]
 data_sharding: [['data', 'fsdp', 'tensor']]
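
For context (an editorial note, not part of the commit): each entry of logical_axis_rules pairs a logical axis name, as used in parameter sharding annotations, with the mesh axis (or axes) it is partitioned over. After this change, parameters annotated with 'norm' are sharded over the 'tensor' mesh axis instead of 'fsdp', and the dedicated 'conv_out' rule is gone, so 'conv_out'-annotated dimensions fall back to replication. A minimal sketch of how such rules resolve, using the generic flax.linen helper (maxdiffusion's own plumbing may differ; the rule list below is copied from the updated hunk):

import flax.linen as nn

# Logical-to-mesh rules as they read after this commit (copied from the updated
# base_wan_14b.yml hunk above).
rules = (
    ("mlp", "tensor"),
    ("embed", "fsdp"),
    ("heads", "tensor"),
    ("norm", "tensor"),                # was ('norm', 'fsdp') before this commit
    ("conv_batch", ("data", "fsdp")),
    ("out_channels", "tensor"),
    ("conv_in", "fsdp"),               # the 'conv_out' rule was removed
)

# A parameter dimension annotated with the logical name 'norm' now resolves to
# the 'tensor' mesh axis; before this commit it resolved to 'fsdp'.
print(nn.logical_to_mesh_axes(("norm",), rules))      # PartitionSpec('tensor',)

# Logical names with no rule, such as 'conv_out' after this commit, map to
# None, i.e. that dimension is replicated.
print(nn.logical_to_mesh_axes(("conv_out",), rules))  # PartitionSpec(None,)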

src/maxdiffusion/models/attention_flax.py

Lines changed: 3 additions & 0 deletions
@@ -686,6 +686,7 @@ def __init__(
       dtype=dtype,
       param_dtype=weights_dtype,
       precision=precision,
+      bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
     )

     self.key = nnx.Linear(
@@ -696,6 +697,7 @@ def __init__(
       dtype=dtype,
       param_dtype=weights_dtype,
       precision=precision,
+      bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
     )

     self.value = nnx.Linear(
@@ -706,6 +708,7 @@ def __init__(
       dtype=dtype,
       param_dtype=weights_dtype,
       precision=precision,
+      bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
     )

     self.proj_attn = nnx.Linear(
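
For context (an editorial note, not part of the commit): nnx.with_partitioning wraps an initializer so the parameter it creates carries sharding metadata. With the added bias_init, the query/key/value bias vectors are tagged with the logical axis 'embed', which the logical_axis_rules in base_wan_14b.yml ('embed' -> 'fsdp') can later resolve to a mesh axis. A minimal standalone sketch of the mechanism; the feature sizes and the kernel annotation below are illustrative, not taken from attention_flax.py:

from flax import nnx

# Illustrative layer; in attention_flax.py this corresponds to self.query,
# self.key and self.value.
layer = nnx.Linear(
    in_features=128,
    out_features=128,
    # Illustrative kernel annotation; the real code may tag the kernel with
    # different logical axes.
    kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("embed", "heads")),
    # The line this commit adds: a zeros bias initializer whose output is
    # tagged with the logical axis name 'embed'.
    bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
    rngs=nnx.Rngs(0),
)

# The annotation travels with the parameter state; nnx.get_partition_spec turns
# it into the PartitionSpecs that jit / with_sharding_constraint consume.
state = nnx.state(layer)
print(nnx.get_partition_spec(state))
# In this sketch the bias leaf carries PartitionSpec('embed',) and the kernel
# leaf PartitionSpec('embed', 'heads').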
