Skip to content

Commit 83585ed

Browse files
committed
sharding added
1 parent 08b9574 commit 83585ed

1 file changed

Lines changed: 1 addition & 1 deletion

File tree

src/maxdiffusion/models/attention_flax.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -716,7 +716,7 @@ def __init__(self, rngs: nnx.Rngs, dim: int, dim_out: Optional[int] = None, mult
716716
dtype=dtype,
717717
param_dtype=weights_dtype,
718718
precision=precision,
719-
kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("embed", "embed")),
719+
kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("embed", None)),
720720
bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
721721
)
722722
self.act = get_activation(activation_fn)

0 commit comments

Comments (0)