Add spatial sharding constraints to the Wan VAE (shard the height axis over the 'fsdp' mesh axis)

prishajain1 · prishajain1 · commit 46d392fccb04 · 2025-12-26T16:43:43.000+05:30
diff --git a/src/maxdiffusion/models/wan/autoencoder_kl_wan.py b/src/maxdiffusion/models/wan/autoencoder_kl_wan.py
@@ -19,6 +19,7 @@
 import jax
 import jax.numpy as jnp
 from flax import nnx
+from jax.sharding import PartitionSpec as P
 from ...configuration_utils import ConfigMixin
 from ..modeling_flax_utils import FlaxModelMixin, get_activation
 from ... import common_types
@@ -116,6 +117,7 @@ def __call__(self, x: jax.Array, cache_x: Optional[jax.Array] = None, idx=-1) ->
       x_padded = jnp.pad(x, padding_to_apply, mode="constant", constant_values=0.0)
     else:
       x_padded = x
+    x_padded = jax.lax.with_sharding_constraint(x_padded, P(None, None, 'fsdp', None, None))
     out = self.conv(x_padded)
     return out
 
@@ -336,6 +338,7 @@ def __call__(self, x: jax.Array, feat_cache=None, feat_idx=[0]) -> jax.Array:
           x = x.reshape(b, t * 2, h, w, c)
     t = x.shape[1]
     x = x.reshape(b * t, h, w, c)
+    x = jax.lax.with_sharding_constraint(x, P(None, 'fsdp', None, None))
     x = self.resample(x)
     h_new, w_new, c_new = x.shape[1:]
     x = x.reshape(b, t, h_new, w_new, c_new)
@@ -486,6 +489,8 @@ def __call__(self, x: jax.Array):
     identity = x
     batch_size, time, height, width, channels = x.shape
 
+    x = jax.lax.with_sharding_constraint(x, P(None, None, 'fsdp', None, None))
+
     x = x.reshape(batch_size * time, height, width, channels)
     x = self.norm(x)