@@ -119,27 +119,6 @@ def __init__(
         )
 
     def __call__(self, x: jax.Array, cache_x: Optional[jax.Array] = None, idx=-1) -> jax.Array:
-        # OPTIMIZATION: Spatial Partitioning during execution
-        if self.mesh is not None and "context" in self.mesh.axis_names:
-            height = x.shape[2]
-            width = x.shape[3]
-            num_context_devices = self.mesh.shape["context"]
-
-            shard_axis = "context" if (height % num_context_devices == 0) else None
-            shard_width_axis = None
-            if shard_axis is None and width % num_context_devices == 0:
-                shard_width_axis = "context"
-
-            x = jax.lax.with_sharding_constraint(
-                x, jax.sharding.PartitionSpec("data", None, shard_axis, shard_width_axis, None)
-            )
-
-            # Debug logging
-            if shard_axis or shard_width_axis:
-                jax.debug.print(
-                    "Spatial sharding applied: height_axis={}, width_axis={} for shape {}",
-                    shard_axis, shard_width_axis, x.shape
-                )
 
         current_padding = list(self._causal_padding)  # Mutable copy
         padding_needed = self._depth_padding_before
@@ -165,6 +144,20 @@ def __call__(self, x: jax.Array, cache_x: Optional[jax.Array] = None, idx=-1) -> jax.Array:
             x_padded = jnp.pad(x, padding_to_apply, mode="constant", constant_values=0.0)
         else:
             x_padded = x
+
+        if self.mesh is not None and "context" in self.mesh.axis_names:
+            height = x_padded.shape[2]
+            width = x_padded.shape[3]
+            num_context_devices = self.mesh.shape["context"]
+
+            shard_axis = "context" if (height % num_context_devices == 0) else None
+            shard_width_axis = None
+            if shard_axis is None and width % num_context_devices == 0:
+                shard_width_axis = "context"
+
+            x_padded = jax.lax.with_sharding_constraint(
+                x_padded, jax.sharding.PartitionSpec("data", None, shard_axis, shard_width_axis, None)
+            )
 
         out = self.conv(x_padded)
         return out
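
Note on the change: moving the `with_sharding_constraint` below the padding step means the `height % num_context_devices` and `width % num_context_devices` divisibility checks now run against the *padded* shape, so an input that only becomes evenly divisible after padding still gets sharded across the "context" axis. Below is a minimal, self-contained sketch of the same constrain-if-divisible logic, assuming a mesh with "data" and "context" axes and a 5D layout whose dims 2 and 3 are height and width; the `shard_spatial` helper and the mesh setup are illustrative, not part of this PR:

```python
# Illustrative sketch only -- mirrors the divisibility-then-constrain logic
# in the diff above; shard_spatial and the mesh layout here are assumptions.
import jax
import jax.numpy as jnp
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec


def shard_spatial(x_padded: jax.Array, mesh: Mesh) -> jax.Array:
    """Shard height (dim 2) or width (dim 3) over the "context" mesh axis,
    but only when that dim divides evenly across the context devices."""
    if mesh is None or "context" not in mesh.axis_names:
        return x_padded
    num_context_devices = mesh.shape["context"]
    height, width = x_padded.shape[2], x_padded.shape[3]
    shard_axis = "context" if height % num_context_devices == 0 else None
    shard_width_axis = (
        "context" if shard_axis is None and width % num_context_devices == 0 else None
    )
    spec = PartitionSpec("data", None, shard_axis, shard_width_axis, None)
    return jax.lax.with_sharding_constraint(x_padded, NamedSharding(mesh, spec))


# Example: a 1-way "data" x N-way "context" mesh over all local devices.
devices = np.array(jax.devices()).reshape(1, -1)
mesh = Mesh(devices, axis_names=("data", "context"))


@jax.jit
def f(x):
    # If height (8) divides the context size, dim 2 is sharded; otherwise
    # width is tried, and if neither divides, the constraint is a no-op.
    return shard_spatial(x, mesh) * 2.0


x = jnp.ones((1, 4, 8, 8, 16))
print(f(x).sharding)
```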