@@ -156,7 +156,7 @@ def __call__(self, x: jax.Array, cache_x: Optional[jax.Array] = None, idx=-1) ->
         shard_width_axis = "context"

         x_padded = jax.lax.with_sharding_constraint(
-            x_padded, jax.sharding.PartitionSpec(None, None, shard_axis, shard_width_axis, None)
+            x_padded, jax.sharding.PartitionSpec("data", None, shard_axis, shard_width_axis, None)
         )

         out = self.conv(x_padded)
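For context on the one-line change above: `jax.lax.with_sharding_constraint` pins an intermediate value to a named layout inside `jit`, and the new spec additionally shards the batch dim over a `"data"` mesh axis. A minimal sketch of the pattern, assuming a hypothetical 2x2 mesh whose axis names mirror this diff's `"data"` and `"context"` (the real `shard_axis` value lives elsewhere in the file and is replaced by `None` here):

```python
import numpy as np
import jax
import jax.numpy as jnp
from jax.sharding import Mesh, NamedSharding, PartitionSpec

# Hypothetical mesh for illustration; needs 4 devices (e.g. run with
# XLA_FLAGS=--xla_force_host_platform_device_count=4 on CPU).
mesh = Mesh(np.array(jax.devices()[:4]).reshape(2, 2), ("data", "context"))

@jax.jit
def constrain(x):
    # Batch dim sharded over "data", width dim over "context"; None = replicated.
    spec = PartitionSpec("data", None, None, "context", None)
    return jax.lax.with_sharding_constraint(x, NamedSharding(mesh, spec))

x = constrain(jnp.zeros((2, 4, 8, 8, 16)))  # (N, T, H, W, C); shapes are illustrative
```

Without the constraint, XLA's SPMD partitioner is free to pick its own layout for the intermediate, which can introduce resharding around the convolution.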
@@ -1125,24 +1125,27 @@ def _encode(self, x: jax.Array, feat_cache: AutoencoderKLWanCache):
         x = jnp.transpose(x, (0, 2, 3, 4, 1))
         assert x.shape[-1] == 3, f"Expected input shape (N, D, H, W, 3), got {x.shape}"

-        t = x.shape[1]
-        iter_ = 1 + (t - 1) // 4
+        # Swap to (T, B, H, W, C) for scanning over time
+        x_scan = jnp.swapaxes(x, 0, 1)
         enc_feat_map = feat_cache._enc_feat_map

-        for i in range(iter_):
+        def scan_fn(carry_cache, input_frame):
+            # Expand the time dimension back to 1 for the encoder
+            input_frame = jnp.expand_dims(input_frame, 1)
+            # The conv-cache index restarts at 0 for every frame
             enc_conv_idx = 0
-            if i == 0:
-                out, enc_feat_map, enc_conv_idx = self.encoder(x[:, :1, :, :, :], feat_cache=enc_feat_map, feat_idx=enc_conv_idx)
-            else:
-                out_, enc_feat_map, enc_conv_idx = self.encoder(
-                    x[:, 1 + 4 * (i - 1) : 1 + 4 * i, :, :, :],
-                    feat_cache=enc_feat_map,
-                    feat_idx=enc_conv_idx,
-                )
-                out = jnp.concatenate([out, out_], axis=1)
+            out_frame, new_cache, _ = self.encoder(input_frame, feat_cache=carry_cache, feat_idx=enc_conv_idx)
+            out_frame = jnp.squeeze(out_frame, 1)
+            return new_cache, out_frame

-        # Update back to the wrapper object if needed, but for the result we use local vars
-        feat_cache._enc_feat_map = enc_feat_map
+        # Scan over time, threading the feature cache as the carry
+        final_enc_feat_map, encoded_frames = jax.lax.scan(scan_fn, enc_feat_map, x_scan)
+
+        # Swap back to (B, T, ...)
+        out = jnp.swapaxes(encoded_frames, 0, 1)
+
+        # Write the final cache back to the wrapper object
+        feat_cache._enc_feat_map = final_enc_feat_map

         enc = self.quant_conv(out)
         mu, logvar = enc[:, :, :, :, : self.z_dim], enc[:, :, :, :, self.z_dim :]
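The rewrite above is the standard carry-the-cache `lax.scan` pattern: the feature cache is the carry, each step consumes one time-major frame, and the stacked per-step outputs come back with a leading time axis. A self-contained sketch of just that pattern, with `fake_encoder` as a hypothetical stand-in for `self.encoder` (the real module's signature differs):

```python
import jax
import jax.numpy as jnp

def fake_encoder(cache, frame):
    # Hypothetical stand-in for self.encoder: mixes the frame into a cached state.
    new_cache = 0.9 * cache + 0.1 * frame
    return frame + new_cache, new_cache

x = jnp.ones((2, 5, 8))            # (B, T, F); shapes are illustrative
x_scan = jnp.swapaxes(x, 0, 1)     # (T, B, F): scan iterates over the leading axis

def scan_fn(carry_cache, frame):
    out, new_cache = fake_encoder(carry_cache, frame)
    return new_cache, out          # carry the cache forward, stack the outputs

init_cache = jnp.zeros((2, 8))
final_cache, ys = jax.lax.scan(scan_fn, init_cache, x_scan)
out = jnp.swapaxes(ys, 0, 1)       # back to (B, T, F)
```

Unlike the old Python loop, `scan` traces the body exactly once, so compile time no longer grows with the number of frames.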
@@ -1169,29 +1172,43 @@ def _decode(

         dec_feat_map = feat_cache._feat_map

-        for i in range(iter_):
-            conv_idx = 0
-            if i == 0:
-                out, dec_feat_map, conv_idx = self.decoder(x[:, i : i + 1, :, :, :], feat_cache=dec_feat_map, feat_idx=conv_idx)
-            else:
-                out_, dec_feat_map, conv_idx = self.decoder(x[:, i : i + 1, :, :, :], feat_cache=dec_feat_map, feat_idx=conv_idx)
-
-                # This is to bypass an issue where frame[1] should be frame[2] and vice versa.
-                # Ideally this shouldn't be needed; however, we can't find where the frames go out of sync.
-                # Most likely due to an incorrect reshaping in the decoder.
-                fm1, fm2, fm3, fm4 = out_[:, 0, :, :, :], out_[:, 1, :, :, :], out_[:, 2, :, :, :], out_[:, 3, :, :, :]
-                # When there is no batch dim, expand the batch dim for concatenation;
-                # else, expand the frame dim so that the batch dim stays intact.
-                axis = 0
-                if fm1.shape[0] > 1:
-                    axis = 1
-
-                if len(fm1.shape) == 4:
-                    fm1 = jnp.expand_dims(fm1, axis=axis)
-                    fm2 = jnp.expand_dims(fm2, axis=axis)
-                    fm3 = jnp.expand_dims(fm3, axis=axis)
-                    fm4 = jnp.expand_dims(fm4, axis=axis)
-                out = jnp.concatenate([out, fm1, fm3, fm2, fm4], axis=1)
+        # Evaluate the first frame manually to establish the initial cache.
+        # The decoder returns 1 frame on the first step and 4 frames on subsequent
+        # steps due to temporal upsampling, so only the rest can go through scan.
+        out_0, dec_feat_map, _ = self.decoder(x[:, 0:1, :, :, :], feat_cache=dec_feat_map, feat_idx=0)
+        out = out_0
+
+        # Process the remaining frames with jax.lax.scan (requires homogeneous output shapes)
+        if iter_ > 1:
+            x_rest = x[:, 1:, :, :, :]
+            x_scan = jnp.swapaxes(x_rest, 0, 1)  # (T-1, B, H, W, C)
+
+            def scan_fn(carry_cache, input_frame):
+                input_frame = jnp.expand_dims(input_frame, 1)  # (B, 1, H, W, C)
+                out_frames, new_cache, _ = self.decoder(input_frame, feat_cache=carry_cache, feat_idx=0)
+
+                # Bypass an issue where frame[1] should be frame[2] and vice versa.
+                # Slice with ranges so each frame keeps its time dim for concatenation:
+                fm1 = out_frames[:, 0:1, ...]
+                fm2 = out_frames[:, 1:2, ...]
+                fm3 = out_frames[:, 2:3, ...]
+                fm4 = out_frames[:, 3:4, ...]
+
+                fixed_out_frames = jnp.concatenate([fm1, fm3, fm2, fm4], axis=1)
+                return new_cache, fixed_out_frames
+
+            dec_feat_map, scanned_out_frames = jax.lax.scan(scan_fn, dec_feat_map, x_scan)
+
+            # scanned_out_frames is (T-1, B, 4, H, W, C)
+            T_minus_1, B = scanned_out_frames.shape[0], scanned_out_frames.shape[1]
+            H, W, C = scanned_out_frames.shape[3:6]
+
+            # Swap back to (B, T-1, 4, H, W, C)
+            scanned_out_frames = jnp.swapaxes(scanned_out_frames, 0, 1)
+            # Flatten the temporal axes to (B, (T-1)*4, H, W, C)
+            scanned_out_frames = jnp.reshape(scanned_out_frames, (B, T_minus_1 * 4, H, W, C))
+
+            out = jnp.concatenate([out_0, scanned_out_frames], axis=1)

         feat_cache._feat_map = dec_feat_map

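The subtle part of the decode rewrite is the shape bookkeeping after the scan: each step emits (B, 4, H, W, C), scan stacks them to (T-1, B, 4, H, W, C), and the swapaxes-plus-reshape must interleave the two temporal axes in the right order. A standalone sanity check with toy shapes, where each value encodes step * 10 + frame so an ordering mistake would show up in the printout:

```python
import jax.numpy as jnp

# Toy stand-in for the scanned decoder output; values encode step * 10 + frame.
T_minus_1, B, H, W, C = 3, 2, 1, 1, 1
steps = jnp.arange(T_minus_1)[:, None, None, None, None, None]
frames = jnp.arange(4)[None, None, :, None, None, None]
scanned = jnp.broadcast_to(steps * 10 + frames, (T_minus_1, B, 4, H, W, C))

# Same reshuffle as in the diff: (T-1, B, 4, ...) -> (B, (T-1)*4, ...)
out = jnp.reshape(jnp.swapaxes(scanned, 0, 1), (B, T_minus_1 * 4, H, W, C))
print(out[0, :, 0, 0, 0])  # [ 0  1  2  3 10 11 12 13 20 21 22 23]
```

Because reshape is row-major, flattening the (T-1, 4) pair yields index t * 4 + f, which is exactly the frame order the old Python loop produced.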