@@ -95,47 +95,37 @@ def __init__(
         )
 
     def __call__(self, x: jax.Array, cache_x: Optional[jax.Array] = None, idx=-1) -> jax.Array:
-        current_padding = list(self._causal_padding)  # Mutable copy
+        current_padding = list(self._causal_padding)
         padding_needed = self._depth_padding_before
 
         if cache_x is not None and padding_needed > 0:
-            # Ensure cache has same spatial/channel dims, potentially different depth
             assert cache_x.shape[0] == x.shape[0] and cache_x.shape[2:] == x.shape[2:], "Cache spatial/channel dims mismatch"
             cache_len = cache_x.shape[1]
-            x = jnp.concatenate([cache_x, x], axis=1)  # Concat along depth (D)
-
+            x = jnp.concatenate([cache_x, x], axis=1)
             padding_needed -= cache_len
             if padding_needed < 0:
-                # Cache longer than needed padding, trim from start
                 x = x[:, -padding_needed:, ...]
-                current_padding[1] = (0, 0)  # No explicit padding needed now
+                current_padding[1] = (0, 0)
             else:
-                # Update depth padding needed
                 current_padding[1] = (padding_needed, 0)
 
-        # Apply padding if any dimension requires it
         padding_to_apply = tuple(current_padding)
         if any(p > 0 for dim_pads in padding_to_apply for p in dim_pads):
             x_internal = jnp.pad(x, padding_to_apply, mode="constant", constant_values=0.0)
         else:
             x_internal = x
 
-        h_dim_after_conv_padding = x_internal.shape[2]
-        pad_h_fsdp = 0
-        if self.mesh and 'fsdp' in self.mesh.axis_names:
-            fsdp_size = self.mesh.shape['fsdp']
-            if fsdp_size > 1:
-                if h_dim_after_conv_padding % fsdp_size != 0:
-                    pad_h_fsdp = fsdp_size - (h_dim_after_conv_padding % fsdp_size)
-                    h_padding = ((0, 0), (0, 0), (0, pad_h_fsdp), (0, 0), (0, 0))
-                    x_internal = jnp.pad(x_internal, h_padding, mode="constant", constant_values=0.0)
+        # REMOVED FSDP PADDING LOGIC FROM HERE
+        # Sharding constraints are fine, but JAX will error if not divisible.
+        # This will be handled in the calling block.
         if self.mesh and 'fsdp' in self.mesh.axis_names and self.mesh.shape['fsdp'] > 1:
             x_internal = jax.lax.with_sharding_constraint(x_internal, P(None, None, 'fsdp', None, None))
 
         out = self.conv(x_internal)
         return out
 
 
+
 class WanRMS_norm(nnx.Module):
 
     def __init__(
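Note on the hunk above: the cache path keeps the same bookkeeping — cached frames are prepended along the depth axis and the remaining explicit causal padding shrinks by the cache length. A minimal standalone sketch of that bookkeeping (the shapes and depth_padding_before value are hypothetical, not the module's real state):

import jax.numpy as jnp

# Hypothetical numbers: a causal conv wanting 2 frames of left padding on depth,
# and a cache holding 1 previously seen frame.
depth_padding_before = 2
x = jnp.zeros((1, 4, 8, 8, 16))        # (N, D, H, W, C)
cache_x = jnp.zeros((1, 1, 8, 8, 16))  # same dims as x except depth

x = jnp.concatenate([cache_x, x], axis=1)                # prepend cache along depth
padding_needed = depth_padding_before - cache_x.shape[1]
if padding_needed < 0:
    x = x[:, -padding_needed:, ...]                      # cache longer than needed: drop oldest frames
    padding_needed = 0

x = jnp.pad(x, ((0, 0), (padding_needed, 0), (0, 0), (0, 0), (0, 0)))
print(x.shape)  # (1, 6, 8, 8, 16): 2 frames of left context + 4 new frames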
@@ -328,6 +318,7 @@ def __call__(self, x: jax.Array, feat_cache=None, feat_idx=[0]) -> jax.Array:
         # Input x: (N, D, H, W, C), assume C = self.dim
         b, t, h, w, c = x.shape
         assert c == self.dim
+        original_h = h
 
         if self.mode == "upsample3d":
             if feat_cache is not None:
@@ -351,32 +342,37 @@ def __call__(self, x: jax.Array, feat_cache=None, feat_idx=[0]) -> jax.Array:
                     x = x.reshape(b, t, h, w, 2, c)
                     x = jnp.stack([x[:, :, :, :, 0, :], x[:, :, :, :, 1, :]], axis=1)
                     x = x.reshape(b, t * 2, h, w, c)
+        # Update t and h as they might have changed in upsample3d
         t = x.shape[1]
         h = x.shape[2]
+        # original_h remains the height *before* this block's operations
 
         x_reshaped = x.reshape(b * t, h, w, c)
+        current_h = x_reshaped.shape[1]
 
-        original_h = x_reshaped.shape[1]
+        # --- FSDP Spatial Padding ---
         pad_h_fsdp = 0
         if self.mesh and 'fsdp' in self.mesh.axis_names:
             fsdp_size = self.mesh.shape['fsdp']
             if fsdp_size > 1:
-                if original_h % fsdp_size != 0:
-                    pad_h_fsdp = fsdp_size - (original_h % fsdp_size)
+                if current_h % fsdp_size != 0:
+                    pad_h_fsdp = fsdp_size - (current_h % fsdp_size)
                     h_padding = ((0, 0), (0, pad_h_fsdp), (0, 0), (0, 0))
                     x_reshaped = jnp.pad(x_reshaped, h_padding, mode="constant", constant_values=0.0)
+        # --- End FSDP Spatial Padding ---
 
         if self.mesh and 'fsdp' in self.mesh.axis_names and self.mesh.shape['fsdp'] > 1:
             x_reshaped = jax.lax.with_sharding_constraint(x_reshaped, P(None, 'fsdp', None, None))
 
         resampled_x = self.resample(x_reshaped)
 
+        # --- FSDP Spatial Slicing ---
         if pad_h_fsdp > 0:
             if "upsample" in self.mode:
                 scale_factor_h = 1.0
                 if isinstance(self.resample, nnx.Sequential) and isinstance(self.resample.layers[0], WanUpsample):
                     scale_factor_h = self.resample.layers[0].scale_factor[0]
-                target_h = int(original_h * scale_factor_h)
+                target_h = int(current_h * scale_factor_h)
                 resampled_x = resampled_x[:, :target_h, :, :]
             elif "downsample" in self.mode:
                 stride_h = 1
@@ -386,8 +382,14 @@ def __call__(self, x: jax.Array, feat_cache=None, feat_idx=[0]) -> jax.Array:
                     stride_h = self.resample.strides[0]
 
                 if stride_h > 1:
-                    target_h = original_h // stride_h
+                    # kernel_size and padding affect output size,
+                    # For "VALID" in ZeroPaddedConv2D (which has no other padding), out = (in - kernel + stride) // stride
+                    # Since we added padding for FSDP, we want the size as if no FSDP padding was added.
+                    k_h = self.resample.conv.kernel_size[0]
+                    target_h = (current_h - k_h + stride_h) // stride_h
                     resampled_x = resampled_x[:, :target_h, :, :]
+                # If stride_h is 1, no slicing needed as the size doesn't shrink.
+        # --- End FSDP Spatial Slicing ---
 
         h_new, w_new, c_new = resampled_x.shape[1:]
         x = resampled_x.reshape(b, t, h_new, w_new, c_new)
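The new target_h uses the standard output-size rule for a VALID convolution with stride s: out = (in - kernel + s) // s. A quick check with hypothetical sizes (height 17, 4-way fsdp axis, kernel 3, stride 2) shows the slice removes exactly the rows introduced by the FSDP padding:

# Hypothetical sizes: height 17 before FSDP padding, 4-way fsdp axis,
# downsample conv with kernel 3, stride 2, VALID padding.
current_h, fsdp_size, k_h, stride_h = 17, 4, 3, 2

pad_h_fsdp = (fsdp_size - current_h % fsdp_size) % fsdp_size   # 3 -> padded height 20
padded_h = current_h + pad_h_fsdp

out_padded = (padded_h - k_h + stride_h) // stride_h    # 9 rows after the conv on padded input
target_h = (current_h - k_h + stride_h) // stride_h     # 8 rows expected without FSDP padding
print(out_padded, target_h)                             # 9 8 -> slice off the trailing extra row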
@@ -403,10 +405,10 @@ def __call__(self, x: jax.Array, feat_cache=None, feat_idx=[0]) -> jax.Array:
                     x = self.time_conv(jnp.concatenate([feat_cache[idx][:, -1:, :, :, :], x], axis=1))
                     feat_cache[idx] = cache_x
                     feat_idx[0] += 1
-
         return x
 
 
+
 class WanResidualBlock(nnx.Module):
 
     def __init__(
@@ -421,6 +423,7 @@ def __init__(
         weights_dtype: jnp.dtype = jnp.float32,
         precision: jax.lax.Precision = None,
     ):
+        self.mesh = mesh
         self.nonlinearity = get_activation(non_linearity)
 
         # layers
@@ -464,39 +467,54 @@ def __init__(
         )
 
     def __call__(self, x: jax.Array, feat_cache=None, feat_idx=[0]):
-        # Apply shortcut connection
-        h = self.conv_shortcut(x)
+        original_shape = x.shape
+        original_h = original_shape[2]
+        original_w = original_shape[3]
+        pad_h_fsdp = 0
+        pad_w_fsdp = 0
+        x_padded = x
 
-        x = self.norm1(x)
-        x = self.nonlinearity(x)
+        if self.mesh and 'fsdp' in self.mesh.axis_names:
+            fsdp_size = self.mesh.shape['fsdp']
+            if fsdp_size > 1:
+                if original_h % fsdp_size != 0:
+                    pad_h_fsdp = fsdp_size - (original_h % fsdp_size)
+                # Assuming width is not sharded on fsdp, add if needed
+                # if original_w % fsdp_size != 0:
+                #     pad_w_fsdp = fsdp_size - (original_w % fsdp_size)
 
-        if feat_cache is not None:
-            idx = feat_idx[0]
-            cache_x = jnp.copy(x[:, -CACHE_T:, :, :, :])
-            if cache_x.shape[1] < 2 and feat_cache[idx] is not None:
-                cache_x = jnp.concatenate([jnp.expand_dims(feat_cache[idx][:, -1, :, :, :], axis=1), cache_x], axis=1)
-            x = self.conv1(x, feat_cache[idx], idx)
-            feat_cache[idx] = cache_x
-            feat_idx[0] += 1
-        else:
-            x = self.conv1(x)
+        if pad_h_fsdp > 0 or pad_w_fsdp > 0:
+            h_padding = ((0, 0), (0, 0), (0, pad_h_fsdp), (0, pad_w_fsdp), (0, 0))
+            x_padded = jnp.pad(x, h_padding, mode="constant", constant_values=0.0)
+
+        h = self.conv_shortcut(x_padded)
+
+        temp_x = self.norm1(x_padded)
+        temp_x = self.nonlinearity(temp_x)
+        temp_x = self.conv1(temp_x, cache_x=feat_cache[idx] if feat_cache else None)
+        temp_x = self.norm2(temp_x)
+        temp_x = self.nonlinearity(temp_x)
+        temp_x = self.conv2(temp_x, cache_x=feat_cache[idx] if feat_cache else None)
+
+        # --- Crop temp_x to match h's spatial dimensions ---
+        h_height, h_width = h.shape[2], h.shape[3]
+        x_height, x_width = temp_x.shape[2], temp_x.shape[3]
+
+        if x_height > h_height:
+            ch = (x_height - h_height) // 2
+            temp_x = temp_x[:, :, ch:ch + h_height, :, :]
+        if x_width > h_width:
+            cw = (x_width - h_width) // 2
+            temp_x = temp_x[:, :, :, cw:cw + h_width, :]
+        # --- End Crop ---
+
+        res_x = temp_x + h
+
+        if pad_h_fsdp > 0 or pad_w_fsdp > 0:
+            res_x = res_x[:, :, :original_h, :original_w, :]
+        return res_x
 
-        x = self.norm2(x)
-        x = self.nonlinearity(x)
-        idx = feat_idx[0]
 
-        if feat_cache is not None:
-            idx = feat_idx[0]
-            cache_x = jnp.copy(x[:, -CACHE_T:, :, :, :])
-            if cache_x.shape[1] < 2 and feat_cache[idx] is not None:
-                cache_x = jnp.concatenate([jnp.expand_dims(feat_cache[idx][:, -1, :, :, :], axis=1), cache_x], axis=1)
-            x = self.conv2(x, feat_cache[idx])
-            feat_cache[idx] = cache_x
-            feat_idx[0] += 1
-        else:
-            x = self.conv2(x)
-        x = x + h
-        return x
 
 
 class WanAttentionBlock(nnx.Module):
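The rewritten residual block follows a pad / compute / crop / slice-back pattern: pad H (and optionally W) up to a multiple of the fsdp axis size, run the norm/activation/conv stack on the padded tensor, center-crop the residual branch to the shortcut's spatial size, then slice back to the original shape. A standalone sketch of that pattern, with a hypothetical helper and shapes:

import jax.numpy as jnp

def pad_height_to_multiple(x, multiple):
    """Zero-pad axis 2 (H) of an (N, D, H, W, C) array up to a multiple of `multiple`."""
    pad_h = (multiple - x.shape[2] % multiple) % multiple
    if pad_h:
        x = jnp.pad(x, ((0, 0), (0, 0), (0, pad_h), (0, 0), (0, 0)))
    return x, pad_h

x = jnp.ones((1, 3, 15, 16, 8))              # H = 15, not divisible by a 4-way fsdp axis
x_padded, pad_h = pad_height_to_multiple(x, multiple=4)
print(x_padded.shape)                        # (1, 3, 16, 16, 8): H now shards evenly over 4

out = x_padded * 2.0                         # stand-in for the block's norm/conv computation
if pad_h:
    out = out[:, :, : x.shape[2], :, :]      # slice the padded rows back off
print(out.shape)                             # (1, 3, 15, 16, 8)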
@@ -535,25 +553,27 @@ def __init__(
         )
 
     def __call__(self, x: jax.Array):
-
         identity = x
         batch_size, time, height, width, channels = x.shape
         original_h = height
 
+        # --- FSDP Spatial Padding ---
         pad_h_fsdp = 0
+        x_padded = x
         if self.mesh and 'fsdp' in self.mesh.axis_names:
             fsdp_size = self.mesh.shape['fsdp']
             if fsdp_size > 1:
                 if original_h % fsdp_size != 0:
                     pad_h_fsdp = fsdp_size - (original_h % fsdp_size)
                     h_padding = ((0, 0), (0, 0), (0, pad_h_fsdp), (0, 0), (0, 0))
-                    x = jnp.pad(x, h_padding, mode="constant", constant_values=0.0)
+                    x_padded = jnp.pad(x, h_padding, mode="constant", constant_values=0.0)
+        # --- End FSDP Spatial Padding ---
 
         if self.mesh and 'fsdp' in self.mesh.axis_names and self.mesh.shape['fsdp'] > 1:
-            x = jax.lax.with_sharding_constraint(x, P(None, None, 'fsdp', None, None))
+            x_padded = jax.lax.with_sharding_constraint(x_padded, P(None, None, 'fsdp', None, None))
 
-        current_height = x.shape[2]
-        x_reshaped = x.reshape(batch_size * time, current_height, width, channels)
+        current_height = x_padded.shape[2]
+        x_reshaped = x_padded.reshape(batch_size * time, current_height, width, channels)
         x_normed = self.norm(x_reshaped)
 
         qkv = self.to_qkv(x_normed)
@@ -568,12 +588,16 @@ def __call__(self, x: jax.Array):
 
         x_proj = self.proj(attn_out)
         x_proj = x_proj.reshape(batch_size, time, current_height, width, channels)
+
+        # --- FSDP Spatial Slicing ---
         if pad_h_fsdp > 0:
             x_proj = x_proj[:, :, :original_h, :, :]
+        # --- End FSDP Spatial Slicing ---
 
         return x_proj + identity
 
 
+
 class WanMidBlock(nnx.Module):
 
     def __init__(
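Across these blocks the sharding hint itself is just jax.lax.with_sharding_constraint with the 'fsdp' mesh axis mapped to the height dimension; the spatial padding exists only so that dimension divides evenly across the axis (the removed-logic comment above notes JAX errors otherwise). A minimal sketch of the hint on a hypothetical single-axis mesh, using an explicit NamedSharding instead of the bare PartitionSpec the module passes:

import numpy as np
import jax
import jax.numpy as jnp
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

# Hypothetical 1-D mesh over all local devices, named 'fsdp'.
mesh = Mesh(np.array(jax.devices()), axis_names=("fsdp",))
fsdp_size = mesh.shape["fsdp"]

@jax.jit
def hint(x):
    # Keep H (axis 2 of N, D, H, W, C) sharded across the 'fsdp' axis.
    return jax.lax.with_sharding_constraint(x, NamedSharding(mesh, P(None, None, "fsdp", None, None)))

x = jnp.ones((1, 3, 4 * fsdp_size, 16, 8))  # H is a multiple of the axis size
y = hint(x)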