Commit 10aea6a

spatial sharding added
1 parent 712cc4a commit 10aea6a

1 file changed: src/maxdiffusion/models/wan/autoencoder_kl_wan.py
Lines changed: 65 additions & 150 deletions
@@ -57,8 +57,6 @@ def __init__(
       weights_dtype: jnp.dtype = jnp.float32,
       precision: jax.lax.Precision = None,
   ):
-
-    self.mesh = mesh
     self.kernel_size = _canonicalize_tuple(kernel_size, 3, "kernel_size")
     self.stride = _canonicalize_tuple(stride, 3, "stride")
     padding_tuple = _canonicalize_tuple(padding, 3, "padding")  # (D, H, W) padding amounts
@@ -95,37 +93,35 @@ def __init__(
     )
 
   def __call__(self, x: jax.Array, cache_x: Optional[jax.Array] = None, idx=-1) -> jax.Array:
-    current_padding = list(self._causal_padding)
+    current_padding = list(self._causal_padding)  # Mutable copy
     padding_needed = self._depth_padding_before
 
     if cache_x is not None and padding_needed > 0:
+      # Ensure cache has same spatial/channel dims, potentially different depth
       assert cache_x.shape[0] == x.shape[0] and cache_x.shape[2:] == x.shape[2:], "Cache spatial/channel dims mismatch"
       cache_len = cache_x.shape[1]
-      x = jnp.concatenate([cache_x, x], axis=1)
+      x = jnp.concatenate([cache_x, x], axis=1)  # Concat along depth (D)
+
       padding_needed -= cache_len
       if padding_needed < 0:
+        # Cache longer than needed padding, trim from start
         x = x[:, -padding_needed:, ...]
-        current_padding[1] = (0, 0)
+        current_padding[1] = (0, 0)  # No explicit padding needed now
       else:
+        # Update depth padding needed
         current_padding[1] = (padding_needed, 0)
 
+    # Apply padding if any dimension requires it
     padding_to_apply = tuple(current_padding)
     if any(p > 0 for dim_pads in padding_to_apply for p in dim_pads):
-      x_internal = jnp.pad(x, padding_to_apply, mode="constant", constant_values=0.0)
+      x_padded = jnp.pad(x, padding_to_apply, mode="constant", constant_values=0.0)
     else:
-      x_internal = x
-
-    # REMOVED FSDP PADDING LOGIC FROM HERE
-    # Sharding constraints are fine, but JAX will error if not divisible.
-    # This will be handled in the calling block.
-    if self.mesh and 'fsdp' in self.mesh.axis_names and self.mesh.shape['fsdp'] > 1:
-      x_internal = jax.lax.with_sharding_constraint(x_internal, P(None, None, 'fsdp', None, None))
-
-    out = self.conv(x_internal)
+      x_padded = x
+    x_padded = jax.lax.with_sharding_constraint(x_padded, P(None, None, 'fsdp', None, None))
+    out = self.conv(x_padded)
     return out
 
 
-
 class WanRMS_norm(nnx.Module):
 
   def __init__(
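For reference, a minimal sketch of the causal depth-padding logic this hunk touches: frames cached from the previous chunk stand in for part of the causal padding, and only the remainder is zero-padded at the front of the depth axis. The padding amount and all shapes below are illustrative assumptions, not values from the file.

    import jax.numpy as jnp

    depth_padding_before = 2                # causal padding the conv needs (assumed)
    x = jnp.ones((1, 4, 8, 8, 16))          # (N, D, H, W, C): current chunk of 4 frames (assumed)
    cache_x = jnp.zeros((1, 1, 8, 8, 16))   # 1 cached frame from the previous chunk (assumed)

    padding_needed = depth_padding_before
    if cache_x is not None and padding_needed > 0:
      x = jnp.concatenate([cache_x, x], axis=1)  # prepend the cache along depth (D)
      padding_needed -= cache_x.shape[1]         # the cache covers that much of the padding

    # Whatever the cache did not cover is still zero-padded at the front of D.
    x = jnp.pad(x, ((0, 0), (max(padding_needed, 0), 0), (0, 0), (0, 0), (0, 0)))
    print(x.shape)  # (1, 6, 8, 8, 16): 4 new frames + 1 cached frame + 1 zero-padded frame

When the cache is longer than the remaining padding, the hunk instead trims the excess from the front, which is what the x[:, -padding_needed:, ...] branch handles.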
@@ -229,7 +225,6 @@ def __init__(
       weights_dtype: jnp.dtype = jnp.float32,
       precision: jax.lax.Precision = None,
   ):
-    self.mesh = mesh
     self.dim = dim
     self.mode = mode
     self.time_conv = nnx.data(None)
@@ -318,7 +313,6 @@ def __call__(self, x: jax.Array, feat_cache=None, feat_idx=[0]) -> jax.Array:
     # Input x: (N, D, H, W, C), assume C = self.dim
     b, t, h, w, c = x.shape
     assert c == self.dim
-    original_h = h
 
     if self.mode == "upsample3d":
       if feat_cache is not None:
@@ -342,57 +336,12 @@ def __call__(self, x: jax.Array, feat_cache=None, feat_idx=[0]) -> jax.Array:
         x = x.reshape(b, t, h, w, 2, c)
         x = jnp.stack([x[:, :, :, :, 0, :], x[:, :, :, :, 1, :]], axis=1)
         x = x.reshape(b, t * 2, h, w, c)
-    # Update t and h as they might have changed in upsample3d
-    t = x.shape[1]
-    h = x.shape[2]
-    # original_h remains the height *before* this block's operations
-
-    x_reshaped = x.reshape(b * t, h, w, c)
-    current_h = x_reshaped.shape[1]
-
-    # --- FSDP Spatial Padding ---
-    pad_h_fsdp = 0
-    if self.mesh and 'fsdp' in self.mesh.axis_names:
-      fsdp_size = self.mesh.shape['fsdp']
-      if fsdp_size > 1:
-        if current_h % fsdp_size != 0:
-          pad_h_fsdp = fsdp_size - (current_h % fsdp_size)
-          h_padding = ((0, 0), (0, pad_h_fsdp), (0, 0), (0, 0))
-          x_reshaped = jnp.pad(x_reshaped, h_padding, mode="constant", constant_values=0.0)
-    # --- End FSDP Spatial Padding ---
-
-    if self.mesh and 'fsdp' in self.mesh.axis_names and self.mesh.shape['fsdp'] > 1:
-      x_reshaped = jax.lax.with_sharding_constraint(x_reshaped, P(None, 'fsdp', None, None))
-
-    resampled_x = self.resample(x_reshaped)
-
-    # --- FSDP Spatial Slicing ---
-    if pad_h_fsdp > 0:
-      if "upsample" in self.mode:
-        scale_factor_h = 1.0
-        if isinstance(self.resample, nnx.Sequential) and isinstance(self.resample.layers[0], WanUpsample):
-          scale_factor_h = self.resample.layers[0].scale_factor[0]
-        target_h = int(current_h * scale_factor_h)
-        resampled_x = resampled_x[:, :target_h, :, :]
-      elif "downsample" in self.mode:
-        stride_h = 1
-        if isinstance(self.resample, ZeroPaddedConv2D):
-          stride_h = self.resample.conv.strides[0]
-        elif isinstance(self.resample, nnx.Conv):
-          stride_h = self.resample.strides[0]
-
-        if stride_h > 1:
-          # kernel_size and padding affect output size,
-          # For "VALID" in ZeroPaddedConv2D (which has no other padding), out = (in - kernel + stride) // stride
-          # Since we added padding for FSDP, we want the size as if no FSDP padding was added.
-          k_h = self.resample.conv.kernel_size[0]
-          target_h = (current_h - k_h + stride_h) // stride_h
-          resampled_x = resampled_x[:, :target_h, :, :]
-        # If stride_h is 1, no slicing needed as the size doesn't shrink.
-    # --- End FSDP Spatial Slicing ---
-
-    h_new, w_new, c_new = resampled_x.shape[1:]
-    x = resampled_x.reshape(b, t, h_new, w_new, c_new)
+    t = x.shape[1]
+    x = x.reshape(b * t, h, w, c)
+    x = jax.lax.with_sharding_constraint(x, P(None, 'fsdp', None, None))
+    x = self.resample(x)
+    h_new, w_new, c_new = x.shape[1:]
+    x = x.reshape(b, t, h_new, w_new, c_new)
 
     if self.mode == "downsample3d":
       if feat_cache is not None:
@@ -405,8 +354,8 @@ def __call__(self, x: jax.Array, feat_cache=None, feat_idx=[0]) -> jax.Array:
         x = self.time_conv(jnp.concatenate([feat_cache[idx][:, -1:, :, :, :], x], axis=1))
         feat_cache[idx] = cache_x
         feat_idx[0] += 1
-        return x
 
+    return x
 
 
 class WanResidualBlock(nnx.Module):
@@ -423,7 +372,6 @@ def __init__(
       weights_dtype: jnp.dtype = jnp.float32,
       precision: jax.lax.Precision = None,
   ):
-    self.mesh = mesh
     self.nonlinearity = get_activation(non_linearity)
 
     # layers
@@ -467,54 +415,39 @@ def __init__(
     )
 
   def __call__(self, x: jax.Array, feat_cache=None, feat_idx=[0]):
-    original_shape = x.shape
-    original_h = original_shape[2]
-    original_w = original_shape[3]
-    pad_h_fsdp = 0
-    pad_w_fsdp = 0
-    x_padded = x
-
-    if self.mesh and 'fsdp' in self.mesh.axis_names:
-      fsdp_size = self.mesh.shape['fsdp']
-      if fsdp_size > 1:
-        if original_h % fsdp_size != 0:
-          pad_h_fsdp = fsdp_size - (original_h % fsdp_size)
-        # Assuming width is not sharded on fsdp, add if needed
-        # if original_w % fsdp_size != 0:
-        #   pad_w_fsdp = fsdp_size - (original_w % fsdp_size)
-
-    if pad_h_fsdp > 0 or pad_w_fsdp > 0:
-      h_padding = ((0, 0), (0, 0), (0, pad_h_fsdp), (0, pad_w_fsdp), (0, 0))
-      x_padded = jnp.pad(x, h_padding, mode="constant", constant_values=0.0)
-
-    h = self.conv_shortcut(x_padded)
+    # Apply shortcut connection
+    h = self.conv_shortcut(x)
 
-    temp_x = self.norm1(x_padded)
-    temp_x = self.nonlinearity(temp_x)
-    temp_x = self.conv1(temp_x, cache_x=feat_cache[idx] if feat_cache else None)
-    temp_x = self.norm2(temp_x)
-    temp_x = self.nonlinearity(temp_x)
-    temp_x = self.conv2(temp_x, cache_x=feat_cache[idx] if feat_cache else None)
-
-    # --- Crop temp_x to match h's spatial dimensions ---
-    h_height, h_width = h.shape[2], h.shape[3]
-    x_height, x_width = temp_x.shape[2], temp_x.shape[3]
-
-    if x_height > h_height:
-      ch = (x_height - h_height) // 2
-      temp_x = temp_x[:, :, ch:ch + h_height, :, :]
-    if x_width > h_width:
-      cw = (x_width - h_width) // 2
-      temp_x = temp_x[:, :, :, cw:cw + h_width, :]
-    # --- End Crop ---
-
-    res_x = temp_x + h
+    x = self.norm1(x)
+    x = self.nonlinearity(x)
 
-    if pad_h_fsdp > 0 or pad_w_fsdp > 0:
-      res_x = res_x[:, :, :original_h, :original_w, :]
-    return res_x
+    if feat_cache is not None:
+      idx = feat_idx[0]
+      cache_x = jnp.copy(x[:, -CACHE_T:, :, :, :])
+      if cache_x.shape[1] < 2 and feat_cache[idx] is not None:
+        cache_x = jnp.concatenate([jnp.expand_dims(feat_cache[idx][:, -1, :, :, :], axis=1), cache_x], axis=1)
+      x = self.conv1(x, feat_cache[idx], idx)
+      feat_cache[idx] = cache_x
+      feat_idx[0] += 1
+    else:
+      x = self.conv1(x)
 
+    x = self.norm2(x)
+    x = self.nonlinearity(x)
+    idx = feat_idx[0]
 
+    if feat_cache is not None:
+      idx = feat_idx[0]
+      cache_x = jnp.copy(x[:, -CACHE_T:, :, :, :])
+      if cache_x.shape[1] < 2 and feat_cache[idx] is not None:
+        cache_x = jnp.concatenate([jnp.expand_dims(feat_cache[idx][:, -1, :, :, :], axis=1), cache_x], axis=1)
+      x = self.conv2(x, feat_cache[idx])
+      feat_cache[idx] = cache_x
+      feat_idx[0] += 1
+    else:
+      x = self.conv2(x)
+    x = x + h
+    return x
 
 
 class WanAttentionBlock(nnx.Module):
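As an aside, the rewritten residual block refreshes a per-layer feature cache before each causal conv call: it keeps the last CACHE_T frames and, when fewer than two new frames are available, borrows the final frame of the previous cache. A minimal sketch of that refresh step, with CACHE_T and all shapes assumed for illustration:

    import jax.numpy as jnp

    CACHE_T = 2                                   # assumed value of the module-level constant
    x = jnp.ones((1, 1, 4, 4, 8))                 # (N, D, H, W, C): only 1 new frame this step (assumed)
    prev_cache = jnp.zeros((1, 2, 4, 4, 8))       # cache left over from the previous step (assumed)

    cache_x = jnp.copy(x[:, -CACHE_T:, :, :, :])  # keep at most the last CACHE_T frames
    if cache_x.shape[1] < 2 and prev_cache is not None:
      # Too few new frames: borrow the last frame of the previous cache.
      cache_x = jnp.concatenate(
          [jnp.expand_dims(prev_cache[:, -1, :, :, :], axis=1), cache_x], axis=1
      )
    print(cache_x.shape)  # (1, 2, 4, 4, 8): ready to store back into feat_cache[idx]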
@@ -528,7 +461,6 @@ def __init__(
      weights_dtype: jnp.dtype = jnp.float32,
      precision: jax.lax.Precision = None,
   ):
-    self.mesh = mesh
     self.dim = dim
     self.norm = WanRMS_norm(rngs=rngs, dim=dim, channel_first=False)
     self.to_qkv = nnx.Conv(
@@ -553,49 +485,32 @@
     )
 
   def __call__(self, x: jax.Array):
+
     identity = x
     batch_size, time, height, width, channels = x.shape
-    original_h = height
-
-    # --- FSDP Spatial Padding ---
-    pad_h_fsdp = 0
-    x_padded = x
-    if self.mesh and 'fsdp' in self.mesh.axis_names:
-      fsdp_size = self.mesh.shape['fsdp']
-      if fsdp_size > 1:
-        if original_h % fsdp_size != 0:
-          pad_h_fsdp = fsdp_size - (original_h % fsdp_size)
-          h_padding = ((0, 0), (0, 0), (0, pad_h_fsdp), (0, 0), (0, 0))
-          x_padded = jnp.pad(x, h_padding, mode="constant", constant_values=0.0)
-    # --- End FSDP Spatial Padding ---
-
-    if self.mesh and 'fsdp' in self.mesh.axis_names and self.mesh.shape['fsdp'] > 1:
-      x_padded = jax.lax.with_sharding_constraint(x_padded, P(None, None, 'fsdp', None, None))
-
-    current_height = x_padded.shape[2]
-    x_reshaped = x_padded.reshape(batch_size * time, current_height, width, channels)
-    x_normed = self.norm(x_reshaped)
-
-    qkv = self.to_qkv(x_normed)
+
+    x = jax.lax.with_sharding_constraint(x, P(None, None, 'fsdp', None, None))
+
+    x = x.reshape(batch_size * time, height, width, channels)
+    x = self.norm(x)
+
+    qkv = self.to_qkv(x)  # Output: (N*D, H, W, C * 3)
+    # qkv = qkv.reshape(batch_size * time, 1, channels * 3, -1)
     qkv = qkv.reshape(batch_size * time, 1, -1, channels * 3)
     qkv = jnp.transpose(qkv, (0, 1, 3, 2))
     q, k, v = jnp.split(qkv, 3, axis=-2)
     q = jnp.transpose(q, (0, 1, 3, 2))
     k = jnp.transpose(k, (0, 1, 3, 2))
     v = jnp.transpose(v, (0, 1, 3, 2))
-    attn_out = jax.nn.dot_product_attention(q, k, v)
-    attn_out = jnp.squeeze(attn_out, 1).reshape(batch_size * time, current_height, width, channels)
-
-    x_proj = self.proj(attn_out)
-    x_proj = x_proj.reshape(batch_size, time, current_height, width, channels)
-
-    # --- FSDP Spatial Slicing ---
-    if pad_h_fsdp > 0:
-      x_proj = x_proj[:, :, :original_h, :, :]
-    # --- End FSDP Spatial Slicing ---
+    x = jax.nn.dot_product_attention(q, k, v)
+    x = jnp.squeeze(x, 1).reshape(batch_size * time, height, width, channels)
 
-    return x_proj + identity
+    # output projection
+    x = self.proj(x)
+    # Reshape back
+    x = x.reshape(batch_size, time, height, width, channels)
 
+    return x + identity
 
 
 class WanMidBlock(nnx.Module):
@@ -1234,4 +1149,4 @@ def decode(
     decoded = self._decode(z, feat_cache).sample
     if not return_dict:
       return (decoded,)
-    return FlaxDecoderOutput(sample=decoded)
+    return FlaxDecoderOutput(sample=decoded)
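Taken together, the commit drops the manual pad-height-to-a-multiple-of-fsdp / slice-back bookkeeping and instead marks the activations with sharding constraints that split the height dimension across the 'fsdp' mesh axis, leaving data placement to the compiler. A minimal, self-contained sketch of that pattern (the device mesh setup, shapes, and the explicit NamedSharding are assumptions; the file itself passes a bare PartitionSpec and relies on the mesh the model is set up with):

    import jax
    import jax.numpy as jnp
    from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

    # 1-D device mesh whose single axis is named 'fsdp', matching the axis name in the diff.
    mesh = Mesh(jax.devices(), axis_names=("fsdp",))

    @jax.jit
    def shard_height(x):
      # Constrain x so that H (axis 2 of an N, D, H, W, C tensor) is split across 'fsdp'.
      return jax.lax.with_sharding_constraint(x, NamedSharding(mesh, P(None, None, "fsdp", None, None)))

    x = jnp.ones((1, 4, 8, 8, 16))   # toy (N, D, H, W, C) activation; H chosen to divide evenly here
    print(shard_height(x).sharding)  # shows how the output is laid out across the mesh

A comment removed earlier in the diff notes that sizes which do not divide evenly along the constrained axis were a concern, which is why the earlier code padded H before constraining it; the toy H above is simply chosen to avoid that case.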
