Commit 66146b9

fixes jittery decoder frames in vae.

Parent: 04f4909

1 file changed: src/maxdiffusion/models/wan/autoencoder_kl_wan.py
14 additions, 72 deletions

@@ -94,50 +94,31 @@ def __init__(
     )
 
   def __call__(self, x: jax.Array, cache_x: Optional[jax.Array] = None, idx=-1) -> jax.Array:
-    print("wanCausalConv3d, x min: ", np.min(x))
-    print("wanCausalConv3d, x max: ", np.max(x))
     current_padding = list(self._causal_padding)  # Mutable copy
     padding_needed = self._depth_padding_before
 
     if cache_x is not None and padding_needed > 0:
-      print("WanCausalConv3d, cache.shape: ", cache_x.shape)
-      print("wanCausalConv3d, cache_x min: ", np.min(cache_x))
-      print("wanCausalConv3d, cache_x max: ", np.max(cache_x))
       # Ensure cache has same spatial/channel dims, potentially different depth
       assert cache_x.shape[0] == x.shape[0] and cache_x.shape[2:] == x.shape[2:], "Cache spatial/channel dims mismatch"
       cache_len = cache_x.shape[1]
       x = jnp.concatenate([cache_x, x], axis=1)  # Concat along depth (D)
 
      padding_needed -= cache_len
      if padding_needed < 0:
-        print("wanCausanConv3d, padding_needed < 0")
        # Cache longer than needed padding, trim from start
        x = x[:, -padding_needed:, ...]
        current_padding[1] = (0, 0)  # No explicit padding needed now
      else:
        # Update depth padding needed
-        print("wanCausanConv3d, padding_needed > 0")
        current_padding[1] = (padding_needed, 0)
 
    # Apply padding if any dimension requires it
    padding_to_apply = tuple(current_padding)
-    print("WanCausalConv3d, before padding x shape: ", x.shape)
    if any(p > 0 for dim_pads in padding_to_apply for p in dim_pads):
-      print("WanCausalConv3d, applying padding")
      x_padded = jnp.pad(x, padding_to_apply, mode="constant", constant_values=0.0)
    else:
-      print("WanCausalConv3d, NOT applying padding")
      x_padded = x
-
-    print("WanCausalConv3d, x shape: ", x_padded.shape)
-    print("wanCausalConv3d, x min: ", np.min(x_padded))
-    print("wanCausalConv3d, x max: ", np.max(x_padded))
-    # if idx == 12:
-    #   breakpoint()
    out = self.conv(x_padded)
-    print("WanCausalConv3d, after conv, x shape: ", out.shape)
-    print("wanCausalConv3d, x min: ", np.min(out))
-    print("wanCausalConv3d, x max: ", np.max(out))
    return out
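
Note on the surviving logic above: the convolution is causal along the depth (frame) axis, so zero padding goes only in front of the first frame, and cached frames from the previous chunk can stand in for part of that padding. A minimal standalone sketch of the idea (shapes and names here are illustrative, not the file's API):

import jax.numpy as jnp

# (B, D, H, W, C) layout, matching the axis=1 depth concatenation above.
x = jnp.ones((1, 5, 8, 8, 16))
k_depth = 3  # temporal kernel size (illustrative)

# Causal padding: k_depth - 1 zero frames in front, none behind, so each
# output frame depends only on itself and earlier frames.
causal_pad = ((0, 0), (k_depth - 1, 0), (0, 0), (0, 0), (0, 0))
x_causal = jnp.pad(x, causal_pad, mode="constant", constant_values=0.0)
print(x_causal.shape)  # (1, 7, 8, 8, 16)

# With a cache of previous frames, real history replaces part of the zeros:
cache = jnp.zeros((1, 2, 8, 8, 16))
x_cached = jnp.concatenate([cache, x], axis=1)  # less explicit padding needed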

@@ -300,8 +281,8 @@ def __call__(self, x: jax.Array, feat_cache=None, feat_idx=[0]) -> jax.Array:
       if cache_x.shape[1] < 2 and feat_cache[idx] is not None and feat_cache[idx] != "Rep":
         # cache last frame of last two chunk
         cache_x = jnp.concatenate([jnp.expand_dims(feat_cache[idx][:, -1, :, :, :], axis=1), cache_x], axis=1)
-      if cache_x.shape[1] < 2 and feat_cache[idx] is not None and feat_cache[idx] != "Rep":
-        cache_x = jnp.concatenate([jnp.zeros(cache_x.shape), cache_x], dim=1)
+      if cache_x.shape[1] < 2 and feat_cache[idx] is not None and feat_cache[idx] == "Rep":
+        cache_x = jnp.concatenate([jnp.zeros(cache_x.shape), cache_x], axis=1)
       if feat_cache[idx] == "Rep":
         x = self.time_conv(x)
       else:
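
This hunk fixes two bugs in the duplicated branch: the condition now checks for the "Rep" sentinel (== "Rep") instead of repeating the previous branch's != "Rep", and the PyTorch-style dim=1 keyword is replaced with JAX's axis=1. A quick sketch of why the removed line could never have run:

import jax.numpy as jnp

cache_x = jnp.ones((1, 1, 4, 4, 8))  # one cached frame, (B, T, H, W, C)

# jnp.concatenate only accepts `axis=`; the removed `dim=1` (a PyTorch-ism)
# raises a TypeError, so that branch could not have executed successfully.
padded = jnp.concatenate([jnp.zeros(cache_x.shape), cache_x], axis=1)
print(padded.shape)  # (1, 2, 4, 4, 8)
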
@@ -364,13 +345,10 @@ def __call__(self, x: jax.Array, feat_cache=None, feat_idx=[0]):
 
     if feat_cache is not None:
       idx = feat_idx[0]
-      print("Before conv1, idx: ", idx)
       cache_x = jnp.copy(x[:, -CACHE_T:, :, :, :])
       if cache_x.shape[1] < 2 and feat_cache[idx] is not None:
         cache_x = jnp.concatenate([jnp.expand_dims(feat_cache[idx][:, -1, :, :, :], axis=1), cache_x], axis=1)
       x = self.conv1(x, feat_cache[idx], idx)
-      # if idx == 4:
-      #   breakpoint()
       feat_cache[idx] = cache_x
       feat_idx[0] += 1
     else:

@@ -379,32 +357,18 @@ def __call__(self, x: jax.Array, feat_cache=None, feat_idx=[0]):
     x = self.norm2(x)
     x = self.nonlinearity(x)
     idx = feat_idx[0]
-    # if idx == 4:
-    #   breakpoint()
 
     if feat_cache is not None:
       idx = feat_idx[0]
-      print("Residual block, idx: ", idx)
-      # if idx == 14:
-      #   breakpoint()
-      print("cache_x min: ", np.min(cache_x))
-      print("cache_x max: ", np.max(cache_x))
       cache_x = jnp.copy(x[:, -CACHE_T:, :, :, :])
       if cache_x.shape[1] < 2 and feat_cache[idx] is not None:
         cache_x = jnp.concatenate([jnp.expand_dims(feat_cache[idx][:, -1, :, :, :], axis=1), cache_x], axis=1)
-      print("cache_x min: ", np.min(cache_x))
-      print("cache_x max: ", np.max(cache_x))
-      #breakpoint()
       x = self.conv2(x, feat_cache[idx])
       feat_cache[idx] = cache_x
       feat_idx[0] += 1
     else:
       x = self.conv2(x)
-    print("before conv shortcut add: x min", np.min(x))
-    print("before conv shortcut add: x max", np.max(x))
     x = x + h
-    print("after conv shortcut add: x min: ", np.min(x))
-    print("after conv shortcut add: x max: ", np.max(x))
     return x
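
For context, the residual block keeps a rolling cache of the last CACHE_T frames of each activation; when the current chunk yields only one frame, the last frame of the previous chunk's cache is prepended so the causal conv still sees two frames of history. A standalone sketch of that cache update (values are illustrative):

import jax.numpy as jnp

CACHE_T = 2
x = jnp.ones((1, 1, 4, 4, 8))            # current chunk: one frame
prev_cache = jnp.zeros((1, 2, 4, 4, 8))  # cache from the previous chunk

cache_x = jnp.copy(x[:, -CACHE_T:, :, :, :])  # only one frame available here
if cache_x.shape[1] < 2:
  # Prepend the previous cache's last frame to restore two frames of history.
  last = jnp.expand_dims(prev_cache[:, -1, :, :, :], axis=1)
  cache_x = jnp.concatenate([last, cache_x], axis=1)
print(cache_x.shape)  # (1, 2, 4, 4, 8)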

@@ -428,16 +392,8 @@ def __call__(self, x: jax.Array):
     #qkv = qkv.reshape(batch_size * time, 1, channels * 3, -1)
     qkv = qkv.reshape(batch_size * time, 1, -1, channels * 3)
     qkv = jnp.transpose(qkv, (0, 1, 3, 2))
-    print("qkv min: ", np.min(qkv))
-    print("qkv max: ", np.max(qkv))
     #q, k, v = jnp.split(qkv, 3, axis=-1)
     q, k, v = jnp.split(qkv, 3, axis=-2)
-    print("q min: ", np.min(q))
-    print("q max: ", np.max(q))
-    print("k min: ", np.min(k))
-    print("k min: ", np.max(k))
-    print("v min: ", np.min(v))
-    print("v min: ", np.max(v))
     q = jnp.transpose(q, (0, 1, 3, 2))
     k = jnp.transpose(k, (0, 1, 3, 2))
     v = jnp.transpose(v, (0, 1, 3, 2))
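
The surviving lines split the fused QKV projection along the channel axis (axis=-2 after the transpose) rather than the sequence axis the commented-out line used. A small shape check of that split (sizes are illustrative):

import jax.numpy as jnp

bt, length, channels = 2, 64, 32                  # batch*time, H*W, C
qkv = jnp.ones((bt, 1, length, channels * 3))
qkv = jnp.transpose(qkv, (0, 1, 3, 2))            # (bt, 1, 3C, L)
q, k, v = jnp.split(qkv, 3, axis=-2)              # split the channel axis
print(q.shape, k.shape, v.shape)                  # each (2, 1, 32, 64)
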
@@ -446,10 +402,8 @@
 
     # output projection
     x = self.proj(x)
-    #breakpoint()
     # Reshape back
     x = x.reshape(batch_size, time, height, width, channels)
-    #breakpoint()
 
     return x + identity

@@ -467,20 +421,11 @@ def __init__(self, dim: int, rngs: nnx.Rngs, dropout: float = 0.0, non_linearity
     self.resnets = resnets
 
   def __call__(self, x: jax.Array, feat_cache=None, feat_idx=[0]):
-    print("WanMidblock...")
     x = self.resnets[0](x, feat_cache, feat_idx)
-    print("WanMidBlock resnets[0], x min: ", np.min(x))
-    print("WanMidBlock resnets[0], x max: ", np.max(x))
     for attn, resnet in zip(self.attentions, self.resnets[1:]):
-      print("WanMidBlock, for loop, attn len: ", len(self.attentions))
-      print("WanMidBlock, for loop, resnets len: ", len(self.resnets))
       if attn is not None:
         x = attn(x)
-        print("WanMidBlock attn[0], x min: ", np.min(x))
-        print("WanMidBlock attn[0], x max: ", np.max(x))
       x = resnet(x, feat_cache, feat_idx)
-      print("WanMidBlock resnets[i], x min: ", np.min(x))
-      print("WanMidBlock resnets[i], x max: ", np.max(x))
     return x

@@ -888,21 +833,18 @@ def _decode(self, z: jax.Array, return_dict: bool = True) -> Union[FlaxDecoderOu
         out = self.decoder(x[:, i : i + 1, :, :, :], feat_cache=self._feat_map, feat_idx=self._conv_idx)
       else:
         out_ = self.decoder(x[:, i : i + 1, :, :, :], feat_cache=self._feat_map, feat_idx=self._conv_idx)
-        out = jnp.concatenate([out, out_], axis=1)
-        print("out_.shape: ", out_.shape)
-        print("out_ min: ", np.min(out_))
-        print("out_ max: ", np.max(out_))
-        print("out.shape: ", out.shape)
-        print("out min: ", np.min(out))
-        print("out max: ", np.max(out))
-        for i in range(len(self._feat_map)):
-          if isinstance(self._feat_map[i], jax.Array):
-            print("i: ", i)
-            print("min: ", np.min(self._feat_map[i]))
-            print("max: ", np.max(self._feat_map[i]))
-          else:
-            print(f"feat_map[{i}] : {self._feat_map[i]}")
-        # breakpoint()
+
+        # Workaround for an issue where frame[1] and frame[2] come out swapped.
+        # Ideally this should not be needed, but the point where the frames go
+        # out of sync has not been found; most likely an incorrect reshape in the decoder.
+        fm1, fm2, fm3, fm4 = out_[:, 0, :, :, :], out_[:, 1, :, :, :], out_[:, 2, :, :, :], out_[:, 3, :, :, :]
+        if len(fm1.shape) == 4:
+          fm1 = jnp.expand_dims(fm1, axis=0)
+          fm2 = jnp.expand_dims(fm2, axis=0)
+          fm3 = jnp.expand_dims(fm3, axis=0)
+          fm4 = jnp.expand_dims(fm4, axis=0)
+
+        out = jnp.concatenate([out, fm1, fm3, fm2, fm4], axis=1)
     out = jnp.clip(out, min=-1.0, max=1.0)
     self.clear_cache()
     if not return_dict:
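
The workaround above reorders each decoded 4-frame chunk from [0, 1, 2, 3] to [0, 2, 1, 3]. Note that the expand_dims(..., axis=0) path implicitly assumes batch size 1: slicing frame t drops the time axis, and re-expanding at axis 0 only restores the (B, T, H, W, C) layout when B == 1. A batch-size-agnostic sketch of the same reorder (illustrative, not part of the commit):

import jax.numpy as jnp

out_ = jnp.arange(2 * 4 * 3 * 3 * 1.0).reshape(2, 4, 3, 3, 1)  # (B, T=4, H, W, C)

# Index the time axis directly: the same [0, 2, 1, 3] swap of frames 1 and 2,
# but the batch dimension is preserved for any B.
reordered = out_[:, jnp.array([0, 2, 1, 3]), :, :, :]
assert reordered.shape == out_.shape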
