Commit 34ebdbe

debug statements
1 parent 40d423d commit 34ebdbe

3 files changed

Lines changed: 107 additions & 27 deletions

src/maxdiffusion/models/wan/autoencoder_kl_wan.py

Lines changed: 99 additions & 19 deletions
@@ -23,7 +23,7 @@
 from ..modeling_flax_utils import FlaxModelMixin
 from ... import common_types
 from ..vae_flax import (FlaxAutoencoderKLOutput, FlaxDiagonalGaussianDistribution, FlaxDecoderOutput)
-
+import numpy as np
 BlockSizes = common_types.BlockSizes

 CACHE_T = 2
@@ -93,33 +93,51 @@ def __init__(
         rngs=rngs,
     )

-  def __call__(self, x: jax.Array, cache_x: Optional[jax.Array] = None) -> jax.Array:
+  def __call__(self, x: jax.Array, cache_x: Optional[jax.Array] = None, idx=-1) -> jax.Array:
+    print("wanCausalConv3d, x min: ", np.min(x))
+    print("wanCausalConv3d, x max: ", np.max(x))
     current_padding = list(self._causal_padding) # Mutable copy
     padding_needed = self._depth_padding_before

     if cache_x is not None and padding_needed > 0:
+      print("WanCausalConv3d, cache.shape: ", cache_x.shape)
+      print("wanCausalConv3d, cache_x min: ", np.min(cache_x))
+      print("wanCausalConv3d, cache_x max: ", np.max(cache_x))
       # Ensure cache has same spatial/channel dims, potentially different depth
       assert cache_x.shape[0] == x.shape[0] and cache_x.shape[2:] == x.shape[2:], "Cache spatial/channel dims mismatch"
       cache_len = cache_x.shape[1]
       x = jnp.concatenate([cache_x, x], axis=1) # Concat along depth (D)

       padding_needed -= cache_len
       if padding_needed < 0:
+        print("wanCausanConv3d, padding_needed < 0")
         # Cache longer than needed padding, trim from start
         x = x[:, -padding_needed:, ...]
         current_padding[1] = (0, 0) # No explicit padding needed now
       else:
         # Update depth padding needed
+        print("wanCausanConv3d, padding_needed > 0")
         current_padding[1] = (padding_needed, 0)

     # Apply padding if any dimension requires it
     padding_to_apply = tuple(current_padding)
+    print("WanCausalConv3d, before padding x shape: ", x.shape)
     if any(p > 0 for dim_pads in padding_to_apply for p in dim_pads):
+      print("WanCausalConv3d, applying padding")
       x_padded = jnp.pad(x, padding_to_apply, mode="constant", constant_values=0.0)
     else:
+      print("WanCausalConv3d, NOT applying padding")
       x_padded = x

+    print("WanCausalConv3d, x shape: ", x_padded.shape)
+    print("wanCausalConv3d, x min: ", np.min(x_padded))
+    print("wanCausalConv3d, x max: ", np.max(x_padded))
+    # if idx == 12:
+    # breakpoint()
     out = self.conv(x_padded)
+    print("WanCausalConv3d, after conv, x shape: ", out.shape)
+    print("wanCausalConv3d, x min: ", np.min(out))
+    print("wanCausalConv3d, x max: ", np.max(out))
     return out

@@ -346,31 +364,48 @@ def __call__(self, x: jax.Array, feat_cache=None, feat_idx=[0]):

     if feat_cache is not None:
       idx = feat_idx[0]
+      print("Before conv1, idx: ", idx)
       cache_x = jnp.copy(x[:, -CACHE_T:, :, :, :])
       if cache_x.shape[1] < 2 and feat_cache[idx] is not None:
         cache_x = jnp.concatenate([jnp.expand_dims(feat_cache[idx][:, -1, :, :, :], axis=1), cache_x], axis=1)
-
-      x = self.conv1(x, feat_cache[idx])
+      x = self.conv1(x, feat_cache[idx], idx)
+      # if idx == 4:
+      # breakpoint()
       feat_cache[idx] = cache_x
       feat_idx[0] += 1
     else:
       x = self.conv1(x)

     x = self.norm2(x)
     x = self.nonlinearity(x)
+    idx = feat_idx[0]
+    # if idx == 4:
+    # breakpoint()

     if feat_cache is not None:
       idx = feat_idx[0]
+      print("Residual block, idx: ", idx)
+      # if idx == 14:
+      # breakpoint()
+      print("cache_x min: ", np.min(cache_x))
+      print("cache_x max: ", np.max(cache_x))
       cache_x = jnp.copy(x[:, -CACHE_T:, :, :, :])
       if cache_x.shape[1] < 2 and feat_cache[idx] is not None:
         cache_x = jnp.concatenate([jnp.expand_dims(feat_cache[idx][:, -1, :, :, :], axis=1), cache_x], axis=1)
+      print("cache_x min: ", np.min(cache_x))
+      print("cache_x max: ", np.max(cache_x))
+      #breakpoint()
       x = self.conv2(x, feat_cache[idx])
       feat_cache[idx] = cache_x
       feat_idx[0] += 1
     else:
       x = self.conv2(x)
-
-    return x + h
+    print("before conv shortcut add: x min", np.min(x))
+    print("before conv shortcut add: x max", np.max(x))
+    x = x + h
+    print("after conv shortcut add: x min: ", np.min(x))
+    print("after conv shortcut add: x max: ", np.max(x))
+    return x


 class WanAttentionBlock(nnx.Module):
@@ -382,26 +417,51 @@ def __init__(self, dim: int, rngs: nnx.Rngs):
     self.proj = nnx.Conv(in_features=dim, out_features=dim, kernel_size=(1, 1), rngs=rngs)

   def __call__(self, x: jax.Array):
-    batch_size, time, height, width, channels = x.shape
+
     identity = x
+    batch_size, time, height, width, channels = x.shape

     x = x.reshape(batch_size * time, height, width, channels)
     x = self.norm(x)

     qkv = self.to_qkv(x) # Output: (N*D, H, W, C * 3)
-
-    qkv = qkv.reshape(batch_size * time, 1, channels * 3, -1)
+    #breakpoint()
+    #qkv = qkv.reshape(batch_size * time, 1, channels * 3, -1)
+    qkv = qkv.reshape(batch_size * time, 1, -1, channels * 3)
     qkv = jnp.transpose(qkv, (0, 1, 3, 2))
-    q, k, v = jnp.split(qkv, 3, axis=-1)
-
-    x = jax.nn.dot_product_attention(q, k, v)
+    print("qkv min: ", np.min(qkv))
+    print("qkv max: ", np.max(qkv))
+    #q, k, v = jnp.split(qkv, 3, axis=-1)
+    q, k, v = jnp.split(qkv, 3, axis=-2)
+    print("q min: ", np.min(q))
+    print("q max: ", np.max(q))
+    print("k min: ", np.min(k))
+    print("k min: ", np.max(k))
+    print("v min: ", np.min(v))
+    print("v min: ", np.max(v))
+    #breakpoint()
+    q = jnp.transpose(q, (0, 1, 3, 2))
+    k = jnp.transpose(k, (0, 1, 3, 2))
+    v = jnp.transpose(v, (0, 1, 3, 2))
+    import torch
+    import torch.nn.functional as F
+    q = torch.tensor(np.array(q, dtype=np.float32))
+    k = torch.tensor(np.array(k, dtype=np.float32))
+    v = torch.tensor(np.array(v, dtype=np.float32))
+    #x = jax.nn.dot_product_attention(q, k, v)
+    x = F.scaled_dot_product_attention(q, k, v)
+    print("attn min: ", torch.min(x))
+    print("attn max: ", torch.max(x))
+    #breakpoint()
+    x = jnp.array(x.detach().numpy())
     x = jnp.squeeze(x, 1).reshape(batch_size * time, height, width, channels)

     # output projection
     x = self.proj(x)
-
+    #breakpoint()
     # Reshape back
     x = x.reshape(batch_size, time, height, width, channels)
+    #breakpoint()

     return x + identity
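
Note on the WanAttentionBlock hunk above: to_qkv produces channel-last activations of shape (batch*time, height, width, 3*channels), so the q/k/v split has to run along the channel axis, which is what the revised reshape(..., -1, channels * 3) plus jnp.split(..., axis=-2) after the transpose now do (the labels on the k and v max prints still read "min"). The round-trip through torch.nn.functional.scaled_dot_product_attention looks like a temporary aid for comparing numerics against the PyTorch reference. Below is a minimal sketch of an equivalent pure-JAX path, assuming jax.nn.dot_product_attention and its default (batch, seq_len, num_heads, head_dim) layout; it is not part of this commit.

```python
# Sketch only, not part of commit 34ebdbe; assumes the channel-last layout
# documented above, i.e. a to_qkv output of shape (B*T, H, W, 3*C).
import jax
import jax.numpy as jnp

def single_head_spatial_attention(qkv: jax.Array) -> jax.Array:
  """Self-attention over the H*W positions of each frame, one head of width C."""
  bt, h, w, c3 = qkv.shape
  c = c3 // 3
  qkv = qkv.reshape(bt, h * w, c3)                 # (B*T, H*W, 3C)
  q, k, v = jnp.split(qkv, 3, axis=-1)             # each (B*T, H*W, C), split on channels
  # jax.nn.dot_product_attention expects (batch, seq_len, num_heads, head_dim);
  # insert a single-head axis.
  q, k, v = (t[:, :, None, :] for t in (q, k, v))
  out = jax.nn.dot_product_attention(q, k, v)      # (B*T, H*W, 1, C)
  return out.squeeze(2).reshape(bt, h, w, c)
```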

@@ -419,11 +479,20 @@ def __init__(self, dim: int, rngs: nnx.Rngs, dropout: float = 0.0, non_linearity
     self.resnets = resnets

   def __call__(self, x: jax.Array, feat_cache=None, feat_idx=[0]):
+    print("WanMidblock...")
     x = self.resnets[0](x, feat_cache, feat_idx)
+    print("WanMidBlock resnets[0], x min: ", np.min(x))
+    print("WanMidBlock resnets[0], x max: ", np.max(x))
     for attn, resnet in zip(self.attentions, self.resnets[1:]):
+      print("WanMidBlock, for loop, attn len: ", len(self.attentions))
+      print("WanMidBlock, for loop, resnets len: ", len(self.resnets))
       if attn is not None:
         x = attn(x)
+        print("WanMidBlock attn[0], x min: ", np.min(x))
+        print("WanMidBlock attn[0], x max: ", np.max(x))
       x = resnet(x, feat_cache, feat_idx)
+      print("WanMidBlock resnets[i], x min: ", np.min(x))
+      print("WanMidBlock resnets[i], x max: ", np.max(x))
     return x

@@ -589,7 +658,7 @@ def __init__(
       self,
       rngs: nnx.Rngs,
       dim: int = 128,
-      z_dim: int = 128,
+      z_dim: int = 4,
       dim_mult: List[int] = [1, 2, 4, 4],
       num_res_blocks: int = 2,
       attn_scales=List[float],
@@ -662,7 +731,7 @@ def __call__(self, x: jax.Array, feat_cache=None, feat_idx=[0]):

     ## middle
     x = self.mid_block(x, feat_cache, feat_idx)
-
+    #breakpoint()
     ## upsamples
     for up_block in self.up_blocks:
       x = up_block(x, feat_cache, feat_idx)
@@ -810,7 +879,6 @@ def _encode(self, x: jax.Array):
     mu, logvar = enc[:, :, :, :, : self.z_dim], enc[:, :, :, :, self.z_dim :]
     enc = jnp.concatenate([mu, logvar], axis=-1)
     self.clear_cache()
-    # return enc
     return enc

   def encode(
@@ -833,10 +901,22 @@ def _decode(self, z: jax.Array, return_dict: bool = True) -> Union[FlaxDecoderOu
         out = self.decoder(x[:, i : i + 1, :, :, :], feat_cache=self._feat_map, feat_idx=self._conv_idx)
       else:
         out_ = self.decoder(x[:, i : i + 1, :, :, :], feat_cache=self._feat_map, feat_idx=self._conv_idx)
-
         out = jnp.concatenate([out, out_], axis=1)
-
-    out = jnp.clip(out, a_min=-1.0, a_max=1.0)
+    print("out_.shape: ", out_.shape)
+    print("out_ min: ", np.min(out_))
+    print("out_ max: ", np.max(out_))
+    print("out.shape: ", out.shape)
+    print("out min: ", np.min(out))
+    print("out max: ", np.max(out))
+    for i in range(len(self._feat_map)):
+      if isinstance(self._feat_map[i], jax.Array):
+        print("i: ", i)
+        print("min: ", np.min(self._feat_map[i]))
+        print("max: ", np.max(self._feat_map[i]))
+      else:
+        print(f"feat_map[{i}] : {self._feat_map[i]}")
+    # breakpoint()
+    out = jnp.clip(out, min=-1.0, max=1.0)
     self.clear_cache()
     if not return_dict:
       return (out,)
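
Two notes on the decoder hunks above, neither of which is part of this commit: the switch from a_min/a_max to min/max in jnp.clip matches the keyword names accepted by recent jax.numpy releases, and the added np.min/np.max prints only work while the decode path runs eagerly, since converting a traced array to NumPy under jax.jit raises an error. If the instrumentation needs to survive jitting, jax.debug.print can stage the same values out of the traced computation; a sketch with a hypothetical debug_stats helper follows.

```python
# Sketch only, not part of commit 34ebdbe.
import jax
import jax.numpy as jnp

def debug_stats(tag: str, x: jax.Array) -> None:
  """Log shape and min/max of an activation; works eagerly and under jax.jit."""
  # x.shape is static even for tracers, so it can go straight into the format string.
  jax.debug.print(tag + " shape=" + str(x.shape) + " min={mn} max={mx}",
                  mn=jnp.min(x), mx=jnp.max(x))

# e.g. in place of the paired print(...) calls above:
#   debug_stats("wanCausalConv3d x", x_padded)
#   debug_stats("WanMidBlock resnets[0] x", x)
```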

src/maxdiffusion/models/wan/wan_utils.py

Lines changed: 0 additions & 1 deletion
@@ -26,7 +26,6 @@ def load_wan_vae(pretrained_model_name_or_path: str, eval_shapes: dict, device:
   with jax.default_device(device):
     if hf_download:
       ckpt_path = hf_hub_download(pretrained_model_name_or_path, subfolder="vae", filename="diffusion_pytorch_model.safetensors")
-      #breakpoint()
     max_logging.log(f"Load and port Wan 2.1 VAE on {device}")

     if ckpt_path is not None:

src/maxdiffusion/tests/wan_vae_test.py

Lines changed: 8 additions & 7 deletions
@@ -40,7 +40,6 @@

 CACHE_T = 2

-
 class TorchWanRMS_norm(nn.Module):
   r"""
   A custom RMS normalization layer.
@@ -92,16 +91,18 @@ def __init__(self, dim: int, mode: str) -> None:
           WanUpsample(scale_factor=(2.0, 2.0), mode="nearest-exact"), nn.Conv2d(dim, dim // 2, 3, padding=1)
       )
     elif mode == "upsample3d":
-      self.resample = nn.Sequential(
-          WanUpsample(scale_factor=(2.0, 2.0), mode="nearest-exact"), nn.Conv2d(dim, dim // 2, 3, padding=1)
-      )
-      self.time_conv = WanCausalConv3d(dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
+      # self.resample = nn.Sequential(
+      # WanUpsample(scale_factor=(2.0, 2.0), mode="nearest-exact"), nn.Conv2d(dim, dim // 2, 3, padding=1)
+      # )
+      # self.time_conv = WanCausalConv3d(dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
+      raise Exception("downsample3d not supported")

     elif mode == "downsample2d":
       self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
     elif mode == "downsample3d":
-      self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
-      self.time_conv = WanCausalConv3d(dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
+      raise Exception("downsample3d not supported")
+      #self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
+      #self.time_conv = WanCausalConv3d(dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))

     else:
       self.resample = nn.Identity()
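
In the PyTorch reference module above, both unsupported 3D resample branches now raise a bare Exception, and the message reads "downsample3d not supported" even on the upsample3d branch. A sketch of a more specific failure, using a hypothetical trimmed-down stand-in for the test helper (not part of this commit):

```python
# Sketch only, not part of commit 34ebdbe; ResampleStub is a hypothetical stand-in
# for the torch reference resample module defined in wan_vae_test.py.
import torch.nn as nn

class ResampleStub(nn.Module):
  def __init__(self, dim: int, mode: str) -> None:
    super().__init__()
    if mode in ("upsample3d", "downsample3d"):
      # Name the offending mode so a failing test points at the right branch.
      raise NotImplementedError(f"resample mode {mode!r} is not supported in this test helper")
    self.resample = nn.Identity()
```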
