Skip to content

Commit aeabe27

Browse files
wip - test for vae encoder.
1 parent 4e443b8 commit aeabe27

2 files changed

Lines changed: 256 additions & 28 deletions

File tree

src/maxdiffusion/models/wan/autoencoder_kl_wan.py

Lines changed: 78 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -55,14 +55,13 @@ def _canonicalize_tuple(x: Union[int, Sequence[int]], rank: int, name: str) -> T
5555
class WanCausalConv3d(nnx.Module):
5656
def __init__(
5757
self,
58+
rngs: nnx.Rngs, # rngs are required for initializing parameters,
5859
in_channels: int,
5960
out_channels: int,
6061
kernel_size: Union[int, Tuple[int, int, int]],
61-
*, # Mark subsequent arguments as keyword-only
6262
stride: Union[int, Tuple[int, int, int]] = 1,
6363
padding: Union[int, Tuple[int, int, int]] = 0,
6464
use_bias: bool = True,
65-
rngs: nnx.Rngs, # rngs are required for initializing parameters,
6665
flash_min_seq_length: int = 4096,
6766
flash_block_sizes: BlockSizes = None,
6867
mesh: jax.sharding.Mesh = None,
@@ -267,7 +266,13 @@ def __init__(
267266
rngs=rngs,
268267
)
269268
)
270-
self.time_conv = WanCausalConv3d(dim, dim * 2, (3, 1, 1), padding=(1, 0, 0), rngs=rngs)
269+
self.time_conv = WanCausalConv3d(
270+
rngs=rngs,
271+
in_channels=dim,
272+
out_channels=dim * 2,
273+
kernel_size=(3, 1, 1),
274+
padding=(1, 0, 0),
275+
)
271276
elif mode == "downsample2d":
272277
# TODO - do I need to transpose?
273278
self.resample = ZeroPaddedConv2D(
@@ -288,6 +293,15 @@ def __init__(
288293
self.resample = Identity()
289294

290295
def __call__(self, x: jax.Array, feat_cache=None, feat_idx=[0]) -> jax.Array:
  """Apply the configured spatial resample to every frame of a video.

  Folds the time axis into the batch, runs the 2D resample op, and restores
  the (N, D, H, W, C) layout.

  NOTE(review): `feat_cache` / `feat_idx` are accepted but unused, and the
  upsample3d/downsample3d `time_conv` paths are not applied here —
  presumably WIP, confirm. `feat_idx=[0]` is a mutable default (shared
  across calls); kept to mirror the torch reference signature.
  """
  # Input x: (N, D, H, W, C), assume C = self.dim
  n, d, h, w, c = x.shape
  assert c == self.dim

  # Merge batch and time so self.resample (a 2D op) runs per frame.
  x = x.reshape(n*d,h,w,c)
  x = self.resample(x)
  # Spatial dims (and possibly channels) change depending on the mode.
  h_new, w_new, c_new = x.shape[1:]
  x = x.reshape(n, d, h_new, w_new, c_new)

  return x
292306

293307
class WanResidualBlock(nnx.Module):
@@ -382,7 +396,13 @@ def __init__(
382396
scale = 1.0
383397

384398
# init block
385-
self.conv_in = WanCausalConv3d(3, dims[0], 3, padding=1, rngs=rngs)
399+
self.conv_in = WanCausalConv3d(
400+
in_channels=3,
401+
out_channels=dims[0],
402+
kernel_size=3,
403+
padding=1,
404+
rngs=rngs
405+
)
386406

387407
# downsample blocks
388408
self.down_blocks = []
@@ -400,11 +420,23 @@ def __init__(
400420
self.down_blocks.append(WanResample(out_dim, mode=mode, rngs=rngs))
401421

402422
# middle_blocks
403-
self.mid_block = WanMidBlock(out_dim, dropout, non_linearity, num_layers=1, rngs=rngs)
423+
self.mid_block = WanMidBlock(
424+
dim=out_dim,
425+
rngs=rngs,
426+
dropout=dropout,
427+
non_linearity=non_linearity,
428+
num_layers=1,
429+
)
404430

405431
# output blocks
406432
self.norm_out = WanRMS_norm(out_dim, images=False, rngs=rngs)
407-
self.conv_out = WanCausalConv3d(in_channels=out_dim, out_channels=z_dim, kernel_size=3, padding=1)
433+
self.conv_out = WanCausalConv3d(
434+
rngs=rngs,
435+
in_channels=out_dim,
436+
out_channels=z_dim,
437+
kernel_size=3,
438+
padding=1
439+
)
408440

409441
def __call__(self, x: jax.Array, feat_cache=None, feat_idx=[0]):
  # WIP stub: the encoder forward pass is not implemented yet; the input is
  # passed through unchanged so construction/plumbing can be exercised.
  return x
@@ -487,6 +519,9 @@ def __init__(
487519
self.conv_out = WanCausalConv3d(in_channels=out_dim, out_channels=3, kernel_size=3, padding=1, rngs=rngs)
488520

489521
def __call__(self, x: jax.Array, feat_cache=None, feat_idx=[0]):
  """WIP decoder forward pass: currently only applies the input convolution.

  Args:
    x: Input latent video array.
    feat_cache: Causal-conv feature cache (unused so far).
    feat_idx: Index cursor into the cache (unused so far; mutable default
      kept to mirror the torch reference signature).

  Returns:
    The result of `conv_in` applied to `x`.
  """
  # Bug fix: removed two stray `breakpoint()` debugging calls, which would
  # halt any non-interactive run (tests, CI).
  x = self.conv_in(x)
  return x
491526

492527
class AutoencoderKLWan(nnx.Module, FlaxModelMixin, ConfigMixin):
@@ -514,6 +549,7 @@ def __init__(
514549
self.temporal_upsample = temporal_downsample[::-1]
515550

516551
self.encoder = WanEncoder3d(z_dim * 2, z_dim * 2, 1)
552+
self.quant_conv = WanCausalConv3d(z_dim * 2, z_dim * 2, 1, rngs=rngs)
517553
self.post_quant_conv = WanCausalConv3d(z_dim, z_dim, 1, rngs=rngs)
518554

519555
self.decoder = WanDecoder3d(
@@ -539,10 +575,42 @@ def _count_conv3d(module):
539575
self._enc_conv_idx = [0]
540576
self._enc_feat_map = [None] * self._enc_conv_num
541577

578+
def _encode(self, x: jax.Array):
  """Run the encoder over the video in causal temporal chunks.

  Args:
    x: Video array, either channel-last (N, D, H, W, 3) or channel-first
       (N, 3, D, H, W) — channel-first input is transposed to channel-last.

  Returns:
    The quantized encoder output `enc` with mean/log-variance halves
    concatenated along the channel (last) axis.
  """
  if x.shape[-1] != 3:
    # reshape channel last for JAX
    x = jnp.transpose(x, (0, 2, 3, 4, 1))
  assert x.shape[-1] == 3, f"Expected input shape (N, D, H, W, 3), got {x.shape}"

  self.clear_cache()

  t = x.shape[1]
  # First chunk is a single frame; remaining frames are processed four at a
  # time so the causal feature cache lines up with temporal downsampling.
  iter_ = 1 + (t - 1) // 4
  for i in range(iter_):
    if i == 0:
      out = self.encoder(
          x[:, :1, :, :, :],
          feat_cache=self._enc_feat_map,
          # Bug fix: was `feat_ids=` — invalid keyword, raises TypeError.
          feat_idx=self._enc_conv_idx,
      )
    else:
      out_ = self.encoder(
          # Bug fix: added the missing trailing `:` so the slice spells out
          # all five axes, consistent with the i == 0 branch.
          x[:, 1 + 4 * (i - 1) : 1 + 4 * i, :, :, :],
          feat_cache=self._enc_feat_map,
          feat_idx=self._enc_conv_idx,
      )
      out = jnp.concatenate([out, out_], axis=1)

  enc = self.quant_conv(out)
  mu, logvar = enc[:, :, :, :, : self.z_dim], enc[:, :, :, :, self.z_dim :]
  # Bug fix: jnp.concatenate takes `axis`, not torch's `dim`; mu/logvar were
  # split on the last (channel) axis, so they must be rejoined on axis=-1.
  enc = jnp.concatenate([mu, logvar], axis=-1)
  self.clear_cache()
  return enc
542608

543609
def encode(self, x: jax.Array, return_dict: bool = True) -> Union[FlaxAutoencoderKLOutput, Tuple[FlaxDiagonalGaussianDistribution]]:
  """ Encode video into latent distribution.

  Args:
    x: Input video; `_encode` accepts channel-last (N, D, H, W, 3) or
       channel-first input and normalizes the layout.
    return_dict: When False, return a 1-tuple instead of the output class.

  Returns:
    `FlaxAutoencoderKLOutput` wrapping the diagonal-Gaussian posterior, or
    the tuple `(posterior,)` when `return_dict` is False.
  """
  h = self._encode(x)
  posterior = FlaxDiagonalGaussianDistribution(h)
  if not return_dict:
    return (posterior, )
  # Bug fix: the diffusers output dataclass field is `latent_dist`,
  # not `latent_dict`.
  return FlaxAutoencoderKLOutput(latent_dist=posterior)
616+

src/maxdiffusion/tests/wan_vae_test.py

Lines changed: 178 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515
"""
1616

1717
import os
18+
import torch
19+
import torch.nn as nn
20+
import torch.nn.functional as F
1821
import jax
1922
import jax.numpy as jnp
2023
from flax import nnx
@@ -26,27 +29,15 @@
2629
WanCausalConv3d,
2730
WanUpsample,
2831
AutoencoderKLWan,
32+
WanEncoder3d,
2933
WanRMS_norm,
34+
WanResample,
3035
ZeroPaddedConv2D
3136
)
3237

33-
class WanVaeTest(unittest.TestCase):
34-
def setUp(self):
35-
WanVaeTest.dummy_data = {}
36-
37-
# def test_clear_cache(self):
38-
# key = jax.random.key(0)
39-
# rngs = nnx.Rngs(key)
40-
# wan_vae = AutoencoderKLWan(rngs=rngs)
41-
# wan_vae.clear_cache()
38+
CACHE_T = 2
4239

43-
def test_wanrms_norm(self):
44-
"""Test against the Pytorch implementation"""
45-
import torch
46-
import torch.nn as nn
47-
import torch.nn.functional as F
48-
49-
class TorchWanRMS_norm(nn.Module):
40+
class TorchWanRMS_norm(nn.Module):
5041
r"""
5142
A custom RMS normalization layer.
5243
@@ -70,6 +61,103 @@ def __init__(self, dim: int, channel_first: bool = True, images: bool = True, bi
7061

7162
def forward(self, x):
  # L2-normalize across channels (dim 1 when channel-first, else the last
  # axis), then apply the learned scale/gamma and optional bias; these
  # attributes are set in __init__ (not shown in this hunk).
  return F.normalize(x, dim=(1 if self.channel_first else -1)) * self.scale * self.gamma + self.bias
64+
65+
class TorchWanResample(nn.Module):
  r"""
  A custom resampling module for 2D and 3D data.

  Args:
    dim (int): The number of input/output channels.
    mode (str): The resampling mode. Must be one of:
      - 'none': No resampling (identity operation).
      - 'upsample2d': 2D upsampling with nearest-exact interpolation and convolution.
      - 'upsample3d': 3D upsampling with nearest-exact interpolation, convolution, and causal 3D convolution.
      - 'downsample2d': 2D downsampling with zero-padding and convolution.
      - 'downsample3d': 3D downsampling with zero-padding, convolution, and causal 3D convolution.

  NOTE(review): `WanUpsample` and `WanCausalConv3d` referenced below are the
  JAX (nnx) classes imported from autoencoder_kl_wan in this test file, not
  torch modules — so the upsample3d/downsample3d branches cannot work as
  torch layers as written. Only the 2D branches are exercised by the current
  test; confirm whether the torch originals were intended here.
  """

  def __init__(self, dim: int, mode: str) -> None:
    super().__init__()
    self.dim = dim
    self.mode = mode

    # layers
    if mode == "upsample2d":
      self.resample = nn.Sequential(
          WanUpsample(scale_factor=(2.0, 2.0), mode="nearest-exact"), nn.Conv2d(dim, dim // 2, 3, padding=1)
      )
    elif mode == "upsample3d":
      self.resample = nn.Sequential(
          WanUpsample(scale_factor=(2.0, 2.0), mode="nearest-exact"), nn.Conv2d(dim, dim // 2, 3, padding=1)
      )
      self.time_conv = WanCausalConv3d(dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))

    elif mode == "downsample2d":
      self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
    elif mode == "downsample3d":
      self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
      self.time_conv = WanCausalConv3d(dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))

    else:
      self.resample = nn.Identity()

  def forward(self, x, feat_cache=None, feat_idx=[0]):
    # x is channel-first torch layout: (B, C, T, H, W).
    b, c, t, h, w = x.size()
    if self.mode == "upsample3d":
      if feat_cache is not None:
        idx = feat_idx[0]
        if feat_cache[idx] is None:
          # Sentinel meaning "first chunk: replicate-pad the causal conv".
          feat_cache[idx] = "Rep"
          feat_idx[0] += 1
        else:
          # Keep the trailing CACHE_T frames for the next chunk's causal conv.
          cache_x = x[:, :, -CACHE_T:, :, :].clone()
          if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] != "Rep":
            # cache last frame of last two chunk
            cache_x = torch.cat(
                [feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2
            )
          if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] == "Rep":
            cache_x = torch.cat([torch.zeros_like(cache_x).to(cache_x.device), cache_x], dim=2)
          if feat_cache[idx] == "Rep":
            x = self.time_conv(x)
          else:
            x = self.time_conv(x, feat_cache[idx])
          feat_cache[idx] = cache_x
          feat_idx[0] += 1

          # time_conv doubled the channel count; interleave the two halves
          # along time to realize the 2x temporal upsample.
          x = x.reshape(b, 2, c, t, h, w)
          x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]), 3)
          x = x.reshape(b, c, t * 2, h, w)
    t = x.shape[2]
    # Fold time into the batch so the 2D resample ops apply per frame.
    x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
    x = self.resample(x)
    x = x.view(b, t, x.size(1), x.size(2), x.size(3)).permute(0, 2, 1, 3, 4)

    if self.mode == "downsample3d":
      if feat_cache is not None:
        idx = feat_idx[0]
        if feat_cache[idx] is None:
          feat_cache[idx] = x.clone()
          feat_idx[0] += 1
        else:
          # Prepend the cached last frame so the strided causal conv sees
          # continuous time across chunk boundaries.
          cache_x = x[:, :, -1:, :, :].clone()
          x = self.time_conv(torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
          feat_cache[idx] = cache_x
          feat_idx[0] += 1
    return x
148+
149+
class WanVaeTest(unittest.TestCase):
150+
def setUp(self):
  # NOTE(review): this writes to the class attribute (shared by every test
  # in the class) rather than an instance attribute — confirm intentional.
  WanVaeTest.dummy_data = {}
152+
153+
# def test_clear_cache(self):
154+
# key = jax.random.key(0)
155+
# rngs = nnx.Rngs(key)
156+
# wan_vae = AutoencoderKLWan(rngs=rngs)
157+
# wan_vae.clear_cache()
158+
159+
def test_wanrms_norm(self):
160+
"""Test against the Pytorch implementation"""
73161

74162
# --- Test Case 1: images == True ---
75163
dim = 96
@@ -103,8 +191,6 @@ def forward(self, x):
103191
assert np.allclose(output_np, torch_output_np) == True
104192

105193
def test_zero_padded_conv(self):
106-
import torch
107-
import torch.nn as nn
108194

109195
key = jax.random.key(0)
110196
rngs = nnx.Rngs(key)
@@ -148,6 +234,49 @@ def test_wan_upsample(self):
148234
# --- Test Case 1: depth == 1 ---
149235
output = upsample(dummy_input)
150236
assert output.shape == (1, 1, 64, 64, 3)
237+
238+
def test_wan_resample(self):
  """Compare the JAX WanResample against the torch reference (downsample2d)."""
  # TODO - needs to test all modes - upsample2d, upsample3d, downsample2d, downsample3d and identity
  key = jax.random.key(0)
  rngs = nnx.Rngs(key)

  # --- Test Case 1: downsample2d ---
  batch = 1
  dim = 96
  t = 1
  h = 480
  w = 720
  mode = "downsample2d"
  # Torch reference is channel-first (N, C, T, H, W); spatial dims halve.
  input_shape = (batch, dim, t, h, w)
  dummy_input = torch.ones(input_shape)
  torch_wan_resample = TorchWanResample(dim=dim, mode=mode)
  torch_output = torch_wan_resample(dummy_input)
  assert torch_output.shape == (batch, dim, t, h // 2, w // 2)

  # JAX implementation: channels are always last, (N, T, H, W, C).
  wan_resample = WanResample(dim, mode=mode, rngs=rngs)
  input_shape = (batch, t, h, w, dim)
  dummy_input = jnp.ones(input_shape)
  output = wan_resample(dummy_input)
  # Bug fix: the width must be compared against w // 2 (the draft compared
  # it against h // 2); also removed a stray breakpoint() so the suite can
  # run unattended.
  assert output.shape == (batch, t, h // 2, w // 2, dim)

  # TODO - Test Case 2: downsample3d. The previous draft instantiated the
  # JAX WanResample without the required `rngs` (a TypeError) and never
  # called it; add a real reference comparison once the 3D path works.
151280

152281
def test_3d_conv(self):
153282
key = jax.random.key(0)
@@ -189,5 +318,36 @@ def test_3d_conv(self):
189318
output_with_larger_cache = causal_conv_layer(dummy_input, cache_x=dummy_larger_cache)
190319
assert output_with_larger_cache.shape == (1, 10, 32, 32, 16)
191320

321+
def test_wan_encode(self):
  """Smoke test: construct WanEncoder3d and run a dummy video through it."""
  key = jax.random.key(0)
  rngs = nnx.Rngs(key)
  dim = 96
  z_dim = 32
  dim_mult = [1, 2, 4, 4]
  num_res_blocks = 2
  attn_scales = []
  temperal_downsample = [False, True, True]
  nonlinearity = "silu"
  wan_encoder = WanEncoder3d(
      rngs=rngs,
      dim=dim,
      z_dim=z_dim,
      dim_mult=dim_mult,
      num_res_blocks=num_res_blocks,
      attn_scales=attn_scales,
      temperal_downsample=temperal_downsample,
      non_linearity=nonlinearity,
  )
  batch = 1
  channels = 3
  t = 49
  height = 480
  width = 720
  # NOTE(review): this shape is channel-first (N, C, T, H, W); the JAX
  # modules in this file take channel-last input — confirm the intended
  # layout before asserting on output shapes.
  input_shape = (batch, channels, t, height, width)
  # Renamed from `input` to avoid shadowing the builtin.
  dummy_input = jnp.ones(input_shape)
  output = wan_encoder(dummy_input)
  # Minimal sanity check while the encoder forward pass is still WIP.
  assert output is not None
349+
350+
351+
192352
if __name__ == "__main__":
193353
absltest.main()

0 commit comments

Comments
 (0)