Residual block test

jfacevedo-google · jfacevedo-google · commit 0ec4b02d00ec · 2025-04-24T21:34:40.000Z
diff --git a/src/maxdiffusion/models/wan/autoencoder_kl_wan.py b/src/maxdiffusion/models/wan/autoencoder_kl_wan.py
@@ -313,10 +313,49 @@ def __init__(
       dropout: float = 0.0,
       non_linearity: str = "silu",
   ):
-    pass
+    self.nonlinearity = get_activation(non_linearity)
+
+    # layers
+    self.norm1 = WanRMS_norm(dim=in_dim, rngs=rngs, images=False, channel_first=False)
+    self.conv1 = WanCausalConv3d(
+      rngs=rngs,
+      in_channels=in_dim,
+      out_channels=out_dim,
+      kernel_size=3,
+      padding=1
+    )
+    self.norm2 = WanRMS_norm(dim=out_dim, rngs=rngs, images=False, channel_first=False)
+    self.dropout = nnx.Dropout(dropout, rngs=rngs)
+    self.conv2 = WanCausalConv3d(
+      rngs=rngs,
+      in_channels=out_dim,
+      out_channels=out_dim,
+      kernel_size=3,
+      padding=1
+    )
+    self.conv_shortcut = WanCausalConv3d(
+      rngs=rngs,
+      in_channels=in_dim,
+      out_channels=out_dim,
+      kernel_size=1
+    ) if in_dim != out_dim else Identity()
+
 
   def __call__(self, x: jax.Array, feat_cache=None, feat_idx=[0]):
-    return x
+    # Shortcut branch: computed from the un-normalized input and added back to the main path at the end.
+    #breakpoint()
+    h = self.conv_shortcut(x)
+
+    x = self.norm1(x)
+    x = self.nonlinearity(x)
+    x = self.conv1(x)
+
+    x = self.norm2(x)
+    x = self.nonlinearity(x)
+    x = self.dropout(x)
+    x = self.conv2(x)
+
+    return x + h
 
 class WanAttentionBlock(nnx.Module):
   def __init__(
@@ -397,11 +436,11 @@ def __init__(
 
     # init block
     self.conv_in = WanCausalConv3d(
+      rngs=rngs,
       in_channels=3,
       out_channels=dims[0],
       kernel_size=3,
       padding=1,
-      rngs=rngs
     )
 
     # downsample blocks
@@ -439,6 +478,12 @@ def __init__(
     )
   
   def __call__(self, x: jax.Array, feat_cache=None, feat_idx=[0]):
+    # (1, 1, 480, 720, 3)
+    x = self.conv_in(x)
+    # (1, 1, 480, 720, 96)
+    for layer in self.down_blocks:
+      x = layer(x)
+    breakpoint()
     return x
 
 class WanDecoder3d(nnx.Module):
@@ -480,7 +525,13 @@ def __init__(
     scale = 1.0 / 2 ** (len(dim_mult) - 2)
 
     # init block
-    self.conv_in = WanCausalConv3d(in_channels=z_dim, out_channels=dims[0], kernel_size=3, padding=1, rngs=rngs)
+    self.conv_in = WanCausalConv3d(
+      rngs=rngs,
+      in_channels=z_dim,
+      out_channels=dims[0],
+      kernel_size=3,
+      padding=1
+    )
 
     # middle_blocks
     self.mid_block = WanMidBlock(dim=dims[0], rngs=rngs, dropout=dropout, non_linearity=non_linearity, num_layers=1)
@@ -516,7 +567,13 @@ def __init__(
     # output blocks
     self.norm_out = nnx.RMSNorm(num_features=out_dim, )
     self.norm_out = WanRMS_norm(dim=out_dim, images=False, rngs=rngs)
-    self.conv_out = WanCausalConv3d(in_channels=out_dim, out_channels=3, kernel_size=3, padding=1, rngs=rngs)
+    self.conv_out = WanCausalConv3d(
+      rngs=rngs,
+      in_channels=out_dim,
+      out_channels=3,
+      kernel_size=3,
+      padding=1
+    )
   
   def __call__(self, x: jax.Array, feat_cache=None, feat_idx=[0]):
     breakpoint()
@@ -533,7 +590,7 @@ def __init__(
     dim_mult: Tuple[int] = [1,2,4,4],
     num_res_blocks: int = 2,
     attn_scales: List[float] = [],
-    temporal_downsample: List[bool] = [False, True, True],
+    temperal_downsample: List[bool] = [False, True, True],
     dropout: float = 0.0,
     latents_mean: List[float] = [
       -0.7571,-0.7089,-0.9113,0.1075,-0.1745,0.9653,-0.1517, 1.5508,
@@ -545,31 +602,59 @@ def __init__(
     ],
   ):
     self.z_dim = z_dim
-    self.temporal_downsample = temporal_downsample
-    self.temporal_upsample = temporal_downsample[::-1]
-
-    self.encoder = WanEncoder3d(z_dim * 2, z_dim * 2, 1)
-    self.quant_conv = WanCausalConv3d(z_dim * 2, z_dim * 2, 1, rngs=rngs)
-    self.post_quant_conv = WanCausalConv3d(z_dim, z_dim, 1, rngs=rngs)
+    self.temperal_downsample = temperal_downsample
+    self.temporal_upsample = temperal_downsample[::-1]
 
-    self.decoder = WanDecoder3d(
-      base_dim, z_dim, dim_mult, num_res_blocks, attn_scales, self.temporal_upsample, dropout
+    self.encoder = WanEncoder3d(
+      rngs=rngs,
+      dim=base_dim,
+      z_dim=z_dim * 2,
+      dim_mult=dim_mult,
+      num_res_blocks=num_res_blocks,
+      attn_scales=attn_scales,
+      temperal_downsample=temperal_downsample,
+      dropout=dropout,
+    )
+    self.quant_conv = WanCausalConv3d(
+      rngs=rngs,
+      in_channels=z_dim * 2,
+      out_channels=z_dim * 2,
+      kernel_size=1
+    )
+    self.post_quant_conv = WanCausalConv3d(
+      rngs=rngs,
+      in_channels=z_dim,
+      out_channels=z_dim,
+      kernel_size=1,
     )
+
+    # self.decoder = WanDecoder3d(
+    #   rngs=rngs,
+    #   dim=base_dim,
+    #   z_dim=z_dim,
+    #   dim_mult=dim_mult,
+    #   num_res_blocks=num_res_blocks,
+    #   attn_scales=attn_scales,
+    #   temperal_upsample=self.temporal_upsample,
+    #   dropout=dropout
+    # )
     self.clear_cache()
   
   def clear_cache(self):
     """ Resets cache dictionaries and indices"""
     def _count_conv3d(module):
       count = 0
-      node_types = nnx.graph.iter_graph(module, nnx.Module)
-      for node in node_types:
-        if isinstance(node.value, WanCausalConv3d):
+      node_types = nnx.graph.iter_graph([module])
+      for path, value in node_types:
+        #breakpoint()
+        if isinstance(value, WanCausalConv3d):
+          print("value: ", value)
           count +=1
       return count
 
-    self._conv_num = _count_conv3d(self.decoder)
-    self._conv_idx = [0]
-    self._feat_map = [None] * self._conv_num
+    # self._conv_num = _count_conv3d(self.decoder)
+    # self._conv_idx = [0]
+    # self._feat_map = [None] * self._conv_num
     # cache encode
     self._enc_conv_num = _count_conv3d(self.encoder)
     self._enc_conv_idx = [0]
@@ -581,7 +666,7 @@ def _encode(self, x: jax.Array):
       x = jnp.transpose(x, (0, 2, 3, 4, 1))
       assert x.shape[-1] == 3, f"Expected input shape (N, D, H, W, 3), got {x.shape}"
     
-    self.clear_cache()
+    #self.clear_cache()
 
     t = x.shape[1]
     iter_ = 1 + (t - 1) // 4
@@ -590,7 +675,7 @@ def _encode(self, x: jax.Array):
         out = self.encoder(
           x[:, :1, :, :, :],
           feat_cache=self._enc_feat_map,
-          feat_ids=self._enc_conv_idx
+          feat_idx=self._enc_conv_idx
         )
       else:
         out_ = self.encoder(
@@ -600,11 +685,12 @@ def _encode(self, x: jax.Array):
         )
         out = jnp.concatenate([out, out_], axis=1)
     
-    enc = self.quant_conv(out)
-    mu, logvar = enc[:, :, :, :, : self.z_dim], enc[:, :, :, :, self.z_dim :]
-    enc = jnp.concatenate([mu, logvar], dim=1)
-    self.clear_cache()
-    return enc
+    # enc = self.quant_conv(out)
+    # mu, logvar = enc[:, :, :, :, : self.z_dim], enc[:, :, :, :, self.z_dim :]
+    # enc = jnp.concatenate([mu, logvar], dim=1)
+    # self.clear_cache()
+    # return enc
+    return x
 
   def encode(self, x: jax.Array, return_dict: bool = True) -> Union[FlaxAutoencoderKLOutput, Tuple[FlaxDiagonalGaussianDistribution]]:
     """ Encode video into latent distribution."""
diff --git a/src/maxdiffusion/tests/wan_vae_test.py b/src/maxdiffusion/tests/wan_vae_test.py
@@ -30,6 +30,7 @@
   WanUpsample,
   AutoencoderKLWan,
   WanEncoder3d,
+  WanResidualBlock,
   WanRMS_norm,
   WanResample,
   ZeroPaddedConv2D
@@ -318,6 +319,45 @@ def test_3d_conv(self):
     output_with_larger_cache = causal_conv_layer(dummy_input, cache_x=dummy_larger_cache)
     assert output_with_larger_cache.shape == (1, 10, 32, 32, 16)
 
+  def test_wan_residual(self):
+    key = jax.random.key(0)
+    rngs = nnx.Rngs(key)
+    # Case 1: in_dim == out_dim, so conv_shortcut is Identity and the output shape equals the input shape.
+    in_dim = out_dim = 96
+    batch = 1
+    t = 1
+    height = 480
+    width = 720
+    dim = 96
+    input_shape = (batch, t, height, width, dim)
+    expected_output_shape = (batch, t, height, width, dim)
+
+    wan_residual_block = WanResidualBlock(
+      in_dim=in_dim,
+      out_dim=out_dim,
+      rngs=rngs,
+    )
+    dummy_input = jnp.ones(input_shape)
+    dummy_output = wan_residual_block(dummy_input)
+    assert dummy_output.shape == expected_output_shape
+
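+    # Case 2: in_dim != out_dim, so conv_shortcut is a kernel_size=1 WanCausalConv3d and the last dim becomes out_dim.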
+    in_dim = 96
+    out_dim = 196
+    expected_output_shape = (batch, t, height, width, out_dim)
+
+    wan_residual_block = WanResidualBlock(
+      in_dim=in_dim,
+      out_dim=out_dim,
+      rngs=rngs,
+    )
+    dummy_input = jnp.ones(input_shape)
+    dummy_output = wan_residual_block(dummy_input)
+    assert dummy_output.shape == expected_output_shape
+
+
+
+
   def test_wan_encode(self):
     key = jax.random.key(0)
     rngs = nnx.Rngs(key)
@@ -328,24 +368,34 @@ def test_wan_encode(self):
     attn_scales = []
     temperal_downsample = [False, True, True]
     nonlinearity = "silu"
-    wan_encoder = WanEncoder3d(
-       rngs=rngs,
-       dim=dim,
-       z_dim=z_dim,
-       dim_mult=dim_mult,
-       num_res_blocks=num_res_blocks,
-       attn_scales=attn_scales,
-       temperal_downsample=temperal_downsample,
-       non_linearity=nonlinearity
+    wan_vae = AutoencoderKLWan(
+      rngs=rngs,
+      base_dim=dim,
+      z_dim=z_dim,
+      dim_mult=dim_mult,
+      num_res_blocks=num_res_blocks,
+      attn_scales=attn_scales,
+      temperal_downsample=temperal_downsample,
     )
+    # wan_encoder = WanEncoder3d(
+    #    rngs=rngs,
+    #    dim=dim,
+    #    z_dim=z_dim,
+    #    dim_mult=dim_mult,
+    #    num_res_blocks=num_res_blocks,
+    #    attn_scales=attn_scales,
+    #    temperal_downsample=temperal_downsample,
+    #    non_linearity=nonlinearity
+    # )
     batch = 1
     channels = 3
     t = 49
     height = 480
     width = 720
     input_shape = (batch, channels, t, height, width)
     input = jnp.ones(input_shape)
-    output = wan_encoder(input)
+    output = wan_vae.encode(input)
+    breakpoint()