wan transformer with in/out shapes verified

jfacevedo-google · jfacevedo-google · commit 440f39c2a4a6 · 2025-05-13T22:59:38.000Z
diff --git a/src/maxdiffusion/models/wan/transformers/transformer_wan.py b/src/maxdiffusion/models/wan/transformers/transformer_wan.py
@@ -15,10 +15,11 @@
 """
 
 from typing import Tuple, Optional, Dict, Union, Any
+import math
 import jax
 import jax.numpy as jnp
 from flax import nnx
-from .... import common_types, max_logging
+from .... import common_types
 from ...modeling_flax_utils import FlaxModelMixin, get_activation
 from ....configuration_utils import ConfigMixin, register_to_config
 from ...embeddings_flax import (
@@ -447,7 +448,7 @@ def __init__(
         rngs=rngs,
         dim=inner_dim,
         ffn_dim=ffn_dim,
-        num_attention_heads=num_attention_heads,
+        num_heads=num_attention_heads,
         qk_norm=qk_norm,
         cross_attn_norm=cross_attn_norm,
         eps=eps,
@@ -462,6 +463,20 @@ def __init__(
       blocks.append(block)
     self.blocks = blocks
 
+    self.norm_out = FP32LayerNorm(rngs=rngs, dim=inner_dim, eps=eps, elementwise_affine=False)
+    self.proj_out = nnx.Linear(
+      rngs=rngs,
+      in_features=inner_dim,
+      out_features=out_channels * math.prod(patch_size),
+      dtype=dtype,
+      param_dtype=weights_dtype,
+      precision=precision,
+      kernel_init=nnx.with_partitioning(nnx.initializers.xavier_uniform(), ("embed", "mlp",)),
+      bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("mlp",)),
+    )
+    key = rngs.params()
+    self.scale_shift_table = nnx.Param(jax.random.normal(key, (1, 2, inner_dim)) / inner_dim**0.5)
+
   def __call__(
     self,
     hidden_states: jax.Array,
@@ -492,7 +507,14 @@ def __call__(
 
     for block in self.blocks:
       hidden_states = block(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)
-    breakpoint()
+      
+    shift, scale = jnp.split(self.scale_shift_table + jnp.expand_dims(temb, axis=1), 2, axis=1)
+
+    hidden_states = (self.norm_out(hidden_states.astype(jnp.float32)) * (1 + scale) + shift).astype(hidden_states.dtype)
+    hidden_states = self.proj_out(hidden_states)
 
+    # TODO - can this reshape happen in a single command?
+    hidden_states = hidden_states.reshape(batch_size, post_patch_num_frames, post_patch_height, post_patch_width, p_t, p_h, p_w, -1)
+    hidden_states = hidden_states.reshape(batch_size, num_frames, height, width, num_channels)
 
     return hidden_states
diff --git a/src/maxdiffusion/tests/wan_transformer_test.py b/src/maxdiffusion/tests/wan_transformer_test.py
@@ -27,7 +27,9 @@
   create_device_mesh,
   get_flash_block_sizes
 )
-from ..models.wan.transformers.transformer_wan import WanRotaryPosEmbed, WanTimeTextImageEmbedding, WanTransformerBlock
+from ..models.wan.transformers.transformer_wan import (
+  WanRotaryPosEmbed, WanTimeTextImageEmbedding, WanTransformerBlock, WanModel
+)
 from ..models.embeddings_flax import NNXTimestepEmbedding, NNXPixArtAlphaTextProjection
 from ..models.normalization_flax import FP32LayerNorm
 from ..models.attention_flax import FlaxWanAttention
@@ -256,7 +258,49 @@ def test_wan_attention(self):
     except NotImplementedError as e:
       pass
     
-    
+  def test_wan_model(self):
+    pyconfig.initialize(
+      [
+        None,
+        os.path.join(THIS_DIR, "..", "configs", "base_wan_14b.yml"),
+      ],
+      unittest=True
+    )
+    config = pyconfig.config
+
+    batch_size = 1
+    channels = 16
+    frames = 21
+    height = 90
+    width = 160
+    hidden_states_shape = (batch_size, frames, height, width, channels)
+    dummy_hidden_states = jnp.ones(hidden_states_shape)
+
+    key = jax.random.key(0)
+    rngs = nnx.Rngs(key)
+    devices_array = create_device_mesh(config)
+
+    flash_block_sizes = get_flash_block_sizes(config)
+
+    mesh = Mesh(devices_array, config.mesh_axes)
+    batch_size = 1
+    query_dim = 5120
+    wan_model = WanModel(
+      rngs=rngs,
+      attention="flash",
+      mesh=mesh,
+      flash_block_sizes=flash_block_sizes,
+    )
+
+    dummy_timestep = jnp.ones((batch_size))
+    dummy_encoder_hidden_states = jnp.ones((batch_size, 512, 4096))
+
+    dummy_output = wan_model(
+      hidden_states=dummy_hidden_states,
+      timestep=dummy_timestep,
+      encoder_hidden_states=dummy_encoder_hidden_states
+    )
+    assert dummy_output.shape == hidden_states_shape
 
 if __name__ == "__main__":
   absltest.main()