
Commit 4c00085

add transformer block
1 parent 1abc00c commit 4c00085

3 files changed

Lines changed: 274 additions & 14 deletions

src/maxdiffusion/models/attention_flax.py

Lines changed: 8 additions & 7 deletions
@@ -697,11 +697,11 @@ def _apply_rope(self, xq: jax.Array, xk: jax.Array, freqs_cis: jax.Array) -> Tup
   def __call__(
       self,
       hidden_states: jax.Array,
-      encoder_hidden_states: jax.Array,
+      encoder_hidden_states: Optional[jax.Array] = None,
       rotary_emb: Optional[jax.Array] = None
   ) -> jax.Array:
+
     dtype = hidden_states.dtype
-    # batch_size = hidden_states.shape[0]
     if encoder_hidden_states is None:
       encoder_hidden_states = hidden_states
     query_proj = self.query(hidden_states)

@@ -715,12 +715,14 @@ def __call__(
     if self.qk_norm:
       query_proj = self.query_norm(query_proj)
       key_proj = self.key_norm(key_proj)
-    query_proj = _unflatten_heads(query_proj, self.heads)
-    key_proj = _unflatten_heads(key_proj, self.heads)
+
     if rotary_emb is not None:
+      query_proj = _unflatten_heads(query_proj, self.heads)
+      key_proj = _unflatten_heads(key_proj, self.heads)
       query_proj, key_proj = self._apply_rope(query_proj, key_proj, rotary_emb)
-    query_proj = _reshape_heads_to_head_dim(query_proj)
-    key_proj = _reshape_heads_to_head_dim(key_proj)
+      query_proj = _reshape_heads_to_head_dim(query_proj)
+      key_proj = _reshape_heads_to_head_dim(key_proj)
+
     attn_output = self.attention_op.apply_attention(query_proj, key_proj, value_proj)
     attn_output = attn_output.astype(dtype=dtype)

@@ -1309,7 +1311,6 @@ def __call__(self, hidden_states, context, deterministic=True, cross_attention_k
     hidden_states = hidden_states + residual
     return self.dropout_layer(hidden_states, deterministic=deterministic)

-
 class FlaxFeedForward(nn.Module):
   r"""
   Flax module that encapsulates two Linear layers separated by a non-linearity. It is the counterpart of PyTorch's
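Note on the second hunk: RoPE operates on a per-head view of the projections, so the commit moves `_unflatten_heads` and `_reshape_heads_to_head_dim` inside the `rotary_emb is not None` branch, keeping the flattened (B, S, H*D) layout everywhere else. A minimal, self-contained sketch of that control flow, with stand-in helpers (the repo's `_unflatten_heads`, `_apply_rope`, and `_reshape_heads_to_head_dim` are assumed to behave roughly like these):

import jax
import jax.numpy as jnp

# Stand-ins for the repo's private helpers; (B, S, H*D) input shapes assumed.
def unflatten_heads(x, heads):
  b, s, _ = x.shape
  return x.reshape(b, s, heads, -1).transpose(0, 2, 1, 3)  # (B, H, S, D)

def flatten_heads(x):
  b, h, s, d = x.shape
  return x.transpose(0, 2, 1, 3).reshape(b, s, h * d)  # (B, S, H*D)

def apply_rope(xq, xk, freqs_cis):
  # Rotate channel pairs as complex numbers by the precomputed frequencies.
  to_c = lambda x: jax.lax.complex(x[..., ::2], x[..., 1::2])
  from_c = lambda c: jnp.stack([c.real, c.imag], axis=-1).reshape(*c.shape[:-1], -1)
  return from_c(to_c(xq) * freqs_cis), from_c(to_c(xk) * freqs_cis)

b, s, heads, head_dim = 1, 8, 4, 16
query_proj = jnp.ones((b, s, heads * head_dim))
key_proj = jnp.ones((b, s, heads * head_dim))
rotary_emb = jnp.exp(1j * jnp.zeros((1, 1, s, head_dim // 2)))  # identity rotation

if rotary_emb is not None:
  # The per-head view exists only while RoPE is applied.
  q, k = unflatten_heads(query_proj, heads), unflatten_heads(key_proj, heads)
  q, k = apply_rope(q, k, rotary_emb)
  query_proj, key_proj = flatten_heads(q), flatten_heads(k)

assert query_proj.shape == (b, s, heads * head_dim)

With the identity rotation the projections round-trip unchanged; the point is only that the layout change now happens exclusively on the RoPE path.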

src/maxdiffusion/models/wan/transformers/transformer_wan.py

Lines changed: 196 additions & 6 deletions
@@ -28,6 +28,7 @@
     NNXPixArtAlphaTextProjection
 )
 from ...normalization_flax import FP32LayerNorm
+from ...attention_flax import FlaxWanAttention

 BlockSizes = common_types.BlockSizes

@@ -181,6 +182,89 @@ def __init__(
       rope_max_seq_len
     )

+class ApproximateGELU(nnx.Module):
+  r"""
+  The approximate form of the Gaussian Error Linear Unit (GELU). For more details, see section 2 of this
+  [paper](https://arxiv.org/abs/1606.08415).
+  """
+  def __init__(
+      self,
+      rngs: nnx.Rngs,
+      dim_in: int,
+      dim_out: int,
+      bias: bool,
+      dtype: jnp.dtype = jnp.float32,
+      weights_dtype: jnp.dtype = jnp.float32,
+      precision: jax.lax.Precision = None,
+  ):
+    self.proj = nnx.Linear(
+        rngs=rngs,
+        in_features=dim_in,
+        out_features=dim_out,
+        use_bias=bias,
+        dtype=dtype,
+        param_dtype=weights_dtype,
+        precision=precision,
+        kernel_init=nnx.with_partitioning(nnx.initializers.xavier_uniform(), ("embed", "mlp")),
+        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("mlp",)),
+    )
+
+  def __call__(self, x: jax.Array) -> jax.Array:
+    x = self.proj(x)
+    return x * jax.nn.sigmoid(1.702 * x)
+
+
+class WanFeedForward(nnx.Module):
+  def __init__(
+      self,
+      rngs: nnx.Rngs,
+      dim: int,
+      dim_out: Optional[int] = None,
+      mult: int = 4,
+      dropout: float = 0.0,
+      activation_fn: str = "geglu",
+      final_dropout: bool = False,
+      inner_dim: Optional[int] = None,
+      bias: bool = True,
+      dtype: jnp.dtype = jnp.float32,
+      weights_dtype: jnp.dtype = jnp.float32,
+      precision: jax.lax.Precision = None,
+  ):
+    if inner_dim is None:
+      inner_dim = int(dim * mult)
+    dim_out = dim_out if dim_out is not None else dim
+
+    self.act_fn = None
+    if activation_fn == "gelu-approximate":
+      self.act_fn = ApproximateGELU(
+          rngs=rngs,
+          dim_in=dim,
+          dim_out=inner_dim,
+          bias=bias,
+          dtype=dtype,
+          weights_dtype=weights_dtype,
+          precision=precision
+      )
+    else:
+      raise NotImplementedError(f"{activation_fn} is not implemented.")
+
+    self.proj_out = nnx.Linear(
+        rngs=rngs,
+        in_features=inner_dim,
+        out_features=dim_out,
+        use_bias=bias,
+        dtype=dtype,
+        param_dtype=weights_dtype,
+        precision=precision,
+        kernel_init=nnx.with_partitioning(nnx.initializers.xavier_uniform(), ("mlp", "embed")),
+        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
+    )
+
+  def __call__(self, hidden_states: jax.Array) -> jax.Array:
+    hidden_states = self.act_fn(hidden_states)
+    return self.proj_out(hidden_states)
+

 class WanTransformerBlock(nnx.Module):
   def __init__(
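Aside on `ApproximateGELU`: the `x * sigmoid(1.702 * x)` form above is the sigmoid approximation of GELU from the linked paper, not the tanh form that `jax.nn.gelu` uses by default. A quick standalone comparison (values illustrative only):

import jax
import jax.numpy as jnp

# x * sigmoid(1.702 * x) vs. jax.nn.gelu's tanh approximation.
x = jnp.linspace(-4.0, 4.0, 9)
sigmoid_gelu = x * jax.nn.sigmoid(1.702 * x)
tanh_gelu = jax.nn.gelu(x, approximate=True)
print(jnp.max(jnp.abs(sigmoid_gelu - tanh_gelu)))  # small but nonzero, roughly 1e-2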
@@ -192,17 +276,107 @@ def __init__(
       qk_norm: str = "rms_norm_across_heads",
       cross_attn_norm: bool = False,
       eps: float = 1e-6,
-      added_kv_proj_dim: Optional[int] = None
+      # In torch, this is None, so it can be ignored here.
+      # added_kv_proj_dim: Optional[int] = None,
+      flash_min_seq_length: int = 4096,
+      flash_block_sizes: BlockSizes = None,
+      mesh: jax.sharding.Mesh = None,
+      dtype: jnp.dtype = jnp.float32,
+      weights_dtype: jnp.dtype = jnp.float32,
+      precision: jax.lax.Precision = None,
+      attention: str = "dot_product",
   ):
+    # 1. Self-attention
     self.norm1 = FP32LayerNorm(
+        rngs=rngs,
         dim=dim,
         eps=eps,
         elementwise_affine=False
     )
+    self.attn1 = FlaxWanAttention(
+        rngs=rngs,
+        query_dim=dim,
+        heads=num_heads,
+        dim_head=dim // num_heads,
+        qk_norm=qk_norm,
+        eps=eps,
+        flash_min_seq_length=flash_min_seq_length,
+        flash_block_sizes=flash_block_sizes,
+        mesh=mesh,
+        dtype=dtype,
+        weights_dtype=weights_dtype,
+        precision=precision,
+        attention_kernel=attention
+    )
+
+    # 2. Cross-attention
+    self.attn2 = FlaxWanAttention(
+        rngs=rngs,
+        query_dim=dim,
+        heads=num_heads,
+        dim_head=dim // num_heads,
+        qk_norm=qk_norm,
+        eps=eps,
+        flash_min_seq_length=flash_min_seq_length,
+        flash_block_sizes=flash_block_sizes,
+        mesh=mesh,
+        dtype=dtype,
+        weights_dtype=weights_dtype,
+        precision=precision,
+        attention_kernel=attention
+    )
+    assert cross_attn_norm, "WanTransformerBlock currently requires cross_attn_norm=True."
+    self.norm2 = FP32LayerNorm(
+        rngs=rngs,
+        dim=dim,
+        eps=eps,
+        elementwise_affine=True
+    )
+
+    # 3. Feed-forward
+    self.ffn = WanFeedForward(
+        rngs=rngs,
+        dim=dim,
+        inner_dim=ffn_dim,
+        activation_fn="gelu-approximate",
+        dtype=dtype,
+        weights_dtype=weights_dtype,
+        precision=precision
+    )
+    self.norm3 = FP32LayerNorm(rngs=rngs, dim=dim, eps=eps, elementwise_affine=False)
+
+    key = rngs.params()
+    self.scale_shift_table = nnx.Param(jax.random.normal(key, (1, 6, dim)) / dim**0.5)

-  def __call__(self):
-    pass
+  def __call__(
+      self,
+      hidden_states: jax.Array,
+      encoder_hidden_states: jax.Array,
+      temb: jax.Array,
+      rotary_emb: jax.Array
+  ):
+    shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = jnp.split(
+        self.scale_shift_table + temb.astype(jnp.float32), 6, axis=1
+    )
+
+    # 1. Self-attention
+    norm_hidden_states = (self.norm1(hidden_states.astype(jnp.float32)) * (1 + scale_msa) + shift_msa).astype(hidden_states.dtype)
+    attn_output = self.attn1(hidden_states=norm_hidden_states, rotary_emb=rotary_emb)
+    hidden_states = (hidden_states.astype(jnp.float32) + attn_output * gate_msa).astype(hidden_states.dtype)
+
+    # 2. Cross-attention
+    norm_hidden_states = self.norm2(hidden_states.astype(jnp.float32))
+    attn_output = self.attn2(hidden_states=norm_hidden_states, encoder_hidden_states=encoder_hidden_states)
+    hidden_states = hidden_states + attn_output
+
+    # 3. Feed-forward
+    norm_hidden_states = (self.norm3(hidden_states.astype(jnp.float32)) * (1 + c_scale_msa) + c_shift_msa).astype(hidden_states.dtype)
+    ff_output = self.ffn(norm_hidden_states)
+    hidden_states = (hidden_states.astype(jnp.float32) + ff_output.astype(jnp.float32) * c_gate_msa).astype(hidden_states.dtype)
+    return hidden_states


 class WanModel(nnx.Module, FlaxModelMixin, ConfigMixin):
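The `scale_shift_table` arithmetic in `__call__` above is adaLN-style modulation: adding the (B, 6, dim) time embedding to the learned (1, 6, dim) table and splitting on axis 1 yields six (B, 1, dim) vectors that shift, scale, and gate the attention and feed-forward branches, broadcasting over the sequence axis. A small sketch of just that arithmetic (shapes follow the test below; this is not the repo's code):

import jax
import jax.numpy as jnp

batch, seq, dim = 1, 10, 32
table = jax.random.normal(jax.random.key(0), (1, 6, dim)) / dim**0.5
temb = jnp.ones((batch, 6, dim))  # time embedding, as in the block's __call__

# (1, 6, dim) + (B, 6, dim) broadcasts, then splits into six (B, 1, dim) chunks.
shift_msa, scale_msa, gate_msa, c_shift, c_scale, c_gate = jnp.split(table + temb, 6, axis=1)

x = jnp.ones((batch, seq, dim))              # token activations
modulated = x * (1 + scale_msa) + shift_msa  # broadcasts over the sequence axis
x = x + modulated * gate_msa                 # gated residual, as in the self-attention step
print(x.shape)                               # (1, 10, 32)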
@@ -269,7 +443,22 @@ def __init__(
     # 3. Transformer blocks
     blocks = []
     for _ in range(num_layers):
-      block = WanTransformerBlock()
+      block = WanTransformerBlock(
+          rngs=rngs,
+          dim=inner_dim,
+          ffn_dim=ffn_dim,
+          num_heads=num_attention_heads,
+          qk_norm=qk_norm,
+          cross_attn_norm=cross_attn_norm,
+          eps=eps,
+          flash_min_seq_length=flash_min_seq_length,
+          flash_block_sizes=flash_block_sizes,
+          mesh=mesh,
+          dtype=dtype,
+          weights_dtype=weights_dtype,
+          precision=precision,
+          attention=attention
+      )
       blocks.append(block)
     self.blocks = blocks

@@ -301,8 +490,9 @@
     if encoder_hidden_states_image is not None:
       raise NotImplementedError("img2vid is not yet implemented.")

-    # for block in self.blocks:
-
+    for block in self.blocks:
+      hidden_states = block(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)

     return hidden_states

src/maxdiffusion/tests/wan_transformer_test.py

Lines changed: 70 additions & 1 deletion
@@ -27,7 +27,7 @@
     create_device_mesh,
     get_flash_block_sizes
 )
-from ..models.wan.transformers.transformer_wan import WanRotaryPosEmbed, WanTimeTextImageEmbedding
+from ..models.wan.transformers.transformer_wan import WanRotaryPosEmbed, WanTimeTextImageEmbedding, WanTransformerBlock
 from ..models.embeddings_flax import NNXTimestepEmbedding, NNXPixArtAlphaTextProjection
 from ..models.normalization_flax import FP32LayerNorm
 from ..models.attention_flax import FlaxWanAttention
@@ -119,6 +119,75 @@ def test_wan_time_text_embedding(self):
     assert timestep_proj.shape == (batch_size, time_proj_dim)
     assert encoder_hidden_states.shape == (batch_size, time_freq_dim * 2, dim)

+  def test_wan_block(self):
+    key = jax.random.key(0)
+    rngs = nnx.Rngs(key)
+    pyconfig.initialize(
+        [
+            None,
+            os.path.join(THIS_DIR, "..", "configs", "base_wan_14b.yml"),
+        ],
+        unittest=True
+    )
+    config = pyconfig.config
+
+    devices_array = create_device_mesh(config)
+    flash_block_sizes = get_flash_block_sizes(config)
+    mesh = Mesh(devices_array, config.mesh_axes)
+
+    dim = 5120
+    ffn_dim = 13824
+    num_heads = 40
+    qk_norm = "rms_norm_across_heads"
+    cross_attn_norm = True
+    eps = 1e-6
+
+    batch_size = 1
+    channels = 16
+    frames = 21
+    height = 90
+    width = 160
+    hidden_dim = 75600
+
+    # For the rotary positional embedding.
+    hidden_states_shape = (batch_size, frames, height, width, channels)
+    dummy_hidden_states = jnp.ones(hidden_states_shape)
+
+    wan_rot_embed = WanRotaryPosEmbed(
+        attention_head_dim=128,
+        patch_size=[1, 2, 2],
+        max_seq_len=1024
+    )
+    dummy_rotary_emb = wan_rot_embed(dummy_hidden_states)
+    assert dummy_rotary_emb.shape == (batch_size, 1, hidden_dim, 64)
+
+    # For the transformer block.
+    dummy_hidden_states = jnp.ones((batch_size, hidden_dim, dim))
+    dummy_encoder_hidden_states = jnp.ones((batch_size, 512, dim))
+    dummy_temb = jnp.ones((batch_size, 6, dim))
+
+    wan_block = WanTransformerBlock(
+        rngs=rngs,
+        dim=dim,
+        ffn_dim=ffn_dim,
+        num_heads=num_heads,
+        qk_norm=qk_norm,
+        cross_attn_norm=cross_attn_norm,
+        eps=eps,
+        attention="flash",
+        mesh=mesh,
+        flash_block_sizes=flash_block_sizes
+    )
+
+    dummy_output = wan_block(dummy_hidden_states, dummy_encoder_hidden_states, dummy_temb, dummy_rotary_emb)
+    assert dummy_output.shape == dummy_hidden_states.shape
+
 def test_wan_attention(self):
     pyconfig.initialize(
         [
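One number in `test_wan_block` worth unpacking: `hidden_dim = 75600` is consistent with patchifying the (frames, height, width) = (21, 90, 160) grid using `patch_size = [1, 2, 2]`:

frames, height, width = 21, 90, 160
p_t, p_h, p_w = 1, 2, 2  # patch_size
hidden_dim = (frames // p_t) * (height // p_h) * (width // p_w)
print(hidden_dim)  # 21 * 45 * 80 = 75600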
