Skip to content

Commit 08444fd

Browse files
add wan time text embedding layer.
1 parent 064fc5f commit 08444fd

4 files changed

Lines changed: 118 additions & 8 deletions

File tree

src/maxdiffusion/models/embeddings_flax.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,51 @@ def get_1d_rotary_pos_embed(
230230
out = jax.lax.complex(jnp.ones_like(freqs), freqs)
231231
return out
232232

233+
class NNXPixArtAlphaTextProjection(nnx.Module):
  """NNX port of ``PixArtAlphaTextProjection``: a two-layer MLP over caption embeddings.

  Projects text-encoder features of width ``in_features`` to ``out_features``
  (defaults to ``hidden_size``) via linear -> activation -> linear.

  Args:
    rngs: NNX RNG container used to initialize the linear layers.
    in_features: Width of the incoming caption embeddings.
    hidden_size: Width of the intermediate projection.
    out_features: Output width; falls back to ``hidden_size`` when ``None``.
    act_fn: Name of the activation resolved via ``get_activation``.
    dtype: Computation dtype of the linear layers.
    weights_dtype: Parameter storage dtype.
    precision: Optional matmul precision passed through to ``nnx.Linear``.
  """

  def __init__(
      self,
      rngs: nnx.Rngs,
      in_features: int,
      hidden_size: int,
      out_features: int | None = None,  # annotation fixed: None is a valid (and the default) value
      act_fn: str = "gelu_tanh",
      dtype: jnp.dtype = jnp.float32,
      weights_dtype: jnp.dtype = jnp.float32,
      precision: jax.lax.Precision | None = None,  # annotation fixed: None is the default
  ):
    if out_features is None:
      out_features = hidden_size

    self.linear_1 = nnx.Linear(
        rngs=rngs,
        in_features=in_features,
        out_features=hidden_size,
        use_bias=True,
        dtype=dtype,
        param_dtype=weights_dtype,
        precision=precision,
        # Partition spec: input axis "embed", hidden axis "mlp".
        kernel_init=nnx.with_partitioning(nnx.initializers.xavier_uniform(), ("embed", "mlp")),
        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("mlp",)),
    )
    self.act_1 = get_activation(act_fn)

    self.linear_2 = nnx.Linear(
        rngs=rngs,
        in_features=hidden_size,
        out_features=out_features,
        use_bias=True,
        dtype=dtype,
        param_dtype=weights_dtype,
        precision=precision,
        # Mirror of linear_1's spec: hidden axis "mlp" back to "embed".
        kernel_init=nnx.with_partitioning(nnx.initializers.xavier_uniform(), ("mlp", "embed")),
        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
    )

  def __call__(self, caption):
    """Apply linear_1 -> activation -> linear_2 to ``caption`` and return the result."""
    hidden_states = self.linear_1(caption)
    hidden_states = self.act_1(hidden_states)
    hidden_states = self.linear_2(hidden_states)
    return hidden_states
233278

234279
class PixArtAlphaTextProjection(nn.Module):
235280
"""

src/maxdiffusion/models/modeling_flax_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@
4242

4343

4444
logger = logging.get_logger(__name__)
45-
46-
_ACTIVATIONS = {"swish": jax.nn.silu, "silu": jax.nn.silu, "relu": jax.nn.relu, "gelu": jax.nn.gelu, "mish": jax.nn.mish}
45+
# gelu and gelu_tanh both use approximate=True by default
46+
_ACTIVATIONS = {"swish": jax.nn.silu, "silu": jax.nn.silu, "relu": jax.nn.relu, "gelu": jax.nn.gelu, "gelu_tanh" : jax.nn.gelu, "mish": jax.nn.mish}
4747

4848
def get_activation(name: str):
4949
func = _ACTIVATIONS.get(name)

src/maxdiffusion/models/wan/transformers/transformer_wan.py

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,14 @@
1919
import jax.numpy as jnp
2020
from flax import nnx
2121
from .... import common_types, max_logging
22-
from ...modeling_flax_utils import FlaxModelMixin
22+
from ...modeling_flax_utils import FlaxModelMixin, get_activation
2323
from ....configuration_utils import ConfigMixin, register_to_config
24-
from ...embeddings_flax import get_1d_rotary_pos_embed, NNXFlaxTimesteps, NNXTimestepEmbedding
24+
from ...embeddings_flax import (
25+
get_1d_rotary_pos_embed,
26+
NNXFlaxTimesteps,
27+
NNXTimestepEmbedding,
28+
NNXPixArtAlphaTextProjection
29+
)
2530

2631
BlockSizes = common_types.BlockSizes
2732

@@ -101,6 +106,23 @@ def __init__(
101106
rngs=rngs, in_channels=time_freq_dim, time_embed_dim=dim,
102107
dtype=dtype, weights_dtype=weights_dtype, precision=precision
103108
)
109+
self.act_fn = get_activation("silu")
110+
self.time_proj = nnx.Linear(
111+
rngs=rngs,
112+
in_features=dim,
113+
out_features=time_proj_dim,
114+
dtype=dtype,
115+
param_dtype=weights_dtype,
116+
precision=precision,
117+
kernel_init=nnx.with_partitioning(nnx.initializers.xavier_uniform(), ("embed", "mlp",)),
118+
bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("mlp",)),
119+
)
120+
self.text_embedder = NNXPixArtAlphaTextProjection(
121+
rngs=rngs,
122+
in_features=text_embed_dim,
123+
hidden_size=dim,
124+
act_fn="gelu_tanh",
125+
)
104126

105127
def __call__(
106128
self,
@@ -110,7 +132,13 @@ def __call__(
110132
):
111133
timestep = self.timesteps_proj(timestep)
112134
temb = self.time_embedder(timestep)
113-
breakpoint()
135+
136+
timestep_proj = self.time_proj(self.act_fn(temb))
137+
138+
encoder_hidden_states = self.text_embedder(encoder_hidden_states)
139+
if encoder_hidden_states_image is not None:
140+
raise NotImplementedError("currently img2vid is not supported")
141+
return temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image
114142

115143

116144

src/maxdiffusion/tests/wan_transformer_test.py

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@
2020
from absl.testing import absltest
2121
from flax import nnx
2222

23-
from ..models.wan.transformers.transformer_wan import WanRotaryPosEmbed
24-
from ..models.embeddings_flax import NNXTimestepEmbedding
23+
from ..models.wan.transformers.transformer_wan import WanRotaryPosEmbed, WanTimeTextImageEmbedding
24+
from ..models.embeddings_flax import NNXTimestepEmbedding, NNXPixArtAlphaTextProjection
2525

2626
class WanTransformerTest(unittest.TestCase):
2727
def setUp(self):
@@ -42,7 +42,19 @@ def test_rotary_pos_embed(self):
4242
)
4343
dummy_output = wan_rot_embed(dummy_hidden_states)
4444
assert dummy_output.shape == (1, 1, 75600, 64)
45-
45+
46+
def test_nnx_pixart_alpha_text_projection(self):
  """Text projection maps (B, S, in_features) -> (B, S, hidden_size)."""
  key = jax.random.key(0)
  rngs = nnx.Rngs(key)
  dummy_caption = jnp.ones((1, 512, 4096))
  layer = NNXPixArtAlphaTextProjection(rngs=rngs, in_features=4096, hidden_size=5120)
  dummy_output = layer(dummy_caption)
  # Bug fix: the original line was a bare comparison (missing `assert`),
  # so the shape check was silently a no-op.
  assert dummy_output.shape == (1, 512, 5120)
4658
def test_nnx_timestep_embedding(self):
4759
key = jax.random.key(0)
4860
rngs = nnx.Rngs(key)
@@ -56,5 +68,30 @@ def test_nnx_timestep_embedding(self):
5668
dummy_output = layer(dummy_sample)
5769
assert dummy_output.shape == (1, 5120)
5870

71+
def test_wan_time_text_embedding(self):
  """Smoke-test WanTimeTextImageEmbedding output shapes for text-to-video inputs."""
  rngs = nnx.Rngs(jax.random.key(0))
  batch_size = 1
  dim = 5120
  time_freq_dim = 256
  time_proj_dim = 30720
  text_embed_dim = 4096

  layer = WanTimeTextImageEmbedding(
      rngs=rngs,
      dim=dim,
      time_freq_dim=time_freq_dim,
      time_proj_dim=time_proj_dim,
      text_embed_dim=text_embed_dim,
  )

  dummy_timestep = jnp.ones(batch_size)
  dummy_encoder_hidden_states = jnp.ones((batch_size, time_freq_dim * 2, text_embed_dim))

  temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image = layer(
      dummy_timestep, dummy_encoder_hidden_states
  )
  assert temb.shape == (batch_size, dim)
  assert timestep_proj.shape == (batch_size, time_proj_dim)
  assert encoder_hidden_states.shape == (batch_size, time_freq_dim * 2, dim)
5996
if __name__ == "__main__":
6097
absltest.main()

0 commit comments

Comments
 (0)