1414 limitations under the License.
1515"""
1616
17- from typing import Tuple , Optional
17+ from typing import Tuple , Optional , Dict , Union , Any
1818import jax
1919import jax .numpy as jnp
2020from flax import nnx
2121from .... import common_types , max_logging
2222from ...modeling_flax_utils import FlaxModelMixin
23- from ....configuration_utils import ConfigMixin
24- from ...embeddings_flax import get_1d_rotary_pos_embed
23+ from ....configuration_utils import ConfigMixin , register_to_config
24+ from ...embeddings_flax import get_1d_rotary_pos_embed , NNXFlaxTimesteps , NNXTimestepEmbedding
2525
2626BlockSizes = common_types .BlockSizes
2727
@@ -65,7 +65,7 @@ def __call__(self, hidden_states: jax.Array) -> jax.Array:
6565 cumulative_sizes = jnp .cumsum (jnp .array (sizes ))
6666 split_indices = cumulative_sizes [:- 1 ]
6767 freqs_split = jnp .split (self .freqs , split_indices , axis = 1 )
68-
68+
6969 freqs_f = jnp .expand_dims (jnp .expand_dims (freqs_split [0 ][:ppf ], axis = 1 ), axis = 1 )
7070 freqs_f = jnp .broadcast_to (freqs_f , (ppf , pph , ppw , freqs_split [0 ].shape [- 1 ]))
7171
@@ -80,6 +80,40 @@ def __call__(self, hidden_states: jax.Array) -> jax.Array:
8080 return freqs_final
8181
8282
class WanTimeTextImageEmbedding(nnx.Module):
  """Combined timestep / text / image conditioning embedder for the Wan model.

  Only the timestep pathway is implemented so far; the projection layers
  implied by ``time_proj_dim``, ``text_embed_dim``, ``image_embed_dim`` and
  ``pos_embed_seq_len`` are accepted for config compatibility but not yet
  built.
  """

  def __init__(
      self,
      rngs: nnx.Rngs,
      dim: int,
      time_freq_dim: int,
      time_proj_dim: int,
      text_embed_dim: int,
      image_embed_dim: Optional[int] = None,
      pos_embed_seq_len: Optional[int] = None,
      dtype: jnp.dtype = jnp.float32,
      weights_dtype: jnp.dtype = jnp.float32,
      precision: jax.lax.Precision = None,
  ):
    # Sinusoidal frequency features for the scalar diffusion timestep.
    self.timesteps_proj = NNXFlaxTimesteps(dim=time_freq_dim, flip_sin_to_cos=True, freq_shift=0)
    # MLP lifting the frequency features to the transformer width ``dim``.
    self.time_embedder = NNXTimestepEmbedding(
        rngs=rngs,
        in_channels=time_freq_dim,
        time_embed_dim=dim,
        dtype=dtype,
        weights_dtype=weights_dtype,
        precision=precision,
    )

  def __call__(
      self,
      timestep: jax.Array,
      encoder_hidden_states: jax.Array,
      encoder_hidden_states_image: Optional[jax.Array] = None,
  ):
    """Embed the timestep and pass conditioning states through.

    Returns a 4-tuple ``(temb, timestep_proj, encoder_hidden_states,
    encoder_hidden_states_image)`` matching the unpacking done by the
    caller. ``timestep_proj`` is ``None`` until the extra time-projection
    layer is implemented; text / image states are returned unchanged.
    """
    timestep = self.timesteps_proj(timestep)
    temb = self.time_embedder(timestep)
    # Fixed: a stray ``breakpoint()`` (debugger trap) was left here and the
    # method implicitly returned None, breaking the caller's 4-way unpacking.
    # TODO: project activated ``temb`` to ``time_proj_dim`` for
    # ``timestep_proj`` and embed text/image conditions once those layers
    # exist (cf. the diffusers reference implementation).
    return temb, None, encoder_hidden_states, encoder_hidden_states_image
116+
83117class WanTransformer3DModel (nnx .Module , FlaxModelMixin , ConfigMixin ):
84118 def __init__ (
85119 self ,
@@ -120,25 +154,28 @@ def __init__(
120154
121155
class WanModel(nnx.Module, FlaxModelMixin, ConfigMixin):
  """Wan 2.x video diffusion transformer (Flax NNX port) — work in progress.

  Only the patch embedding, rotary position embedding and the conditioning
  embedder are wired up; the transformer blocks, output norm/projection and
  the un-patchify step are still missing, so ``__call__`` currently returns
  the patch-embedded hidden states.
  """

  @register_to_config
  def __init__(
      self,
      rngs: nnx.Rngs,
      model_type="t2v",
      patch_size: Tuple[int] = (1, 2, 2),
      num_attention_heads: int = 40,
      attention_head_dim: int = 128,
      in_channels: int = 16,
      out_channels: int = 16,
      text_dim: int = 4096,
      freq_dim: int = 256,
      ffn_dim: int = 13824,
      num_layers: int = 40,
      cross_attn_norm: bool = True,
      qk_norm: Optional[str] = "rms_norm_across_heads",
      eps: float = 1e-6,
      image_dim: Optional[int] = None,
      added_kn_proj_dim: Optional[int] = None,
      rope_max_seq_len: int = 1024,
      pos_embed_seq_len: Optional[int] = None,
      flash_min_seq_length: int = 4096,
      flash_block_sizes: BlockSizes = None,
      mesh: jax.sharding.Mesh = None,
      dtype: jnp.dtype = jnp.float32,
      weights_dtype: jnp.dtype = jnp.float32,
      precision: jax.lax.Precision = None,
      attention: str = "dot_product",
  ):
    # Transformer width is heads * head_dim (e.g. 40 * 128 = 5120 for 14B).
    inner_dim = num_attention_heads * attention_head_dim
    out_channels = out_channels or in_channels  # fall back to input channel count

    # 1. Patch & position embedding.
    self.rope = WanRotaryPosEmbed(attention_head_dim, patch_size, rope_max_seq_len)
    self.patch_embedding = nnx.Conv(
        in_channels,
        inner_dim,
        rngs=rngs,
        kernel_size=patch_size,
        strides=patch_size,  # stride == kernel -> non-overlapping patches
        dtype=dtype,
        param_dtype=weights_dtype,
        precision=precision,
        # NOTE(review): partitioning a conv *kernel* along ("batch",) looks
        # unusual for a weight tensor — confirm the intended sharding axis.
        kernel_init=nnx.with_partitioning(nnx.initializers.xavier_uniform(), ("batch",)),
    )

    # 2. Condition embeddings (image_embed_dim=1280 for the I2V model).
    self.condition_embedder = WanTimeTextImageEmbedding(
        rngs=rngs,
        dim=inner_dim,
        time_freq_dim=freq_dim,
        time_proj_dim=inner_dim * 6,
        text_embed_dim=text_dim,
        image_embed_dim=image_dim,
        pos_embed_seq_len=pos_embed_seq_len,
    )

  def __call__(
      self,
      hidden_states: jax.Array,
      timestep: jax.Array,
      encoder_hidden_states: jax.Array,
      encoder_hidden_states_image: Optional[jax.Array] = None,
      return_dict: bool = True,
      attention_kwargs: Optional[Dict[str, Any]] = None,
  ) -> Union[jax.Array, Dict[str, jax.Array]]:
    """Run the (partial) forward pass.

    Args:
      hidden_states: latent video, channels-last ``(B, F, H, W, C)``
        (the torch reference is channels-first ``(B, C, F, H, W)``).
      timestep: diffusion timestep(s).
      encoder_hidden_states: text-encoder conditioning states.
      encoder_hidden_states_image: optional image conditioning (I2V).
      return_dict / attention_kwargs: accepted for interface parity;
        not used yet.

    Returns:
      Patch-embedded hidden states ``(B, F'*H'*W', inner_dim)`` until the
      transformer blocks and un-patchify are implemented.
    """
    batch_size, num_frames, height, width, num_channels = hidden_states.shape
    p_t, p_h, p_w = self.config.patch_size
    # Patch-grid extents; kept for the (not yet implemented) un-patchify
    # step at the end of the network.
    post_patch_num_frames = num_frames // p_t
    post_patch_height = height // p_h
    post_patch_width = width // p_w

    # Rotary embedding is computed from the *pre*-patchified latent shape.
    rotary_emb = self.rope(hidden_states)

    # (B, F, H, W, C) -conv-> (B, F', H', W', inner_dim)
    # -collapse dims 1..-2-> (B, F'*H'*W', inner_dim); channels-last, so the
    # feature axis stays last (torch reference flattens to (B, C, F*H*W)).
    hidden_states = self.patch_embedding(hidden_states)
    hidden_states = jax.lax.collapse(hidden_states, 1, -1)

    temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image = self.condition_embedder(
        timestep, encoder_hidden_states, encoder_hidden_states_image
    )

    # TODO: transformer blocks, output norm/projection and un-patchify are
    # not implemented yet; returning patch-embedded states for now.
    return hidden_states
0 commit comments