Skip to content

Commit d9f55d2

Browse files
committed
revert
1 parent 42a97d7 commit d9f55d2

2 files changed

Lines changed: 19 additions & 28 deletions

File tree

src/maxdiffusion/models/embeddings_flax.py

Lines changed: 11 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -385,25 +385,6 @@ def __call__(self, caption):
385385
return hidden_states
386386

387387

388-
class NNXSimpleLinearWrapper(nnx.Module):
389-
390-
def __init__(self, rngs: nnx.Rngs, in_features: int, out_features: int, weights_dtype: jnp.dtype):
391-
super().__init__()
392-
self.linear = nnx.Linear(
393-
rngs=rngs,
394-
in_features=in_features,
395-
out_features=out_features,
396-
use_bias=True,
397-
dtype=jnp.float32,
398-
param_dtype=weights_dtype,
399-
kernel_init=nnx.with_partitioning(nnx.initializers.xavier_uniform(), ("mlp", "embed")),
400-
bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
401-
)
402-
403-
def __call__(self, x):
404-
return self.linear(x)
405-
406-
407388
class PixArtAlphaTextProjection(nn.Module):
408389
"""
409390
Projects caption embeddings. Also handles dropout for classifier-free guidance.
@@ -505,13 +486,19 @@ def __init__(
505486
weights_dtype: jnp.dtype = jnp.float32,
506487
precision: jax.lax.Precision = None,
507488
):
508-
self.text_embedder = NNXPixArtAlphaTextProjection(
489+
if out_features is None:
490+
out_features = hidden_size
491+
492+
self.linear = nnx.Linear(
509493
rngs=rngs,
510494
in_features=in_features,
511-
hidden_size=embedding_dim,
512-
dtype=dtype,
513-
weights_dtype=weights_dtype,
495+
out_features=out_features,
496+
use_bias=True,
497+
dtype=jnp.float32,
498+
param_dtype=weights_dtype,
514499
precision=precision,
500+
kernel_init=nnx.with_partitioning(nnx.initializers.xavier_uniform(), ("mlp", "embed")),
501+
bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
515502
)
516503

517504
self.time_proj = NNXTimesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
@@ -529,7 +516,7 @@ def __init__(self, rngs: nnx.Rngs, embedding_dim: int, weights_dtype: jnp.dtype)
529516
self.emb = EmbWrapper(rngs, embedding_dim, weights_dtype)
530517

531518
def __call__(self, caption, timestep):
532-
hidden_states = self.text_embedder(caption)
519+
hidden_states = self.linear(caption)
533520

534521
timesteps_proj = self.time_proj(timestep)
535522
timesteps_emb = self.emb.timestep_embedder(timesteps_proj)

src/maxdiffusion/models/ltx2/transformer_ltx2.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -696,16 +696,20 @@ def __init__(
696696

697697
# 2. Prompt embeddings
698698
if self.cross_attn_mod:
699-
self.caption_projection = NNXSimpleLinearWrapper(
699+
self.caption_projection = NNXCombinedTimestepTextProjEmbeddings(
700700
rngs=rngs,
701701
in_features=self.caption_channels,
702-
out_features=self.cross_attention_dim,
702+
hidden_size=self.cross_attention_dim,
703+
embedding_dim=inner_dim,
704+
dtype=self.dtype,
703705
weights_dtype=self.weights_dtype,
704706
)
705-
self.audio_caption_projection = NNXSimpleLinearWrapper(
707+
self.audio_caption_projection = NNXCombinedTimestepTextProjEmbeddings(
706708
rngs=rngs,
707709
in_features=self.audio_caption_channels,
708-
out_features=self.audio_cross_attention_dim,
710+
hidden_size=self.audio_cross_attention_dim,
711+
embedding_dim=audio_inner_dim,
712+
dtype=self.dtype,
709713
weights_dtype=self.weights_dtype,
710714
)
711715
else:

0 commit comments

Comments (0)