@@ -261,3 +261,118 @@ def block_scan_fn(carry, block_module):
261261 hidden_states = self .final_norm (hidden_states )
262262
263263 return hidden_states , attention_mask
264+
265+
class LTX2TextConnectors(nnx.Module):
    """Text-conditioning connectors for LTX-2.

    Projects text-encoder hidden states and feeds them through two independent
    ``Embeddings1DConnector`` stacks — one producing video conditioning
    embeddings, one producing audio conditioning embeddings.

    Two projection regimes exist:
      * ``per_modality_projections=True`` (LTX-2.3): per-token RMS norm,
        masking, modality-specific rescaling, then separate video/audio
        input projections.
      * ``per_modality_projections=False`` (LTX-2.0): a single shared
        projection — not implemented yet; calling raises ``NotImplementedError``.
    """

    def __init__(
        self,
        caption_channels: int = 3840,
        text_proj_in_factor: int = 49,
        video_connector_num_attention_heads: int = 30,
        video_connector_attention_head_dim: int = 128,
        video_connector_num_layers: int = 2,
        video_connector_num_learnable_registers: int = 128,
        video_gated_attn: bool = False,
        audio_connector_num_attention_heads: int = 30,
        audio_connector_attention_head_dim: int = 128,
        audio_connector_num_layers: int = 2,
        audio_connector_num_learnable_registers: int = 128,
        audio_gated_attn: bool = False,
        connector_rope_base_seq_len: int = 4096,
        rope_theta: float = 10000.0,
        rope_double_precision: bool = True,
        rope_type: str = "interleaved",
        per_modality_projections: bool = False,
        video_hidden_dim: int = 4096,
        audio_hidden_dim: int = 2048,
        proj_bias: bool = False,
        attention_kernel: str = "flash",
        mesh: jax.sharding.Mesh = None,
        rngs: nnx.Rngs = None,
    ):
        # Flattened text-encoder feature width: the encoder delivers
        # `text_proj_in_factor` stacked groups of `caption_channels` each.
        text_encoder_dim = caption_channels * text_proj_in_factor

        self.per_modality_projections = per_modality_projections
        self.caption_channels = caption_channels
        self.video_hidden_dim = video_hidden_dim
        self.audio_hidden_dim = audio_hidden_dim

        if per_modality_projections:
            # LTX-2.3: separate input projections per modality.
            self.video_text_proj_in = nnx.Linear(
                in_features=text_encoder_dim,
                out_features=video_hidden_dim,
                use_bias=proj_bias,
                rngs=rngs,
            )
            self.audio_text_proj_in = nnx.Linear(
                in_features=text_encoder_dim,
                out_features=audio_hidden_dim,
                use_bias=proj_bias,
                rngs=rngs,
            )
        else:
            # LTX-2.0: one shared projection back down to caption_channels.
            self.text_proj_in = nnx.Linear(
                in_features=text_encoder_dim,
                out_features=caption_channels,
                use_bias=proj_bias,
                rngs=rngs,
            )

        # RoPE/attention settings shared by both connector stacks.
        shared_connector_kwargs = dict(
            theta=rope_theta,
            rope_type=rope_type,
            base_seq_len=connector_rope_base_seq_len,
            double_precision=rope_double_precision,
            attention_kernel=attention_kernel,
            mesh=mesh,
            rngs=rngs,
        )

        self.video_connector = Embeddings1DConnector(
            input_dim=video_hidden_dim if per_modality_projections else caption_channels,
            heads=video_connector_num_attention_heads,
            head_dim=video_connector_attention_head_dim,
            layers=video_connector_num_layers,
            num_learnable_registers=video_connector_num_learnable_registers,
            gated_attn=video_gated_attn,
            **shared_connector_kwargs,
        )

        self.audio_connector = Embeddings1DConnector(
            input_dim=audio_hidden_dim if per_modality_projections else caption_channels,
            heads=audio_connector_num_attention_heads,
            head_dim=audio_connector_attention_head_dim,
            layers=audio_connector_num_layers,
            num_learnable_registers=audio_connector_num_learnable_registers,
            gated_attn=audio_gated_attn,
            **shared_connector_kwargs,
        )

    def __call__(self, text_encoder_hidden_states: Array, attention_mask: Array) -> Tuple[Array, Array, Array]:
        """Produce (video_text_embedding, audio_text_embedding, video_attn_mask).

        Accepts hidden states either flattened as (b, l, caption_channels * factor)
        or pre-grouped as a 4-D array; a 3-D input is reshaped into
        (b, l, caption_channels, factor) before normalization.
        """
        hidden = text_encoder_hidden_states
        if hidden.ndim == 3:
            batch, seq_len, feat_dim = hidden.shape
            group_factor = feat_dim // self.caption_channels
            hidden = hidden.reshape(batch, seq_len, self.caption_channels, group_factor)
        else:
            batch, seq_len = hidden.shape[0], hidden.shape[1]

        if not self.per_modality_projections:
            # LTX-2.0 shared-projection path is still a stub.
            raise NotImplementedError("LTX-2.0 path in LTX2TextConnectors not fully implemented yet.")

        # LTX-2.3: per-token RMS norm over the channel axis (axis=2).
        mean_square = jnp.mean(hidden**2, axis=2, keepdims=True)
        normed = hidden * jax.lax.rsqrt(mean_square + 1e-6)
        normed = normed.reshape(batch, seq_len, -1)

        # Zero out padded token positions before projecting.
        keep = (attention_mask > 0.5).astype(jnp.float32)[..., None]
        normed = normed * keep

        # Rescale so each modality's projection sees norms matched to its width.
        video_input = normed * jnp.sqrt(self.video_hidden_dim / self.caption_channels)
        audio_input = normed * jnp.sqrt(self.audio_hidden_dim / self.caption_channels)

        video_proj = self.video_text_proj_in(video_input)
        audio_proj = self.audio_text_proj_in(audio_input)

        video_text_embedding, video_attn_mask = self.video_connector(video_proj, attention_mask)
        audio_text_embedding, _ = self.audio_connector(audio_proj, attention_mask)

        return video_text_embedding, audio_text_embedding, video_attn_mask
0 commit comments