@@ -39,6 +39,8 @@ class LTX2AudioVideoGemmaTextEncoder(nnx.Module, FlaxModelMixin, ConfigMixin):
3939 def __init__ (
4040 self ,
4141 caption_channels : int = 3840 ,
42+ video_caption_channels : Optional [int ] = None ,
43+ audio_caption_channels : Optional [int ] = None ,
4244 text_proj_in_factor : int = 49 ,
4345 video_connector_attention_head_dim : int = 128 ,
4446 video_connector_num_attention_heads : int = 30 ,
@@ -57,63 +59,151 @@ def __init__(
5759 attention_kernel : str = "flash" ,
5860 mesh : jax .sharding .Mesh = None ,
5961 rngs : nnx .Rngs = None ,
62+ per_modality_projections : bool = False ,
63+ proj_bias : bool = False ,
64+ video_gated_attn : bool = False ,
65+ audio_gated_attn : bool = False ,
66+ audio_hidden_dim : Optional [int ] = None ,
67+ video_hidden_dim : Optional [int ] = None ,
6068 ** kwargs ,
6169 ):
62- input_dim = caption_channels * text_proj_in_factor
63-
64- self .feature_extractor = LTX2GemmaFeatureExtractor (
65- input_dim = input_dim ,
66- output_dim = caption_channels ,
67- dtype = dtype ,
68- rngs = rngs ,
69- )
70-
71- # Two independent connectors
72- self .video_embeddings_connector = Embeddings1DConnector (
73- input_dim = caption_channels ,
74- heads = video_connector_num_attention_heads ,
75- head_dim = video_connector_attention_head_dim ,
76- layers = video_connector_num_layers ,
77- num_learnable_registers = video_connector_num_learnable_registers ,
78- rope_type = rope_type ,
79- theta = rope_theta ,
80- base_seq_len = connector_rope_base_seq_len ,
81- double_precision = rope_double_precision ,
82- attention_kernel = attention_kernel ,
83- mesh = mesh ,
84- rngs = rngs ,
85- )
86-
87- self .audio_embeddings_connector = Embeddings1DConnector (
88- input_dim = caption_channels ,
89- heads = audio_connector_num_attention_heads ,
90- head_dim = audio_connector_attention_head_dim ,
91- layers = audio_connector_num_layers ,
92- num_learnable_registers = audio_connector_num_learnable_registers ,
93- rope_type = rope_type ,
94- theta = rope_theta ,
95- base_seq_len = connector_rope_base_seq_len ,
96- double_precision = rope_double_precision ,
97- attention_kernel = attention_kernel ,
98- mesh = mesh ,
99- rngs = rngs ,
100- )
70+ gemma_dim = 3840 if video_caption_channels is not None else caption_channels
71+ input_dim = gemma_dim * text_proj_in_factor
72+
73+ v_dim = video_hidden_dim if video_hidden_dim is not None else (video_caption_channels if video_caption_channels is not None else caption_channels )
74+ a_dim = audio_hidden_dim if audio_hidden_dim is not None else (audio_caption_channels if audio_caption_channels is not None else caption_channels )
75+
76+ self .per_modality_projections = per_modality_projections
77+
78+ if per_modality_projections :
79+ self .video_text_proj_in = nnx .Linear (
80+ in_features = input_dim , out_features = v_dim , use_bias = proj_bias , rngs = rngs
81+ )
82+ self .audio_text_proj_in = nnx .Linear (
83+ in_features = input_dim , out_features = a_dim , use_bias = proj_bias , rngs = rngs
84+ )
85+
86+ self .video_embeddings_connector = Embeddings1DConnector (
87+ input_dim = v_dim ,
88+ heads = video_connector_num_attention_heads ,
89+ head_dim = video_connector_attention_head_dim ,
90+ layers = video_connector_num_layers ,
91+ num_learnable_registers = video_connector_num_learnable_registers ,
92+ rope_type = rope_type ,
93+ theta = rope_theta ,
94+ base_seq_len = connector_rope_base_seq_len ,
95+ double_precision = rope_double_precision ,
96+ attention_kernel = attention_kernel ,
97+ mesh = mesh ,
98+ rngs = rngs ,
99+ gated_attn = video_gated_attn ,
100+ )
101+ self .audio_embeddings_connector = Embeddings1DConnector (
102+ input_dim = a_dim ,
103+ heads = audio_connector_num_attention_heads ,
104+ head_dim = audio_connector_attention_head_dim ,
105+ layers = audio_connector_num_layers ,
106+ num_learnable_registers = audio_connector_num_learnable_registers ,
107+ rope_type = rope_type ,
108+ theta = rope_theta ,
109+ base_seq_len = connector_rope_base_seq_len ,
110+ double_precision = rope_double_precision ,
111+ attention_kernel = attention_kernel ,
112+ mesh = mesh ,
113+ rngs = rngs ,
114+ gated_attn = audio_gated_attn ,
115+ )
116+ else :
117+ self .feature_extractor = LTX2GemmaFeatureExtractor (
118+ input_dim = input_dim ,
119+ output_dim = caption_channels ,
120+ dtype = dtype ,
121+ rngs = rngs ,
122+ per_modality_projections = per_modality_projections ,
123+ use_bias = proj_bias ,
124+ video_output_dim = v_dim ,
125+ audio_output_dim = a_dim ,
126+ )
127+
128+ # Two independent connectors
129+ self .video_embeddings_connector = Embeddings1DConnector (
130+ input_dim = v_dim ,
131+ heads = video_connector_num_attention_heads ,
132+ head_dim = video_connector_attention_head_dim ,
133+ layers = video_connector_num_layers ,
134+ num_learnable_registers = video_connector_num_learnable_registers ,
135+ rope_type = rope_type ,
136+ theta = rope_theta ,
137+ base_seq_len = connector_rope_base_seq_len ,
138+ double_precision = rope_double_precision ,
139+ attention_kernel = attention_kernel ,
140+ mesh = mesh ,
141+ rngs = rngs ,
142+ gated_attn = video_gated_attn ,
143+ )
144+ self .audio_embeddings_connector = Embeddings1DConnector (
145+ input_dim = a_dim ,
146+ heads = audio_connector_num_attention_heads ,
147+ head_dim = audio_connector_attention_head_dim ,
148+ layers = audio_connector_num_layers ,
149+ num_learnable_registers = audio_connector_num_learnable_registers ,
150+ rope_type = rope_type ,
151+ theta = rope_theta ,
152+ base_seq_len = connector_rope_base_seq_len ,
153+ double_precision = rope_double_precision ,
154+ attention_kernel = attention_kernel ,
155+ mesh = mesh ,
156+ rngs = rngs ,
157+ gated_attn = audio_gated_attn ,
158+ )
101159
def __call__(
    self,
    hidden_states: Union[Tuple[Array, ...], List[Array]],
    attention_mask: Array,
) -> Tuple[Array, Array, Array]:
    """Encode Gemma hidden states into video and audio text embeddings.

    Args:
        hidden_states: Per-layer Gemma hidden states — either a tuple/list of
            arrays (stacked here along a new trailing axis) or a pre-stacked
            array. Assumed shape (batch, seq_len, dim[, num_layers]) — TODO
            confirm against the caller.
        attention_mask: Token mask of shape (batch, seq_len); positions with
            value > 0.5 are treated as valid.

    Returns:
        (video_embeds, audio_embeds, new_attention_mask) — the two modality
        embeddings from their respective connectors, plus the attention mask
        produced by the video connector.
    """
    with jax.named_scope("Text Encoder Forward"):
        if self.per_modality_projections:
            # 1. Stack per-layer hidden states into (b, l, d, k) when given
            # as a sequence; otherwise assume they are already stacked.
            if isinstance(hidden_states, (tuple, list)):
                x = jnp.stack(hidden_states, axis=-1)
            else:
                x = hidden_states

            b, l = x.shape[0], x.shape[1]

            # 2. RMS-normalize each token over the feature axis (axis=2),
            # independently per layer slice.
            variance = jnp.mean(x**2, axis=2, keepdims=True)
            norm_text_encoder_hidden_states = x * jax.lax.rsqrt(variance + 1e-6)

            # Flatten (d, k) into a single projection input axis.
            norm_text_encoder_hidden_states = norm_text_encoder_hidden_states.reshape(b, l, -1)

            # Zero out padded positions before the modality projections.
            bool_mask = (attention_mask > 0.5).astype(jnp.float32)[..., None]
            norm_text_encoder_hidden_states = norm_text_encoder_hidden_states * bool_mask

            # 3. Rescale so each modality branch sees activations whose norm
            # matches its connector width relative to the Gemma width.
            # BUGFIX: the original wrote
            #   getattr(self, "caption_channels", getattr(self.config, "caption_channels", 3840))
            # which evaluates `self.config` eagerly (Python evaluates the
            # default argument before the outer lookup), raising
            # AttributeError when `config` is absent even though
            # `caption_channels` exists. Resolve the fallback lazily instead.
            cap_channels = getattr(self, "caption_channels", None)
            if cap_channels is None:
                # getattr(None, ...) on a missing `config` safely yields 3840.
                cap_channels = getattr(getattr(self, "config", None), "caption_channels", 3840)

            video_scale_factor = jnp.sqrt(self.video_embeddings_connector.dim / cap_channels)
            video_norm_text_emb = norm_text_encoder_hidden_states * video_scale_factor
            audio_scale_factor = jnp.sqrt(self.audio_embeddings_connector.dim / cap_channels)
            audio_norm_text_emb = norm_text_encoder_hidden_states * audio_scale_factor

            # 4. Independent per-modality input projections.
            video_text_emb_proj = self.video_text_proj_in(video_norm_text_emb)
            audio_text_emb_proj = self.audio_text_proj_in(audio_norm_text_emb)

            # 5. Parallel connectors; only the video connector's mask is kept.
            video_embeds, new_attention_mask = self.video_embeddings_connector(video_text_emb_proj, attention_mask)
            audio_embeds, _ = self.audio_embeddings_connector(audio_text_emb_proj, attention_mask)
        else:
            # 1. Shared feature extraction across both modalities.
            features = self.feature_extractor(hidden_states, attention_mask)

            # 2. Parallel connection into the two modality branches.
            video_embeds, new_attention_mask = self.video_embeddings_connector(features, attention_mask)
            audio_embeds, _ = self.audio_embeddings_connector(features, attention_mask)

    return video_embeds, audio_embeds, new_attention_mask
0 commit comments