@@ -262,117 +262,3 @@ def block_scan_fn(carry, block_module):
262262
263263 return hidden_states , attention_mask
264264
265-
class LTX2TextConnectors(nnx.Module):
    """Bridges packed text-encoder embeddings into the video and audio streams.

    The text encoder emits embeddings whose last dimension packs
    ``caption_channels * text_proj_in_factor`` values per token. This module
    normalizes them, projects them to each modality's hidden size, and runs
    them through two independent :class:`Embeddings1DConnector` stacks — one
    for video, one for audio.

    NOTE(review): only the ``per_modality_projections=True`` (LTX-2.3) path is
    implemented in ``__call__``; the legacy path raises ``NotImplementedError``.
    """

    def __init__(
        self,
        caption_channels: int = 3840,
        text_proj_in_factor: int = 49,
        video_connector_num_attention_heads: int = 30,
        video_connector_attention_head_dim: int = 128,
        video_connector_num_layers: int = 2,
        video_connector_num_learnable_registers: int = 128,
        video_gated_attn: bool = False,
        audio_connector_num_attention_heads: int = 30,
        audio_connector_attention_head_dim: int = 128,
        audio_connector_num_layers: int = 2,
        audio_connector_num_learnable_registers: int = 128,
        audio_gated_attn: bool = False,
        connector_rope_base_seq_len: int = 4096,
        rope_theta: float = 10000.0,
        rope_double_precision: bool = True,
        rope_type: str = "interleaved",
        per_modality_projections: bool = False,
        video_hidden_dim: int = 4096,
        audio_hidden_dim: int = 2048,
        proj_bias: bool = False,
        attention_kernel: str = "flash",
        mesh: jax.sharding.Mesh = None,
        rngs: nnx.Rngs = None,
    ):
        # Width of the flattened per-token text embedding as produced upstream.
        text_encoder_dim = caption_channels * text_proj_in_factor

        self.per_modality_projections = per_modality_projections
        self.caption_channels = caption_channels
        self.video_hidden_dim = video_hidden_dim
        self.audio_hidden_dim = audio_hidden_dim

        # One input projection per modality (LTX-2.3), or a single shared
        # projection down to `caption_channels` otherwise.
        if per_modality_projections:
            self.video_text_proj_in = nnx.Linear(
                in_features=text_encoder_dim,
                out_features=video_hidden_dim,
                use_bias=proj_bias,
                rngs=rngs,
            )
            self.audio_text_proj_in = nnx.Linear(
                in_features=text_encoder_dim,
                out_features=audio_hidden_dim,
                use_bias=proj_bias,
                rngs=rngs,
            )
        else:
            self.text_proj_in = nnx.Linear(
                in_features=text_encoder_dim,
                out_features=caption_channels,
                use_bias=proj_bias,
                rngs=rngs,
            )

        # Video and audio connectors share every RoPE/kernel setting and
        # differ only in their per-modality head/layer/register hyperparams.
        self.video_connector = Embeddings1DConnector(
            input_dim=video_hidden_dim if per_modality_projections else caption_channels,
            heads=video_connector_num_attention_heads,
            head_dim=video_connector_attention_head_dim,
            layers=video_connector_num_layers,
            theta=rope_theta,
            num_learnable_registers=video_connector_num_learnable_registers,
            rope_type=rope_type,
            base_seq_len=connector_rope_base_seq_len,
            double_precision=rope_double_precision,
            attention_kernel=attention_kernel,
            mesh=mesh,
            rngs=rngs,
            gated_attn=video_gated_attn,
        )
        self.audio_connector = Embeddings1DConnector(
            input_dim=audio_hidden_dim if per_modality_projections else caption_channels,
            heads=audio_connector_num_attention_heads,
            head_dim=audio_connector_attention_head_dim,
            layers=audio_connector_num_layers,
            theta=rope_theta,
            num_learnable_registers=audio_connector_num_learnable_registers,
            rope_type=rope_type,
            base_seq_len=connector_rope_base_seq_len,
            double_precision=rope_double_precision,
            attention_kernel=attention_kernel,
            mesh=mesh,
            rngs=rngs,
            gated_attn=audio_gated_attn,
        )

    def __call__(self, text_encoder_hidden_states: Array, attention_mask: Array) -> Tuple[Array, Array, Array]:
        """Produce video and audio text conditioning embeddings.

        Args:
            text_encoder_hidden_states: either ``(b, l, caption_channels * factor)``
                or already unpacked as ``(b, l, caption_channels, factor)``.
            attention_mask: per-token mask; entries > 0.5 are treated as valid.

        Returns:
            ``(video_text_embedding, audio_text_embedding, video_attn_mask)``.

        Raises:
            NotImplementedError: when ``per_modality_projections`` is False.
        """
        # Unpack a flat last dimension into (caption_channels, factor) if needed.
        if text_encoder_hidden_states.ndim == 3:
            batch, seq_len, packed_dim = text_encoder_hidden_states.shape
            factor = packed_dim // self.caption_channels
            text_encoder_hidden_states = text_encoder_hidden_states.reshape(
                batch, seq_len, self.caption_channels, factor
            )
        else:
            batch, seq_len = text_encoder_hidden_states.shape[:2]

        if not self.per_modality_projections:
            raise NotImplementedError("LTX-2.0 path in LTX2TextConnectors not fully implemented yet.")

        # LTX-2.3: per-token RMS norm over the channel axis.
        mean_sq = jnp.mean(text_encoder_hidden_states**2, axis=2, keepdims=True)
        normed = text_encoder_hidden_states * jax.lax.rsqrt(mean_sq + 1e-6)
        normed = normed.reshape(batch, seq_len, -1)

        # Zero out padded tokens before projecting.
        valid = (attention_mask > 0.5).astype(jnp.float32)[..., None]
        normed = normed * valid

        # Rescale so the norm matches each modality's hidden width, then project.
        video_emb_proj = self.video_text_proj_in(
            normed * jnp.sqrt(self.video_hidden_dim / self.caption_channels)
        )
        audio_emb_proj = self.audio_text_proj_in(
            normed * jnp.sqrt(self.audio_hidden_dim / self.caption_channels)
        )

        video_text_embedding, video_attn_mask = self.video_connector(video_emb_proj, attention_mask)
        audio_text_embedding, _ = self.audio_connector(audio_emb_proj, attention_mask)

        return video_text_embedding, audio_text_embedding, video_attn_mask
0 commit comments