Skip to content

Commit fcf64e3

Browse files
committed
Add LTX2.3 Model support
1 parent c5bb862 commit fcf64e3

9 files changed

Lines changed: 853 additions & 13 deletions

File tree

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
#hardware
2+
hardware: 'tpu'
3+
skip_jax_distributed_system: False
4+
attention: 'flash'
5+
a2v_attention_kernel: 'flash'
6+
v2a_attention_kernel: 'dot_product'
7+
attention_sharding_uniform: True
8+
precision: 'bf16'
9+
scan_layers: True
10+
names_which_can_be_saved: []
11+
names_which_can_be_offloaded: []
12+
remat_policy: "NONE"
13+
14+
jax_cache_dir: ''
15+
weights_dtype: 'bfloat16'
16+
activations_dtype: 'bfloat16'
17+
18+
run_name: 'ltx2_inference'
19+
output_dir: ''
20+
config_path: ''
21+
save_config_to_gcs: False
22+
23+
#Checkpoints
24+
max_sequence_length: 1024
25+
sampler: "from_checkpoint"
26+
27+
# Generation parameters
28+
global_batch_size_to_train_on: 1
29+
num_inference_steps: 40
30+
guidance_scale: 3.0
31+
fps: 24
32+
pipeline_type: multi-scale
33+
prompt: "A man in a brightly lit room talks on a vintage telephone. In a low, heavy voice, he says, 'I understand. I won't call again. Goodbye.' He hangs up the receiver and looks down with a sad expression. He holds the black rotary phone to his right ear with his right hand, his left hand holding a rocks glass with amber liquid. He wears a brown suit jacket over a white shirt, and a gold ring on his left ring finger. His short hair is neatly combed, and he has light skin with visible wrinkles around his eyes. The camera remains stationary, focused on his face and upper body. The room is brightly lit by a warm light source off-screen to the left, casting shadows on the wall behind him. The scene appears to be from a dramatic movie."
34+
negative_prompt: "shaky, glitchy, low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly, transition, static."
35+
height: 512
36+
width: 768
37+
decode_timestep: 0.05
38+
decode_noise_scale: 0.025
39+
num_frames: 121
40+
quantization: "int8"
41+
seed: 10
42+
#parallelism
43+
mesh_axes: ['data', 'fsdp', 'context', 'tensor']
44+
logical_axis_rules: [
45+
['batch', ['data', 'fsdp']],
46+
['activation_batch', ['data', 'fsdp']],
47+
['activation_self_attn_heads', ['context', 'tensor']],
48+
['activation_cross_attn_q_length', ['context', 'tensor']],
49+
['activation_length', 'context'],
50+
['activation_heads', 'tensor'],
51+
['mlp','tensor'],
52+
['embed', ['context', 'fsdp']],
53+
['heads', 'tensor'],
54+
['norm', 'tensor'],
55+
['conv_batch', ['data', 'context', 'fsdp']],
56+
['out_channels', 'tensor'],
57+
['conv_out', 'context'],
58+
]
59+
data_sharding: ['data', 'fsdp', 'context', 'tensor']
60+
61+
dcn_data_parallelism: 1 # recommended DCN axis to be auto-sharded
62+
dcn_fsdp_parallelism: -1
63+
64+
flash_block_sizes: {
65+
block_q: 2048,
66+
block_kv: 2048,
67+
block_kv_compute: 1024,
68+
block_q_dkv: 2048,
69+
block_kv_dkv: 2048,
70+
block_kv_dkv_compute: 2048,
71+
use_fused_bwd_kernel: True,
72+
}
73+
flash_min_seq_length: 4096
74+
dcn_context_parallelism: 1
75+
dcn_tensor_parallelism: 1
76+
ici_data_parallelism: 1
77+
ici_fsdp_parallelism: 1
78+
ici_context_parallelism: -1 # recommended ICI axis to be auto-sharded
79+
ici_tensor_parallelism: 1
80+
enable_profiler: False
81+
82+
replicate_vae: False
83+
84+
allow_split_physical_axes: False
85+
learning_rate_schedule_steps: -1
86+
max_train_steps: 500
87+
pretrained_model_name_or_path: 'Lightricks/LTX-2.3'
88+
model_name: "ltx2.3"
89+
model_type: "T2V"
90+
unet_checkpoint: ''
91+
checkpoint_dir: ""
92+
dataset_name: ''
93+
train_split: 'train'
94+
dataset_type: 'tfrecord'
95+
cache_latents_text_encoder_outputs: True
96+
per_device_batch_size: 1.0
97+
compile_topology_num_slices: -1
98+
quantization_local_shard_count: -1
99+
use_qwix_quantization: False
100+
weight_quantization_calibration_method: "absmax"
101+
act_quantization_calibration_method: "absmax"
102+
bwd_quantization_calibration_method: "absmax"
103+
qwix_module_path: ".*"
104+
jit_initializers: True
105+
enable_single_replica_ckpt_restoring: False
106+
seed: 0  # NOTE(review): duplicate key — `seed: 10` is also defined earlier in this config; most YAML parsers silently keep the last occurrence (0), making the earlier value dead. Remove one of the two.
107+
audio_format: "s16"

src/maxdiffusion/models/ltx2/attention_ltx2.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,7 @@ def __init__(
349349
rope_type: str = "interleaved",
350350
flash_block_sizes: BlockSizes = None,
351351
flash_min_seq_length: int = 4096,
352+
gated_attn: bool = False,
352353
):
353354
self.heads = heads
354355
self.rope_type = rope_type
@@ -426,6 +427,19 @@ def __init__(
426427
else:
427428
self.dropout_layer = None
428429

430+
if gated_attn:
431+
self.to_gate_logits = nnx.Linear(
432+
query_dim,
433+
heads,
434+
use_bias=True,
435+
kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("embed", "heads")),
436+
bias_init=nnx.with_partitioning(nnx.initializers.zeros_init(), ("heads",)),
437+
rngs=rngs,
438+
dtype=dtype,
439+
)
440+
else:
441+
self.to_gate_logits = None
442+
429443
self.attention_op = NNXAttentionOp(
430444
mesh=mesh,
431445
attention_kernel=attention_kernel,
@@ -489,6 +503,14 @@ def __call__(
489503
# NNXAttentionOp expects flattened input [B, S, InnerDim] for flash kernel
490504
attn_output = self.attention_op.apply_attention(query=query, key=key, value=value, attention_mask=attention_mask)
491505

506+
if getattr(self, "to_gate_logits", None) is not None:
507+
gate_logits = self.to_gate_logits(hidden_states)
508+
b, s, _ = attn_output.shape
509+
attn_output = attn_output.reshape(b, s, self.heads, self.dim_head)
510+
gates = 2.0 * jax.nn.sigmoid(gate_logits)
511+
attn_output = attn_output * jnp.expand_dims(gates, axis=-1)
512+
attn_output = attn_output.reshape(b, s, -1)
513+
492514
# 7. Output Projection
493515
hidden_states = self.to_out(attn_output)
494516

src/maxdiffusion/models/ltx2/ltx2_utils.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,16 @@ def rename_for_ltx2_transformer(key):
5757
if "to_out_0" in key:
5858
key = key.replace("to_out_0", "to_out")
5959

60+
# LTX-2.3 specific mappings
61+
if "prompt_adaln" in key:
62+
key = key.replace("prompt_adaln", "caption_projection")
63+
if "audio_prompt_adaln" in key:
64+
key = key.replace("audio_prompt_adaln", "audio_caption_projection")
65+
if "video_text_proj_in" in key:
66+
key = key.replace("video_text_proj_in", "feature_extractor.video_linear")
67+
if "audio_text_proj_in" in key:
68+
key = key.replace("audio_text_proj_in", "feature_extractor.audio_linear")
69+
6070
return key
6171

6272

@@ -269,6 +279,11 @@ def rename_for_ltx2_vocoder(key):
269279
key = key.replace("ups.", "upsamplers.")
270280
key = key.replace("resblocks", "resnets")
271281
key = key.replace("conv_post", "conv_out")
282+
283+
# LTX-2.3 specific mappings for Vocoder
284+
if "downsample" in key and "lowpass" not in key:
285+
key = key.replace("downsample", "downsample.lowpass")
286+
272287
return key
273288

274289

src/maxdiffusion/models/ltx2/text_encoders/embeddings_connector_ltx2.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def __init__(
3737
attention_kernel: str = "flash",
3838
mesh: jax.sharding.Mesh = None,
3939
rngs: nnx.Rngs = None,
40+
gated_attn: bool = False,
4041
):
4142
self.attn1 = LTX2Attention(
4243
query_dim=dim,
@@ -48,6 +49,7 @@ def __init__(
4849
attention_kernel=attention_kernel,
4950
mesh=mesh,
5051
rngs=rngs,
52+
gated_attn=gated_attn,
5153
)
5254
self.ff = NNXSimpleFeedForward(rngs=rngs, dim=dim, dim_out=dim, activation_fn="gelu_tanh")
5355
self.norm1 = nnx.RMSNorm(dim, epsilon=1e-6, dtype=jnp.float32, param_dtype=jnp.float32, use_scale=False, rngs=rngs)
@@ -92,6 +94,7 @@ def __init__(
9294
attention_kernel: str = "flash",
9395
mesh: jax.sharding.Mesh = None,
9496
rngs: nnx.Rngs = None,
97+
gated_attn: bool = False,
9598
):
9699
self.dim = input_dim
97100
self.heads = heads
@@ -117,6 +120,7 @@ def create_block(rngs):
117120
attention_kernel=attention_kernel,
118121
mesh=mesh,
119122
rngs=rngs,
123+
gated_attn=gated_attn,
120124
)
121125

122126
# Call the vmapped constructor

src/maxdiffusion/models/ltx2/text_encoders/feature_extractor_ltx2.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,14 +102,21 @@ def __init__(
102102
output_dim: int,
103103
dtype: DType = jnp.float32,
104104
rngs: nnx.Rngs = None,
105+
per_modality_projections: bool = False,
106+
use_bias: bool = False,
105107
):
106108
"""
107109
Args:
108110
input_dim: Dimension of flattened hidden states (Gemma dim * Num layers).
109111
output_dim: Target dimension for diffusion conditioning.
110112
"""
111-
# LTX-2 uses bias=False for the projection
112-
self.linear = nnx.Linear(input_dim, output_dim, use_bias=False, dtype=dtype, rngs=rngs)
113+
self.per_modality_projections = per_modality_projections
114+
115+
if per_modality_projections:
116+
self.video_linear = nnx.Linear(input_dim, output_dim, use_bias=use_bias, dtype=dtype, rngs=rngs)
117+
self.audio_linear = nnx.Linear(input_dim, output_dim, use_bias=use_bias, dtype=dtype, rngs=rngs)
118+
else:
119+
self.linear = nnx.Linear(input_dim, output_dim, use_bias=use_bias, dtype=dtype, rngs=rngs)
113120

114121
def __call__(self, hidden_states: Union[Tuple[Array, ...], Array], attention_mask: Array) -> Array:
115122
"""
@@ -133,4 +140,7 @@ def __call__(self, hidden_states: Union[Tuple[Array, ...], Array], attention_mas
133140
x_norm = _norm_and_concat_padded_batch(x, attention_mask)
134141

135142
# 4. Projection
136-
return self.linear(x_norm)
143+
if self.per_modality_projections:
144+
return self.video_linear(x_norm), self.audio_linear(x_norm)
145+
else:
146+
return self.linear(x_norm)

src/maxdiffusion/models/ltx2/text_encoders/text_encoders_ltx2.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,15 +57,23 @@ def __init__(
5757
attention_kernel: str = "flash",
5858
mesh: jax.sharding.Mesh = None,
5959
rngs: nnx.Rngs = None,
60+
per_modality_projections: bool = False,
61+
proj_bias: bool = False,
62+
video_gated_attn: bool = False,
63+
audio_gated_attn: bool = False,
6064
**kwargs,
6165
):
6266
input_dim = caption_channels * text_proj_in_factor
6367

68+
self.per_modality_projections = per_modality_projections
69+
6470
self.feature_extractor = LTX2GemmaFeatureExtractor(
6571
input_dim=input_dim,
6672
output_dim=caption_channels,
6773
dtype=dtype,
6874
rngs=rngs,
75+
per_modality_projections=per_modality_projections,
76+
use_bias=proj_bias,
6977
)
7078

7179
# Two independent connectors
@@ -82,6 +90,7 @@ def __init__(
8290
attention_kernel=attention_kernel,
8391
mesh=mesh,
8492
rngs=rngs,
93+
gated_attn=video_gated_attn,
8594
)
8695

8796
self.audio_embeddings_connector = Embeddings1DConnector(
@@ -97,6 +106,7 @@ def __init__(
97106
attention_kernel=attention_kernel,
98107
mesh=mesh,
99108
rngs=rngs,
109+
gated_attn=audio_gated_attn,
100110
)
101111

102112
def __call__(
@@ -113,7 +123,12 @@ def __call__(
113123
features = self.feature_extractor(hidden_states, attention_mask)
114124

115125
# 2. Parallel Connection
116-
video_embeds, new_attention_mask = self.video_embeddings_connector(features, attention_mask)
117-
audio_embeds, _ = self.audio_embeddings_connector(features, attention_mask)
126+
if self.per_modality_projections:
127+
video_features, audio_features = features
128+
video_embeds, new_attention_mask = self.video_embeddings_connector(video_features, attention_mask)
129+
audio_embeds, _ = self.audio_embeddings_connector(audio_features, attention_mask)
130+
else:
131+
video_embeds, new_attention_mask = self.video_embeddings_connector(features, attention_mask)
132+
audio_embeds, _ = self.audio_embeddings_connector(features, attention_mask)
118133

119134
return video_embeds, audio_embeds, new_attention_mask

0 commit comments

Comments
 (0)