Add use_cross_timestep option to the LTX2 transformer and pipeline

prishajain1 · prishajain1 · commit e0ef7ed83d21 · 2026-04-10T20:13:22.000+05:30
diff --git a/src/maxdiffusion/models/ltx2/ltx2_3_utils.py b/src/maxdiffusion/models/ltx2/ltx2_3_utils.py
@@ -369,8 +369,13 @@ def load_vocoder_weights_2_3(
   flax_state_dict = {}
   cpu = jax.local_devices(backend="cpu")[0]
 
+  from flax.traverse_util import flatten_dict
+  flat_eval = flatten_dict(eval_shapes)
+  print("Expected vocoder keys:", [k for k in flat_eval.keys() if "mel_stft" in str(k)])
+
   for pt_key, tensor in tensors.items():
     # Keys are already filtered and stripped of "vocoder." by load_and_segregate
+    print("Processing pt_key:", pt_key)
     key = rename_for_ltx2_3_vocoder(pt_key)
     
     # Always apply LTX-2.3 specific replacement
diff --git a/src/maxdiffusion/models/ltx2/transformer_ltx2.py b/src/maxdiffusion/models/ltx2/transformer_ltx2.py
@@ -1027,6 +1027,7 @@ def __call__(
       video_coords: Optional[jax.Array] = None,
       audio_coords: Optional[jax.Array] = None,
       attention_kwargs: Optional[Dict[str, Any]] = None,
+      use_cross_timestep: bool = False,
       return_dict: bool = True,
       perturbation_mask: Optional[jax.Array] = None,
   ) -> Any:
@@ -1096,12 +1097,20 @@ def __call__(
         temb_prompt = None
         temb_prompt_audio = None
 
+      if use_cross_timestep:
+        assert sigma is not None and audio_sigma is not None, "sigma and audio_sigma must be provided when use_cross_timestep is True"
+        video_ca_timestep = audio_sigma.flatten()
+        audio_ca_timestep = sigma.flatten()
+      else:
+        video_ca_timestep = timestep.flatten()
+        audio_ca_timestep = audio_timestep.flatten() if audio_timestep is not None else timestep.flatten()
+
       video_cross_attn_scale_shift, _ = self.av_cross_attn_video_scale_shift(
-          timestep.flatten(),
+          video_ca_timestep,
           hidden_dtype=hidden_states.dtype,
       )
       video_cross_attn_a2v_gate, _ = self.av_cross_attn_video_a2v_gate(
-          timestep.flatten() * timestep_cross_attn_gate_scale_factor,
+          video_ca_timestep * timestep_cross_attn_gate_scale_factor,
           hidden_dtype=hidden_states.dtype,
       )
       video_cross_attn_scale_shift = video_cross_attn_scale_shift.reshape(
@@ -1110,11 +1119,11 @@ def __call__(
       video_cross_attn_a2v_gate = video_cross_attn_a2v_gate.reshape(batch_size, -1, video_cross_attn_a2v_gate.shape[-1])
 
       audio_cross_attn_scale_shift, _ = self.av_cross_attn_audio_scale_shift(
-          audio_timestep.flatten(),
+          audio_ca_timestep,
           hidden_dtype=audio_hidden_states.dtype,
       )
       audio_cross_attn_v2a_gate, _ = self.av_cross_attn_audio_v2a_gate(
-          audio_timestep.flatten() * timestep_cross_attn_gate_scale_factor,
+          audio_ca_timestep * timestep_cross_attn_gate_scale_factor,
           hidden_dtype=audio_hidden_states.dtype,
       )
       audio_cross_attn_scale_shift = audio_cross_attn_scale_shift.reshape(
diff --git a/src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py b/src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py
@@ -125,7 +125,7 @@ def create_model(rngs: nnx.Rngs, ltx2_config: dict):
         "audio_attention_head_dim": 64,
         "audio_cross_attention_dim": 2048,
         "num_layers": 48,
-        "caption_channels": 4096,
+        "caption_channels": 3840,
         "audio_caption_channels": 2048,
         "use_prompt_embeddings": False,
     }
@@ -365,7 +365,7 @@ def create_model(rngs: nnx.Rngs, config: HyperParameters):
             {
                 "video_connector_num_layers": 8,
                 "audio_connector_num_layers": 8,
-                "caption_channels": 2048,
+                "caption_channels": 3840,
                 "video_caption_channels": 4096,
                 "audio_caption_channels": 2048,
                 "video_connector_num_attention_heads": 32,
@@ -1264,6 +1264,11 @@ def __call__(
       timesteps: List[int] = None,
       guidance_scale: float = 3.0,
       guidance_rescale: float = 0.0,
+      stg_scale: float = 0.0,
+      modality_scale: float = 1.0,
+      audio_guidance_scale: Optional[float] = None,
+      audio_stg_scale: Optional[float] = None,
+      audio_modality_scale: Optional[float] = None,
       noise_scale: float = 1.0,
       num_videos_per_prompt: Optional[int] = 1,
       generator: Optional[jax.Array] = None,
@@ -1279,6 +1284,7 @@ def __call__(
       dtype: Optional[jnp.dtype] = None,
       output_type: str = "pil",
       return_dict: bool = True,
+      use_cross_timestep: bool = False,
   ):
     # 1. Check inputs
     self.check_inputs(
@@ -1499,23 +1505,24 @@ def run_connectors(graphdef, state, hidden_states, attention_mask):
             audio_num_frames,
             frame_rate,
             perturbation_mask=perturbation_mask,
+            use_cross_timestep=use_cross_timestep,
         )
 
-        do_stg = getattr(self.config, "stg_scale", 0.0) > 0.0
+        do_stg = stg_scale > 0.0
 
         if guidance_scale > 1.0 and do_stg:
           noise_pred_uncond, noise_pred_text, noise_pred_perturb = jnp.split(noise_pred, 3, axis=0)
           noise_pred = (
               noise_pred_uncond
               + guidance_scale * (noise_pred_text - noise_pred_uncond)
-              + self.config.stg_scale * (noise_pred_text - noise_pred_perturb)
+              + stg_scale * (noise_pred_text - noise_pred_perturb)
           )
           # Audio guidance
           noise_pred_audio_uncond, noise_pred_audio_text, noise_pred_audio_perturb = jnp.split(noise_pred_audio, 3, axis=0)
           noise_pred_audio = (
               noise_pred_audio_uncond
               + guidance_scale * (noise_pred_audio_text - noise_pred_audio_uncond)
-              + self.config.stg_scale * (noise_pred_audio_text - noise_pred_audio_perturb)
+              + stg_scale * (noise_pred_audio_text - noise_pred_audio_perturb)
           )
         elif guidance_scale > 1.0:
           noise_pred_uncond, noise_pred_text = jnp.split(noise_pred, 2, axis=0)
@@ -1525,10 +1532,10 @@ def run_connectors(graphdef, state, hidden_states, attention_mask):
           noise_pred_audio = noise_pred_audio_uncond + guidance_scale * (noise_pred_audio_text - noise_pred_audio_uncond)
         elif do_stg:
           noise_pred_text, noise_pred_perturb = jnp.split(noise_pred, 2, axis=0)
-          noise_pred = noise_pred_text + self.config.stg_scale * (noise_pred_text - noise_pred_perturb)
+          noise_pred = noise_pred_text + stg_scale * (noise_pred_text - noise_pred_perturb)
           
           noise_pred_audio_text, noise_pred_audio_perturb = jnp.split(noise_pred_audio, 2, axis=0)
-          noise_pred_audio = noise_pred_audio_text + self.config.stg_scale * (noise_pred_audio_text - noise_pred_audio_perturb)
+          noise_pred_audio = noise_pred_audio_text + stg_scale * (noise_pred_audio_text - noise_pred_audio_perturb)
 
         # Extract latents_step based on stacking strategy
         if do_cfg and do_stg:
@@ -1693,6 +1700,8 @@ def transformer_forward_pass(
     fps,
     perturbation_mask=None,
     sigma=None,
+    audio_sigma=None,
+    use_cross_timestep=False,
 ):
   transformer = nnx.merge(graphdef, state)
 
@@ -1704,11 +1713,17 @@ def transformer_forward_pass(
   else:
     sigma = jnp.expand_dims(sigma, 0).repeat(latents.shape[0])
 
+  if audio_sigma is None:
+    audio_sigma = timestep
+  else:
+    audio_sigma = jnp.expand_dims(audio_sigma, 0).repeat(latents.shape[0])
+
   noise_pred, noise_pred_audio = transformer(
       hidden_states=latents,
       encoder_hidden_states=encoder_hidden_states,
       timestep=timestep,
       sigma=sigma,
+      audio_sigma=audio_sigma,
       encoder_attention_mask=encoder_attention_mask,
       num_frames=latent_num_frames,
       height=latent_height,
@@ -1720,6 +1735,7 @@ def transformer_forward_pass(
       audio_num_frames=audio_num_frames,
       return_dict=False,
       perturbation_mask=perturbation_mask,
+      use_cross_timestep=use_cross_timestep,
   )
 
   return noise_pred, noise_pred_audio