Skip to content

Commit 943c1de

Browse files
committed
debug
1 parent b5539a4 commit 943c1de

3 files changed

Lines changed: 6 additions & 2 deletions

File tree

src/maxdiffusion/models/ltx2/autoencoder_kl_ltx2.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1075,7 +1075,7 @@ def __call__(
10751075
causal: bool = False,
10761076
deterministic: bool = True,
10771077
) -> jax.Array:
1078-
print(f"[LTX2 XPROF Tracing] Decoder __call__ input shape: {sample.shape}")
1078+
print(f"[LTX2 XPROF Tracing] Video Decoder __call__ input shape: {sample.shape}")
10791079
if self.timestep_scale_multiplier is not None and temb is not None:
10801080
temb = temb * self.timestep_scale_multiplier.value
10811081

@@ -1587,7 +1587,7 @@ def decode(
15871587
generator: Optional[jax.Array] = None,
15881588
causal: Optional[bool] = None,
15891589
) -> Union[FlaxDecoderOutput, Tuple[jax.Array]]:
1590-
print(f"[LTX2 XPROF Tracing] VAE decode input shape: {latents.shape}")
1590+
print(f"[LTX2 XPROF Tracing] Video VAE decode input shape: {latents.shape}")
15911591
causal = self.decoder_causal if causal is None else causal
15921592
key = generator
15931593

src/maxdiffusion/models/ltx2/transformer_ltx2.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -935,6 +935,7 @@ def __call__(
935935
print(f"[LTX2 XPROF Tracing] hidden_states shape: {hidden_states.shape}")
936936
print(f"[LTX2 XPROF Tracing] audio_hidden_states shape: {audio_hidden_states.shape}")
937937
print(f"[LTX2 XPROF Tracing] encoder_hidden_states shape: {encoder_hidden_states.shape}")
938+
print(f"[LTX2 XPROF Tracing] audio_encoder_hidden_states shape: {audio_encoder_hidden_states.shape}")
938939

939940
# 1. Prepare RoPE positional embeddings
940941
with jax.named_scope("RoPE Preparation"):

src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1482,6 +1482,7 @@ def __call__(
14821482
# Post-process video (converts to numpy/PIL)
14831483
# VAE outputs (B, T, H, W, C), but video processor expects (B, C, T, H, W)
14841484
video_np = np.array(video).transpose(0, 4, 1, 2, 3)
1485+
max_logging.log(f"[LTX2 XPROF] Produced video shape (B, C, T, H, W): {video_np.shape}")
14851486
video = self.video_processor.postprocess_video(torch.from_numpy(video_np), output_type=output_type)
14861487

14871488
# Decode Audio
@@ -1494,6 +1495,7 @@ def __call__(
14941495

14951496
# Convert audio to numpy
14961497
audio = np.array(audio)
1498+
max_logging.log(f"[LTX2 XPROF] Produced audio shape: {audio.shape}")
14971499

14981500
return LTX2PipelineOutput(frames=video, audio=audio)
14991501

@@ -1531,6 +1533,7 @@ def transformer_forward_pass(
15311533
print(f"[LTX2 XPROF Tracing] latents shape: {latents.shape}")
15321534
print(f"[LTX2 XPROF Tracing] audio_latents shape: {audio_latents.shape}")
15331535
print(f"[LTX2 XPROF Tracing] encoder_hidden_states shape: {encoder_hidden_states.shape}")
1536+
print(f"[LTX2 XPROF Tracing] audio_encoder_hidden_states shape: {audio_encoder_hidden_states.shape}")
15341537

15351538
transformer = nnx.merge(graphdef, state)
15361539

0 commit comments

Comments (0)