Skip to content

Commit b5539a4

Browse files
committed
debug
1 parent 46cae70 commit b5539a4

4 files changed

Lines changed: 31 additions & 0 deletions

File tree

src/maxdiffusion/models/ltx2/autoencoder_kl_ltx2.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -901,6 +901,7 @@ def __call__(
901901
causal: bool = True,
902902
deterministic: bool = True,
903903
) -> jax.Array:
904+
print(f"[LTX2 XPROF Tracing] Encoder __call__ input shape: {sample.shape}")
904905
# JAX: (B, T, H, W, C)
905906
B, T, H, W, C = sample.shape
906907
p = self.patch_size
@@ -1074,6 +1075,7 @@ def __call__(
10741075
causal: bool = False,
10751076
deterministic: bool = True,
10761077
) -> jax.Array:
1078+
print(f"[LTX2 XPROF Tracing] Decoder __call__ input shape: {sample.shape}")
10771079
if self.timestep_scale_multiplier is not None and temb is not None:
10781080
temb = temb * self.timestep_scale_multiplier.value
10791081

@@ -1556,6 +1558,7 @@ def encode(
15561558
key: Optional[jax.Array] = None,
15571559
causal: Optional[bool] = None,
15581560
) -> Union[FlaxAutoencoderKLOutput, Tuple[jax.Array]]:
1561+
print(f"[LTX2 XPROF Tracing] VAE encode input shape: {sample.shape}")
15591562
causal = self.encoder_causal if causal is None else causal
15601563

15611564
if self.use_slicing and sample.shape[0] > 1:
@@ -1584,6 +1587,7 @@ def decode(
15841587
generator: Optional[jax.Array] = None,
15851588
causal: Optional[bool] = None,
15861589
) -> Union[FlaxDecoderOutput, Tuple[jax.Array]]:
1590+
print(f"[LTX2 XPROF Tracing] VAE decode input shape: {latents.shape}")
15871591
causal = self.decoder_causal if causal is None else causal
15881592
key = generator
15891593

src/maxdiffusion/models/ltx2/autoencoder_kl_ltx2_audio.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -562,6 +562,7 @@ def __init__(
562562
self.conv_out = nnx.Conv(block_in, z_channels, kernel_size=(3, 3), padding="SAME", dtype=dtype, rngs=rngs)
563563

564564
def __call__(self, x, train: bool = False):
565+
print(f"[LTX2 XPROF Tracing] Audio Encoder __call__ input shape: {x.shape}")
565566
h = self.conv_in(x)
566567

567568
for stage in self.down_stages:
@@ -702,6 +703,7 @@ def __init__(
702703
self.conv_out = nnx.Conv(block_in, self.output_channels, kernel_size=(3, 3), padding="SAME", dtype=dtype, rngs=rngs)
703704

704705
def __call__(self, z, target_frames=None, target_mel_bins=None, train: bool = False):
706+
print(f"[LTX2 XPROF Tracing] Audio Decoder __call__ input shape: {z.shape}")
705707
h = self.conv_in(z)
706708

707709
h = self.mid_block1(h, train=train)
@@ -825,6 +827,7 @@ def __init__(
825827
self.latents_std = nnx.Param(jnp.ones((base_channels,), dtype=dtype))
826828

827829
def encode(self, x: jnp.ndarray, return_dict: bool = True, train: bool = False):
830+
print(f"[LTX2 XPROF Tracing] Audio VAE encode input shape: {x.shape}")
828831
h = self.encoder(x, train=train)
829832
posterior = FlaxDiagonalGaussianDistribution(h)
830833

@@ -833,6 +836,7 @@ def encode(self, x: jnp.ndarray, return_dict: bool = True, train: bool = False):
833836
return FlaxAutoencoderKLOutput(latent_dist=posterior)
834837

835838
def decode(self, z: jnp.ndarray, return_dict: bool = True, train: bool = False):
839+
print(f"[LTX2 XPROF Tracing] Audio VAE decode input shape: {z.shape}")
836840
batch, time, freq, channels = z.shape
837841
target_frames = time * self.latent_downsample_factor
838842
if self.causality_axis is not None and self.causality_axis != "none":

src/maxdiffusion/models/ltx2/transformer_ltx2.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,9 @@ def __call__(
354354
a2v_cross_attention_mask: Optional[jax.Array] = None,
355355
v2a_cross_attention_mask: Optional[jax.Array] = None,
356356
) -> Tuple[jax.Array, jax.Array]:
357+
print(f"[LTX2 XPROF Tracing] Block __call__ inputs:")
358+
print(f"[LTX2 XPROF Tracing] hidden_states shape: {hidden_states.shape}")
359+
print(f"[LTX2 XPROF Tracing] audio_hidden_states shape: {audio_hidden_states.shape}")
357360
batch_size = hidden_states.shape[0]
358361

359362
axis_names = nn.logical_to_mesh_axes(("activation_batch", "activation_length", "activation_embed"))
@@ -633,6 +636,10 @@ def __init__(
633636
inner_dim = self.num_attention_heads * self.attention_head_dim
634637
audio_inner_dim = self.audio_num_attention_heads * self.audio_attention_head_dim
635638

639+
print(f"[LTX2 XPROF Config] num_layers: {self.num_layers}")
640+
print(f"[LTX2 XPROF Config] Video: inner_dim={inner_dim}, num_heads={self.num_attention_heads}, head_dim={self.attention_head_dim}")
641+
print(f"[LTX2 XPROF Config] Audio: audio_inner_dim={audio_inner_dim}, num_heads={self.audio_num_attention_heads}, head_dim={self.audio_attention_head_dim}")
642+
636643
# 1. Patchification input projections
637644
self.proj_in = nnx.Linear(
638645
self.in_channels,
@@ -924,6 +931,11 @@ def __call__(
924931

925932
batch_size = hidden_states.shape[0]
926933

934+
print(f"[LTX2 XPROF Tracing] Model __call__ inputs:")
935+
print(f"[LTX2 XPROF Tracing] hidden_states shape: {hidden_states.shape}")
936+
print(f"[LTX2 XPROF Tracing] audio_hidden_states shape: {audio_hidden_states.shape}")
937+
print(f"[LTX2 XPROF Tracing] encoder_hidden_states shape: {encoder_hidden_states.shape}")
938+
927939
# 1. Prepare RoPE positional embeddings
928940
with jax.named_scope("RoPE Preparation"):
929941
if video_coords is None:

src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1209,6 +1209,9 @@ def __call__(
12091209
latent_width = width // self.vae_spatial_compression_ratio
12101210
latent_num_frames = (num_frames - 1) // self.vae_temporal_compression_ratio + 1
12111211

1212+
max_logging.log(f"[LTX2 XPROF] Input dimensions: height={height}, width={width}, num_frames={num_frames}")
1213+
max_logging.log(f"[LTX2 XPROF] Video Latent dimensions: height={latent_height}, width={latent_width}, num_frames={latent_num_frames}")
1214+
12121215
# 4. Prepare Audio Latents
12131216
audio_channels = (
12141217
self.audio_vae.config.latent_channels
@@ -1222,6 +1225,8 @@ def __call__(
12221225
)
12231226
audio_num_frames = round(duration_s * audio_latents_per_second)
12241227

1228+
max_logging.log(f"[LTX2 XPROF] Audio Latent dimensions: channels={audio_channels}, num_frames={audio_num_frames}")
1229+
12251230
audio_latents = self.prepare_audio_latents(
12261231
batch_size=batch_size,
12271232
num_channels_latents=audio_channels,
@@ -1238,6 +1243,8 @@ def __call__(
12381243
video_sequence_length = (num_frames - 1) // self.vae_temporal_compression_ratio + 1
12391244
video_sequence_length *= (height // self.vae_spatial_compression_ratio) * (width // self.vae_spatial_compression_ratio)
12401245

1246+
max_logging.log(f"[LTX2 XPROF] Video Sequence Length: {video_sequence_length}")
1247+
12411248
mu = calculate_shift(
12421249
video_sequence_length,
12431250
self.scheduler.config.get("base_image_seq_len", 1024),
@@ -1521,6 +1528,10 @@ def transformer_forward_pass(
15211528
audio_num_frames,
15221529
fps,
15231530
):
1531+
print(f"[LTX2 XPROF Tracing] latents shape: {latents.shape}")
1532+
print(f"[LTX2 XPROF Tracing] audio_latents shape: {audio_latents.shape}")
1533+
print(f"[LTX2 XPROF Tracing] encoder_hidden_states shape: {encoder_hidden_states.shape}")
1534+
15241535
transformer = nnx.merge(graphdef, state)
15251536

15261537
# Expand timestep to batch size

0 commit comments

Comments (0)