Disable VAE tiled decode and add debug print statements

prishajain1 · prishajain1 · commit 83972c8b585b · 2026-04-10T23:46:22.000+05:30
diff --git a/src/maxdiffusion/models/ltx2/autoencoder_kl_ltx2.py b/src/maxdiffusion/models/ltx2/autoencoder_kl_ltx2.py
@@ -1379,9 +1379,12 @@ def tiled_decode(
     B, T, H, W, C = z.shape
     sample_height = H * self.spatial_compression_ratio
     sample_width = W * self.spatial_compression_ratio
+    print(f"DEBUG: VAE tiled_decode called with hidden shape H={H}, W={W}")
+    print(f"DEBUG: target sample_height={sample_height}, sample_width={sample_width}")
 
     tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
     tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
+    print(f"DEBUG: tile_latent_min_height={tile_latent_min_height}, tile_latent_min_width={tile_latent_min_width}")
     tile_latent_stride_height = self.tile_sample_stride_height // self.spatial_compression_ratio
     tile_latent_stride_width = self.tile_sample_stride_width // self.spatial_compression_ratio
 
diff --git a/src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py b/src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py
@@ -469,6 +469,7 @@ def create_model(rngs: nnx.Rngs, config: HyperParameters):
             mesh=mesh,
             **vae_kwargs,
         )
+      vae.tile_sample_min_width = 1024
       return vae
 
     p_model_factory = partial(create_model, config=config)
@@ -1724,6 +1725,7 @@ def convert_to_vel(lat, x0):
     # VAE outputs (B, T, H, W, C), but video processor expects (B, C, T, H, W)
     video_np = np.array(video).transpose(0, 4, 1, 2, 3)
     video = self.video_processor.postprocess_video(torch.from_numpy(video_np), output_type=output_type)
+    print(f"DEBUG: final video shape: {np.array(video).shape}")
 
     # Decode Audio
     audio_latents = audio_latents.astype(self.audio_vae.dtype)
@@ -1739,6 +1741,7 @@ def convert_to_vel(lat, x0):
 
     # Convert audio to numpy
     audio = np.array(audio)
+    print(f"DEBUG: final audio shape: {audio.shape}")
 
     return LTX2PipelineOutput(frames=video, audio=audio)