Add jax.profiler.TraceMe annotations for CPU-side ops (VAE decode, video post-processing, audio VAE decode, vocoder)

prishajain1 · prishajain1 · commit e8f964e2f9e6 · 2026-04-01T20:24:22.000+05:30
diff --git a/src/maxdiffusion/models/ltx2/autoencoder_kl_ltx2.py b/src/maxdiffusion/models/ltx2/autoencoder_kl_ltx2.py
@@ -1595,17 +1595,19 @@ def decode(
         keys_slice = jax.random.split(key, latents.shape[0])
       decoded_slices = []
       for i in range(latents.shape[0]):
-        with jax.named_scope(f"Decode Slice {i}"):
-          z_slice = latents[i : i + 1]
-          t_slice = temb[i : i + 1] if temb is not None else None
-          subkey = keys_slice[i] if keys_slice is not None else None
-          res = self._decode(z_slice, t_slice, key=subkey, causal=causal, return_dict=True)
-          decoded_slices.append(res.sample)
+        with jax.profiler.TraceMe(f"VAE Decode Slice {i}"):
+          with jax.named_scope(f"Decode Slice {i}"):
+            z_slice = latents[i : i + 1]
+            t_slice = temb[i : i + 1] if temb is not None else None
+            subkey = keys_slice[i] if keys_slice is not None else None
+            res = self._decode(z_slice, t_slice, key=subkey, causal=causal, return_dict=True)
+            decoded_slices.append(res.sample)
 
       dec = jnp.concatenate(decoded_slices, axis=0)
     else:
-      with jax.named_scope("Decode Full Batch"):
-        dec = self._decode(latents, temb, key=key, causal=causal, return_dict=True).sample
+      with jax.profiler.TraceMe("VAE Decode Full Batch"):
+        with jax.named_scope("Decode Full Batch"):
+          dec = self._decode(latents, temb, key=key, causal=causal, return_dict=True).sample
 
     if not return_dict:
       return (dec,)
diff --git a/src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py b/src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py
@@ -1392,17 +1392,21 @@ def run_connectors(graphdef, state, hidden_states, attention_mask):
     max_logging.log(f"[Tuning] VAE decoding took: {t_vae:.4f} seconds")
     # Post-process video (converts to numpy/PIL)
     # VAE outputs (B, T, H, W, C), but video processor expects (B, C, T, H, W)
-    video_np = np.array(video).transpose(0, 4, 1, 2, 3)
-    video = self.video_processor.postprocess_video(torch.from_numpy(video_np), output_type=output_type)
+    with jax.profiler.TraceMe("Video Post-processing"):
+      video_np = np.array(video).transpose(0, 4, 1, 2, 3)
+      video = self.video_processor.postprocess_video(torch.from_numpy(video_np), output_type=output_type)
 
     # Decode Audio
     audio_latents = audio_latents.astype(self.audio_vae.dtype)
-    generated_mel_spectrograms = self.audio_vae.decode(audio_latents, return_dict=False)[0]
+    with jax.profiler.TraceMe("Audio VAE Decode"):
+      generated_mel_spectrograms = self.audio_vae.decode(audio_latents, return_dict=False)[0]
 
     # Audio VAE outputs (B, T, F, C), Vocoder expects (B, Channels, Time, MelBins)
     generated_mel_spectrograms = generated_mel_spectrograms.transpose(0, 3, 1, 2)
+    
     s_vocoder = time.perf_counter()
-    audio = self.vocoder(generated_mel_spectrograms)
+    with jax.profiler.TraceMe("Vocoder Audio Generation"):
+      audio = self.vocoder(generated_mel_spectrograms)
     t_vocoder = time.perf_counter() - s_vocoder
     max_logging.log(f"[Tuning] Vocoder took: {t_vocoder:.4f} seconds")