@@ -1093,6 +1093,8 @@ def __call__(
10931093 )
10941094
10951095 # 2. Encode inputs (Text)
1096+ import time
1097+ s_text = time .perf_counter ()
10961098 prompt_embeds , prompt_attention_mask , negative_prompt_embeds , negative_prompt_attention_mask = self .encode_prompt (
10971099 prompt ,
10981100 negative_prompt ,
@@ -1105,6 +1107,8 @@ def __call__(
11051107 max_sequence_length = max_sequence_length ,
11061108 dtype = dtype ,
11071109 )
1110+ t_text = time .perf_counter () - s_text
1111+ max_logging .log (f"[Tuning] Prompt encoding took: { t_text :.4f} seconds" )
11081112
11091113 # 3. Prepare latents
11101114 batch_size = prompt_embeds [0 ].shape [0 ] if isinstance (prompt_embeds , list ) else prompt_embeds .shape [0 ]
@@ -1326,6 +1330,7 @@ def run_connectors(graphdef, state, hidden_states, attention_mask):
13261330 if output_type == "latent" :
13271331 return LTX2PipelineOutput (frames = latents , audio = audio_latents )
13281332
1333+ s_vae = time .perf_counter ()
13291334 if getattr (self .vae .config , "timestep_conditioning" , False ):
13301335 noise = jax .random .normal (generator , latents .shape , dtype = latents .dtype )
13311336
@@ -1346,6 +1351,8 @@ def run_connectors(graphdef, state, hidden_states, attention_mask):
13461351 else :
13471352 latents = latents .astype (self .vae .dtype )
13481353 video = self .vae .decode (latents , return_dict = False )[0 ]
1354+ t_vae = time .perf_counter () - s_vae
1355+ max_logging .log (f"[Tuning] VAE decoding took: { t_vae :.4f} seconds" )
13491356 # Post-process video (converts to numpy/PIL)
13501357 # VAE outputs (B, T, H, W, C), but video processor expects (B, C, T, H, W)
13511358 video_np = np .array (video ).transpose (0 , 4 , 1 , 2 , 3 )
@@ -1357,7 +1364,10 @@ def run_connectors(graphdef, state, hidden_states, attention_mask):
13571364
13581365 # Audio VAE outputs (B, T, F, C), Vocoder expects (B, Channels, Time, MelBins)
13591366 generated_mel_spectrograms = generated_mel_spectrograms .transpose (0 , 3 , 1 , 2 )
1367+ s_vocoder = time .perf_counter ()
13601368 audio = self .vocoder (generated_mel_spectrograms )
1369+ t_vocoder = time .perf_counter () - s_vocoder
1370+ max_logging .log (f"[Tuning] Vocoder took: { t_vocoder :.4f} seconds" )
13611371
13621372 # Convert audio to numpy
13631373 audio = np .array (audio )
0 commit comments