Skip to content

Commit 6b265a9

Browse files
committed
transposing vae decoded output
1 parent a95f1d0 commit 6b265a9

1 file changed

Lines changed: 3 additions & 0 deletions

File tree

src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1395,6 +1395,9 @@ def run_connectors(graphdef, state, hidden_states, attention_mask):
13951395
# Decode Audio
13961396
audio_latents = audio_latents.astype(self.audio_vae.dtype)
13971397
generated_mel_spectrograms = self.audio_vae.decode(audio_latents, return_dict=False)[0]
1398+
1399+
# Audio VAE outputs (B, T, F, C); the vocoder expects (B, C, T, F), i.e. (batch, channels, time, mel bins).
1400+
generated_mel_spectrograms = generated_mel_spectrograms.transpose(0, 3, 1, 2)
13981401
audio = self.vocoder(generated_mel_spectrograms)
13991402

14001403
# Convert audio to numpy

0 commit comments

Comments
 (0)