Skip to content

Commit 13f1cbd

Browse files
committed
vocoder revert
1 parent 753ae2c commit 13f1cbd

1 file changed

Lines changed: 4 additions & 1 deletion

File tree

src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1672,9 +1672,11 @@ def convert_to_vel(lat, x0):
16721672
num_mel_bins = self.audio_vae.config.mel_bins if getattr(self, "audio_vae", None) is not None else 64
16731673
latent_mel_bins = num_mel_bins // self.audio_vae_mel_compression_ratio
16741674

1675+
print(f"DEBUG: audio_latents shape before unpack: {audio_latents.shape}")
16751676
audio_latents = self._unpack_audio_latents(
16761677
audio_latents, audio_num_frames, num_mel_bins=latent_mel_bins, num_channels=audio_channels
16771678
)
1679+
print(f"DEBUG: audio_latents shape after unpack: {audio_latents.shape}")
16781680

16791681
# Audio VAE expects channels last (B, T, F, C) but unpack returns (B, C, T, F)
16801682
if audio_latents.ndim == 4:
@@ -1726,9 +1728,10 @@ def convert_to_vel(lat, x0):
17261728
# Decode Audio
17271729
audio_latents = audio_latents.astype(self.audio_vae.dtype)
17281730
generated_mel_spectrograms = self.audio_vae.decode(audio_latents, return_dict=False)[0]
1731+
print(f"DEBUG: generated_mel_spectrograms shape: {generated_mel_spectrograms.shape}")
17291732

17301733
# Audio VAE outputs (B, T, F, C), Vocoder expects (B, Channels, Time, MelBins)
1731-
generated_mel_spectrograms = generated_mel_spectrograms.transpose(0, 3, 2, 1)
1734+
generated_mel_spectrograms = generated_mel_spectrograms.transpose(0, 3, 1, 2)
17321735
audio = self.vocoder(generated_mel_spectrograms)
17331736

17341737
# Convert audio to numpy

0 commit comments

Comments
 (0)