@@ -1672,9 +1672,11 @@ def convert_to_vel(lat, x0):
16721672 num_mel_bins = self .audio_vae .config .mel_bins if getattr (self , "audio_vae" , None ) is not None else 64
16731673 latent_mel_bins = num_mel_bins // self .audio_vae_mel_compression_ratio
16741674
1675+ print (f"DEBUG: audio_latents shape before unpack: { audio_latents .shape } " )
16751676 audio_latents = self ._unpack_audio_latents (
16761677 audio_latents , audio_num_frames , num_mel_bins = latent_mel_bins , num_channels = audio_channels
16771678 )
1679+ print (f"DEBUG: audio_latents shape after unpack: { audio_latents .shape } " )
16781680
16791681 # Audio VAE expects channels last (B, T, F, C) but unpack returns (B, C, T, F)
16801682 if audio_latents .ndim == 4 :
@@ -1726,9 +1728,10 @@ def convert_to_vel(lat, x0):
17261728 # Decode Audio
17271729 audio_latents = audio_latents .astype (self .audio_vae .dtype )
17281730 generated_mel_spectrograms = self .audio_vae .decode (audio_latents , return_dict = False )[0 ]
1731+ print (f"DEBUG: generated_mel_spectrograms shape: { generated_mel_spectrograms .shape } " )
17291732
17301733 # Audio VAE outputs (B, T, F, C); the vocoder expects (B, C, T, F), i.e. (batch, channels, time, mel bins)
1731- generated_mel_spectrograms = generated_mel_spectrograms .transpose (0 , 3 , 2 , 1 )
1734+ generated_mel_spectrograms = generated_mel_spectrograms .transpose (0 , 3 , 1 , 2 )
17321735 audio = self .vocoder (generated_mel_spectrograms )
17331736
17341737 # Convert audio to numpy
0 commit comments