Skip to content

Commit c109dbd

Browse files
committed
Audio VAE weights
1 parent 753ab0f commit c109dbd

2 files changed

Lines changed: 226 additions & 0 deletions

File tree

src/maxdiffusion/models/ltx2/ltx2_utils.py

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -485,3 +485,186 @@ def load_connector_weights(
485485
jax.clear_caches()
486486
validate_flax_state_dict(eval_shapes, flax_state_dict)
487487
return unflatten_dict(flax_state_dict)
488+
489+
def rename_for_ltx2_audio_vae(key):
  """Translate a PyTorch Audio-VAE checkpoint key into the Flax naming scheme.

  Handles:
    * container renames (``resblocks`` -> ``resnets``, ``ups`` -> ``upsamplers``),
      applied per dotted path component so that tokens which merely *contain*
      the substring are untouched — a raw ``str.replace("ups", ...)`` would
      corrupt ``"upsamplers"`` into ``"upsamplersamplers"`` (breaking the
      ``upsamplers.0`` handling downstream) and mangle tokens like ``"groups"``,
    * attention projections (``q``/``k``/``v``/``proj_out``): ``.weight`` -> ``.kernel``,
    * conv shortcut layers and any remaining conv ``.weight`` -> ``.kernel``.

  Args:
    key: dot-separated PyTorch state-dict key.

  Returns:
    The renamed key as a dot-separated string.
  """
  # Rename whole path components only (bug fix: substring replace corrupted
  # keys already containing "upsamplers").
  component_map = {"resblocks": "resnets", "ups": "upsamplers"}
  key = ".".join(component_map.get(part, part) for part in key.split("."))

  key = key.replace("conv_shortcut.weight", "conv_shortcut_layer.kernel")
  key = key.replace("conv_shortcut.bias", "conv_shortcut_layer.bias")

  # AttnBlock projections: PyTorch stores the parameter as ".weight";
  # the Flax module names it ".kernel". str.replace is a no-op on miss,
  # so no guarding `if` is needed.
  for proj in ("q", "k", "v", "proj_out"):
    key = key.replace(f"{proj}.weight", f"{proj}.kernel")

  # Any remaining conv weight also becomes a kernel.
  if key.endswith(".weight") and "conv" in key:
    key = key.replace(".weight", ".kernel")

  return key
507+
508+
509+
def load_audio_vae_weights(
    pretrained_model_name_or_path: str,
    eval_shapes: dict,
    device: str,
    hf_download: bool = True,
    subfolder: str = "audio_vae",
):
  """Load the LTX-2 Audio VAE PyTorch checkpoint and convert it to a Flax state dict.

  Args:
    pretrained_model_name_or_path: HuggingFace repo id or local checkpoint path.
    eval_shapes: nested dict of expected Flax parameter shapes (pure dict of the
      model state), used only to validate the converted weights.
    device: device string forwarded to ``load_sharded_checkpoint``.
    hf_download: kept for signature parity with the other loaders in this
      module; downloading is handled inside ``load_sharded_checkpoint``.
    subfolder: checkpoint subfolder containing the audio VAE weights.

  Returns:
    Nested (unflattened) dict of converted weights, placed on the host CPU.
  """
  tensors = load_sharded_checkpoint(pretrained_model_name_or_path, subfolder, device)
  cpu = jax.local_devices(backend="cpu")[0]
  flax_state_dict = {}

  def _key_to_tuple(dotted_key):
    # "a.0.b" -> ("a", 0, "b"); numeric components index list entries.
    return tuple(int(part) if part.isdigit() else part for part in dotted_key.split("."))

  for pt_key, tensor in tensors.items():
    key = rename_for_ltx2_audio_vae(pt_key)

    # PyTorch Conv2d kernels are (Out, In, H, W); Flax expects (H, W, In, Out).
    # 1x1 convs (attention q/k/v/proj_out: (Out, In, 1, 1) -> (1, 1, In, Out))
    # follow the same rule.
    # NOTE(review): assumes every 4-D ".kernel" tensor is a conv weight —
    # confirm if non-2D convs are ever added to this VAE.
    if key.endswith(".kernel") and tensor.ndim == 4:
      tensor = tensor.transpose(2, 3, 1, 0)

    # latents_mean / latents_std buffers pass through with no reshaping.

    # Mid block: PyTorch nests resnets/attentions in lists, while the Flax
    # model exposes them as named attributes (mid_block1, mid_attn, mid_block2).
    # NOTE(review): matched against the *original* key, so this assumes the
    # checkpoint already uses Diffusers-style "mid_block.resnets.N" names —
    # verify against the published checkpoint layout.
    if "mid_block.resnets.0" in pt_key:
      key = key.replace("mid_block.resnets.0", "mid_block1")
    elif "mid_block.resnets.1" in pt_key:
      key = key.replace("mid_block.resnets.1", "mid_block2")
    elif "mid_block.attentions.0" in pt_key:
      key = key.replace("mid_block.attentions.0", "mid_attn")

    # Encoder stages:
    #   down_blocks.N.resnets.M      -> down_stages.N.blocks.M
    #   down_blocks.N.attentions.M   -> down_stages.N.attns.M
    #   down_blocks.N.downsamplers.0 -> down_stages.N.downsample
    if "down_blocks" in key:
      if "resnets" in key:
        key = key.replace("down_blocks", "down_stages").replace("resnets", "blocks")
      elif "attentions" in key:
        key = key.replace("down_blocks", "down_stages").replace("attentions", "attns")
      elif "downsamplers" in key:
        # One downsample module per stage, so the trailing list index is dropped.
        key = key.replace("down_blocks", "down_stages").replace("downsamplers.0", "downsample")

    # Decoder stages mirror the encoder. The Flax up_stages list is built with
    # `for level in reversed(range(len(ch_mult)))`, so up_stages[0] is the
    # deepest resolution — the same ordering Diffusers uses for up_blocks,
    # hence the direct index mapping.
    if "up_blocks" in key:
      if "resnets" in key:
        key = key.replace("up_blocks", "up_stages").replace("resnets", "blocks")
      elif "attentions" in key:
        key = key.replace("up_blocks", "up_stages").replace("attentions", "attns")
      elif "upsamplers" in key:
        key = key.replace("up_blocks", "up_stages").replace("upsamplers.0", "upsample")

    # The key tuple only needs to be materialized once, after every rename.
    flax_state_dict[_key_to_tuple(key)] = jax.device_put(tensor, device=cpu)

  # Validate against the expected shapes, skipping non-parameter state —
  # dropout / rng collections carry no checkpoint weights.
  flattened_eval = flatten_dict(eval_shapes)
  filtered_eval_shapes = {}
  for shape_key, shape_val in flattened_eval.items():
    parts = [str(x) for x in shape_key]
    if "dropout" in parts or "rngs" in parts:
      continue
    filtered_eval_shapes[shape_key] = shape_val

  validate_flax_state_dict(unflatten_dict(filtered_eval_shapes), flax_state_dict)
  return unflatten_dict(flax_state_dict)

src/maxdiffusion/tests/test_ltx2_utils.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,5 +183,48 @@ def test_load_connector_weights(self):
183183
validate_flax_state_dict(eval_shapes, flatten_dict(loaded_weights))
184184
print("Connector Weights Validated Successfully!")
185185

186+
def test_load_audio_vae_weights(self):
  """Round-trip the LTX-2 audio VAE checkpoint into Flax and validate shapes."""
  from maxdiffusion.models.ltx2.audio_vae import FlaxAutoencoderKLLTX2Audio
  from maxdiffusion.models.ltx2.ltx2_utils import load_audio_vae_weights

  repo_id = "Lightricks/LTX-2"

  # Audio VAE Config from user request
  vae_kwargs = dict(
      base_channels=128,
      ch_mult=(1, 2, 4),
      double_z=True,
      dropout=0.0,
      in_channels=2,
      latent_channels=8,
      mel_bins=64,
      mel_hop_length=160,
      mid_block_add_attention=False,
      norm_type="pixel",
      num_res_blocks=2,
      output_channels=2,
      resolution=256,
      sample_rate=16000,
      rngs=nnx.Rngs(0),
  )

  # Instantiate the randomly-initialized model on CPU so no accelerator
  # memory is consumed before the real weights arrive.
  with jax.default_device(jax.devices("cpu")[0]):
    audio_vae = FlaxAutoencoderKLLTX2Audio(**vae_kwargs)

  eval_shapes = nnx.state(audio_vae).to_pure_dict()

  print("Loading Audio VAE Weights...")
  converted = load_audio_vae_weights(
      pretrained_model_name_or_path=repo_id,
      eval_shapes=eval_shapes,
      device=self.device,
      hf_download=True,
  )

  print("Validating Audio VAE Weights...")
  validate_flax_state_dict(eval_shapes, flatten_dict(converted))
  print("Audio VAE Weights Validated Successfully!")
228+
186229
if __name__ == "__main__":
187230
unittest.main()

0 commit comments

Comments
 (0)