weight loading for vocoder

prishajain1 · prishajain1 · commit 5d0e4a54ce7b · 2026-02-20T16:32:37.000+05:30
diff --git a/src/maxdiffusion/models/ltx2/ltx2_utils.py b/src/maxdiffusion/models/ltx2/ltx2_utils.py
@@ -406,3 +406,95 @@ def load_ltx2_vae(
   validate_flax_state_dict(eval_shapes, flax_state_dict)
   flax_state_dict = unflatten_dict(flax_state_dict)
   return flax_state_dict
+
+
+def load_ltx2_vocoder(
+    pretrained_model_name_or_path: str,
+    eval_shapes: dict,
+    device: str,
+    hf_download: bool = True,
+    subfolder: str = "vocoder",
+):
+  """Load LTX2 vocoder safetensors weights and port them to a Flax state dict.
+
+  Args:
+    pretrained_model_name_or_path: Local model directory, Hugging Face Hub repo
+      id, or (only with `hf_download=False`) direct path to a safetensors file.
+    eval_shapes: Nested dict describing the target Flax parameters; used to
+      rename/reshape the checkpoint tensors and to validate the result.
+    device: JAX backend name. It is resolved to a device, but the ported
+      tensors are always placed on the first CPU device.
+    hf_download: Whether the checkpoint may be fetched from the Hub when it is
+      not found in a local directory.
+    subfolder: Folder (local or in the Hub repo) holding the vocoder weights.
+
+  Returns:
+    The ported parameters as a nested (unflattened) dict.
+
+  Raises:
+    FileNotFoundError: If `hf_download` is False and no checkpoint file is
+      found on disk.
+  """
+  # NOTE(review): the resolved device is not used below (tensors go to CPU);
+  # the lookup is kept so an unknown backend still fails early. Confirm intent.
+  device = jax.local_devices(backend=device)[0]
+  # Vocoder weights are usually in diffusion_pytorch_model.safetensors inside "vocoder" folder
+  filename = "diffusion_pytorch_model.safetensors"
+
+  ckpt_path = None
+  if os.path.isdir(pretrained_model_name_or_path):
+    candidate = os.path.join(pretrained_model_name_or_path, subfolder, filename)
+    if os.path.isfile(candidate):
+      ckpt_path = candidate
+
+  # Not found in a local directory: fall back to the Hub, then to a direct file path.
+  if ckpt_path is None:
+    if hf_download:
+      ckpt_path = hf_hub_download(pretrained_model_name_or_path, subfolder=subfolder, filename=filename)
+    elif os.path.isfile(pretrained_model_name_or_path):
+      # Without Hub access, the argument may point straight at the weights file.
+      ckpt_path = pretrained_model_name_or_path
+    else:
+      raise FileNotFoundError(
+          f"Vocoder checkpoint not found: {pretrained_model_name_or_path!r} is neither a file nor a "
+          f"directory containing {os.path.join(subfolder, filename)!r}, and hf_download=False."
+      )
+
+  max_logging.log(f"Load and port {pretrained_model_name_or_path} Vocoder from {ckpt_path}")
+
+  # Target keys are stringified so they can be matched against checkpoint-derived (string) keys.
+  random_flax_state_dict = {
+      tuple(str(item) for item in key): value for key, value in flatten_dict(eval_shapes).items()
+  }
+
+  # PyTorch checkpoint name fragments -> Flax LTX2Vocoder module paths, applied
+  # in this order. Names without these fragments (e.g. conv_in, conv_out) pass
+  # through unchanged; .weight -> .kernel style renames are presumably handled
+  # by rename_key_and_reshape_tensor -- confirm against that helper.
+  key_replacements = (
+      ("ups.", "upsamplers.layers."),
+      ("resblocks.", "resnets.layers."),
+      ("convs1.", "convs1.layers."),
+      ("convs2.", "convs2.layers."),
+  )
+
+  flax_state_dict = {}
+  cpu = jax.local_devices(backend="cpu")[0]
+  with safe_open(ckpt_path, framework="pt") as f:
+    # Port one tensor at a time instead of first materializing the whole checkpoint.
+    for pt_key in f.keys():
+      tensor = torch2jax(f.get_tensor(pt_key))
+      renamed_pt_key = pt_key
+      for old, new in key_replacements:
+        renamed_pt_key = renamed_pt_key.replace(old, new)
+      pt_tuple_key = tuple(renamed_pt_key.split("."))
+      flax_key, flax_tensor = rename_key_and_reshape_tensor(
+          pt_tuple_key, tensor, random_flax_state_dict, scan_layers=False
+      )
+      flax_key = _tuple_str_to_int(flax_key)
+      flax_state_dict[flax_key] = jax.device_put(jnp.asarray(flax_tensor), device=cpu)
+
+  validate_flax_state_dict(eval_shapes, flax_state_dict)
+  flax_state_dict = unflatten_dict(flax_state_dict)
+  return flax_state_dict
+
diff --git a/src/maxdiffusion/tests/test_loading_ltx2.py b/src/maxdiffusion/tests/test_loading_ltx2.py
@@ -6,7 +6,8 @@
 from flax import nnx
 from maxdiffusion.models.ltx2.transformer_ltx2 import LTX2VideoTransformer3DModel
 from maxdiffusion.models.ltx2.autoencoder_kl_ltx2 import LTX2VideoAutoencoderKL
-from maxdiffusion.models.ltx2.ltx2_utils import load_ltx2_transformer, load_ltx2_vae
+from maxdiffusion.models.ltx2.vocoder_ltx2 import LTX2Vocoder
+from maxdiffusion.models.ltx2.ltx2_utils import load_ltx2_transformer, load_ltx2_vae, load_ltx2_vocoder
 
 class LTX2LoadingTest(unittest.TestCase):
     def test_loading(self):
@@ -82,8 +83,32 @@ def create_vae():
         self.assertEqual(abstract_vae.latent_channels, 128)
         # self.assertEqual(len(abstract_vae.encoder.down_blocks), 4) # nnx.List not len()able directly? depends on version
         
+        
         print("VAE structure verified.")
 
+    def test_vocoder_loading(self):
+        # Abstractly instantiate the vocoder with the Lightricks/LTX-2 configuration.
+        vocoder_config = dict(
+            in_channels=128,
+            hidden_channels=1024,
+            out_channels=2,
+            upsample_kernel_sizes=(16, 15, 8, 4, 4),
+            upsample_factors=(6, 5, 2, 2, 2),
+            resnet_kernel_sizes=(3, 7, 11),
+            resnet_dilations=((1, 3, 5), (1, 3, 5), (1, 3, 5)),
+            leaky_relu_negative_slope=0.1,
+            output_sampling_rate=24000,
+        )
+
+        def build_vocoder():
+            # The RNG container is created inside the factory handed to nnx.eval_shape.
+            return LTX2Vocoder(**vocoder_config, rngs=nnx.Rngs(0), dtype=jnp.float32)
+
+        vocoder_abstract = nnx.eval_shape(build_vocoder)
+        self.assertEqual(vocoder_abstract.out_channels, 2)
+        print("Vocoder structure verified.")
+
+
 
 if __name__ == "__main__":
     unittest.main()