
Commit a34529f ("fix")
1 parent 5757481

1 file changed: src/maxdiffusion/models/ltx2/ltx2_utils.py
Lines changed: 65 additions & 162 deletions
@@ -1,4 +1,3 @@
-
 import os
 import json
 import torch
@@ -28,79 +27,73 @@ def rename_for_ltx2_transformer(key):
   """
   Renames Diffusers LTX-2 keys to MaxDiffusion Flax LTX-2 keys.
   """
-  # General replacements
   key = key.replace("patchify_proj", "proj_in")
   key = key.replace("audio_patchify_proj", "audio_proj_in")
-  key = key.replace("transformer_blocks", "transformer_blocks") # kept same
-
-  # AdaLN / Timestep Embed handling
-  # Diffusers uses: time_embed, audio_time_embed, av_cross_attn_...
-  # These match Flax implementation names mostly.
-
-  # Attention QK Norms -> Flax uses "norm_q", "norm_k" (Diffusers often uses q_norm, k_norm but conversion script mapped them to norm_q/norm_k already?
-  # Wait, the conversion script maps *from* original *to* Diffusers.
-  # If loading Diffusers checkpoint, we should expect "norm_q", "norm_k" if that's what Diffusers uses.
-  # Checking conversion script: LTX_2_0_TRANSFORMER_KEYS_RENAME_DICT maps "q_norm" -> "norm_q".
-  # So Diffusers likely uses "norm_q".
-
-  # Handle "weight" -> "kernel" for Linear/Conv layers is done in rename_key_and_reshape_tensor
-  # checking rename_key_and_reshape_tensor: it handles "weight" -> "kernel" for linear/conv.
-
-  # Specific LTX-2 nested renaming
-  # Diffusers: transformer_blocks.0.attn1.to_q.weight
-  # Flax: transformer_blocks.layers.0.attn1.query.kernel (if scanned)
-
-  # rename_key_and_reshape_tensor handles:
-  # to_q -> query
-  # to_k -> key
-  # to_v -> value
-  # to_out.0 -> proj_attn
-
-  # We might need to handle specific mismatches if any.
-
-  # The "scale" vs "weight" for LayerNorm is also handled in rename_key_and_reshape_tensor
-  # BUT only if it detects "norm" in key.
-
-  # LTX2AdaLayerNormSingle usually has "linear" which is a Linear layer.
-
+  key = key.replace("transformer_blocks", "transformer_blocks")
   return key
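Note on the surviving renames: both are plain `str.replace` calls applied left to right, so their behavior can be checked in isolation. A minimal sketch (the sample keys below are illustrative, not taken from a real checkpoint):

```python
def rename_for_ltx2_transformer(key):
  # Same substitutions as the function in this diff.
  key = key.replace("patchify_proj", "proj_in")
  key = key.replace("audio_patchify_proj", "audio_proj_in")
  return key

# "audio_patchify_proj" contains "patchify_proj" as a substring, so the first
# replace already turns it into "audio_proj_in"; the dedicated audio rule is
# then a harmless no-op for such keys.
assert rename_for_ltx2_transformer("patchify_proj.weight") == "proj_in.weight"
assert rename_for_ltx2_transformer("audio_patchify_proj.bias") == "audio_proj_in.bias"
```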
 
 
 def get_key_and_value(pt_tuple_key, tensor, flax_state_dict, random_flax_state_dict, scan_layers, num_layers=48):
   if scan_layers:
     if "transformer_blocks" in pt_tuple_key:
-      # transformer_blocks.0.attn1... -> transformer_blocks.layers.attn1...
-      # We need to extract the block index
-      new_key = ("transformer_blocks",) + pt_tuple_key[2:] # removing index
+      new_key = ("transformer_blocks",) + pt_tuple_key[2:]
       block_index = int(pt_tuple_key[1])
       pt_tuple_key = new_key
 
-  # For scanned layers, we need to locate the param in the huge stacked tensor
-  # But wait, rename_key_and_reshape_tensor takes the *modified* pt_tuple_key?
-  # No, it takes the original one usually to check against random_flax_state_dict.
-  # But here we are constructing it.
-
   flax_key, flax_tensor = rename_key_and_reshape_tensor(pt_tuple_key, tensor, random_flax_state_dict, scan_layers)
-
-  # Custom cleaning after generic rename
-  # e.g. converting "weight" to "value" for Params if needed, though they usually just take array.
-
   flax_key = _tuple_str_to_int(flax_key)
 
   if scan_layers:
     if "transformer_blocks" in flax_key:
-      # We need to stack correct index
       if flax_key in flax_state_dict:
         new_tensor = flax_state_dict[flax_key]
       else:
-        # Initialize with zeros of shape (num_layers, ...) + tensor.shape
         new_tensor = jnp.zeros((num_layers,) + flax_tensor.shape, dtype=flax_tensor.dtype)
-
       new_tensor = new_tensor.at[block_index].set(flax_tensor)
       flax_tensor = new_tensor
 
   return flax_key, flax_tensor
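For the scanned path, each per-block tensor is written into one stacked `(num_layers, ...)` array that is allocated on first touch. A self-contained sketch of that pattern, assuming nothing beyond `jax.numpy` (the layer count and shapes here are made up for illustration; the real transformer defaults to 48 layers):

```python
import jax.numpy as jnp

num_layers = 4  # illustrative only
per_block = {i: jnp.full((2, 3), i, dtype=jnp.float32) for i in range(num_layers)}

flax_state_dict = {}
key = ("transformer_blocks", "attn1", "query", "kernel")
for block_index, t in per_block.items():
  if key not in flax_state_dict:
    # First visit allocates the full (num_layers, ...) buffer.
    flax_state_dict[key] = jnp.zeros((num_layers,) + t.shape, t.dtype)
  # Functional update: .at[...].set(...) returns a new array with one slice filled.
  flax_state_dict[key] = flax_state_dict[key].at[block_index].set(t)

assert flax_state_dict[key].shape == (num_layers, 2, 3)
assert bool((flax_state_dict[key][2] == 2.0).all())
```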
 
+def load_sharded_checkpoint(pretrained_model_name_or_path, subfolder, device):
+  """
+  Loads weights from a sharded safetensors checkpoint.
+  """
+  index_file = "diffusion_pytorch_model.safetensors.index.json"
+  tensors = {}
+
+  # Try to download index file
+  try:
+    index_path = hf_hub_download(pretrained_model_name_or_path, subfolder=subfolder, filename=index_file)
+    with open(index_path, "r") as f:
+      index_data = json.load(f)
+    weight_map = index_data["weight_map"]
+    shards = set(weight_map.values())
+
+    for shard_file in shards:
+      shard_path = hf_hub_download(pretrained_model_name_or_path, subfolder=subfolder, filename=shard_file)
+      with safe_open(shard_path, framework="pt") as f:
+        for k in f.keys():
+          tensors[k] = torch2jax(f.get_tensor(k))
+  except Exception:
+    # Fallback to single file
+    filename = "diffusion_pytorch_model.safetensors"
+    try:
+      ckpt_path = hf_hub_download(pretrained_model_name_or_path, subfolder=subfolder, filename=filename)
+    except Exception:
+      filename = "diffusion_pytorch_model.bin"
+      ckpt_path = hf_hub_download(pretrained_model_name_or_path, subfolder=subfolder, filename=filename)
+
+    if filename.endswith(".safetensors"):
+      with safe_open(ckpt_path, framework="pt") as f:
+        for k in f.keys():
+          tensors[k] = torch2jax(f.get_tensor(k))
+    else:
+      loaded_state_dict = torch.load(ckpt_path, map_location="cpu")
+      for k, v in loaded_state_dict.items():
+        tensors[k] = torch2jax(v)
+
+  return tensors
+
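The new helper keys off the standard Hugging Face sharding index, `diffusion_pytorch_model.safetensors.index.json`, whose `weight_map` maps each tensor name to the shard file that holds it. A small illustration of why the `set(...)` call matters (the tensor and shard names below are invented):

```python
import json

index_data = json.loads("""
{
  "metadata": {"total_size": 123456},
  "weight_map": {
    "proj_in.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
    "transformer_blocks.0.attn1.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
    "transformer_blocks.1.attn1.to_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors"
  }
}
""")

# Many tensors point at the same shard; set() deduplicates so each shard
# is downloaded and opened exactly once.
shards = set(index_data["weight_map"].values())
assert len(shards) == 2
```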
 def load_transformer_weights(
     pretrained_model_name_or_path: str,
     eval_shapes: dict,
@@ -111,57 +104,25 @@ def load_transformer_weights(
     subfolder: str = "transformer",
 ):
   device = jax.local_devices(backend=device)[0]
-
-  # Determine if local or hub
-  filename = "diffusion_pytorch_model.safetensors"
-  local_files = False
-  if os.path.isdir(pretrained_model_name_or_path):
-    ckpt_path = os.path.join(pretrained_model_name_or_path, subfolder, filename)
-    if not os.path.isfile(ckpt_path):
-      # Try .bin just in case
-      filename = "diffusion_pytorch_model.bin"
-      ckpt_path = os.path.join(pretrained_model_name_or_path, subfolder, filename)
-      if not os.path.isfile(ckpt_path):
-        raise FileNotFoundError(f"File {ckpt_path} not found for local directory.")
-    local_files = True
-  elif hf_download:
-    try:
-      ckpt_path = hf_hub_download(pretrained_model_name_or_path, subfolder=subfolder, filename=filename)
-    except Exception:
-      filename = "diffusion_pytorch_model.bin"
-      ckpt_path = hf_hub_download(pretrained_model_name_or_path, subfolder=subfolder, filename=filename)
-
   max_logging.log(f"Load and port {pretrained_model_name_or_path} {subfolder} on {device}")
 
   with jax.default_device(device):
-    tensors = {}
-    if filename.endswith(".safetensors"):
-      with safe_open(ckpt_path, framework="pt") as f:
-        for k in f.keys():
-          tensors[k] = torch2jax(f.get_tensor(k))
-    else: # bin/pt
-      loaded_state_dict = torch.load(ckpt_path, map_location="cpu")
-      for k, v in loaded_state_dict.items():
-        tensors[k] = torch2jax(v)
+    # Support sharded loading
+    tensors = load_sharded_checkpoint(pretrained_model_name_or_path, subfolder, device)
 
   flax_state_dict = {}
   cpu = jax.local_devices(backend="cpu")[0]
   flattened_dict = flatten_dict(eval_shapes)
 
-  # Create random state dict with string keys for matching
   random_flax_state_dict = {}
   for key in flattened_dict:
-    # Convert all ints to strings in key tuple
     string_tuple = tuple([str(item) for item in key])
     random_flax_state_dict[string_tuple] = flattened_dict[key]
 
   for pt_key, tensor in tensors.items():
     renamed_pt_key = rename_key(pt_key)
     renamed_pt_key = rename_for_ltx2_transformer(renamed_pt_key)
 
-    # Handling specific replacements that `rename_key` might miss or `rename_for_ltx2` specifically targets
-    # The `scan_layers` handling requires splitting the key differently if needed.
-
     pt_tuple_key = tuple(renamed_pt_key.split("."))
 
     flax_key, flax_tensor = get_key_and_value(
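The matching dict is keyed by all-string tuples because the PyTorch keys arrive as dot-separated strings, while `flatten_dict` preserves whatever key types the pytree uses. A minimal sketch of the normalization, assuming `flatten_dict` is `flax.traverse_util.flatten_dict` (which matches how it is used here); the nested shape dict is illustrative:

```python
import jax
import jax.numpy as jnp
from flax.traverse_util import flatten_dict

# eval_shapes mirrors the model pytree with shape/dtype leaves (illustrative):
eval_shapes = {
    "transformer_blocks": {
        "attn1": {"query": {"kernel": jax.ShapeDtypeStruct((48, 128, 128), jnp.float32)}}
    }
}

flattened_dict = flatten_dict(eval_shapes)
# Normalize every path component to str so that tuple(key.split(".")) from the
# PyTorch side can be compared against it directly.
random_flax_state_dict = {tuple(str(p) for p in k): v for k, v in flattened_dict.items()}

pt_tuple_key = tuple("transformer_blocks.attn1.query.kernel".split("."))
assert pt_tuple_key in random_flax_state_dict
```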
@@ -176,7 +137,6 @@ def load_transformer_weights(
   jax.clear_caches()
   return flax_state_dict
 
-
 def load_vae_weights(
     pretrained_model_name_or_path: str,
     eval_shapes: dict,
@@ -185,22 +145,16 @@ def load_vae_weights(
     subfolder: str = "vae"
 ):
   device = jax.local_devices(backend=device)[0]
-  filename = "diffusion_pytorch_model.safetensors"
+  # VAE for LTX-2 is likely single file, but safe to use the helper if we wanted general robustness.
+  # But `lightricks/LTX-2` VAE is single file.
 
-  if os.path.isdir(pretrained_model_name_or_path):
-    ckpt_path = os.path.join(pretrained_model_name_or_path, subfolder, filename)
-    if not os.path.isfile(ckpt_path):
-      filename = "diffusion_pytorch_model.bin"
-      ckpt_path = os.path.join(pretrained_model_name_or_path, subfolder, filename)
-      if not os.path.isfile(ckpt_path):
-        raise FileNotFoundError(f"File {ckpt_path} not found for local directory.")
-  elif hf_download:
-    try:
+  filename = "diffusion_pytorch_model.safetensors"
+  try:
     ckpt_path = hf_hub_download(pretrained_model_name_or_path, subfolder=subfolder, filename=filename)
-    except Exception:
+  except Exception:
     filename = "diffusion_pytorch_model.bin"
     ckpt_path = hf_hub_download(pretrained_model_name_or_path, subfolder=subfolder, filename=filename)
-
+
   max_logging.log(f"Load and port {pretrained_model_name_or_path} VAE on {device}")
 
   with jax.default_device(device):
@@ -218,83 +172,32 @@ def load_vae_weights(
   cpu = jax.local_devices(backend="cpu")[0]
   flattened_eval = flatten_dict(eval_shapes)
 
-  # Build random state dict for shape checking/key matching help
-  # VAE usually doesn't need scan layers logic for mapping (unless we implement scanned VAE similar to Transformer, but autoencoder_kl_ltx2.py uses scan but keys seem compatible with standard diffusers structure if mapped correctly)
-  # Wait, `autoencoder_kl_ltx2.py` DOES use scan for `resnets`!
-  # See `create_resnets` and `resnet_scan_fn`.
-  # So we DO need scan layer handling for VAE if we want to load it into that structure.
-  # The VAE resnets are scanned over `num_layers`.
-
-  # Mapping Diffusers VAE to Scanned VAE:
-  # Diffusers: down_blocks.0.resnets.0 ...
-  # Flax Scanned: down_blocks.0.resnets.layers.0 ... (if mapped that way)
-  # OR: down_blocks.0.resnets -> (num_layers, ...) tensor if we stack them.
-
-  # Let's check `autoencoder_kl_ltx2.py` again.
-  # `self.resnets = create_resnets(rngs)` where `create_resnets` is vmapped.
-  # This creates params with a leading dimension = num_layers.
-  # So we need to stack Diffusers resnets weights.
-
-  # We need a custom `get_key_and_value` for VAE or modify the existing one to handle VAE blocks too.
-  pass
-
-  # For now, let's just write the loading logic and we might need to iterate and fix VAE scanning logic if it fails validation.
-  # Ideally we use `rename_key_and_reshape_tensor` heavily.
-
   random_flax_state_dict = {}
   for key in flattened_eval:
     string_tuple = tuple([str(item) for item in key])
     random_flax_state_dict[string_tuple] = flattened_eval[key]
-
+
   for pt_key, tensor in tensors.items():
     renamed_pt_key = rename_key(pt_key)
-
-    # VAE specific renames
-    renamed_pt_key = renamed_pt_key.replace("mid_block.resnets.", "mid_block.resnets.layers.")
-    renamed_pt_key = renamed_pt_key.replace("down_blocks.", "down_blocks.") # keeping same
-    # Need to handle resnets.0 -> resnets.layers.0 etc if we want to be explicit, or rely on scanning logic.
-
-    # If we use scan, we need to stack "resnets.0", "resnets.1" etc into "resnets" tensor.
-    # The logic in `get_key_and_value` handles `transformer_blocks` scanning. We should extend it for VAE `resnets`.
-
-    # Actually, `autoencoder_kl_ltx2.py` VAE scanning is slightly different.
-    # It scans over `resnets`.
-    # Diffusers has `down_blocks.0.resnets.0`, `down_blocks.0.resnets.1`.
-    # We need to stack these.
-
+    if ".resnets." in renamed_pt_key:
+      # pattern: resnets.0 -> resnets_0
+      parts = renamed_pt_key.split(".")
+      new_parts = []
+      i = 0
+      while i < len(parts):
+        if parts[i] == "resnets" and i+1 < len(parts) and parts[i+1].isdigit():
+          new_parts.append(f"resnets_{parts[i+1]}")
+          i += 2
+        else:
+          new_parts.append(parts[i])
+          i += 1
+      renamed_pt_key = ".".join(new_parts)
+
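The added block rewrites numeric resnet indices into attribute-style names, which suggests the Flax VAE exposes each resnet as a separate `resnets_0`, `resnets_1`, ... module rather than a scanned stack. A self-contained replica of the loop, run on invented keys:

```python
def flatten_resnet_index(key):
  parts = key.split(".")
  new_parts = []
  i = 0
  while i < len(parts):
    if parts[i] == "resnets" and i + 1 < len(parts) and parts[i + 1].isdigit():
      new_parts.append(f"resnets_{parts[i + 1]}")  # resnets.0 -> resnets_0
      i += 2
    else:
      new_parts.append(parts[i])
      i += 1
  return ".".join(new_parts)

assert flatten_resnet_index("down_blocks.0.resnets.1.conv1.weight") == "down_blocks.0.resnets_1.conv1.weight"
# A "resnets" segment without a numeric index is left untouched:
assert flatten_resnet_index("mid_block.resnets.norm.weight") == "mid_block.resnets.norm.weight"
```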
     pt_tuple_key = tuple(renamed_pt_key.split("."))
 
-    # Let's add VAE scanning logic here or in a helper
-    # Identifying keys to stack: keys containing `resnets._`
-
-    # Simplified VAE Loading (non-scanned or manual stacking):
-    # If `rename_key_and_reshape_tensor` expects exact matching, we might have trouble if keys are "resnets.0" but flax expects "resnets" (stacked).
-
-    # I will implement a check: if key has `resnets.N`, we try to stack it.
-
     flax_key, flax_tensor = rename_key_and_reshape_tensor(pt_tuple_key, tensor, random_flax_state_dict)
-
-    # If it didn't match immediately, check if it's a resnet layer that needs stacking
-    # This part is tricky without strictly knowing num_layers per block.
-    # But we can infer or just load individually if Flax wasn't scanned?
-    # The Flax code definitely uses scan.
-
-    # HACK: For VAE, let's assume we might need to manually stack or map to specific indices if `rename_key_and_reshape_tensor` didn't catch it.
-    # But for now, let's just use `rename_key_and_reshape_tensor` and `validate_flax_state_dict` will tell us what failed.
-
     flax_key = _tuple_str_to_int(flax_key)
 
-    # Manual VAE Stacking logic if needed:
-    # if "resnets" in flax_key and generic match failed...
-
-    # Let's rely on `validate_flax_state_dict` to debug VAE mapping in the test phase if it's complex.
-    # But I should probably add the `resnets` -> `resnets.layers` replacement to be safe?
-    # Wait, if I replace `resnets.0` with `resnets.layers.0`, and Flax expects `resnets` (stacked), it still won't match.
-    # Flax `nnx.vmap` with `transform_metadata={nnx.PARTITION_NAME: "layers"}` usually expects a stacked axis.
-    # The parameter key in `state_dict` for a vmapped layer often depends on how it's stored.
-    # In NNX/Flax, it might be stored as `resnets.layers`? No, usually just `resnets` with an extra dim?
-    # Or `resnets.layers.kernel` if it kept the name.
-
     flax_state_dict[flax_key] = jax.device_put(jnp.asarray(flax_tensor), device=cpu)
 
   validate_flax_state_dict(eval_shapes, flax_state_dict)
