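Fix LTX2 weight loading: map to_out.0 to to_out, simplify resnets key renaming with re.sub, and set audio_attention_head_dim to 128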

prishajain1 · prishajain1 · commit 4c4446cdc4a8 · 2026-02-20T18:26:15.000+05:30
diff --git a/src/maxdiffusion/configs/ltx2_video.yml b/src/maxdiffusion/configs/ltx2_video.yml
@@ -3,6 +3,7 @@ hardware: 'tpu'
 skip_jax_distributed_system: False
 attention: 'flash'
 attention_sharding_uniform: True 
+audio_attention_head_dim: 128
 
 jax_cache_dir: ''
 weights_dtype: 'bfloat16'
diff --git a/src/maxdiffusion/models/ltx2/ltx2_utils.py b/src/maxdiffusion/models/ltx2/ltx2_utils.py
@@ -40,6 +40,11 @@ def rename_for_ltx2_transformer(key):
     # This line was redundant, keeping it as a no-op or removing it is fine.
     # The instruction implies it should be `return key` at the end.
     key = key.replace("transformer_blocks", "transformer_blocks")
+    
+    # Handle to_out.0 -> to_out for LTX2Attention
+    if "to_out.0" in key:
+        key = key.replace("to_out.0", "to_out")
+        
     return key
 
 
@@ -145,9 +150,16 @@ def load_transformer_weights(
     for k in list(tensors.keys())[:20]:
         print(k)
         
+        
     print("\nDEBUG: Top 20 keys from Flax Model (eval_shapes):")
     for k in list(random_flax_state_dict.keys())[:20]:
         print(k)
+
+    print("\nDEBUG: Transformer Block keys from Flax Model (eval_shapes):")
+    for k in list(random_flax_state_dict.keys()):
+        if "transformer_blocks" in k and "attn1" in k:
+             print(k)
+             break
         
     for pt_key, tensor in tensors.items():
         renamed_pt_key = rename_key(pt_key)
@@ -211,17 +223,10 @@ def load_vae_weights(
           renamed_pt_key = rename_key(pt_key)
           if ".resnets." in renamed_pt_key:
              # pattern: resnets.0 -> resnets_0
-             parts = renamed_pt_key.split(".")
-             new_parts = []
-             i = 0
-             while i < len(parts):
-                 if parts[i] == "resnets" and i+1 < len(parts) and parts[i+1].isdigit():
-                     new_parts.append(f"resnets_{parts[i+1]}")
-                     i += 2
-                 else:
-                     new_parts.append(parts[i])
-                     i += 1
-             renamed_pt_key = ".".join(new_parts)
+             # We need to capture the number after resnets
+             import re
+             # Replace resnets.N with resnets_N
+             renamed_pt_key = re.sub(r"resnets\.(\d+)", r"resnets_\1", renamed_pt_key)
              
           pt_tuple_key = tuple(renamed_pt_key.split("."))
           
diff --git a/src/maxdiffusion/tests/test_ltx2_utils.py b/src/maxdiffusion/tests/test_ltx2_utils.py
@@ -29,19 +29,29 @@ def test_load_transformer_weights(self):
         pretrained_model_name_or_path = "Lightricks/LTX-2"
         
         with jax.default_device(jax.devices("cpu")[0]):
-             model = LTX2VideoTransformer3DModel(
-                rngs=self.rngs,
-                # Explicitly setting key params to version 2.0 to be safe
-                in_channels=128,
-                out_channels=128,
-                patch_size=1,
-                patch_size_t=1,
-                num_attention_heads=32,
-                attention_head_dim=128,
-                cross_attention_dim=4096,
-                num_layers=48,
-                scan_layers=True 
-             )
+            self.config = LTX2VideoConfig()
+            self.config.audio_attention_head_dim = 128 # Match Checkpoint
+            
+            self.transformer = LTX2VideoTransformer3DModel(
+                in_channels=self.config.in_channels,
+                out_channels=self.config.out_channels,
+                patch_size=self.config.patch_size,
+                patch_size_t=self.config.patch_size_t,
+                num_attention_heads=self.config.num_attention_heads,
+                attention_head_dim=self.config.attention_head_dim,
+                cross_attention_dim=self.config.cross_attention_dim,
+                audio_in_channels=self.config.audio_in_channels,
+                audio_out_channels=self.config.audio_out_channels,
+                audio_patch_size=self.config.audio_patch_size,
+                audio_patch_size_t=self.config.audio_patch_size_t,
+                audio_num_attention_heads=self.config.audio_num_attention_heads,
+                audio_attention_head_dim=128, # Match Config/Checkpoint
+                audio_cross_attention_dim=self.config.audio_cross_attention_dim,
+                num_layers=self.config.num_layers,
+                scan_layers=True,
+                param_dtype=jnp.bfloat16,
+                rngs=nnx.Rngs(0),
+            )
         
         # Get abstract state (shapes only)
         # We need the PyTree structure of parameters