Skip to content

Commit 07bbd6b

Browse files
committed
fix
1 parent 6422e93 commit 07bbd6b

2 files changed

Lines changed: 7 additions & 8 deletions

File tree

src/maxdiffusion/models/ltx2/ltx2_utils.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,16 +30,12 @@ def rename_for_ltx2_transformer(key):
3030
key = key.replace("patchify_proj", "proj_in")
3131
key = key.replace("audio_patchify_proj", "audio_proj_in")
3232

33-
if "caption_projection" in key:
34-
key = key.replace("caption_projection", "audio_caption_projection")
33+
# if "caption_projection" in key:
34+
# key = key.replace("caption_projection", "audio_caption_projection")
3535

3636
# Handle audio_ff.net_0.proj -> audio_ff.net_0
3737
if "audio_ff" in key and "proj" in key:
3838
key = key.replace(".proj", "")
39-
40-
# This line was redundant, keeping it as a no-op or removing it is fine.
41-
# The instruction implies it should be `return key` at the end.
42-
key = key.replace("transformer_blocks", "transformer_blocks")
4339

4440
# Handle to_out.0 -> to_out for LTX2Attention
4541
if "to_out.0" in key:
@@ -241,6 +237,9 @@ def load_vae_weights(
241237
pt_list.append(str(idx))
242238
else:
243239
pt_list.append(part)
240+
elif part in ["conv1", "conv2", "conv_in", "conv_out", "conv_shortcut", "conv"]:
241+
pt_list.append(part)
242+
pt_list.append("conv")
244243
else:
245244
pt_list.append(part)
246245

src/maxdiffusion/tests/test_ltx2_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,13 +57,13 @@ def test_load_transformer_weights(self):
5757
patch_size_t=self.config.patch_size_t,
5858
num_attention_heads=self.config.num_attention_heads,
5959
attention_head_dim=self.config.attention_head_dim,
60-
cross_attention_dim=self.config.cross_attention_dim,
60+
cross_attention_dim=4096, # T5-XXL uses 4096
6161
audio_in_channels=self.config.audio_in_channels,
6262
audio_out_channels=self.config.audio_out_channels,
6363
audio_patch_size=self.config.audio_patch_size,
6464
audio_patch_size_t=self.config.audio_patch_size_t,
6565
audio_num_attention_heads=self.config.audio_num_attention_heads,
66-
audio_attention_head_dim=128, # Match Config/Checkpoint
66+
audio_attention_head_dim=64, # Match Checkpoint (2048 / 32)
6767
audio_cross_attention_dim=self.config.audio_cross_attention_dim,
6868
num_layers=self.config.num_layers,
6969
scan_layers=True,

0 commit comments

Comments (0)