Skip to content

Commit b02383b

Browse files
committed
attention head and audio cross attn head dim change
1 parent a4c45d6 commit b02383b

2 files changed

Lines changed: 3 additions & 3 deletions

File tree

src/maxdiffusion/models/ltx2/ltx2_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ def get_key_and_value(pt_tuple_key, tensor, flax_state_dict, random_flax_state_d
145145
flax_key, flax_tensor = rename_key_and_reshape_tensor(pt_tuple_key, tensor, random_flax_state_dict, scan_layers)
146146

147147
# Transpose back caption projections for LTX-2.3 as they are already in JAX format or shouldn't be transposed
148-
if "caption_projection" in flax_key or "audio_caption_projection" in flax_key:
148+
if ("caption_projection" in flax_key or "audio_caption_projection" in flax_key) and "timestep_embedder" not in flax_key:
149149
if "kernel" in flax_key and flax_tensor.ndim == 2:
150150
flax_tensor = flax_tensor.T
151151

src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,11 +118,11 @@ def create_model(rngs: nnx.Rngs, ltx2_config: dict):
118118
"in_channels": 128,
119119
"num_attention_heads": 32,
120120
"attention_head_dim": 128,
121-
"cross_attention_dim": 4096,
121+
"cross_attention_dim": 8192,
122122
"audio_in_channels": 128,
123123
"audio_num_attention_heads": 32,
124124
"audio_attention_head_dim": 64,
125-
"audio_cross_attention_dim": 2048,
125+
"audio_cross_attention_dim": 4096,
126126
"num_layers": 48,
127127
"caption_channels": 8192,
128128
"audio_caption_channels": 4096,

0 commit comments

Comments (0)