Changes to attention, video VAE, and weight loading

prishajain1 · prishajain1 · commit 3f2320cae031 · 2026-04-24T12:11:20.000+05:30
diff --git a/src/maxdiffusion/models/ltx2/attention_ltx2.py b/src/maxdiffusion/models/ltx2/attention_ltx2.py
@@ -353,6 +353,7 @@ def __init__(
       qkv_sharding_spec: Optional[tuple] = None,
       out_sharding_spec: Optional[tuple] = None,
       out_bias_sharding_spec: Optional[tuple] = None,
+      gated_attn: bool = False,
   ):
     self.heads = heads
     self.rope_type = rope_type
@@ -444,6 +445,17 @@ def __init__(
     else:
       self.dropout_layer = None
 
+    if gated_attn:
+      self.to_gate_logits = nnx.Linear(
+          query_dim,
+          heads,
+          use_bias=True,
+          kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("embed", "heads")),
+          bias_init=nnx.with_partitioning(nnx.initializers.zeros_init(), ("heads",)),
+          rngs=rngs,
+          dtype=dtype,
+      )
+
     self.attention_op = NNXAttentionOp(
         mesh=mesh,
         attention_kernel=attention_kernel,
@@ -464,6 +476,7 @@ def __call__(
       attention_mask: Optional[Array] = None,
       rotary_emb: Optional[Tuple[Array, Array]] = None,
       k_rotary_emb: Optional[Tuple[Array, Array]] = None,
+      perturbation_mask: Optional[Array] = None,
   ) -> Array:
     # Determine context (Self or Cross)
     context = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
@@ -507,6 +520,17 @@ def __call__(
       # NNXAttentionOp expects flattened input [B, S, InnerDim] for flash kernel
       attn_output = self.attention_op.apply_attention(query=query, key=key, value=value, attention_mask=attention_mask)
 
+      if perturbation_mask is not None:
+        attn_output = value + perturbation_mask * (attn_output - value)
+
+      if getattr(self, "to_gate_logits", None) is not None:
+        gate_logits = self.to_gate_logits(hidden_states)
+        b, s, _ = attn_output.shape
+        attn_output = attn_output.reshape(b, s, self.heads, self.dim_head)
+        gates = 2.0 * jax.nn.sigmoid(gate_logits)
+        attn_output = attn_output * jnp.expand_dims(gates, axis=-1)
+        attn_output = attn_output.reshape(b, s, -1)
+
       # 7. Output Projection
       hidden_states = self.to_out(attn_output)
 
diff --git a/src/maxdiffusion/models/ltx2/autoencoder_kl_ltx2.py b/src/maxdiffusion/models/ltx2/autoencoder_kl_ltx2.py
@@ -668,6 +668,7 @@ def __init__(
       timestep_conditioning: bool = False,
       upsample_residual: bool = False,
       upscale_factor: int = 1,
+      upsample_type: str = "spatiotemporal",
       spatial_padding_mode: str = "constant",
       rngs: Optional[nnx.Rngs] = None,
       mesh: Optional[jax.sharding.Mesh] = None,
@@ -711,9 +712,18 @@ def __init__(
       )
 
     if spatio_temporal_scale:
+      if upsample_type == "spatiotemporal":
+        stride = (2, 2, 2)
+      elif upsample_type == "temporal":
+        stride = (2, 1, 1)
+      elif upsample_type == "spatial":
+        stride = (1, 2, 2)
+      else:
+        raise ValueError(f"Unknown upsample_type: {upsample_type}")
+
       self.upsampler = LTXVideoUpsampler3d(
           in_channels=out_channels * upscale_factor,
-          stride=(2, 2, 2),
+          stride=stride,
           residual=upsample_residual,
           upscale_factor=upscale_factor,
           spatial_padding_mode=spatial_padding_mode,
@@ -954,6 +964,7 @@ def __init__(
       timestep_conditioning: bool = False,
       upsample_residual: Tuple[bool, ...] = (True, True, True),
       upsample_factor: Tuple[int, ...] = (2, 2, 2),
+      upsample_type: Tuple[str, ...] = ("spatiotemporal", "spatiotemporal", "spatiotemporal"),
       spatial_padding_mode: str = "reflect",
       rngs: Optional[nnx.Rngs] = None,
       mesh: Optional[jax.sharding.Mesh] = None,
@@ -1020,6 +1031,7 @@ def __init__(
               timestep_conditioning=timestep_conditioning,
               upsample_residual=upsample_residual[i],
               upscale_factor=upsample_factor[i],
+              upsample_type=upsample_type[i],
               spatial_padding_mode=spatial_padding_mode,
               rngs=rngs,
               mesh=mesh,
@@ -1139,6 +1151,7 @@ def __init__(
       downsample_type: Tuple[str, ...] = ("spatial", "temporal", "spatiotemporal", "spatiotemporal"),
       upsample_residual: Tuple[bool, ...] = (True, True, True),
       upsample_factor: Tuple[int, ...] = (2, 2, 2),
+      upsample_type: Tuple[str, ...] = ("spatiotemporal", "spatiotemporal", "spatiotemporal"),
       timestep_conditioning: bool = False,
       patch_size: int = 4,
       patch_size_t: int = 1,
@@ -1184,6 +1197,7 @@ def __init__(
         spatio_temporal_scaling=decoder_spatio_temporal_scaling,
         upsample_factor=upsample_factor,
         upsample_residual=upsample_residual,
+        upsample_type=upsample_type,
         patch_size=patch_size,
         patch_size_t=patch_size_t,
         resnet_norm_eps=resnet_norm_eps,
diff --git a/src/maxdiffusion/models/ltx2/ltx2_utils.py b/src/maxdiffusion/models/ltx2/ltx2_utils.py
@@ -40,6 +40,11 @@ def rename_for_ltx2_transformer(key):
   """
   Renames Diffusers LTX-2 keys to MaxDiffusion Flax LTX-2 keys.
   """
+  if "caption_proj" in key and "caption_projection" not in key:
+      key = key.replace("caption_proj", "caption_projection")
+  if "audio_caption_proj" in key and "audio_caption_projection" not in key:
+      key = key.replace("audio_caption_proj", "audio_caption_projection")
+
   key = key.replace("patchify_proj", "proj_in")
   key = key.replace("audio_patchify_proj", "audio_proj_in")
   key = key.replace("norm_final", "norm_out")
@@ -289,11 +294,21 @@ def load_vocoder_weights(
 
     flax_key = _tuple_str_to_int(parts)
 
+    # Skip filter keys: they are derived in the NNX model, so they are not added to the loaded state dict.
+    if "filter" in flax_key:
+      continue
+
     if flax_key[-1] == "kernel":
       if "upsamplers" in flax_key:
-        tensor = tensor.transpose(2, 0, 1)[::-1, :, :]
+        if "2.3" in pretrained_model_name_or_path:
+          tensor = tensor.transpose(2, 0, 1)
+        else:
+          tensor = tensor.transpose(2, 0, 1)[::-1, :, :]
       else:
         tensor = tensor.transpose(2, 1, 0)
+        
+    if "mel_stft" in flax_key and ("forward_basis" in flax_key or "inverse_basis" in flax_key):
+      tensor = tensor.transpose(2, 1, 0)
 
     flax_state_dict[flax_key] = jax.device_put(tensor, device=cpu)
 
@@ -305,6 +320,8 @@ def rename_for_ltx2_connector(key):
   key = key.replace("video_connector", "video_embeddings_connector")
   key = key.replace("audio_connector", "audio_embeddings_connector")
   key = key.replace("text_proj_in", "feature_extractor.linear")
+  key = key.replace("audio_feature_extractor.linear", "audio_text_proj_in")
+  key = key.replace("video_feature_extractor.linear", "video_text_proj_in")
 
   if "transformer_blocks" in key:
     key = key.replace("transformer_blocks", "stacked_blocks")