Add jax.named_scope annotations to the LTX2 attention and transformer __call__ methods

prishajain1 · prishajain1 · commit 25123c6e76ee · 2026-04-01T09:27:49.000+05:30
diff --git a/src/maxdiffusion/models/ltx2/attention_ltx2.py b/src/maxdiffusion/models/ltx2/attention_ltx2.py
@@ -16,6 +16,7 @@
 
 from typing import Optional, Tuple
 from flax import nnx
+import jax
 import jax.numpy as jnp
 from ... import common_types
 from ..attention_flax import NNXAttentionOp
@@ -446,46 +447,48 @@ def __call__(
     # Determine context (Self or Cross)
     context = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
 
-    # 1. Project
-    query = self.to_q(hidden_states)
-    key = self.to_k(context)
-    value = self.to_v(context)
+    # 1. Project and Norm
+    with jax.named_scope("QKV Projection and Norm"):
+      query = self.to_q(hidden_states)
+      key = self.to_k(context)
+      value = self.to_v(context)
 
-    # 2. Norm (Full Inner Dimension)
-    query = self.norm_q(query)
-    key = self.norm_k(key)
+      query = self.norm_q(query)
+      key = self.norm_k(key)
 
     # 3. Apply RoPE to tensors of shape [B, S, InnerDim]
     # Frequencies are shape [B, S, InnerDim]
     # 3. Apply RoPE
-    if rotary_emb is not None:
-      if hasattr(self, "rope_type") and self.rope_type == "split":
-        # Split RoPE: passing full freqs [B, H, S, D//2]
-        # apply_split_rotary_emb handles reshaping query/key
-
-        query = apply_split_rotary_emb(query, rotary_emb)
-
-        if k_rotary_emb is not None:
-          key = apply_split_rotary_emb(key, k_rotary_emb)
-        elif encoder_hidden_states is None:
-          key = apply_split_rotary_emb(key, rotary_emb)
-
-      else:
-        # Interleaved (Default)
-        query = apply_rotary_emb(query, rotary_emb)
-        if k_rotary_emb is not None:
-          key = apply_rotary_emb(key, k_rotary_emb)
-        elif encoder_hidden_states is None:
-          key = apply_rotary_emb(key, rotary_emb)
-
-    # 4. Attention
-    # NNXAttentionOp expects flattened input [B, S, InnerDim] for flash kernel
-    attn_output = self.attention_op.apply_attention(query=query, key=key, value=value, attention_mask=attention_mask)
-
-    # 7. Output Projection
-    hidden_states = self.to_out(attn_output)
-
-    if self.dropout_layer is not None:
-      hidden_states = self.dropout_layer(hidden_states)
+    with jax.named_scope("Apply RoPE"):
+      if rotary_emb is not None:
+        if hasattr(self, "rope_type") and self.rope_type == "split":
+          # Split RoPE: passing full freqs [B, H, S, D//2]
+          # apply_split_rotary_emb handles reshaping query/key
+
+          query = apply_split_rotary_emb(query, rotary_emb)
+
+          if k_rotary_emb is not None:
+            key = apply_split_rotary_emb(key, k_rotary_emb)
+          elif encoder_hidden_states is None:
+            key = apply_split_rotary_emb(key, rotary_emb)
+
+        else:
+          # Interleaved (Default)
+          query = apply_rotary_emb(query, rotary_emb)
+          if k_rotary_emb is not None:
+            key = apply_rotary_emb(key, k_rotary_emb)
+          elif encoder_hidden_states is None:
+            key = apply_rotary_emb(key, rotary_emb)
+
+    with jax.named_scope("Attention and Output Project"):
+      # 4. Attention
+      # NNXAttentionOp expects flattened input [B, S, InnerDim] for flash kernel
+      attn_output = self.attention_op.apply_attention(query=query, key=key, value=value, attention_mask=attention_mask)
+
+      # 7. Output Projection
+      hidden_states = self.to_out(attn_output)
+
+      if self.dropout_layer is not None:
+        hidden_states = self.dropout_layer(hidden_states)
 
     return hidden_states
diff --git a/src/maxdiffusion/models/ltx2/transformer_ltx2.py b/src/maxdiffusion/models/ltx2/transformer_ltx2.py
@@ -900,113 +900,79 @@ def __call__(
     batch_size = hidden_states.shape[0]
 
     # 1. Prepare RoPE positional embeddings
-    if video_coords is None:
-      video_coords = self.rope.prepare_video_coords(batch_size, num_frames, height, width, fps=fps)
-    if audio_coords is None:
-      audio_coords = self.audio_rope.prepare_audio_coords(batch_size, audio_num_frames)
+    with jax.named_scope("RoPE Preparation"):
+      if video_coords is None:
+        video_coords = self.rope.prepare_video_coords(batch_size, num_frames, height, width, fps=fps)
+      if audio_coords is None:
+        audio_coords = self.audio_rope.prepare_audio_coords(batch_size, audio_num_frames)
 
-    video_rotary_emb = self.rope(video_coords)
-    audio_rotary_emb = self.audio_rope(audio_coords)
+      video_rotary_emb = self.rope(video_coords)
+      audio_rotary_emb = self.audio_rope(audio_coords)
 
-    video_cross_attn_rotary_emb = self.cross_attn_rope(video_coords[:, 0:1, :])
-    audio_cross_attn_rotary_emb = self.cross_attn_audio_rope(audio_coords[:, 0:1, :])
+      video_cross_attn_rotary_emb = self.cross_attn_rope(video_coords[:, 0:1, :])
+      audio_cross_attn_rotary_emb = self.cross_attn_audio_rope(audio_coords[:, 0:1, :])
 
     # 2. Patchify input projections
-    hidden_states = self.proj_in(hidden_states)
-    audio_hidden_states = self.audio_proj_in(audio_hidden_states)
+    with jax.named_scope("Input Projection"):
+      hidden_states = self.proj_in(hidden_states)
+      audio_hidden_states = self.audio_proj_in(audio_hidden_states)
 
     # 3. Prepare timestep embeddings and modulation parameters
-    timestep_cross_attn_gate_scale_factor = self.cross_attn_timestep_scale_multiplier / self.timestep_scale_multiplier
+    with jax.named_scope("Timestep and Caption Projection"):
+      timestep_cross_attn_gate_scale_factor = self.cross_attn_timestep_scale_multiplier / self.timestep_scale_multiplier
 
-    temb, embedded_timestep = self.time_embed(
-        timestep.flatten(),
-        hidden_dtype=hidden_states.dtype,
-    )
-    temb = temb.reshape(batch_size, -1, temb.shape[-1])
-    embedded_timestep = embedded_timestep.reshape(batch_size, -1, embedded_timestep.shape[-1])
+      temb, embedded_timestep = self.time_embed(
+          timestep.flatten(),
+          hidden_dtype=hidden_states.dtype,
+      )
+      temb = temb.reshape(batch_size, -1, temb.shape[-1])
+      embedded_timestep = embedded_timestep.reshape(batch_size, -1, embedded_timestep.shape[-1])
 
-    temb_audio, audio_embedded_timestep = self.audio_time_embed(
-        audio_timestep.flatten(),
-        hidden_dtype=audio_hidden_states.dtype,
-    )
-    temb_audio = temb_audio.reshape(batch_size, -1, temb_audio.shape[-1])
-    audio_embedded_timestep = audio_embedded_timestep.reshape(batch_size, -1, audio_embedded_timestep.shape[-1])
+      temb_audio, audio_embedded_timestep = self.audio_time_embed(
+          audio_timestep.flatten(),
+          hidden_dtype=audio_hidden_states.dtype,
+      )
+      temb_audio = temb_audio.reshape(batch_size, -1, temb_audio.shape[-1])
+      audio_embedded_timestep = audio_embedded_timestep.reshape(batch_size, -1, audio_embedded_timestep.shape[-1])
 
-    video_cross_attn_scale_shift, _ = self.av_cross_attn_video_scale_shift(
-        timestep.flatten(),
-        hidden_dtype=hidden_states.dtype,
-    )
-    video_cross_attn_a2v_gate, _ = self.av_cross_attn_video_a2v_gate(
-        timestep.flatten() * timestep_cross_attn_gate_scale_factor,
-        hidden_dtype=hidden_states.dtype,
-    )
-    video_cross_attn_scale_shift = video_cross_attn_scale_shift.reshape(
-        batch_size, -1, video_cross_attn_scale_shift.shape[-1]
-    )
-    video_cross_attn_a2v_gate = video_cross_attn_a2v_gate.reshape(batch_size, -1, video_cross_attn_a2v_gate.shape[-1])
+      video_cross_attn_scale_shift, _ = self.av_cross_attn_video_scale_shift(
+          timestep.flatten(),
+          hidden_dtype=hidden_states.dtype,
+      )
+      video_cross_attn_a2v_gate, _ = self.av_cross_attn_video_a2v_gate(
+          timestep.flatten() * timestep_cross_attn_gate_scale_factor,
+          hidden_dtype=hidden_states.dtype,
+      )
+      video_cross_attn_scale_shift = video_cross_attn_scale_shift.reshape(
+          batch_size, -1, video_cross_attn_scale_shift.shape[-1]
+      )
+      video_cross_attn_a2v_gate = video_cross_attn_a2v_gate.reshape(batch_size, -1, video_cross_attn_a2v_gate.shape[-1])
 
-    audio_cross_attn_scale_shift, _ = self.av_cross_attn_audio_scale_shift(
-        audio_timestep.flatten(),
-        hidden_dtype=audio_hidden_states.dtype,
-    )
-    audio_cross_attn_v2a_gate, _ = self.av_cross_attn_audio_v2a_gate(
-        audio_timestep.flatten() * timestep_cross_attn_gate_scale_factor,
-        hidden_dtype=audio_hidden_states.dtype,
-    )
-    audio_cross_attn_scale_shift = audio_cross_attn_scale_shift.reshape(
-        batch_size, -1, audio_cross_attn_scale_shift.shape[-1]
-    )
-    audio_cross_attn_v2a_gate = audio_cross_attn_v2a_gate.reshape(batch_size, -1, audio_cross_attn_v2a_gate.shape[-1])
+      audio_cross_attn_scale_shift, _ = self.av_cross_attn_audio_scale_shift(
+          audio_timestep.flatten(),
+          hidden_dtype=audio_hidden_states.dtype,
+      )
+      audio_cross_attn_v2a_gate, _ = self.av_cross_attn_audio_v2a_gate(
+          audio_timestep.flatten() * timestep_cross_attn_gate_scale_factor,
+          hidden_dtype=audio_hidden_states.dtype,
+      )
+      audio_cross_attn_scale_shift = audio_cross_attn_scale_shift.reshape(
+          batch_size, -1, audio_cross_attn_scale_shift.shape[-1]
+      )
+      audio_cross_attn_v2a_gate = audio_cross_attn_v2a_gate.reshape(batch_size, -1, audio_cross_attn_v2a_gate.shape[-1])
 
-    # 4. Prepare prompt embeddings
-    encoder_hidden_states = self.caption_projection(encoder_hidden_states)
-    encoder_hidden_states = encoder_hidden_states.reshape(batch_size, -1, hidden_states.shape[-1])
+      # 4. Prepare prompt embeddings
+      encoder_hidden_states = self.caption_projection(encoder_hidden_states)
+      encoder_hidden_states = encoder_hidden_states.reshape(batch_size, -1, hidden_states.shape[-1])
 
-    audio_encoder_hidden_states = self.audio_caption_projection(audio_encoder_hidden_states)
-    audio_encoder_hidden_states = audio_encoder_hidden_states.reshape(batch_size, -1, audio_hidden_states.shape[-1])
+      audio_encoder_hidden_states = self.audio_caption_projection(audio_encoder_hidden_states)
+      audio_encoder_hidden_states = audio_encoder_hidden_states.reshape(batch_size, -1, audio_hidden_states.shape[-1])
 
     # 5. Run transformer blocks
     def scan_fn(carry, block):
       hidden_states, audio_hidden_states, rngs_carry = carry
-      hidden_states_out, audio_hidden_states_out = block(
-          hidden_states=hidden_states,
-          audio_hidden_states=audio_hidden_states,
-          encoder_hidden_states=encoder_hidden_states,
-          audio_encoder_hidden_states=audio_encoder_hidden_states,
-          temb=temb,
-          temb_audio=temb_audio,
-          temb_ca_scale_shift=video_cross_attn_scale_shift,
-          temb_ca_audio_scale_shift=audio_cross_attn_scale_shift,
-          temb_ca_gate=video_cross_attn_a2v_gate,
-          temb_ca_audio_gate=audio_cross_attn_v2a_gate,
-          video_rotary_emb=video_rotary_emb,
-          audio_rotary_emb=audio_rotary_emb,
-          ca_video_rotary_emb=video_cross_attn_rotary_emb,
-          ca_audio_rotary_emb=audio_cross_attn_rotary_emb,
-          encoder_attention_mask=encoder_attention_mask,
-          audio_encoder_attention_mask=audio_encoder_attention_mask,
-      )
-      return (
-          hidden_states_out.astype(hidden_states.dtype),
-          audio_hidden_states_out.astype(audio_hidden_states.dtype),
-          rngs_carry,
-      ), None
-
-    if self.scan_layers:
-      rematted_scan_fn = self.gradient_checkpoint.apply(
-          scan_fn, self.names_which_can_be_saved, self.names_which_can_be_offloaded, prevent_cse=not self.scan_layers
-      )
-      carry = (hidden_states, audio_hidden_states, nnx.Rngs(0))  # Placeholder RNGs for now if not used in block
-      (hidden_states, audio_hidden_states, _), _ = nnx.scan(
-          rematted_scan_fn,
-          length=self.num_layers,
-          in_axes=(nnx.Carry, 0),
-          out_axes=(nnx.Carry, 0),
-          transform_metadata={nnx.PARTITION_NAME: "layers"},
-      )(carry, self.transformer_blocks)
-    else:
-      for block in self.transformer_blocks:
-        hidden_states, audio_hidden_states = block(
+      with jax.named_scope("Transformer Block i"):
+        hidden_states_out, audio_hidden_states_out = block(
             hidden_states=hidden_states,
             audio_hidden_states=audio_hidden_states,
             encoder_hidden_states=encoder_hidden_states,
@@ -1024,6 +990,45 @@ def scan_fn(carry, block):
             encoder_attention_mask=encoder_attention_mask,
             audio_encoder_attention_mask=audio_encoder_attention_mask,
         )
+      return (
+          hidden_states_out.astype(hidden_states.dtype),
+          audio_hidden_states_out.astype(audio_hidden_states.dtype),
+          rngs_carry,
+      ), None
+
+    with jax.named_scope("Transformer Blocks"):
+      if self.scan_layers:
+        rematted_scan_fn = self.gradient_checkpoint.apply(
+            scan_fn, self.names_which_can_be_saved, self.names_which_can_be_offloaded, prevent_cse=not self.scan_layers
+        )
+        carry = (hidden_states, audio_hidden_states, nnx.Rngs(0))  # Placeholder RNGs for now if not used in block
+        (hidden_states, audio_hidden_states, _), _ = nnx.scan(
+            rematted_scan_fn,
+            length=self.num_layers,
+            in_axes=(nnx.Carry, 0),
+            out_axes=(nnx.Carry, 0),
+            transform_metadata={nnx.PARTITION_NAME: "layers"},
+        )(carry, self.transformer_blocks)
+      else:
+        for block in self.transformer_blocks:
+          hidden_states, audio_hidden_states = block(
+              hidden_states=hidden_states,
+              audio_hidden_states=audio_hidden_states,
+              encoder_hidden_states=encoder_hidden_states,
+              audio_encoder_hidden_states=audio_encoder_hidden_states,
+              temb=temb,
+              temb_audio=temb_audio,
+              temb_ca_scale_shift=video_cross_attn_scale_shift,
+              temb_ca_audio_scale_shift=audio_cross_attn_scale_shift,
+              temb_ca_gate=video_cross_attn_a2v_gate,
+              temb_ca_audio_gate=audio_cross_attn_v2a_gate,
+              video_rotary_emb=video_rotary_emb,
+              audio_rotary_emb=audio_rotary_emb,
+              ca_video_rotary_emb=video_cross_attn_rotary_emb,
+              ca_audio_rotary_emb=audio_cross_attn_rotary_emb,
+              encoder_attention_mask=encoder_attention_mask,
+              audio_encoder_attention_mask=audio_encoder_attention_mask,
+          )
 
     # 6. Output layers
     scale_shift_values = jnp.expand_dims(self.scale_shift_table, axis=(0, 1)) + jnp.expand_dims(embedded_timestep, axis=2)
diff --git a/src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py b/src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py