Skip to content

Commit 1f7888b

Browse files
committed
Attention
1 parent c127236 commit 1f7888b

2 files changed

Lines changed: 74 additions & 34 deletions

File tree

src/maxdiffusion/models/ltx2/attention_ltx2.py

Lines changed: 65 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -26,22 +26,39 @@
2626

2727

2828
def apply_rotary_emb(x: Array, freqs: Tuple[Array, Array]) -> Array:
  """
  Applies Interleaved RoPE to input x.
  Logic matches LTX-2 PyTorch: pairs neighbors [-x2, x1].

  Args:
    x: Input tensor [B, S, H, D]
    freqs: Tuple of (cos, sin), broadcasting to [B, S, 1, D] or [B, S, H, D]
  """
  cos, sin = freqs

  # Pair adjacent channels: [..., D] -> [..., D//2, 2]
  # (equivalent to rearrange "... (d r) -> ... d r", r=2)
  paired = x.reshape(*x.shape[:-1], -1, 2)
  even, odd = paired[..., 0], paired[..., 1]

  # Interleaved rotation [-x2, x1], i.e. stack((-t2, t1)) in the reference impl.
  rotated = jnp.stack([-odd, even], axis=-1).reshape(*x.shape)

  # Combine in float32 for numerical stability, then restore input dtype.
  mixed = x.astype(jnp.float32) * cos + rotated.astype(jnp.float32) * sin
  return mixed.astype(x.dtype)
4255

4356

4457
class LTX2RotaryPosEmbed(nnx.Module):
  """
  RoPE implementation that accepts pre-computed position IDs.
  Allows flexibility for 3D (Video) vs 1D (Audio/Temporal) usage.
  """

  def __init__(self, dim: int, theta: float = 10000.0):
    self.dim = dim
    self.theta = theta

  def __call__(self, ids: Array) -> Tuple[Array, Array]:
    """
    Generates RoPE frequencies.
    Args:
      ids: [B, S, Num_Axes]
        - For Video 3D: Num_Axes=3 (Time, Height, Width)
        - For Audio 1D: Num_Axes=1 (Time)
    Returns:
      cos, sin: [B, S, 1, Dim] (Ready for broadcasting across heads)
    """
    num_axes = ids.shape[-1]
    dim_per_axis = self.dim // num_axes

    # Standard RoPE inverse frequencies: theta^(-2i/d) for even indices i.
    exponents = jnp.arange(0, dim_per_axis, 2, dtype=jnp.float32) / dim_per_axis
    inv_freq = self.theta ** -exponents

    # Outer product per axis ([B, S] x [D_axis/2] -> [B, S, D_axis/2]),
    # then concatenate axes -> [B, S, D/2].
    per_axis = [jnp.einsum('bs,d->bsd', ids[..., i], inv_freq) for i in range(num_axes)]
    angles = jnp.concatenate(per_axis, axis=-1)

    # Duplicate each frequency for Interleaved RoPE: [c1, c2] -> [c1, c1, c2, c2],
    # and insert a head dim for broadcasting: [B, S, 1, Dim].
    cos = jnp.repeat(jnp.cos(angles), 2, axis=-1)
    sin = jnp.repeat(jnp.sin(angles), 2, axis=-1)
    return cos[:, :, None, :], sin[:, :, None, :]
80102

81103

@@ -87,7 +109,7 @@ def __init__(
87109
dim_head: int,
88110
context_dim: Optional[int] = None,
89111
dropout: float = 0.0,
90-
bias: bool = True,
112+
bias: bool = True, # LTX-2 uses bias=True for projections
91113
out_bias: bool = True,
92114
rngs: nnx.Rngs = None,
93115
mesh: Mesh = None,
@@ -100,16 +122,19 @@ def __init__(
100122
self.inner_dim = dim_head * heads
101123
self.dropout_rate = dropout
102124

125+
# 1. Projections
103126
self.to_q = nnx.Linear(query_dim, self.inner_dim, use_bias=bias, rngs=rngs, dtype=dtype)
104127

128+
# Handle Self vs Cross Attention input dims
105129
kv_dim = context_dim if context_dim is not None else query_dim
106130
self.to_k = nnx.Linear(kv_dim, self.inner_dim, use_bias=bias, rngs=rngs, dtype=dtype)
107131
self.to_v = nnx.Linear(kv_dim, self.inner_dim, use_bias=bias, rngs=rngs, dtype=dtype)
108132

109-
# Norm over full inner_dim (Fix #2)
133+
# 2. Normalization (Applied to full inner_dim, NOT per-head)
110134
self.norm_q = nnx.RMSNorm(self.inner_dim, epsilon=eps, dtype=dtype, use_scale=True, rngs=rngs)
111135
self.norm_k = nnx.RMSNorm(self.inner_dim, epsilon=eps, dtype=dtype, use_scale=True, rngs=rngs)
112136

137+
# 3. Output
113138
self.to_out = nnx.Linear(self.inner_dim, query_dim, use_bias=out_bias, rngs=rngs, dtype=dtype)
114139

115140
if self.dropout_rate > 0:
@@ -126,6 +151,18 @@ def __init__(
126151
dtype=dtype,
127152
)
128153

154+
def _reshape_rope(self, rope_emb: Tuple[Array, Array]) -> Tuple[Array, Array]:
155+
"""Reshapes [B, S, 1, InnerDim] -> [B, S, Heads, DimHead] for broadcasting."""
156+
cos, sin = rope_emb
157+
# If tests pass already shaped tensors, return as is
158+
if cos.ndim == 4 and cos.shape[-2] == self.heads and cos.shape[-1] == self.dim_head:
159+
return cos, sin
160+
161+
# Reshape: [B, S, 1, H*D] -> [B, S, H, D]
162+
# We assume the last dimension is InnerDim = Heads * DimHead
163+
new_shape = cos.shape[:-2] + (self.heads, self.dim_head)
164+
return cos.reshape(new_shape), sin.reshape(new_shape)
165+
129166
def __call__(
130167
self,
131168
hidden_states: Array,
@@ -135,33 +172,40 @@ def __call__(
135172
k_rotary_emb: Optional[Tuple[Array, Array]] = None,
136173
) -> Array:
137174

175+
# Determine context (Self or Cross)
138176
context = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
139177

140178
# 1. Project
141179
query = self.to_q(hidden_states)
142180
key = self.to_k(context)
143181
value = self.to_v(context)
144182

145-
# 2. Norm
183+
# 2. Norm (Full Inner Dimension)
146184
query = self.norm_q(query)
147185
key = self.norm_k(key)
148186

149-
# 3. Reshape for RoPE [B, S, H, D]
187+
# 3. Reshape to Heads [B, S, H, D]
150188
query = query.reshape(*query.shape[:-1], self.heads, self.dim_head)
151189
key = key.reshape(*key.shape[:-1], self.heads, self.dim_head)
152190
value = value.reshape(*value.shape[:-1], self.heads, self.dim_head)
153191

154192
# 4. Apply RoPE
155193
if rotary_emb is not None:
156-
query = apply_rotary_emb(query, rotary_emb)
194+
# Reshape [1, Inner] -> [H, D]
195+
q_rope = self._reshape_rope(rotary_emb)
196+
query = apply_rotary_emb(query, q_rope)
157197

198+
# Key RoPE Logic
158199
if k_rotary_emb is not None:
159-
key = apply_rotary_emb(key, k_rotary_emb)
160-
elif encoder_hidden_states is None: # Self-Attention
161-
key = apply_rotary_emb(key, rotary_emb)
162-
163-
# 5. Flatten back for AttentionOp (Fix #1)
164-
# [B, S, H, D] -> [B, S, H*D]
200+
# Explicit Key RoPE (e.g. Cross-Modal)
201+
k_rope = self._reshape_rope(k_rotary_emb)
202+
key = apply_rotary_emb(key, k_rope)
203+
elif encoder_hidden_states is None:
204+
# Self-Attention: Re-use q_rope
205+
key = apply_rotary_emb(key, q_rope)
206+
207+
# 5. Flatten back for AttentionOp [B, S, H*D]
208+
# NNXAttentionOp expects flattened input for flash kernel
165209
query = query.reshape(*query.shape[:-2], self.inner_dim)
166210
key = key.reshape(*key.shape[:-2], self.inner_dim)
167211
value = value.reshape(*value.shape[:-2], self.inner_dim)
@@ -171,10 +215,10 @@ def __call__(
171215
query=query, key=key, value=value, attention_mask=attention_mask
172216
)
173217

174-
# attn_output is already [B, S, H*D], no reshape needed before output proj
218+
# 7. Output Projection
175219
hidden_states = self.to_out(attn_output)
176220

177221
if self.dropout_layer is not None:
178222
hidden_states = self.dropout_layer(hidden_states)
179223

180-
return hidden_states
224+
return hidden_states

src/maxdiffusion/tests/attention_ltx2_test.py

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,14 @@
1515

1616
import unittest
1717
from flax import nnx
18-
import jax
1918
import jax.numpy as jnp
20-
# Adjust this import to match your file structure
2119
from ..models.ltx2.attention_ltx2 import LTX2Attention, LTX2RotaryPosEmbed
2220

2321

2422
class LTX2AttentionTest(unittest.TestCase):
2523

2624
def test_rope_video_shapes_3d(self):
2725
"""Test 3D RoPE generation for Video (Time, Height, Width)."""
28-
dim = 64
2926
# LTX-2 splits dim across axes. 60 is divisible by 3 (20 per axis).
3027
dim = 60
3128
rope = LTX2RotaryPosEmbed(dim=dim, theta=10000.0)
@@ -40,7 +37,6 @@ def test_rope_video_shapes_3d(self):
4037
cos, sin = rope(ids)
4138

4239
# Expected output: [B, S, 1, D] (The 1 is for broadcasting across heads)
43-
# This confirms the RoPE module outputs the correct broadcasting shape.
4440
self.assertEqual(cos.shape, (batch_size, seq_len, 1, dim))
4541
self.assertEqual(sin.shape, (batch_size, seq_len, 1, dim))
4642

@@ -51,34 +47,30 @@ def test_rope_audio_shapes_1d(self):
5147

5248
batch_size = 2
5349
seq_len = 20
54-
55-
# Create dummy position IDs for [Time]
5650
# Shape: [B, S, 1]
5751
ids = jnp.zeros((batch_size, seq_len, 1), dtype=jnp.float32)
5852

5953
cos, sin = rope(ids)
60-
61-
# Expected output: [B, S, 1, D]
6254
self.assertEqual(cos.shape, (batch_size, seq_len, 1, dim))
63-
self.assertEqual(sin.shape, (batch_size, seq_len, 1, dim))
6455

6556
def test_self_attention_forward(self):
6657
"""Test basic Self-Attention forward pass (Video <-> Video)."""
6758
dim = 64
6859
heads = 4
6960
dim_head = 16 # inner_dim = 64
7061

62+
# Use dot_product to avoid TPU mesh requirements during unit testing
7163
model = LTX2Attention(
7264
query_dim=dim,
7365
heads=heads,
7466
dim_head=dim_head,
7567
rngs=nnx.Rngs(0),
68+
attention_kernel="dot_product"
7669
)
7770

7871
# Standard input [B, S, D]
7972
x = jnp.ones((1, 16, dim))
8073

81-
# Forward
8274
out = model(hidden_states=x)
8375

8476
self.assertEqual(out.shape, (1, 16, dim))
@@ -96,6 +88,7 @@ def test_cross_attention_forward(self):
9688
dim_head=dim_head,
9789
context_dim=context_dim, # Triggers cross-attention init
9890
rngs=nnx.Rngs(0),
91+
attention_kernel="dot_product"
9992
)
10093

10194
x = jnp.ones((1, 16, query_dim)) # Video
@@ -116,12 +109,14 @@ def test_attention_with_rope_integration(self):
116109
heads=heads,
117110
dim_head=dim_head,
118111
rngs=nnx.Rngs(0),
112+
attention_kernel="dot_product"
119113
)
120114

121115
x = jnp.ones((2, 8, dim))
122116

123-
# Create manual RoPE embeddings matching the output of LTX2RotaryPosEmbed
124-
# Shape: [B, S, 1, inner_dim]
117+
# Create manual RoPE embeddings matching output of LTX2RotaryPosEmbed
118+
# Shape: [B, S, 1, Inner_Dim]
119+
# Inner_Dim = 64
125120
cos = jnp.ones((2, 8, 1, 64))
126121
sin = jnp.ones((2, 8, 1, 64))
127122
rope_emb = (cos, sin)
@@ -144,6 +139,7 @@ def test_cross_modal_temporal_rope(self):
144139
dim_head=dim_head,
145140
context_dim=query_dim,
146141
rngs=nnx.Rngs(0),
142+
attention_kernel="dot_product"
147143
)
148144

149145
x = jnp.ones((1, 16, query_dim)) # Video
@@ -153,7 +149,7 @@ def test_cross_modal_temporal_rope(self):
153149
video_ids_3d = jnp.zeros((1, 16, 3))
154150

155151
# 2. Extract ONLY Time axis for Cross-Attention RoPE
156-
# The pipeline must do this slicing: ids[:, :, 0:1]
152+
# This simulates the logic that will live in the pipeline/model loop
157153
video_ids_temporal = video_ids_3d[..., 0:1] # Shape [1, 16, 1]
158154

159155
# 3. Generate 1D IDs for Audio

0 commit comments

Comments
 (0)