Add split RoPE (rotary position embedding) support to LTX2 attention

prishajain1 · prishajain1 · commit ba1541289926 · 2026-02-11T11:02:01.000+05:30
diff --git a/src/maxdiffusion/models/ltx2/attention_ltx2.py b/src/maxdiffusion/models/ltx2/attention_ltx2.py
@@ -54,6 +54,85 @@ def apply_rotary_emb(x: Array, freqs: Tuple[Array, Array]) -> Array:
   return out.astype(x.dtype)
 
 
+def apply_split_rotary_emb(x: Array, freqs: Tuple[Array, Array]) -> Array:
+  """Applies split (half-rotation) RoPE to `x`.
+
+  The last axis of `x` is viewed as two contiguous halves `(x1, x2)`, which are
+  rotated to `(x1 * cos - x2 * sin, x2 * cos + x1 * sin)`. The logic is meant to
+  match the Diffusers `apply_split_rotary_emb` helper.
+
+  Split RoPE frequencies are laid out per attention head, so a flattened input
+  is temporarily viewed per head, rotated, and restored to its original shape.
+
+  Args:
+    x: Input tensor. Either already per-head, [B, H, S, 2 * R], or flattened,
+      [B, S, H * 2 * R]. A non-4-D `x` is reshaped to the per-head layout
+      whenever `cos` is 4-D.
+    freqs: Tuple `(cos, sin)`. Expected to be [B, H, S, R] each when produced
+      by `LTX2RotaryPosEmbed` with `rope_type="split"`; any shape that
+      broadcasts against [..., R] is accepted.
+
+  Returns:
+    The rotated tensor, with the same shape and dtype as the input `x`.
+
+  Raises:
+    ValueError: If the last (per-head) dimension of `x` is odd and therefore
+      cannot be split into two halves.
+
+  Note:
+    The rotation is evaluated in at least float32 and the result is cast back
+    to `x.dtype`, so low-precision inputs do not lose accuracy mid-computation.
+  """
+  cos, sin = freqs
+
+  # Remembered so the output can be restored to exactly what the caller passed.
+  x_dtype = x.dtype
+  original_shape = x.shape
+
+  # Per-head frequencies need a per-head input: [B, S, H * D] -> [B, H, S, D].
+  needed_reshape = x.ndim != 4 and cos.ndim == 4
+  if needed_reshape:
+    # Batch size comes from x; heads and sequence length come from cos, and -1
+    # infers the per-head width D (= 2 * R when the shapes are consistent).
+    batch = x.shape[0]
+    num_heads = cos.shape[1]
+    seq_len = cos.shape[2]
+    x = x.reshape(batch, seq_len, num_heads, -1).transpose(0, 2, 1, 3)
+
+  # The two-half view below requires an even width.
+  last_dim = x.shape[-1]
+  if last_dim % 2 != 0:
+    raise ValueError(
+        f"Expected x.shape[-1] to be even for split rotary, got {last_dim} instead."
+    )
+  half = last_dim // 2
+
+  # Upcast (never downcast) before rotating; see the Note in the docstring.
+  compute_dtype = jnp.promote_types(x_dtype, jnp.float32)
+
+  # [..., 2 * R] -> [..., 2, R], then pick the two contiguous halves.
+  split_x = x.reshape(*x.shape[:-1], 2, half).astype(compute_dtype)
+  first_x = split_x[..., 0, :]  # [..., R]
+  second_x = split_x[..., 1, :]  # [..., R]
+
+  # cos/sin are [..., R] and broadcast directly against each half, so no extra
+  # singleton axis is needed. New arrays are built since JAX arrays are immutable.
+  out_first = first_x * cos - second_x * sin
+  out_second = second_x * cos + first_x * sin
+
+  # Re-assemble the halves: [..., R] x 2 -> [..., 2, R] -> [..., 2 * R].
+  out = jnp.stack([out_first, out_second], axis=-2)
+  out = out.reshape(*out.shape[:-2], last_dim)
+
+  if needed_reshape:
+    # Undo the per-head view: [B, H, S, D] -> [B, S, H, D] -> original shape.
+    out = out.transpose(0, 2, 1, 3).reshape(original_shape)
+
+  # Cast back to the caller's dtype.
+  return out.astype(x_dtype)
+
+
+
 class LTX2RotaryPosEmbed(nnx.Module):
   """
   Video and audio rotary positional embeddings (RoPE) for the LTX-2.0 model.
@@ -131,26 +210,13 @@ def prepare_video_coords(
     latent_coords = jnp.expand_dims(latent_coords, 0)  # [1, num_patches, 3, 2]
     latent_coords = jnp.tile(latent_coords, (batch_size, 1, 1, 1))  # [B, num_patches, 3, 2]
 
-    # Transpose to match desired shape [B, 3, num_patches, 2] if needed,
-    # BUT Diffusers returns [B, 3, num_patches, 2] from flatten(1,3) on [3, N_F, N_H, N_W, 2]??
-    # Diffusers:
-    #   latent_coords = torch.stack([grid, patch_ends], dim=-1)  # [3, N_F, N_H, N_W, 2]
-    #   latent_coords = latent_coords.flatten(1, 3) # [3, num_patches, 2]
-    #   latent_coords = latent_coords.unsqueeze(0).repeat(batch_size, 1, 1, 1) # [B, 3, num_patches, 2]
-    # My JAX above:
-    #   latent_coords = latent_coords.reshape(-1, 3, 2) was wrong relative to Diffusers shape
-
-    # Correct JAX implementation matching Diffusers:
     latent_coords = jnp.stack([grid, patch_ends], axis=-1)  # [3, N_F, N_H, N_W, 2]
     latent_coords = latent_coords.reshape(3, -1, 2)  # [3, num_patches, 2]
     latent_coords = jnp.expand_dims(latent_coords, 0)  # [1, 3, num_patches, 2]
     latent_coords = jnp.tile(latent_coords, (batch_size, 1, 1, 1))  # [B, 3, num_patches, 2]
 
     # 3. Calculate pixel space coords
     scale_tensor = jnp.array(self.scale_factors, dtype=latent_coords.dtype)
-    # Broadcast scale factors: [1, 3, 1, 1] matches [B, 3, num, 2] logic?
-    # Diffusers: broadcast_shape[1] = -1 (frame, height, width dim)
-    # Actually scale_factors is (8, 32, 32) corresponding to (F, H, W) i.e. dim 1 of latent_coords
     scale_tensor = scale_tensor.reshape(1, 3, 1, 1)
     pixel_coords = latent_coords * scale_tensor
 
@@ -260,11 +326,6 @@ def __call__(self, coords: Array) -> Tuple[Array, Array]:
 
       # Padding if needed
       if self.dim % num_rope_elems != 0:
-        # Diffusers logic: pad with ones/zeros
-        # JAX requires careful padding
-        # But here we computed freqs for `steps = self.dim // num_rope_elems`
-        # So we have `steps * num_rope_elems` elements currently.
-        # If mismatch, we pad.
         curr_dim = cos_freqs.shape[-1]
         pad_amt = self.dim - curr_dim
         if pad_amt > 0:
@@ -274,42 +335,32 @@ def __call__(self, coords: Array) -> Tuple[Array, Array]:
           sin_freqs = jnp.concatenate([sin_padding, sin_freqs], axis=-1)
 
     elif self.rope_type == "split":
-      # [B, N, D//2] -> [B, N, D//2]
-      # Padding first? Diffusers:
-      # expected_freqs = self.dim // 2
-      # current_freqs = freqs.shape[-1]
-      # pad_size = expected_freqs - current_freqs
-      # if pad != 0: pad (before logic?)
-      # Actually Diffusers code:
-      # cos_freq = freqs.cos(), sin_freq = freqs.sin()
-      # if pad: concatenate([pad, cos_freq], axis=-1)
-      # THEN reshape to multi-head?
-
-      curr_dim = cos_freqs.shape[-1]
+      # Cos/Sin
+      cos_freq = jnp.cos(freqs)
+      sin_freq = jnp.sin(freqs)
+
+      curr_dim = cos_freq.shape[-1]
       expected_dim = self.dim // 2
       pad_size = expected_dim - curr_dim
 
       if pad_size > 0:
-        cos_padding = jnp.ones((*cos_freqs.shape[:-1], pad_size), dtype=cos_freqs.dtype)
-        sin_padding = jnp.zeros((*sin_freqs.shape[:-1], pad_size), dtype=sin_freqs.dtype)
-        cos_freqs = jnp.concatenate([cos_padding, cos_freqs], axis=-1)
-        sin_freqs = jnp.concatenate([sin_padding, sin_freqs], axis=-1)
-
-      # Reshape for multi-head?
-      # Diffusers:
-      # cos_freq = cos_freq.reshape(b, t, self.num_attention_heads, -1)
-      # swapaxes(1, 2) -> (B, H, T, D//2)
-      # Here: input `coords` was flattened tokens (N).
-      # We assume N = Time?
-      # Wait, `prepare_video_coords` flattens all patches (T*H*W).
-      # So N = T*H*W.
-      # If `rope_type="split"`, does it imply specific Time-Head structure?
-      # LTX-2 `transformer_ltx2.py` in Diffusers passes `rope_type="interleaved"` by default.
-      # Split is mostly for specific attention optimizations.
-      # I will skip the complex reshape logic for now unless requested,
-      # as standard flow is interleaved.
-      # But I should keep the frequency generation logic consistent.
-      pass
+        cos_padding = jnp.ones((*cos_freq.shape[:-1], pad_size), dtype=cos_freq.dtype)
+        sin_padding = jnp.zeros((*sin_freq.shape[:-1], pad_size), dtype=sin_freq.dtype)
+        cos_freq = jnp.concatenate([cos_padding, cos_freq], axis=-1)
+        sin_freq = jnp.concatenate([sin_padding, sin_freq], axis=-1)
+
+      # Reshape freqs to be compatible with multi-head attention
+      # Diffusers: cos_freq.reshape(b, t, self.num_attention_heads, -1) -> swapaxes(1, 2)
+      # [B, S, D//2] -> [B, S, H, dim_head//2] -> [B, H, S, dim_head//2]
+      
+      b = cos_freq.shape[0]
+      s = cos_freq.shape[1]
+      
+      # We need to know H. `LTX2RotaryPosEmbed` has `num_attention_heads`.
+      h = self.num_attention_heads
+      
+      cos_freqs = cos_freq.reshape(b, s, h, -1).transpose(0, 2, 1, 3)
+      sin_freqs = sin_freq.reshape(b, s, h, -1).transpose(0, 2, 1, 3)
 
     return cos_freqs, sin_freqs
 
@@ -330,8 +381,10 @@ def __init__(
       eps: float = 1e-6,
       dtype: DType = jnp.float32,
       attention_kernel: str = "flash",
+      rope_type: str = "interleaved",
   ):
     self.heads = heads
+    self.rope_type = rope_type
     self.dim_head = dim_head
     self.inner_dim = dim_head * heads
     self.dropout_rate = dropout
@@ -387,12 +440,26 @@ def __call__(
 
     # 3. Apply RoPE to tensors of shape [B, S, InnerDim]
     # Frequencies are shape [B, S, InnerDim]
+    # For rope_type="split" the frequencies are instead per-head: [B, H, S, dim_head // 2]
     if rotary_emb is not None:
-      query = apply_rotary_emb(query, rotary_emb)
-      if k_rotary_emb is not None:
-        key = apply_rotary_emb(key, k_rotary_emb)
-      elif encoder_hidden_states is None:
-        key = apply_rotary_emb(key, rotary_emb)
+      if hasattr(self, "rope_type") and self.rope_type == "split":
+         # Split RoPE: passing full freqs [B, H, S, D//2]
+         # apply_split_rotary_emb handles reshaping query/key
+         
+         query = apply_split_rotary_emb(query, rotary_emb)
+         
+         if k_rotary_emb is not None:
+             key = apply_split_rotary_emb(key, k_rotary_emb)
+         elif encoder_hidden_states is None:
+             key = apply_split_rotary_emb(key, rotary_emb)
+             
+      else:
+         # Interleaved (Default)
+         query = apply_rotary_emb(query, rotary_emb)
+         if k_rotary_emb is not None:
+           key = apply_rotary_emb(key, k_rotary_emb)
+         elif encoder_hidden_states is None:
+           key = apply_rotary_emb(key, rotary_emb)
 
     # 4. Attention
     # NNXAttentionOp expects flattened input [B, S, InnerDim] for flash kernel
diff --git a/src/maxdiffusion/models/ltx2/transformer_ltx2.py b/src/maxdiffusion/models/ltx2/transformer_ltx2.py
@@ -700,7 +700,6 @@ def __init__(
         num_attention_heads=self.audio_num_attention_heads,
     )
 
-    # 5. Transformer Blocks
     # 5. Transformer Blocks
     @nnx.split_rngs(splits=self.num_layers)
     @nnx.vmap(in_axes=0, out_axes=0, axis_size=self.num_layers, transform_metadata={nnx.PARTITION_NAME: "layers"})
diff --git a/src/maxdiffusion/tests/ltx_2_transformer_test.py b/src/maxdiffusion/tests/ltx_2_transformer_test.py
@@ -104,6 +104,40 @@ def test_ltx2_rope(self):
     self.assertEqual(cos.shape, (1, 10, dim))
     self.assertEqual(sin.shape, (1, 10, dim))
 
+  def test_ltx2_rope_split(self):
+    """Tests LTX2RotaryPosEmbed with rope_type='split'.
+
+    Split RoPE returns per-head frequencies of shape [B, H, S, dim // (2 * H)].
+    """
+    dim = self.dim
+    seq_len = 10
+
+    # Video RoPE Split
+    rope = LTX2RotaryPosEmbed(
+        dim=dim,
+        patch_size=self.patch_size,
+        patch_size_t=self.patch_size_t,
+        base_num_frames=8,
+        base_height=32,
+        base_width=32,
+        modality="video",
+        rope_type="split",
+    )
+    ids = jnp.ones((1, 3, seq_len))  # (B, Axes, S)
+    cos, sin = rope(ids)
+
+    # Frequencies are padded to dim // 2 and then split across heads:
+    # [B, S, dim // 2] -> [B, H, S, dim // (2 * H)].
+    num_heads = rope.num_attention_heads
+    expected_shape = (1, num_heads, seq_len, dim // 2 // num_heads)
+    self.assertEqual(cos.shape, expected_shape)
+    self.assertEqual(sin.shape, expected_shape)
+
+    # cos/sin share the same angles and padding uses (cos=1, sin=0), so every
+    # element must lie on the unit circle.
+    self.assertTrue(jnp.allclose(cos**2 + sin**2, 1.0, atol=1e-5))
+
+
   def test_ltx2_ada_layer_norm_single(self):
     """Tests LTX2AdaLayerNormSingle initialization and execution."""
     key = jax.random.key(0)

Original file line number	Diff line number	Diff line change
`@@ -700,7 +700,6 @@ def __init__(`
`700`	`700`	`num_attention_heads=self.audio_num_attention_heads,`
`701`	`701`	`)`
`702`	`702`
`703`		`- # 5. Transformer Blocks`
`704`	`703`	`# 5. Transformer Blocks`
`705`	`704`	`@nnx.split_rngs(splits=self.num_layers)`
`706`	`705`	`@nnx.vmap(in_axes=0, out_axes=0, axis_size=self.num_layers, transform_metadata={nnx.PARTITION_NAME: "layers"})`