
Commit 4243240

feat: Implement feature extractor
Signed-off-by: James Huang <syhuang1201@gmail.com>
1 parent 6e3b58b

3 files changed

Lines changed: 280 additions & 0 deletions

src/maxdiffusion/models/ltx2/text_encoders/__init__.py

Whitespace-only changes.
src/maxdiffusion/models/ltx2/text_encoders/feature_extractor_ltx2.py

Lines changed: 152 additions & 0 deletions

@@ -0,0 +1,152 @@
"""
Copyright 2025 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from typing import Tuple, Union

import jax.numpy as jnp
from flax import nnx

from maxdiffusion import common_types

Array = common_types.Array
DType = common_types.DType


def _norm_and_concat_padded_batch(
    encoded_text: Array,
    sequence_lengths: Array,
    padding_side: str = "right",
) -> Array:
  """Normalize and flatten multi-layer hidden states, respecting padding.

  Performs per-batch, per-layer normalization using a masked mean and range,
  then concatenates across the layer dimension.

  Args:
    encoded_text: Hidden states of shape [batch, seq_len, hidden_dim, num_layers].
    sequence_lengths: Number of valid (non-padded) tokens per batch item.
    padding_side: Whether padding is on the "left" or the "right".

  Returns:
    Normalized tensor of shape [batch, seq_len, hidden_dim * num_layers],
    with padded positions zeroed out.
  """
  b, t, d, l = encoded_text.shape

  # Build the validity mask: [B, T] -> [B, T, 1, 1].
  token_indices = jnp.arange(t)[None, :]  # [1, T]

  if padding_side == "right":
    # Valid: indices < lengths.
    mask = token_indices < sequence_lengths[:, None]
  elif padding_side == "left":
    # Valid: indices >= (T - lengths).
    start_indices = t - sequence_lengths[:, None]
    mask = token_indices >= start_indices
  else:
    raise ValueError(f"padding_side must be 'left' or 'right', got {padding_side}")

  mask = mask[:, :, None, None]  # [B, T, 1, 1]

  eps = 1e-6

  # 1. Masked mean: sum over (T, D) -> [B, 1, 1, L], with padding zeroed
  # out via jnp.where.
  masked_text = jnp.where(mask, encoded_text, 0.0)
  sum_vals = jnp.sum(masked_text, axis=(1, 2), keepdims=True)

  # Denominator: sequence_length * D.
  denom = (sequence_lengths * d).reshape(b, 1, 1, 1)
  mean = sum_vals / (denom + eps)

  # 2. Masked min/max for the range; the +/-inf fills are ignored by min/max.
  safe_text_min = jnp.where(mask, encoded_text, jnp.inf)
  safe_text_max = jnp.where(mask, encoded_text, -jnp.inf)

  x_min = jnp.min(safe_text_min, axis=(1, 2), keepdims=True)
  x_max = jnp.max(safe_text_max, axis=(1, 2), keepdims=True)

  range_val = x_max - x_min

  # 3. Normalize. Padded positions hold garbage here but are masked out below.
  normed = 8.0 * (encoded_text - mean) / (range_val + eps)

  # 4. Concatenate/flatten the layers: [B, T, D, L] -> [B, T, D * L].
  normed = normed.reshape(b, t, -1)

  # 5. Re-apply the mask so padded positions are exactly 0.0.
  output_mask = mask.squeeze(-1).squeeze(-1)[:, :, None]  # [B, T, 1]
  normed = jnp.where(output_mask, normed, 0.0)

  return normed

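For intuition, a tiny worked sketch of the normalization (illustrative only, not part of the commit). Since x_min <= x <= x_max at valid positions and the masked mean lies between them, the scaled values always land in [-8, 8], while padded positions come out exactly zero:

x = jnp.array([[[[1.0], [3.0]],
                [[5.0], [7.0]],
                [[9.0], [9.0]]]])  # [B=1, T=3, D=2, L=1]; the last token is padding
lengths = jnp.array([2])  # only the first two tokens are valid

out = _norm_and_concat_padded_batch(x, lengths)
# Valid values: mean = (1 + 3 + 5 + 7) / 4 = 4, range = 7 - 1 = 6,
# so the first entry becomes 8 * (1 - 4) / 6 = -4.0 (up to eps).
print(out.shape)  # (1, 3, 2)
print(out[0, 2])  # [0. 0.] -- the padded row is zeroed
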
class LTX2GemmaFeatureExtractor(nnx.Module):
  """Feature extractor module for Gemma models in LTX-2.

  Applies mean-centered scaling and a linear projection.
  """

  def __init__(
      self,
      input_dim: int,
      output_dim: int,
      dtype: DType = jnp.float32,
      rngs: nnx.Rngs = None,
  ):
    """
    Args:
      input_dim: Dimension of the flattened hidden states (Gemma dim * num layers).
      output_dim: Target dimension for diffusion conditioning.
    """
    # LTX-2 uses bias=False for the projection.
    self.linear = nnx.Linear(input_dim, output_dim, use_bias=False, dtype=dtype, rngs=rngs)

  def __call__(
      self, hidden_states: Union[Tuple[Array, ...], Array], attention_mask: Array, padding_side: str = "right"
  ) -> Array:
    """
    Args:
      hidden_states: Tuple of arrays from Gemma, each [B, T, D],
        or a pre-stacked array [B, T, D, L].
      attention_mask: Mask [B, T] (1 for valid, 0 for padding).
      padding_side: "right" or "left".

    Returns:
      Projected features [B, T, output_dim].
    """
    # 1. Stack the hidden states if needed: [B, T, D, L].
    if isinstance(hidden_states, (tuple, list)):
      x = jnp.stack(hidden_states, axis=-1)
    else:
      x = hidden_states

    # 2. Count the valid tokens per batch item.
    sequence_lengths = jnp.sum(attention_mask, axis=-1)

    # 3. Normalize and concatenate the layers.
    x_norm = _norm_and_concat_padded_batch(x, sequence_lengths, padding_side=padding_side)

    # 4. Project to the target dimension.
    return self.linear(x_norm)
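
For reference, a minimal usage sketch of the module, assuming hypothetical sizes (3840-dim Gemma hidden states, 3 tapped layers, a 4096-dim conditioning space); none of these numbers come from this commit:

import numpy as np

B, T, D, L = 2, 128, 3840, 3  # hypothetical batch, sequence, hidden dim, tapped layers
extractor = LTX2GemmaFeatureExtractor(input_dim=D * L, output_dim=4096, rngs=nnx.Rngs(0))

# Stand-ins for Gemma encoder outputs: a tuple of per-layer states, each [B, T, D].
hidden_states = tuple(jnp.asarray(np.random.randn(B, T, D), dtype=jnp.float32) for _ in range(L))
attention_mask = jnp.ones((B, T), dtype=jnp.int32)  # no padding in this toy batch

features = extractor(hidden_states, attention_mask)  # [B, T, 4096]
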
Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
"""
Copyright 2025 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import unittest

import torch
import numpy as np
import jax.numpy as jnp
from flax import nnx

from ..models.ltx2.text_encoders.feature_extractor_ltx2 import LTX2GemmaFeatureExtractor, _norm_and_concat_padded_batch


# ==========================================
# PyTorch Reference Logic
# ==========================================
def pt_norm_and_concat_padded_batch(
    encoded_text: torch.Tensor,
    sequence_lengths: torch.Tensor,
    padding_side: str = "right",
) -> torch.Tensor:
  """Mirrors _norm_and_concat_padded_batch; masked_fill(~mask, v) plays the
  role of jnp.where(mask, x, v)."""
  b, t, d, l = encoded_text.shape
  device = encoded_text.device

  token_indices = torch.arange(t, device=device)[None, :]
  if padding_side == "right":
    mask = token_indices < sequence_lengths[:, None]
  elif padding_side == "left":
    start_indices = t - sequence_lengths[:, None]
    mask = token_indices >= start_indices
  else:
    raise ValueError(f"padding_side must be 'left' or 'right', got {padding_side}")

  mask = mask[:, :, None, None]  # [B, T, 1, 1]

  eps = 1e-6
  masked = encoded_text.masked_fill(~mask, 0.0)
  denom = (sequence_lengths * d).view(b, 1, 1, 1)
  mean = masked.sum(dim=(1, 2), keepdim=True) / (denom + eps)

  x_min = encoded_text.masked_fill(~mask, float("inf")).amin(dim=(1, 2), keepdim=True)
  x_max = encoded_text.masked_fill(~mask, float("-inf")).amax(dim=(1, 2), keepdim=True)
  range_ = x_max - x_min

  normed = 8 * (encoded_text - mean) / (range_ + eps)
  normed = normed.reshape(b, t, -1)

  # Zero out padded positions in the flattened output.
  mask_flattened = mask.view(b, t, 1).expand(-1, -1, d * l)
  normed = normed.masked_fill(~mask_flattened, 0.0)

  return normed


class LTX2FeatureExtractorTest(unittest.TestCase):

  def setUp(self):
    self.rng = nnx.Rngs(0)
    self.B = 2
    self.T = 10
    self.D = 8
    self.L = 3
    self.target_dim = 16

  def test_norm_parity(self):
    # Random input with some padding.
    np_input = np.random.randn(self.B, self.T, self.D, self.L).astype(np.float32)

    # Valid lengths per batch item, e.g. [5, 8] out of 10.
    lengths = np.array([5, 8], dtype=np.int32)

    # PyTorch reference.
    pt_input = torch.from_numpy(np_input)
    pt_lengths = torch.from_numpy(lengths)
    pt_out = pt_norm_and_concat_padded_batch(pt_input, pt_lengths)

    # JAX implementation.
    jax_input = jnp.array(np_input)
    jax_lengths = jnp.array(lengths)
    jax_out = _norm_and_concat_padded_batch(jax_input, jax_lengths)

    diff = np.abs(pt_out.numpy() - np.array(jax_out)).max()
    print(f"\n[Norm Parity] Max Diff: {diff:.6f}")

    np.testing.assert_allclose(pt_out.numpy(), np.array(jax_out), atol=1e-5)
    print("[PASS] Normalization Logic Parity Verified.")

  def test_module_forward(self):
    # Test the full module.
    model = LTX2GemmaFeatureExtractor(input_dim=self.D * self.L, output_dim=self.target_dim, rngs=self.rng)

    # Simulate a tuple of Gemma hidden states, each [B, T, D].
    hidden_states = [jnp.array(np.random.randn(self.B, self.T, self.D)) for _ in range(self.L)]

    # Attention mask [B, T]: 1 for valid tokens, 0 for padding.
    mask = np.zeros((self.B, self.T), dtype=np.int32)
    mask[0, :5] = 1
    mask[1, :8] = 1
    jax_mask = jnp.array(mask)

    output = model(tuple(hidden_states), jax_mask)

    expected_shape = (self.B, self.T, self.target_dim)
    self.assertEqual(output.shape, expected_shape)

    # Padded positions (batch 0, indices 5 onward) should be exactly zero.
    padding_val = output[0, 5:, :]
    self.assertTrue(jnp.all(padding_val == 0.0), "Padding region should be zero")

    print("\n[PASS] Feature Extractor Module Forward Pass Verified.")


if __name__ == "__main__":
  unittest.main()
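
Note: because the test file uses a relative import, it cannot run as a standalone script; it has to execute in package context, e.g. via python -m unittest with whatever module path the file actually lands at (the path is not shown in this diff).
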
