# Design Doc: Centralized and Configuration-Driven Sharding Strategy

## Objective
To centralize the sharding logic in MaxDiffusion, enabling hardware-specific optimizations (e.g., for TPU v6e vs. v7x) without hardcoding checks in model layers or polluting constructors with sharding parameters.

## Background
Currently, sharding specifications are often hardcoded within model layers or determined by ad-hoc hardware checks (e.g., checking `jax.devices()[0].device_kind`). This makes the code:
- **Hard to maintain and extend**: Adding support for new hardware requires modifying multiple files.
- **Difficult to test**: It is hard to exercise different sharding strategies on the same hardware for debugging or benchmarking.
- **Cluttered**: Model definition code is mixed with hardware-specific execution policies.

The `prisha/ltx2_opt` branch makes an initial attempt to address this by abstracting TPU type detection, but the sharding specs themselves are still hardcoded per detected hardware in [attention_ltx2.py](https://github.com/AI-Hypercomputer/maxdiffusion/blob/main/src/maxdiffusion/models/ltx2/attention_ltx2.py).

## Proposed Design

We propose a design that combines **Discrete Logical Rulesets** with **Explicit Parameter Passing at the Top Level** to achieve a clean separation of concerns while adhering to JAX and Flax NNX best practices.

### 1. Configuration
We will add a `sharding` section to the YAML configuration files, allowing independent overrides for different model components (e.g., Transformer, VAE).

Example in `ltx2_video.yml`:
```yaml
sharding:
  transformer: 'ironwood'
  vae: 'default'
  text_encoder: 'default'
```

#### Auto-Detection & Backward Compatibility
To improve usability and ensure backward compatibility:
- **Auto-Detection**: Specifying the sharding strategy is **optional**. If it is omitted (or a legacy config file lacks the `sharding` block entirely), `pyconfig.py` will auto-detect the TPU hardware generation at startup and set the strategy to the optimal default for that chip (e.g., `'ironwood'` for v7x); see the sketch after this list.
- **Logging**: The resolved strategy will be explicitly logged for transparency.
- **Overrides**: Users can always override the auto-detection by explicitly setting the strategy in the YAML file or via the CLI.
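
A minimal sketch of this resolution logic, assuming a hypothetical `resolve_sharding_strategy` helper in `pyconfig.py` (the `device_kind` strings and their mapping are illustrative assumptions):

```python
import logging

import jax

# Assumed device_kind -> default strategy mapping; the exact strings each
# TPU generation reports are an assumption in this sketch.
_DEFAULT_STRATEGY_BY_CHIP = {
    "TPU v6e": "trillium",
    "TPU v7x": "ironwood",
}


def resolve_sharding_strategy(raw_config: dict, component: str) -> str:
  """Returns the configured strategy for a component, else a hardware default."""
  configured = raw_config.get("sharding", {}).get(component)
  if configured:
    return configured
  device_kind = jax.devices()[0].device_kind
  strategy = _DEFAULT_STRATEGY_BY_CHIP.get(device_kind, "trillium")
  logging.info(
      "sharding.%s not set; auto-detected %r, using strategy %r",
      component, device_kind, strategy,
  )
  return strategy
```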

### 2. Discrete Logical Rulesets (Model-Specific File)
To keep the code simple and avoid file clutter, we organize the sharding specs into a single file per model, located in the model's directory. This keeps the sharding logic close to the model code, where model developers will naturally look for it.

For LTX2, this file will be `src/maxdiffusion/models/ltx2/logical_sharding_ltx2.py`.

This file will contain the discrete specs, the registry, and the factory function:

```python
from dataclasses import dataclass
from typing import Any, Optional


# --- Discrete Specs ---
@dataclass
class LTX2DiTShardingSpecs:
  """Sharding specs for the LTX2 Diffusion Transformer."""

  qkv_kernel: tuple
  out_kernel: tuple
  out_bias: tuple
  norm_scale: tuple = ("norm",)
  embed_bias: tuple = ("embed",)


@dataclass
class TextEncoderShardingSpecs:
  """Specs for the Text Encoder execution."""

  use_batched_text_encoder: bool = False
  text_encoder_kernel: Optional[tuple] = None


@dataclass
class VAEShardingSpecs:
  """Sharding specs for the VAE."""

  vae_conv_kernel: Optional[tuple] = None


# --- Unified Registry for LTX2 ---
STRATEGIES = {
    "ironwood": {
        "ltx2_dit": LTX2DiTShardingSpecs(
            qkv_kernel=(None, "heads"),
            out_kernel=("heads", None),
            out_bias=(None,),
        ),
        "text_encoder": TextEncoderShardingSpecs(
            use_batched_text_encoder=True,
            text_encoder_kernel=(None, "embed"),
        ),
        "vae": VAEShardingSpecs(vae_conv_kernel=("batch", None, None, None)),
    },
    "trillium": {
        "ltx2_dit": LTX2DiTShardingSpecs(
            qkv_kernel=("embed", "heads"),
            out_kernel=("heads", "embed"),
            out_bias=("embed",),
        ),
        "text_encoder": TextEncoderShardingSpecs(
            use_batched_text_encoder=False,
            text_encoder_kernel=(None, "embed"),
        ),
        "vae": VAEShardingSpecs(vae_conv_kernel=(None, None, None, None)),
    },
}


def get_sharding_specs(strategy_name: str, component_name: str) -> Any:
  """Unified factory to get specs for any component."""
  # Unknown names (including 'default') fall back to the 'trillium' profile.
  hardware_profile = STRATEGIES.get(strategy_name, STRATEGIES["trillium"])
  specs = hardware_profile.get(component_name)
  if specs is None:
    raise ValueError(f"Component {component_name!r} not found in strategy {strategy_name!r}")
  return specs
```
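
For illustration, the factory simply returns the dataclass instances registered above:

```python
# Values correspond to the 'ironwood' entry in STRATEGIES.
dit_specs = get_sharding_specs("ironwood", "ltx2_dit")
assert dit_specs.qkv_kernel == (None, "heads")
assert dit_specs.norm_scale == ("norm",)
```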

### 3. Application (Unpacking at the Top Level)

To avoid coupling low-level layers to model-specific strategy objects, the top-level model (e.g., `LTX2VideoTransformer3DModel`) will accept the specs object, but will **unpack** it and pass only the specific tuples or `PartitionSpec`s down to the leaf nodes (like `LTX2Attention`).

#### In the Pipeline
The pipeline file (e.g., `ltx2_pipeline.py`) reads the strategy name from the config, retrieves the specific specs object for each component, and passes it to the respective top-level model.

```python
# 1. Read component-specific strategy names from config
sharding_config = getattr(self.config, "sharding", {})
transformer_strategy = sharding_config.get("transformer", "default")
te_strategy = sharding_config.get("text_encoder", "default")

# 2. Get the specific specs for components
dit_specs = get_sharding_specs(transformer_strategy, "ltx2_dit")
te_specs = get_sharding_specs(te_strategy, "text_encoder")

# 3. Use for pipeline execution choices
if te_specs.use_batched_text_encoder:
  # ...

# 4. Pass to the top-level model
self.transformer = LTX2VideoTransformer3DModel(
    # ...
    sharding_specs=dit_specs,
)
```

#### In Model Layers
The top-level model receives the specs object and unpacks it for its children.

Example in `LTX2VideoTransformer3DModel`:
```python
class LTX2VideoTransformer3DModel(nnx.Module):

  def __init__(self, ..., sharding_specs: LTX2DiTShardingSpecs):
    # Unpack and pass specific tuples to blocks
    self.block = LTX2VideoTransformerBlock(
        ...,
        qkv_sharding_spec=sharding_specs.qkv_kernel,
        out_sharding_spec=sharding_specs.out_kernel,
        out_bias_sharding_spec=sharding_specs.out_bias,
    )
```

Example in `LTX2Attention` (leaf node):
```python
class LTX2Attention(nnx.Module):

  def __init__(
      self,
      ...,
      qkv_sharding_spec: tuple,
      out_sharding_spec: tuple,
      out_bias_sharding_spec: tuple,
  ):
    # Use the specific tuples directly, completely agnostic to the parent strategy
    self.qkv_sharding_spec = qkv_sharding_spec
    # ...
```
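
Inside the leaf layer, the spec can then be attached to the parameter as Flax NNX sharding metadata. A minimal sketch (the `to_qkv` attribute and the `in_features`, `inner_dim`, and `rngs` names are illustrative assumptions):

```python
# Sketch: attach the logical spec to the QKV kernel at creation time.
# nnx.with_partitioning wraps the initializer so the created parameter
# carries the logical axis names, which logical_axis_rules later resolve
# to physical mesh axes (see the next section).
self.to_qkv = nnx.Linear(
    in_features,
    3 * inner_dim,
    kernel_init=nnx.with_partitioning(
        nnx.initializers.lecun_normal(), qkv_sharding_spec
    ),
    rngs=rngs,
)
```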
| 174 | + |
| 175 | +### 4. Logical-to-Physical Mesh Mapping |
| 176 | +Logical axis names like `"heads"` and `"embed"` must be bound to a physical JAX Mesh. |
| 177 | + |
| 178 | +In MaxDiffusion, this mapping is handled at the top level via `logical_axis_rules` (typically defined in the YAML config file). These rules map logical axis names to physical mesh axes (e.g., `"data"`, `"model"`, `"fsdp"`). |
| 179 | + |
| 180 | +Different TPU topologies (v6e vs v7x) have different optimal physical mesh dimensions. We handle this by selecting the appropriate config file via the CLI, or by overriding the `logical_axis_rules` directly from the CLI. |
| 181 | + |
| 182 | +Example of overriding `logical_axis_rules` directly via CLI: |
| 183 | + |
| 184 | +```bash |
| 185 | +python src/maxdiffusion/generate_ltx2.py src/maxdiffusion/configs/ltx2_video.yml logical_axis_rules="[('heads', 'model'), ('embed', 'data')]" |
| 186 | +``` |
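
To make the binding concrete, here is a minimal sketch of how a logical spec resolves against a set of rules, using `flax.linen.logical_to_mesh_axes` (the rules mirror the CLI override above):

```python
from flax import linen as nn

rules = (("heads", "model"), ("embed", "data"))

# ("embed", "heads") is the trillium qkv_kernel spec from the registry.
pspec = nn.logical_to_mesh_axes(("embed", "heads"), rules)
# pspec == PartitionSpec('data', 'model'); names with no matching rule
# resolve to None (unsharded), which motivates the validation below.
```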
| 187 | + |
| 188 | +### 5. Startup Validation |
| 189 | +To ensure that the configuration and code are in sync, we propose adding a validation step at startup (e.g., in the pipeline or `pyconfig.py`). |
| 190 | + |
| 191 | +**Problem**: If a logical sharding spec uses an axis name (e.g., `"heads"`) that is not defined in the active `logical_axis_rules`, JAX might fail late or silently fall back to suboptimal sharding. |
| 192 | + |
| 193 | +**Solution**: |
| 194 | +1. Collect all logical axis names used in the active sharding strategies. |
| 195 | +2. Cross-reference them with the keys in `logical_axis_rules`. |
| 196 | +3. If any logical axis name is missing from `logical_axis_rules`, raise a `ValueError` to fail fast. |
| 197 | +4. Allow users to bypass this check with a `--skip_sharding_validation` flag if they explicitly want to proceed with potential defaults. |
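
A minimal sketch of the fail-fast check, assuming `component_specs` is one strategy's component dict (e.g., `STRATEGIES["ironwood"]`) and `logical_axis_rules` is a sequence of `(logical_name, mesh_axis)` pairs from the config:

```python
import dataclasses


def validate_sharding_axes(component_specs, logical_axis_rules, skip=False):
  """Raises ValueError if a spec uses a logical axis with no rule."""
  if skip:  # corresponds to the proposed --skip_sharding_validation flag
    return
  known = {name for name, _ in logical_axis_rules}
  used = set()
  for specs in component_specs.values():
    for field in dataclasses.fields(specs):
      value = getattr(specs, field.name)
      if isinstance(value, tuple):
        used.update(axis for axis in value if axis is not None)
  missing = used - known
  if missing:
    raise ValueError(
        f"Logical axes {sorted(missing)} are not defined in logical_axis_rules."
    )
```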
| 198 | + |
| 199 | +## Performance Considerations |
| 200 | +- This is purely a code-structuring change and does not introduce any runtime overhead. |
| 201 | +- The specs returned by the factory are static strings or tuples of strings, which are perfectly traced by JAX and compiled by XLA. |
| 202 | + |
| 203 | +## Alternatives Considered |
| 204 | + |
| 205 | +### 1. Hardcoded Hardware Checks |
| 206 | +Checking `device_kind` directly in the model components. |
| 207 | +- **Why rejected**: Scattered checks make the code hard to maintain, extend, and test. |
| 208 | + |
| 209 | +### 2. Excessive Configuration/Plumbing (Pure YAML or Individual Constructor Arguments) |
| 210 | +Putting all specs in YAML or passing every spec individually through all constructors. |
| 211 | +- **Why rejected**: Leads to either bloated configuration files or polluted constructors in intermediate layers. We struck a balance by using dataclasses at the top level and unpacking them for leaf nodes. |
| 212 | + |
| 213 | +### 3. Monolithic or Global State Objects |
| 214 | +Using a class-based strategy per hardware or a global singleton manager. |
| 215 | +- **Why rejected**: Leads to class explosion or violates JAX functional purity principles by introducing global state. |
| 216 | + |
| 217 | +## Prototype Plan: LTX2 |
| 218 | +We will use the LTX2 model as a prototype to validate this design. |
| 219 | + |
| 220 | +1. **Create** the `src/maxdiffusion/models/ltx2/logical_sharding_ltx2.py` file with the specs and factory. |
| 221 | +2. **Update** [ltx2_pipeline.py](https://github.com/AI-Hypercomputer/maxdiffusion/blob/main/src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py) to read the config and get the strategy. |
| 222 | +3. **Update** `transformer_ltx2.py` and `attention_ltx2.py` to accept and use the strategy object. |
| 223 | +4. **Verify** by: |
| 224 | + * Adding unit tests for the factory and strategy objects. |
| 225 | + * Running existing LTX2 integration tests with both `ironwood` and `trillium` strategies to ensure no regressions. |
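
A minimal sketch of such unit tests, assuming the module path given above:

```python
import pytest

from maxdiffusion.models.ltx2.logical_sharding_ltx2 import get_sharding_specs


def test_ironwood_dit_specs():
  specs = get_sharding_specs("ironwood", "ltx2_dit")
  assert specs.qkv_kernel == (None, "heads")
  assert specs.out_bias == (None,)


def test_unknown_strategy_falls_back_to_trillium():
  specs = get_sharding_specs("default", "ltx2_dit")
  assert specs.qkv_kernel == ("embed", "heads")


def test_unknown_component_raises():
  with pytest.raises(ValueError):
    get_sharding_specs("trillium", "no_such_component")
```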

## Shared Components (e.g., `attention_flax.py`)
For components shared across different models (like `NNXSimpleFeedForward` in [attention_flax.py](https://github.com/AI-Hypercomputer/maxdiffusion/blob/main/src/maxdiffusion/models/attention_flax.py)), we will pass the specific sharding specs as constructor arguments; the LTX2-specific caller fetches those values from its own specs object.
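
For example, a sketch of the LTX2-side call (the `ff_kernel` field and the `kernel_sharding_spec` parameter are hypothetical names for this proposal, not existing API):

```python
# Hypothetical: LTX2DiTShardingSpecs gains an ff_kernel field, and the
# shared layer gains a kernel_sharding_spec argument.
ff = NNXSimpleFeedForward(
    ...,
    kernel_sharding_spec=sharding_specs.ff_kernel,
)
```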
| 229 | + |
| 230 | +## Future Expansion |
| 231 | +If the prototype succeeds on LTX2, we plan to expand this pattern to other models like **WAN** and **Flux** by adding corresponding strategies and factories. |
| 232 | + |
| 233 | + |