Skip to content

Commit 18e06e8

Browse files
committed
changes made for pipeline components loading
1 parent 26466dc commit 18e06e8

5 files changed

Lines changed: 309 additions & 91 deletions

File tree

src/maxdiffusion/checkpointing/ltx2_checkpointer.py

Lines changed: 8 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -85,97 +85,18 @@ def load_ltx2_configs_from_orbax(self, step: Optional[int]) -> Tuple[Optional[di
8585
max_logging.log(f"optimizer found in checkpoint {'opt_state' in restored_checkpoint.ltx2_state.keys()}")
8686
return restored_checkpoint, step
8787

88-
def load_diffusers_checkpoint(self):
89-
config = self.config
90-
max_logging.log("Loading LTX2 components from Hugging Face base models.")
91-
92-
# 1. Tokenizer
93-
max_logging.log("Loading Gemma Tokenizer...")
94-
tokenizer = AutoTokenizer.from_pretrained(
95-
config.pretrained_model_name_or_path,
96-
subfolder="tokenizer",
97-
)
98-
# 3. Connectors
99-
max_logging.log("Loading Connectors...")
100-
connectors = LTX2AudioVideoGemmaTextEncoder.from_pretrained(
101-
config.pretrained_model_name_or_path,
102-
subfolder="connectors",
103-
)
104-
105-
# 4. Video VAE
106-
max_logging.log("Loading Video VAE...")
107-
vae = LTX2VideoAutoencoderKL.from_pretrained(
108-
config.pretrained_model_name_or_path,
109-
subfolder="vae",
110-
)
111-
112-
# 5. Audio VAE
113-
max_logging.log("Loading Audio VAE...")
114-
audio_vae = FlaxAutoencoderKLLTX2Audio.from_pretrained(
115-
config.pretrained_model_name_or_path,
116-
subfolder="audio_vae",
117-
)
118-
119-
# 6. Transformer
120-
max_logging.log("Loading Transformer...")
121-
# NOTE: Transformer weights are usually sharded and loaded separately in generation scripts
122-
# This just instantiates the architecture wrapper or loads full weights.
123-
# In MaxDiffusion we typically let the pipeline or generation script handle sharded loading
124-
# but we load the raw config/eval shape here.
125-
transformer = LTX2VideoTransformer3DModel.from_pretrained(
126-
config.pretrained_model_name_or_path,
127-
subfolder="transformer",
128-
)
129-
130-
# 7. Vocoder
131-
max_logging.log("Loading Vocoder...")
132-
vocoder = LTX2Vocoder.from_pretrained(
133-
config.pretrained_model_name_or_path,
134-
subfolder="vocoder",
135-
)
136-
137-
# 8. Scheduler
138-
max_logging.log("Loading Scheduler...")
139-
scheduler = FlaxFlowMatchScheduler.from_pretrained(
140-
config.pretrained_model_name_or_path,
141-
subfolder="scheduler",
142-
)
143-
# 2. Text Encoder (PyTorch)
144-
max_logging.log("Loading Gemma3 Text Encoder...")
145-
text_encoder = Gemma3ForConditionalGeneration.from_pretrained(
146-
config.pretrained_model_name_or_path,
147-
subfolder="text_encoder",
148-
torch_dtype=torch.bfloat16,
149-
)
150-
text_encoder.eval()
151-
152-
153-
154-
pipeline = LTX2Pipeline(
155-
scheduler=scheduler,
156-
vae=vae,
157-
audio_vae=audio_vae,
158-
text_encoder=text_encoder,
159-
tokenizer=tokenizer,
160-
connectors=connectors,
161-
transformer=transformer,
162-
vocoder=vocoder,
163-
)
164-
165-
return pipeline
166-
167-
def load_checkpoint(self, step=None) -> Tuple[LTX2Pipeline, Optional[dict], Optional[int]]:
88+
def load_checkpoint(self, step=None, vae_only=False, load_transformer=True) -> Tuple[LTX2Pipeline, Optional[dict], Optional[int]]:
16889
restored_checkpoint, step = self.load_ltx2_configs_from_orbax(step)
16990
opt_state = None
91+
17092
if restored_checkpoint:
171-
max_logging.log("Loading LTX2 pipeline from checkpoint (TODO: implement fully if needed)")
172-
# pipeline = LTX2Pipeline.from_checkpoint(self.config, restored_checkpoint)
173-
# if "opt_state" in restored_checkpoint.ltx2_state.keys():
174-
# opt_state = restored_checkpoint.ltx2_state["opt_state"]
175-
pipeline = self.load_diffusers_checkpoint() # Fallback for now
93+
max_logging.log("Loading LTX2 pipeline from checkpoint")
94+
pipeline = LTX2Pipeline.from_checkpoint(self.config, restored_checkpoint, vae_only, load_transformer)
95+
if "opt_state" in restored_checkpoint.ltx2_state.keys():
96+
opt_state = restored_checkpoint.ltx2_state["opt_state"]
17697
else:
177-
max_logging.log("No checkpoint found, loading default pipeline.")
178-
pipeline = self.load_diffusers_checkpoint()
98+
max_logging.log("No checkpoint found, loading pipeline from pretrained hub")
99+
pipeline = LTX2Pipeline.from_pretrained(self.config, vae_only, load_transformer)
179100

180101
return pipeline, opt_state, step
181102

src/maxdiffusion/models/ltx2/autoencoder_kl_ltx2.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1111,7 +1111,7 @@ def __call__(
11111111
return hidden_states
11121112

11131113

1114-
class LTX2VideoAutoencoderKL(nnx.Module, ConfigMixin):
1114+
class LTX2VideoAutoencoderKL(nnx.Module, FlaxModelMixin, ConfigMixin):
11151115
_supports_gradient_checkpointing = True
11161116
config_name = "config.json"
11171117

src/maxdiffusion/models/ltx2/autoencoder_kl_ltx2_audio.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from ...configuration_utils import ConfigMixin, register_to_config
1212
from ...utils import BaseOutput
1313
from ..vae_flax import FlaxDiagonalGaussianDistribution
14+
from ..modeling_flax_utils import FlaxModelMixin
1415

1516

1617
LATENT_DOWNSAMPLE_FACTOR = 4
@@ -624,7 +625,7 @@ def __call__(self, z, target_frames=None, target_mel_bins=None, train: bool = Fa
624625
return h
625626

626627

627-
class FlaxAutoencoderKLLTX2Audio(nnx.Module, ConfigMixin):
628+
class FlaxAutoencoderKLLTX2Audio(nnx.Module, FlaxModelMixin, ConfigMixin):
628629
"""
629630
LTX2 audio VAE wrapper handling normalization, patchification, and latent sampling.
630631
Operates in NHWC format (Batch, Time, Freq, Channels).

src/maxdiffusion/models/ltx2/vocoder_ltx2.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
import jax.numpy as jnp
2222
from flax import nnx
2323
from ... import common_types
24+
from maxdiffusion.configuration_utils import ConfigMixin, register_to_config
25+
from maxdiffusion.models.modeling_flax_utils import FlaxModelMixin
2426

2527
Array = common_types.Array
2628
DType = common_types.DType
@@ -87,11 +89,12 @@ def __call__(self, x: Array) -> Array:
8789
return x
8890

8991

90-
class LTX2Vocoder(nnx.Module):
92+
class LTX2Vocoder(nnx.Module, FlaxModelMixin, ConfigMixin):
9193
"""
9294
LTX 2.0 vocoder for converting generated mel spectrograms back to audio waveforms.
9395
"""
9496

97+
@register_to_config
9598
def __init__(
9699
self,
97100
in_channels: int = 128,

Commit comments (0)