AI-Hypercomputer · Serenagu525 · Jun 26, 2025 · Jun 26, 2025 · Jun 26, 2025 · Jun 26, 2025
@@ -374,6 +374,7 @@
   _import_structure["models.unet_2d_condition_flax"] = ["FlaxUNet2DConditionModel"]
   _import_structure["models.flux.transformers.transformer_flux_flax"] = ["FluxTransformer2DModel"]
   _import_structure["models.vae_flax"] = ["FlaxAutoencoderKL"]
+  _import_structure["models.ltx_video.transformers.transformer3d"] = ["Transformer3DModel"]
   _import_structure["pipelines"].extend(["FlaxDiffusionPipeline"])
   _import_structure["schedulers"].extend(
       [
@@ -453,6 +454,7 @@
     from .models.modeling_flax_utils import FlaxModelMixin
     from .models.unet_2d_condition_flax import FlaxUNet2DConditionModel
     from .models.flux.transformers.transformer_flux_flax import FluxTransformer2DModel
+    from .models.ltx_video.transformers.transformer3d import Transformer3DModel
     from .models.vae_flax import FlaxAutoencoderKL
     from .pipelines import FlaxDiffusionPipeline
     from .schedulers import (

@@ -213,7 +213,10 @@ def load_state_if_possible(
     max_logging.log(f"restoring from this run's directory latest step {latest_step}")
     try:
       if not enable_single_replica_ckpt_restoring:
-        item = {checkpoint_item: orbax.checkpoint.args.PyTreeRestore(item=abstract_unboxed_pre_state)}
+        if checkpoint_item == " ":
+          return checkpoint_manager.restore(latest_step, args=ocp.args.StandardRestore(abstract_unboxed_pre_state))
+        else:
+          item = {checkpoint_item: orbax.checkpoint.args.PyTreeRestore(item=abstract_unboxed_pre_state)}
         return checkpoint_manager.restore(latest_step, args=orbax.checkpoint.args.Composite(**item))
 
       def map_to_pspec(data):

@@ -0,0 +1,68 @@
+#  Copyright 2025 Google LLC
+
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+
+#       https://www.apache.org/licenses/LICENSE-2.0
+
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+
+#hardware
+hardware: 'tpu'
+skip_jax_distributed_system: False
+
+jax_cache_dir: ''
+weights_dtype: 'bfloat16'
+activations_dtype: 'bfloat16'
+
+
+run_name: ''
+output_dir: 'ltx-video-output'
+save_config_to_gcs: False
+
+#parallelism
+mesh_axes: ['data', 'fsdp', 'tensor']
+logical_axis_rules: [
+                      ['batch', 'data'],
+                      ['activation_heads', 'fsdp'],
+                      ['activation_batch', ['data','fsdp']],
+                      ['activation_kv', 'tensor'],
+                      ['mlp','tensor'],
+                      ['embed','fsdp'],
+                      ['heads', 'tensor'],
+                      ['norm', 'fsdp'],
+                      ['conv_batch', ['data','fsdp']],
+                      ['out_channels', 'tensor'],
+                      ['conv_out', 'fsdp'],
+                      ['conv_in', 'fsdp']
+                    ]
+data_sharding: [['data', 'fsdp', 'tensor']]
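+dcn_data_parallelism: 1  # NOTE(review): labelled as the recommended DCN axis to auto-shard, yet set to 1 while dcn_fsdp_parallelism is -1 -- confirm which axis is intended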
+dcn_fsdp_parallelism: -1
+dcn_tensor_parallelism: 1
+ici_data_parallelism: 1
+ici_fsdp_parallelism: -1  # recommended ICI axis to be auto-sharded
+ici_tensor_parallelism: 1
+
+
+
+
+learning_rate_schedule_steps: -1
+max_train_steps: 500  # TODO: replace this placeholder step count
+pretrained_model_name_or_path: ''
+unet_checkpoint: ''
+dataset_name: 'diffusers/pokemon-gpt4-captions'
+train_split: 'train'
+dataset_type: 'tf'
+cache_latents_text_encoder_outputs: True
+per_device_batch_size: 1
+compile_topology_num_slices: -1 
+quantization_local_shard_count: -1
+jit_initializers: True 
+enable_single_replica_ckpt_restoring: False
@@ -0,0 +1,114 @@
+"""
+ Copyright 2025 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from absl import app
+from typing import Sequence
+import jax
+import json
+from maxdiffusion.models.ltx_video.transformers.transformer3d import Transformer3DModel
+import os
+import functools
+import jax.numpy as jnp
+from maxdiffusion import pyconfig
+from maxdiffusion.max_utils import (
+    create_device_mesh,
+    setup_initial_state,
+    get_memory_allocations,
+)
+from jax.sharding import Mesh
+import orbax.checkpoint as ocp
+
+
+def validate_transformer_inputs(
+    prompt_embeds, fractional_coords, latents, noise_cond, segment_ids, encoder_attention_segment_ids
+):
+  """Print the shape and dtype of each transformer input, one line per input.
+
+  This is a debugging aid: nothing is checked and nothing is returned. Every
+  argument only needs to expose ``.shape`` and ``.dtype`` attributes.
+
+  Args:
+    prompt_embeds: input reported under the label ``prompt_embeds``.
+    fractional_coords: input reported under the label ``fractional_coords``.
+    latents: input reported under the label ``latents``.
+    noise_cond: input reported under the label ``noise_cond``.
+    segment_ids: input reported under the label ``segment_ids``.
+    encoder_attention_segment_ids: input reported under the label
+      ``encoder_attention_segment_ids``.
+  """
+  # Each input is listed exactly once, and each label matches its parameter name.
+  named_inputs = (
+      ("prompt_embeds", prompt_embeds),
+      ("fractional_coords", fractional_coords),
+      ("latents", latents),
+      ("noise_cond", noise_cond),
+      ("segment_ids", segment_ids),
+      ("encoder_attention_segment_ids", encoder_attention_segment_ids),
+  )
+  for label, value in named_inputs:
+    print(f"{label}.shape: ", value.shape, value.dtype)
+
+
+def run(config):
+  """Build the LTX-Video transformer and set up its state from a checkpoint.
+
+  Steps, in order:
+    1. Create the device mesh from ``config`` (``create_device_mesh`` and
+       ``config.mesh_axes``).
+    2. Load the model description JSON that sits next to this script and read
+       ``ckpt_path`` and ``in_channels`` from it.
+    3. Instantiate ``Transformer3DModel`` with the remaining JSON entries.
+    4. Call ``setup_initial_state`` with an orbax ``CheckpointManager`` rooted
+       at ``ckpt_path``, then place the resulting state on devices.
+
+  Args:
+    config: the pyconfig configuration object; this function reads
+      ``config.mesh_axes`` and forwards ``config`` to the max_utils helpers.
+
+  Returns:
+    None. The state and its shardings are gathered into local dicts but are
+    not returned.
+
+  Raises:
+    KeyError: if the JSON lacks ``ckpt_path``, ``in_channels`` or
+      ``caption_channels``.
+  """
+  key = jax.random.PRNGKey(42)
+
+  devices_array = create_device_mesh(config)
+  mesh = Mesh(devices_array, config.mesh_axes)
+
+  base_dir = os.path.dirname(__file__)
+
+  # Load the model config. JSON is UTF-8 by specification, so the encoding is
+  # given explicitly rather than left to the platform's locale default.
+  config_path = os.path.join(base_dir, "models/ltx_video/xora_v1.2-13B-balanced-128.json")
+  with open(config_path, "r", encoding="utf-8") as f:
+    model_config = json.load(f)
+  ckpt_path = model_config["ckpt_path"]
+  in_channels = model_config["in_channels"]
+
+  # Entries of the JSON that are not forwarded to the Transformer3DModel constructor.
+  ignored_keys = {
+      "_class_name",
+      "_diffusers_version",
+      "_name_or_path",
+      "causal_temporal_positioning",
+      "in_channels",
+      "ckpt_path",
+  }
+  # Filter into a fresh dict instead of deleting from the loaded one.
+  model_config = {name: value for name, value in model_config.items() if name not in ignored_keys}
+
+  transformer = Transformer3DModel(
+      **model_config, dtype=jnp.float32, gradient_checkpointing="matmul_without_batch", sharding_mesh=mesh
+  )
+  # NOTE(review): the result of this eager call is never used; it is kept
+  # because it cannot be confirmed from here that init_weights is free of
+  # side effects -- confirm and drop if it is.
+  transformer_param_shapes = transformer.init_weights(  # noqa: F841
+      in_channels, key, model_config["caption_channels"], eval_only=True
+  )
+  weights_init_fn = functools.partial(
+      transformer.init_weights, in_channels, key, model_config["caption_channels"], eval_only=True
+  )
+
+  checkpoint_manager = ocp.CheckpointManager(ckpt_path)
+  # checkpoint_item=" " (a single space) is the value the restore helpers in
+  # this change compare against to take the non-composite restore path.
+  transformer_state, transformer_state_shardings = setup_initial_state(
+      model=transformer,
+      tx=None,
+      config=config,
+      mesh=mesh,
+      weights_init_fn=weights_init_fn,
+      checkpoint_manager=checkpoint_manager,
+      checkpoint_item=" ",
+      model_params=None,
+      training=False,
+  )
+
+  transformer_state = jax.device_put(transformer_state, transformer_state_shardings)
+  get_memory_allocations()
+
+  states = {}
+  state_shardings = {}
+
+  state_shardings["transformer"] = transformer_state_shardings
+  states["transformer"] = transformer_state
+
+
+def main(argv: Sequence[str]) -> None:
+  """absl entry point: initialise pyconfig from ``argv`` and hand its config to ``run``."""
+  pyconfig.initialize(argv)
+  loaded_config = pyconfig.config
+  run(loaded_config)
+
+
+# Script entry: let absl parse the command line and dispatch to main.
+if __name__ == "__main__":
+  app.run(main)
@@ -402,7 +402,10 @@ def setup_initial_state(
           config.enable_single_replica_ckpt_restoring,
       )
       if state:
-        state = state[checkpoint_item]
+        if checkpoint_item == " ":
+          state = state
+        else:
+          state = state[checkpoint_item]
     if not state:
       max_logging.log(f"Could not find the item in orbax, creating state...")
       init_train_state_partial = functools.partial(

@@ -13,9 +13,7 @@
 # limitations under the License.
 
 from typing import TYPE_CHECKING
-
-from ..utils import DIFFUSERS_SLOW_IMPORT, _LazyModule, is_flax_available, is_torch_available
-
+from maxdiffusion.utils import DIFFUSERS_SLOW_IMPORT, _LazyModule, is_flax_available, is_torch_available
 
 _import_structure = {}
 
@@ -32,6 +30,7 @@
   from .vae_flax import FlaxAutoencoderKL
   from .lora import *
   from .flux.transformers.transformer_flux_flax import FluxTransformer2DModel
+  from .ltx_video.transformers.transformer3d import Transformer3DModel
 
 else:
   import sys

@@ -0,0 +1,16 @@
+# Copyright 2025 Lightricks Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://github.com/Lightricks/LTX-Video/blob/main/LICENSE
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This implementation is based on the Torch version available at:
+# https://github.com/Lightricks/LTX-Video/tree/main
@@ -0,0 +1,86 @@
+# Copyright 2025 Lightricks Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://github.com/Lightricks/LTX-Video/blob/main/LICENSE
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This implementation is based on the Torch version available at:
+# https://github.com/Lightricks/LTX-Video/tree/main
+from enum import Enum, auto
+from typing import Optional
+
+import jax
+from flax import linen as nn
+
+# Sentinel returned by GradientCheckpointType.to_jax_policy for NONE; GradientCheckpointType.apply
+# compares against it and returns the module unwrapped instead of calling nn.remat.
+SKIP_GRADIENT_CHECKPOINT_KEY = "skip"
+
+
+class GradientCheckpointType(Enum):
+  """
+  The kinds of gradient checkpointing a module can be wrapped with.
+
+  NONE - no gradient checkpointing at all
+  FULL - checkpoint wherever possible (minimum memory usage)
+  MATMUL_WITHOUT_BATCH - checkpoint every linear/matmul operation except those that involve
+                          the batch dimension: all attention and projection layers are
+                          checkpointed, but not the backward pass with respect to the parameters
+  """
+
+  NONE = auto()
+  FULL = auto()
+  MATMUL_WITHOUT_BATCH = auto()
+
+  @classmethod
+  def from_str(cls, s: Optional[str] = None) -> "GradientCheckpointType":
+    """
+    Looks up a gradient checkpoint type by name, case-insensitively
+
+    Args:
+        s (Optional[str], optional): Name of the gradient checkpointing policy; None is
+            treated as "none". Defaults to None.
+
+    Returns:
+        GradientCheckpointType: The member whose name is the upper-cased string
+    """
+    policy_name = "none" if s is None else s
+    return cls[policy_name.upper()]
+
+  def to_jax_policy(self):
+    """
+    Maps this gradient checkpoint type to the policy handed to nn.remat
+
+    NONE maps to the skip sentinel, MATMUL_WITHOUT_BATCH to
+    jax.checkpoint_policies.checkpoint_dots_with_no_batch_dims, and FULL to None
+    """
+    if self is GradientCheckpointType.NONE:
+      return SKIP_GRADIENT_CHECKPOINT_KEY
+    if self is GradientCheckpointType.MATMUL_WITHOUT_BATCH:
+      return jax.checkpoint_policies.checkpoint_dots_with_no_batch_dims
+    # FULL: no explicit policy is passed on
+    return None
+
+  def apply(self, module: nn.Module) -> nn.Module:
+    """
+    Wraps a module according to this gradient checkpoint type
+    when the type is NONE the module is handed back untouched
+
+    Args:
+        module (nn.Module): the module to wrap
+
+    Returns:
+        nn.Module: the module itself, or the module wrapped by nn.remat
+    """
+    jax_policy = self.to_jax_policy()
+    if jax_policy != SKIP_GRADIENT_CHECKPOINT_KEY:
+      return nn.remat(  # pylint: disable=invalid-name
+          module,
+          prevent_cse=False,
+          policy=jax_policy,
+      )
+    return module