Skip to content

Commit 1ea6590

Browse files
committed
ruff check
2 parents d06dee3 + 6c52603 commit 1ea6590

36 files changed

Lines changed: 4186 additions & 624 deletions

.github/workflows/UnitTests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ jobs:
5050
ruff check .
5151
- name: PyTest
5252
run: |
53-
HF_HUB_CACHE=/mnt/disks/github-runner-disk/ HF_HOME=/mnt/disks/github-runner-disk/ python3 -m pytest
53+
HF_HUB_CACHE=/mnt/disks/github-runner-disk/ HF_HOME=/mnt/disks/github-runner-disk/ python3 -m pytest -x
5454
# add_pull_ready:
5555
# if: github.ref != 'refs/heads/main'
5656
# permissions:

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,13 @@ MaxDiffusion supports
3535
* Stable Diffusion 2 base (training and inference)
3636
* Stable Diffusion 2.1 (training and inference)
3737
* Stable Diffusion XL (training and inference).
38+
* Flux Dev and Schnell (Training and inference).
3839
* Stable Diffusion Lightning (inference).
3940
* Hyper-SD XL LoRA loading (inference).
4041
* Load Multiple LoRA (SDXL inference).
4142
* ControlNet inference (Stable Diffusion 1.4 & SDXL).
4243
* Dreambooth training support for Stable Diffusion 1.x, 2.x.
4344

44-
**WARNING: The training code is purely experimental and is under development.**
4545

4646
# Table of Contents
4747

requirements.txt

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
1-
jax>=0.4.30
1+
--extra-index-url https://download.pytorch.org/whl/cpu
2+
jax==0.5.3
23
jaxlib>=0.4.30
34
grain-nightly==0.0.10
45
google-cloud-storage==2.17.0
56
absl-py
67
datasets
78
flax>=0.10.2
89
optax>=0.2.3
9-
torch==2.5.1
10-
torchvision==0.20.1
10+
torch==2.6.0
11+
torchvision>=0.20.1
1112
ftfy
1213
tensorboard>=2.17.0
1314
tensorboardx==2.6.2.2
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
"""
2+
Copyright 2025 Google LLC
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
https://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
"""
16+
17+
from abc import ABC
18+
from flax import nnx
19+
from maxdiffusion.checkpointing.checkpointing_utils import (create_orbax_checkpoint_manager)
20+
from ..pipelines.wan.wan_pipeline import WanPipeline
21+
from .. import max_logging, max_utils
22+
23+
WAN_CHECKPOINT = "WAN_CHECKPOINT"
24+
25+
26+
class WanCheckpointer(ABC):
  """Manages Orbax checkpointing for WAN text-to-video pipelines.

  Creates an Orbax checkpoint manager at construction time and provides
  helpers to build an optimizer and to restore a pipeline, falling back to
  the pretrained diffusers weights when no Orbax checkpoint is available.
  """

  def __init__(self, config, checkpoint_type):
    self.config = config
    self.checkpoint_type = checkpoint_type

    # save_interval_steps=1: the manager is willing to save every step; the
    # training loop decides when save() is actually called.
    self.checkpoint_manager = create_orbax_checkpoint_manager(
        self.config.checkpoint_dir,
        enable_checkpointing=True,
        save_interval_steps=1,
        checkpoint_type=checkpoint_type,
        dataset_type=config.dataset_type,
    )

  def _create_optimizer(self, model, config, learning_rate):
    """Builds the training optimizer for `model`.

    Returns:
      A (nnx.Optimizer, learning_rate_scheduler) tuple; the scheduler is
      returned separately so callers can log the current learning rate.
    """
    learning_rate_scheduler = max_utils.create_learning_rate_schedule(
        learning_rate, config.learning_rate_schedule_steps, config.warmup_steps_fraction, config.max_train_steps
    )
    tx = max_utils.create_optimizer(config, learning_rate_scheduler)
    return nnx.Optimizer(model, tx), learning_rate_scheduler

  def load_wan_configs_from_orbax(self, step):
    """Resolves which checkpoint step (if any) could be restored.

    Args:
      step: Explicit step to restore, or None to use the latest step.

    Returns:
      None in every case for now — restoring model configs from Orbax is
      not implemented yet, and `load_checkpoint` treats None as "fall back
      to the diffusers weights".
    """
    # Fixed: message previously said "stable diffusion" — copy-paste error
    # in the WAN checkpointer.
    max_logging.log("Restoring WAN checkpoint configs")
    if step is None:
      step = self.checkpoint_manager.latest_step()
    if step is None:
      return None
    # Fixed: the found-step path previously fell off the end implicitly;
    # make the (still unimplemented) result explicit.
    return None

  def load_diffusers_checkpoint(self):
    """Loads the pipeline from the pretrained diffusers weights."""
    pipeline = WanPipeline.from_pretrained(self.config)
    return pipeline

  def load_checkpoint(self, step=None):
    """Restores a WAN pipeline.

    Args:
      step: Optional checkpoint step; None means the latest available.

    Returns:
      A WanPipeline instance.

    Raises:
      NotImplementedError: if Orbax model configs are unexpectedly found
        (Orbax restore is not supported yet).
    """
    model_configs = self.load_wan_configs_from_orbax(step)

    if model_configs:
      raise NotImplementedError("model configs should not exist in orbax")
    pipeline = self.load_diffusers_checkpoint()
    return pipeline
Lines changed: 41 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -18,25 +18,22 @@ run_name: ''
1818
metrics_file: "" # for testing, local file that stores scalar metrics. If empty, no metrics are written.
1919
# If true save metrics such as loss and TFLOPS to GCS in {base_output_directory}/{run_name}/metrics/
2020
write_metrics: True
21+
22+
timing_metrics_file: "" # for testing, local file that stores function timing metrics such as state creation, compilation. If empty, no metrics are written.
23+
write_timing_metrics: True
24+
2125
gcs_metrics: False
2226
# If true save config to GCS in {base_output_directory}/{run_name}/
2327
save_config_to_gcs: False
2428
log_period: 100
2529

2630
pretrained_model_name_or_path: 'Wan-AI/Wan2.1-T2V-14B-Diffusers'
2731

28-
# Flux params
29-
flux_name: "flux-dev"
30-
max_sequence_length: 512
31-
time_shift: True
32-
base_shift: 0.5
33-
max_shift: 1.15
34-
# offloads t5 encoder after text encoding to save memory.
35-
offload_encoders: True
36-
32+
# Overrides the transformer from pretrained_model_name_or_path
33+
wan_transformer_pretrained_model_name_or_path: ''
3734

3835
unet_checkpoint: ''
39-
revision: 'refs/pr/95'
36+
revision: ''
4037
# This will convert the weights to this dtype.
4138
# When running inference on TPUv5e, use weights_dtype: 'bfloat16'
4239
weights_dtype: 'bfloat16'
@@ -59,24 +56,9 @@ split_head_dim: True
5956
attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te
6057

6158
flash_block_sizes: {}
62-
# Use the following flash_block_sizes on v6e (Trillium) due to larger vmem.
63-
# flash_block_sizes: {
64-
# "block_q" : 1536,
65-
# "block_kv_compute" : 1536,
66-
# "block_kv" : 1536,
67-
# "block_q_dkv" : 1536,
68-
# "block_kv_dkv" : 1536,
69-
# "block_kv_dkv_compute" : 1536,
70-
# "block_q_dq" : 1536,
71-
# "block_kv_dq" : 1536
72-
# }
7359
# GroupNorm groups
7460
norm_num_groups: 32
7561

76-
# If train_new_unet, unet weights will be randomly initialized to train the unet from scratch
77-
# else they will be loaded from pretrained_model_name_or_path
78-
train_new_unet: False
79-
8062
# train text_encoder - Currently not supported for SDXL
8163
train_text_encoder: False
8264
text_encoder_learning_rate: 4.25e-6
@@ -133,15 +115,17 @@ mesh_axes: ['data', 'fsdp', 'tensor']
133115
# conv_out : conv.shape[-1] weight
134116
logical_axis_rules: [
135117
['batch', 'data'],
118+
['activation_heads', 'fsdp'],
136119
['activation_batch', ['data','fsdp']],
137-
['activation_heads', 'tensor'],
138120
['activation_kv', 'tensor'],
139121
['mlp','tensor'],
140122
['embed','fsdp'],
141123
['heads', 'tensor'],
124+
['norm', 'fsdp'],
142125
['conv_batch', ['data','fsdp']],
143126
['out_channels', 'tensor'],
144127
['conv_out', 'fsdp'],
128+
['conv_in', 'fsdp']
145129
]
146130
data_sharding: [['data', 'fsdp', 'tensor']]
147131

@@ -152,22 +136,23 @@ data_sharding: [['data', 'fsdp', 'tensor']]
152136
dcn_data_parallelism: 1 # recommended DCN axis to be auto-sharded
153137
dcn_fsdp_parallelism: -1
154138
dcn_tensor_parallelism: 1
155-
ici_data_parallelism: -1
156-
ici_fsdp_parallelism: 1 # recommended ICI axis to be auto-sharded
139+
ici_data_parallelism: 1
140+
ici_fsdp_parallelism: -1 # recommended ICI axis to be auto-sharded
157141
ici_tensor_parallelism: 1
158142

159143
# Dataset
160144
# Replace with dataset path or train_data_dir. One has to be set.
161145
dataset_name: 'diffusers/pokemon-gpt4-captions'
162146
train_split: 'train'
163-
dataset_type: 'tf'
147+
dataset_type: 'tfrecord'
164148
cache_latents_text_encoder_outputs: True
165149
# cache_latents_text_encoder_outputs only apply to dataset_type="tf",
166150
# only apply to small dataset that fits in memory
167151
# prepare image latents and text encoder outputs
168152
# Reduce memory consumption and reduce step time during training
169153
# transformed dataset is saved at dataset_save_location
170-
dataset_save_location: '/tmp/pokemon-gpt4-captions_xl'
154+
dataset_save_location: ''
155+
load_tfrecord_cached: True
171156
train_data_dir: ''
172157
dataset_config_name: ''
173158
jax_cache_dir: ''
@@ -192,17 +177,23 @@ checkpoint_every: -1
192177
enable_single_replica_ckpt_restoring: False
193178

194179
# Training loop
195-
learning_rate: 4.e-7
180+
learning_rate: 1.e-5
196181
scale_lr: False
197182
max_train_samples: -1
198183
# max_train_steps takes priority over num_train_epochs.
199-
max_train_steps: 200
184+
max_train_steps: 1500
200185
num_train_epochs: 1
201186
seed: 0
202187
output_dir: 'sdxl-model-finetuned'
203188
per_device_batch_size: 1
189+
# If global_batch_size % jax.device_count is not 0, use FSDP sharding.
190+
global_batch_size: 0
191+
192+
# For creating tfrecords from dataset
193+
tfrecords_dir: ''
194+
no_records_per_shard: 0
204195

205-
warmup_steps_fraction: 0.0
196+
warmup_steps_fraction: 0.1
206197
learning_rate_schedule_steps: -1 # By default the length of the schedule is set to the number of steps.
207198

208199
# However you may choose a longer schedule (learning_rate_schedule_steps > steps), in which case the training will end before
@@ -212,7 +203,7 @@ learning_rate_schedule_steps: -1 # By default the length of the schedule is set
212203
adam_b1: 0.9 # Exponential decay rate to track the first moment of past gradients.
213204
adam_b2: 0.999 # Exponential decay rate to track the second moment of past gradients.
214205
adam_eps: 1.e-8 # A small constant applied to denominator outside of the square root.
215-
adam_weight_decay: 1.e-2 # AdamW Weight decay
206+
adam_weight_decay: 0 # AdamW Weight decay
216207
max_grad_norm: 1.0
217208

218209
enable_profiler: False
@@ -222,14 +213,25 @@ skip_first_n_steps_for_profiler: 5
222213
profiler_steps: 10
223214

224215
# Generation parameters
225-
prompt: "A magical castle in the middle of a forest, artistic drawing"
226-
prompt_2: "A magical castle in the middle of a forest, artistic drawing"
227-
negative_prompt: "purple, red"
216+
prompt: "A cat and a dog baking a cake together in a kitchen. The cat is carefully measuring flour, while the dog is stirring the batter with a wooden spoon. The kitchen is cozy, with sunlight streaming through the window."
217+
prompt_2: "A cat and a dog baking a cake together in a kitchen. The cat is carefully measuring flour, while the dog is stirring the batter with a wooden spoon. The kitchen is cozy, with sunlight streaming through the window."
218+
negative_prompt: "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
228219
do_classifier_free_guidance: True
229-
guidance_scale: 3.5
220+
height: 480
221+
width: 832
222+
num_frames: 81
223+
guidance_scale: 5.0
224+
flow_shift: 3.0
225+
226+
# skip layer guidance
227+
slg_layers: [9]
228+
slg_start: 0.2
229+
slg_end: 1.0
230230
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
231231
guidance_rescale: 0.0
232-
num_inference_steps: 50
232+
num_inference_steps: 30
233+
fps: 24
234+
save_final_checkpoint: False
233235

234236
# SDXL Lightning parameters
235237
lightning_from_pt: True
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
"""
2+
Copyright 2025 Google LLC
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
https://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
"""

0 commit comments

Comments
 (0)