Skip to content

Commit 30b20bd

Browse files
Merge branch 'main' into wan
2 parents 15d242e + da779ea commit 30b20bd

22 files changed

Lines changed: 462 additions & 38 deletions

README.md

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ MaxDiffusion supports
5353
- [Dreambooth](#dreambooth)
5454
- [Inference](#inference)
5555
- [Flux](#flux)
56-
- [Flash Attention for GPU:](#flash-attention-for-gpu)
56+
- [Fused Attention for GPU:](#fused-attention-for-gpu)
5757
- [Hyper SDXL LoRA](#hyper-sdxl-lora)
5858
- [Load Multiple LoRA](#load-multiple-lora)
5959
- [SDXL Lightning](#sdxl-lightning)
@@ -83,6 +83,14 @@ After installation completes, run the training script.
8383
python -m src.maxdiffusion.train_sdxl src/maxdiffusion/configs/base_xl.yml run_name="my_xl_run" output_dir="gs://your-bucket/" per_device_batch_size=1
8484
```
8585

86+
On GPUs with Fused Attention:
87+
88+
First install Transformer Engine by following the [instructions here](#fused-attention-for-gpu).
89+
90+
```bash
91+
NVTE_FUSED_ATTN=1 python -m src.maxdiffusion.train_sdxl src/maxdiffusion/configs/base_xl.yml hardware=gpu run_name='test-sdxl-train' output_dir=/tmp/ train_new_unet=true train_text_encoder=false cache_latents_text_encoder_outputs=true max_train_steps=200 weights_dtype=bfloat16 resolution=512 per_device_batch_size=1 attention="cudnn_flash_te" jit_initializers=False
92+
```
93+
8694
To generate images with a trained checkpoint, run:
8795

8896
```bash
@@ -176,8 +184,8 @@ To generate images, run the following command:
176184
python src/maxdiffusion/generate_flux.py src/maxdiffusion/configs/base_flux_schnell.yml jax_cache_dir=/tmp/cache_dir run_name=flux_test output_dir=/tmp/ prompt="photograph of an electronics chip in the shape of a race car with trillium written on its side" per_device_batch_size=1 ici_data_parallelism=1 ici_fsdp_parallelism=-1 offload_encoders=False
177185
```
178186

179-
## Flash Attention for GPU:
180-
Flash Attention for GPU is supported via TransformerEngine. Installation instructions:
187+
## Fused Attention for GPU:
188+
Fused Attention for GPU is supported via TransformerEngine. Installation instructions:
181189

182190
```bash
183191
cd maxdiffusion

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,10 @@ tensorflow-datasets>=4.9.6
2424
ruff>=0.1.5,<=0.2
2525
git+https://github.com/mlperf/logging.git
2626
opencv-python-headless==4.10.0.84
27-
orbax-checkpoint==0.10.2
27+
orbax-checkpoint==0.10.3
2828
tokenizers==0.21.0
2929
huggingface_hub==0.24.7
3030
transformers==4.48.1
3131
einops==0.8.0
3232
sentencepiece
33+
aqtp

requirements_with_jax_stable_stack.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,14 @@ flax>=0.10.2
77
ftfy
88
git+https://github.com/mlperf/logging.git
99
google-cloud-storage==2.17.0
10-
grain-nightly
10+
grain-nightly==0.0.10
1111
huggingface_hub==0.24.7
1212
jax>=0.4.30
1313
jaxlib>=0.4.30
1414
Jinja2
1515
opencv-python-headless==4.10.0.84
1616
optax>=0.2.3
17-
orbax-checkpoint==0.10.2
17+
orbax-checkpoint==0.10.3
1818
parameterized
1919
Pillow
2020
pyink

src/maxdiffusion/checkpointing/base_stable_diffusion_checkpointer.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -88,14 +88,11 @@ def create_unet_state(self, pipeline, params, checkpoint_item_name, is_training)
8888
config=self.config,
8989
mesh=self.mesh,
9090
weights_init_fn=weights_init_fn,
91-
model_params=None,
91+
model_params=None if self.config.train_new_unet else params.get("unet", None),
9292
checkpoint_manager=self.checkpoint_manager,
9393
checkpoint_item=checkpoint_item_name,
9494
training=is_training,
9595
)
96-
if not self.config.train_new_unet:
97-
unet_state = unet_state.replace(params=params.get("unet", None))
98-
unet_state = jax.device_put(unet_state, state_mesh_shardings)
9996
return unet_state, state_mesh_shardings, learning_rate_scheduler
10097

10198
def create_vae_state(self, pipeline, params, checkpoint_item_name, is_training=False):
@@ -153,20 +150,18 @@ def create_text_encoder_2_state(self, pipeline, params, checkpoint_item_name, is
153150
input_shape=(self.total_train_batch_size, pipeline.tokenizer.model_max_length),
154151
)
155152

156-
state, state_mesh_shardings = max_utils.setup_initial_state(
153+
# state, state_mesh_shardings =
154+
return max_utils.setup_initial_state(
157155
model=pipeline.text_encoder_2,
158156
tx=tx,
159157
config=self.config,
160158
mesh=self.mesh,
161159
weights_init_fn=weights_init_fn,
162-
model_params=None,
160+
model_params=params.get("text_encoder_2", None),
163161
checkpoint_manager=self.checkpoint_manager,
164162
checkpoint_item=checkpoint_item_name,
165163
training=is_training,
166164
)
167-
state = state.replace(params=params.get("text_encoder_2", None))
168-
state = jax.device_put(state, state_mesh_shardings)
169-
return state, state_mesh_shardings
170165

171166
def restore_data_iterator_state(self, data_iterator):
172167
if (

src/maxdiffusion/configs/base14.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@ activations_dtype: 'bfloat16'
3737
# fp32 activations and fp32 weights with HIGHEST will provide the best precision
3838
# at the cost of time.
3939
precision: "DEFAULT"
40+
41+
# if False state is not jitted and instead replicate is called. This is good for debugging on single host
42+
# It must be True for multi-host.
43+
jit_initializers: True
44+
4045
# Set true to load weights from pytorch
4146
from_pt: False
4247
split_head_dim: True
@@ -216,3 +221,7 @@ prior_loss_weight: 1.0
216221
num_class_images: 100
217222
# If true, set dataset_save_location.
218223
cache_dreambooth_dataset: False
224+
quantization: ''
225+
# Shard the range finding operation for quantization. By default this is set to number of slices.
226+
quantization_local_shard_count: -1
227+
compile_topology_num_slices: -1 # Number of target slices, set to a positive integer.

src/maxdiffusion/configs/base21.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ activations_dtype: 'bfloat16'
3737
# at the cost of time.
3838
precision: "DEFAULT"
3939

40+
# if False state is not jitted and instead replicate is called. This is good for debugging on single host
41+
# It must be True for multi-host.
42+
jit_initializers: True
43+
4044
# Set true to load weights from pytorch
4145
from_pt: False
4246
split_head_dim: True
@@ -217,3 +221,7 @@ prior_loss_weight: 1.0
217221
num_class_images: 100
218222
# If true, set dataset_save_location.
219223
cache_dreambooth_dataset: False
224+
quantization: ''
225+
# Shard the range finding operation for quantization. By default this is set to number of slices.
226+
quantization_local_shard_count: -1
227+
compile_topology_num_slices: -1 # Number of target slices, set to a positive integer.

src/maxdiffusion/configs/base_2_base.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ activations_dtype: 'bfloat16'
3838
# at the cost of time.
3939
precision: "DEFAULT"
4040

41+
# if False state is not jitted and instead replicate is called. This is good for debugging on single host
42+
# It must be True for multi-host.
43+
jit_initializers: True
44+
4145
# Set true to load weights from pytorch
4246
from_pt: True
4347
split_head_dim: True
@@ -231,3 +235,8 @@ prior_loss_weight: 1.0
231235
num_class_images: 100
232236
# If true, set dataset_save_location.
233237
cache_dreambooth_dataset: False
238+
239+
quantization: ''
240+
# Shard the range finding operation for quantization. By default this is set to number of slices.
241+
quantization_local_shard_count: -1
242+
compile_topology_num_slices: -1 # Number of target slices, set to a positive integer.

src/maxdiffusion/configs/base_flux_dev.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,10 @@ activations_dtype: 'bfloat16'
5151
# at the cost of time.
5252
precision: "DEFAULT"
5353

54+
# if False state is not jitted and instead replicate is called. This is good for debugging on single host
55+
# It must be True for multi-host.
56+
jit_initializers: True
57+
5458
# Set true to load weights from pytorch
5559
from_pt: True
5660
split_head_dim: True
@@ -260,3 +264,8 @@ controlnet_model_name_or_path: 'diffusers/controlnet-canny-sdxl-1.0'
260264
controlnet_from_pt: True
261265
controlnet_conditioning_scale: 0.5
262266
controlnet_image: 'https://upload.wikimedia.org/wikipedia/commons/thumb/c/c1/Google_%22G%22_logo.svg/1024px-Google_%22G%22_logo.svg.png'
267+
quantization: ''
268+
# Shard the range finding operation for quantization. By default this is set to number of slices.
269+
quantization_local_shard_count: -1
270+
compile_topology_num_slices: -1 # Number of target slices, set to a positive integer.
271+

src/maxdiffusion/configs/base_flux_schnell.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ activations_dtype: 'bfloat16'
5050
# at the cost of time.
5151
precision: "DEFAULT"
5252

53+
# if False state is not jitted and instead replicate is called. This is good for debugging on single host
54+
# It must be True for multi-host.
55+
jit_initializers: True
56+
5357
# Set true to load weights from pytorch
5458
from_pt: True
5559
split_head_dim: True
@@ -268,3 +272,7 @@ controlnet_model_name_or_path: 'diffusers/controlnet-canny-sdxl-1.0'
268272
controlnet_from_pt: True
269273
controlnet_conditioning_scale: 0.5
270274
controlnet_image: 'https://upload.wikimedia.org/wikipedia/commons/thumb/c/c1/Google_%22G%22_logo.svg/1024px-Google_%22G%22_logo.svg.png'
275+
quantization: ''
276+
# Shard the range finding operation for quantization. By default this is set to number of slices.
277+
quantization_local_shard_count: -1
278+
compile_topology_num_slices: -1 # Number of target slices, set to a positive integer.

src/maxdiffusion/configs/base_xl.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ activations_dtype: 'bfloat16'
3838
# at the cost of time.
3939
precision: "DEFAULT"
4040

41+
# if False state is not jitted and instead replicate is called. This is good for debugging on single host
42+
# It must be True for multi-host.
43+
jit_initializers: True
44+
4145
# Set true to load weights from pytorch
4246
from_pt: False
4347
split_head_dim: True
@@ -233,3 +237,9 @@ controlnet_model_name_or_path: 'diffusers/controlnet-canny-sdxl-1.0'
233237
controlnet_from_pt: True
234238
controlnet_conditioning_scale: 0.5
235239
controlnet_image: 'https://upload.wikimedia.org/wikipedia/commons/thumb/c/c1/Google_%22G%22_logo.svg/1024px-Google_%22G%22_logo.svg.png'
240+
enable_mllog: False
241+
242+
quantization: ''
243+
# Shard the range finding operation for quantization. By default this is set to number of slices.
244+
quantization_local_shard_count: -1
245+
compile_topology_num_slices: -1 # Number of target slices, set to a positive integer.

0 commit comments

Comments (0)