
Commit fcafb4a

flag to jit initializers
1 parent 0db81e3 commit fcafb4a

9 files changed: 43 additions & 6 deletions

File tree

- README.md
- src/maxdiffusion/configs/base14.yml
- src/maxdiffusion/configs/base21.yml
- src/maxdiffusion/configs/base_2_base.yml
- src/maxdiffusion/configs/base_flux_dev.yml
- src/maxdiffusion/configs/base_flux_schnell.yml
- src/maxdiffusion/configs/base_xl.yml
- src/maxdiffusion/configs/base_xl_lightning.yml
- src/maxdiffusion/max_utils.py
README.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -88,7 +88,7 @@ After installation completes, run the training script.
 First install Transformer Engine by following the [instructions here](#fused-attention-for-gpu).

 ```bash
-NVTE_FUSED_ATTN=1 python -m src.maxdiffusion.train_sdxl src/maxdiffusion/configs/base_xl.yml hardware=gpu run_name='test-sdxl-train' output_dir=/tmp/ train_text_encoder=false cache_latents_text_encoder_outputs=true max_train_steps=200 weights_dtype=float16 activations_dtype=float16 per_device_batch_size=1 attention="cudnn_flash_te"
+NVTE_FUSED_ATTN=1 python -m src.maxdiffusion.train_sdxl src/maxdiffusion/configs/base_xl.yml hardware=gpu run_name='test-sdxl-train' output_dir=/tmp/ train_new_unet=true train_text_encoder=false cache_latents_text_encoder_outputs=true max_train_steps=200 weights_dtype=bfloat16 resolution=512 per_device_batch_size=1 attention="cudnn_flash_te" jit_initializers=False
 ```

 To generate images with a trained checkpoint, run:
````

src/maxdiffusion/configs/base14.yml

Lines changed: 5 additions & 0 deletions
```diff
@@ -37,6 +37,11 @@ activations_dtype: 'bfloat16'
 # fp32 activations and fp32 weights with HIGHEST will provide the best precision
 # at the cost of time.
 precision: "DEFAULT"
+
+# If False, the state is not jitted; it is initialized eagerly and placed with device_put instead. This is useful for debugging on a single host.
+# It must be True for multi-host.
+jit_initializers: True
+
 # Set true to load weights from pytorch
 from_pt: False
 split_head_dim: True
```
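For context on what this flag toggles: when `jit_initializers` is True, the train state is created inside a `jax.jit` call whose `out_shardings` place the weights directly on the mesh; when False, the initializer runs as plain Python and the result is placed with `jax.device_put` (see the max_utils.py hunk below). Here is a minimal standalone sketch of the two paths, not the repository's code: `init_state`, `shardings`, and the mesh are simplified stand-ins for `init_train_state_partial` and `state_mesh_shardings`.

```python
import jax
import jax.numpy as jnp
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

# Stand-in for init_train_state_partial: builds the initial parameter pytree.
def init_state():
    return {"w": jnp.ones((8, 8)), "b": jnp.zeros((8,))}

# Replicate every leaf across all local devices; a stand-in for the
# repository's state_mesh_shardings.
mesh = Mesh(jax.devices(), axis_names=("data",))
shardings = jax.tree_util.tree_map(
    lambda _: NamedSharding(mesh, P()), jax.eval_shape(init_state)
)

jit_initializers = True  # the new config flag

if jit_initializers:
    # Jitted path: outputs are materialized directly with the target sharding.
    state = jax.jit(init_state, in_shardings=None, out_shardings=shardings)()
else:
    # Eager path: plain Python that a debugger can step through on one host;
    # the result is placed onto devices afterwards.
    state = init_state()
    state = jax.device_put(state, shardings)
```

The eager branch keeps initialization easy to step through in a debugger, at the cost of first materializing the full state in host memory.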

src/maxdiffusion/configs/base21.yml

Lines changed: 4 additions & 0 deletions
```diff
@@ -37,6 +37,10 @@ activations_dtype: 'bfloat16'
 # at the cost of time.
 precision: "DEFAULT"

+# If False, the state is not jitted; it is initialized eagerly and placed with device_put instead. This is useful for debugging on a single host.
+# It must be True for multi-host.
+jit_initializers: True
+
 # Set true to load weights from pytorch
 from_pt: False
 split_head_dim: True
```

src/maxdiffusion/configs/base_2_base.yml

Lines changed: 4 additions & 0 deletions
```diff
@@ -38,6 +38,10 @@ activations_dtype: 'bfloat16'
 # at the cost of time.
 precision: "DEFAULT"

+# If False, the state is not jitted; it is initialized eagerly and placed with device_put instead. This is useful for debugging on a single host.
+# It must be True for multi-host.
+jit_initializers: True
+
 # Set true to load weights from pytorch
 from_pt: True
 split_head_dim: True
```

src/maxdiffusion/configs/base_flux_dev.yml

Lines changed: 4 additions & 0 deletions
```diff
@@ -51,6 +51,10 @@ activations_dtype: 'bfloat16'
 # at the cost of time.
 precision: "DEFAULT"

+# If False, the state is not jitted; it is initialized eagerly and placed with device_put instead. This is useful for debugging on a single host.
+# It must be True for multi-host.
+jit_initializers: True
+
 # Set true to load weights from pytorch
 from_pt: True
 split_head_dim: True
```

src/maxdiffusion/configs/base_flux_schnell.yml

Lines changed: 4 additions & 0 deletions
```diff
@@ -50,6 +50,10 @@ activations_dtype: 'bfloat16'
 # at the cost of time.
 precision: "DEFAULT"

+# If False, the state is not jitted; it is initialized eagerly and placed with device_put instead. This is useful for debugging on a single host.
+# It must be True for multi-host.
+jit_initializers: True
+
 # Set true to load weights from pytorch
 from_pt: True
 split_head_dim: True
```

src/maxdiffusion/configs/base_xl.yml

Lines changed: 4 additions & 0 deletions
```diff
@@ -38,6 +38,10 @@ activations_dtype: 'bfloat16'
 # at the cost of time.
 precision: "DEFAULT"

+# If False, the state is not jitted; it is initialized eagerly and placed with device_put instead. This is useful for debugging on a single host.
+# It must be True for multi-host.
+jit_initializers: True
+
 # Set true to load weights from pytorch
 from_pt: False
 split_head_dim: True
```

src/maxdiffusion/configs/base_xl_lightning.yml

Lines changed: 4 additions & 0 deletions
```diff
@@ -36,6 +36,10 @@ activations_dtype: 'bfloat16'
 # at the cost of time.
 precision: "DEFAULT"

+# If False, the state is not jitted; it is initialized eagerly and placed with device_put instead. This is useful for debugging on a single host.
+# It must be True for multi-host.
+jit_initializers: True
+
 # Set true to load weights from pytorch
 from_pt: False
 split_head_dim: True
```

src/maxdiffusion/max_utils.py

Lines changed: 13 additions & 5 deletions
```diff
@@ -413,11 +413,19 @@ def setup_initial_state(
       training=training,
       eval_only=False,
   )
-  state = jax.jit(
-      init_train_state_partial,
-      in_shardings=None,
-      out_shardings=state_mesh_shardings,
-  )()
+  if config.jit_initializers:
+    state = jax.jit(
+        init_train_state_partial,
+        in_shardings=None,
+        out_shardings=state_mesh_shardings,
+    )()
+  else:
+    state = init_train_state_partial()
+    if model_params:
+      state = state.replace(params=model_params)
+    state = jax.device_put(state, state_mesh_shardings)
+    if model_params:
+      state = state.replace(params=model_params)

   state = unbox_logicallypartioned_trainstate(state)
```
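A note on the design, as I read this hunk: in the eager branch, `model_params` (when provided) are spliced into the state before `jax.device_put`, so they get distributed along with the rest of the state according to `state_mesh_shardings`. Under `jax.jit` with `out_shardings`, by contrast, the initializer's outputs are materialized directly as sharded arrays across all participating processes, which is consistent with the config comment requiring `jit_initializers: True` for multi-host runs, where each host builds only its own shards rather than the full state.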
