Skip to content

Commit 6f536d8

Browse files
tf processing on cpu
1 parent 296e956 commit 6f536d8

3 files changed

Lines changed: 29 additions & 25 deletions

File tree

src/maxdiffusion/max_utils.py

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -404,21 +404,18 @@ def setup_initial_state(
404404
state = state[checkpoint_item]
405405
if not state:
406406
max_logging.log(f"Could not find the item in orbax, creating state...")
407-
init_train_state_partial = functools.partial(
408-
init_train_state,
409-
model=model,
410-
tx=tx,
411-
weights_init_fn=weights_init_fn,
412-
params=model_params,
413-
training=training,
414-
eval_only=False,
407+
state = init_train_state(
408+
model=model,
409+
tx=tx,
410+
weights_init_fn=weights_init_fn,
411+
params=model_params,
412+
training=training,
413+
eval_only=False
415414
)
415+
if model_params:
416+
state = state.replace(params=model_params)
416417

417-
state = jax.jit(
418-
init_train_state_partial,
419-
in_shardings=None,
420-
out_shardings=state_mesh_shardings,
421-
)()
418+
state = jax.device_put(state, state_mesh_shardings)
422419

423420
state = unbox_logicallypartioned_trainstate(state)
424421

src/maxdiffusion/train_sdxl.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,4 +46,10 @@ def main(argv: Sequence[str]) -> None:
4646

4747

4848
if __name__ == "__main__":
49+
import os
50+
import tensorflow as tf
51+
import torch
52+
tf.config.set_visible_devices([], "GPU")
53+
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0"
54+
torch.set_default_device("cpu")
4955
app.run(main)

src/maxdiffusion/trainers/base_stable_diffusion_trainer.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -83,18 +83,6 @@ def start_training(self):
8383
# create train states
8484
train_states = {}
8585
state_shardings = {}
86-
unet_state, unet_state_mesh_shardings, unet_learning_rate_scheduler = self.create_unet_state(
87-
# ambiguous here, but if self.params.get("unet") doesn't exist
88-
# Then its 1 of 2 scenarios:
89-
# 1. unet state will be loaded directly from orbax
90-
# 2. a new unet is being trained from scratch.
91-
pipeline=pipeline,
92-
params=params,
93-
checkpoint_item_name="unet_state",
94-
is_training=True,
95-
)
96-
train_states["unet_state"] = unet_state
97-
state_shardings["unet_state_shardings"] = unet_state_mesh_shardings
9886
vae_state, vae_state_mesh_shardings = self.create_vae_state(
9987
pipeline=pipeline, params=params, checkpoint_item_name="vae_state", is_training=False
10088
)
@@ -131,6 +119,19 @@ def start_training(self):
131119
if self.config.dataset_type == "grain":
132120
data_iterator = self.restore_data_iterator_state(data_iterator)
133121

122+
unet_state, unet_state_mesh_shardings, unet_learning_rate_scheduler = self.create_unet_state(
123+
# ambiguous here, but if self.params.get("unet") doesn't exist,
124+
# Then it's 1 of 2 scenarios:
125+
# 1. unet state will be loaded directly from orbax
126+
# 2. a new unet is being trained from scratch.
127+
pipeline=pipeline,
128+
params=params,
129+
checkpoint_item_name="unet_state",
130+
is_training=True,
131+
)
132+
train_states["unet_state"] = unet_state
133+
state_shardings["unet_state_shardings"] = unet_state_mesh_shardings
134+
134135
data_shardings = self.get_data_shardings()
135136
# Compile train_step
136137
p_train_step = self.compile_train_step(pipeline, params, train_states, state_shardings, data_shardings)

0 commit comments

Comments
 (0)