Skip to content

Commit 2228609

Browse files
authored
Fix device put error in unet state and text encoder 2 state (#155)
Signed-off-by: kunjan patel <kunjanp@google.com>
1 parent 296e956 commit 2228609

3 files changed

Lines changed: 6 additions & 11 deletions

File tree

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ tensorflow-datasets>=4.9.6
2424
ruff>=0.1.5,<=0.2
2525
git+https://github.com/mlperf/logging.git
2626
opencv-python-headless==4.10.0.84
27-
orbax-checkpoint==0.10.2
27+
orbax-checkpoint==0.10.3
2828
tokenizers==0.21.0
2929
huggingface_hub==0.24.7
3030
transformers==4.48.1

requirements_with_jax_stable_stack.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ jaxlib>=0.4.30
1414
Jinja2
1515
opencv-python-headless==4.10.0.84
1616
optax>=0.2.3
17-
orbax-checkpoint==0.10.2
17+
orbax-checkpoint==0.10.3
1818
parameterized
1919
Pillow
2020
pyink

src/maxdiffusion/checkpointing/base_stable_diffusion_checkpointer.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -88,14 +88,11 @@ def create_unet_state(self, pipeline, params, checkpoint_item_name, is_training)
8888
config=self.config,
8989
mesh=self.mesh,
9090
weights_init_fn=weights_init_fn,
91-
model_params=None,
91+
model_params=None if self.config.train_new_unet else params.get("unet", None),
9292
checkpoint_manager=self.checkpoint_manager,
9393
checkpoint_item=checkpoint_item_name,
9494
training=is_training,
9595
)
96-
if not self.config.train_new_unet:
97-
unet_state = unet_state.replace(params=params.get("unet", None))
98-
unet_state = jax.device_put(unet_state, state_mesh_shardings)
9996
return unet_state, state_mesh_shardings, learning_rate_scheduler
10097

10198
def create_vae_state(self, pipeline, params, checkpoint_item_name, is_training=False):
@@ -153,20 +150,18 @@ def create_text_encoder_2_state(self, pipeline, params, checkpoint_item_name, is
153150
input_shape=(self.total_train_batch_size, pipeline.tokenizer.model_max_length),
154151
)
155152

156-
state, state_mesh_shardings = max_utils.setup_initial_state(
153+
# state, state_mesh_shardings =
154+
return max_utils.setup_initial_state(
157155
model=pipeline.text_encoder_2,
158156
tx=tx,
159157
config=self.config,
160158
mesh=self.mesh,
161159
weights_init_fn=weights_init_fn,
162-
model_params=None,
160+
model_params=params.get("text_encoder_2", None),
163161
checkpoint_manager=self.checkpoint_manager,
164162
checkpoint_item=checkpoint_item_name,
165163
training=is_training,
166164
)
167-
state = state.replace(params=params.get("text_encoder_2", None))
168-
state = jax.device_put(state, state_mesh_shardings)
169-
return state, state_mesh_shardings
170165

171166
def restore_data_iterator_state(self, data_iterator):
172167
if (

0 commit comments

Comments (0)