|
17 | 17 | from abc import ABC |
18 | 18 | from contextlib import nullcontext |
19 | 19 | import functools |
| 20 | +import json |
| 21 | +import os |
20 | 22 | import jax |
21 | 23 | import jax.numpy as jnp |
22 | 24 | from jax.sharding import Mesh |
@@ -59,8 +61,10 @@ def __init__(self, config, checkpoint_type): |
59 | 61 | self.mesh = Mesh(self.devices_array, self.config.mesh_axes) |
60 | 62 | self.total_train_batch_size = self.config.total_train_batch_size |
61 | 63 |
|
| 64 | + checkpoint_dir = os.path.abspath(self.config.checkpoint_dir) |
| 65 | + |
62 | 66 | self.checkpoint_manager = create_orbax_checkpoint_manager( |
63 | | - self.config.checkpoint_dir, |
| 67 | + checkpoint_dir, |
64 | 68 | enable_checkpointing=True, |
65 | 69 | save_interval_steps=1, |
66 | 70 | checkpoint_type=checkpoint_type, |
@@ -117,7 +121,7 @@ def create_vae_state(self, pipeline, params, checkpoint_item_name, is_training=F |
117 | 121 | config=self.config, |
118 | 122 | mesh=self.mesh, |
119 | 123 | weights_init_fn=weights_init_fn, |
120 | | - model_params=params, |
| 124 | + model_params=params.get("flux_vae", None), |
121 | 125 | checkpoint_manager=self.checkpoint_manager, |
122 | 126 | checkpoint_item=checkpoint_item_name, |
123 | 127 | training=is_training, |
@@ -149,20 +153,35 @@ def save_checkpoint(self, train_step, pipeline, train_states): |
149 | 153 | def config_to_json(model_or_config): |
150 | 154 | return json.loads(model_or_config.to_json_string()) |
151 | 155 | items = { |
152 | | - "config": ocp.args.JsonSave({"model_name": self.config.model_name}), |
| 156 | + "flux_config": ocp.args.JsonSave(config_to_json(pipeline.flux)), |
| 157 | + "vae_config": ocp.args.JsonSave(config_to_json(pipeline.vae)), |
| 158 | + "scheduler_config": ocp.args.JsonSave(config_to_json(pipeline.scheduler)) |
153 | 159 | } |
154 | 160 |
|
155 | 161 | items[FLUX_STATE_KEY] = ocp.args.PyTreeSave(train_states[FLUX_STATE_KEY]) |
| 162 | + items["vae_state"] = ocp.args.PyTreeSave(train_states["vae_state"]) |
| 163 | + items["scheduler"] = ocp.args.PyTreeSave(train_states["scheduler"]) |
156 | 164 |
|
157 | 165 | self.checkpoint_manager.save(train_step, args=ocp.args.Composite(**items)) |
158 | 166 |
|
  def load_params(self, step=None):
    """Record that parameters are loaded via the Orbax checkpoint format.

    Sets `self.checkpoint_format` to `_CHECKPOINT_FORMAT_ORBAX`; no weights
    are actually read here.

    Args:
      step: unused in the visible implementation — NOTE(review): presumably
        kept for interface compatibility with other checkpointer classes;
        confirm against siblings/callers.
    """
    self.checkpoint_format = _CHECKPOINT_FORMAT_ORBAX
162 | 170 |
|
163 | | - def load_flux_configs_from_orbax(self): |
164 | | - # TODO - load configs from orbax |
165 | | - return None |
| 171 | + def load_flux_configs_from_orbax(self, step): |
| 172 | + max_logging.log("Restoring stable diffusion configs") |
| 173 | + if step is None: |
| 174 | + step = self.checkpoint_manager.latest_step() |
| 175 | + if step is None: |
| 176 | + return None |
| 177 | + |
| 178 | + restore_args = { |
| 179 | + "flux_config": ocp.args.JsonRestore(), |
| 180 | + "vae_config": ocp.args.JsonRestore(), |
| 181 | + "scheduler_config": ocp.args.JsonRestore(), |
| 182 | + } |
| 183 | + |
| 184 | + return (self.checkpoint_manager.restore(step, args=ocp.args.Composite(**restore_args)), None) |
166 | 185 |
|
167 | 186 | def load_diffusers_checkpoint(self): |
168 | 187 | flash_block_sizes = max_utils.get_flash_block_sizes(self.config) |
@@ -238,12 +257,65 @@ def load_diffusers_checkpoint(self): |
238 | 257 |
|
239 | 258 | def load_checkpoint(self, step=None, scheduler_class=None): |
240 | 259 |
|
241 | | - model_configs = self.load_flux_configs_from_orbax() |
| 260 | + model_configs = self.load_flux_configs_from_orbax(step) |
242 | 261 |
|
243 | 262 | pipeline, params = None, {} |
244 | 263 |
|
245 | 264 | if model_configs: |
246 | | - print("TODO - load configs from orbax") |
| 265 | + if jax.device_count() == jax.local_device_count(): |
| 266 | + context = jax.default_device(jax.devices("cpu")[0]) |
| 267 | + else: |
| 268 | + context = nullcontext() |
| 269 | + |
| 270 | + with context: |
| 271 | + clip_encoder = FlaxCLIPTextModel.from_pretrained( |
| 272 | + self.config.clip_model_name_or_path, dtype=self.config.weights_dtype |
| 273 | + ) |
| 274 | + clip_tokenizer = CLIPTokenizer.from_pretrained( |
| 275 | + self.config.clip_model_name_or_path, |
| 276 | + max_length=77, |
| 277 | + use_fast=True |
| 278 | + ) |
| 279 | + t5_encoder = FlaxT5EncoderModel.from_pretrained(self.config.t5xxl_model_name_or_path, dtype=self.config.weights_dtype) |
| 280 | + t5_tokenizer = AutoTokenizer.from_pretrained( |
| 281 | + self.config.t5xxl_model_name_or_path, |
| 282 | + max_length=self.config.max_sequence_length, |
| 283 | + use_fast=True |
| 284 | + ) |
| 285 | + |
| 286 | + vae = FlaxAutoencoderKL.from_config( |
| 287 | + model_configs[0]["vae_config"], |
| 288 | + dtype=self.config.activations_dtype, |
| 289 | + weights_dtype=self.config.weights_dtype, |
| 290 | + from_pt=self.config.from_pt, |
| 291 | + ) |
| 292 | + |
| 293 | + transformer = FluxTransformer2DModel.from_config( |
| 294 | + model_configs[0]["flux_config"], |
| 295 | + mesh=self.mesh, |
| 296 | + split_head_dim=self.config.split_head_dim, |
| 297 | + attention_kernel=self.config.attention, |
| 298 | + flash_block_sizes=max_utils.get_flash_block_sizes(self.config), |
| 299 | + dtype=self.config.activations_dtype, |
| 300 | + weights_dtype=self.config.weights_dtype, |
| 301 | + precision=max_utils.get_precision(self.config), |
| 302 | + from_pt=self.config.from_pt, |
| 303 | + ) |
| 304 | + |
| 305 | + pipeline = FluxPipeline( |
| 306 | + t5_encoder, |
| 307 | + clip_encoder, |
| 308 | + vae, |
| 309 | + t5_tokenizer, |
| 310 | + clip_tokenizer, |
| 311 | + transformer, |
| 312 | + None, |
| 313 | + dtype=self.config.activations_dtype, |
| 314 | + mesh=self.mesh, |
| 315 | + config=self.config, |
| 316 | + rng=self.rng |
| 317 | + ) |
| 318 | + |
247 | 319 | else: |
248 | 320 | pipeline, params = self.load_diffusers_checkpoint() |
249 | 321 |
|
|
0 commit comments