Commit b2956a7

Formatting
Signed-off-by: Kunjan <kunjanp@google.com>
1 parent d76d5e8 commit b2956a7

4 files changed: 75 additions & 84 deletions

src/maxdiffusion/input_pipeline/_tfds_data_processing.py

Lines changed: 6 additions & 12 deletions
@@ -100,13 +100,8 @@ def prepare_sample(features):
    input_ids = tf.io.parse_tensor(features["input_ids"], out_type=tf.int32)
    prompt_embeds = tf.io.parse_tensor(features["prompt_embeds"], out_type=tf.float32)
    text_embeds = tf.io.parse_tensor(features["text_embeds"], out_type=tf.float32)
-
-    return {
-        "pixel_values": pixel_values,
-        "input_ids": input_ids,
-        "prompt_embeds": prompt_embeds,
-        "text_embeds": text_embeds
-    }
+
+    return {"pixel_values": pixel_values, "input_ids": input_ids, "prompt_embeds": prompt_embeds, "text_embeds": text_embeds}

  # This pipeline reads the sharded files and applies the parsing and preparation.
  filenames = tf.io.gfile.glob(os.path.join(config.train_data_dir, "*"))

@@ -125,6 +120,7 @@ def prepare_sample(features):
  train_iter = multihost_dataloading.MultiHostDataLoadIterator(train_ds, mesh)
  return train_iter

+
# TODO - https://github.com/google/array_record/blob/main/beam/examples/example_gcs_conversion.py
def make_tfrecord_iterator(
    config,

@@ -138,22 +134,20 @@ def make_tfrecord_iterator(
  maxdiffusion/pedagogical_examples/to_tfrecords.py
  """
  if config.cache_latents_text_encoder_outputs and os.path.isdir(config.dataset_save_location):
-    return make_cached_tfrecord_iterator(config, dataloading_host_index,
-                                         dataloading_host_count, mesh,
-                                         global_batch_size)
+    return make_cached_tfrecord_iterator(config, dataloading_host_index, dataloading_host_count, mesh, global_batch_size)
  feature_description = {
      "moments": tf.io.FixedLenFeature([], tf.string),
      "clip_embeddings": tf.io.FixedLenFeature([], tf.string),
  }
-
+
  def _parse_tfrecord_fn(example):
    return tf.io.parse_single_example(example, feature_description)

  def prepare_sample(features):
    moments = tf.io.parse_tensor(tnp.asarray(features["moments"]), out_type=tf.float32)
    clip_embeddings = tf.io.parse_tensor(tnp.asarray(features["clip_embeddings"]), out_type=tf.float32)
    return {"pixel_values": moments, "input_ids": clip_embeddings}
-
+
  filenames = tf.io.gfile.glob(os.path.join(config.train_data_dir, "*"))
  train_ds = (
      tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
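
The reader pattern these functions implement, deserializing tensors that were stored as string features, can be shown in isolation. Below is a minimal sketch using only standard TensorFlow APIs; the directory path and batch size are placeholders, and the multihost iterator wrapping is omitted, so this is not the maxdiffusion API itself.

import os

import tensorflow as tf

AUTOTUNE = tf.data.AUTOTUNE

# Each record stores tensors serialized into string features (see the diff above).
feature_description = {
    "moments": tf.io.FixedLenFeature([], tf.string),
    "clip_embeddings": tf.io.FixedLenFeature([], tf.string),
}


def parse_and_prepare(example):
  features = tf.io.parse_single_example(example, feature_description)
  # out_type must match the dtype that was serialized on the writer side.
  moments = tf.io.parse_tensor(features["moments"], out_type=tf.float32)
  clip_embeddings = tf.io.parse_tensor(features["clip_embeddings"], out_type=tf.float32)
  return {"pixel_values": moments, "input_ids": clip_embeddings}


filenames = tf.io.gfile.glob(os.path.join("/tmp/cached_tfrecords", "*"))  # placeholder dir
train_ds = (
    tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
    .map(parse_and_prepare, num_parallel_calls=AUTOTUNE)
    .batch(8, drop_remainder=True)  # placeholder batch size
    .repeat()
    .prefetch(AUTOTUNE)
)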

src/maxdiffusion/pedagogical_examples/dataset_tf_cache_to_tfrecord.py

Lines changed: 39 additions & 40 deletions
@@ -4,26 +4,29 @@
from datasets import load_from_disk
import numpy as np

+
def _bytes_feature(value):
  """Returns a bytes_list from a serialized tensor."""
  if not isinstance(value, tf.Tensor):
-      value = tf.constant(value)
+    value = tf.constant(value)
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.numpy()]))

+
def create_4_feature_example(record):
-    """Creates a tf.train.Example proto with all 4 pre-computed features."""
-    pixel_values = tf.io.serialize_tensor(record['pixel_values'])
-    input_ids = tf.io.serialize_tensor(record['input_ids'])
-    prompt_embeds = tf.io.serialize_tensor(record['prompt_embeds'])
-    text_embeds = tf.io.serialize_tensor(record['text_embeds'])
-
-    feature = {
-        "pixel_values": _bytes_feature(pixel_values),
-        "input_ids": _bytes_feature(input_ids),
-        "prompt_embeds": _bytes_feature(prompt_embeds),
-        "text_embeds": _bytes_feature(text_embeds)
-    }
-    return tf.train.Example(features=tf.train.Features(feature=feature))
+  """Creates a tf.train.Example proto with all 4 pre-computed features."""
+  pixel_values = tf.io.serialize_tensor(record["pixel_values"])
+  input_ids = tf.io.serialize_tensor(record["input_ids"])
+  prompt_embeds = tf.io.serialize_tensor(record["prompt_embeds"])
+  text_embeds = tf.io.serialize_tensor(record["text_embeds"])
+
+  feature = {
+      "pixel_values": _bytes_feature(pixel_values),
+      "input_ids": _bytes_feature(input_ids),
+      "prompt_embeds": _bytes_feature(prompt_embeds),
+      "text_embeds": _bytes_feature(text_embeds),
+  }
+  return tf.train.Example(features=tf.train.Features(feature=feature))
+

def run(args):
  """Main processing function."""

@@ -41,56 +44,52 @@ def run(args):
      tf.io.TFRecordWriter(os.path.join(tfrecords_dir, f"shard-{i:05d}-of-{num_shards:05d}.tfrecord"))
      for i in range(num_shards)
  ]
-
+
  print(f"Writing {len(processed_ds)} records into {num_shards} TFRecord shards...")
-
+
  for i, record in enumerate(processed_ds):
-      # Create a new record with explicit casting for float types
-      casted_record = {
-          "pixel_values": np.float32(record['pixel_values']),
-          "input_ids": record['input_ids'],  # This is already integer type
-          "prompt_embeds": np.float32(record['prompt_embeds']),
-          "text_embeds": np.float32(record['text_embeds'])
-      }
-
-      writer_index = i % num_shards
-      tf_example = create_4_feature_example(casted_record)
-      writers[writer_index].write(tf_example.SerializeToString())
+    # Create a new record with explicit casting for float types
+    casted_record = {
+        "pixel_values": np.float32(record["pixel_values"]),
+        "input_ids": record["input_ids"],  # This is already integer type
+        "prompt_embeds": np.float32(record["prompt_embeds"]),
+        "text_embeds": np.float32(record["text_embeds"]),
+    }
+
+    writer_index = i % num_shards
+    tf_example = create_4_feature_example(casted_record)
+    writers[writer_index].write(tf_example.SerializeToString())

  for writer in writers:
-      writer.close()
-
+    writer.close()
+
  print("TFRecord conversion complete.")


def main():
  """Parses command-line arguments and runs the conversion."""
-  parser = argparse.ArgumentParser(
-      description="Convert a cached Hugging Face dataset to sharded TFRecords."
-  )
+  parser = argparse.ArgumentParser(description="Convert a cached Hugging Face dataset to sharded TFRecords.")
  parser.add_argument(
      "--dataset_save_location",
      type=str,
      required=False,
      default="/tmp/pokemon-gpt4-captions_xl",
-      help="Path to the cached dataset created by the training pipeline."
+      help="Path to the cached dataset created by the training pipeline.",
  )
  parser.add_argument(
      "--tfrecords_dir",
      type=str,
      required=False,
      default="/tmp/cached_pokemon_tfrecords_sharded",
-      help="Output directory to save the sharded TFRecord files."
+      help="Output directory to save the sharded TFRecord files.",
  )
  parser.add_argument(
-      "--data_num_shards",
-      type=int,
-      default=128,
-      help="Number of shards to split the TFRecord dataset into."
+      "--data_num_shards", type=int, default=128, help="Number of shards to split the TFRecord dataset into."
  )
-
+
  args = parser.parse_args()
  run(args)

+
if __name__ == "__main__":
-    main()
+  main()
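
The script relies on tf.io.serialize_tensor on the writer side pairing exactly with tf.io.parse_tensor on the reader side. A minimal round-trip check of that pairing, with an illustrative tensor shape that is not taken from the real dataset:

import numpy as np
import tensorflow as tf


def _bytes_feature(value):
  """Wraps a serialized tensor in a bytes_list feature, as in the script above."""
  if not isinstance(value, tf.Tensor):
    value = tf.constant(value)
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.numpy()]))


original = np.float32(np.random.rand(2, 77, 2048))  # illustrative shape
serialized = tf.io.serialize_tensor(original)
example = tf.train.Example(
    features=tf.train.Features(feature={"prompt_embeds": _bytes_feature(serialized)})
)

# Reader side: parse the proto, then deserialize the inner tensor.
parsed = tf.io.parse_single_example(
    example.SerializeToString(),
    {"prompt_embeds": tf.io.FixedLenFeature([], tf.string)},
)
restored = tf.io.parse_tensor(parsed["prompt_embeds"], out_type=tf.float32)
assert np.allclose(original, restored.numpy())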

src/maxdiffusion/train_utils.py

Lines changed: 21 additions & 19 deletions
@@ -69,28 +69,31 @@ def record_scalar_metrics(metrics, step_time_delta, per_device_tflops, lr):
  metrics["scalar"].update({"perf/per_device_tflops_per_sec": per_device_tflops / step_time_delta.total_seconds()})
  metrics["scalar"].update({"learning/current_learning_rate": lr})

+
_metrics_queue = queue.Queue()
_buffered_step = None
_buffered_metrics = None

+
def _tensorboard_writer_worker(writer, config):
-    """
-    A worker function that runs in a separate thread.
-    It waits for metrics to appear in the queue and writes them to TensorBoard.
-    """
-    while True:
-        data = _metrics_queue.get()
-        if data is None:
-            break
-        metrics, step = data
-        if jax.process_index() == 0:
-            for metric_name in metrics.get("scalar", []):
-                writer.add_scalar(metric_name, np.array(metrics["scalar"][metric_name]), step)
-            for metric_name in metrics.get("scalars", []):
-                writer.add_scalars(metric_name, metrics["scalars"][metric_name], step)
-
-        if step % config.log_period == 0:
-            writer.flush()
+  """
+  A worker function that runs in a separate thread.
+  It waits for metrics to appear in the queue and writes them to TensorBoard.
+  """
+  while True:
+    data = _metrics_queue.get()
+    if data is None:
+      break
+    metrics, step = data
+    if jax.process_index() == 0:
+      for metric_name in metrics.get("scalar", []):
+        writer.add_scalar(metric_name, np.array(metrics["scalar"][metric_name]), step)
+      for metric_name in metrics.get("scalars", []):
+        writer.add_scalars(metric_name, metrics["scalars"][metric_name], step)
+
+    if step % config.log_period == 0:
+      writer.flush()
+

def write_metrics(writer, local_metrics_file, running_gcs_metrics, metrics, step, config):
  """Entry point for all metrics writing in Train's Main.

@@ -108,12 +111,11 @@ def write_metrics(writer, local_metrics_file, running_gcs_metrics, metrics, step, config):
  if _buffered_metrics is not None:
    if config.metrics_file:
      max_utils.write_metrics_locally(_buffered_metrics, _buffered_step, config, local_metrics_file)
-
+
    if _buffered_step is None:
      raise ValueError(f"When writing metrics, {_buffered_step=} was none")
    write_metrics_to_tensorboard(writer, _buffered_metrics, _buffered_step, config)

-
  if config.gcs_metrics and jax.process_index() == 0:
    running_gcs_metrics = max_utils.write_metrics_for_gcs(_buffered_metrics, _buffered_step, config, running_gcs_metrics)
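
The worker reformatted here is a standard sentinel-terminated producer/consumer loop: the training loop enqueues (metrics, step) tuples, the daemon thread drains the queue, and a None sentinel shuts the thread down so metrics writing never blocks a training step. A self-contained sketch of the same pattern, using a stand-in writer class rather than the real TensorBoard SummaryWriter:

import queue
import threading

metrics_queue = queue.Queue()


class PrintWriter:
  """Stand-in for the SummaryWriter used in train_utils."""

  def add_scalar(self, name, value, step):
    print(f"step={step} {name}={value}")

  def flush(self):
    pass


def writer_worker(writer, log_period=10):
  while True:
    data = metrics_queue.get()
    if data is None:  # Sentinel: the producer is done, exit the thread.
      break
    metrics, step = data
    for name, value in metrics.items():
      writer.add_scalar(name, value, step)
    if step % log_period == 0:
      writer.flush()


thread = threading.Thread(target=writer_worker, args=(PrintWriter(),), daemon=True)
thread.start()
for step in range(3):
  metrics_queue.put(({"learning/loss": 0.1 * step}, step))
metrics_queue.put(None)  # Ask the worker to finish, then wait for it.
thread.join()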

src/maxdiffusion/trainers/sdxl_trainer.py

Lines changed: 9 additions & 13 deletions
@@ -40,7 +40,7 @@
    load_next_batch,
    record_scalar_metrics,
    write_metrics,
-    _metrics_queue
+    _metrics_queue,
)

from maxdiffusion.checkpointing.base_stable_diffusion_checkpointer import (STABLE_DIFFUSION_XL_CHECKPOINT)

@@ -67,7 +67,7 @@ def get_shaped_batch(self, config, pipeline):
    total_train_batch_size = config.total_train_batch_size
    shaped_batch = {}

-    if self.config.dataset_type in ["tf","tfrecord"] and self.config.cache_latents_text_encoder_outputs:
+    if self.config.dataset_type in ["tf", "tfrecord"] and self.config.cache_latents_text_encoder_outputs:
      batch_image_shape = (
          total_train_batch_size,
          pipeline.unet.config.in_channels,

@@ -92,7 +92,7 @@ def get_shaped_batch(self, config, pipeline):

  def get_data_shardings(self):
    data_sharding = jax.sharding.NamedSharding(self.mesh, P(*self.config.data_sharding))
-    if self.config.dataset_type in ["tf","tfrecord"] and self.config.cache_latents_text_encoder_outputs:
+    if self.config.dataset_type in ["tf", "tfrecord"] and self.config.cache_latents_text_encoder_outputs:
      data_sharding = {
          "input_ids": data_sharding,
          "pixel_values": data_sharding,

@@ -188,11 +188,7 @@ def compile_train_step(self, pipeline, params, train_states, state_shardings, da
  def training_loop(self, p_train_step, pipeline, params, train_states, data_iterator, unet_learning_rate_scheduler):

    writer = max_utils.initialize_summary_writer(self.config)
-    writer_thread = threading.Thread(
-        target=_tensorboard_writer_worker,
-        args=(writer, self.config),
-        daemon=True
-    )
+    writer_thread = threading.Thread(target=_tensorboard_writer_worker, args=(writer, self.config), daemon=True)
    writer_thread.start()
    unet_state = train_states["unet_state"]
    vae_state = train_states["vae_state"]

@@ -228,14 +224,14 @@ def training_loop(self, p_train_step, pipeline, params, train_states, data_iterator, unet_learning_rate_scheduler):
    for step in np.arange(start_step, self.config.max_train_steps):
      if self.config.enable_profiler and step == first_profiling_step:
        max_utils.activate_profiler(self.config)
-
+
      next_batch_future = executor.submit(load_next_batch, data_iterator, example_batch, self.config)
      start_step_time = datetime.datetime.now()
      with jax.profiler.StepTraceAnnotation("train-new", step_num=step):
        (unet_state, train_metric, train_rngs) = p_train_step(
            unet_state, vae_state, text_encoder_state, text_encoder_2_state, example_batch, train_rngs
        )
-      train_metric['scalar']['learning/loss'].block_until_ready()
+      train_metric["scalar"]["learning/loss"].block_until_ready()
      samples_count = self.total_train_batch_size * (step + 1)
      last_step_completion = datetime.datetime.now()
      time_difference = last_step_completion - start_step_time

@@ -247,7 +243,7 @@ def training_loop(self, p_train_step, pipeline, params, train_states, data_iterator, unet_learning_rate_scheduler):
      if self.config.write_metrics:
        write_metrics(writer, local_metrics_file, running_gcs_metrics, train_metric, step, self.config)
      example_batch = next_batch_future.result()
-
+
      if step != 0 and self.config.checkpoint_every != -1 and samples_count % self.config.checkpoint_every == 0:
        train_states["unet_state"] = unet_state
        train_states["vae_state"] = vae_state

@@ -265,7 +261,7 @@ def training_loop(self, p_train_step, pipeline, params, train_states, data_iterator, unet_learning_rate_scheduler):
    _metrics_queue.put(None)
    writer_thread.join()
    if writer:
-        writer.flush()
+      writer.flush()
    train_states["unet_state"] = unet_state
    train_states["text_encoder_state"] = text_encoder_state
    train_states["text_encoder_2_state"] = text_encoder_2_state

@@ -369,5 +365,5 @@ def compute_loss(state_params):
    new_state = unet_state.apply_gradients(grads=grad["unet"])

    metrics = {"scalar": {"learning/loss": loss}, "scalars": {}}
-
+
    return new_state, metrics, new_train_rng
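
One pattern worth noting in this loop is the one-step batch prefetch: executor.submit(load_next_batch, ...) starts loading the next batch on the host while p_train_step runs on the accelerator, and next_batch_future.result() joins before the next iteration, hiding data-loading latency behind compute. A minimal sketch of that overlap with stand-in functions and illustrative timings:

import time
from concurrent.futures import ThreadPoolExecutor


def load_next_batch(step):
  """Stand-in for the host-side data loading."""
  time.sleep(0.05)  # pretend I/O
  return f"batch-{step}"


def train_step(batch):
  """Stand-in for the device-side p_train_step."""
  time.sleep(0.10)  # pretend compute
  return f"metrics for {batch}"


executor = ThreadPoolExecutor(max_workers=1)
example_batch = load_next_batch(0)
for step in range(3):
  # Kick off the next load; it overlaps with this step's compute.
  next_batch_future = executor.submit(load_next_batch, step + 1)
  metrics = train_step(example_batch)
  example_batch = next_batch_future.result()  # Join before the next step.
  print(step, metrics)
executor.shutdown()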
