
Commit 727fdcb

coolkphx89 authored and committed
Rebase, Optimize batch loading and metrics writing, replace PositionalSharding with NamedSharding (#186)
* fix profiling
* Use torch cpu, async write to tensorboard, script to convert latents to tfrecord, batch iterator for tfrecord cached, namedsharding instead of positional sharding
* Replace positional sharding with named sharding
* Formatting
* Formatting
* Fallback to regular tfrecord iterator for datasets without all the processed features
* README update

Signed-off-by: Kunjan <kunjanp@google.com>
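The sharding change mentioned above replaces jax.sharding.PositionalSharding with jax.sharding.NamedSharding. A minimal sketch of the named-sharding style, not taken from this diff (the mesh axis name "data" and the array shape are illustrative assumptions):

import jax
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

# Illustrative 1-D mesh over all available devices; the axis name "data" is an assumption.
mesh = Mesh(np.array(jax.devices()), axis_names=("data",))

# NamedSharding maps array dimensions to mesh axes by name instead of by device position.
sharding = NamedSharding(mesh, P("data", None))

# Place a host array on the mesh; dim 0 must be divisible by the number of devices.
batch = jax.device_put(np.zeros((8, 128), dtype=np.float32), sharding)
print(batch.sharding)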
1 parent ce72ef9 commit 727fdcb

1 file changed

Lines changed: 49 additions & 0 deletions

File tree

src/maxdiffusion/input_pipeline/_tfds_data_processing.py

@@ -126,6 +126,55 @@ def prepare_sample(features):
   return train_iter


+# TODO - https://github.com/google/array_record/blob/main/beam/examples/example_gcs_conversion.py
+def make_tfrecord_iterator(
+    config,
+    dataloading_host_index,
+    dataloading_host_count,
+    mesh,
+    global_batch_size,
+):
+  """
+  New iterator for TFRecords that contain the full 4 pre-computed latents and embeddings:
+  latents, input_ids, prompt_embeds, and text_embeds.
+  """
+  feature_description = {
+      "pixel_values": tf.io.FixedLenFeature([], tf.string),
+      "input_ids": tf.io.FixedLenFeature([], tf.string),
+      "prompt_embeds": tf.io.FixedLenFeature([], tf.string),
+      "text_embeds": tf.io.FixedLenFeature([], tf.string),
+  }
+
+  def _parse_tfrecord_fn(example):
+    return tf.io.parse_single_example(example, feature_description)
+
+  def prepare_sample(features):
+    pixel_values = tf.io.parse_tensor(features["pixel_values"], out_type=tf.float32)
+    input_ids = tf.io.parse_tensor(features["input_ids"], out_type=tf.int32)
+    prompt_embeds = tf.io.parse_tensor(features["prompt_embeds"], out_type=tf.float32)
+    text_embeds = tf.io.parse_tensor(features["text_embeds"], out_type=tf.float32)
+
+    return {"pixel_values": pixel_values, "input_ids": input_ids, "prompt_embeds": prompt_embeds, "text_embeds": text_embeds}
+
+  # This pipeline reads the sharded files and applies the parsing and preparation.
+  filenames = tf.io.gfile.glob(os.path.join(config.train_data_dir, "*"))
+
+  train_ds = (
+      tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
+      .shard(num_shards=dataloading_host_count, index=dataloading_host_index)
+      .map(_parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
+      .map(prepare_sample, num_parallel_calls=AUTOTUNE)
+      .shuffle(global_batch_size * 10)
+      .batch(global_batch_size // dataloading_host_count, drop_remainder=True)
+      .repeat(-1)
+      .prefetch(AUTOTUNE)
+  )
+
+  # This wraps the tf.data.Dataset for use in the multi-host JAX environment.
+  train_iter = multihost_dataloading.MultiHostDataLoadIterator(train_ds, mesh)
+  return train_iter
+
+
 def make_cached_tfrecord_iterator(
     config, dataloading_host_index, dataloading_host_count, mesh, global_batch_size, feature_description, prepare_sample_fn
 ):
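For reference, the new iterator expects each record to hold serialized tensors under the four keys parsed above. A minimal sketch of writing one compatible record (the tensor shapes and output path are placeholder assumptions, not taken from this commit):

import tensorflow as tf

def _tensor_bytes_feature(t):
  # Mirrors the tf.io.parse_tensor calls above: store each tensor as a serialized bytes feature.
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[tf.io.serialize_tensor(t).numpy()]))

example = tf.train.Example(
    features=tf.train.Features(
        feature={
            "pixel_values": _tensor_bytes_feature(tf.zeros([4, 128, 128], tf.float32)),
            "input_ids": _tensor_bytes_feature(tf.zeros([77], tf.int32)),
            "prompt_embeds": _tensor_bytes_feature(tf.zeros([77, 2048], tf.float32)),
            "text_embeds": _tensor_bytes_feature(tf.zeros([1280], tf.float32)),
        }
    )
)

with tf.io.TFRecordWriter("/tmp/sample.tfrecord") as writer:
  writer.write(example.SerializeToString())

With records written this way, calling make_tfrecord_iterator(config, jax.process_index(), jax.process_count(), mesh, global_batch_size) should yield per-host batches as dictionaries with those four keys.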
