Skip to content

Commit d392bf4

Browse files
coolkphx89
authored and committed
Optimize batch loading and metrics writing, replace PositionalSharding with NamedSharding (#186)
* fix profiling * Use torch cpu, async write to tensorboard, script to convert latents to tfrecord, batch iterator for tfrecord cached, namedsharding instead of positional sharding Signed-off-by: Kunjan <kunjanp@google.com> * Replace positional sharding with named sharding Signed-off-by: Kunjan <kunjanp@google.com> * Formatting Signed-off-by: Kunjan <kunjanp@google.com> * Formatting Signed-off-by: Kunjan <kunjanp@google.com> * Fallback to regular tfrecord iterator for datasets without all the processed features Signed-off-by: Kunjan <kunjanp@google.com> * README update --------- Signed-off-by: Kunjan <kunjanp@google.com>
1 parent eaef265 commit d392bf4

2 files changed

Lines changed: 56 additions & 15 deletions

File tree

src/maxdiffusion/input_pipeline/_tfds_data_processing.py

Lines changed: 52 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,53 @@ def make_tf_iterator(
7878
train_iter = multihost_dataloading.MultiHostDataLoadIterator(train_ds, mesh)
7979
return train_iter
8080

81+
def make_cached_tfrecord_iterator(
    config,
    dataloading_host_index,
    dataloading_host_count,
    mesh,
    global_batch_size,
):
  """Build a multi-host iterator over fully pre-processed TFRecords.

  Each record is expected to carry all four serialized tensors produced by
  the caching step: pixel_values (latents), input_ids, prompt_embeds, and
  text_embeds.

  Args:
    config: run config; only `train_data_dir` is read here.
    dataloading_host_index: index of this host among the data-loading hosts.
    dataloading_host_count: total number of data-loading hosts.
    mesh: JAX device mesh the batches will be sharded over.
    global_batch_size: batch size across all hosts; each host reads
      `global_batch_size // dataloading_host_count` examples per step.

  Returns:
    A MultiHostDataLoadIterator yielding per-host batches.
  """
  # One serialized-bytes feature per key; the same spec object can be shared.
  serialized_bytes = tf.io.FixedLenFeature([], tf.string)
  feature_names = ("pixel_values", "input_ids", "prompt_embeds", "text_embeds")
  schema = {name: serialized_bytes for name in feature_names}

  # Target dtype for deserializing each feature back into a dense tensor.
  decode_dtypes = {
      "pixel_values": tf.float32,
      "input_ids": tf.int32,
      "prompt_embeds": tf.float32,
      "text_embeds": tf.float32,
  }

  def _decode_record(record):
    # Parse the example, then turn every serialized tensor back into its
    # dense form in a single pass.
    parsed = tf.io.parse_single_example(record, schema)
    return {
        name: tf.io.parse_tensor(parsed[name], out_type=dtype)
        for name, dtype in decode_dtypes.items()
    }

  # Every file under the training directory is treated as a record shard.
  shard_paths = tf.io.gfile.glob(os.path.join(config.train_data_dir, "*"))

  dataset = tf.data.TFRecordDataset(shard_paths, num_parallel_reads=AUTOTUNE)
  dataset = dataset.shard(num_shards=dataloading_host_count, index=dataloading_host_index)
  dataset = dataset.map(_decode_record, num_parallel_calls=AUTOTUNE)
  dataset = dataset.shuffle(global_batch_size * 10)
  dataset = dataset.batch(global_batch_size // dataloading_host_count, drop_remainder=True)
  dataset = dataset.repeat(-1)
  dataset = dataset.prefetch(AUTOTUNE)

  # Wrap the tf.data.Dataset for the multi-host JAX environment.
  return multihost_dataloading.MultiHostDataLoadIterator(dataset, mesh)
127+
81128

82129
def make_cached_tfrecord_iterator(
83130
config, dataloading_host_index, dataloading_host_count, mesh, global_batch_size, feature_description, prepare_sample_fn
@@ -120,22 +167,12 @@ def make_tfrecord_iterator(
120167

121168
# set load_tfrecord_cached to True in config to use pre-processed tfrecord dataset.
122169
# pedagogical_examples/dataset_tf_cache_to_tfrecord.py to convert tf preprocessed dataset to tfrecord.
123-
# Dataset cache in github runner test doesn't contain all the features since its shared, Use the default tfrecord iterator.
124-
if (
125-
config.cache_latents_text_encoder_outputs
170+
# Dataset cache in the GitHub runner test doesn't contain all the features since it's shared; use the default tfrecord iterator.
171+
if (config.cache_latents_text_encoder_outputs
126172
and os.path.isdir(config.dataset_save_location)
127-
and "load_tfrecord_cached" in config.get_keys()
128-
and config.load_tfrecord_cached
129-
):
130-
return make_cached_tfrecord_iterator(
131-
config,
132-
dataloading_host_index,
133-
dataloading_host_count,
134-
mesh,
135-
global_batch_size,
136-
feature_description,
137-
prepare_sample_fn,
138-
)
173+
and 'load_tfrecord_cached'in config.get_keys()
174+
and config.load_tfrecord_cached):
175+
return make_cached_tfrecord_iterator(config, dataloading_host_index, dataloading_host_count, mesh, global_batch_size)
139176

140177
feature_description = {
141178
"moments": tf.io.FixedLenFeature([], tf.string),

src/maxdiffusion/pipelines/wan/wan_pipeline.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,11 @@ def load_vae(cls, devices_array: np.array, mesh: Mesh, rngs: nnx.Rngs, config: H
198198
# This replaces random params with the model.
199199
params = load_wan_vae(config.pretrained_model_name_or_path, params, "cpu")
200200
params = jax.tree_util.tree_map(lambda x: x.astype(config.weights_dtype), params)
201+
<<<<<<< HEAD
201202
params = jax.device_put(params, NamedSharding(mesh, P()))
203+
=======
204+
params = jax.device_put(params, NamedSharding(devices_array, P()))
205+
>>>>>>> f344ab0 (Optimize batch loading and metrics writing, replace PositionalSharding with NamedSharding (#186))
202206
wan_vae = nnx.merge(graphdef, params)
203207
p_create_sharded_logical_model = partial(create_sharded_logical_model, logical_axis_rules=config.logical_axis_rules)
204208
# Shard

0 commit comments

Comments
 (0)