@@ -78,105 +78,49 @@ def make_tf_iterator(
   train_iter = multihost_dataloading.MultiHostDataLoadIterator(train_ds, mesh)
   return train_iter
 
-def make_cached_tfrecord_iterator(
-    dataloading_host_index, dataloading_host_count, mesh, global_batch_size, feature_description, prepare_sample_fn, dataset_path, is_training: bool
-):
-  """
-  New iterator for TFRecords that contain the full 4 pre-computed latents and embeddings:
-  latents, input_ids, prompt_embeds, and text_embeds.
-  """
-
-  def _parse_tfrecord_fn(example):
-    return tf.io.parse_single_example(example, feature_description)
-
-  # This pipeline reads the sharded files and applies the parsing and preparation.
-  filenames = tf.io.gfile.glob(os.path.join(dataset_path, "*"))
-  ds = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
-
-  # --- PADDING LOGIC FOR EVALUATION ---
-  if not is_training:
-    num_eval_samples = 0
-    for _ in ds:
-      num_eval_samples += 1
-
-    remainder = num_eval_samples % global_batch_size
-    if remainder != 0:
-      num_to_pad = global_batch_size - remainder
-      # Create a dataset of padding samples from the beginning
-      padding_ds = ds.take(num_to_pad)
-      # Add the padding samples to the end
-      ds = ds.concatenate(padding_ds)
-      print(f"Padded evaluation dataset with {num_to_pad} samples.")
-
-  ds = (
-      ds.shard(num_shards=dataloading_host_count, index=dataloading_host_index)
-      .map(_parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
-      .map(prepare_sample_fn, num_parallel_calls=AUTOTUNE)
-  )
-  if is_training:
-    ds = (
-        ds.shuffle(global_batch_size * 10)
-        .batch(global_batch_size // dataloading_host_count, drop_remainder=True)
-        .repeat(-1)
-        .prefetch(AUTOTUNE)
-    )
-  # For Evaluation
-  else:
-    ds = (
-        ds.batch(global_batch_size // dataloading_host_count, drop_remainder=False)
-        .prefetch(AUTOTUNE)
-    )
-
-  # This wraps the tf.data.Dataset for use in the multi-host JAX environment.
-  iter = multihost_dataloading.MultiHostDataLoadIterator(ds, mesh)
-  return iter
-
-
 # TODO - https://github.com/google/array_record/blob/main/beam/examples/example_gcs_conversion.py
 def _make_tfrecord_iterator(
-    config, dataloading_host_index, dataloading_host_count, mesh, global_batch_size, feature_description, prepare_sample_fn, dataset_path, is_training: bool
+    config, dataloading_host_index, dataloading_host_count, mesh, global_batch_size, feature_description_fn, prepare_sample_fn, dataset_path, is_training: bool
 ):
   # set load_tfrecord_cached to True in config to use a pre-processed tfrecord dataset.
   # Use pedagogical_examples/dataset_tf_cache_to_tfrecord.py to convert a tf preprocessed dataset to tfrecord.
   # The dataset cache in the GitHub runner test doesn't contain all the features since it's shared; use the default tfrecord iterator.
+  # if is_training is True, loads the training dataset. If False, loads the evaluation dataset.
 
   # checks that the dataset path is valid. In the case of GCS, the existence of the dir is not checked.
-  # if is_training is True, loads the training dataset. If False, loads the evaluation dataset.
   is_dataset_dir_valid = "gs://" in config.dataset_save_location or os.path.isdir(config.dataset_save_location)
 
-  if (
-      config.cache_latents_text_encoder_outputs
-      and is_dataset_dir_valid
-      and "load_tfrecord_cached" in config.get_keys()
-      and config.load_tfrecord_cached
-  ):
-    return make_cached_tfrecord_iterator(
-        dataloading_host_index,
-        dataloading_host_count,
-        mesh,
-        global_batch_size,
-        feature_description,
-        prepare_sample_fn,
-        dataset_path,
-        is_training
-    )
+  # Determine whether to use the "cached" dataset, which requires externally
+  # provided parsing functions, or the default one with its internal parsing logic.
+  make_cached_tfrecord_iterator = (
+      config.cache_latents_text_encoder_outputs
+      and is_dataset_dir_valid
+      and "load_tfrecord_cached" in config.get_keys()
+      and config.load_tfrecord_cached
+  )
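+  # The cached records already hold pre-computed latents and embeddings (latents, input_ids,
+  # prompt_embeds, text_embeds), so on that path the parsing logic is supplied by the caller.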
 
   feature_description = {
       "moments": tf.io.FixedLenFeature([], tf.string),
       "clip_embeddings": tf.io.FixedLenFeature([], tf.string),
   }
 
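+  # Pick the feature spec: caller-supplied for cached records, the default above otherwise.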
+  used_feature_description = feature_description_fn if make_cached_tfrecord_iterator else feature_description
+
   def _parse_tfrecord_fn(example):
-    return tf.io.parse_single_example(example, feature_description)
+    return tf.io.parse_single_example(example, used_feature_description)
 
   def prepare_sample(features):
     moments = tf.io.parse_tensor(tnp.asarray(features["moments"]), out_type=tf.float32)
     clip_embeddings = tf.io.parse_tensor(tnp.asarray(features["clip_embeddings"]), out_type=tf.float32)
     return {"pixel_values": moments, "input_ids": clip_embeddings}
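+  # Inverse of the caching step: each field was presumably written via tf.io.serialize_tensor, so parse_tensor restores it.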
 
   filenames = tf.io.gfile.glob(os.path.join(dataset_path, "*"))
-
   ds = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
+
   # --- PADDING LOGIC FOR EVALUATION ---
   if not is_training:
     num_eval_samples = 0
@@ -191,11 +135,13 @@ def prepare_sample(features):
       # Add the padding samples to the end
       ds = ds.concatenate(padding_ds)
       print(f"Padded evaluation dataset with {num_to_pad} samples.")
-
+
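+  # Pick the sample-prep fn the same way: caller-supplied for cached records, local prepare_sample otherwise.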
+  used_prepare_sample = prepare_sample_fn if make_cached_tfrecord_iterator else prepare_sample
   ds = (
       ds.shard(num_shards=dataloading_host_count, index=dataloading_host_index)
       .map(_parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
-      .map(prepare_sample, num_parallel_calls=AUTOTUNE)
+      .map(used_prepare_sample, num_parallel_calls=AUTOTUNE)
   )
   if is_training:
     ds = (
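
For reference, here is a standalone sketch of the serialize/parse/prepare round trip behind the default path above. The "moments"/"clip_embeddings" keys and the "pixel_values"/"input_ids" outputs come from the diff; the tensor shapes, and the assumption that the cache-conversion script wrote each field with tf.io.serialize_tensor, are illustrative and not verified against the repo.

import tensorflow as tf

def _bytes_feature(t):
  # Serialize a tensor into a bytes feature so FixedLenFeature([], tf.string) can read it back.
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[tf.io.serialize_tensor(t).numpy()]))

# Build one fake cached record (shapes are made up for illustration).
example = tf.train.Example(
    features=tf.train.Features(
        feature={
            "moments": _bytes_feature(tf.zeros([4, 32, 32], tf.float32)),
            "clip_embeddings": _bytes_feature(tf.zeros([77, 768], tf.float32)),
        }
    )
).SerializeToString()

# The same two stages the iterator maps over the TFRecordDataset.
feature_description = {
    "moments": tf.io.FixedLenFeature([], tf.string),
    "clip_embeddings": tf.io.FixedLenFeature([], tf.string),
}
parsed = tf.io.parse_single_example(example, feature_description)
sample = {
    "pixel_values": tf.io.parse_tensor(parsed["moments"], out_type=tf.float32),
    "input_ids": tf.io.parse_tensor(parsed["clip_embeddings"], out_type=tf.float32),
}
print(sample["pixel_values"].shape, sample["input_ids"].shape)  # (4, 32, 32) (77, 768)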