Skip to content

Commit afc8882

Browse files
author
Juan Acevedo
committed
training pipeline with image dataset.
1 parent 1536f42 commit afc8882

7 files changed

Lines changed: 162 additions & 68 deletions

File tree

src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -141,14 +141,15 @@ ici_tensor_parallelism: 1
141141
# Replace with dataset path or train_data_dir. One has to be set.
142142
dataset_name: 'diffusers/pokemon-gpt4-captions'
143143
train_split: 'train'
144-
dataset_type: 'tf'
144+
dataset_type: 'tfrecord'
145145
cache_latents_text_encoder_outputs: True
146146
# cache_latents_text_encoder_outputs only applies to dataset_type="tf",
147147
# only apply to small dataset that fits in memory
148148
# prepare image latents and text encoder outputs
149149
# Reduce memory consumption and reduce step time during training
150150
# transformed dataset is saved at dataset_save_location
151-
dataset_save_location: '/tmp/pokemon-gpt4-captions_xl'
151+
dataset_save_location: ''
152+
load_tfrecord_cached: True
152153
train_data_dir: ''
153154
dataset_config_name: ''
154155
jax_cache_dir: ''

src/maxdiffusion/data_preprocessing/wan_txt2vid_data_preprocessing.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -112,25 +112,26 @@ def generate_dataset(config, pipeline):
112112
for i in range(0, len(ds), batch_size):
113113
rng, new_rng = jax.random.split(rng)
114114
text = ds[i:i+batch_size]['text']
115-
video = ds[i:i+batch_size]['image']
115+
videos = ds[i:i+batch_size]['image']
116116

117-
video = [np.expand_dims(np.array(i), axis=0) for i in video]
118-
video = video_processor.preprocess_video(video, height=config.height, width=config.width)
119-
video = jnp.array(np.array(video), dtype=config.weights_dtype)
117+
videos = [video_processor.preprocess_video([video], height=config.height, width=config.width) for video in videos]
118+
video = jnp.array(np.squeeze(np.array(videos), axis=1), dtype=config.weights_dtype)
120119
with mesh:
121120
latents = p_vae_encode(video=video, rng=new_rng)
121+
latents = jnp.transpose(latents, (0, 4, 1, 2, 3))
122122
encoder_hidden_states = text_encode(pipeline, text)
123-
example = create_example(latents, encoder_hidden_states)
124-
writer.write(example)
125-
shard_record_count += batch_size
126-
global_record_count += batch_size
123+
for latent, encoder_hidden_state in zip(latents, encoder_hidden_states):
124+
writer.write(create_example(latent, encoder_hidden_state))
125+
shard_record_count += 1
126+
global_record_count += 1
127+
127128
if shard_record_count >= no_records_per_shard:
128129
writer.close()
130+
tf_rec_num +=1
129131
writer = tf.io.TFRecordWriter(
130132
tfrecords_dir + "/file_%.2i-%i.tfrec" % (tf_rec_num, (global_record_count + no_records_per_shard))
131133
)
132134
shard_record_count = 0
133-
tf_rec_num +=1
134135

135136

136137

src/maxdiffusion/generate_wan.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,10 @@
2121
from maxdiffusion.utils import export_to_video
2222

2323

24-
def run(config):
24+
def run(config, pipeline=None, filename_prefix=''):
2525
print("seed: ", config.seed)
26-
pipeline = WanPipeline.from_pretrained(config)
26+
if pipeline is None:
27+
pipeline = WanPipeline.from_pretrained(config)
2728
s0 = time.perf_counter()
2829

2930
# Skip layer guidance
@@ -59,7 +60,7 @@ def run(config):
5960

6061
print("compile time: ", (time.perf_counter() - s0))
6162
for i in range(len(videos)):
62-
export_to_video(videos[i], f"wan_output_{config.seed}_{i}.mp4", fps=config.fps)
63+
export_to_video(videos[i], f"{filename_prefix}wan_output_{config.seed}_{i}.mp4", fps=config.fps)
6364
s0 = time.perf_counter()
6465
videos = pipeline(
6566
prompt=prompt,

src/maxdiffusion/input_pipeline/_tfds_data_processing.py

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -79,37 +79,25 @@ def make_cached_tfrecord_iterator(
7979
dataloading_host_count,
8080
mesh,
8181
global_batch_size,
82+
feature_description,
83+
prepare_sample_fn
8284
):
8385
"""
8486
New iterator for TFRecords that contain the full 4 pre-computed latents and embeddings:
8587
latents, input_ids, prompt_embeds, and text_embeds.
8688
"""
87-
feature_description = {
88-
"pixel_values": tf.io.FixedLenFeature([], tf.string),
89-
"input_ids": tf.io.FixedLenFeature([], tf.string),
90-
"prompt_embeds": tf.io.FixedLenFeature([], tf.string),
91-
"text_embeds": tf.io.FixedLenFeature([], tf.string),
92-
}
9389

9490
def _parse_tfrecord_fn(example):
9591
return tf.io.parse_single_example(example, feature_description)
9692

97-
def prepare_sample(features):
98-
pixel_values = tf.io.parse_tensor(features["pixel_values"], out_type=tf.float32)
99-
input_ids = tf.io.parse_tensor(features["input_ids"], out_type=tf.int32)
100-
prompt_embeds = tf.io.parse_tensor(features["prompt_embeds"], out_type=tf.float32)
101-
text_embeds = tf.io.parse_tensor(features["text_embeds"], out_type=tf.float32)
102-
103-
return {"pixel_values": pixel_values, "input_ids": input_ids, "prompt_embeds": prompt_embeds, "text_embeds": text_embeds}
104-
10593
# This pipeline reads the sharded files and applies the parsing and preparation.
10694
filenames = tf.io.gfile.glob(os.path.join(config.train_data_dir, "*"))
10795

10896
train_ds = (
10997
tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
11098
.shard(num_shards=dataloading_host_count, index=dataloading_host_index)
11199
.map(_parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
112-
.map(prepare_sample, num_parallel_calls=AUTOTUNE)
100+
.map(prepare_sample_fn, num_parallel_calls=AUTOTUNE)
113101
.shuffle(global_batch_size * 10)
114102
.batch(global_batch_size // dataloading_host_count, drop_remainder=True)
115103
.repeat(-1)
@@ -128,6 +116,8 @@ def make_tfrecord_iterator(
128116
dataloading_host_count,
129117
mesh,
130118
global_batch_size,
119+
feature_description,
120+
prepare_sample_fn
131121
):
132122
"""Iterator for TFRecord format. For Laion dataset,
133123
check out preparation script
@@ -136,12 +126,20 @@ def make_tfrecord_iterator(
136126

137127
# set load_tfrecord_cached to True in config to use pre-processed tfrecord dataset.
138128
# pedagogical_examples/dataset_tf_cache_to_tfrecord.py to convert tf preprocessed dataset to tfrecord.
139-
# Datset cache in github runner test doesn't contain all the features since its shared, Use the default tfrecord iterator.
129+
# Dataset cache in the GitHub runner test doesn't contain all the features since it's shared; use the default tfrecord iterator.
140130
if (config.cache_latents_text_encoder_outputs
141131
and os.path.isdir(config.dataset_save_location)
142132
and 'load_tfrecord_cached' in config.get_keys()
143133
and config.load_tfrecord_cached):
144-
return make_cached_tfrecord_iterator(config, dataloading_host_index, dataloading_host_count, mesh, global_batch_size)
134+
return make_cached_tfrecord_iterator(
135+
config,
136+
dataloading_host_index,
137+
dataloading_host_count,
138+
mesh,
139+
global_batch_size,
140+
feature_description,
141+
prepare_sample_fn
142+
)
145143

146144
feature_description = {
147145
"moments": tf.io.FixedLenFeature([], tf.string),

src/maxdiffusion/input_pipeline/input_pipeline_interface.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,18 @@ def make_data_iterator(
5050
global_batch_size,
5151
tokenize_fn=None,
5252
image_transforms_fn=None,
53+
feature_description=None,
54+
prepare_sample_fn=None
5355
):
5456
"""Make data iterator for SD1, 2, XL, dataset_types in (hf, tf, tfrecord)"""
57+
58+
if config.dataset_type == "hf" or config.dataset_type == "tf":
59+
if tokenize_fn is None or image_transforms_fn is None:
60+
raise ValueError(f"dataset type {config.dataset_type} needs to pass a tokenize_fn and image_transforms_fn")
61+
62+
if config.dataset_type == "tfrecord" and config.cache_latents_text_encoder_outputs and (feature_description is None or prepare_sample_fn is None):
63+
raise ValueError(f"dataset type {config.dataset_type} needs to pass a feature_description dictionary and prepare_sample_fn function when cache_latents_text_encoder_outputs is True.")
64+
5565
if config.dataset_type == "hf":
5666
return _hf_data_processing.make_hf_streaming_iterator(
5767
config,
@@ -87,6 +97,8 @@ def make_data_iterator(
8797
dataloading_host_count,
8898
mesh,
8999
global_batch_size,
100+
feature_description,
101+
prepare_sample_fn
90102
)
91103
else:
92104
assert False, f"Unknown dataset_type {config.dataset_type}, dataset_type must be in (tf, tfrecord, hf, grain)"

src/maxdiffusion/trainers/sdxl_trainer.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import threading
2121
import time
2222
import numpy as np
23+
import tensorflow as tf
2324
import jax
2425
import jax.numpy as jnp
2526
from jax.sharding import PartitionSpec as P
@@ -140,6 +141,21 @@ def load_dataset(self, pipeline, params, train_states):
140141
p_vae_apply=p_vae_apply,
141142
)
142143

144+
feature_description = {
145+
"pixel_values": tf.io.FixedLenFeature([], tf.string),
146+
"input_ids": tf.io.FixedLenFeature([], tf.string),
147+
"prompt_embeds": tf.io.FixedLenFeature([], tf.string),
148+
"text_embeds": tf.io.FixedLenFeature([], tf.string),
149+
}
150+
151+
def prepare_sample(features):
152+
pixel_values = tf.io.parse_tensor(features["pixel_values"], out_type=tf.float32)
153+
input_ids = tf.io.parse_tensor(features["input_ids"], out_type=tf.int32)
154+
prompt_embeds = tf.io.parse_tensor(features["prompt_embeds"], out_type=tf.float32)
155+
text_embeds = tf.io.parse_tensor(features["text_embeds"], out_type=tf.float32)
156+
157+
return {"pixel_values": pixel_values, "input_ids": input_ids, "prompt_embeds": prompt_embeds, "text_embeds": text_embeds}
158+
143159
data_iterator = make_data_iterator(
144160
config,
145161
jax.process_index(),
@@ -148,6 +164,8 @@ def load_dataset(self, pipeline, params, train_states):
148164
total_train_batch_size,
149165
tokenize_fn=tokenize_fn,
150166
image_transforms_fn=image_transforms_fn,
167+
feature_description=feature_description,
168+
prepare_sample_fn=prepare_sample
151169
)
152170

153171
return data_iterator

0 commit comments

Comments
 (0)