Skip to content

Commit ae9c952

Browse files
Merge branch 'main' into wan_context_parallelism_inference
2 parents 3f6eb05 + 462f463 commit ae9c952

15 files changed

Lines changed: 842 additions & 103 deletions

src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ log_period: 100
2929

3030
pretrained_model_name_or_path: 'Wan-AI/Wan2.1-T2V-14B-Diffusers'
3131

32+
# Overrides the transformer from pretrained_model_name_or_path
33+
wan_transformer_pretrained_model_name_or_path: ''
34+
3235
unet_checkpoint: ''
3336
revision: ''
3437
# This will convert the weights to this dtype.
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
"""
2+
Copyright 2025 Google LLC
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
https://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
"""
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
"""
2+
Copyright 2025 Google LLC
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
https://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
"""
16+
17+
"""
18+
Prepare tfrecords with latents and text embeddings preprocessed.
19+
1. Download the dataset
20+
"""
21+
22+
import os
23+
import functools
24+
from absl import app
25+
from typing import Sequence, Union, List
26+
from datasets import load_dataset
27+
import numpy as np
28+
import jax
29+
import jax.numpy as jnp
30+
from jax.sharding import Mesh
31+
from maxdiffusion import pyconfig, max_utils
32+
from maxdiffusion.pipelines.wan.wan_pipeline import WanPipeline
33+
from maxdiffusion.video_processor import VideoProcessor
34+
35+
import tensorflow as tf
36+
37+
38+
def image_feature(value):
  """Returns a bytes_list Feature holding the image JPEG-encoded.

  Note: unlike `bytes_feature`, this encodes the image tensor to JPEG first.
  """
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[tf.io.encode_jpeg(value).numpy()]))
41+
42+
43+
def bytes_feature(value):
  """Wrap an eager string/bytes tensor in a bytes_list Feature."""
  raw = value.numpy()
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw]))
46+
47+
48+
def float_feature(value):
  """Wrap a single float / double in a float_list Feature."""
  float_list = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=float_list)
51+
52+
53+
def int64_feature(value):
  """Wrap a single bool / enum / int / uint in an int64_list Feature."""
  int_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int_list)
56+
57+
58+
def float_feature_list(value):
  """Wrap a sequence of floats / doubles in a float_list Feature."""
  float_list = tf.train.FloatList(value=value)
  return tf.train.Feature(float_list=float_list)
61+
62+
63+
def create_example(latent, hidden_states):
  """Serialize one (latent, text-embedding) pair into a tf.train.Example byte string.

  The tensors are stored under the "latents" and "encoder_hidden_states" keys,
  each as a serialized-tensor bytes feature.
  """
  serialized_latent = tf.io.serialize_tensor(latent)
  serialized_hidden = tf.io.serialize_tensor(hidden_states)
  features = tf.train.Features(
      feature={
          "latents": bytes_feature(serialized_latent),
          "encoder_hidden_states": bytes_feature(serialized_hidden),
      }
  )
  return tf.train.Example(features=features).SerializeToString()
72+
73+
74+
def text_encode(pipeline, prompt: Union[str, List[str]]):
  """Embed prompt(s) with the pipeline's T5 text encoder and return a numpy array."""
  # NOTE(review): relies on the pipeline's private T5 helper; returns detached host memory.
  embeds = pipeline._get_t5_prompt_embeds(prompt)
  return embeds.detach().numpy()
78+
79+
80+
def vae_encode(video, rng, vae, vae_cache):
  """Encode a pixel-space video to a VAE latent sampled from the posterior."""
  posterior = vae.encode(video, feat_cache=vae_cache)
  return posterior.latent_dist.sample(rng)
84+
85+
86+
def generate_dataset(config, pipeline):
  """Precompute VAE latents and T5 text embeddings and write them as sharded tfrecords.

  Each record carries serialized "latents" and "encoder_hidden_states" tensors
  (see `create_example`); a new shard file is started every
  `config.no_records_per_shard` records.

  Args:
    config: maxdiffusion config providing dataset/mesh/output settings.
    pipeline: WanPipeline with the VAE and text encoder loaded.
  """
  tfrecords_dir = config.tfrecords_dir
  os.makedirs(tfrecords_dir, exist_ok=True)

  tf_rec_num = 0
  no_records_per_shard = config.no_records_per_shard
  global_record_count = 0

  def _new_writer():
    # Shard filename encodes the shard index and the running record upper bound.
    return tf.io.TFRecordWriter(
        tfrecords_dir + "/file_%.2i-%i.tfrec" % (tf_rec_num, (global_record_count + no_records_per_shard))
    )

  writer = _new_writer()
  shard_record_count = 0

  # create mesh
  devices_array = max_utils.create_device_mesh(config)
  mesh = Mesh(devices_array, config.mesh_axes)
  rng = jax.random.key(config.seed)

  vae_scale_factor_spatial = 2 ** len(pipeline.vae.temperal_downsample)
  video_processor = VideoProcessor(vae_scale_factor=vae_scale_factor_spatial)

  # jit vae fn with the model and cache closed over; only (video, rng) vary per batch.
  p_vae_encode = jax.jit(functools.partial(vae_encode, vae=pipeline.vae, vae_cache=pipeline.vae_cache))

  # Load dataset
  ds = load_dataset(config.dataset_name, split="train")
  ds = ds.shuffle(seed=config.seed)
  ds = ds.select_columns([config.caption_column, config.image_column])
  batch_size = 10
  for i in range(0, len(ds), batch_size):
    rng, new_rng = jax.random.split(rng)
    # Read batches by the configured column names — the dataset was narrowed to
    # exactly these columns above, so hardcoded "text"/"image" keys would break
    # any dataset whose columns are named differently.
    text = ds[i : i + batch_size][config.caption_column]
    videos = ds[i : i + batch_size][config.image_column]

    videos = [video_processor.preprocess_video([video], height=config.height, width=config.width) for video in videos]
    video = jnp.array(np.squeeze(np.array(videos), axis=1), dtype=config.weights_dtype)
    with mesh:
      latents = p_vae_encode(video=video, rng=new_rng)
    # Move channels forward — presumably (B, T, H, W, C) -> (B, C, T, H, W); confirm
    # against the training pipeline's expected latent layout.
    latents = jnp.transpose(latents, (0, 4, 1, 2, 3))
    encoder_hidden_states = text_encode(pipeline, text)
    for latent, encoder_hidden_state in zip(latents, encoder_hidden_states):
      writer.write(create_example(latent, encoder_hidden_state))
      shard_record_count += 1
      global_record_count += 1

      if shard_record_count >= no_records_per_shard:
        writer.close()
        tf_rec_num += 1
        writer = _new_writer()
        shard_record_count = 0

  # Close the final (possibly partial) shard so buffered records are flushed.
  writer.close()
139+
140+
141+
def run(config):
  """Build the Wan pipeline and generate the preprocessed tfrecord dataset."""
  # The transformer is not needed for latent/text preprocessing, so skip loading it.
  pipeline = WanPipeline.from_pretrained(config, load_transformer=False)
  generate_dataset(config, pipeline)
145+
146+
147+
def main(argv: Sequence[str]) -> None:
  """Script entry point: load the config from argv and run preprocessing."""
  pyconfig.initialize(argv)
  run(pyconfig.config)


if __name__ == "__main__":
  app.run(main)

src/maxdiffusion/generate_wan.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,12 @@
1616
import jax
1717
import time
1818
from maxdiffusion.pipelines.wan.wan_pipeline import WanPipeline
19-
from maxdiffusion import pyconfig, max_logging
19+
from maxdiffusion import pyconfig, max_logging, max_utils
2020
from absl import app
2121
from maxdiffusion.utils import export_to_video
2222

2323
jax.config.update('jax_use_shardy_partitioner', True)
2424

25-
2625
def run(config, pipeline=None, filename_prefix=""):
2726
print("seed: ", config.seed)
2827
if pipeline is None:
@@ -61,8 +60,12 @@ def run(config, pipeline=None, filename_prefix=""):
6160
)
6261

6362
print("compile time: ", (time.perf_counter() - s0))
63+
saved_video_path = []
6464
for i in range(len(videos)):
65-
export_to_video(videos[i], f"{filename_prefix}wan_output_{config.seed}_{i}.mp4", fps=config.fps)
65+
video_path = f"{filename_prefix}wan_output_{config.seed}_{i}.mp4"
66+
export_to_video(videos[i], video_path, fps=config.fps)
67+
saved_video_path.append(video_path)
68+
6669
s0 = time.perf_counter()
6770
videos = pipeline(
6871
prompt=prompt,
@@ -76,12 +79,11 @@ def run(config, pipeline=None, filename_prefix=""):
7679
slg_start=slg_start,
7780
slg_end=slg_end,
7881
)
79-
print("generation time: ", (time.perf_counter() - s0))
80-
for i in range(len(videos)):
81-
export_to_video(videos[i], f"wan_output_{config.seed}_{i}.mp4", fps=config.fps)
82+
print("compile time: ", (time.perf_counter() - s0))
8283

8384
s0 = time.perf_counter()
84-
with jax.profiler.trace("/tmp/trace/"):
85+
if config.enable_profiler:
86+
max_utils.activate_profiler(config)
8587
videos = pipeline(
8688
prompt=prompt,
8789
negative_prompt=negative_prompt,
@@ -94,7 +96,9 @@ def run(config, pipeline=None, filename_prefix=""):
9496
slg_start=slg_start,
9597
slg_end=slg_end,
9698
)
97-
print("generation time: ", (time.perf_counter() - s0))
99+
max_utils.deactivate_profiler(config)
100+
print("generation time: ", (time.perf_counter() - s0))
101+
return saved_video_path
98102

99103

100104
def main(argv: Sequence[str]) -> None:

src/maxdiffusion/input_pipeline/_tfds_data_processing.py

Lines changed: 19 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -73,43 +73,26 @@ def make_tf_iterator(
7373
train_iter = multihost_dataloading.MultiHostDataLoadIterator(train_ds, mesh)
7474
return train_iter
7575

76+
7677
def make_cached_tfrecord_iterator(
77-
config,
78-
dataloading_host_index,
79-
dataloading_host_count,
80-
mesh,
81-
global_batch_size,
78+
config, dataloading_host_index, dataloading_host_count, mesh, global_batch_size, feature_description, prepare_sample_fn
8279
):
8380
"""
8481
New iterator for TFRecords that contain the full 4 pre-computed latents and embeddings:
8582
latents, input_ids, prompt_embeds, and text_embeds.
8683
"""
87-
feature_description = {
88-
"pixel_values": tf.io.FixedLenFeature([], tf.string),
89-
"input_ids": tf.io.FixedLenFeature([], tf.string),
90-
"prompt_embeds": tf.io.FixedLenFeature([], tf.string),
91-
"text_embeds": tf.io.FixedLenFeature([], tf.string),
92-
}
9384

9485
def _parse_tfrecord_fn(example):
9586
return tf.io.parse_single_example(example, feature_description)
9687

97-
def prepare_sample(features):
98-
pixel_values = tf.io.parse_tensor(features["pixel_values"], out_type=tf.float32)
99-
input_ids = tf.io.parse_tensor(features["input_ids"], out_type=tf.int32)
100-
prompt_embeds = tf.io.parse_tensor(features["prompt_embeds"], out_type=tf.float32)
101-
text_embeds = tf.io.parse_tensor(features["text_embeds"], out_type=tf.float32)
102-
103-
return {"pixel_values": pixel_values, "input_ids": input_ids, "prompt_embeds": prompt_embeds, "text_embeds": text_embeds}
104-
10588
# This pipeline reads the sharded files and applies the parsing and preparation.
10689
filenames = tf.io.gfile.glob(os.path.join(config.train_data_dir, "*"))
10790

10891
train_ds = (
10992
tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
11093
.shard(num_shards=dataloading_host_count, index=dataloading_host_index)
11194
.map(_parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
112-
.map(prepare_sample, num_parallel_calls=AUTOTUNE)
95+
.map(prepare_sample_fn, num_parallel_calls=AUTOTUNE)
11396
.shuffle(global_batch_size * 10)
11497
.batch(global_batch_size // dataloading_host_count, drop_remainder=True)
11598
.repeat(-1)
@@ -123,11 +106,7 @@ def prepare_sample(features):
123106

124107
# TODO - https://github.com/google/array_record/blob/main/beam/examples/example_gcs_conversion.py
125108
def make_tfrecord_iterator(
126-
config,
127-
dataloading_host_index,
128-
dataloading_host_count,
129-
mesh,
130-
global_batch_size,
109+
config, dataloading_host_index, dataloading_host_count, mesh, global_batch_size, feature_description, prepare_sample_fn
131110
):
132111
"""Iterator for TFRecord format. For Laion dataset,
133112
check out preparation script
@@ -136,12 +115,22 @@ def make_tfrecord_iterator(
136115

137116
# set load_tfrecord_cached to True in config to use pre-processed tfrecord dataset.
138117
# pedagogical_examples/dataset_tf_cache_to_tfrecord.py to convert tf preprocessed dataset to tfrecord.
139-
# Datset cache in github runner test doesn't contain all the features since its shared, Use the default tfrecord iterator.
140-
if (config.cache_latents_text_encoder_outputs
118+
# Dataset cache in github runner test doesn't contain all the features since its shared, Use the default tfrecord iterator.
119+
if (
120+
config.cache_latents_text_encoder_outputs
141121
and os.path.isdir(config.dataset_save_location)
142-
and 'load_tfrecord_cached'in config.get_keys()
143-
and config.load_tfrecord_cached):
144-
return make_cached_tfrecord_iterator(config, dataloading_host_index, dataloading_host_count, mesh, global_batch_size)
122+
and "load_tfrecord_cached" in config.get_keys()
123+
and config.load_tfrecord_cached
124+
):
125+
return make_cached_tfrecord_iterator(
126+
config,
127+
dataloading_host_index,
128+
dataloading_host_count,
129+
mesh,
130+
global_batch_size,
131+
feature_description,
132+
prepare_sample_fn,
133+
)
145134

146135
feature_description = {
147136
"moments": tf.io.FixedLenFeature([], tf.string),

src/maxdiffusion/input_pipeline/input_pipeline_interface.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,25 @@ def make_data_iterator(
5050
global_batch_size,
5151
tokenize_fn=None,
5252
image_transforms_fn=None,
53+
feature_description=None,
54+
prepare_sample_fn=None,
5355
):
5456
"""Make data iterator for SD1, 2, XL, dataset_types in (hf, tf, tfrecord)"""
57+
58+
if config.dataset_type == "hf" or config.dataset_type == "tf":
59+
if tokenize_fn is None or image_transforms_fn is None:
60+
raise ValueError(f"dataset type {config.dataset_type} needs to pass a tokenize_fn and image_transforms_fn")
61+
62+
if (
63+
config.dataset_type == "tfrecord"
64+
and config.cache_latents_text_encoder_outputs
65+
and feature_description is None
66+
and prepare_sample_fn is None
67+
):
68+
raise ValueError(
69+
f"dataset type {config.dataset_type} needs to pass a feature_description dictionary and prepare_sample_fn function when cache_latents_text_encoder_outputs is True."
70+
)
71+
5572
if config.dataset_type == "hf":
5673
return _hf_data_processing.make_hf_streaming_iterator(
5774
config,
@@ -87,6 +104,8 @@ def make_data_iterator(
87104
dataloading_host_count,
88105
mesh,
89106
global_batch_size,
107+
feature_description,
108+
prepare_sample_fn,
90109
)
91110
else:
92111
assert False, f"Unknown dataset_type {config.dataset_type}, dataset_type must be in (tf, tfrecord, hf, grain)"

src/maxdiffusion/models/attention_flax.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ def _tpu_flash_attention(
187187
axis_names_splash_kernel = nn.logical_to_mesh_axes(flash_axis_names_splash_kernel)
188188
named_sharding = jax.sharding.NamedSharding(mesh, axis_names_splash_kernel)
189189

190-
cp_size=1
190+
shard_head_size=mesh.shape['tensor']
191191

192192
@functools.partial(
193193
jax.jit,
@@ -200,12 +200,11 @@ def wrap_splash_kernel(multi_head_mask, shard_head_size=1):
200200
splash_kernel = splash_attention_kernel.make_splash_mha(
201201
mask=multi_head_mask,
202202
head_shards=shard_head_size, # the sizes of the axis is sharding over heads
203-
q_seq_shards=cp_size,
203+
q_seq_shards=num_fsdp_shards,
204204
block_sizes=block_sizes,
205205
)
206206
return splash_kernel
207207

208-
shard_head_size = mesh.shape["tensor"]
209208
mask = splash_attention_mask.FullMask(_shape=(query.shape[2], key.shape[2]))
210209
multi_head_mask = splash_attention_mask.MultiHeadMask(masks=(mask,) * query.shape[1])
211210
splash_kernel = wrap_splash_kernel(multi_head_mask, int(shard_head_size))
@@ -223,10 +222,7 @@ def wrap_splash_kernel(multi_head_mask, shard_head_size=1):
223222
check_rep=False
224223
)
225224
def wrap_flash_attention(query, key, value, splash_kernel):
226-
#full_k = jax.lax.all_to_all(key, axis_name='fsdp', split_axis=2, concat_axis=2, tiled=True)
227-
#full_v = jax.lax.all_to_all(value, axis_name='fsdp', split_axis=2, concat_axis=2, tiled=True)
228225
attention_output = jax.vmap(splash_kernel)(query, key, value)
229-
#attention_output = jax.vmap(splash_kernel)(query, full_k, full_v)
230226
return attention_output
231227

232228
devices_in_data_fsdp = mesh.shape["data"] * mesh.shape["fsdp"]

0 commit comments

Comments
 (0)