|
| 1 | +""" |
| 2 | + Copyright 2025 Google LLC |
| 3 | +
|
| 4 | + Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | + you may not use this file except in compliance with the License. |
| 6 | + You may obtain a copy of the License at |
| 7 | +
|
| 8 | + https://www.apache.org/licenses/LICENSE-2.0 |
| 9 | +
|
| 10 | + Unless required by applicable law or agreed to in writing, software |
| 11 | + distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | + See the License for the specific language governing permissions and |
| 14 | + limitations under the License. |
| 15 | +""" |
| 16 | + |
| 17 | +""" |
| 18 | +Prepare tfrecords with latents and text embeddings preprocessed. |
| 19 | +1. Download the dataset |
| 20 | +""" |
| 21 | + |
| 22 | +import os |
| 23 | +import functools |
| 24 | +from absl import app |
| 25 | +from typing import Sequence, Union, List |
| 26 | +from datasets import load_dataset |
| 27 | +import numpy as np |
| 28 | +import jax |
| 29 | +import jax.numpy as jnp |
| 30 | +from jax.sharding import Mesh |
| 31 | +from maxdiffusion import pyconfig, max_utils |
| 32 | +from maxdiffusion.pipelines.wan.wan_pipeline import WanPipeline |
| 33 | +from maxdiffusion.video_processor import VideoProcessor |
| 34 | + |
| 35 | +import tensorflow as tf |
| 36 | + |
| 37 | + |
def image_feature(value):
  """Returns a bytes_list Feature holding `value` JPEG-encoded.

  Args:
    value: an eager image tensor (HWC, uint8) accepted by `tf.io.encode_jpeg`.

  Note: the original docstring ("bytes_list from a string / byte") was copied
  from `bytes_feature`; this helper actually JPEG-encodes an image first.
  """
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[tf.io.encode_jpeg(value).numpy()]))
| 41 | + |
| 42 | + |
def bytes_feature(value):
  """Returns a bytes_list Feature wrapping an eager string tensor's payload."""
  raw_bytes = value.numpy()
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw_bytes]))
| 46 | + |
| 47 | + |
def float_feature(value):
  """Returns a float_list Feature holding a single float / double."""
  return tf.train.Feature(float_list=tf.train.FloatList(value=(value,)))
| 51 | + |
| 52 | + |
def int64_feature(value):
  """Returns an int64_list Feature holding a single bool / enum / int / uint."""
  return tf.train.Feature(int64_list=tf.train.Int64List(value=(value,)))
| 56 | + |
| 57 | + |
def float_feature_list(value):
  """Returns a float_list Feature holding an iterable of floats / doubles."""
  float_list = tf.train.FloatList(value=value)
  return tf.train.Feature(float_list=float_list)
| 61 | + |
| 62 | + |
def create_example(latent, hidden_states):
  """Serializes one (latent, text-embedding) pair into a tf.train.Example byte string.

  Both tensors are serialized with `tf.io.serialize_tensor` and stored under the
  "latents" / "encoder_hidden_states" keys expected by the training input pipeline.
  """
  features = tf.train.Features(
      feature={
          "latents": bytes_feature(tf.io.serialize_tensor(latent)),
          "encoder_hidden_states": bytes_feature(tf.io.serialize_tensor(hidden_states)),
      }
  )
  return tf.train.Example(features=features).SerializeToString()
| 72 | + |
| 73 | + |
def text_encode(pipeline, prompt: Union[str, List[str]]):
  """Embeds `prompt` with the pipeline's T5 text encoder.

  Returns the embeddings detached and converted to a numpy array.
  """
  embeds = pipeline._get_t5_prompt_embeds(prompt)
  return embeds.detach().numpy()
| 78 | + |
| 79 | + |
def vae_encode(video, rng, vae, vae_cache):
  """Encodes `video` with the VAE and samples a latent from the posterior using `rng`."""
  encoded = vae.encode(video, feat_cache=vae_cache)
  return encoded.latent_dist.sample(rng)
| 84 | + |
| 85 | + |
def generate_dataset(config, pipeline):
  """Preprocesses the dataset into sharded TFRecords of VAE latents + T5 text embeddings.

  Loads `config.dataset_name`, VAE-encodes each video batch under the device mesh,
  T5-encodes the captions, and writes serialized examples to
  `config.tfrecords_dir`, rolling over to a new shard file every
  `config.no_records_per_shard` records.
  """
  tfrecords_dir = config.tfrecords_dir
  os.makedirs(tfrecords_dir, exist_ok=True)

  tf_rec_num = 0
  no_records_per_shard = config.no_records_per_shard
  global_record_count = 0
  writer = tf.io.TFRecordWriter(
      tfrecords_dir + "/file_%.2i-%i.tfrec" % (tf_rec_num, (global_record_count + no_records_per_shard))
  )
  shard_record_count = 0

  # Create the device mesh used to run the jitted VAE encode.
  devices_array = max_utils.create_device_mesh(config)
  mesh = Mesh(devices_array, config.mesh_axes)
  rng = jax.random.key(config.seed)

  vae_scale_factor_spatial = 2 ** len(pipeline.vae.temperal_downsample)
  video_processor = VideoProcessor(vae_scale_factor=vae_scale_factor_spatial)

  # Jit the VAE encode with the model/cache bound, leaving (video, rng) as call args.
  p_vae_encode = jax.jit(functools.partial(vae_encode, vae=pipeline.vae, vae_cache=pipeline.vae_cache))

  # Load and shuffle the dataset, keeping only the caption and video columns.
  ds = load_dataset(config.dataset_name, split="train")
  ds = ds.shuffle(seed=config.seed)
  ds = ds.select_columns([config.caption_column, config.image_column])
  batch_size = 10
  for i in range(0, len(ds), batch_size):
    rng, new_rng = jax.random.split(rng)
    # BUG FIX: index rows by the configured column names, not hard-coded
    # "text"/"image" — select_columns above keeps only config.caption_column
    # and config.image_column, so literal keys break for non-default configs.
    text = ds[i : i + batch_size][config.caption_column]
    videos = ds[i : i + batch_size][config.image_column]

    videos = [video_processor.preprocess_video([video], height=config.height, width=config.width) for video in videos]
    video = jnp.array(np.squeeze(np.array(videos), axis=1), dtype=config.weights_dtype)
    with mesh:
      latents = p_vae_encode(video=video, rng=new_rng)
    # Move channels to position 1: (b, t, h, w, c) -> (b, c, t, h, w).
    latents = jnp.transpose(latents, (0, 4, 1, 2, 3))
    encoder_hidden_states = text_encode(pipeline, text)
    for latent, encoder_hidden_state in zip(latents, encoder_hidden_states):
      writer.write(create_example(latent, encoder_hidden_state))
      shard_record_count += 1
      global_record_count += 1

      if shard_record_count >= no_records_per_shard:
        writer.close()
        tf_rec_num += 1
        writer = tf.io.TFRecordWriter(
            tfrecords_dir + "/file_%.2i-%i.tfrec" % (tf_rec_num, (global_record_count + no_records_per_shard))
        )
        shard_record_count = 0

  # BUG FIX: close the last writer so the final (possibly partial) shard is
  # flushed to disk instead of being silently truncated.
  writer.close()
| 139 | + |
| 140 | + |
def run(config):
  """Builds the WAN pipeline and preprocesses the dataset into TFRecords."""
  # The transformer is only needed for generation, not for preprocessing,
  # so skip loading it to save memory and startup time.
  wan_pipeline = WanPipeline.from_pretrained(config, load_transformer=False)
  generate_dataset(config, wan_pipeline)
| 145 | + |
| 146 | + |
def main(argv: Sequence[str]) -> None:
  """absl entry point: initializes pyconfig from argv, then runs preprocessing."""
  pyconfig.initialize(argv)
  cfg = pyconfig.config
  run(cfg)
| 150 | + |
| 151 | + |
# Script entry point: let absl parse flags and dispatch to main().
if __name__ == "__main__":
  app.run(main)
0 commit comments