@@ -166,20 +166,25 @@ def _tpu_flash_attention(
     dtype: jnp.dtype = jnp.float32,
 ) -> jax.Array:
   """TPU Flash Attention"""
-
-  max_block_size = 1024 if dtype == jnp.bfloat16 else 512
+  q_max_block_size = 1024 if dtype == jnp.bfloat16 else 512
+  # Cross-attention where kv dims are much smaller due to encoder_hidden_states.
+  # If kv seq_len is padded too much, it causes issues in attention calculations.
+  if key.shape[1] != query.shape[1]:
+    kv_max_block_size = key.shape[1]
+  else:
+    kv_max_block_size = q_max_block_size
   if flash_block_sizes:
     block_sizes = flash_block_sizes
   else:
     block_sizes = splash_attention_kernel.BlockSizes(
-        block_q=min(max_block_size, query.shape[2]),
-        block_kv_compute=min(max_block_size, key.shape[2]),
-        block_kv=min(max_block_size, key.shape[2]),
-        block_q_dkv=min(max_block_size, query.shape[2]),
-        block_kv_dkv=min(max_block_size, key.shape[2]),
-        block_kv_dkv_compute=min(max_block_size, query.shape[2]),
-        block_q_dq=min(max_block_size, query.shape[2]),
-        block_kv_dq=min(max_block_size, query.shape[2]),
+        block_q=min(q_max_block_size, query.shape[2]),
+        block_kv_compute=min(kv_max_block_size, key.shape[2]),
+        block_kv=min(kv_max_block_size, key.shape[2]),
+        block_q_dkv=min(q_max_block_size, query.shape[2]),
+        block_kv_dkv=min(kv_max_block_size, key.shape[2]),
+        block_kv_dkv_compute=min(kv_max_block_size, query.shape[2]),
+        block_q_dq=min(q_max_block_size, query.shape[2]),
+        block_kv_dq=min(kv_max_block_size, query.shape[2]),
     )

   num_fsdp_shards = mesh.shape["fsdp"]
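
As a standalone illustration of the selection logic above, here is a minimal sketch (not part of the PR) that mirrors the new q/kv block-size caps. The helper name `pick_block_sizes` and the example shapes are hypothetical; axis conventions follow the diff, where `shape[1]` is compared to detect cross-attention and the resulting caps are later `min`-ed against `shape[2]` when building `splash_attention_kernel.BlockSizes`.

```python
import jax.numpy as jnp


def pick_block_sizes(query_shape, key_shape, dtype=jnp.float32):
  """Mirror of the diff's cap selection; returns (q_max_block_size, kv_max_block_size)."""
  q_max_block_size = 1024 if dtype == jnp.bfloat16 else 512
  if key_shape[1] != query_shape[1]:
    # Cross-attention: kv comes from encoder_hidden_states, so cap kv blocks
    # at the (short) kv sequence length instead of padding it up to 512/1024.
    kv_max_block_size = key_shape[1]
  else:
    # Self-attention: q and kv share the same cap.
    kv_max_block_size = q_max_block_size
  return q_max_block_size, kv_max_block_size


# Self-attention: q and kv lengths match, so both caps are 512 for float32.
print(pick_block_sizes((2, 4096, 24, 128), (2, 4096, 24, 128)))  # (512, 512)

# Cross-attention: e.g. 77 text tokens from the encoder, so kv blocks are
# capped at 77 while the query cap stays at 512.
print(pick_block_sizes((2, 4096, 24, 128), (2, 77, 24, 128)))    # (512, 77)
```

In the cross-attention case the kv sequence comes from `encoder_hidden_states` and is typically much shorter than the query sequence, so capping the kv block size at the kv length avoids padding the kv dimension up to the query block size, which the diff's comment identifies as a source of problems in the attention calculation.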