Skip to content

Commit 5b0b8cf

Browse files
authored
Add batch divisibility check for VAE input sharding (#316)
* Adding check for batch size divisibility before sharding video condition tensor * pyink checks * Removed unused var * Moving commit retrieval to before JAX setup init * fix * replaced boundary_timestep with ratio in wan2.2 t2v
1 parent a385c8e commit 5b0b8cf

10 files changed

Lines changed: 33 additions & 22 deletions

File tree

src/maxdiffusion/configs/base_wan_27b.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -300,7 +300,7 @@ guidance_scale_high: 4.0
300300
# The timestep threshold. If `t` is at or above this value,
301301
# the `high_noise_model` is considered as the required model.
302302
# timestep to switch between low noise and high noise transformer
303-
boundary_timestep: 875
303+
boundary_ratio: 0.875
304304

305305
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
306306
guidance_rescale: 0.0

src/maxdiffusion/generate_wan.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,6 @@ def call_pipeline(config, pipeline, prompt, negative_prompt):
134134
num_inference_steps=config.num_inference_steps,
135135
guidance_scale_low=config.guidance_scale_low,
136136
guidance_scale_high=config.guidance_scale_high,
137-
boundary=config.boundary_timestep,
138137
)
139138
else:
140139
raise ValueError(f"Unsupported model_name for T2Vin config: {model_key}")
@@ -162,13 +161,12 @@ def inference_generate_video(config, pipeline, filename_prefix=""):
162161
return
163162

164163

165-
def run(config, pipeline=None, filename_prefix=""):
164+
def run(config, pipeline=None, filename_prefix="", commit_hash=None):
166165
model_key = config.model_name
167166
writer = max_utils.initialize_summary_writer(config)
168167
if jax.process_index() == 0 and writer:
169168
max_logging.log(f"TensorBoard logs will be written to: {config.tensorboard_dir}")
170169

171-
commit_hash = get_git_commit_hash()
172170
if commit_hash:
173171
writer.add_text("inference/git_commit_hash", commit_hash, global_step=0)
174172
max_logging.log(f"Git Commit Hash: {commit_hash}")
@@ -250,12 +248,13 @@ def run(config, pipeline=None, filename_prefix=""):
250248

251249

252250
def main(argv: Sequence[str]) -> None:
251+
commit_hash = get_git_commit_hash()
253252
pyconfig.initialize(argv)
254253
try:
255254
flax.config.update("flax_always_shard_variable", False)
256255
except LookupError:
257256
pass
258-
run(pyconfig.config)
257+
run(pyconfig.config, commit_hash=commit_hash)
259258

260259

261260
if __name__ == "__main__":

src/maxdiffusion/models/attention_flax.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -384,7 +384,9 @@ def ring_scan_body(carry, _):
384384
return (m, l, o, k_next, v_next), None
385385

386386
initial_carry = (m, l, o, k1, v1)
387-
(m_final, l_final, o_final, _, _), _ = jax.lax.scan(ring_scan_body, initial_carry, None, length=num_context_shards - 1)
387+
(m_final, l_final, o_final, _, _), _ = jax.lax.scan(
388+
ring_scan_body, initial_carry, None, length=num_context_shards - 1
389+
)
388390

389391
attention_output = o_final / l_final[..., None]
390392
else:
@@ -749,6 +751,7 @@ def __init__(
749751
self.dpa_layer = None
750752
if attention_kernel == "cudnn_flash_te":
751753
from transformer_engine.jax.flax.transformer import DotProductAttention # pytype: disable=import-error
754+
752755
jax.config.update("jax_use_shardy_partitioner", False)
753756

754757
dpa_layer = DotProductAttention(
@@ -829,6 +832,7 @@ def setup(self):
829832
self.dpa_layer = None
830833
if self.attention_kernel == "cudnn_flash_te":
831834
from transformer_engine.jax.flax.transformer import DotProductAttention # pytype: disable=import-error
835+
832836
jax.config.update("jax_use_shardy_partitioner", False)
833837

834838
dpa_layer = DotProductAttention(
@@ -848,7 +852,6 @@ def setup(self):
848852
variables = {}
849853
self.dpa_layer = functools.partial(dpa_layer.apply, variables)
850854

851-
852855
def apply_attention(self, query: Array, key: Array, value: Array, attention_mask: Array = None):
853856
return _apply_attention(
854857
query=query,

src/maxdiffusion/models/wan/autoencoder_kl_wan.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,11 @@
2929

3030
CACHE_T = 2
3131
try:
32-
flax.config.update('flax_always_shard_variable', False)
32+
flax.config.update("flax_always_shard_variable", False)
3333
except LookupError:
3434
pass
3535

36+
3637
# Helper to ensure kernel_size, stride, padding are tuples of 3 integers
3738
def _canonicalize_tuple(x: Union[int, Sequence[int]], rank: int, name: str) -> Tuple[int, ...]:
3839
"""Canonicalizes a value to a tuple of integers."""

src/maxdiffusion/pipelines/wan/wan_pipeline.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -544,8 +544,10 @@ def prepare_latents_i2v_base(
544544
vae_dtype = getattr(self.vae, "dtype", jnp.float32)
545545
video_condition = video_condition.astype(vae_dtype)
546546
with self.mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules):
547-
sharding_spec = P(self.config.mesh_axes[0], None, None, None, None)
548-
video_condition = jax.lax.with_sharding_constraint(video_condition, sharding_spec)
547+
data_mesh_size = self.mesh.shape[self.config.mesh_axes[0]]
548+
if video_condition.shape[0] % data_mesh_size == 0:
549+
sharding_spec = P(self.config.mesh_axes[0], None, None, None, None)
550+
video_condition = jax.lax.with_sharding_constraint(video_condition, sharding_spec)
549551
encoded_output = self.vae.encode(video_condition, self.vae_cache)[0].mode()
550552

551553
# Normalize latents

src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,8 @@ def __call__(
118118
)
119119
# Set the TE shard_guard context_manager if using TE cudnn_flash attention
120120
if self.config.attention == "cudnn_flash_te":
121-
from transformer_engine.jax.sharding import global_shard_guard, MeshResource # pytype: disable=import-error
121+
from transformer_engine.jax.sharding import global_shard_guard, MeshResource # pytype: disable=import-error
122+
122123
shard_guard = global_shard_guard(MeshResource(cp_resource="context"))
123124
else:
124125
shard_guard = nullcontext()

src/maxdiffusion/pipelines/wan/wan_pipeline_2_2.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ def __init__(
3838
super().__init__(config=config, **kwargs)
3939
self.low_noise_transformer = low_noise_transformer
4040
self.high_noise_transformer = high_noise_transformer
41+
self.boundary_ratio = config.boundary_ratio
4142

4243
@classmethod
4344
def _load_and_init(cls, config, restored_checkpoint=None, vae_only=False, load_transformer=True):
@@ -103,7 +104,6 @@ def __call__(
103104
num_inference_steps: int = 50,
104105
guidance_scale_low: float = 3.0,
105106
guidance_scale_high: float = 4.0,
106-
boundary: int = 875,
107107
num_videos_per_prompt: Optional[int] = 1,
108108
max_sequence_length: int = 512,
109109
latents: jax.Array = None,
@@ -129,18 +129,21 @@ def __call__(
129129
low_noise_graphdef, low_noise_state, low_noise_rest = nnx.split(self.low_noise_transformer, nnx.Param, ...)
130130
high_noise_graphdef, high_noise_state, high_noise_rest = nnx.split(self.high_noise_transformer, nnx.Param, ...)
131131

132+
boundary_timestep = self.boundary_ratio * self.scheduler.config.num_train_timesteps
133+
132134
p_run_inference = partial(
133135
run_inference_2_2,
134136
guidance_scale_low=guidance_scale_low,
135137
guidance_scale_high=guidance_scale_high,
136-
boundary=boundary,
138+
boundary=boundary_timestep,
137139
num_inference_steps=num_inference_steps,
138140
scheduler=self.scheduler,
139141
scheduler_state=scheduler_state,
140142
)
141143
# Set the TE shard_guard context_manager if using TE cudnn_flash attention
142144
if self.config.attention == "cudnn_flash_te":
143-
from transformer_engine.jax.sharding import global_shard_guard, MeshResource # pytype: disable=import-error
145+
from transformer_engine.jax.sharding import global_shard_guard, MeshResource # pytype: disable=import-error
146+
144147
shard_guard = global_shard_guard(MeshResource(cp_resource="context"))
145148
else:
146149
shard_guard = nullcontext()

src/maxdiffusion/pyconfig.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,9 @@ def user_init(raw_keys):
201201
raw_keys["logical_axis_rules"] = _lists_to_tuples(raw_keys["logical_axis_rules"])
202202
# Verify qkv is sharded across sequence.
203203
if raw_keys["attention"] == "ring" or raw_keys["attention_sharding_uniform"]:
204-
max_logging.log(f"Adding sequence sharding to q and kv if not already present because {raw_keys['attention']}=='ring' or {raw_keys['attention_sharding_uniform']} is set.")
204+
max_logging.log(
205+
f"Adding sequence sharding to q and kv if not already present because {raw_keys['attention']}=='ring' or {raw_keys['attention_sharding_uniform']} is set."
206+
)
205207
logical_axis_rules = list(raw_keys["logical_axis_rules"])
206208
max_logging.log(f"Initial logical axis rules: {logical_axis_rules}")
207209
new_rules = []

src/maxdiffusion/tests/wan_transformer_test.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -125,9 +125,7 @@ def test_wan_time_text_embedding(self):
125125

126126
encoder_hidden_states_shape = (batch_size, time_freq_dim * 2, text_embed_dim)
127127
dummy_encoder_hidden_states = jnp.ones(encoder_hidden_states_shape)
128-
temb, timestep_proj, encoder_hidden_states, _, _ = layer(
129-
dummy_timestep, dummy_encoder_hidden_states
130-
)
128+
temb, timestep_proj, encoder_hidden_states, _, _ = layer(dummy_timestep, dummy_encoder_hidden_states)
131129
assert temb.shape == (batch_size, dim)
132130
assert timestep_proj.shape == (batch_size, time_proj_dim)
133131
assert encoder_hidden_states.shape == (batch_size, time_freq_dim * 2, dim)

src/maxdiffusion/trainers/wan_trainer.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,7 @@ def training_loop(self, pipeline, optimizer, learning_rate_scheduler, train_data
309309
pretty_string = pprint.pformat(state_spec.opt_state, indent=4, width=60)
310310
max_logging.log(pretty_string)
311311
max_logging.log("------------------------------------------------")
312-
if self.config.hardware != 'gpu':
312+
if self.config.hardware != "gpu":
313313
max_utils.delete_pytree(params)
314314
data_shardings = self.get_data_shardings(mesh)
315315
eval_data_shardings = self.get_eval_data_shardings(mesh)
@@ -368,14 +368,16 @@ def training_loop(self, pipeline, optimizer, learning_rate_scheduler, train_data
368368

369369
# Designate the context parallel axis for sharding
370370
if self.config.attention == "cudnn_flash_te":
371-
from transformer_engine.jax.sharding import global_shard_guard, MeshResource # pytype: disable=import-error
371+
from transformer_engine.jax.sharding import global_shard_guard, MeshResource # pytype: disable=import-error
372+
372373
shard_guard = global_shard_guard(MeshResource(cp_resource="context"))
373374
else:
374375
shard_guard = nullcontext()
375376

376377
next_batch_future = executor.submit(load_next_batch, train_data_iterator, example_batch, self.config)
377-
with jax.profiler.StepTraceAnnotation("train", step_num=step), pipeline.mesh, \
378-
shard_guard, nn_partitioning.axis_rules(self.config.logical_axis_rules):
378+
with jax.profiler.StepTraceAnnotation(
379+
"train", step_num=step
380+
), pipeline.mesh, shard_guard, nn_partitioning.axis_rules(self.config.logical_axis_rules):
379381
state, scheduler_state, train_metric, rng = p_train_step(state, example_batch, rng, scheduler_state)
380382
train_metric["scalar"]["learning/loss"].block_until_ready()
381383
last_step_completion = datetime.datetime.now()

0 commit comments

Comments (0)