Skip to content

Commit d4d4501

Browse files
committed
feat: fori_loop-based denoising for WAN inference
Replace the Python denoising loop with jax.lax.fori_loop for the non-cache paths of the WAN 2.1 and WAN 2.2 pipelines (both T2V and I2V). This compiles the entire denoising loop as a single XLA program, eliminating per-step Python dispatch overhead (~100 dispatches for 50 steps). WAN 2.1 (T2V & I2V) uses a single transformer inside the fori_loop; WAN 2.2 (T2V & I2V) selects between its two transformers via jax.lax.cond inside the fori_loop (both transformers share a graphdef). The fori_loop path is used only when no caching (CFG-Cache, MagCache, SenCache) is enabled; the existing Python loop paths are preserved as a fallback for cache-enabled configurations. Scheduler state is pre-initialized with concrete values (step_index=0, last_sample=zeros, begin_index=0) to ensure a consistent pytree structure across all fori_loop iterations.
1 parent ceca471 commit d4d4501

4 files changed

Lines changed: 467 additions & 0 deletions

File tree

src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414

1515
from .wan_pipeline import WanPipeline, transformer_forward_pass, transformer_forward_pass_full_cfg, transformer_forward_pass_cfg_cache, init_magcache, magcache_step
16+
from ...schedulers.scheduling_unipc_multistep_flax import UniPCMultistepSchedulerState
1617
from ...models.wan.transformers.transformer_wan import WanModel
1718
from typing import List, Union, Optional
1819
from ...pyconfig import HyperParameters
@@ -127,6 +128,31 @@ def __call__(
127128

128129
graphdef, state, rest_of_state = nnx.split(self.transformer, nnx.Param, ...)
129130

131+
# Use fori_loop path when no caching is enabled for reduced dispatch overhead.
132+
use_fori = not use_cfg_cache and not use_magcache
133+
134+
if use_fori:
135+
do_cfg = guidance_scale > 1.0
136+
p_run_inference = partial(
137+
run_inference_fori_2_1,
138+
do_cfg=do_cfg,
139+
guidance_scale=guidance_scale,
140+
num_inference_steps=num_inference_steps,
141+
scheduler=self.scheduler,
142+
)
143+
with self.mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules):
144+
latents = p_run_inference(
145+
graphdef=graphdef,
146+
sharded_state=state,
147+
rest_of_state=rest_of_state,
148+
latents=latents,
149+
prompt_embeds=prompt_embeds,
150+
negative_prompt_embeds=negative_prompt_embeds,
151+
scheduler_state=scheduler_state,
152+
)
153+
latents = self._denormalize_latents(latents)
154+
return self._decode_latents_to_video(latents)
155+
130156
p_run_inference = partial(
131157
run_inference_2_1,
132158
guidance_scale=guidance_scale,
@@ -155,6 +181,77 @@ def __call__(
155181
return self._decode_latents_to_video(latents)
156182

157183

184+
@partial(jax.jit, static_argnames=("do_cfg", "guidance_scale", "num_inference_steps", "scheduler"))
def run_inference_fori_2_1(
    graphdef,
    sharded_state,
    rest_of_state,
    latents: jnp.array,
    prompt_embeds: jnp.array,
    negative_prompt_embeds: jnp.array,
    scheduler_state: UniPCMultistepSchedulerState,
    do_cfg: bool,
    guidance_scale: float,
    num_inference_steps: int,
    scheduler: FlaxUniPCMultistepScheduler,
):
  """Run the WAN 2.1 T2V denoising loop as a single jax.lax.fori_loop.

  Compiling the whole loop into one XLA program removes the per-step
  Python dispatch cost of the plain-Python loop. This path is taken only
  when no caching (CFG-Cache, MagCache) is enabled.
  """
  batch = latents.shape[0]

  # With CFG on, stack cond/uncond text embeddings once at trace time so a
  # single transformer call serves both halves of the doubled batch.
  text_embeds = (
      jnp.concatenate([prompt_embeds, negative_prompt_embeds], axis=0) if do_cfg else prompt_embeds
  )

  # Give every optional scheduler field a concrete value up front so the
  # loop-carry pytree has an identical structure on every iteration:
  #   - step_index=0 keeps scheduler.step from running _init_step_index
  #   - last_sample must be an array (not None); the step-0 corrector is
  #     still skipped because step_index > 0 is False there
  #   - begin_index=0 replaces None with a concrete int
  scheduler_state = scheduler_state.replace(
      step_index=0,
      last_sample=jnp.zeros_like(latents),
      begin_index=0,
  )

  def denoise_step(step, carry):
    x, sched = carry
    t = sched.timesteps[step]

    # Re-assembling the module from graphdef + state happens at trace time
    # only; no per-step runtime cost inside the compiled loop.
    model = nnx.merge(graphdef, sharded_state, rest_of_state)

    if do_cfg:
      model_in = jnp.concatenate([x] * 2)
      timestep = jnp.broadcast_to(t, (batch * 2,))
    else:
      model_in = x
      timestep = jnp.broadcast_to(t, (batch,))

    eps = model(
        hidden_states=model_in,
        timestep=timestep,
        encoder_hidden_states=text_embeds,
    )

    if do_cfg:
      # First half of the batch is conditional, second half unconditional.
      eps_cond, eps_uncond = eps[:batch], eps[batch:]
      eps = eps_uncond + guidance_scale * (eps_cond - eps_uncond)

    x, sched = scheduler.step(sched, eps, t, x).to_tuple()
    return x, sched

  latents, _ = jax.lax.fori_loop(0, num_inference_steps, denoise_step, (latents, scheduler_state))
  return latents
253+
254+
158255
def run_inference_2_1(
159256
graphdef,
160257
sharded_state,

src/maxdiffusion/pipelines/wan/wan_pipeline_2_2.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414

1515
from .wan_pipeline import WanPipeline, transformer_forward_pass, transformer_forward_pass_full_cfg, transformer_forward_pass_cfg_cache
16+
from ...schedulers.scheduling_unipc_multistep_flax import UniPCMultistepSchedulerState
1617
from ...models.wan.transformers.transformer_wan import WanModel
1718
from typing import List, Union, Optional
1819
from ...pyconfig import HyperParameters
@@ -150,6 +151,35 @@ def __call__(
150151

151152
boundary_timestep = self.boundary_ratio * self.scheduler.config.num_train_timesteps
152153

154+
# Use fori_loop path when no caching is enabled for reduced dispatch overhead.
155+
use_fori = not use_cfg_cache and not use_sen_cache
156+
157+
if use_fori:
158+
do_cfg = guidance_scale_low > 1.0 or guidance_scale_high > 1.0
159+
p_run_inference = partial(
160+
run_inference_fori_2_2,
161+
do_cfg=do_cfg,
162+
guidance_scale_low=guidance_scale_low,
163+
guidance_scale_high=guidance_scale_high,
164+
boundary=boundary_timestep,
165+
num_inference_steps=num_inference_steps,
166+
scheduler=self.scheduler,
167+
)
168+
with self.mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules):
169+
latents = p_run_inference(
170+
graphdef=high_noise_graphdef,
171+
low_noise_state=low_noise_state,
172+
low_noise_rest=low_noise_rest,
173+
high_noise_state=high_noise_state,
174+
high_noise_rest=high_noise_rest,
175+
latents=latents,
176+
prompt_embeds=prompt_embeds,
177+
negative_prompt_embeds=negative_prompt_embeds,
178+
scheduler_state=scheduler_state,
179+
)
180+
latents = self._denormalize_latents(latents)
181+
return self._decode_latents_to_video(latents)
182+
153183
p_run_inference = partial(
154184
run_inference_2_2,
155185
guidance_scale_low=guidance_scale_low,
@@ -179,6 +209,97 @@ def __call__(
179209
return self._decode_latents_to_video(latents)
180210

181211

212+
@partial(jax.jit, static_argnames=("do_cfg", "guidance_scale_low", "guidance_scale_high", "boundary", "num_inference_steps", "scheduler"))
def run_inference_fori_2_2(
    graphdef,
    low_noise_state,
    low_noise_rest,
    high_noise_state,
    high_noise_rest,
    latents: jnp.array,
    prompt_embeds: jnp.array,
    negative_prompt_embeds: jnp.array,
    scheduler_state: UniPCMultistepSchedulerState,
    do_cfg: bool,
    guidance_scale_low: float,
    guidance_scale_high: float,
    boundary: float,
    num_inference_steps: int,
    scheduler: FlaxUniPCMultistepScheduler,
):
  """Denoising loop for WAN 2.2 T2V using jax.lax.fori_loop.

  The entire denoising loop runs as a single XLA program, eliminating
  per-step Python dispatch overhead. Dual-transformer selection
  (high-noise vs low-noise based on boundary timestep) is handled
  inside the loop using jax.lax.cond.

  Both transformers share the same architecture (identical graphdef),
  so a single graphdef is used with jax.lax.cond selecting between
  the two weight states per step.

  Args:
    graphdef: nnx graphdef shared by both transformers.
    low_noise_state / low_noise_rest: weight and remaining nnx state for
      the low-noise transformer (timesteps below `boundary`).
    high_noise_state / high_noise_rest: weight and remaining nnx state for
      the high-noise transformer (timesteps at or above `boundary`).
    latents: initial noisy latents; first axis is batch.
    prompt_embeds / negative_prompt_embeds: text embeddings for the
      conditional and unconditional passes.
    scheduler_state: UniPC scheduler state; optional fields are made
      concrete below before entering the loop.
    do_cfg: static flag — whether classifier-free guidance is applied.
    guidance_scale_low / guidance_scale_high: per-phase CFG scales.
    boundary: timestep threshold separating the two transformer phases.
    num_inference_steps: number of denoising iterations.
    scheduler: the (static, hashable) scheduler object.

  Returns:
    The denoised latents after `num_inference_steps` steps.
  """
  bsz = latents.shape[0]

  # Pre-combine embeddings for CFG (static at trace time); the conditional
  # half comes first, which fixes the slicing order used on noise_pred below.
  if do_cfg:
    prompt_embeds_combined = jnp.concatenate([prompt_embeds, negative_prompt_embeds], axis=0)
  else:
    prompt_embeds_combined = prompt_embeds

  # Pre-initialize scheduler state with concrete values for fori_loop:
  # step_index/last_sample/begin_index must not be None so the loop-carry
  # pytree keeps an identical structure on every iteration.
  scheduler_state = scheduler_state.replace(
      step_index=0,
      last_sample=jnp.zeros_like(latents),
      begin_index=0,
  )

  def body_fn(step, carry):
    # carry = (current latents, scheduler state); `step` indexes timesteps.
    latents, sched_state = carry
    t = sched_state.timesteps[step]
    # High-noise transformer handles timesteps at or above the boundary.
    use_high = t >= boundary

    # Select guidance scale based on transformer phase.
    guidance_scale = jnp.where(use_high, guidance_scale_high, guidance_scale_low)

    if do_cfg:
      latents_input = jnp.concatenate([latents] * 2)
      timestep = jnp.broadcast_to(t, (bsz * 2,))
    else:
      latents_input = latents
      timestep = jnp.broadcast_to(t, (bsz,))

    # Select transformer weights via jax.lax.cond.
    # Both branches trace through the same graphdef with different states;
    # latents_input/timestep/embeddings are captured by closure.
    def high_noise_forward():
      transformer = nnx.merge(graphdef, high_noise_state, high_noise_rest)
      return transformer(
          hidden_states=latents_input,
          timestep=timestep,
          encoder_hidden_states=prompt_embeds_combined,
      )

    def low_noise_forward():
      transformer = nnx.merge(graphdef, low_noise_state, low_noise_rest)
      return transformer(
          hidden_states=latents_input,
          timestep=timestep,
          encoder_hidden_states=prompt_embeds_combined,
      )

    noise_pred = jax.lax.cond(use_high, high_noise_forward, low_noise_forward)

    if do_cfg:
      # First bsz rows are conditional, remaining bsz rows unconditional.
      noise_cond = noise_pred[:bsz]
      noise_uncond = noise_pred[bsz:]
      noise_pred = noise_uncond + guidance_scale * (noise_cond - noise_uncond)

    latents, sched_state = scheduler.step(sched_state, noise_pred, t, latents).to_tuple()
    return latents, sched_state

  latents, _ = jax.lax.fori_loop(0, num_inference_steps, body_fn, (latents, scheduler_state))
  return latents
301+
302+
182303
def run_inference_2_2(
183304
low_noise_graphdef,
184305
low_noise_state,

src/maxdiffusion/pipelines/wan/wan_pipeline_i2v_2p1.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from maxdiffusion import max_logging
1616
from maxdiffusion.image_processor import PipelineImageInput
1717
from .wan_pipeline import WanPipeline, transformer_forward_pass, init_magcache, magcache_step
18+
from ...schedulers.scheduling_unipc_multistep_flax import UniPCMultistepSchedulerState
1819
from ...models.wan.transformers.transformer_wan import WanModel
1920
from typing import List, Union, Optional, Tuple
2021
from ...pyconfig import HyperParameters
@@ -236,6 +237,37 @@ def _process_image_input(img_input, height, width, num_videos_per_prompt):
236237
if first_frame_mask is not None:
237238
first_frame_mask = jax.device_put(first_frame_mask, data_sharding)
238239

240+
# Use fori_loop path when no caching is enabled for reduced dispatch overhead.
241+
use_fori = not use_magcache
242+
243+
if use_fori:
244+
do_cfg = guidance_scale > 1.0
245+
p_run_inference = partial(
246+
run_inference_fori_2_1_i2v,
247+
do_cfg=do_cfg,
248+
guidance_scale=guidance_scale,
249+
num_inference_steps=num_inference_steps,
250+
scheduler=self.scheduler,
251+
)
252+
with self.mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules):
253+
latents = p_run_inference(
254+
graphdef=graphdef,
255+
sharded_state=state,
256+
rest_of_state=rest_of_state,
257+
latents=latents,
258+
condition=condition,
259+
prompt_embeds=prompt_embeds,
260+
negative_prompt_embeds=negative_prompt_embeds,
261+
image_embeds=image_embeds,
262+
scheduler_state=scheduler_state,
263+
)
264+
latents = jnp.transpose(latents, (0, 4, 1, 2, 3))
265+
latents = self._denormalize_latents(latents)
266+
267+
if output_type == "latent":
268+
return latents
269+
return self._decode_latents_to_video(latents)
270+
239271
p_run_inference = partial(
240272
run_inference_2_1_i2v,
241273
graphdef=graphdef,
@@ -269,6 +301,85 @@ def _process_image_input(img_input, height, width, num_videos_per_prompt):
269301
return self._decode_latents_to_video(latents)
270302

271303

304+
@partial(jax.jit, static_argnames=("do_cfg", "guidance_scale", "num_inference_steps", "scheduler"))
def run_inference_fori_2_1_i2v(
    graphdef,
    sharded_state,
    rest_of_state,
    latents: jnp.array,
    condition: jnp.array,
    prompt_embeds: jnp.array,
    negative_prompt_embeds: jnp.array,
    image_embeds: jnp.array,
    scheduler_state: UniPCMultistepSchedulerState,
    do_cfg: bool,
    guidance_scale: float,
    num_inference_steps: int,
    scheduler: FlaxUniPCMultistepScheduler,
):
  """Run the WAN 2.1 I2V denoising loop as a single jax.lax.fori_loop.

  The whole loop compiles into one XLA program, avoiding per-step Python
  dispatch. I2V-specific details: the conditioning latents are concatenated
  onto the noisy latents along the channel axis and the image embeddings are
  passed to the transformer each step. Used only when MagCache is disabled.
  """
  batch = latents.shape[0]

  # When CFG is on, duplicate every conditioning input once at trace time so
  # one transformer call covers both halves of the doubled batch.
  if do_cfg:
    text_embeds = jnp.concatenate([prompt_embeds, negative_prompt_embeds], axis=0)
    img_embeds = jnp.concatenate([image_embeds, image_embeds], axis=0)
    cond = jnp.concatenate([condition] * 2)
  else:
    text_embeds = prompt_embeds
    img_embeds = image_embeds
    cond = condition

  # Concretize the optional scheduler fields (step_index, last_sample,
  # begin_index) so the loop-carry pytree keeps one structure across all
  # fori_loop iterations.
  scheduler_state = scheduler_state.replace(
      step_index=0,
      last_sample=jnp.zeros_like(latents),
      begin_index=0,
  )

  def denoise_step(step, carry):
    x, sched = carry
    t = sched.timesteps[step]

    # Trace-time reassembly of the transformer from graphdef + states.
    model = nnx.merge(graphdef, sharded_state, rest_of_state)

    if do_cfg:
      x_in = jnp.concatenate([x] * 2)
      timestep = jnp.broadcast_to(t, (batch * 2,))
    else:
      x_in = x
      timestep = jnp.broadcast_to(t, (batch,))

    # Stack the conditioning channels on, then go BFHWC -> BCFHW for the model.
    model_in = jnp.transpose(jnp.concatenate([x_in, cond], axis=-1), (0, 4, 1, 2, 3))

    eps = model(
        hidden_states=model_in,
        timestep=timestep,
        encoder_hidden_states=text_embeds,
        encoder_hidden_states_image=img_embeds,
    )

    if do_cfg:
      # First half of the batch is conditional, second half unconditional.
      eps_cond, eps_uncond = eps[:batch], eps[batch:]
      eps = eps_uncond + guidance_scale * (eps_cond - eps_uncond)

    # Back to BFHWC latent layout before the scheduler update.
    eps = jnp.transpose(eps, (0, 2, 3, 4, 1))
    x, sched = scheduler.step(sched, eps, t, x).to_tuple()
    return x, sched

  latents, _ = jax.lax.fori_loop(0, num_inference_steps, denoise_step, (latents, scheduler_state))
  return latents
381+
382+
272383
def run_inference_2_1_i2v(
273384
graphdef,
274385
sharded_state,

0 commit comments

Comments
 (0)