Commit b279c7d

Implement KV cache for cross-attention in Wan T2V and I2V pipelines
1 parent abb97c3 commit b279c7d

10 files changed

Lines changed: 255 additions & 46 deletions
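
Why a KV cache helps here: in cross-attention the keys and values are projections of the text encoder output (plus the CLIP image embeddings for I2V), and that conditioning is fixed for the entire denoising loop; only the query, computed from the evolving latent, changes from step to step. A minimal, self-contained sketch of that invariant (illustrative shapes and weights, not code from this commit):

    import jax
    import jax.numpy as jnp

    def cross_attention(q, k, v):
      # Plain scaled dot-product attention; shapes are illustrative, not WAN's.
      scores = q @ k.swapaxes(-1, -2) / jnp.sqrt(jnp.float32(q.shape[-1]))
      return jax.nn.softmax(scores, axis=-1) @ v

    text_emb = jax.random.normal(jax.random.PRNGKey(0), (1, 512, 64))  # conditioning: fixed for the whole loop
    w_q = w_k = w_v = jnp.eye(64)

    # K/V depend only on the conditioning, so project them once per prompt...
    k, v = text_emb @ w_k, text_emb @ w_v
    for t in range(4):  # denoising loop (truncated)
      latent_t = jax.random.normal(jax.random.PRNGKey(t + 1), (1, 256, 64))
      q = latent_t @ w_q  # ...only Q is recomputed at each step.
      out = cross_attention(q, k, v)

The changes below apply this idea inside the WAN attention and transformer modules: K/V are projected once per prompt via the new compute_kv / compute_kv_cache helpers and reused at every denoising step.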

src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 1 addition & 0 deletions
@@ -328,6 +328,7 @@ flow_shift: 3.0
 # Skips the unconditional forward pass on ~35% of steps via residual compensation.
 # See: FasterCache (Lv et al. 2024), WAN 2.1 paper §4.4.2
 use_cfg_cache: False
+use_kv_cache: False
 use_magcache: False
 magcache_thresh: 0.12
 magcache_K: 2

src/maxdiffusion/configs/base_wan_1_3b.yml

Lines changed: 2 additions & 1 deletion
@@ -1,4 +1,4 @@
-# Copyright 2023 Google LLC
+# Copyright 2023 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -282,6 +282,7 @@ flow_shift: 3.0
 
 # Diffusion CFG cache (FasterCache-style, WAN 2.1 T2V only)
 use_cfg_cache: False
+use_kv_cache: False
 
 # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
 guidance_rescale: 0.0

src/maxdiffusion/configs/base_wan_27b.yml

Lines changed: 1 addition & 0 deletions
@@ -304,6 +304,7 @@ boundary_ratio: 0.875
 
 # Diffusion CFG cache (FasterCache-style)
 use_cfg_cache: False
+use_kv_cache: False
 # SenCache: Sensitivity-Aware Caching (arXiv:2602.24208) — skip forward pass
 # when predicted output change (based on accumulated latent/timestep drift) is small
 use_sen_cache: False

src/maxdiffusion/configs/base_wan_i2v_14b.yml

Lines changed: 1 addition & 0 deletions
@@ -288,6 +288,7 @@ flow_shift: 5.0
 
 # Diffusion CFG cache (FasterCache-style)
 use_cfg_cache: False
+use_kv_cache: False
 # SenCache: Sensitivity-Aware Caching (arXiv:2602.24208)
 use_sen_cache: False
 use_magcache: False

src/maxdiffusion/configs/base_wan_i2v_27b.yml

Lines changed: 1 addition & 0 deletions
@@ -300,6 +300,7 @@ boundary_ratio: 0.875
 
 # Diffusion CFG cache (FasterCache-style)
 use_cfg_cache: False
+use_kv_cache: False
 # SenCache: Sensitivity-Aware Caching (arXiv:2602.24208)
 use_sen_cache: False
 

src/maxdiffusion/models/attention_flax.py

Lines changed: 93 additions & 21 deletions
@@ -15,7 +15,7 @@
 import contextlib
 import functools
 import math
-from typing import Optional, Callable, Tuple
+from typing import Optional, Callable, Tuple, Dict
 import flax.linen as nn
 from flax import nnx
 import jax
@@ -1130,6 +1130,7 @@ def __call__(
       encoder_attention_mask: Optional[jax.Array] = None,
       deterministic: bool = True,
       rngs: nnx.Rngs = None,
+      cached_kv: Optional[Dict[str, Tuple[jax.Array, jax.Array]]] = None,
   ) -> jax.Array:
     axis_names = nn.logical_to_mesh_axes((BATCH, LENGTH, HEAD))
     hidden_states = jax.lax.with_sharding_constraint(hidden_states, axis_names)
@@ -1144,16 +1145,22 @@
     if not is_i2v_cross_attention:
       with jax.named_scope("query_proj"):
         query_proj = self.query(hidden_states)
-      with jax.named_scope("key_proj"):
-        key_proj = self.key(encoder_hidden_states)
-      with jax.named_scope("value_proj"):
-        value_proj = self.value(encoder_hidden_states)
-
+
       if self.qk_norm:
         with self.conditional_named_scope("attn_q_norm"):
           query_proj = self.norm_q(query_proj)
-        with self.conditional_named_scope("attn_k_norm"):
-          key_proj = self.norm_k(key_proj)
+
+      if not is_self_attention and cached_kv is not None and "text" in cached_kv:
+        key_proj, value_proj = cached_kv["text"]
+      else:
+        with jax.named_scope("key_proj"):
+          key_proj = self.key(encoder_hidden_states)
+        with jax.named_scope("value_proj"):
+          value_proj = self.value(encoder_hidden_states)
+
+        if self.qk_norm:
+          with self.conditional_named_scope("attn_k_norm"):
+            key_proj = self.norm_k(key_proj)
 
     if rotary_emb is not None:
       with self.conditional_named_scope("attn_rope"):
@@ -1211,22 +1218,29 @@ def __call__(
       query_proj_text = query_proj_raw
 
       # Text K/V
-      with self.conditional_named_scope("proj_key"):
-        key_proj_text = self.key(encoder_hidden_states_text)
-      if self.qk_norm:
-        with self.conditional_named_scope("attn_k_norm"):
-          key_proj_text = self.norm_k(key_proj_text)
-      with self.conditional_named_scope("proj_value"):
-        value_proj_text = self.value(encoder_hidden_states_text)
+      if cached_kv is not None and "text" in cached_kv:
+        key_proj_text, value_proj_text = cached_kv["text"]
+      else:
+        with self.conditional_named_scope("proj_key"):
+          key_proj_text = self.key(encoder_hidden_states_text)
+        if self.qk_norm:
+          with self.conditional_named_scope("attn_k_norm"):
+            key_proj_text = self.norm_k(key_proj_text)
+        with self.conditional_named_scope("proj_value"):
+          value_proj_text = self.value(encoder_hidden_states_text)
 
       # Image K/V (only if image embeddings are present)
       if encoder_hidden_states_img is not None:
-        with self.conditional_named_scope("add_proj_k"):
-          key_proj_img = self.add_k_proj(encoder_hidden_states_img)
-        with self.conditional_named_scope("norm_add_k"):
-          key_proj_img = self.norm_added_k(key_proj_img)
-        with self.conditional_named_scope("add_proj_v"):
-          value_proj_img = self.add_v_proj(encoder_hidden_states_img)
+        if cached_kv is not None and "image" in cached_kv:
+          key_proj_img, value_proj_img = cached_kv["image"]
+        else:
+          with self.conditional_named_scope("add_proj_k"):
+            key_proj_img = self.add_k_proj(encoder_hidden_states_img)
+          with self.conditional_named_scope("norm_add_k"):
+            key_proj_img = self.norm_added_k(key_proj_img)
+          with self.conditional_named_scope("add_proj_v"):
+            value_proj_img = self.add_v_proj(encoder_hidden_states_img)
+
         query_proj_img = query_proj_raw
       # Check norm_added_k too
       # Checkpointing
@@ -1264,6 +1278,64 @@ def __call__(
       hidden_states = self.drop_out(hidden_states, deterministic=deterministic, rngs=rngs)
     return hidden_states
 
+  def compute_kv(
+      self,
+      encoder_hidden_states: jax.Array,
+      encoder_attention_mask: Optional[jax.Array] = None,
+  ) -> Dict[str, Tuple[jax.Array, jax.Array]]:
+    is_i2v_cross_attention = self.added_kv_proj_dim is not None
+
+    if not is_i2v_cross_attention:
+      with jax.named_scope("key_proj"):
+        key_proj = self.key(encoder_hidden_states)
+      with jax.named_scope("value_proj"):
+        value_proj = self.value(encoder_hidden_states)
+
+      if self.qk_norm:
+        with self.conditional_named_scope("attn_k_norm"):
+          key_proj = self.norm_k(key_proj)
+
+      return {"text": (key_proj, value_proj)}
+    else:
+      # Image embeddings are padded to multiples of 128 for TPU flash attention
+      alignment = 128
+      if self.image_seq_len is not None:
+        image_seq_len_actual = self.image_seq_len
+      else:
+        image_seq_len_actual = 257
+      padded_img_len = ((image_seq_len_actual + alignment - 1) // alignment) * alignment  # 257 -> 384
+
+      if encoder_attention_mask is None:
+        padded_img_len = image_seq_len_actual
+
+      encoder_hidden_states_img = encoder_hidden_states[:, :padded_img_len, :]
+      encoder_hidden_states_text = encoder_hidden_states[:, padded_img_len:, :]
+
+      # Text K/V
+      with self.conditional_named_scope("proj_key"):
+        key_proj_text = self.key(encoder_hidden_states_text)
+      if self.qk_norm:
+        with self.conditional_named_scope("attn_k_norm"):
+          key_proj_text = self.norm_k(key_proj_text)
+      with self.conditional_named_scope("proj_value"):
+        value_proj_text = self.value(encoder_hidden_states_text)
+
+      # Image K/V (only if image embeddings are present)
+      if encoder_hidden_states_img is not None:
+        with self.conditional_named_scope("add_proj_k"):
+          key_proj_img = self.add_k_proj(encoder_hidden_states_img)
+        with self.conditional_named_scope("norm_add_k"):
+          key_proj_img = self.norm_added_k(key_proj_img)
+        with self.conditional_named_scope("add_proj_v"):
+          value_proj_img = self.add_v_proj(encoder_hidden_states_img)
+
+        return {
+            "text": (key_proj_text, value_proj_text),
+            "image": (key_proj_img, value_proj_img)
+        }
+      else:
+        return {"text": (key_proj_text, value_proj_text)}
+
 
 class FlaxFluxAttention(nn.Module):
   query_dim: int
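
The `cached_kv` argument accepted by `__call__` and the dict returned by `compute_kv` share the same layout: one entry per conditioning stream, each holding a (key, value) pair of already-projected (and, for keys, already-normalized) arrays. A shape sketch, with illustrative batch/sequence/feature sizes that are not taken from this commit:

    from typing import Dict, Tuple
    import jax.numpy as jnp

    # T2V cross-attention caches only the text stream.
    cached_kv_t2v: Dict[str, Tuple[jnp.ndarray, jnp.ndarray]] = {
        "text": (jnp.zeros((1, 512, 1536)), jnp.zeros((1, 512, 1536))),
    }

    # I2V additionally caches the image stream produced by add_k_proj / add_v_proj;
    # per the comment in compute_kv, the 257 image tokens are padded to 384 for
    # TPU flash attention when an encoder_attention_mask is present.
    cached_kv_i2v: Dict[str, Tuple[jnp.ndarray, jnp.ndarray]] = {
        "text": (jnp.zeros((1, 512, 1536)), jnp.zeros((1, 512, 1536))),
        "image": (jnp.zeros((1, 384, 1536)), jnp.zeros((1, 384, 1536))),
    }

When an entry is present, `__call__` skips the corresponding key/value projection (and its `norm_k`); the query path is unchanged.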

src/maxdiffusion/models/wan/transformers/transformer_wan.py

Lines changed: 79 additions & 5 deletions
@@ -374,6 +374,7 @@ def __call__(
       deterministic: bool = True,
       rngs: nnx.Rngs = None,
       encoder_attention_mask: Optional[jax.Array] = None,
+      cached_kv: Optional[Dict[str, Tuple[jax.Array, jax.Array]]] = None,
   ):
     with self.conditional_named_scope("transformer_block"):
       shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = jnp.split(
@@ -413,6 +414,7 @@ def __call__(
           deterministic=deterministic,
           rngs=rngs,
           encoder_attention_mask=encoder_attention_mask,
+          cached_kv=cached_kv,
       )
     with self.conditional_named_scope("cross_attn_residual"):
       hidden_states = hidden_states + attn_output
@@ -431,6 +433,13 @@
       )
     return hidden_states
 
+  def compute_kv(
+      self,
+      encoder_hidden_states: jax.Array,
+      encoder_attention_mask: Optional[jax.Array] = None,
+  ) -> Dict[str, Tuple[jax.Array, jax.Array]]:
+    return self.attn2.compute_kv(encoder_hidden_states, encoder_attention_mask)
+
 
 class WanModel(nnx.Module, FlaxModelMixin, ConfigMixin):
 
@@ -583,6 +592,53 @@ def conditional_named_scope(self, name: str):
     """Return a JAX named scope if enabled, otherwise a null context."""
     return jax.named_scope(name) if self.enable_jax_named_scopes else contextlib.nullcontext()
 
+  def compute_kv_cache(
+      self,
+      encoder_hidden_states: jax.Array,
+      encoder_hidden_states_image: Optional[jax.Array] = None,
+      timestep: Optional[jax.Array] = None,
+  ) -> Dict[str, Tuple[jax.Array, jax.Array]]:
+    if timestep is None:
+      batch_size = encoder_hidden_states.shape[0]
+      timestep = jnp.zeros((batch_size,), dtype=jnp.int32)
+
+    with self.conditional_named_scope("condition_embedder"):
+      (
+          temb,
+          timestep_proj,
+          encoder_hidden_states,
+          encoder_hidden_states_image,
+          encoder_attention_mask,
+      ) = self.condition_embedder(timestep, encoder_hidden_states, encoder_hidden_states_image)
+
+    if encoder_hidden_states_image is not None:
+      encoder_hidden_states = jnp.concatenate([encoder_hidden_states_image, encoder_hidden_states], axis=1)
+      if encoder_attention_mask is not None:
+        text_mask = jnp.ones(
+            (encoder_hidden_states.shape[0], encoder_hidden_states.shape[1] - encoder_hidden_states_image.shape[1]),
+            dtype=jnp.int32,
+        )
+        encoder_attention_mask = jnp.concatenate([encoder_attention_mask, text_mask], axis=1)
+
+    if self.scan_layers:
+      @nnx.vmap(in_axes=(0, None, None), out_axes=0, transform_metadata={nnx.PARTITION_NAME: "layers_per_stage"})
+      def _compute_kv(block, enc_states, enc_mask):
+        return block.compute_kv(enc_states, enc_mask)
+
+      kv_cache = _compute_kv(self.blocks, encoder_hidden_states, encoder_attention_mask)
+    else:
+      kv_cache_list = []
+      for block in self.blocks:
+        kv_cache_list.append(block.compute_kv(encoder_hidden_states, encoder_attention_mask))
+      keys = kv_cache_list[0].keys()
+      kv_cache = {}
+      for k in keys:
+        k_list = [d[k][0] for d in kv_cache_list]
+        v_list = [d[k][1] for d in kv_cache_list]
+        kv_cache[k] = (jnp.stack(k_list, axis=0), jnp.stack(v_list, axis=0))
+
+    return kv_cache
+
   @jax.named_scope("WanModel")
   def __call__(
     self,
@@ -597,6 +653,7 @@ def __call__(
     skip_blocks: Optional[jax.Array] = None,
     cached_residual: Optional[jax.Array] = None,
     return_residual: bool = False,
+    kv_cache: Optional[Dict[str, Tuple[jax.Array, jax.Array]]] = None,
  ) -> Union[jax.Array, Tuple[jax.Array, jax.Array], Dict[str, jax.Array]]:
     hidden_states = nn.with_logical_constraint(hidden_states, ("batch", None, None, None, None))
     batch_size, _, num_frames, height, width = hidden_states.shape
@@ -634,8 +691,14 @@ def __call__(
     def _run_all_blocks(h):
       if self.scan_layers:
 
-        def scan_fn(carry, block):
+        def scan_fn(carry, block_input):
           hidden_states_carry, rngs_carry = carry
+          if kv_cache is not None:
+            block, layer_kv_cache = block_input
+          else:
+            block = block_input
+            layer_kv_cache = None
+
           hidden_states = block(
               hidden_states_carry,
               encoder_hidden_states,
@@ -644,6 +707,7 @@ def scan_fn(carry, block):
               deterministic,
               rngs_carry,
               encoder_attention_mask,
+              cached_kv=layer_kv_cache,
           )
           new_carry = (hidden_states, rngs_carry)
           return new_carry, None
@@ -652,19 +716,28 @@ def scan_fn(carry, block):
             scan_fn, self.names_which_can_be_saved, self.names_which_can_be_offloaded, prevent_cse=not self.scan_layers
         )
         initial_carry = (h, rngs)
+
+        if kv_cache is not None:
+          scan_input = (self.blocks, kv_cache)
+        else:
+          scan_input = self.blocks
+
         final_carry, _ = nnx.scan(
             rematted_block_forward,
             length=self.num_layers,
             in_axes=(nnx.Carry, 0),
             out_axes=(nnx.Carry, 0),
-        )(initial_carry, self.blocks)
+        )(initial_carry, scan_input)
 
         h_out, _ = final_carry
       else:
         h_out = h
-        for block in self.blocks:
+        for i, block in enumerate(self.blocks):
+          layer_kv_cache = None
+          if kv_cache is not None:
+            layer_kv_cache = jax.tree_map(lambda x: x[i], kv_cache)
 
-          def layer_forward(hidden_states):
+          def layer_forward(hidden_states, l_kv):
             return block(
                 hidden_states,
                 encoder_hidden_states,
@@ -673,6 +746,7 @@ def layer_forward(hidden_states):
                 deterministic,
                 rngs,
                 encoder_attention_mask=encoder_attention_mask,
+                cached_kv=l_kv,
             )
 
           rematted_layer_forward = self.gradient_checkpoint.apply(
@@ -681,7 +755,7 @@ def layer_forward(hidden_states):
              self.names_which_can_be_offloaded,
              prevent_cse=not self.scan_layers,
          )
-          h_out = rematted_layer_forward(h_out)
+          h_out = rematted_layer_forward(h_out, layer_kv_cache)
       return h_out
 
     hidden_states_before_blocks = hidden_states
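
How the two new entry points are intended to fit together: the pipeline wiring lives in the remaining files of this 10-file commit and is not reproduced above, so the loop below is a hedged sketch. `wan_model`, `config`, `prompt_embeds`, `image_embeds`, `latents`, `timesteps`, and `scheduler_step` are assumed names; only `compute_kv_cache`, `kv_cache`, and `use_kv_cache` come from the diff itself.

    # Hedged sketch, not the pipeline code from this commit.
    kv_cache = None
    if config.use_kv_cache:
      # One pass over the conditioning before the denoising loop. With scan_layers the
      # per-block K/V come back stacked on a leading layer axis via nnx.vmap; otherwise
      # compute_kv_cache stacks them with jnp.stack, and __call__ slices layer i out
      # with jax.tree_map(lambda x: x[i], kv_cache).
      kv_cache = wan_model.compute_kv_cache(prompt_embeds, image_embeds)

    for t in timesteps:
      noise_pred = wan_model(
          latents,
          timestep=t,
          encoder_hidden_states=prompt_embeds,
          encoder_hidden_states_image=image_embeds,
          kv_cache=kv_cache,  # cross-attention K/V projections are reused instead of recomputed
      )
      latents = scheduler_step(latents, noise_pred, t)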
