Skip to content

Commit 3d2edcc

Browse files
committed
Use nnx.scan instead of a Python for loop.
1 parent aa442f9 commit 3d2edcc

4 files changed

Lines changed: 49 additions & 25 deletions

File tree

src/maxdiffusion/common_types.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,3 +43,5 @@
4343
KEEP_1 = "activation_keep_1"
4444
KEEP_2 = "activation_keep_2"
4545
CONV_OUT = "activation_conv_out_channels"
46+
47+
WAN_MODEL = "Wan2.1"

src/maxdiffusion/models/modeling_flax_pytorch_utils.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from chex import Array
2626
from ..utils import logging
2727
from .. import max_logging
28+
from .. import common_types
2829

2930

3031
logger = logging.get_logger(__name__)
@@ -86,7 +87,7 @@ def rename_key(key):
8687

8788
# Adapted from https://github.com/huggingface/transformers/blob/c603c80f46881ae18b2ca50770ef65fa4033eacd/src/transformers/modeling_flax_pytorch_utils.py#L69
8889
# and https://github.com/patil-suraj/stable-diffusion-jax/blob/main/stable_diffusion_jax/convert_diffusers_to_jax.py
89-
def rename_key_and_reshape_tensor(pt_tuple_key, pt_tensor, random_flax_state_dict):
90+
def rename_key_and_reshape_tensor(pt_tuple_key, pt_tensor, random_flax_state_dict, model_type=None):
9091
"""Rename PT weight names to corresponding Flax weight names and reshape tensor if necessary"""
9192
# conv norm or layer norm
9293
renamed_pt_tuple_key = pt_tuple_key[:-1] + ("scale",)
@@ -109,9 +110,17 @@ def rename_key_and_reshape_tensor(pt_tuple_key, pt_tensor, random_flax_state_dic
109110
renamed_pt_tuple_key = pt_tuple_key[:-2] + (rename_to, weight_name)
110111
if renamed_pt_tuple_key in random_flax_state_dict:
111112
if isinstance(random_flax_state_dict[renamed_pt_tuple_key], Partitioned):
112-
assert random_flax_state_dict[renamed_pt_tuple_key].value.shape == pt_tensor.T.shape
113+
# Wan 2.1 uses nnx.scan and nnx.vmap, which stack layer weights and therefore cause a shape mismatch
114+
# with the original weights, which are not stacked.
115+
if model_type is not None and model_type == common_types.WAN_MODEL:
116+
pass
117+
else:
118+
assert random_flax_state_dict[renamed_pt_tuple_key].value.shape == pt_tensor.T.shape
113119
else:
114-
assert random_flax_state_dict[renamed_pt_tuple_key].shape == pt_tensor.T.shape
120+
if model_type is not None and model_type == common_types.WAN_MODEL:
121+
pass
122+
else:
123+
assert random_flax_state_dict[renamed_pt_tuple_key].shape == pt_tensor.T.shape
115124
return renamed_pt_tuple_key, pt_tensor.T
116125

117126
if (

src/maxdiffusion/models/wan/transformers/transformer_wan.py

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,7 @@ def __init__(
359359
):
360360
inner_dim = num_attention_heads * attention_head_dim
361361
out_channels = out_channels or in_channels
362+
self.num_layers = num_layers
362363

363364
# 1. Patch & position embedding
364365
self.rope = WanRotaryPosEmbed(attention_head_dim, patch_size, rope_max_seq_len)
@@ -396,9 +397,10 @@ def __init__(
396397
)
397398

398399
# 3. Transformer blocks
399-
blocks = []
400-
for _ in range(num_layers):
401-
block = WanTransformerBlock(
400+
@nnx.split_rngs(splits=num_layers)
401+
@nnx.vmap
402+
def init_block(rngs):
403+
return WanTransformerBlock(
402404
rngs=rngs,
403405
dim=inner_dim,
404406
ffn_dim=ffn_dim,
@@ -414,8 +416,7 @@ def __init__(
414416
precision=precision,
415417
attention=attention,
416418
)
417-
blocks.append(block)
418-
self.blocks = blocks
419+
self.blocks = init_block(rngs)
419420

420421
self.norm_out = FP32LayerNorm(rngs=rngs, dim=inner_dim, eps=eps, elementwise_affine=False)
421422
self.proj_out = nnx.Linear(
@@ -463,21 +464,21 @@ def __call__(
463464
if encoder_hidden_states_image is not None:
464465
raise NotImplementedError("img2vid is not yet implemented.")
465466

466-
def skip_block_true(hidden_states):
467-
split_bs = hidden_states.shape[0] // 2
468-
prev_neg_hidden_states = hidden_states[split_bs:]
467+
def scan_fn(carry, block):
468+
hidden_states, encoder_hidden_states, timestep_proj, rotary_emb = carry
469469
hidden_states = block(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)
470-
hidden_states = jnp.concatenate([hidden_states[:split_bs], prev_neg_hidden_states], axis=0)
471-
return hidden_states
472-
473-
for block_idx, block in enumerate(self.blocks):
474-
should_skip_block = slg_mask[block_idx] & is_uncond
475-
hidden_states = jax.lax.cond(
476-
should_skip_block,
477-
lambda _: skip_block_true(hidden_states), # If true, pass through original hidden_states (skip block)
478-
lambda _: block(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb),
479-
hidden_states,
480-
)
470+
return (hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)
471+
472+
initial_carry = (hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)
473+
final_carry = nnx.scan(
474+
scan_fn,
475+
length=self.num_layers,
476+
in_axes=(nnx.Carry, 0),
477+
out_axes=nnx.Carry,
478+
)(initial_carry, self.blocks)
479+
480+
hidden_states = final_carry[0]
481+
481482
shift, scale = jnp.split(self.scale_shift_table + jnp.expand_dims(temb, axis=1), 2, axis=1)
482483

483484
hidden_states = (self.norm_out(hidden_states) * (1 + scale) + shift).astype(hidden_states.dtype)

src/maxdiffusion/models/wan/wan_utils.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from safetensors import safe_open
99
from flax.traverse_util import unflatten_dict, flatten_dict
1010
from ..modeling_flax_pytorch_utils import (rename_key, rename_key_and_reshape_tensor, torch2jax, validate_flax_state_dict)
11+
from ...common_types import WAN_MODEL
1112

1213
CAUSVID_TRANSFORMER_MODEL_NAME_OR_PATH = "lightx2v/Wan2.1-T2V-14B-CausVid"
1314
WAN_21_FUSION_X_MODEL_NAME_OR_PATH = "vrgamedevgirl84/Wan14BT2VFusioniX"
@@ -82,7 +83,7 @@ def load_fusionx_transformer(pretrained_model_name_or_path: str, eval_shapes: di
8283

8384
pt_tuple_key = tuple(renamed_pt_key.split("."))
8485

85-
flax_key, flax_tensor = rename_key_and_reshape_tensor(pt_tuple_key, tensor, random_flax_state_dict)
86+
flax_key, flax_tensor = rename_key_and_reshape_tensor(pt_tuple_key, tensor, random_flax_state_dict, model_type=WAN_MODEL)
8687
flax_key = rename_for_nnx(flax_key)
8788
flax_key = _tuple_str_to_int(flax_key)
8889
flax_state_dict[flax_key] = jax.device_put(jnp.asarray(flax_tensor), device=cpu)
@@ -117,7 +118,7 @@ def load_causvid_transformer(pretrained_model_name_or_path: str, eval_shapes: di
117118

118119
pt_tuple_key = tuple(renamed_pt_key.split("."))
119120

120-
flax_key, flax_tensor = rename_key_and_reshape_tensor(pt_tuple_key, tensor, random_flax_state_dict)
121+
flax_key, flax_tensor = rename_key_and_reshape_tensor(pt_tuple_key, tensor, random_flax_state_dict, model_type=WAN_MODEL)
121122
flax_key = rename_for_nnx(flax_key)
122123
flax_key = _tuple_str_to_int(flax_key)
123124
flax_state_dict[flax_key] = jax.device_put(jnp.asarray(flax_tensor), device=cpu)
@@ -196,9 +197,20 @@ def load_base_wan_transformer(pretrained_model_name_or_path: str, eval_shapes: d
196197
renamed_pt_key = renamed_pt_key.replace("norm2", "norm2.layer_norm")
197198
pt_tuple_key = tuple(renamed_pt_key.split("."))
198199

199-
flax_key, flax_tensor = rename_key_and_reshape_tensor(pt_tuple_key, tensor, random_flax_state_dict)
200+
if "blocks" in pt_tuple_key:
201+
new_key = ("blocks",) + pt_tuple_key[2:]
202+
block_index = int(pt_tuple_key[1])
203+
pt_tuple_key = new_key
204+
flax_key, flax_tensor = rename_key_and_reshape_tensor(pt_tuple_key, tensor, random_flax_state_dict, model_type=WAN_MODEL)
200205
flax_key = rename_for_nnx(flax_key)
201206
flax_key = _tuple_str_to_int(flax_key)
207+
208+
if "blocks" in flax_key:
209+
if flax_key in flax_state_dict:
210+
new_tensor = flax_state_dict[flax_key]
211+
else:
212+
new_tensor = jnp.zeros((40,) + flax_tensor.shape)
213+
flax_tensor = new_tensor.at[block_index].set(flax_tensor)
202214
flax_state_dict[flax_key] = jax.device_put(jnp.asarray(flax_tensor), device=cpu)
203215
validate_flax_state_dict(eval_shapes, flax_state_dict)
204216
flax_state_dict = unflatten_dict(flax_state_dict)

0 commit comments

Comments (0)