Skip to content

Commit 0ef8c71

Browse files
load wan 2.1 transformer weights.
1 parent 440f39c commit 0ef8c71

3 files changed

Lines changed: 69 additions & 12 deletions

File tree

src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ gcs_metrics: False
2727
save_config_to_gcs: False
2828
log_period: 100
2929

30-
pretrained_model_name_or_path: ''
30+
pretrained_model_name_or_path: 'Wan-AI/Wan2.1-T2V-14B-Diffusers'
3131

3232
unet_checkpoint: ''
3333
revision: ''

src/maxdiffusion/models/attention_flax.py

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -618,7 +618,6 @@ def __init__(
618618
in_features=self.inner_dim,
619619
out_features=self.inner_dim,
620620
kernel_init=qkv_init_kernel,
621-
use_bias=qkv_bias,
622621
dtype=dtype,
623622
param_dtype=weights_dtype,
624623
precision=precision,
@@ -629,7 +628,6 @@ def __init__(
629628
in_features=self.inner_dim,
630629
out_features=self.inner_dim,
631630
kernel_init=qkv_init_kernel,
632-
use_bias=qkv_bias,
633631
dtype=dtype,
634632
param_dtype=weights_dtype,
635633
precision=precision,
@@ -640,7 +638,6 @@ def __init__(
640638
in_features=self.inner_dim,
641639
out_features=self.inner_dim,
642640
kernel_init=qkv_init_kernel,
643-
use_bias=qkv_bias,
644641
dtype=dtype,
645642
param_dtype=weights_dtype,
646643
precision=precision,
@@ -651,16 +648,15 @@ def __init__(
651648
in_features=self.inner_dim,
652649
out_features=self.inner_dim,
653650
kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("heads", "embed")),
654-
use_bias=qkv_bias,
655651
dtype=dtype,
656652
param_dtype=weights_dtype,
657653
precision=precision,
658654
)
659655

660-
self.query_norm = None
661-
self.key_norm = None
656+
self.norm_q = None
657+
self.norm_k = None
662658
if qk_norm is not None:
663-
self.query_norm = nnx.RMSNorm(
659+
self.norm_q = nnx.RMSNorm(
664660
num_features=self.inner_dim,
665661
rngs=rngs,
666662
epsilon=eps,
@@ -669,7 +665,7 @@ def __init__(
669665
param_dtype=weights_dtype
670666
)
671667

672-
self.key_norm = nnx.RMSNorm(
668+
self.norm_k = nnx.RMSNorm(
673669
num_features=self.inner_dim,
674670
rngs=rngs,
675671
dtype=dtype,
@@ -713,8 +709,8 @@ def __call__(
713709
value_proj = nn.with_logical_constraint(value_proj, self.value_axis_names)
714710

715711
if self.qk_norm:
716-
query_proj = self.query_norm(query_proj)
717-
key_proj = self.key_norm(key_proj)
712+
query_proj = self.norm_q(query_proj)
713+
key_proj = self.norm_k(key_proj)
718714

719715
if rotary_emb is not None:
720716
query_proj = _unflatten_heads(query_proj, self.heads)

src/maxdiffusion/models/wan/wan_utils.py

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1+
import json
12
import jax
23
import jax.numpy as jnp
34
from maxdiffusion import max_logging
45
from huggingface_hub import hf_hub_download
56
from safetensors import safe_open
6-
from flax.traverse_util import unflatten_dict
7+
from flax.traverse_util import unflatten_dict, flatten_dict
78
from ..modeling_flax_pytorch_utils import (rename_key, rename_key_and_reshape_tensor, torch2jax, validate_flax_state_dict)
89

910

@@ -16,6 +17,66 @@ def _tuple_str_to_int(in_tuple):
1617
out_list.append(item)
1718
return tuple(out_list)
1819

20+
def rename_for_nnx(key):
  """Map a ported checkpoint key to its nnx parameter name.

  nnx.RMSNorm stores its learned weight under "scale", so any key tuple
  that contains a norm_q / norm_k module gets its final element replaced
  with "scale". Every other key passes through unchanged.
  """
  if "norm_q" in key or "norm_k" in key:
    return key[:-1] + ("scale",)
  return key
25+
26+
def load_wan_transformer(pretrained_model_name_or_path: str, eval_shapes: dict, device: str, hf_download: bool = True):
27+
device = jax.devices(device)[0]
28+
with jax.default_device(device):
29+
if hf_download:
30+
# download the index file for sharded models.
31+
index_file_path = hf_hub_download(pretrained_model_name_or_path, subfolder="transformer", filename="diffusion_pytorch_model.safetensors.index.json")
32+
# open the index file.
33+
with open(index_file_path, 'r') as f:
34+
index_dict = json.load(f)
35+
model_files = set()
36+
for key in index_dict["weight_map"].keys():
37+
model_files.add(index_dict["weight_map"][key])
38+
39+
model_files = list(model_files)
40+
tensors = {}
41+
for model_file in model_files:
42+
ckpt_shard_path = hf_hub_download(
43+
pretrained_model_name_or_path, subfolder="transformer", filename=model_file
44+
)
45+
# now get all the filenames for the model that need downloading
46+
max_logging.log(f"Load and port Wan 2.1 transformer on {device}")
47+
48+
if ckpt_shard_path is not None:
49+
with safe_open(ckpt_shard_path, framework="pt") as f:
50+
for k in f.keys():
51+
tensors[k] = torch2jax(f.get_tensor(k))
52+
flax_state_dict = {}
53+
cpu = jax.local_devices(backend="cpu")[0]
54+
flattened_dict = flatten_dict(eval_shapes)
55+
# turn all block numbers to strings just for matching weights.
56+
# Later they will be turned back to ints.
57+
random_flax_state_dict = {}
58+
for key in flattened_dict:
59+
string_tuple = tuple([str(item) for item in key])
60+
random_flax_state_dict[string_tuple] = flattened_dict[key]
61+
del flattened_dict
62+
for pt_key, tensor in tensors.items():
63+
renamed_pt_key = rename_key(pt_key)
64+
renamed_pt_key = renamed_pt_key.replace("blocks_", "blocks.")
65+
renamed_pt_key = renamed_pt_key.replace("to_out_0", "proj_attn")
66+
renamed_pt_key = renamed_pt_key.replace("ffn.net_2", "ffn.proj_out")
67+
renamed_pt_key = renamed_pt_key.replace("ffn.net_0", "ffn.act_fn")
68+
renamed_pt_key = renamed_pt_key.replace("norm2", "norm2.layer_norm")
69+
pt_tuple_key = tuple(renamed_pt_key.split("."))
70+
71+
flax_key, flax_tensor = rename_key_and_reshape_tensor(pt_tuple_key, tensor, random_flax_state_dict)
72+
flax_key = rename_for_nnx(flax_key)
73+
flax_key = _tuple_str_to_int(flax_key)
74+
flax_state_dict[flax_key] = jax.device_put(jnp.asarray(flax_tensor), device=cpu)
75+
validate_flax_state_dict(eval_shapes, flax_state_dict)
76+
flax_state_dict = unflatten_dict(flax_state_dict)
77+
del tensors
78+
jax.clear_caches()
79+
return flax_state_dict
1980

2081
def load_wan_vae(pretrained_model_name_or_path: str, eval_shapes: dict, device: str, hf_download: bool = True):
2182
device = jax.devices(device)[0]

0 commit comments

Comments
 (0)