Skip to content

Commit b13347c

Browse files
committed
Add LoRA support for WAN models
1 parent ad56886 commit b13347c

9 files changed

Lines changed: 782 additions & 22 deletions

File tree

src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -317,12 +317,14 @@ lightning_repo: ""
317317
lightning_ckpt: ""
318318

319319
# LoRA parameters
320+
enable_lora: False
320321
# Values are lists to support multiple LoRA loading during inference in the future.
321322
lora_config: {
322-
lora_model_name_or_path: [],
323-
weight_name: [],
324-
adapter_name: [],
325-
scale: [],
323+
rank: [64],
324+
lora_model_name_or_path: ["lightx2v/Wan2.1-Distill-Loras"],
325+
weight_name: ["wan2.1_t2v_14b_lora_rank64_lightx2v_4step.safetensors"],
326+
adapter_name: ["wan21-distill-lora"],
327+
scale: [1.0],
326328
from_pt: []
327329
}
328330
# Ex with values:

src/maxdiffusion/configs/base_wan_27b.yml

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -316,12 +316,15 @@ lightning_repo: ""
316316
lightning_ckpt: ""
317317

318318
# LoRA parameters
319+
enable_lora: False
319320
# Values are lists to support multiple LoRA loading during inference in the future.
320321
lora_config: {
321-
lora_model_name_or_path: [],
322-
weight_name: [],
323-
adapter_name: [],
324-
scale: [],
322+
rank: [64],
323+
lora_model_name_or_path: ["lightx2v/Wan2.2-Distill-Loras"],
324+
high_noise_weight_name: ["wan2.2_t2v_A14b_high_noise_lora_rank64_lightx2v_4step_1217.safetensors"],
325+
low_noise_weight_name: ["wan2.2_t2v_A14b_low_noise_lora_rank64_lightx2v_4step_1217.safetensors"],
326+
adapter_name: ["wan22-distill-lora"],
327+
scale: [1.0],
325328
from_pt: []
326329
}
327330
# Ex with values:

src/maxdiffusion/configs/base_wan_i2v_14b.yml

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -276,8 +276,8 @@ profiler_steps: 10
276276
enable_jax_named_scopes: False
277277

278278
# Generation parameters
279-
prompt: "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
280-
prompt_2: "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
279+
prompt: "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in the background. Appearing behind him is a giant, translucent, pink spiritual manifestation (faxiang) that is synchronized with the man's action and pose." #"An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
280+
prompt_2: "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in the background. Appearing behind him is a giant, translucent, pink spiritual manifestation (faxiang) that is synchronized with the man's action and pose." #"An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
281281
negative_prompt: "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
282282
do_classifier_free_guidance: True
283283
height: 480
@@ -300,12 +300,14 @@ lightning_repo: ""
300300
lightning_ckpt: ""
301301

302302
# LoRA parameters
303+
enable_lora: False
303304
# Values are lists to support multiple LoRA loading during inference in the future.
304305
lora_config: {
305-
lora_model_name_or_path: [],
306-
weight_name: [],
307-
adapter_name: [],
308-
scale: [],
306+
rank: [64, 32],
307+
lora_model_name_or_path: ["lightx2v/Wan2.1-Distill-Loras", "starsfriday/Wan2.1-Divine-Power-LoRA"],
308+
weight_name: ["wan2.1_i2v_lora_rank64_lightx2v_4step.safetensors", "divine-power.safetensors"],
309+
adapter_name: ["wan21-distill-lora-i2v", "divine-power-lora"],
310+
scale: [1.0, 1.0],
309311
from_pt: []
310312
}
311313
# Ex with values:

src/maxdiffusion/configs/base_wan_i2v_27b.yml

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -277,8 +277,8 @@ profiler_steps: 10
277277
enable_jax_named_scopes: False
278278

279279
# Generation parameters
280-
prompt: "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
281-
prompt_2: "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
280+
prompt: "orbit 180 around an astronaut on the moon, the darkness and depth of space realised in the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
281+
prompt_2: "orbit 180 around an astronaut on the moon, the darkness and depth of space realised in the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
282282
negative_prompt: "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
283283
do_classifier_free_guidance: True
284284
height: 480
@@ -288,10 +288,10 @@ flow_shift: 3.0
288288

289289
# Reference for below guidance scale and boundary values: https://github.com/Wan-Video/Wan2.2/blob/main/wan/configs/wan_t2v_A14B.py
290290
# guidance scale factor for low noise transformer
291-
guidance_scale_low: 3.0
291+
guidance_scale_low: 3.0
292292

293293
# guidance scale factor for high noise transformer
294-
guidance_scale_high: 4.0
294+
guidance_scale_high: 4.0
295295

296296
# The timestep threshold. If `t` is at or above this value,
297297
# the `high_noise_model` is considered as the required model.
@@ -312,12 +312,15 @@ lightning_repo: ""
312312
lightning_ckpt: ""
313313

314314
# LoRA parameters
315+
enable_lora: False
315316
# Values are lists to support multiple LoRA loading during inference in the future.
316317
lora_config: {
317-
lora_model_name_or_path: [],
318-
weight_name: [],
319-
adapter_name: [],
320-
scale: [],
318+
rank: [64, 16],
319+
lora_model_name_or_path: ["lightx2v/Wan2.2-Distill-Loras", "ostris/wan22_i2v_14b_orbit_shot_lora"],
320+
high_noise_weight_name: ["wan2.2_i2v_A14b_high_noise_lora_rank64_lightx2v_4step_1022.safetensors", "wan22_14b_i2v_orbit_high_noise.safetensors"],
321+
low_noise_weight_name: ["wan2.2_i2v_A14b_low_noise_lora_rank64_lightx2v_4step_1022.safetensors", "wan22_14b_i2v_orbit_low_noise.safetensors"], # Empty or "wan2.2_i2v_A14b_low_noise_lora_rank64_lightx2v_4step_1022.safetensors"
322+
adapter_name: ["wan22-distill-lora", "wan22-orbit-lora"],
323+
scale: [1.0, 1.0],
321324
from_pt: []
322325
}
323326
# Ex with values:

src/maxdiffusion/generate_wan.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from google.cloud import storage
2929
import flax
3030
from maxdiffusion.common_types import WAN2_1, WAN2_2
31+
from maxdiffusion.loaders.wan_lora_nnx_loader import Wan2_1NNXLoraLoader, Wan2_2NNXLoraLoader
3132

3233

3334
def upload_video_to_gcs(output_dir: str, video_path: str):
@@ -188,6 +189,43 @@ def run(config, pipeline=None, filename_prefix="", commit_hash=None):
188189
else:
189190
raise ValueError(f"Unsupported model_name for checkpointer: {model_key}")
190191
pipeline, _, _ = checkpoint_loader.load_checkpoint()
192+
193+
# If LoRA is specified, inject layers and load weights.
194+
if (
195+
config.enable_lora
196+
and hasattr(config, "lora_config")
197+
and config.lora_config
198+
and config.lora_config["lora_model_name_or_path"]
199+
):
200+
if model_key == WAN2_1:
201+
lora_loader = Wan2_1NNXLoraLoader()
202+
lora_config = config.lora_config
203+
for i in range(len(lora_config["lora_model_name_or_path"])):
204+
pipeline = lora_loader.load_lora_weights(
205+
pipeline,
206+
lora_config["lora_model_name_or_path"][i],
207+
transformer_weight_name=lora_config["weight_name"][i],
208+
rank=lora_config["rank"][i],
209+
scale=lora_config["scale"][i],
210+
scan_layers=config.scan_layers,
211+
dtype=config.weights_dtype,
212+
)
213+
214+
if model_key == WAN2_2:
215+
lora_loader = Wan2_2NNXLoraLoader()
216+
lora_config = config.lora_config
217+
for i in range(len(lora_config["lora_model_name_or_path"])):
218+
pipeline = lora_loader.load_lora_weights(
219+
pipeline,
220+
lora_config["lora_model_name_or_path"][i],
221+
high_noise_weight_name=lora_config["high_noise_weight_name"][i],
222+
low_noise_weight_name=lora_config["low_noise_weight_name"][i],
223+
rank=lora_config["rank"][i],
224+
scale=lora_config["scale"][i],
225+
scan_layers=config.scan_layers,
226+
dtype=config.weights_dtype,
227+
)
228+
191229
s0 = time.perf_counter()
192230

193231
# Using global_batch_size_to_train_on so not to create more config variables

src/maxdiffusion/loaders/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,4 @@
1414

1515
from .lora_pipeline import StableDiffusionLoraLoaderMixin
1616
from .flux_lora_pipeline import FluxLoraLoaderMixin
17+
from .wan_lora_nnx_loader import Wan2_1NNXLoraLoader, Wan2_2NNXLoraLoader

src/maxdiffusion/loaders/lora_conversion_utils.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -608,3 +608,98 @@ def handle_qkv(sds_sd, ait_sd, sds_key, ait_keys, dims=None):
608608
raise ValueError(f"`old_state_dict` should be at this point but has: {list(old_state_dict.keys())}.")
609609

610610
return new_state_dict
611+
612+
613+
def preprocess_wan_lora_dict(state_dict):
  """Preprocesses a WAN LoRA state dict, renaming ``diff_m`` keys to ``modulation.diff``."""

  def _rename(key):
    # Some WAN LoRA checkpoints store modulation deltas under a ".diff_m"
    # suffix; rewrite those to the ".modulation.diff" naming the merge code
    # expects. All other keys pass through unchanged.
    if key.endswith(".diff_m"):
      return key.removesuffix(".diff_m") + ".modulation.diff"
    return key

  return {_rename(k): v for k, v in state_dict.items()}
625+
626+
627+
def translate_wan_nnx_path_to_diffusers_lora(nnx_path_str, scan_layers=False):
  """Translates a WAN NNX module path to its Diffusers/LoRA key.

  Verified against wan_utils.py mappings.

  Args:
    nnx_path_str: Dotted NNX module path, e.g. "blocks.0.attn1.query".
    scan_layers: If True, block paths carry no per-layer index
      ("blocks.attn1.query") and the returned key contains a "{}"
      placeholder where the layer index goes.

  Returns:
    The matching Diffusers/LoRA key string, or None when the path has no
    known mapping.
  """

  # --- 1. Embeddings and other top-level modules (exact matches) ---
  # A single dict lookup replaces the original chain of equality checks.
  exact_map = {
      "condition_embedder.text_embedder.linear_1": "diffusion_model.text_embedding.0",
      "condition_embedder.text_embedder.linear_2": "diffusion_model.text_embedding.2",
      "condition_embedder.time_embedder.linear_1": "diffusion_model.time_embedding.0",
      "condition_embedder.time_embedder.linear_2": "diffusion_model.time_embedding.2",
      "condition_embedder.image_embedder.norm1.layer_norm": "diffusion_model.img_emb.proj.0",
      "condition_embedder.image_embedder.ff.net_0": "diffusion_model.img_emb.proj.1",
      "condition_embedder.image_embedder.ff.net_2": "diffusion_model.img_emb.proj.3",
      "condition_embedder.image_embedder.norm2.layer_norm": "diffusion_model.img_emb.proj.4",
      "patch_embedding": "diffusion_model.patch_embedding",
      "proj_out": "diffusion_model.head.head",
      "scale_shift_table": "diffusion_model.head.modulation",
      "condition_embedder.time_proj": "diffusion_model.time_projection.1",
  }
  if nnx_path_str in exact_map:
    return exact_map[nnx_path_str]

  # --- 2. Map NNX suffixes (inside a transformer block) to LoRA suffixes ---
  suffix_map = {
      # Self Attention (attn1)
      "attn1.query": "self_attn.q",
      "attn1.key": "self_attn.k",
      "attn1.value": "self_attn.v",
      "attn1.proj_attn": "self_attn.o",
      # Self Attention Norms (QK Norm)
      "attn1.norm_q": "self_attn.norm_q",
      "attn1.norm_k": "self_attn.norm_k",
      # Cross Attention (attn2)
      "attn2.query": "cross_attn.q",
      "attn2.key": "cross_attn.k",
      "attn2.value": "cross_attn.v",
      "attn2.proj_attn": "cross_attn.o",
      # Cross Attention Norms (QK Norm)
      "attn2.norm_q": "cross_attn.norm_q",
      "attn2.norm_k": "cross_attn.norm_k",
      # Cross Attention img
      "attn2.add_k_proj": "cross_attn.k_img",
      "attn2.add_v_proj": "cross_attn.v_img",
      "attn2.norm_added_k": "cross_attn.norm_k_img",
      # Feed Forward (ffn)
      "ffn.act_fn.proj": "ffn.0",  # Up proj
      "ffn.proj_out": "ffn.2",  # Down proj
      # Global Norms & Modulation
      "norm2.layer_norm": "norm3",
      "adaln_scale_shift_table": "modulation",
      "proj_out": "head.head",
  }

  # --- 3. Translation Logic ---
  if scan_layers:
    # Scanned Pattern: "blocks.attn1.query" -> "diffusion_model.blocks.{}.self_attn.q"
    if nnx_path_str.startswith("blocks."):
      inner_suffix = nnx_path_str[len("blocks.") :]
      if inner_suffix in suffix_map:
        return f"diffusion_model.blocks.{{}}.{suffix_map[inner_suffix]}"
  else:
    # Unscanned Pattern: "blocks.0.attn1.query" -> "diffusion_model.blocks.0.self_attn.q"
    m = re.match(r"^blocks\.(\d+)\.(.+)$", nnx_path_str)
    if m:
      idx, inner_suffix = m.group(1), m.group(2)
      if inner_suffix in suffix_map:
        return f"diffusion_model.blocks.{idx}.{suffix_map[inner_suffix]}"

  return None
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""NNX-based LoRA loader for WAN models."""
16+
17+
from flax import nnx
18+
from .lora_base import LoRABaseMixin
19+
from .lora_pipeline import StableDiffusionLoraLoaderMixin
20+
from ..models import lora_nnx
21+
from .. import max_logging
22+
from . import lora_conversion_utils
23+
24+
25+
class Wan2_1NNXLoraLoader(LoRABaseMixin):
  """Loads LoRA weights into an NNX-based WAN 2.1 model.

  Assumes the WAN pipeline exposes a 'transformer' attribute that is an
  NNX Module.
  """

  def load_lora_weights(
      self,
      pipeline: nnx.Module,
      lora_model_path: str,
      transformer_weight_name: str,
      rank: int,
      scale: float = 1.0,
      scan_layers: bool = False,
      dtype: str = "float32",
      **kwargs,
  ):
    """Merges LoRA weights from a checkpoint into the pipeline's transformer."""
    # Guard clause: nothing to do without a target transformer and a file name.
    if not (hasattr(pipeline, "transformer") and transformer_weight_name):
      max_logging.log("transformer not found or no weight name provided for LoRA.")
      return pipeline

    # Pick the merge routine matching the transformer's layout.
    merge = lora_nnx.merge_lora_for_scanned if scan_layers else lora_nnx.merge_lora

    def translate(path):
      # Maps an NNX module path onto the LoRA checkpoint's key naming.
      return lora_conversion_utils.translate_wan_nnx_path_to_diffusers_lora(path, scan_layers=scan_layers)

    max_logging.log(f"Merging LoRA into transformer with rank={rank}")
    state_dict, _ = StableDiffusionLoraLoaderMixin().lora_state_dict(
        lora_model_path, weight_name=transformer_weight_name, **kwargs
    )
    state_dict = lora_conversion_utils.preprocess_wan_lora_dict(state_dict)
    merge(pipeline.transformer, state_dict, rank, scale, translate, dtype=dtype)

    return pipeline
62+
63+
64+
class Wan2_2NNXLoraLoader(LoRABaseMixin):
  """Loads LoRA weights into an NNX-based WAN 2.2 model.

  Assumes the WAN pipeline exposes 'high_noise_transformer' and
  'low_noise_transformer' attributes that are NNX Modules.
  """

  def load_lora_weights(
      self,
      pipeline: nnx.Module,
      lora_model_path: str,
      high_noise_weight_name: str,
      low_noise_weight_name: str,
      rank: int,
      scale: float = 1.0,
      scan_layers: bool = False,
      dtype: str = "float32",
      **kwargs,
  ):
    """Merges high- and low-noise LoRA checkpoints into the pipeline."""
    checkpoint_loader = StableDiffusionLoraLoaderMixin()

    # Pick the merge routine matching the transformers' layout.
    merge = lora_nnx.merge_lora_for_scanned if scan_layers else lora_nnx.merge_lora

    def translate(path: str):
      # Maps an NNX module path onto the LoRA checkpoint's key naming.
      return lora_conversion_utils.translate_wan_nnx_path_to_diffusers_lora(path, scan_layers=scan_layers)

    def _merge_into(attr_name, weight_name):
      # Shared path for the high- and low-noise transformers: skip with a log
      # line when the pipeline lacks the attribute or no file was configured.
      if not (hasattr(pipeline, attr_name) and weight_name):
        max_logging.log(f"{attr_name} not found or no weight name provided for LoRA.")
        return
      max_logging.log(f"Merging LoRA into {attr_name} with rank={rank}")
      state_dict, _ = checkpoint_loader.lora_state_dict(lora_model_path, weight_name=weight_name, **kwargs)
      state_dict = lora_conversion_utils.preprocess_wan_lora_dict(state_dict)
      merge(getattr(pipeline, attr_name), state_dict, rank, scale, translate, dtype=dtype)

    _merge_into("high_noise_transformer", high_noise_weight_name)
    _merge_into("low_noise_transformer", low_noise_weight_name)

    return pipeline

0 commit comments

Comments
 (0)