
Commit aeca1ce

LoRA support for Modulation layer in WAN2.2
1 parent 9b5051c commit aeca1ce

3 files changed

Lines changed: 23 additions & 42 deletions


src/maxdiffusion/loaders/lora_conversion_utils.py

Lines changed: 15 additions & 1 deletion
@@ -610,6 +610,20 @@ def handle_qkv(sds_sd, ait_sd, sds_key, ait_keys, dims=None):
   return new_state_dict
 
 
+def preprocess_wan_lora_dict(state_dict):
+  """
+  Preprocesses WAN LoRA dict to convert diff_m to modulation.diff.
+  """
+  new_d = {}
+  for k, v in state_dict.items():
+    if k.endswith(".diff_m"):
+      new_k = k.removesuffix(".diff_m") + ".modulation.diff"
+      new_d[new_k] = v
+    else:
+      new_d[k] = v
+  return new_d
+
+
 def translate_wan_nnx_path_to_diffusers_lora(nnx_path_str, scan_layers=False):
   """
   Translates WAN NNX path to Diffusers/LoRA keys.
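For illustration, the new preprocess_wan_lora_dict above simply rekeys modulation entries so the existing ".diff" merge path picks them up. A minimal, self-contained example; the key names, shapes, and import path are assumptions, only the ".diff_m" suffix handling comes from the diff:

import numpy as np
# Import path assumes this repo's package layout.
from maxdiffusion.loaders.lora_conversion_utils import preprocess_wan_lora_dict

# Hypothetical key names and shapes, for illustration only.
state_dict = {
    "blocks.0.diff_m": np.zeros((6, 1536), dtype=np.float32),
    "blocks.0.attn1.to_q.lora_down.weight": np.zeros((16, 1536), dtype=np.float32),
}

out = preprocess_wan_lora_dict(state_dict)
print(sorted(out))
# ['blocks.0.attn1.to_q.lora_down.weight', 'blocks.0.modulation.diff']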
@@ -667,7 +681,7 @@ def translate_wan_nnx_path_to_diffusers_lora(nnx_path_str, scan_layers=False):
       "ffn.proj_out": "ffn.2",  # Down proj
       # Global Norms & Modulation
       "norm2.layer_norm": "norm3",
-      "scale_shift_table": "modulation",
+      "adaln_scale_shift_table": "modulation",
       "proj_out": "head.head",
   }
 
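The renamed NNX parameter adaln_scale_shift_table now resolves to the "modulation" key used by Diffusers-style WAN LoRA files. Below is a hypothetical sketch of how a component table like this can be applied to a dotted path; the helper and its substitution strategy are assumptions, not the actual translate_wan_nnx_path_to_diffusers_lora implementation:

# Hypothetical helper for illustration; entries copied from the table above.
NNX_TO_DIFFUSERS = {
    "ffn.proj_out": "ffn.2",
    "norm2.layer_norm": "norm3",
    "adaln_scale_shift_table": "modulation",
    "proj_out": "head.head",
}

def translate_component(nnx_path: str) -> str:
  # Longest keys first so "ffn.proj_out" wins over the bare "proj_out" entry.
  for src, dst in sorted(NNX_TO_DIFFUSERS.items(), key=lambda kv: -len(kv[0])):
    if src in nnx_path:
      return nnx_path.replace(src, dst)
  return nnx_path

print(translate_component("blocks.0.adaln_scale_shift_table"))  # blocks.0.modulation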

src/maxdiffusion/loaders/wan_lora_nnx_loader.py

Lines changed: 4 additions & 2 deletions
@@ -19,7 +19,7 @@
 from .lora_pipeline import StableDiffusionLoraLoaderMixin
 from ..models import lora_nnx
 from .. import max_logging
-from . import lora_conversion_utils
+from . import lora_conversion_utils, preprocess_wan_lora_dict
 
 
 class Wan2_1NNXLoraLoader(LoRABaseMixin):
@@ -50,10 +50,10 @@ def load_lora_weights(
     def translate_fn(nnx_path_str):
       return lora_conversion_utils.translate_wan_nnx_path_to_diffusers_lora(nnx_path_str, scan_layers=scan_layers)
 
-    # Handle high noise model
     if hasattr(pipeline, "transformer") and transformer_weight_name:
       max_logging.log(f"Merging LoRA into transformer with rank={rank}")
       h_state_dict, _ = lora_loader.lora_state_dict(lora_model_path, weight_name=transformer_weight_name, **kwargs)
+      h_state_dict = preprocess_wan_lora_dict(h_state_dict)
       merge_fn(pipeline.transformer, h_state_dict, rank, scale, translate_fn, dtype=dtype)
     else:
       max_logging.log("transformer not found or no weight name provided for LoRA.")
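A rough usage sketch of this code path. The keyword names lora_model_path, transformer_weight_name, rank, and scale all appear in the diff, but the full load_lora_weights signature, the call style, and the file name are assumptions:

# Hypothetical call; argument names beyond those visible in the diff are assumptions.
loader = Wan2_1NNXLoraLoader()
loader.load_lora_weights(
    pipeline,                                    # a WAN 2.1 pipeline with a .transformer attribute
    lora_model_path="some-org/some-wan-lora",    # hypothetical repo or local path
    transformer_weight_name="lora.safetensors",  # hypothetical file name
    rank=32,
    scale=1.0,
)
# Internally: lora_state_dict(...) -> preprocess_wan_lora_dict(...) -> merge_fn(pipeline.transformer, ...)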
@@ -94,6 +94,7 @@ def translate_fn(nnx_path_str: str):
     if hasattr(pipeline, "high_noise_transformer") and high_noise_weight_name:
       max_logging.log(f"Merging LoRA into high_noise_transformer with rank={rank}")
       h_state_dict, _ = lora_loader.lora_state_dict(lora_model_path, weight_name=high_noise_weight_name, **kwargs)
+      h_state_dict = preprocess_wan_lora_dict(h_state_dict)
       merge_fn(pipeline.high_noise_transformer, h_state_dict, rank, scale, translate_fn, dtype=dtype)
     else:
       max_logging.log("high_noise_transformer not found or no weight name provided for LoRA.")
@@ -102,6 +103,7 @@ def translate_fn(nnx_path_str: str):
     if hasattr(pipeline, "low_noise_transformer") and low_noise_weight_name:
       max_logging.log(f"Merging LoRA into low_noise_transformer with rank={rank}")
       l_state_dict, _ = lora_loader.lora_state_dict(lora_model_path, weight_name=low_noise_weight_name, **kwargs)
+      l_state_dict = preprocess_wan_lora_dict(l_state_dict)
       merge_fn(pipeline.low_noise_transformer, l_state_dict, rank, scale, translate_fn, dtype=dtype)
     else:
       max_logging.log("low_noise_transformer not found or no weight name provided for LoRA.")

src/maxdiffusion/models/lora_nnx.py

Lines changed: 4 additions & 39 deletions
@@ -29,7 +29,7 @@
 
 
 @jax.jit
-def _compute_and_add_single_jit(kernel, bias, down, up, scale, w_diff, b_diff, m_diff):
+def _compute_and_add_single_jit(kernel, bias, down, up, scale, w_diff, b_diff):
   """
   Applies LoRA + Weight Diff + Bias Diff on device.
   """
@@ -48,17 +48,11 @@ def _compute_and_add_single_jit(kernel, bias, down, up, scale, w_diff, b_diff, m_diff):
   if bias is not None and b_diff is not None:
     bias = bias + b_diff.astype(bias.dtype)
 
-  # 4. Apply DoRA magnitude vector
-  if m_diff is not None:
-    kernel = kernel * m_diff.astype(kernel.dtype)
-
   return kernel, bias
 
 
 @jax.jit
-def _compute_and_add_scanned_jit(
-    kernel, downs, ups, alphas, global_scale, w_diffs=None, b_diffs=None, bias=None, m_diff=None
-):
+def _compute_and_add_scanned_jit(kernel, downs, ups, alphas, global_scale, w_diffs=None, b_diffs=None, bias=None):
   """
   Applies scanned LoRA + Diffs.
   """
@@ -80,14 +74,6 @@ def _compute_and_add_scanned_jit(
   if bias is not None and b_diffs is not None:
     bias = bias + b_diffs.astype(bias.dtype)
 
-  # 4. Apply DoRA magnitude vector
-  if m_diff is not None:
-    # Reshape for broadcasting with kernel
-    # kernel shape can be (L, In, Out) or (L, H, W, In, Out)
-    # m_diff shape is (L, Out)
-    new_shape = [m_diff.shape[0]] + [1] * (kernel.ndim - 2) + [m_diff.shape[1]]
-    kernel = kernel * m_diff.reshape(new_shape).astype(kernel.dtype)
-
   return kernel, bias
 
 
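With the DoRA magnitude branch removed from both kernels, the merge is purely additive. Below is a minimal, self-contained sketch of that idea for a single layer; the (in, rank) @ (rank, out) layout and contraction order are assumptions rather than the exact code in _compute_and_add_single_jit:

import jax.numpy as jnp

def merge_single_sketch(kernel, bias, down, up, scale, w_diff=None, b_diff=None):
  """Illustrative only: additive LoRA + weight diff + bias diff, no magnitude scaling."""
  if down is not None and up is not None:
    kernel = kernel + scale * (down @ up).astype(kernel.dtype)  # assumed (in, r) @ (r, out) layout
  if w_diff is not None:
    kernel = kernel + w_diff.astype(kernel.dtype)
  if bias is not None and b_diff is not None:
    bias = bias + b_diff.astype(bias.dtype)
  return kernel, bias

kernel = jnp.zeros((8, 8))
bias = jnp.zeros((8,))
down, up = jnp.full((8, 2), 0.1), jnp.full((2, 8), 0.1)
new_kernel, new_bias = merge_single_sketch(kernel, bias, down, up, scale=1.0)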
@@ -135,14 +121,6 @@ def parse_lora_dict(state_dict, dtype):
       lora_params[key_base]["diff"] = _to_jax_array(v, dtype=dtype)
       continue
 
-    # DoRA Magnitude (e.g., "layer.diff_m")
-    if k.endswith(".diff_m"):
-      key_base = k[: -len(".diff_m")]
-      if key_base not in lora_params:
-        lora_params[key_base] = {}
-      lora_params[key_base]["diff_m"] = _to_jax_array(v, dtype=dtype)
-      continue
-
     # Standard LoRA
     m = re.match(r"^(.*?)\.(lora_down|lora_up)\.weight$", k)
     if not m:
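After the upstream rekeying, parse_lora_dict only needs its ".diff" branch plus the standard LoRA regex above. A quick self-contained check of what that regex does and does not match; the key names are made up for illustration:

import re

pattern = re.compile(r"^(.*?)\.(lora_down|lora_up)\.weight$")

m = pattern.match("blocks.0.attn1.to_q.lora_down.weight")  # hypothetical key name
print(m.group(1), m.group(2))                              # blocks.0.attn1.to_q lora_down

print(pattern.match("blocks.0.modulation.diff"))  # None -> handled by the ".diff" branch above
print(pattern.match("blocks.0.diff_m"))           # None -> rewritten upstream by preprocess_wan_lora_dict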
@@ -205,7 +183,6 @@ def _merge_lora_layer(module, weights, scale):
   # Prepare Diff terms
   w_diff = weights.get("diff", None)
   b_diff = weights.get("diff_b", None)
-  m_diff = weights.get("diff_m", None)
 
   if w_diff is not None:
     w_diff = np.array(w_diff)
@@ -219,8 +196,6 @@
       w_diff = w_diff.transpose((1, 0))
   if b_diff is not None:
     b_diff = np.array(b_diff)
-  if m_diff is not None:
-    m_diff = np.array(m_diff)
 
   # If LoCON, compute delta and add to w_diff
   if is_conv_kxk_locon:
@@ -247,9 +222,9 @@
   bias_val = module.bias.value if module.bias is not None else None
 
   # --- EXECUTE JIT UPDATE ---
-  if down_w is not None or w_diff is not None or b_diff is not None or m_diff is not None:
+  if down_w is not None or w_diff is not None or b_diff is not None:
     new_kernel, new_bias = _compute_and_add_single_jit(
-        module.kernel.value, bias_val, down_w, up_w, current_scale, w_diff, b_diff, m_diff
+        module.kernel.value, bias_val, down_w, up_w, current_scale, w_diff, b_diff
     )
 
     module.kernel.value = new_kernel
@@ -404,7 +379,6 @@ def merge_lora_for_scanned(
   # Initialize as None, allocate only if found to save memory
   stack_w_diff = None
   stack_b_diff = None
-  stack_m_diff = None
 
   has_lora = False
   has_diff = False
@@ -415,14 +389,6 @@ def merge_lora_for_scanned(
       matched_keys.add(lora_key)
       w = lora_params[lora_key]
 
-      # --- Fill DoRA Magnitude ---
-      if "m_diff" in w:
-        if stack_m_diff is None:
-          stack_m_diff = np.ones((num_layers, out_feat), dtype=np.float32)
-        dm = np.array(w["m_diff"])
-        stack_m_diff[i] = dm.flatten()
-        has_diff = True
-
       # --- Fill LoRA ---
       if "down" in w:
         d, u = np.array(w["down"]), np.array(w["up"])
@@ -494,7 +460,6 @@ def merge_lora_for_scanned(
         stack_w_diff,
         stack_b_diff,
         bias_val,
-        stack_m_diff,
       )
 
       module.kernel.value = new_k
