
Commit d4e22a4

Fix
1 parent 95c985b commit d4e22a4

2 files changed: 60 additions & 57 deletions

src/maxdiffusion/loaders/lora_conversion_utils.py

Lines changed: 45 additions & 41 deletions
@@ -612,16 +612,11 @@ def handle_qkv(sds_sd, ait_sd, sds_key, ait_keys, dims=None):
 
 def translate_wan_nnx_path_to_diffusers_lora(nnx_path_str, scan_layers=False):
   """
-  Translates WAN NNX path like 'blocks.10.attn1.key' (scan_layers=False) or
-  'blocks.attn1.key' (scan_layers=True) to
-  LoRA path like 'diffusion_model.blocks.10.self_attn.k' or
-  template 'diffusion_model.blocks.{}.self_attn.k'.
-  Returns None if no match.
+  Translates a WAN NNX path to a Diffusers/LoRA key.
+  Verified against wan_utils.py mappings.
   """
 
-  # Handle embeddings - exact paths
-  if nnx_path_str == "patch_embedding":
-    return "diffusion_model.patch_embedding"
+  # --- 1. Embeddings (Exact Matches) ---
   if nnx_path_str == 'condition_embedder.text_embedder.linear_1':
     return 'diffusion_model.text_embedding.0'
   if nnx_path_str == 'condition_embedder.text_embedder.linear_2':
@@ -630,46 +625,55 @@ def translate_wan_nnx_path_to_diffusers_lora(nnx_path_str, scan_layers=False):
     return 'diffusion_model.time_embedding.0'
   if nnx_path_str == 'condition_embedder.time_embedder.linear_2':
     return 'diffusion_model.time_embedding.2'
-
-  # Translation for Attention and FFN layers
-  attn_ffn_map = {
-      "attn1.query": "self_attn.q",
-      "attn1.key": "self_attn.k",
-      "attn1.value": "self_attn.v",
+  if nnx_path_str == 'patch_embedding':
+    return 'diffusion_model.patch_embedding'
+
+  # --- 2. Map NNX Suffixes to LoRA Suffixes ---
+  suffix_map = {
+      # Self Attention (attn1)
+      "attn1.query": "self_attn.q",
+      "attn1.key": "self_attn.k",
+      "attn1.value": "self_attn.v",
       "attn1.proj_attn": "self_attn.o",
-      "attn2.query": "cross_attn.q",
-      "attn2.key": "cross_attn.k",
-      "attn2.value": "cross_attn.v",
+
+      # Self Attention Norms (QK Norm)
+      "attn1.norm_q": "self_attn.norm_q",
+      "attn1.norm_k": "self_attn.norm_k",
+
+      # Cross Attention (attn2)
+      "attn2.query": "cross_attn.q",
+      "attn2.key": "cross_attn.k",
+      "attn2.value": "cross_attn.v",
       "attn2.proj_attn": "cross_attn.o",
-      "ffn.act_fn.proj": "ffn.0",
-      "ffn.proj_out": "ffn.2",
-  }
-  # Translation for Norm layers
-  norm_map = {
-      "norm3": "norm3",
-      "attn1.norm_q": "self_attn.norm_q",
-      "attn1.norm_k": "self_attn.norm_k",
-      "attn2.norm_q": "cross_attn.norm_q",
-      "attn2.norm_k": "cross_attn.norm_k",
+
+      # Cross Attention Norms (QK Norm)
+      "attn2.norm_q": "cross_attn.norm_q",
+      "attn2.norm_k": "cross_attn.norm_k",
+
+      # Feed Forward (ffn)
+      "ffn.act_fn.proj": "ffn.0",  # Up proj
+      "ffn.proj_out": "ffn.2",  # Down proj
+
+      # Global Norms & Modulation
+      "norm2.layer_norm": "norm3",
+      "scale_shift_table": "modulation",
+      "proj_out": "head.head"
   }
 
+  # --- 3. Translation Logic ---
   if scan_layers:
-    # Handle scanned attn/ffn: blocks.attn1.query -> diffusion_model.blocks.{}.self_attn.q
-    for k, v in attn_ffn_map.items():
-      if nnx_path_str == f"blocks.{k}":
-        return f"diffusion_model.blocks.{{}}.{v}"
-    # Handle scanned norm: blocks.norm3 -> diffusion_model.blocks.{}.norm3
-    for k, v in norm_map.items():
-      if nnx_path_str == f"blocks.{k}":
-        return f"diffusion_model.blocks.{{}}.{v}"
+    # Scanned pattern: "blocks.attn1.query" -> "diffusion_model.blocks.{}.self_attn.q"
+    if nnx_path_str.startswith("blocks."):
+      inner_suffix = nnx_path_str[len("blocks."):]
+      if inner_suffix in suffix_map:
+        return f"diffusion_model.blocks.{{}}.{suffix_map[inner_suffix]}"
   else:
-    # Handle non-scanned attn/ffn/norm: blocks.0.attn1.query -> diffusion_model.blocks.0.self_attn.q
+    # Unscanned pattern: "blocks.0.attn1.query" -> "diffusion_model.blocks.0.self_attn.q"
     m = re.match(r"^blocks\.(\d+)\.(.+)$", nnx_path_str)
     if m:
-      idx, suffix = m.group(1), m.group(2)
-      if suffix in attn_ffn_map:
-        return f"diffusion_model.blocks.{idx}.{attn_ffn_map[suffix]}"
-      if suffix in norm_map:
-        return f"diffusion_model.blocks.{idx}.{norm_map[suffix]}"
+      idx, inner_suffix = m.group(1), m.group(2)
+      if inner_suffix in suffix_map:
+        return f"diffusion_model.blocks.{idx}.{suffix_map[inner_suffix]}"
 
   return None
+
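A minimal usage sketch of the rewritten translator is below, assuming the module import path follows the file location; the example paths and the block index 3 are illustrative, and the expected strings follow from the suffix_map above.

# Illustrative sketch only: expected translations under the new suffix_map.
# The import path is assumed from the file location in this repo.
from maxdiffusion.loaders.lora_conversion_utils import translate_wan_nnx_path_to_diffusers_lora

# Unscanned layers (scan_layers=False): the block index is preserved.
assert translate_wan_nnx_path_to_diffusers_lora(
    "blocks.10.attn1.key", scan_layers=False
) == "diffusion_model.blocks.10.self_attn.k"

# Scanned layers (scan_layers=True): a template with a {} placeholder is returned.
assert translate_wan_nnx_path_to_diffusers_lora(
    "blocks.attn1.key", scan_layers=True
) == "diffusion_model.blocks.{}.self_attn.k"

# Newly covered keys: QK norms, norm3 (NNX norm2.layer_norm), and modulation.
assert translate_wan_nnx_path_to_diffusers_lora(
    "blocks.3.scale_shift_table", scan_layers=False
) == "diffusion_model.blocks.3.modulation"

# Anything outside the embedding and block mappings falls through to None.
assert translate_wan_nnx_path_to_diffusers_lora("unrelated.path") is None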

src/maxdiffusion/models/lora_nnx.py

Lines changed: 15 additions & 16 deletions
@@ -61,8 +61,8 @@ def _compute_and_add_scanned_jit(kernel, downs, ups, alphas, global_scale, w_dif
   scales = (global_scale * alphas / rank)
   # Batch Matmul: (L, In, Out)
   delta = jnp.matmul(jnp.swapaxes(downs, 1, 2), jnp.swapaxes(ups, 1, 2))
-  delta = delta.reshape(kernel.shape)
-  kernel = kernel + (delta * scales).astype(kernel.dtype)
+  delta = (delta * scales).astype(kernel.dtype)
+  kernel = kernel + delta.reshape(kernel.shape)
 
   # 2. Apply Scanned Weight Diffs (L, ...)
   if w_diffs is not None:
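A shape-level sketch of the reordered scaling follows. The shapes are assumptions for illustration only (per-layer scales of shape (L, 1, 1), a scanned delta of shape (L, In, Out), and a higher-rank scanned kernel layout); under those assumptions, scaling and casting before the reshape is what keeps the per-layer broadcast valid.

# Assumed shapes, for illustration only (not taken from the repo).
import jax.numpy as jnp

L, rank, kh, kw, cin, cout = 4, 8, 3, 3, 16, 32
kernel = jnp.zeros((L, kh, kw, cin, cout), dtype=jnp.bfloat16)  # hypothetical scanned kernel
downs = jnp.ones((L, rank, kh * kw * cin))                      # stacked LoRA down weights
ups = jnp.ones((L, cout, rank))                                 # stacked LoRA up weights
scales = jnp.full((L, 1, 1), 0.5)                               # global_scale * alphas / rank, per layer

delta = jnp.matmul(jnp.swapaxes(downs, 1, 2), jnp.swapaxes(ups, 1, 2))  # (L, In, Out)

# Old order: reshaping first yields a 5-D delta, and the (L, 1, 1) scales no longer
# broadcast against (L, kh, kw, cin, cout), so the multiply would fail at that point.
# New order: scale and cast while delta is still (L, In, Out), then reshape and add.
delta = (delta * scales).astype(kernel.dtype)
kernel = kernel + delta.reshape(kernel.shape)
print(kernel.shape, kernel.dtype)  # (4, 3, 3, 16, 32) bfloat16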
@@ -227,7 +227,6 @@ def merge_lora(model: nnx.Module, state_dict: dict, scale: float, translate_fn=N
     if not isinstance(module, (nnx.Linear, nnx.Conv)): continue
 
     nnx_path_str = ".".join(map(str, path))
-    max_logging.log(f"NNX path: {nnx_path_str}")
     lora_key = translate_fn(nnx_path_str) if translate_fn else None
 
     if lora_key and lora_key in lora_params:
@@ -236,19 +235,19 @@ def merge_lora(model: nnx.Module, state_dict: dict, scale: float, translate_fn=N
       # Prepare LoRA terms
       down_w, up_w, current_scale = None, None, None
       if "down" in weights and "up" in weights:
-        down_w, up_w = weights["down"], weights["up"]
-        down_w, up_w = np.array(down_w), np.array(up_w)  # CPU convert
-
-        # Squeeze dimensions if needed (Conv 1x1 or Linear)
-        if isinstance(module, nnx.Conv) and module.kernel_size == (1, 1):
-          down_w, up_w = np.squeeze(down_w), np.squeeze(up_w)
-        elif isinstance(module, nnx.Conv) and module.kernel_size != (1, 1):
-          # Skip LoRA for non-1x1 convs if shapes don't align
-          pass
-
-        rank = down_w.shape[0] if down_w.ndim > 0 else 0
-        alpha = float(weights.get("alpha", rank))
-        current_scale = scale * alpha / rank
+        if isinstance(module, nnx.Conv) and module.kernel_size != (1, 1):
+          max_logging.log(f"Skipping LoRA merge for non-1x1 Conv: {lora_key}")
+        else:
+          down_w, up_w = weights["down"], weights["up"]
+          down_w, up_w = np.array(down_w), np.array(up_w)  # CPU convert
+
+          # Squeeze dimensions if needed (Conv 1x1 or Linear)
+          if isinstance(module, nnx.Conv) and module.kernel_size == (1, 1):
+            down_w, up_w = np.squeeze(down_w), np.squeeze(up_w)
+
+          rank = down_w.shape[0] if down_w.ndim > 0 else 0
+          alpha = float(weights.get("alpha", rank))
+          current_scale = scale * alpha / rank
 
       # Prepare Diff terms
       w_diff = weights.get("diff", None)
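For reference, a small sketch of the per-layer scale this branch computes, and of how a scaled Linear delta would typically be formed from it. The (rank, in) / (out, rank) layouts and the final matmul are assumptions here, since the actual delta is built in the JIT compute helpers.

# Illustrative numbers and layouts only; the real merge happens in the JIT helpers.
import numpy as np

scale = 1.0                      # user-requested LoRA strength passed to merge_lora
down_w = np.random.randn(8, 64)  # assumed (rank, in_features) layout
up_w = np.random.randn(128, 8)   # assumed (out_features, rank) layout

rank = down_w.shape[0]
alpha = 8.0                      # falls back to the rank when the checkpoint has no alpha
current_scale = scale * alpha / rank

# A typical merged delta for a Linear kernel in (in, out) layout.
delta = (down_w.T @ up_w.T) * current_scale
print(delta.shape, current_scale)  # (64, 128) 1.0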
