
Commit 6042d24

Fix
1 parent d52e25d commit 6042d24

2 files changed, 61 additions & 42 deletions


src/maxdiffusion/configs/base_wan_lora_27b.yml

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ replicate_vae: False
 # at the cost of time.
 precision: "DEFAULT"
 # Use jax.lax.scan for transformer layers
-scan_layers: True
+scan_layers: False

 # if False state is not jitted and instead replicate is called. This is good for debugging on single host
 # It must be True for multi-host.
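This flag flip is what the new merge path in lora_nnx.py relies on: per the updated docstring below, merge_lora now assumes scan_layers=False, so each WAN transformer block appears as an individually indexed NNX submodule with paths like blocks.10.attn1.query rather than a single scanned stack. A toy sketch (stand-in modules, not the real WAN transformer) of the kind of per-block paths this produces:

# Toy illustration only: stand-in modules, not the real WAN transformer.
# With unscanned (per-block) layers, every Linear is reachable at a path such as
# "blocks.<idx>.attn1.query", which is the path shape the new regex in lora_nnx.py matches.
from flax import nnx

class ToyAttention(nnx.Module):
  def __init__(self, rngs: nnx.Rngs):
    self.query = nnx.Linear(8, 8, rngs=rngs)
    self.key = nnx.Linear(8, 8, rngs=rngs)

class ToyBlock(nnx.Module):
  def __init__(self, rngs: nnx.Rngs):
    self.attn1 = ToyAttention(rngs)

class ToyModel(nnx.Module):
  def __init__(self, rngs: nnx.Rngs):
    self.blocks = [ToyBlock(rngs) for _ in range(2)]

model = ToyModel(nnx.Rngs(0))
for path, module in nnx.iter_graph(model):
  if isinstance(module, nnx.Linear):
    print(".".join(map(str, path)))  # blocks.0.attn1.query, blocks.0.attn1.key, blocks.1.attn1.query, ...

With scan_layers: True the blocks would instead be stacked under a single scanned layer, and the per-block paths above would not line up with the per-block LoRA keys the merge expects.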

src/maxdiffusion/models/lora_nnx.py

Lines changed: 60 additions & 41 deletions
@@ -200,42 +200,73 @@ def inject_lora(
 
   return model
 
+def _translate_nnx_path_to_lora_key(nnx_path_str):
+  """
+  Translates NNX path like 'blocks.10.attn1.key' to
+  LoRA path like 'diffusion_model.blocks.10.self_attn.k'.
+  Returns None if no match.
+  """
+  translation_map = {
+      "attn1": "self_attn",
+      "attn2": "cross_attn",
+      "query": "q",
+      "key": "k",
+      "value": "v",
+      "proj_attn": "o",
+      "ffn.act_fn.proj": "ffn.0",
+      "ffn.proj_out": "ffn.2",
+  }
+  # Match paths like blocks.10.attn1.key or blocks.5.ffn.proj_out
+  m = re.match(r"^blocks\.(\d+)\.(attn[12]\.(?:query|key|value|proj_attn)|ffn\.(?:act_fn\.proj|proj_out))$", nnx_path_str)
+  if not m:
+    return None
+
+  block_idx, suffix = m.group(1), m.group(2)
+
+  parts = suffix.split('.')
+  if parts[0] == 'attn1' or parts[0] == 'attn2':
+    lora_part1 = translation_map[parts[0]]
+    lora_part2 = translation_map[parts[1]]
+    return f"diffusion_model.blocks.{block_idx}.{lora_part1}.{lora_part2}"
+  elif suffix in translation_map:
+    return f"diffusion_model.blocks.{block_idx}.{translation_map[suffix]}"
+  return None
+
+
 def merge_lora(model: nnx.Module, state_dict: dict, scale: float):
   """
   Merges weights from a Diffusers-formatted state dict directly
   into the kernel of nnx.Linear and nnx.Conv layers.
+  Assumes scan_layers=False, so NNX paths include block indices.
   """
   lora_params = {}
   # Parse weights and alphas
   for k, v in state_dict.items():
     if k.endswith(".alpha"):
-      module_path_str = k[: -len(".alpha")]
-      if module_path_str not in lora_params:
-        lora_params[module_path_str] = {}
-      lora_params[module_path_str]["alpha"] = jnp.array(v)
+      key_base = k[:-len(".alpha")]
+      if key_base not in lora_params:
+        lora_params[key_base] = {}
+      lora_params[key_base]["alpha"] = jnp.array(v)
       continue
 
-    # Try matching diffusers rename format: "some.thing_lora.down.weight"
     m = re.match(r"^(.*?)_lora\.(down|up)\.weight$", k)
     if m:
-      module_path_str, weight_type = m.group(1), m.group(2)
+      key_base, weight_type = m.group(1), m.group(2)
     else:
-      # Try matching diffusers format: "some.thing.lora.down.weight"
       m = re.match(r"^(.*?)\.lora\.(down|up)\.weight$", k)
       if m:
-        module_path_str, weight_type = m.group(1), m.group(2)
+        key_base, weight_type = m.group(1), m.group(2)
       else:
-        # Try matching kohya/lightning format: "some.thing.lora_down.weight"
         m = re.match(r"^(.*?)\.(lora_down|lora_up)\.weight$", k)
         if m:
-          module_path_str, weight_type = m.group(1), m.group(2).replace("lora_", "")
+          key_base, weight_type = m.group(1), m.group(2).replace("lora_", "")
         else:
           max_logging.log(f"Could not parse LoRA key: {k}")
           continue
-    if module_path_str not in lora_params:
-      lora_params[module_path_str] = {}
-    lora_params[module_path_str][weight_type] = jnp.array(v)
-  max_logging.log(f"Parsed {len(lora_params)} unique LoRA module keys: {list(lora_params.keys())}")
+    if key_base not in lora_params:
+      lora_params[key_base] = {}
+    lora_params[key_base][weight_type] = jnp.array(v)
+  max_logging.log(f"Parsed {len(lora_params)} unique LoRA module keys.")
 
   assigned_count = 0
   for path, module in nnx.iter_graph(model):
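For reference, the expected behavior of the new helper on a few example paths, derived directly from the translation map and regex above (meant to run inside lora_nnx.py, where the function and re are already in scope):

# Example translations, derived from translation_map and the regex above.
print(_translate_nnx_path_to_lora_key("blocks.10.attn1.key"))        # diffusion_model.blocks.10.self_attn.k
print(_translate_nnx_path_to_lora_key("blocks.3.attn2.proj_attn"))   # diffusion_model.blocks.3.cross_attn.o
print(_translate_nnx_path_to_lora_key("blocks.5.ffn.proj_out"))      # diffusion_model.blocks.5.ffn.2
print(_translate_nnx_path_to_lora_key("blocks.0.norm1.scale"))       # None (not a LoRA target path)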
@@ -245,48 +276,36 @@ def merge_lora(model: nnx.Module, state_dict: dict, scale: float):
 
     nnx_path_str = ".".join(map(str, path))
     max_logging.log(f"Checking NNX layer: {nnx_path_str}")
+    lora_key = _translate_nnx_path_to_lora_key(nnx_path_str)
 
-    matched_key = None
-    if nnx_path_str in lora_params:
-      matched_key = nnx_path_str
-    else:
-      # Fallback: check if any param key is a suffix of nnx path
-      for k in lora_params:
-        if nnx_path_str.endswith(k):
-          matched_key = k
-          max_logging.log(f"NNX path '{nnx_path_str}' matched LoRA key '{k}' via suffix.")
-          break
-    max_logging.log(f"Layer: {nnx_path_str}, Matched LoRA key: {matched_key}")
-
-    if matched_key and matched_key in lora_params:
-      weights = lora_params[matched_key]
+    if lora_key and lora_key in lora_params:
+      max_logging.log(f"NNX layer '{nnx_path_str}' matched LoRA key '{lora_key}'")
+      weights = lora_params[lora_key]
       if "down" in weights and "up" in weights:
         if isinstance(module, nnx.Linear):
-          down_w = weights["down"]  # (rank, in_features)
-          up_w = weights["up"]  # (out_features_flat, rank)
+          down_w, up_w = weights["down"], weights["up"]
           rank = down_w.shape[0]
           alpha = weights.get("alpha", rank)
           current_scale = scale * alpha / rank
-          # delta = A@B = down.T @ up.T
           delta = (down_w.T @ up_w.T).reshape(module.kernel.shape)
           module.kernel.value += delta * current_scale
           assigned_count +=1
         elif isinstance(module, nnx.Conv):
-          if module.kernel_size == (1, 1):
-            down_w = weights["down"]  # (1,1,in_c,rank)
-            up_w = weights["up"]  # (1,1,rank,out_c)
+          if module.kernel_size == (1, 1):
+            down_w, up_w = weights["down"], weights["up"]
             rank = down_w.shape[-1]
             alpha = weights.get("alpha", rank)
             current_scale = scale * alpha / rank
-            # delta = down @ up for channel dimension
            delta = (jnp.squeeze(down_w) @ jnp.squeeze(up_w)).reshape(module.kernel.shape)
             module.kernel.value += delta * current_scale
             assigned_count += 1
-          else:
-            raise NotImplementedError(
-                f"Merging LoRA weights for Conv layer {matched_key} "
-                f"with kernel_size {module.kernel_size} > 1 is not supported."
-            )
+          else:
+            raise NotImplementedError(f"Conv merge only for 1x1 kernels, got {module.kernel_size}")
       else:
-        max_logging.log(f"LoRA weights for {matched_key} incomplete.")
+        max_logging.warning(f"LoRA weights for {lora_key} incomplete: missing down or up weights.")
+    elif lora_key:
+      max_logging.warning(f"NNX layer '{nnx_path_str}' translated to '{lora_key}' but key not in lora_params.")
+    else:
+      max_logging.debug(f"NNX layer '{nnx_path_str}' could not be translated to a LoRA key.")
+
   max_logging.log(f"Merged weights into {assigned_count} layers in {type(model).__name__}.")
