@@ -225,7 +225,6 @@ def merge_lora(model: nnx.Module, state_dict: dict, scale: float, translate_fn=N
   assigned_count = 0
   for path, module in nnx.iter_graph(model):
     if not isinstance(module, (nnx.Linear, nnx.Conv, nnx.LayerNorm, nnx.RMSNorm, nnx.Embed)):
-      max_logging.log(f"Skipping non-supported module type: {module}")
       continue

     nnx_path_str = ".".join(map(str, path))
@@ -234,6 +233,10 @@ def merge_lora(model: nnx.Module, state_dict: dict, scale: float, translate_fn=N
     if lora_key and lora_key in lora_params:
       weights = lora_params[lora_key]

+      is_conv_kxk_locon = False
+      if isinstance(module, nnx.Conv) and module.kernel_size != (1, 1) and "down" in weights and "up" in weights:
+        is_conv_kxk_locon = True
+
       # Handle Embeddings
       if isinstance(module, nnx.Embed):
         if "diff" in weights and hasattr(module, 'embedding'):
@@ -257,20 +260,17 @@ def merge_lora(model: nnx.Module, state_dict: dict, scale: float, translate_fn=N
 
       # Prepare LoRA terms
       down_w, up_w, current_scale = None, None, None
-      if "down" in weights and "up" in weights:
-        if isinstance(module, nnx.Conv) and module.kernel_size != (1, 1):
-          max_logging.log(f"Skipping LoRA merge for non-1x1 Conv: {lora_key}")
-        else:
-          down_w, up_w = weights["down"], weights["up"]
-          down_w, up_w = np.array(down_w), np.array(up_w)  # CPU convert
-
-          # Squeeze dimensions if needed (Conv 1x1 or Linear)
-          if isinstance(module, nnx.Conv) and module.kernel_size == (1, 1):
-            down_w, up_w = np.squeeze(down_w), np.squeeze(up_w)
+      if "down" in weights and "up" in weights and not is_conv_kxk_locon:
+        down_w, up_w = weights["down"], weights["up"]
+        down_w, up_w = np.array(down_w), np.array(up_w)  # CPU convert
+
+        # Squeeze dimensions if needed (Conv 1x1 or Linear)
+        if isinstance(module, nnx.Conv) and module.kernel_size == (1, 1):
+          down_w, up_w = np.squeeze(down_w), np.squeeze(up_w)

-          rank = down_w.shape[0] if down_w.ndim > 0 else 0
-          alpha = float(weights.get("alpha", rank))
-          current_scale = scale * alpha / rank
+        rank = down_w.shape[0] if down_w.ndim > 0 else 0
+        alpha = float(weights.get("alpha", rank))
+        current_scale = scale * alpha / rank

       # Prepare Diff terms
       w_diff = weights.get("diff", None)
@@ -288,6 +288,25 @@ def merge_lora(model: nnx.Module, state_dict: dict, scale: float, translate_fn=N
           w_diff = w_diff.transpose((1, 0))
       if b_diff is not None: b_diff = np.array(b_diff)

+      # If LoCON, compute delta and add to w_diff
+      if is_conv_kxk_locon:
+        dw, uw = np.array(weights['down']), np.array(weights['up'])
+        rank, in_c, *k_dims = dw.shape
+        out_c = uw.shape[0]
+        alpha = float(weights.get("alpha", rank))
+
+        delta_pt = (uw.reshape(out_c, rank) @ dw.reshape(rank, -1)).reshape(out_c, in_c, *k_dims)
+
+        # Transpose to flax
+        if delta_pt.ndim == 5: delta_fx = delta_pt.transpose((2, 3, 4, 1, 0))
+        else: delta_fx = delta_pt.transpose((2, 3, 1, 0))
+
+        lora_delta = delta_fx * (scale * alpha / rank)
+        if w_diff is None:
+          w_diff = lora_delta.astype(np.float32)
+        else:
+          w_diff += lora_delta.astype(w_diff.dtype)
+
       # Check for Bias existence
       bias_val = module.bias.value if module.bias is not None else None

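The hunk above folds a kxk LoCON convolution straight into the weight diff instead of sending it down the low-rank merge path. A minimal numpy sketch of that math follows, with a hypothetical helper name and toy shapes; nothing below is part of the PR itself.

import numpy as np

def locon_kxk_delta(down, up, alpha, scale=1.0):
  # Collapse LoCON kxk conv factors (PyTorch layout) into one Flax-layout kernel delta.
  rank, in_c, *k_dims = down.shape   # down: (rank, in_c, kh, kw)
  out_c = up.shape[0]                # up:   (out_c, rank, 1, 1)
  delta_pt = (up.reshape(out_c, rank) @ down.reshape(rank, -1)).reshape(out_c, in_c, *k_dims)
  # PyTorch OIHW -> Flax HWIO: kernel dims first, then in/out channels.
  if delta_pt.ndim == 5:
    delta_fx = delta_pt.transpose((2, 3, 4, 1, 0))
  else:
    delta_fx = delta_pt.transpose((2, 3, 1, 0))
  return delta_fx * (scale * alpha / rank)

# Toy shape check: rank 4, 3x3 conv, 8 -> 16 channels.
down = np.random.randn(4, 8, 3, 3).astype(np.float32)
up = np.random.randn(16, 4, 1, 1).astype(np.float32)
assert locon_kxk_delta(down, up, alpha=4.0).shape == (3, 3, 8, 16)  # HWIO, addable onto module.kernel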
@@ -322,7 +341,6 @@ def merge_lora_for_scanned(model: nnx.Module, state_dict: dict, scale: float, tr
   assigned_count = 0
   for path, module in nnx.iter_graph(model):
     if not isinstance(module, (nnx.Linear, nnx.Conv, nnx.LayerNorm, nnx.RMSNorm, nnx.Embed)):
-      max_logging.log(f"Skipping non-supported module type: {module}")
       continue

     nnx_path_str = ".".join(map(str, path))
@@ -373,10 +391,11 @@ def merge_lora_for_scanned(model: nnx.Module, state_dict: dict, scale: float, tr
     is_linear = isinstance(module, nnx.Linear) and module.kernel.ndim == 3
     is_conv = isinstance(module, nnx.Conv) and module.kernel.ndim == 5

+    is_conv_kxk = isinstance(module, nnx.Conv) and module.kernel_size != (1, 1)
+
     if is_linear:
       num_layers, in_feat, out_feat = module.kernel.shape
     elif is_conv:
-      if module.kernel_size != (1, 1): continue
       num_layers = module.kernel.shape[0]
       in_feat, out_feat = module.kernel.shape[3], module.kernel.shape[4]
     else:
@@ -412,12 +431,29 @@ def merge_lora_for_scanned(model: nnx.Module, state_dict: dict, scale: float, tr
       # --- Fill LoRA ---
       if "down" in w:
         d, u = np.array(w["down"]), np.array(w["up"])
-        if d.ndim > 2: d = np.squeeze(d)
-        if u.ndim > 2: u = np.squeeze(u)
-        stack_down[i] = d
-        stack_up[i] = u
-        stack_alpha[i] = float(w.get("alpha", d.shape[0]))
-        has_lora = True
+        alpha = float(w.get("alpha", d.shape[0]))
+        rank = d.shape[0]
+
+        if is_conv_kxk:
+          # For LoCON kxk, compute delta and merge into stack_w_diff
+          rank, in_c, *k_dims = d.shape
+          out_c = u.shape[0]
+          delta_pt = (u.reshape(out_c, rank) @ d.reshape(rank, -1)).reshape(out_c, in_c, *k_dims)
+          if delta_pt.ndim == 5: delta_fx = delta_pt.transpose((2, 3, 4, 1, 0))
+          else: delta_fx = delta_pt.transpose((2, 3, 1, 0))
+
+          lora_delta = delta_fx * (scale * alpha / rank)
+          if stack_w_diff is None: stack_w_diff = np.zeros(module.kernel.shape, dtype=np.float32)
+          stack_w_diff[i] += lora_delta.astype(stack_w_diff.dtype)
+          has_diff = True  # Mark as having diff because we merged LoRA into w_diff
+        else:
+          # For Linear or 1x1 Conv, prepare for JIT
+          if d.ndim > 2: d = np.squeeze(d)
+          if u.ndim > 2: u = np.squeeze(u)
+          stack_down[i] = d
+          stack_up[i] = u
+          stack_alpha[i] = alpha
+          has_lora = True

       # --- Fill Weight Diff ---
       if "diff" in w:
@@ -433,7 +469,7 @@ def merge_lora_for_scanned(model: nnx.Module, state_dict: dict, scale: float, tr
         elif is_linear and wd.ndim == 2:
           wd = wd.transpose((1, 0))

-        stack_w_diff[i] = wd
+        stack_w_diff[i] += wd
         has_diff = True

       # --- Fill Bias Diff ---
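For the scanned variant, the same per-layer delta is accumulated into a stacked weight-diff buffer rather than into the stack_down/stack_up buffers that the Linear/1x1 path prepares for the jitted merge. A rough sketch under assumed shapes (names and sizes are illustrative only):

import numpy as np

num_layers, kh, kw, in_c, out_c = 2, 3, 3, 8, 16
stacked_kernel_shape = (num_layers, kh, kw, in_c, out_c)  # scanned nnx.Conv kernel: layers stacked on axis 0

stack_w_diff = None
for i in range(num_layers):
  # Per-layer HWIO delta, e.g. from a helper like locon_kxk_delta sketched earlier.
  lora_delta = np.random.randn(kh, kw, in_c, out_c).astype(np.float32)
  if stack_w_diff is None:
    stack_w_diff = np.zeros(stacked_kernel_shape, dtype=np.float32)
  stack_w_diff[i] += lora_delta  # presumably added onto the stacked kernel afterwards

assert stack_w_diff.shape == stacked_kernel_shape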