import re
import torch
import jax
+ import numpy as np
from jax import dlpack
import jax.numpy as jnp
from flax import nnx
@@ -156,8 +157,8 @@ def __call__(self, x):

def _to_jax_array(v):
  if isinstance(v, torch.Tensor):
-     return jax.device_put(dlpack.from_dlpack(v))
-   return jax.device_put(jnp.array(v))
+     return dlpack.from_dlpack(v)
+   return jnp.array(v)

def merge_lora(model: nnx.Module, state_dict: dict, scale: float, translate_fn=None):
  """
@@ -211,7 +212,9 @@ def merge_lora(model: nnx.Module, state_dict: dict, scale: float, translate_fn=None):
        alpha = weights.get("alpha", rank)
        current_scale = scale * alpha / rank
        delta = (down_w.T @ up_w.T).reshape(module.kernel.shape)
-         module.kernel.value += delta * current_scale
+         update = delta * current_scale
+         update = jax.device_put(update, module.kernel.value.sharding)
+         module.kernel.value += update
        assigned_count += 1
    elif isinstance(module, nnx.Conv):
      if module.kernel_size == (1, 1):
@@ -220,7 +223,9 @@ def merge_lora(model: nnx.Module, state_dict: dict, scale: float, translate_fn=None):
        alpha = weights.get("alpha", rank)
        current_scale = scale * alpha / rank
        delta = (jnp.squeeze(down_w) @ jnp.squeeze(up_w)).reshape(module.kernel.shape)
-         module.kernel.value += delta * current_scale
+         update = delta * current_scale
+         update = jax.device_put(update, module.kernel.value.sharding)
+         module.kernel.value += update
        assigned_count += 1
      else:
        raise NotImplementedError(f"Conv merge only for 1x1 kernels, got {module.kernel_size}")
@@ -240,34 +245,31 @@ def merge_lora_for_scanned(model: nnx.Module, state_dict: dict, scale: float, translate_fn=None):
  into the kernel of nnx.Linear and nnx.Conv layers.
  Assumes scan_layers=True, so weights are stacked if layers are scanned
  (e.g. kernel.ndim=3 for Linear).
+   Optimized: Accumulates updates on CPU first, then performs a single device_put.
  """
  lora_params = {}
-   # Parse weights and alphas
+   # --- Parsing Logic ---
  for k, v in state_dict.items():
    if k.endswith(".alpha"):
      key_base = k[:-len(".alpha")]
-       if key_base not in lora_params:
-         lora_params[key_base] = {}
+       if key_base not in lora_params: lora_params[key_base] = {}
      lora_params[key_base]["alpha"] = _to_jax_array(v)
      continue

    m = re.match(r"^(.*?)_lora\.(down|up)\.weight$", k)
+     if not m:
+       m = re.match(r"^(.*?)\.lora\.(down|up)\.weight$", k)
+     if not m:
+       m = re.match(r"^(.*?)\.(lora_down|lora_up)\.weight$", k)
+
    if m:
-       key_base, weight_type = m.group(1), m.group(2)
+       key_base, weight_type = m.group(1), m.group(2).replace("lora_", "")
+       if key_base not in lora_params: lora_params[key_base] = {}
+       lora_params[key_base][weight_type] = _to_jax_array(v)
    else:
-       m = re.match(r"^(.*?)\.lora\.(down|up)\.weight$", k)
-       if m:
-         key_base, weight_type = m.group(1), m.group(2)
-       else:
-         m = re.match(r"^(.*?)\.(lora_down|lora_up)\.weight$", k)
-         if m:
-           key_base, weight_type = m.group(1), m.group(2).replace("lora_", "")
-         else:
-           max_logging.log(f"Could not parse LoRA key: {k}")
-           continue
-     if key_base not in lora_params:
-       lora_params[key_base] = {}
-     lora_params[key_base][weight_type] = _to_jax_array(v)
+       max_logging.log(f"Could not parse LoRA key: {k}")
+       continue
+
  max_logging.log(f"Parsed {len(lora_params)} unique LoRA module keys for scanned merge.")

  assigned_count = 0
@@ -277,69 +279,92 @@ def merge_lora_for_scanned(model: nnx.Module, state_dict: dict, scale: float, translate_fn=None):

    nnx_path_str = ".".join(map(str, path))

-     # Handle scanned Linear layers
+     # --- Handle Scanned Linear (NDIM=3) ---
    if isinstance(module, nnx.Linear) and module.kernel.ndim == 3:
      lora_key_template = translate_fn(nnx_path_str) if translate_fn else None

      if lora_key_template:
        num_layers, in_features, out_features = module.kernel.shape
-         kernel_value_updated = module.kernel.value
-         lora_found_in_module = False
+
+         # 1. Create a zero-filled buffer on CPU for float32 accumulation
+         cpu_delta_buffer = np.zeros((num_layers, in_features, out_features), dtype=np.float32)
+
+         lora_found = False
        for i in range(num_layers):
          lora_key = lora_key_template.format(i)
          if lora_key in lora_params and "down" in lora_params[lora_key] and "up" in lora_params[lora_key]:
            weights = lora_params[lora_key]
-             down_w, up_w = weights["down"], weights["up"]
+             # Pull weights to CPU/Numpy for cheap calculation
+             down_w = np.array(weights["down"])
+             up_w = np.array(weights["up"])
+
            rank = down_w.shape[0]
-             alpha = weights.get("alpha", rank)
+             alpha = float(weights.get("alpha", rank))  # ensure scalar
            current_scale = scale * alpha / rank
+
+             # Compute Delta on CPU
            delta_i = (down_w.T @ up_w.T).reshape(in_features, out_features) * current_scale
-             kernel_value_updated = kernel_value_updated.at[i].add(delta_i)
-             lora_found_in_module = True
-
-         if lora_found_in_module:
-           module.kernel.value = kernel_value_updated
+
+             # Accumulate in buffer
+             cpu_delta_buffer[i] += delta_i
+             lora_found = True
+
+         if lora_found:
+           # 2. Single Transfer: Move buffer to TPU with correct sharding and dtype
+           sharded_delta = jax.device_put(
+               jnp.array(cpu_delta_buffer, dtype=module.kernel.dtype),
+               module.kernel.value.sharding
+           )
+           # 3. In-place add
+           module.kernel.value += sharded_delta
          assigned_count += 1
        else:
-           max_logging.log(f"Scanned layer {nnx_path_str} matched template but no LoRA weights found for any block.")
+           max_logging.log(f"Scanned layer {nnx_path_str} matched template but no LoRA weights found.")
      else:
        max_logging.log(f"Scanned NNX layer '{nnx_path_str}' could not be translated to a LoRA key template.")

-     # Handle scanned Conv layers (ndim=5)
+     # --- Handle Scanned Conv (NDIM=5) ---
    elif isinstance(module, nnx.Conv) and module.kernel.ndim == 5:
      if module.kernel_size != (1, 1):
        max_logging.log(f"Skipping merge for scanned Conv layer {nnx_path_str} with kernel size {module.kernel_size}, only 1x1 is supported for merging.")
        continue
-
+
      lora_key_template = translate_fn(nnx_path_str) if translate_fn else None
      if lora_key_template:
        num_layers, _, _, in_features, out_features = module.kernel.shape
-         kernel_value_updated = module.kernel.value
-         lora_found_in_module = False
+         cpu_delta_buffer = np.zeros(module.kernel.shape, dtype=np.float32)
+         lora_found = False
+
        for i in range(num_layers):
          lora_key = lora_key_template.format(i)
          if lora_key in lora_params and "down" in lora_params[lora_key] and "up" in lora_params[lora_key]:
            weights = lora_params[lora_key]
-             down_w, up_w = weights["down"], weights["up"]
+             down_w = np.array(weights["down"])
+             up_w = np.array(weights["up"])

            if down_w.ndim == 4:
-               down_w = jnp.squeeze(down_w)
+               down_w = np.squeeze(down_w)
            if up_w.ndim == 4:
-               up_w = jnp.squeeze(up_w)
+               up_w = np.squeeze(up_w)

            rank = down_w.shape[0]
-             alpha = weights.get("alpha", rank)
+             alpha = float(weights.get("alpha", rank))
            current_scale = scale * alpha / rank
            delta_i = (down_w.T @ up_w.T).reshape(1, 1, in_features, out_features) * current_scale
-             kernel_value_updated = kernel_value_updated.at[i].add(delta_i)
-             lora_found_in_module = True
-
-         if lora_found_in_module:
-           module.kernel.value = kernel_value_updated
+             cpu_delta_buffer[i] += delta_i
+             lora_found = True
+
+         if lora_found:
+           sharded_delta = jax.device_put(
+               jnp.array(cpu_delta_buffer, dtype=module.kernel.dtype),
+               module.kernel.value.sharding
+           )
+           module.kernel.value += sharded_delta
          assigned_count += 1
        else:
-           max_logging.log(f"Scanned 1x1 Conv layer {nnx_path_str} matched template but no LoRA weights found for any block.")
+           max_logging.log(f"Scanned 1x1 Conv layer {nnx_path_str} matched template but no LoRA weights found.")
      else:
        max_logging.log(f"Scanned 1x1 Conv layer '{nnx_path_str}' could not be translated to a LoRA key template.")

+
  max_logging.log(f"Merged weights into {assigned_count} scanned layers in {type(model).__name__}.")
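
Note: the heart of this change is the accumulate-on-host, single-transfer pattern. The sketch below shows that pattern in isolation; the shapes, rank, and LoRA factors are made up for illustration, and only numpy/jax calls already used in the diff appear here.

import jax
import jax.numpy as jnp
import numpy as np

# Illustrative shapes; in the real code these come from module.kernel.shape.
num_layers, in_features, out_features, rank = 4, 16, 16, 2
kernel = jnp.zeros((num_layers, in_features, out_features), dtype=jnp.bfloat16)

# Made-up per-layer LoRA factors: down is (rank, in), up is (out, rank).
factors = {
    i: (np.full((rank, in_features), 0.01, dtype=np.float32),
        np.full((out_features, rank), 0.01, dtype=np.float32))
    for i in range(num_layers)
}

# 1. Accumulate every per-layer delta into one float32 buffer on the host.
cpu_delta = np.zeros(kernel.shape, dtype=np.float32)
for i, (down_w, up_w) in factors.items():
  cpu_delta[i] += (down_w.T @ up_w.T).reshape(in_features, out_features)

# 2. One device_put casts to the kernel dtype and reuses the kernel's sharding,
#    replacing the per-layer .at[i].add() updates of the old code.
delta = jax.device_put(jnp.array(cpu_delta, dtype=kernel.dtype), kernel.sharding)
kernel = kernel + delta

The old path built one functional update per scanned layer on the device; the new path pays for a single host-to-device transfer regardless of the number of scanned layers.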
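
For reference, merge_lora_for_scanned resolves each scanned NNX module path to a per-block checkpoint key via translate_fn and then fills the "{}" slot with the layer index (lora_key_template.format(i)). A hypothetical callback is sketched below; the path suffix and the key template are invented for illustration and will differ per model and checkpoint format.

def example_translate_fn(nnx_path_str):
  # Hypothetical mapping: scanned attention q-projection -> per-block LoRA key
  # template, with "{}" as the placeholder for the scanned layer index.
  if nnx_path_str.endswith("attn.query"):
    return "transformer_blocks_{}_attn_to_q"
  return None  # untranslatable scanned paths are logged and skipped

merge_lora_for_scanned(model, state_dict, scale=1.0, translate_fn=example_translate_fn)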