
Commit cae76f0

missing key debug
1 parent 45c202d commit cae76f0

2 files changed: 33 additions & 23 deletions


src/maxdiffusion/models/attention_flax.py (11 additions & 1 deletion)
@@ -885,6 +885,7 @@ def __init__(
     self.add_k_proj = nnx.data(None)
     self.add_v_proj = nnx.data(None)
     self.norm_added_k = nnx.data(None)
+    self.norm_added_q = nnx.data(None)
     if self.added_kv_proj_dim is not None:
       self.add_k_proj = nnx.Linear(
           self.added_kv_proj_dim, self.inner_dim, rngs=rngs,
@@ -909,6 +910,13 @@ def __init__(
               ("norm",),
           ),
       )
+      self.norm_added_q = nnx.RMSNorm(
+          num_features=self.inner_dim, rngs=rngs, epsilon=eps, dtype=dtype, param_dtype=weights_dtype,
+          scale_init=nnx.with_partitioning(
+              nnx.initializers.ones,
+              ("norm",),
+          ),
+      )

   def _apply_rope(self, xq: jax.Array, xk: jax.Array, freqs_cis: jax.Array) -> Tuple[jax.Array, jax.Array]:
     dtype = xq.dtype
@@ -1016,8 +1024,10 @@ def __call__(
     # Attention - tensors are (B, S, D)
     with self.conditional_named_scope("cross_attn_text_apply"):
       attn_output_text = self.attention_op.apply_attention(query_proj, key_proj_text, value_proj_text)
+    with self.conditional_named_scope("norm_added_q"):
+      query_proj_img = self.norm_added_q(query_proj)
     with self.conditional_named_scope("cross_attn_img_apply"):
-      attn_output_img = self.attention_op.apply_attention(query_proj, key_proj_img, value_proj_img)
+      attn_output_img = self.attention_op.apply_attention(query_proj_img, key_proj_img, value_proj_img)

     attn_output = attn_output_text + attn_output_img

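For orientation: this change normalizes the shared query projection with the newly registered norm_added_q RMSNorm, but only for the image cross-attention branch; the text branch still attends with the raw query_proj. A minimal standalone sketch of that path (batch/sequence sizes and epsilon are made up; the real module also threads dtype, param_dtype, and sharding as shown in the diff):

import jax.numpy as jnp
from flax import nnx

inner_dim = 128
norm_added_q = nnx.RMSNorm(num_features=inner_dim, epsilon=1e-6, rngs=nnx.Rngs(0))

query_proj = jnp.ones((2, 16, inner_dim))  # (B, S, D)
query_proj_img = norm_added_q(query_proj)  # normalized query, image branch only
# attn_output_img would be computed from query_proj_img; the text branch
# keeps using the un-normalized query_proj.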
src/maxdiffusion/models/wan/wan_utils.py (22 additions & 22 deletions)
@@ -215,14 +215,12 @@ def load_base_wan_transformer(
       raise FileNotFoundError(f"File {index_file_path} not found for local directory.")
     local_files = True
   elif hf_download:
-    # download the index file for sharded models.
     index_file_path = hf_hub_download(
         pretrained_model_name_or_path,
         subfolder=subfolder,
         filename=filename,
     )
   with jax.default_device(device):
-    # open the index file.
     with open(index_file_path, "r") as f:
       index_dict = json.load(f)
     model_files = set()
@@ -236,37 +234,42 @@ def load_base_wan_transformer(
         ckpt_shard_path = os.path.join(pretrained_model_name_or_path, subfolder, model_file)
       else:
         ckpt_shard_path = hf_hub_download(pretrained_model_name_or_path, subfolder=subfolder, filename=model_file)
-      # now get all the filenames for the model that need downloading
+
       max_logging.log(f"Load and port {pretrained_model_name_or_path} {subfolder} on {device}")

       if ckpt_shard_path is not None:
         with safe_open(ckpt_shard_path, framework="pt") as f:
           for k in f.keys():
             tensors[k] = torch2jax(f.get_tensor(k))
+
     flax_state_dict = {}
     cpu = jax.local_devices(backend="cpu")[0]
-    flattened_eval_shapes = flatten_dict(eval_shapes)
-    # turn all block numbers to strings just for matching weights.
-    # Later they will be turned back to ints.
+    flattened_dict = flatten_dict(eval_shapes)
     random_flax_state_dict = {}
-    for key in flattened_eval_shapes:
+    for key in flattened_dict:
       string_tuple = tuple([str(item) for item in key])
-      random_flax_state_dict[string_tuple] = flattened_eval_shapes[key]
-    # del flattened_dict
+      random_flax_state_dict[string_tuple] = flattened_dict[key]
+    del flattened_dict
+
+    # 1. Initialize buffer for norm_added_q
     norm_added_q_buffer = {}
+
     for pt_key, tensor in tensors.items():
+      # 2. Robustly Intercept norm_added_q keys
       if "norm_added_q" in pt_key and "weight" in pt_key:
         parts = pt_key.split(".")
         try:
+          # Find the block index regardless of prefix (blocks.0 vs model.blocks.0)
           if "blocks" in parts:
             block_idx_loc = parts.index("blocks") + 1
             block_idx = int(parts[block_idx_loc])
             tensor = tensor.T
             norm_added_q_buffer[block_idx] = tensor
         except Exception:
-          pass
+          pass  # Skip if unparseable
         continue
-
+
+      # Standard processing
       renamed_pt_key = rename_key(pt_key)
       if "image_embedder" in renamed_pt_key:
         if "net.0" in renamed_pt_key or "net_0" in renamed_pt_key or \
@@ -287,39 +290,36 @@ def load_base_wan_transformer(
       if "norm1" in renamed_pt_key or "norm2" in renamed_pt_key:
         renamed_pt_key = renamed_pt_key.replace("weight", "scale")
         renamed_pt_key = renamed_pt_key.replace("kernel", "scale")
+
       renamed_pt_key = renamed_pt_key.replace("blocks_", "blocks.")
       renamed_pt_key = renamed_pt_key.replace(".scale_shift_table", ".adaln_scale_shift_table")
       renamed_pt_key = renamed_pt_key.replace("to_out_0", "proj_attn")
       renamed_pt_key = renamed_pt_key.replace("ffn.net_2", "ffn.proj_out")
       renamed_pt_key = renamed_pt_key.replace("ffn.net_0", "ffn.act_fn")
       if "norm2.layer_norm" not in renamed_pt_key:
         renamed_pt_key = renamed_pt_key.replace("norm2", "norm2.layer_norm")
+
       pt_tuple_key = tuple(renamed_pt_key.split("."))
       flax_key, flax_tensor = get_key_and_value(pt_tuple_key, tensor, flax_state_dict, random_flax_state_dict, scan_layers)
       flax_state_dict[flax_key] = jax.device_put(jnp.asarray(flax_tensor), device=cpu)
+
+    # 3. Stack and Insert (Correct Key Name for RMSNorm is 'scale')
     if norm_added_q_buffer:
       sorted_keys = sorted(norm_added_q_buffer.keys())
       sorted_tensors = [norm_added_q_buffer[i] for i in sorted_keys]
       stacked_tensor = jnp.stack(sorted_tensors, axis=0)
-      final_key = ('blocks', 'attn2', 'norm_added_q', 'kernel')
+
+      # 'scale' is the correct parameter name for Flax/NNX RMSNorm
+      final_key = ('blocks', 'attn2', 'norm_added_q', 'scale')
+
       flax_state_dict[final_key] = jax.device_put(stacked_tensor, device=cpu)
-      print(f"DEBUG: Manually injected {final_key} into flax_state_dict")
-      if final_key not in flattened_eval_shapes:
-        print(f"DEBUG: Key {final_key} missing in eval_shapes. Patching it now.")
-        shape_struct = jax.ShapeDtypeStruct(
-            shape=stacked_tensor.shape,
-            dtype=stacked_tensor.dtype
-        )
-        flattened_eval_shapes[final_key] = shape_struct
-        eval_shapes = unflatten_dict(flattened_eval_shapes)

     validate_flax_state_dict(eval_shapes, flax_state_dict)
     flax_state_dict = unflatten_dict(flax_state_dict)
     del tensors
     jax.clear_caches()
     return flax_state_dict

-
 def load_wan_vae(pretrained_model_name_or_path: str, eval_shapes: dict, device: str, hf_download: bool = True):
   device = jax.devices(device)[0]
   subfolder = "vae"

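The stack-and-insert step reflects how scanned transformer blocks store parameters: one tensor per parameter with a leading block axis, which is why the buffered per-block tensors are sorted by block index before being stacked along axis 0. A toy version of the pattern (buffer contents and shapes are invented):

import jax.numpy as jnp

norm_added_q_buffer = {1: jnp.full((4,), 1.0), 0: jnp.full((4,), 0.5)}  # block_idx -> per-block scale

sorted_tensors = [norm_added_q_buffer[i] for i in sorted(norm_added_q_buffer.keys())]
stacked_tensor = jnp.stack(sorted_tensors, axis=0)  # leading axis indexes blocks
print(stacked_tensor.shape)  # (2, 4)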