
Commit 2453c1b
Parent: e3e6107

test turbo scaling for head_dim 256

3 files changed: 57 additions & 2 deletions

src/maxdiffusion/models/wan/transformers/transformer_wan.py (17 additions & 1 deletion)
@@ -73,7 +73,14 @@ def __call__(self, hidden_states: jax.Array) -> jax.Array:
     p_t, p_h, p_w = self.patch_size
     ppf, pph, ppw = num_frames // p_t, height // p_h, width // p_w
 
-    freqs_split = get_frequencies(self.max_seq_len, self.theta, self.attention_head_dim)
+    is_turbo_mode = (self.attention_head_dim == 256)
+
+    # 2. Force the frequency calculation to use 128.
+    # This preserves the original T/H/W split ratios (21/21/22)
+    # instead of stretching them to (42/42/44).
+    calc_dim = 128 if is_turbo_mode else self.attention_head_dim
+
+    freqs_split = get_frequencies(self.max_seq_len, self.theta, calc_dim)
 
     freqs_f = jnp.expand_dims(jnp.expand_dims(freqs_split[0][:ppf], axis=1), axis=1)
     freqs_f = jnp.broadcast_to(freqs_f, (ppf, pph, ppw, freqs_split[0].shape[-1]))
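A note on the (21/21/22) vs (42/42/44) numbers in the comment above: assuming the usual Wan-style rope split, where the height and width axes each get head_dim // 6 rotary pairs and the temporal axis takes the remaining pairs, the arithmetic works out as below. This is an illustrative sketch, not the repo's get_frequencies, and the axis order there may differ.

    def rope_split(head_dim):
        half = head_dim // 2   # one rotation angle covers a pair of dims
        h = w = head_dim // 6  # pairs assigned to height and width
        t = half - h - w       # the temporal axis takes the remainder
        return t, h, w

    print(rope_split(128))  # (22, 21, 21) -- the original split being preserved
    print(rope_split(256))  # (44, 42, 42) -- the stretched split being avoided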
@@ -85,6 +92,15 @@ def __call__(self, hidden_states: jax.Array) -> jax.Array:
     freqs_w = jnp.broadcast_to(freqs_w, (ppf, pph, ppw, freqs_split[2].shape[-1]))
 
     freqs_concat = jnp.concatenate([freqs_f, freqs_h, freqs_w], axis=-1)
+
+    # === TURBO ADAPTER TILING: START ===
+    if is_turbo_mode:
+      # We calculated frequencies for a 128-dim head.
+      # We must duplicate them so the "Second Fused Head" (indices 128-255)
+      # sees the exact same rotation as the "First Head" (indices 0-127).
+      freqs_concat = jnp.concatenate([freqs_concat, freqs_concat], axis=-1)
+    # === TURBO ADAPTER TILING: END ===
+
     freqs_final = jnp.reshape(freqs_concat, (1, 1, ppf * pph * ppw, -1))
     return freqs_final
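The tiling above is what lets one 256-dim fused head behave like two independent 128-dim heads under rotary embedding. A standalone sketch of that equivalence (apply_rope here is a generic pairwise rotary implementation standing in for the model's, not code from this repo):

    import jax.numpy as jnp

    def apply_rope(x, freqs):
        # Rotate each (even, odd) feature pair by its angle.
        x0, x1 = x[..., 0::2], x[..., 1::2]
        cos, sin = jnp.cos(freqs), jnp.sin(freqs)
        out = jnp.stack([x0 * cos - x1 * sin, x0 * sin + x1 * cos], axis=-1)
        return out.reshape(x.shape)

    freqs_128 = jnp.linspace(0.0, 1.0, 64)               # 64 angles rotate 128 dims
    freqs_256 = jnp.concatenate([freqs_128, freqs_128])  # the tiling from the diff

    x = jnp.arange(256, dtype=jnp.float32)
    fused = apply_rope(x, freqs_256)
    halves = jnp.concatenate([apply_rope(x[:128], freqs_128),
                              apply_rope(x[128:], freqs_128)])
    assert jnp.allclose(fused, halves)  # dims 128-255 rotate exactly like dims 0-127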

src/maxdiffusion/models/wan/wan_utils.py (38 additions & 0 deletions)
@@ -196,6 +196,43 @@ def load_wan_transformer(
       pretrained_model_name_or_path, eval_shapes, device, hf_download, num_layers, scan_layers, subfolder
   )
 
+def apply_turbo_scaling(params):
+  """
+  Recursively traverses the unflattened state dict to find 'query' and 'key'
+  layers and scales their kernels by 1/sqrt(2).
+  """
+  # Scale factor: 1/sqrt(2) ≈ 0.707
+  scale_factor = 1.0 / (2 ** 0.5)
+
+  # Counter to verify we actually hit the tensors.
+  scaled_count = 0
+
+  def _recursive_walk(d, path_prefix=""):
+    nonlocal scaled_count
+    # We only replace values in place, so iterating d.items() directly is safe.
+    for key, value in d.items():
+
+      # 1. Target identification: is this a query or key layer?
+      # We look for dicts named 'query' or 'key' that contain a 'kernel'.
+      if key in ['query', 'key'] and isinstance(value, dict) and 'kernel' in value:
+        # Apply the scale.
+        original_shape = value['kernel'].shape
+        value['kernel'] = value['kernel'] * scale_factor
+        scaled_count += 1
+        print(f"⚡ Turbo Scaled: {path_prefix}.{key}.kernel | Shape: {original_shape}")
+
+      # 2. Recursion: if it's a container (like 'blocks' or 'attn1'), dive in.
+      elif isinstance(value, dict):
+        _recursive_walk(value, path_prefix=f"{path_prefix}.{key}" if path_prefix else key)
+
+  print("⚡ Starting Recursive Turbo Scaling...")
+  _recursive_walk(params)
+
+  if scaled_count == 0:
+    raise ValueError("❌ Turbo Scaling Failed: No 'query' or 'key' kernels found! Check dictionary structure.")
+
+  print(f"⚡ DONE. Scaled {scaled_count} tensors successfully.")
+  return params
 
 def load_base_wan_transformer(
     pretrained_model_name_or_path: str,
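A toy invocation (hypothetical shapes, not from the commit) showing what the walk matches -- only 'query'/'key' dicts that directly contain a 'kernel' leaf -- and the effect of the scale factor:

    import jax.numpy as jnp

    params = {
        "blocks": {
            "attn1": {
                "query": {"kernel": jnp.ones((8, 8))},
                "key":   {"kernel": jnp.ones((8, 8))},
                "value": {"kernel": jnp.ones((8, 8))},  # not matched, left untouched
            }
        }
    }
    params = apply_turbo_scaling(params)  # query/key kernels become ~0.7071

Since attention logits are bilinear in the query and key projections, scaling both kernels by 1/sqrt(2) scales every logit by exactly 1/2: ((Wq/sqrt(2)) x) . ((Wk/sqrt(2)) y) = (1/2) (Wq x) . (Wk y).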
@@ -269,6 +306,7 @@ def load_base_wan_transformer(
   flax_state_dict = unflatten_dict(flax_state_dict)
   del tensors
   jax.clear_caches()
+  flax_state_dict = apply_turbo_scaling(flax_state_dict)
   return flax_state_dict

src/maxdiffusion/pipelines/wan/wan_pipeline.py (2 additions & 1 deletion)
@@ -87,7 +87,6 @@ def _add_sharding_rule(vs: nnx.VariableState, logical_axis_rules) -> nnx.Variabl
   vs.sharding_rules = logical_axis_rules
   return vs
 
-
 # For some reason, jitting this function increases the memory significantly, so instead manually move weights to device.
 def create_sharded_logical_transformer(
     devices_array: np.array, mesh: Mesh, rngs: nnx.Rngs, config: HyperParameters, restored_checkpoint=None, subfolder: str = ""
@@ -116,6 +115,8 @@ def create_model(rngs: nnx.Rngs, wan_config: dict):
     wan_config["mask_padding_tokens"] = config.mask_padding_tokens
     wan_config["scan_layers"] = config.scan_layers
     wan_config["enable_jax_named_scopes"] = config.enable_jax_named_scopes
+    wan_config["num_attention_heads"] = 20
+    wan_config["attention_head_dim"] = 256
 
     # 2. eval_shape - will not use flops or create weights on device
     # thus not using HBM memory.
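A quick consistency check on these overrides (assuming the base checkpoint uses 40 heads of dim 128, which the fused-head comments in transformer_wan.py imply): halving the head count while doubling the head dim keeps the inner projection width unchanged, so the pretrained q/k/v kernels fit without reshaping.

    # 40 heads x 128 dims, fused pairwise into 20 heads x 256 dims
    assert 40 * 128 == 20 * 256 == 5120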
