
Commit 77a43cf

experiment 1
1 parent d843dc0 commit 77a43cf

6 files changed

Lines changed: 102 additions & 17 deletions


src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 8 additions & 2 deletions
@@ -244,7 +244,7 @@ num_eval_samples: 420
 
 warmup_steps_fraction: 0.1
 learning_rate_schedule_steps: -1 # By default the length of the schedule is set to the number of steps.
-save_optimizer: False
+save_optimizer: True
 
 # However you may choose a longer schedule (learning_rate_schedule_steps > steps), in which case the training will end before
 # dropping fully down. Or you may choose a shorter schedule, where the unspecified steps will have a learning rate of 0.
@@ -326,4 +326,10 @@ eval_data_dir: ""
 enable_generate_video_for_eval: False # This will increase the used TPU memory.
 eval_max_number_of_samples_in_bucket: 60 # The number of samples per bucket for evaluation. This is calculated by num_eval_samples / len(timesteps_list).
 
-enable_ssim: False
+enable_ssim: False
+
+# Model surgery
+override_model_dims: True
+# If doubling the target_head_dim, then must halve the num_heads
+target_head_dim: 256
+target_num_heads: 20
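
A quick check of the comment above (not part of the commit, and assuming the pretrained Wan 14B layout of 40 heads of 128 dims referenced elsewhere in this diff): halving the head count while doubling the head dim keeps the transformer width unchanged.

# Sanity check: inner_dim = num_heads * head_dim stays constant under the override.
assert 40 * 128 == 20 * 256 == 5120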

src/maxdiffusion/generate_wan.py

Lines changed: 5 additions & 6 deletions
@@ -133,13 +133,12 @@ def run(config, pipeline=None, filename_prefix=""):
   print("seed: ", config.seed)
   model_key = config.model_name
 
-  checkpointer_lib = get_checkpointer(model_key)
-  WanCheckpointer = checkpointer_lib.WanCheckpointer
-
-  checkpoint_loader = WanCheckpointer(config, "WAN_CHECKPOINT")
-  pipeline, _, _ = checkpoint_loader.load_checkpoint()
-
   if pipeline is None:
+    checkpointer_lib = get_checkpointer(model_key)
+    WanCheckpointer = checkpointer_lib.WanCheckpointer
+
+    checkpoint_loader = WanCheckpointer(config, "WAN_CHECKPOINT")
+    pipeline, _, _ = checkpoint_loader.load_checkpoint()
     pipeline_lib = get_pipeline(model_key)
     WanPipeline = pipeline_lib.WanPipeline
     pipeline = WanPipeline.from_pretrained(config)
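
A hedged reading of this refactor: when the caller already supplies a pipeline, run() now reuses it instead of unconditionally reloading from the checkpointer. A sketch of the two call patterns, where my_pipeline is a hypothetical pre-built pipeline:

run(config)                         # pipeline is None: load via WanCheckpointer / WanPipeline as before
run(config, pipeline=my_pipeline)   # reuse the supplied pipeline; the checkpoint load is skipped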

src/maxdiffusion/models/embeddings_flax.py

Lines changed: 9 additions & 1 deletion
@@ -225,6 +225,7 @@ def get_1d_rotary_pos_embed(
     ntk_factor=1.0,
     freqs_dtype=jnp.float32,
     use_real: bool = True,
+    original_dim: int = None,
 ):
   """
   Precompute the frequency tensor for complex exponentials (cis) with given dimensions.
@@ -235,7 +236,14 @@
     pos = jnp.arange(pos)
 
   theta = theta * ntk_factor
-  freqs = 1.0 / (theta ** (jnp.arange(0, dim, 2, dtype=freqs_dtype)[: (dim // 2)] / dim)) / linear_factor
+
+  # If original_dim is provided, we use it as the denominator for the exponent.
+  # For example, if we change the head_dim from 128 to 256, this ensures indices 0-127 generate the EXACT same frequencies they did during pre-training.
+  # Indices 128-255 will simply continue that curve into lower frequencies.
+  scale_dim = original_dim if original_dim is not None else dim
+
+  freqs = 1.0 / (theta ** (jnp.arange(0, dim, 2, dtype=freqs_dtype)[: (dim // 2)] / scale_dim)) / linear_factor
+
   freqs = jnp.outer(pos, freqs)
   if use_real:
     # Flux
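
A small numerical check of the comment in this hunk (a sketch, not part of the commit): anchoring the exponent to original_dim=128 makes the first half of a 256-dim frequency table identical to the pretrained 128-dim table, while the second half continues into lower frequencies.

import jax.numpy as jnp

def rope_freqs(dim, scale_dim, theta=10000.0):
  # Mirrors the formula above, with linear_factor omitted for brevity.
  return 1.0 / (theta ** (jnp.arange(0, dim, 2, dtype=jnp.float32)[: (dim // 2)] / scale_dim))

old = rope_freqs(128, 128)   # pre-trained frequencies
new = rope_freqs(256, 128)   # extended table anchored to original_dim=128
assert jnp.allclose(new[:64], old)          # first 64 frequency pairs match exactly
assert bool(jnp.all(new[64:] < old.min()))  # remaining pairs extend the curve into lower frequencies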

src/maxdiffusion/models/wan/transformers/transformer_wan.py

Lines changed: 36 additions & 7 deletions
@@ -39,13 +39,33 @@
 BlockSizes = common_types.BlockSizes
 
 
-def get_frequencies(max_seq_len: int, theta: int, attention_head_dim: int):
+def get_frequencies(max_seq_len: int, theta: int, attention_head_dim: int, original_attention_head_dim: int):
+
+  # 1. Calculate NEW sub-dimensions (The target shapes)
+  # e.g., for 256: h=84, w=84, t=88
   h_dim = w_dim = 2 * (attention_head_dim // 6)
   t_dim = attention_head_dim - h_dim - w_dim
+  current_dims = [t_dim, h_dim, w_dim]
+
+  # 2. Calculate OLD sub-dimensions (For interpolation reference)
+  # e.g., for 128: h=42, w=42, t=44
+  h_dim_old = w_dim_old = 2 * (original_attention_head_dim // 6)
+  t_dim_old = original_attention_head_dim - h_dim_old - w_dim_old
+  old_dims = [t_dim_old, h_dim_old, w_dim_old]
+
   freqs = []
-  for dim in [t_dim, h_dim, w_dim]:
-    freq = get_1d_rotary_pos_embed(dim, max_seq_len, theta, freqs_dtype=jnp.float32, use_real=False)
+
+  for dim, old_dim in zip(current_dims, old_dims):
+    freq = get_1d_rotary_pos_embed(
+        dim=dim,  # new size
+        pos=max_seq_len,
+        theta=theta,
+        freqs_dtype=jnp.float32,
+        use_real=False,
+        original_dim=old_dim
+    )
     freqs.append(freq)
+
   freqs = jnp.concatenate(freqs, axis=1)
   t_size = attention_head_dim // 2 - 2 * (attention_head_dim // 6)
   hw_size = attention_head_dim // 6
@@ -61,8 +81,16 @@ def get_frequencies(max_seq_len: int, theta: int, attention_head_dim: int):
 
 class WanRotaryPosEmbed(nnx.Module):
 
-  def __init__(self, attention_head_dim: int, patch_size: Tuple[int, int, int], max_seq_len: int, theta: float = 10000.0):
+  def __init__(
+      self,
+      attention_head_dim: int,
+      original_attention_head_dim: int,
+      patch_size: Tuple[int, int, int],
+      max_seq_len: int,
+      theta: float = 10000.0
+  ):
     self.attention_head_dim = attention_head_dim
+    self.original_attention_head_dim = original_attention_head_dim
     self.patch_size = patch_size
     self.max_seq_len = max_seq_len
     self.theta = theta
@@ -72,7 +100,7 @@ def __call__(self, hidden_states: jax.Array) -> jax.Array:
     p_t, p_h, p_w = self.patch_size
     ppf, pph, ppw = num_frames // p_t, height // p_h, width // p_w
 
-    freqs_split = get_frequencies(self.max_seq_len, self.theta, self.attention_head_dim)
+    freqs_split = get_frequencies(self.max_seq_len, self.theta, self.attention_head_dim, self.original_attention_head_dim)
 
     freqs_f = jnp.expand_dims(jnp.expand_dims(freqs_split[0][:ppf], axis=1), axis=1)
     freqs_f = jnp.broadcast_to(freqs_f, (ppf, pph, ppw, freqs_split[0].shape[-1]))
@@ -378,6 +406,7 @@ class WanModel(nnx.Module, FlaxModelMixin, ConfigMixin):
   def __init__(
       self,
       rngs: nnx.Rngs,
+      target_head_dim: int,
      model_type="t2v",
      patch_size: Tuple[int] = (1, 2, 2),
      num_attention_heads: int = 40,
@@ -408,13 +437,13 @@ def __init__(
      names_which_can_be_offloaded: list = [],
      scan_layers: bool = True,
   ):
-    inner_dim = num_attention_heads * attention_head_dim
+    inner_dim = num_attention_heads * target_head_dim
     out_channels = out_channels or in_channels
     self.num_layers = num_layers
     self.scan_layers = scan_layers
 
     # 1. Patch & position embedding
-    self.rope = WanRotaryPosEmbed(attention_head_dim, patch_size, rope_max_seq_len)
+    self.rope = WanRotaryPosEmbed(target_head_dim, attention_head_dim, patch_size, rope_max_seq_len)
     self.patch_embedding = nnx.Conv(
        in_channels,
        inner_dim,
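
A quick check of the sub-dimension comments in get_frequencies (a sketch, not part of the commit): the t/h/w split follows directly from the two integer-division lines above.

def split_dims(head_dim):
  # Mirrors get_frequencies: h and w each get 2 * (head_dim // 6); t takes the remainder.
  h = w = 2 * (head_dim // 6)
  t = head_dim - h - w
  return t, h, w

assert split_dims(128) == (44, 42, 42)  # original_attention_head_dim
assert split_dims(256) == (88, 84, 84)  # target_head_dim from the config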

src/maxdiffusion/pipelines/wan/wan_pipeline.py

Lines changed: 43 additions & 0 deletions
@@ -22,6 +22,7 @@
 import flax.linen as nn
 from flax import nnx
 from flax.linen import partitioning as nn_partitioning
+from flax.traverse_util import flatten_dict, unflatten_dict
 from ...pyconfig import HyperParameters
 from ... import max_logging
 from ... import max_utils
@@ -86,6 +87,42 @@ def _add_sharding_rule(vs: nnx.VariableState, logical_axis_rules) -> nnx.VariableState:
   vs.sharding_rules = logical_axis_rules
   return vs
 
+def perform_wan_scaling_surgery(params, target_head_dim, source_head_dim):
+  """
+  scales Q and K weights to preserve attention entropy when
+  changing head dimensions.
+
+  Formula: correction_factor = (target_dim / source_dim)^0.25
+  """
+
+  if target_head_dim == source_head_dim:
+    print("Target and Source head dims are identical. Skipping surgery.")
+    return params
+
+  # Calculate the factor
+  # Example: (256 / 128)^0.25 = 2^0.25 ≈ 1.1892
+  ratio = target_head_dim / source_head_dim
+  correction_factor = ratio ** 0.25
+
+  flat_params = flatten_dict(params, sep='/')
+  new_flat_params = {}
+  modified_count = 0
+
+  for key, tensor in flat_params.items():
+    # Key format example: 'transformer_blocks/0/attn1/query/kernel'
+    # Identify Query and Key kernels.
+    if ('query' in key or 'key' in key) and 'kernel' in key:
+      # Ensure we are targeting attention layers, not other projections
+      if 'attn' in key:
+        new_flat_params[key] = tensor * correction_factor
+        modified_count += 1
+      else:
+        new_flat_params[key] = tensor
+    else:
+      new_flat_params[key] = tensor
+
+  print(f"Surgery complete. Scaled {modified_count} tensors by {correction_factor:.4f}")
+  return unflatten_dict(new_flat_params, sep='/')
 
 # For some reason, jitting this function increases the memory significantly, so instead manually move weights to device.
 def create_sharded_logical_transformer(
@@ -113,6 +150,10 @@ def create_model(rngs: nnx.Rngs, wan_config: dict):
   wan_config["flash_min_seq_length"] = config.flash_min_seq_length
   wan_config["dropout"] = config.dropout
   wan_config["scan_layers"] = config.scan_layers
+  wan_config["target_head_dim"] = wan_config["attention_head_dim"]
+  if config.override_model_dims:
+    wan_config["target_head_dim"] = config.target_head_dim
+    wan_config["num_attention_heads"] = config.target_num_heads
 
   # 2. eval_shape - will not use flops or create weights on device
   # thus not using HBM memory.
@@ -144,6 +185,8 @@ def create_model(rngs: nnx.Rngs, wan_config: dict):
       scan_layers=config.scan_layers,
       subfolder=subfolder,
   )
+  if config.override_model_dims:
+    params = perform_wan_scaling_surgery(params, config.target_head_dim, wan_config["attention_head_dim"])
 
   params = jax.tree_util.tree_map_with_path(
       lambda path, x: cast_with_exclusion(path, x, dtype_to_cast=config.weights_dtype), params
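
Why the fourth root in perform_wan_scaling_surgery (a hedged reading; the commit itself only states the formula): attention divides logits by sqrt(head_dim), so if the widened dimensions initially contribute little to q·k, growing head_dim from 128 to 256 would shrink the logits by sqrt(1/2). Scaling both the query and key kernels by (256/128)^0.25 multiplies q·k by (256/128)^0.5, which cancels that shrinkage. A minimal numeric check:

import math

source_head_dim, target_head_dim = 128, 256
correction = (target_head_dim / source_head_dim) ** 0.25  # ~1.1892, as in the docstring

# Logit scale before widening vs. after widening with Q and K each scaled by `correction`.
before = 1.0 / math.sqrt(source_head_dim)
after = correction ** 2 / math.sqrt(target_head_dim)
assert math.isclose(before, after)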

src/maxdiffusion/trainers/wan_trainer.py

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@
 import os
 import datetime
 import functools
-from pprint import pprint
+import pprint
 import numpy as np
 import threading
 from concurrent.futures import ThreadPoolExecutor
