Commit 82b719e

fix rope calculations.
1 parent 0ef8c71 commit 82b719e

4 files changed

Lines changed: 81 additions & 23 deletions

src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 2 additions & 2 deletions
@@ -139,8 +139,8 @@ data_sharding: [['data', 'fsdp', 'tensor']]
 dcn_data_parallelism: 1 # recommended DCN axis to be auto-sharded
 dcn_fsdp_parallelism: -1
 dcn_tensor_parallelism: 1
-ici_data_parallelism: -1
-ici_fsdp_parallelism: 1 # recommended ICI axis to be auto-sharded
+ici_data_parallelism: 1
+ici_fsdp_parallelism: -1 # recommended ICI axis to be auto-sharded
 ici_tensor_parallelism: 1

 # Dataset
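
Note on the sharding change: the auto-sharded ICI axis moves from data to fsdp. A minimal sketch, assuming the MaxText-style convention that a -1 entry absorbs whatever device count remains on that topology; the helper below is hypothetical, not maxdiffusion code.

import numpy as np

def resolve_mesh_axes(parallelism, num_devices):
  # Hypothetical helper: replace the single -1 ("auto") entry with the
  # remaining device count so the axis sizes multiply to num_devices.
  axes = list(parallelism)
  known = int(np.prod([p for p in axes if p != -1]))
  if -1 in axes:
    axes[axes.index(-1)] = num_devices // known
  assert int(np.prod(axes)) == num_devices
  return axes

# Old config was [-1, 1, 1] (data absorbs the chips); the new config is [1, -1, 1] (fsdp does).
print(resolve_mesh_axes([1, -1, 1], num_devices=8))  # [1, 8, 1] for (data, fsdp, tensor)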

src/maxdiffusion/models/attention_flax.py

Lines changed: 30 additions & 5 deletions
@@ -15,6 +15,7 @@
 import functools
 import math
 from typing import Optional, Callable, Tuple
+import numpy as np
 import flax.linen as nn
 from flax import nnx
 import jax
@@ -318,7 +319,7 @@ def _apply_attention(
 ):
   """Routes to different attention kernels."""
   _check_attention_inputs(query, key, value)
-
+
   if attention_kernel == "flash":
     can_use_flash_attention = (
         query.shape[1] >= flash_min_seq_length
@@ -578,8 +579,7 @@ def __init__(
       qkv_bias: bool = False,
       quant: Quant = None,
   ):
-
-    if attention_kernel == "cudnn_flash_te" or attention_kernel == "dot_product":
+    if attention_kernel == "cudnn_flash_te":
       raise NotImplementedError(f"Wan 2.1 has not been tested with {attention_kernel}")

     if attention_kernel in {"flash", "cudnn_flash_te"} and mesh is None:
@@ -676,7 +676,7 @@ def __init__(
   def _apply_rope(self, xq: jax.Array, xk: jax.Array, freqs_cis: jax.Array) -> Tuple[jax.Array, jax.Array]:
     dtype = xq.dtype
     reshape_xq = xq.astype(jnp.float32).reshape(*xq.shape[:-1], -1, 2)
-    reshape_xk = xq.astype(jnp.float32).reshape(*xk.shape[:-1], -1, 2)
+    reshape_xk = xk.astype(jnp.float32).reshape(*xk.shape[:-1], -1, 2)

     xq_ = jax.lax.complex(reshape_xq[..., 0], reshape_xq[..., 1])
     xk_ = jax.lax.complex(reshape_xk[..., 0], reshape_xk[..., 1])
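
For context on what the one-character change above fixes: the key path was previously built from xq, so the "rotated keys" were really a copy of the rotated queries and the actual key values were dropped. A self-contained sketch of the rotation (shapes and names here are illustrative, not the module's API):

import jax
import jax.numpy as jnp

def apply_rope_sketch(xq: jax.Array, xk: jax.Array, freqs_cis: jax.Array):
  # View the head dimension as (..., head_dim // 2, 2) real/imag pairs.
  xq_pairs = xq.astype(jnp.float32).reshape(*xq.shape[:-1], -1, 2)
  xk_pairs = xk.astype(jnp.float32).reshape(*xk.shape[:-1], -1, 2)  # built from xk, not xq

  xq_c = jax.lax.complex(xq_pairs[..., 0], xq_pairs[..., 1])
  xk_c = jax.lax.complex(xk_pairs[..., 0], xk_pairs[..., 1])

  # Complex multiplication rotates each pair by the per-position phase in freqs_cis.
  xq_out = jnp.stack([jnp.real(xq_c * freqs_cis), jnp.imag(xq_c * freqs_cis)], axis=-1).reshape(xq.shape)
  xk_out = jnp.stack([jnp.real(xk_c * freqs_cis), jnp.imag(xk_c * freqs_cis)], axis=-1).reshape(xk.shape)
  return xq_out.astype(xq.dtype), xk_out.astype(xk.dtype)

# Tiny usage example: batch=1, heads=1, seq=4, head_dim=8.
q = jnp.ones((1, 1, 4, 8))
k = jnp.ones((1, 1, 4, 8))
theta = jnp.linspace(0.0, 1.0, 16).reshape(1, 1, 4, 4)
freqs_cis = jax.lax.complex(jnp.cos(theta), jnp.sin(theta))
q_rot, k_rot = apply_rope_sketch(q, k, freqs_cis)
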
@@ -696,13 +696,19 @@ def __call__(
       encoder_hidden_states: jax.Array = None,
       rotary_emb: Optional[jax.Array] = None
   ) -> jax.Array:
-
+    print(" -- -- WanAttention -- ")
     dtype = hidden_states.dtype
     if encoder_hidden_states is None:
       encoder_hidden_states = hidden_states
     query_proj = self.query(hidden_states)
+    print("query_proj min: ", np.min(query_proj))
+    print("query_proj max: ", np.max(query_proj))
     key_proj = self.key(encoder_hidden_states)
+    print("key_proj min: ", np.min(key_proj))
+    print("key_proj max: ", np.max(key_proj))
     value_proj = self.value(encoder_hidden_states)
+    print("value_proj min: ", np.min(value_proj))
+    print("value_proj max: ", np.max(value_proj))

     query_proj = nn.with_logical_constraint(query_proj, self.query_axis_names)
     key_proj = nn.with_logical_constraint(key_proj, self.key_axis_names)
@@ -711,18 +717,37 @@ def __call__(
     if self.qk_norm:
       query_proj = self.norm_q(query_proj)
       key_proj = self.norm_k(key_proj)
+      print("query_proj min: ", np.min(query_proj))
+      print("query_proj max: ", np.max(query_proj))
+      print("key_proj min: ", np.min(key_proj))
+      print("key_proj max: ", np.max(key_proj))

     if rotary_emb is not None:
       query_proj = _unflatten_heads(query_proj, self.heads)
       key_proj = _unflatten_heads(key_proj, self.heads)
+      # value_proj = _unflatten_heads(value_proj, self.heads)
       query_proj, key_proj = self._apply_rope(query_proj, key_proj, rotary_emb)
+      print("Rope query_proj min: ", np.min(query_proj))
+      print("Rope query_proj max: ", np.max(query_proj))
+      print("Rope key_proj min: ", np.min(key_proj))
+      print("Rope key_proj max: ", np.max(key_proj))
+      #breakpoint()
       query_proj = _reshape_heads_to_head_dim(query_proj)
       key_proj = _reshape_heads_to_head_dim(key_proj)

     attn_output = self.attention_op.apply_attention(query_proj, key_proj, value_proj)
+    try:
+      print("attn_output min: ", np.min(attn_output))
+      print("attn_output_for_print max: ", np.max(attn_output))
+    except:
+      pass
     attn_output = attn_output.astype(dtype=dtype)

     hidden_states = self.proj_attn(hidden_states)
+    print("hidden_states min: ", np.min(hidden_states))
+    print("hidden_states max: ", np.max(hidden_states))
+    print(" -- -- WanAttention DONE -- ")
+    #breakpoint()
     return hidden_states

src/maxdiffusion/models/embeddings_flax.py

Lines changed: 1 addition & 1 deletion
@@ -227,7 +227,7 @@ def get_1d_rotary_pos_embed(
     out = jnp.stack([freqs_cos, -freqs_sin, freqs_sin, freqs_cos], axis=-1)
   else:
     # Wan 2.1
-    out = jax.lax.complex(jnp.ones_like(freqs), freqs)
+    out = jax.lax.complex(jnp.cos(freqs), jnp.sin(freqs))
   return out

 class NNXPixArtAlphaTextProjection(nnx.Module):
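
This is the core of the RoPE fix: the per-position rotary factor should be the unit-magnitude phase e^{i*theta} = cos(theta) + i*sin(theta). The previous expression, 1 + i*theta, is only the first-order Taylor approximation, and its magnitude grows with theta, so it rescales queries and keys instead of purely rotating them. A quick check with jax.numpy:

import jax
import jax.numpy as jnp

freqs = jnp.linspace(0.0, 6.0, 5)

old = jax.lax.complex(jnp.ones_like(freqs), freqs)     # 1 + i*theta (removed)
new = jax.lax.complex(jnp.cos(freqs), jnp.sin(freqs))  # e^{i*theta} (added)

print(jnp.abs(old))  # grows with theta: roughly [1.0, 1.8, 3.2, 4.6, 6.1]
print(jnp.abs(new))  # always 1.0: a pure rotation, as RoPE requires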

src/maxdiffusion/models/wan/transformers/transformer_wan.py

Lines changed: 48 additions & 15 deletions
@@ -19,6 +19,7 @@
 import jax
 import jax.numpy as jnp
 from flax import nnx
+import numpy as np
 from .... import common_types
 from ...modeling_flax_utils import FlaxModelMixin, get_activation
 from ....configuration_utils import ConfigMixin, register_to_config
@@ -58,12 +59,7 @@ def __init__(
           use_real=False
       )
       freqs.append(freq)
-    self.freqs = jnp.concatenate(freqs, axis=1)
-
-  def __call__(self, hidden_states: jax.Array) -> jax.Array:
-    _, num_frames, height, width, _ = hidden_states.shape
-    p_t, p_h, p_w = self.patch_size
-    ppf, pph, ppw = num_frames // p_t, height // p_h, width // p_w
+    freqs = jnp.concatenate(freqs, axis=1)

     sizes = [
         self.attention_head_dim // 2 - 2 * (self.attention_head_dim // 6),
@@ -72,16 +68,21 @@ def __call__(self, hidden_states: jax.Array) -> jax.Array:
     ]
     cumulative_sizes = jnp.cumsum(jnp.array(sizes))
     split_indices = cumulative_sizes[:-1]
-    freqs_split = jnp.split(self.freqs, split_indices, axis=1)
+    self.freqs_split = jnp.split(freqs, split_indices, axis=1)
+
+  def __call__(self, hidden_states: jax.Array) -> jax.Array:
+    _, num_frames, height, width, _ = hidden_states.shape
+    p_t, p_h, p_w = self.patch_size
+    ppf, pph, ppw = num_frames // p_t, height // p_h, width // p_w

-    freqs_f = jnp.expand_dims(jnp.expand_dims(freqs_split[0][:ppf], axis=1), axis=1)
-    freqs_f = jnp.broadcast_to(freqs_f, (ppf, pph, ppw, freqs_split[0].shape[-1]))
+    freqs_f = jnp.expand_dims(jnp.expand_dims(self.freqs_split[0][:ppf], axis=1), axis=1)
+    freqs_f = jnp.broadcast_to(freqs_f, (ppf, pph, ppw, self.freqs_split[0].shape[-1]))

-    freqs_h = jnp.expand_dims(jnp.expand_dims(freqs_split[1][:pph], axis=0), axis=2)
-    freqs_h = jnp.broadcast_to(freqs_h, (ppf, pph, ppw, freqs_split[1].shape[-1]))
+    freqs_h = jnp.expand_dims(jnp.expand_dims(self.freqs_split[1][:pph], axis=0), axis=2)
+    freqs_h = jnp.broadcast_to(freqs_h, (ppf, pph, ppw, self.freqs_split[1].shape[-1]))

-    freqs_w = jnp.expand_dims(jnp.expand_dims(freqs_split[2][:ppw], axis=0), axis=1)
-    freqs_w = jnp.broadcast_to(freqs_w, (ppf, pph, ppw, freqs_split[2].shape[-1]))
+    freqs_w = jnp.expand_dims(jnp.expand_dims(self.freqs_split[2][:ppw], axis=0), axis=1)
+    freqs_w = jnp.broadcast_to(freqs_w, (ppf, pph, ppw, self.freqs_split[2].shape[-1]))

     freqs_concat = jnp.concatenate([freqs_f, freqs_h, freqs_w], axis=-1)
     freqs_final = jnp.reshape(freqs_concat, (1, 1, ppf * pph * ppw, -1))
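
The two hunks above move the concatenation and split of the frequency tables from __call__ into __init__, so they are built once at construction; __call__ then only slices and broadcasts the cached pieces. A simplified sketch of the resulting structure (names and shapes condensed; this is illustrative, not the actual module):

import jax.numpy as jnp

class RotaryTablesSketch:
  """Illustrative only: precompute and split the frequency tables once, reuse per call."""

  def __init__(self, freqs_list, sizes):
    freqs = jnp.concatenate(freqs_list, axis=1)
    split_indices = jnp.cumsum(jnp.array(sizes))[:-1]
    # Done once here instead of on every forward pass.
    self.freqs_split = jnp.split(freqs, split_indices, axis=1)

  def __call__(self, ppf, pph, ppw):
    f, h, w = self.freqs_split
    # Broadcast each table along the two grid axes it does not index.
    freqs_f = jnp.broadcast_to(f[:ppf, None, None, :], (ppf, pph, ppw, f.shape[-1]))
    freqs_h = jnp.broadcast_to(h[None, :pph, None, :], (ppf, pph, ppw, h.shape[-1]))
    freqs_w = jnp.broadcast_to(w[None, None, :ppw, :], (ppf, pph, ppw, w.shape[-1]))
    return jnp.concatenate([freqs_f, freqs_h, freqs_w], axis=-1).reshape(1, 1, ppf * pph * ppw, -1)
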
@@ -361,22 +362,41 @@ def __call__(
     shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = jnp.split(
         (self.scale_shift_table + temb.astype(jnp.float32)), 6, axis=1
     )
+    print("Wan Block -- START -- ")

     # 1. Self-attention
     norm_hidden_states = (self.norm1(hidden_states.astype(jnp.float32)) * (1 + scale_msa) + shift_msa).astype(hidden_states.dtype)
+    print("Wan Block -- norm_hidden_states, min: ", np.min(norm_hidden_states))
+    print("Wan Block -- norm_hidden_states, max: ", np.max(norm_hidden_states))
     attn_output = self.attn1(hidden_states=norm_hidden_states, rotary_emb=rotary_emb)
+    print("Wan Block -- Self Attn. attn_output, min: ", np.min(attn_output))
+    print("Wan Block -- Self Attn. attn_output, max: ", np.max(attn_output))
     hidden_states = (hidden_states.astype(jnp.float32) + attn_output * gate_msa).astype(hidden_states.dtype)
+    print("Wan Block -- hidden_states, min: ", np.min(hidden_states))
+    print("Wan Block -- hidden_states, max: ", np.max(hidden_states))

     # 2. Cross-attention
     norm_hidden_states = self.norm2(hidden_states.astype(jnp.float32))
+    print("Wan Block -- norm_hidden_states, min: ", np.min(norm_hidden_states))
+    print("Wan Block -- norm_hidden_states, max: ", np.max(norm_hidden_states))
     attn_output = self.attn2(hidden_states=norm_hidden_states, encoder_hidden_states=encoder_hidden_states)
+    print("Wan Block -- Cross Attn. attn_output, min: ", np.min(attn_output))
+    print("Wan Block -- Cross Attn. attn_output, max: ", np.max(attn_output))
     hidden_states = hidden_states + attn_output
+    print("Wan Block -- hidden_states, min: ", np.min(hidden_states))
+    print("Wan Block -- hidden_states, max: ", np.max(hidden_states))

     # 3. Feed-forward
     norm_hidden_states = (self.norm3(hidden_states.astype(jnp.float32)) * (1 + c_scale_msa) + c_shift_msa).astype(hidden_states.dtype)
-
+    print("Wan Block -- norm_hidden_states, min: ", np.min(norm_hidden_states))
+    print("Wan Block -- norm_hidden_states, max: ", np.max(norm_hidden_states))
     ff_output = self.ffn(norm_hidden_states)
+    print("Wan Block -- ff_output, min: ", np.min(ff_output))
+    print("Wan Block -- ff_output, max: ", np.max(ff_output))
     hidden_states = (hidden_states.astype(jnp.float32) + ff_output.astype(jnp.float32) * c_gate_msa).astype(hidden_states.dtype)
+    print("Wan Block -- hidden_states, min: ", np.min(hidden_states))
+    print("Wan Block -- hidden_states, max: ", np.max(hidden_states))
+    print("Wan Block -- COMPLETE -- ")
     return hidden_states

@@ -495,19 +515,32 @@ def __call__(

     rotary_emb = self.rope(hidden_states)
     hidden_states = self.patch_embedding(hidden_states)
+    print("***** After patch embedding")
+    print("hidden_states, min: ", np.min(hidden_states))
+    print("hidden_states, max: ", np.max(hidden_states))
     hidden_states = jax.lax.collapse(hidden_states, 1, -1)

     temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image = self.condition_embedder(
         timestep, encoder_hidden_states, encoder_hidden_states_image
     )
+    print("***** After condition embedder")
+    print("temb, min: ", np.min(temb))
+    print("temb, max: ", np.max(temb))
+    print("timestep_proj, min: ", np.min(timestep_proj))
+    print("timestep_proj, max: ", np.max(timestep_proj))
+    print("encoder_hidden_states min: ", np.min(encoder_hidden_states))
+    print("encoder_hidden_states max: ", np.max(encoder_hidden_states))
+
     timestep_proj = timestep_proj.reshape(timestep_proj.shape[0], 6, -1)

     if encoder_hidden_states_image is not None:
       raise NotImplementedError("img2vid is not yet implemented.")

     for block in self.blocks:
       hidden_states = block(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)
-
+      print("After block, hidden_states min:", np.min(hidden_states))
+      print("After block, hidden_states max:", np.max(hidden_states))
+      #breakpoint()
     shift, scale = jnp.split(self.scale_shift_table + jnp.expand_dims(temb, axis=1), 2, axis=1)

     hidden_states = (self.norm_out(hidden_states.astype(jnp.float32)) * (1 + scale) + shift).astype(hidden_states.dtype)