
Commit 0731a49

Add sharding annotations for the VAE; verified transformer correctness for one step.
1 parent 6973222 commit 0731a49

7 files changed

Lines changed: 552 additions & 186 deletions


src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 1 addition & 0 deletions
```diff
@@ -130,6 +130,7 @@ logical_axis_rules: [
   ['conv_batch', ['data','fsdp']],
   ['out_channels', 'tensor'],
   ['conv_out', 'fsdp'],
+  ['conv_in', 'fsdp']
 ]
 data_sharding: [['data', 'fsdp', 'tensor']]
```
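The one-line change gives the `conv_in` logical axis a rule, mapping it onto the `fsdp` mesh axis so arrays annotated with `conv_in` are sharded across FSDP devices rather than replicated. Below is a minimal sketch of how Flax resolves such rules; the mesh layout and the example array are illustrative assumptions, not taken from this repo.

```python
import jax
import jax.numpy as jnp
import numpy as np
import flax.linen as nn
from jax.sharding import Mesh

# Hypothetical 3-axis mesh; on a single host this collapses to size-1 axes.
devices = np.array(jax.devices()).reshape(1, 1, -1)
mesh = Mesh(devices, axis_names=("data", "fsdp", "tensor"))

# Rules in the spirit of base_wan_14b.yml, including the new 'conv_in' entry.
rules = (
    ("conv_batch", ("data", "fsdp")),
    ("out_channels", "tensor"),
    ("conv_out", "fsdp"),
    ("conv_in", "fsdp"),
)

x = jnp.zeros((8, 16, 32))  # illustrative (batch, in_channels, out_channels)

with mesh, nn.logical_axis_rules(rules):
  # 'conv_in' now resolves to the 'fsdp' mesh axis; before this commit it
  # had no rule, so that dimension stayed unsharded.
  x = nn.with_logical_constraint(x, ("conv_batch", "conv_in", "out_channels"))
```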

src/maxdiffusion/models/attention_flax.py

Lines changed: 10 additions & 41 deletions
```diff
@@ -101,7 +101,8 @@ def _reshape_data_for_flash(tensor, heads, flash_block_size):
   """
   Reshapes tensors for pallas flash attention adding padding to both seq_len and head_dim.
   """
-  tensor = _unflatten_heads(tensor, heads)
+  if tensor.ndim != 4:
+    tensor = _unflatten_heads(tensor, heads)

   # pad head_dim to 128 if less than that.
   kv_size = tensor.shape[-1]
```
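The new guard lets `_reshape_data_for_flash` accept tensors that already carry an explicit heads axis: a 4D input is assumed to be in (batch, heads, seq_len, head_dim) layout and passes through untouched, while 3D (batch, seq_len, heads * head_dim) inputs are split as before. A sketch of that shape convention (the real `_unflatten_heads` is defined elsewhere in this file and may differ in detail):

```python
import jax.numpy as jnp

def unflatten_heads(tensor, heads):
  # (batch, seq_len, heads * head_dim) -> (batch, heads, seq_len, head_dim)
  b, s, d = tensor.shape
  tensor = tensor.reshape(b, s, heads, d // heads)
  return jnp.transpose(tensor, (0, 2, 1, 3))

flat = jnp.zeros((2, 10, 8 * 64))
assert unflatten_heads(flat, heads=8).shape == (2, 8, 10, 64)
# A 4D input has already been split, so `if tensor.ndim != 4` skips it.
```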
```diff
@@ -319,12 +320,14 @@ def _apply_attention(
 ):
   """Routes to different attention kernels."""
   _check_attention_inputs(query, key, value)
-
+  seq_len_idx = 1
+  if query.ndim == 4:
+    seq_len_idx = 2
   if attention_kernel == "flash":
     can_use_flash_attention = (
-        query.shape[1] >= flash_min_seq_length
-        and key.shape[1] >= flash_min_seq_length
-        and value.shape[1] >= flash_min_seq_length
+        query.shape[seq_len_idx] >= flash_min_seq_length
+        and key.shape[seq_len_idx] >= flash_min_seq_length
+        and value.shape[seq_len_idx] >= flash_min_seq_length
     )
   else:
     can_use_flash_attention = True
```
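`seq_len_idx` accounts for the sequence axis moving when heads are split out: it sits at axis 1 in a 3D (batch, seq_len, dim) tensor but at axis 2 in the 4D (batch, heads, seq_len, head_dim) layout that query/key/value can now arrive in. A tiny illustration with assumed shapes:

```python
import jax.numpy as jnp

q3 = jnp.zeros((2, 4096, 1024))    # (batch, seq_len, heads * head_dim)
q4 = jnp.zeros((2, 16, 4096, 64))  # (batch, heads, seq_len, head_dim)

def seq_len(q):
  # Mirrors the seq_len_idx selection in _apply_attention.
  return q.shape[2 if q.ndim == 4 else 1]

assert seq_len(q3) == seq_len(q4) == 4096
```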
```diff
@@ -584,7 +587,6 @@ def __init__(

     if attention_kernel in {"flash", "cudnn_flash_te"} and mesh is None:
       raise ValueError(f"The flash attention kernel requires a value for mesh, but mesh is {self.mesh}")
-
     self.dim_head = dim_head
     self.heads = heads
     self.inner_dim = dim_head * heads
@@ -681,7 +683,6 @@ def _apply_rope(self, xq: jax.Array, xk: jax.Array, freqs_cis: jax.Array) -> Tup
     xq_ = jax.lax.complex(reshape_xq[..., 0], reshape_xq[..., 1])
     xk_ = jax.lax.complex(reshape_xk[..., 0], reshape_xk[..., 1])

-    freqs_cis = freqs_cis[None, None, ...]
     xq_out_complex = xq_ * freqs_cis
     xk_out_complex = xk_ * freqs_cis
```
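Deleting `freqs_cis = freqs_cis[None, None, ...]` is behavior-preserving under NumPy-style broadcasting, which implicitly left-pads the smaller operand's shape with 1s: a (seq_len, head_dim/2) complex `freqs_cis` multiplies a (batch, heads, seq_len, head_dim/2) tensor identically with or without the explicit leading axes. A quick check under that shape assumption:

```python
import jax.numpy as jnp

b, h, s, d = 1, 2, 8, 16
xq_ = jnp.ones((b, h, s, d // 2), dtype=jnp.complex64)
freqs = jnp.linspace(0.0, 1.0, s * d // 2).reshape(s, d // 2)
freqs_cis = jnp.exp(1j * freqs)

# Implicit broadcast vs. the old explicit leading axes: same result.
assert jnp.allclose(xq_ * freqs_cis, xq_ * freqs_cis[None, None, ...])
```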

```diff
@@ -696,58 +697,26 @@ def __call__(
       encoder_hidden_states: jax.Array = None,
       rotary_emb: Optional[jax.Array] = None
   ) -> jax.Array:
-    print(" -- -- WanAttention -- ")
     dtype = hidden_states.dtype
     if encoder_hidden_states is None:
       encoder_hidden_states = hidden_states
     query_proj = self.query(hidden_states)
-    print("query_proj min: ", np.min(query_proj))
-    print("query_proj max: ", np.max(query_proj))
     key_proj = self.key(encoder_hidden_states)
-    print("key_proj min: ", np.min(key_proj))
-    print("key_proj max: ", np.max(key_proj))
     value_proj = self.value(encoder_hidden_states)
-    print("value_proj min: ", np.min(value_proj))
-    print("value_proj max: ", np.max(value_proj))
-
-    query_proj = nn.with_logical_constraint(query_proj, self.query_axis_names)
-    key_proj = nn.with_logical_constraint(key_proj, self.key_axis_names)
-    value_proj = nn.with_logical_constraint(value_proj, self.value_axis_names)

     if self.qk_norm:
       query_proj = self.norm_q(query_proj)
       key_proj = self.norm_k(key_proj)
-    print("query_proj min: ", np.min(query_proj))
-    print("query_proj max: ", np.max(query_proj))
-    print("key_proj min: ", np.min(key_proj))
-    print("key_proj max: ", np.max(key_proj))
-
     if rotary_emb is not None:
       query_proj = _unflatten_heads(query_proj, self.heads)
       key_proj = _unflatten_heads(key_proj, self.heads)
-      # value_proj = _unflatten_heads(value_proj, self.heads)
+      value_proj = _unflatten_heads(value_proj, self.heads)
       query_proj, key_proj = self._apply_rope(query_proj, key_proj, rotary_emb)
-      print("Rope query_proj min: ", np.min(query_proj))
-      print("Rope query_proj max: ", np.max(query_proj))
-      print("Rope key_proj min: ", np.min(key_proj))
-      print("Rope key_proj max: ", np.max(key_proj))
-      #breakpoint()
-      query_proj = _reshape_heads_to_head_dim(query_proj)
-      key_proj = _reshape_heads_to_head_dim(key_proj)

     attn_output = self.attention_op.apply_attention(query_proj, key_proj, value_proj)
-    try:
-      print("attn_output min: ", np.min(attn_output))
-      print("attn_output_for_print max: ", np.max(attn_output))
-    except:
-      pass
     attn_output = attn_output.astype(dtype=dtype)

-    hidden_states = self.proj_attn(hidden_states)
-    print("hidden_states min: ", np.min(hidden_states))
-    print("hidden_states max: ", np.max(hidden_states))
-    print(" -- -- WanAttention DONE -- ")
-    #breakpoint()
+    hidden_states = self.proj_attn(attn_output)
     return hidden_states
```
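Beyond stripping the debug prints and breakpoints, this hunk makes two functional changes. First, `value_proj` is now unflattened into heads alongside `query_proj` and `key_proj`, and the compensating `_reshape_heads_to_head_dim` calls are gone, so all three reach `apply_attention` in the same 4D layout the `ndim` checks above were added for. Second, the output projection now consumes `attn_output` instead of `hidden_states`; the old code projected the attention input, discarding the attention result entirely. The per-projection `nn.with_logical_constraint` annotations on query/key/value were also dropped.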

src/maxdiffusion/models/embeddings_flax.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -227,7 +227,7 @@ def get_1d_rotary_pos_embed(
     out = jnp.stack([freqs_cos, -freqs_sin, freqs_sin, freqs_cos], axis=-1)
   else:
     # Wan 2.1
-    out = jax.lax.complex(jnp.cos(freqs), jnp.sin(freqs))
+    out = jnp.exp(1j * freqs)
   return out

 class NNXPixArtAlphaTextProjection(nnx.Module):
```
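The old and new expressions agree by Euler's formula, exp(i·f) = cos(f) + i·sin(f); the new form simply builds the complex rotary phases in one call. A quick numerical check:

```python
import jax
import jax.numpy as jnp

freqs = jnp.linspace(0.0, 6.28, 32)
old = jax.lax.complex(jnp.cos(freqs), jnp.sin(freqs))
new = jnp.exp(1j * freqs)  # 1j promotes the product to complex64
assert jnp.allclose(old, new)
```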
