Commit 3f6eb05

single forward loop.
1 parent 125dcfa · commit 3f6eb05

4 files changed: 36 additions & 17 deletions


src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 12 additions & 2 deletions
```diff
@@ -52,7 +52,17 @@ from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te

-flash_block_sizes: {}
+#flash_block_sizes: {}
+flash_block_sizes: {
+  "block_q" : 2048,
+  "block_kv_compute" : 2048,
+  "block_kv" : 2048,
+  "block_q_dkv" : 2048,
+  "block_kv_dkv" : 2048,
+  "block_kv_dkv_compute" : 2048,
+  "block_q_dq" : 2048,
+  "block_kv_dq" : 2048
+}
 # GroupNorm groups
 norm_num_groups: 32

@@ -112,7 +122,7 @@ mesh_axes: ['data', 'fsdp', 'tensor']
 # conv_out : conv.shape[-1] weight
 logical_axis_rules: [
   ['batch', 'data'],
-  #['activation_heads', 'fsdp'],
+  ['activation_heads', 'tensor'],
   ['activation_length', 'fsdp'],
   #['activation_heads', 'fsdp'],
   #['activation_heads', 'fsdp'],
```
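For reference, the eight keys in this mapping line up with the fields of the splash-attention `BlockSizes` dataclass in JAX's pallas TPU kernels. A minimal sketch of how a dict like this could be mapped onto it; the helper `block_sizes_from_config` is hypothetical, not code from this commit:

```python
# Hypothetical helper: turn the YAML mapping above into a pallas BlockSizes.
# Assumes the eight keys match the splash-attention BlockSizes fields.
from jax.experimental.pallas.ops.tpu.splash_attention import splash_attention_kernel

def block_sizes_from_config(cfg: dict) -> splash_attention_kernel.BlockSizes:
  # Each entry caps one tile dimension of the forward/backward kernels;
  # 2048 everywhere trades memory for fewer grid steps.
  return splash_attention_kernel.BlockSizes(
      block_q=cfg["block_q"],
      block_kv=cfg["block_kv"],
      block_kv_compute=cfg["block_kv_compute"],
      block_q_dkv=cfg["block_q_dkv"],
      block_kv_dkv=cfg["block_kv_dkv"],
      block_kv_dkv_compute=cfg["block_kv_dkv_compute"],
      block_q_dq=cfg["block_q_dq"],
      block_kv_dq=cfg["block_kv_dq"],
  )
```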

src/maxdiffusion/models/attention_flax.py

Lines changed: 6 additions & 3 deletions
```diff
@@ -162,7 +162,7 @@ def _tpu_flash_attention(
 ) -> jax.Array:
   """TPU Flash Attention"""

-  max_block_size = 768#1024 if dtype == jnp.bfloat16 else 512
+  max_block_size = 1024 if dtype == jnp.bfloat16 else 512
   if flash_block_sizes:
     block_sizes = flash_block_sizes
   else:
@@ -205,8 +205,8 @@ def wrap_splash_kernel(multi_head_mask, shard_head_size=1):
   )
   return splash_kernel

-  shard_head_size = 1
-  mask = splash_attention_mask.FullMask(_shape=(query.shape[2], query.shape[2]))
+  shard_head_size = mesh.shape["tensor"]
+  mask = splash_attention_mask.FullMask(_shape=(query.shape[2], key.shape[2]))
   multi_head_mask = splash_attention_mask.MultiHeadMask(masks=(mask,) * query.shape[1])
   splash_kernel = wrap_splash_kernel(multi_head_mask, int(shard_head_size))
   segment_axis_names_splash_kernel = splash_kernel.manual_sharding_spec(named_sharding)
@@ -223,7 +223,10 @@ def wrap_splash_kernel(multi_head_mask, shard_head_size=1):
     check_rep=False
   )
   def wrap_flash_attention(query, key, value, splash_kernel):
+    #full_k = jax.lax.all_to_all(key, axis_name='fsdp', split_axis=2, concat_axis=2, tiled=True)
+    #full_v = jax.lax.all_to_all(value, axis_name='fsdp', split_axis=2, concat_axis=2, tiled=True)
     attention_output = jax.vmap(splash_kernel)(query, key, value)
+    #attention_output = jax.vmap(splash_kernel)(query, full_k, full_v)
     return attention_output

   devices_in_data_fsdp = mesh.shape["data"] * mesh.shape["fsdp"]
```
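Two fixes land here: `max_block_size` returns to its dtype-dependent default instead of the hard-coded 768, and the full mask is built over (q_len, kv_len) rather than (q_len, q_len), which matters whenever the key/value length differs from the query length (cross-attention). The head count used for kernel sharding now follows the mesh's 'tensor' axis, matching the `['activation_heads', 'tensor']` rule added in the config. A standalone sketch of the mask construction, with illustrative shapes:

```python
# Standalone sketch of the mask setup used above; shapes are illustrative.
from jax.experimental.pallas.ops.tpu.splash_attention import splash_attention_mask

num_heads, q_len, kv_len = 16, 4096, 512

# FullMask must cover (q_len, kv_len); using (q_len, q_len) breaks
# cross-attention, where key/value length differs from query length.
mask = splash_attention_mask.FullMask(_shape=(q_len, kv_len))
multi_head_mask = splash_attention_mask.MultiHeadMask(masks=(mask,) * num_heads)
```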

src/maxdiffusion/models/wan/transformers/transformer_wan.py

Lines changed: 8 additions & 1 deletion
```diff
@@ -469,11 +469,18 @@ def __call__(

     if encoder_hidden_states_image is not None:
       raise NotImplementedError("img2vid is not yet implemented.")
+    def skip_block_true(hidden_states):
+      split_bs = hidden_states.shape[0] // 2
+      prev_neg_hidden_states = hidden_states[split_bs:]
+      hidden_states = block(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)
+      hidden_states = jnp.concatenate([hidden_states[:split_bs], prev_neg_hidden_states], axis=0)
+      return hidden_states
+
     for block_idx, block in enumerate(self.blocks):
       should_skip_block = slg_mask[block_idx] & is_uncond
       hidden_states = jax.lax.cond(
           should_skip_block,
-          lambda hs: hs,  # If true, pass through original hidden_states (skip block)
+          lambda _: skip_block_true(hidden_states),  # if true, run the block on the cond half and keep the uncond half unchanged (skip block for uncond)
           lambda _: block(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb),
           hidden_states,
       )
```
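Both branches of `jax.lax.cond` must produce outputs of identical shape, so the "skip" branch cannot simply drop half the batch: it runs the block on the full batch, then splices the saved unconditional half back over the block's output. A toy sketch of the pattern; `block` below is a stand-in, not the repo's transformer block:

```python
# Toy sketch of batched skip-layer guidance: cond and uncond are stacked in one
# batch, and "skipping" a block applies only to the uncond half.
import jax
import jax.numpy as jnp

def block(h):  # stand-in for a transformer block
  return h * 2.0

def run_block_skip_uncond(h):
  split_bs = h.shape[0] // 2
  prev_neg = h[split_bs:]           # save the uncond half of the input
  h = block(h)                      # run the block on the full batch
  # keep the block's output for the cond half, the saved input for the uncond half
  return jnp.concatenate([h[:split_bs], prev_neg], axis=0)

h = jnp.arange(8.0).reshape(4, 2)   # batch of 4: rows 0-1 cond, rows 2-3 uncond
out = jax.lax.cond(jnp.array(True), run_block_skip_uncond, block, h)
```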

src/maxdiffusion/pipelines/wan/wan_pipeline.py

Lines changed: 10 additions & 11 deletions
```diff
@@ -470,11 +470,17 @@ def run_inference(
     slg_end: float = 1.0,
 ):
   do_classifier_free_guidance = guidance_scale > 1.0
+  if do_classifier_free_guidance:
+    prompt_embeds = jnp.concatenate([prompt_embeds, negative_prompt_embeds], axis=0)
   for step in range(num_inference_steps):
     slg_mask = jnp.zeros(num_transformer_layers, dtype=jnp.bool_)
     if slg_layers and int(slg_start * num_inference_steps) <= step < int(slg_end * num_inference_steps):
       slg_mask = slg_mask.at[jnp.array(slg_layers)].set(True)
     t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step]
+    # get original batch size before concat in case of cfg.
+    bsz = latents.shape[0]
+    if do_classifier_free_guidance:
+      latents = jnp.concatenate([latents] * 2)
     timestep = jnp.broadcast_to(t, latents.shape[0])

     noise_pred = transformer_forward_pass(
@@ -484,21 +490,14 @@ def run_inference(
         latents,
         timestep,
         prompt_embeds,
-        is_uncond=jnp.array(False, dtype=jnp.bool_),
+        is_uncond=jnp.array(True, dtype=jnp.bool_),
         slg_mask=slg_mask,
     )

     if do_classifier_free_guidance:
-      noise_uncond = transformer_forward_pass(
-          graphdef,
-          sharded_state,
-          rest_of_state,
-          latents,
-          timestep,
-          negative_prompt_embeds,
-          is_uncond=jnp.array(True, dtype=jnp.bool_),
-          slg_mask=slg_mask,
-      )
+      noise_uncond = noise_pred[bsz:]
+      noise_pred = noise_pred[:bsz]
       noise_pred = noise_uncond + guidance_scale * (noise_pred - noise_uncond)
+      latents = latents[:bsz]
     latents, scheduler_state = scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple()
   return latents
```
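This is the "single forward loop" of the commit title: the positive and negative prompt embeddings are concatenated once up front, the latents are duplicated each step, and one `transformer_forward_pass` call replaces the former second pass for the negative prompt (with `is_uncond=True` so the skip-layer logic above can act on the unconditional half). A self-contained sketch of the pattern; `fake_model` stands in for the transformer and the shapes are illustrative:

```python
# Self-contained sketch of single-pass classifier-free guidance.
import jax.numpy as jnp

def fake_model(latents, embeds):
  # pretend noise prediction that depends on the text embedding
  return latents + embeds.mean(axis=1, keepdims=True)

guidance_scale = 5.0
latents = jnp.ones((2, 4))                        # original batch, bsz = 2
prompt_embeds = jnp.ones((2, 8))
negative_prompt_embeds = jnp.zeros((2, 8))

bsz = latents.shape[0]                            # remember size before concat
embeds = jnp.concatenate([prompt_embeds, negative_prompt_embeds], axis=0)
batched = jnp.concatenate([latents] * 2)          # [cond; uncond]

noise = fake_model(batched, embeds)               # one forward pass, not two
noise_uncond, noise_pred = noise[bsz:], noise[:bsz]
noise_pred = noise_uncond + guidance_scale * (noise_pred - noise_uncond)
latents = batched[:bsz]                           # drop the duplicated half
```

The trade-off is a doubled batch dimension through the transformer in exchange for a single compiled forward per step.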
