Skip to content

Commit 5494644

Browse files
Halves inference time.
1 parent 3bedc5d commit 5494644

4 files changed

Lines changed: 40 additions & 15 deletions

File tree

src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,8 +112,8 @@ mesh_axes: ['data', 'fsdp', 'tensor']
112112
# conv_out : conv.shape[-1] weight
113113
logical_axis_rules: [
114114
['batch', 'data'],
115+
['activation_heads', 'fsdp'],
115116
['activation_batch', ['data','fsdp']],
116-
['activation_heads', 'tensor'],
117117
['activation_kv', 'tensor'],
118118
['mlp','tensor'],
119119
['embed','fsdp'],
@@ -182,6 +182,8 @@ num_train_epochs: 1
182182
seed: 0
183183
output_dir: 'sdxl-model-finetuned'
184184
per_device_batch_size: 1
185+
# If global_batch_size % jax.device_count is not 0, use FSDP sharding.
186+
global_batch_size: 0
185187

186188
warmup_steps_fraction: 0.1
187189
learning_rate_schedule_steps: -1 # By default the length of the schedule is set to the number of steps.

src/maxdiffusion/generate_wan.py

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
import jax
1717
import time
1818
from maxdiffusion.pipelines.wan.wan_pipeline import WanPipeline
19-
from maxdiffusion import pyconfig
19+
from maxdiffusion import pyconfig, max_logging
2020
from absl import app
2121
from maxdiffusion.utils import export_to_video
2222

@@ -30,9 +30,17 @@ def run(config):
3030
slg_layers = config.slg_layers
3131
slg_start = config.slg_start
3232
slg_end = config.slg_end
33+
# If global_batch_size % jax.device_count is not 0, use FSDP sharding.
34+
global_batch_size = config.global_batch_size
35+
if global_batch_size != 0:
36+
batch_multiplier = global_batch_size
37+
else:
38+
batch_multiplier = jax.device_count() * config.per_device_batch_size
3339

34-
prompt = [config.prompt] * jax.device_count()
35-
negative_prompt = [config.negative_prompt] * jax.device_count()
40+
prompt = [config.prompt] * batch_multiplier
41+
negative_prompt = [config.negative_prompt] * batch_multiplier
42+
43+
max_logging.log(f"Num steps: {config.num_inference_steps}, height: {config.height}, width: {config.width}, frames: {config.num_frames}")
3644

3745
videos = pipeline(
3846
prompt=prompt,
@@ -51,6 +59,23 @@ def run(config):
5159
for i in range(len(videos)):
5260
export_to_video(videos[i], f"wan_output_{config.seed}_{i}.mp4", fps=config.fps)
5361
s0 = time.perf_counter()
62+
videos = pipeline(
63+
prompt=prompt,
64+
negative_prompt=negative_prompt,
65+
height=config.height,
66+
width=config.width,
67+
num_frames=config.num_frames,
68+
num_inference_steps=config.num_inference_steps,
69+
guidance_scale=config.guidance_scale,
70+
slg_layers=slg_layers,
71+
slg_start=slg_start,
72+
slg_end=slg_end,
73+
)
74+
print("generation time: ", (time.perf_counter() - s0))
75+
for i in range(len(videos)):
76+
export_to_video(videos[i], f"wan_output_{config.seed}_{i}.mp4", fps=config.fps)
77+
78+
s0 = time.perf_counter()
5479
with jax.profiler.trace("/tmp/trace/"):
5580
videos = pipeline(
5681
prompt=prompt,
@@ -65,9 +90,6 @@ def run(config):
6590
slg_end=slg_end,
6691
)
6792
print("generation time: ", (time.perf_counter() - s0))
68-
for i in range(len(videos)):
69-
export_to_video(videos[i], f"wan_output_{config.seed}_{i}.mp4", fps=config.fps)
70-
7193

7294
def main(argv: Sequence[str]) -> None:
7395
pyconfig.initialize(argv)

src/maxdiffusion/models/wan/transformers/transformer_wan.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -311,29 +311,29 @@ def __init__(
311311

312312
def __call__(self, hidden_states: jax.Array, encoder_hidden_states: jax.Array, temb: jax.Array, rotary_emb: jax.Array):
313313
shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = jnp.split(
314-
(self.scale_shift_table + temb.astype(jnp.float32)), 6, axis=1
314+
(self.scale_shift_table + temb), 6, axis=1
315315
)
316316

317317
# 1. Self-attention
318-
norm_hidden_states = (self.norm1(hidden_states.astype(jnp.float32)) * (1 + scale_msa) + shift_msa).astype(
318+
norm_hidden_states = (self.norm1(hidden_states) * (1 + scale_msa) + shift_msa).astype(
319319
hidden_states.dtype
320320
)
321321
attn_output = self.attn1(
322322
hidden_states=norm_hidden_states, encoder_hidden_states=norm_hidden_states, rotary_emb=rotary_emb
323323
)
324-
hidden_states = (hidden_states.astype(jnp.float32) + attn_output * gate_msa).astype(hidden_states.dtype)
324+
hidden_states = (hidden_states + attn_output * gate_msa).astype(hidden_states.dtype)
325325

326326
# 2. Cross-attention
327-
norm_hidden_states = self.norm2(hidden_states.astype(jnp.float32))
327+
norm_hidden_states = self.norm2(hidden_states)
328328
attn_output = self.attn2(hidden_states=norm_hidden_states, encoder_hidden_states=encoder_hidden_states)
329329
hidden_states = hidden_states + attn_output
330330

331331
# 3. Feed-forward
332-
norm_hidden_states = (self.norm3(hidden_states.astype(jnp.float32)) * (1 + c_scale_msa) + c_shift_msa).astype(
332+
norm_hidden_states = (self.norm3(hidden_states) * (1 + c_scale_msa) + c_shift_msa).astype(
333333
hidden_states.dtype
334334
)
335335
ff_output = self.ffn(norm_hidden_states)
336-
hidden_states = (hidden_states.astype(jnp.float32) + ff_output.astype(jnp.float32) * c_gate_msa).astype(
336+
hidden_states = (hidden_states + ff_output * c_gate_msa).astype(
337337
hidden_states.dtype
338338
)
339339
return hidden_states
@@ -485,7 +485,7 @@ def __call__(
485485
)
486486
shift, scale = jnp.split(self.scale_shift_table + jnp.expand_dims(temb, axis=1), 2, axis=1)
487487

488-
hidden_states = (self.norm_out(hidden_states.astype(jnp.float32)) * (1 + scale) + shift).astype(hidden_states.dtype)
488+
hidden_states = (self.norm_out(hidden_states) * (1 + scale) + shift).astype(hidden_states.dtype)
489489
hidden_states = self.proj_out(hidden_states)
490490

491491
hidden_states = hidden_states.reshape(

src/maxdiffusion/pipelines/wan/wan_pipeline.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import flax
2222
import flax.linen as nn
2323
from flax import nnx
24+
from flax.linen import partitioning as nn_partitioning
2425
from ...pyconfig import HyperParameters
2526
from ... import max_logging
2627
from ... import max_utils
@@ -420,7 +421,7 @@ def __call__(
420421
num_transformer_layers=self.transformer.config.num_layers,
421422
)
422423

423-
with self.mesh:
424+
with self.mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules):
424425
latents = p_run_inference(
425426
graphdef=graphdef,
426427
sharded_state=state,

0 commit comments

Comments (0)