Commit 6760ab9

Switch WAN configs to tensor-parallel and fix activation constraints
Priority 1: Switch default parallelism from context-parallel to tensor-parallel

- Changed ici_tensor_parallelism from 1 to -1 (auto) and ici_context_parallelism from -1 to 1 across all WAN T2V configs (14B, 1.3B, 27B/A14B).
- Updated logical_axis_rules to match the TP strategy:
  - 'embed' axis: ['context', 'fsdp'] -> ['fsdp', 'tensor'], so QKV/FFN input dimensions are properly sharded across TP devices.
  - 'activation_self_attn_heads': ['context', 'tensor'] -> 'tensor' (pure TP head sharding for the self-attention splash kernel).
  - 'activation_cross_attn_q_length': ['context', 'tensor'] -> 'tensor' (Q sequence sharding for the cross-attention splash kernel).
  - 'activation_length': 'context' -> None (no sequence sharding in TP mode).
  - Conv axes updated to use 'tensor' instead of 'context'.
- This aligns with the reference torchax benchmark, which uses pure TP and achieves ~1.8x faster inference than context-parallel mode.

Priority 4: Fix activation constraints for TP compatibility

- WanTransformerBlock.__call__: changed the hidden_states constraint from ('activation_batch', 'activation_length', 'activation_heads') to ('activation_batch', 'activation_length', None).
- FlaxWanAttention.__call__: changed the constraint from (BATCH, LENGTH, HEAD) to (BATCH, LENGTH, None).

The model dim of hidden_states between blocks should remain replicated (not sharded on the tensor axis) because column-parallel QKV projections expect replicated input. The old constraint forced the model dim onto the tensor axis, which in TP mode caused an unnecessary all-gather (resharding back to replicated) before every attention block. In context-parallel mode, tensor=1 made this constraint a no-op, so the change is backwards-compatible.
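Why replicated input matters for column-parallel QKV, as a minimal numpy sketch (shapes are invented for illustration; this is not code from the repo):

# Column-parallel QKV with a replicated input needs no collective before the
# matmul: each TP shard multiplies the full activation by its own column block.
import numpy as np

d_model, n_heads, head_dim, tp = 16, 4, 4, 2
x = np.random.randn(3, d_model)                 # activations, replicated per device
w_qkv = np.random.randn(d_model, n_heads * head_dim)

shards = np.split(w_qkv, tp, axis=1)            # each device owns a column block
local_out = [x @ w for w in shards]             # purely local compute, no comms

# Concatenating the per-device results reproduces the unsharded matmul exactly.
assert np.allclose(np.concatenate(local_out, axis=1), x @ w_qkv)

If the model dim were instead sharded on entry, each device would first have to all-gather x before it could form its local product, which is the collective the new constraints avoid.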
1 parent ceca471 commit 6760ab9

5 files changed

Lines changed: 31 additions & 27 deletions

File tree

src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 12 additions & 8 deletions
@@ -165,20 +165,24 @@ mesh_axes: ['data', 'fsdp', 'context', 'tensor']
 # keep_2 : conv.shape[1] weight
 # conv_in : conv.shape[2] weight
 # conv_out : conv.shape[-1] weight
+#
+# Default: Tensor Parallel (TP) mode — shards QKV/FFN weights across tensor axis.
+# For context-parallel mode (sequence sharding), set ici_context_parallelism: -1,
+# ici_tensor_parallelism: 1, and swap embed to ['context', 'fsdp'].
 logical_axis_rules: [
   ['batch', ['data', 'fsdp']],
   ['activation_batch', ['data', 'fsdp']],
-  ['activation_self_attn_heads', ['context', 'tensor']],
-  ['activation_cross_attn_q_length', ['context', 'tensor']],
-  ['activation_length', 'context'],
+  ['activation_self_attn_heads', 'tensor'],
+  ['activation_cross_attn_q_length', 'tensor'],
+  ['activation_length', None],
   ['activation_heads', 'tensor'],
   ['mlp','tensor'],
-  ['embed', ['context', 'fsdp']],
+  ['embed', ['fsdp', 'tensor']],
   ['heads', 'tensor'],
   ['norm', 'tensor'],
-  ['conv_batch', ['data', 'context', 'fsdp']],
+  ['conv_batch', ['data', 'fsdp']],
   ['out_channels', 'tensor'],
-  ['conv_out', 'context'],
+  ['conv_out', 'tensor'],
 ]
 data_sharding: [['data', 'fsdp', 'context', 'tensor']]

@@ -192,8 +196,8 @@ dcn_context_parallelism: -1
 dcn_tensor_parallelism: 1
 ici_data_parallelism: 1
 ici_fsdp_parallelism: 1
-ici_context_parallelism: -1 # recommended ICI axis to be auto-sharded
-ici_tensor_parallelism: 1
+ici_context_parallelism: 1
+ici_tensor_parallelism: -1 # recommended ICI axis to be auto-sharded

 allow_split_physical_axes: False
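For intuition on the -1 ("auto") value, here is a hypothetical helper (not MaxDiffusion's actual mesh-construction code) showing how an auto axis typically absorbs whatever device count the explicit axes leave over:

# Hypothetical sketch: resolve a single -1 axis so the mesh shape multiplies
# out to the total device count.
from math import prod

def resolve_mesh_shape(parallelism: dict, num_devices: int) -> dict:
    explicit = prod(v for v in parallelism.values() if v != -1)
    assert num_devices % explicit == 0, "explicit axes must divide device count"
    return {k: num_devices // explicit if v == -1 else v
            for k, v in parallelism.items()}

# With the new defaults on an 8-chip slice, the tensor axis takes all 8 devices:
print(resolve_mesh_shape({"data": 1, "fsdp": 1, "context": 1, "tensor": -1}, 8))
# -> {'data': 1, 'fsdp': 1, 'context': 1, 'tensor': 8}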

src/maxdiffusion/configs/base_wan_1_3b.yml

Lines changed: 9 additions & 9 deletions
@@ -1,4 +1,4 @@
-# Copyright 2023 Google LLC
+# Copyright 2023 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -145,17 +145,17 @@ mesh_axes: ['data', 'fsdp', 'context', 'tensor']
 logical_axis_rules: [
   ['batch', ['data', 'fsdp']],
   ['activation_batch', ['data', 'fsdp']],
-  ['activation_self_attn_heads', ['context', 'tensor']],
-  ['activation_cross_attn_q_length', ['context', 'tensor']],
-  ['activation_length', 'context'],
+  ['activation_self_attn_heads', 'tensor'],
+  ['activation_cross_attn_q_length', 'tensor'],
+  ['activation_length', None],
   ['activation_heads', 'tensor'],
   ['mlp','tensor'],
-  ['embed', ['context', 'fsdp']],
+  ['embed', ['fsdp', 'tensor']],
   ['heads', 'tensor'],
   ['norm', 'tensor'],
-  ['conv_batch', ['data', 'context', 'fsdp']],
+  ['conv_batch', ['data', 'fsdp']],
   ['out_channels', 'tensor'],
-  ['conv_out', 'context'],
+  ['conv_out', 'tensor'],
 ]
 data_sharding: [['data', 'fsdp', 'context', 'tensor']]

@@ -169,8 +169,8 @@ dcn_context_parallelism: -1
 dcn_tensor_parallelism: 1
 ici_data_parallelism: 1
 ici_fsdp_parallelism: 1
-ici_context_parallelism: -1 # recommended ICI axis to be auto-sharded
-ici_tensor_parallelism: 1
+ici_context_parallelism: 1
+ici_tensor_parallelism: -1 # recommended ICI axis to be auto-sharded

 allow_split_physical_axes: False

src/maxdiffusion/configs/base_wan_27b.yml

Lines changed: 8 additions & 8 deletions
@@ -156,17 +156,17 @@ mesh_axes: ['data', 'fsdp', 'context', 'tensor']
 logical_axis_rules: [
   ['batch', ['data', 'fsdp']],
   ['activation_batch', ['data', 'fsdp']],
-  ['activation_self_attn_heads', ['context', 'tensor']],
-  ['activation_cross_attn_q_length', ['context', 'tensor']],
-  ['activation_length', 'context'],
+  ['activation_self_attn_heads', 'tensor'],
+  ['activation_cross_attn_q_length', 'tensor'],
+  ['activation_length', None],
   ['activation_heads', 'tensor'],
   ['mlp','tensor'],
-  ['embed', ['context', 'fsdp']],
+  ['embed', ['fsdp', 'tensor']],
   ['heads', 'tensor'],
   ['norm', 'tensor'],
-  ['conv_batch', ['data', 'context', 'fsdp']],
+  ['conv_batch', ['data', 'fsdp']],
   ['out_channels', 'tensor'],
-  ['conv_out', 'context'],
+  ['conv_out', 'tensor'],
 ]
 data_sharding: [['data', 'fsdp', 'context', 'tensor']]

@@ -180,8 +180,8 @@ dcn_context_parallelism: -1
 dcn_tensor_parallelism: 1
 ici_data_parallelism: 1
 ici_fsdp_parallelism: 1
-ici_context_parallelism: -1 # recommended ICI axis to be auto-sharded
-ici_tensor_parallelism: 1
+ici_context_parallelism: 1
+ici_tensor_parallelism: -1 # recommended ICI axis to be auto-sharded

 allow_split_physical_axes: False

src/maxdiffusion/models/attention_flax.py

Lines changed: 1 addition & 1 deletion
@@ -1133,7 +1133,7 @@ def __call__(
      deterministic: bool = True,
      rngs: nnx.Rngs = None,
  ) -> jax.Array:
-    axis_names = nn.logical_to_mesh_axes((BATCH, LENGTH, HEAD))
+    axis_names = nn.logical_to_mesh_axes((BATCH, LENGTH, None))
     hidden_states = jax.lax.with_sharding_constraint(hidden_states, axis_names)
     encoder_hidden_states = jax.lax.with_sharding_constraint(encoder_hidden_states, axis_names)
     dtype = hidden_states.dtype
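A runnable sketch of what the old and new constraints resolve to, assuming the BATCH/LENGTH/HEAD constants stand for the activation_* logical names and using a subset of the updated rules:

# Sketch with flax's logical-axis API; rules below mirror part of the config.
import flax.linen as nn

rules = (("activation_batch", ("data", "fsdp")),
         ("activation_length", None),
         ("activation_heads", "tensor"))

with nn.logical_axis_rules(rules):
    # Old constraint: the model dim was forced onto the tensor axis.
    print(nn.logical_to_mesh_axes(
        ("activation_batch", "activation_length", "activation_heads")))
    # PartitionSpec(('data', 'fsdp'), None, 'tensor')

    # New constraint: a literal None bypasses the rules, leaving the model dim
    # unconstrained (replicated), which column-parallel QKV expects.
    print(nn.logical_to_mesh_axes(
        ("activation_batch", "activation_length", None)))
    # PartitionSpec(('data', 'fsdp'), None, None)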

src/maxdiffusion/models/wan/transformers/transformer_wan.py

Lines changed: 1 addition & 1 deletion
@@ -379,7 +379,7 @@ def __call__(
     shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = jnp.split(
         (self.adaln_scale_shift_table + temb.astype(jnp.float32)), 6, axis=1
     )
-    axis_names = nn.logical_to_mesh_axes(("activation_batch", "activation_length", "activation_heads"))
+    axis_names = nn.logical_to_mesh_axes(("activation_batch", "activation_length", None))
     hidden_states = jax.lax.with_sharding_constraint(hidden_states, axis_names)
     hidden_states = checkpoint_name(hidden_states, "hidden_states")
     axis_names = nn.logical_to_mesh_axes(("activation_batch", "activation_length", "activation_kv"))
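To see the replication physically, a small sketch on faked CPU devices (the mesh sizes and the 2x128x1536 array shape are invented for illustration, not taken from a real TPU slice):

# Run as a fresh script so the XLA flag takes effect before jax is imported.
import os
os.environ["XLA_FLAGS"] = "--xla_force_host_platform_device_count=8"

import numpy as np
import jax
import jax.numpy as jnp
from jax.sharding import Mesh, NamedSharding, PartitionSpec

mesh = Mesh(np.array(jax.devices()).reshape(1, 1, 1, 8),
            ("data", "fsdp", "context", "tensor"))
x = jnp.ones((2, 128, 1536))

# Old behavior: model dim sharded on the tensor axis -> each device holds 1/8.
old = jax.device_put(x, NamedSharding(mesh, PartitionSpec(("data", "fsdp"), None, "tensor")))
print(old.sharding.shard_shape(x.shape))  # (2, 128, 192)

# New behavior: model dim unconstrained -> fully replicated on every device.
new = jax.device_put(x, NamedSharding(mesh, PartitionSpec(("data", "fsdp"), None, None)))
print(new.sharding.shard_shape(x.shape))  # (2, 128, 1536)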
