Commit 796eaeb

Weight gathering in multiple axes
Replaces the hardcoded `is_fsdp_shard_on_exp` flag in the Megablox GMM kernel with a `weight_gather_axes` parameter. This allows specifying multiple mesh axes and tensor dimensions along which quantized weights are gathered before the computation, and along which gradients are scattered back in the backward pass. The MoE layer now computes the required gather axes from each weight's partition spec and the quantization settings. This is needed for sharding weights along multiple axes, such as `ici_fsdp_transpose_parallelism=2` and `ici_fsdp_parallelism=256` in a 256-chip configuration for DeepSeek v3 FP8 quantized training.

PiperOrigin-RevId: 839652914
1 parent 60028c4 commit 796eaeb
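
For reference, a minimal sketch of the new parameter's format (the axis names and dimensions below are illustrative, not taken from a real config): each entry pairs a mesh axis name with the tensor dimension of the weight that is sharded along it.

# Hypothetical example values: gather the quantized expert weights over two
# mesh axes before the grouped matmul, and reduce-scatter the weight gradient
# back over the same axes in the backward pass.
weight_gather_axes = [
    ("fsdp", 0),            # experts dimension sharded on the fsdp axis
    ("fsdp_transpose", 2),  # mlp dimension sharded on the fsdp_transpose axis
]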

2 files changed: 44 additions & 19 deletions


src/MaxText/kernels/megablox/ops.py

Lines changed: 13 additions & 10 deletions
@@ -18,7 +18,7 @@
 
 import functools
 import dataclasses
-from typing import Literal
+from typing import Literal, List, Tuple
 import jax
 import jax.numpy as jnp
 from MaxText.kernels.megablox import backend
@@ -41,7 +41,7 @@ def gmm(
     rhs_quantize_dtype: Literal[jnp.int4, jnp.int8] | None = None,
     use_qwix_quantization: bool = False,
     use_tokamax_backend: bool = False,
-    is_fsdp_shard_on_exp: bool = False,
+    weight_gather_axes: List[Tuple[str, int]] | None = None,
 ):
   """Grouped matrix multiplication operation."""
   quantization_rule = None
@@ -75,7 +75,7 @@ def gmm(
       interpret,
       quantization_rule,
       use_tokamax_backend,
-      is_fsdp_shard_on_exp,
+      weight_gather_axes,
   )
 
 
@@ -91,7 +91,7 @@ def _gmm_fwd(
     interpret: bool = False,
     quantization_rule: qwix.QtRule | None = None,
    use_tokamax_backend: bool = False,
-    is_fsdp_shard_on_exp: bool = False,
+    weight_gather_axes: List[Tuple[str, int]] | None = None,
 ) -> tuple[
     jnp.ndarray,
     tuple[
@@ -128,10 +128,11 @@ def _gmm_fwd(
     if (
         quantization_rule.weight_calibration_method.startswith("fixed")
         and isinstance(rhs, qpl.QArray)
-        and is_fsdp_shard_on_exp
     ):
-      rhs_qvalue = jax.lax.all_gather(rhs.qvalue, "fsdp", axis=0, tiled=True)
-      rhs = dataclasses.replace(rhs, qvalue=rhs_qvalue)
+      if weight_gather_axes:
+        for axis_name, axis_idx in weight_gather_axes:
+          rhs_qvalue = jax.lax.all_gather(rhs.qvalue, axis_name, axis=axis_idx, tiled=True)
+          rhs = dataclasses.replace(rhs, qvalue=rhs_qvalue)
     out = tokamax_backend.gmm(
         lhs=lhs,
         rhs=rhs,
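
To see the forward-pass gather in isolation, here is a self-contained sketch, not MaxText code: the device count, shapes, axis names, and the gather_qvalue helper are assumptions chosen to mirror the loop above. Repeated tiled all_gather calls rebuild a weight that is sharded along two mesh axes.

import functools

import jax
import jax.numpy as jnp
import numpy as np
# Depending on JAX version, shard_map may also be available as jax.shard_map.
from jax.experimental.shard_map import shard_map
from jax.sharding import Mesh, PartitionSpec as P

# Assumes at least 4 devices, e.g. run on CPU with
# XLA_FLAGS=--xla_force_host_platform_device_count=4
mesh = Mesh(np.array(jax.devices()[:4]).reshape(2, 2), axis_names=("fsdp", "fsdp_transpose"))

# Stand-in for rhs.qvalue: [experts=4, emb=8, mlp=8], sharded on dims 0 and 2.
qvalue = jnp.ones((4, 8, 8), dtype=jnp.int8)
weight_gather_axes = [("fsdp", 0), ("fsdp_transpose", 2)]

@functools.partial(
    shard_map,
    mesh=mesh,
    in_specs=P("fsdp", None, "fsdp_transpose"),
    out_specs=P(),  # fully replicated after both gathers
)
def gather_qvalue(shard):  # per-device block is [2, 8, 4]
  for axis_name, axis_idx in weight_gather_axes:
    # tiled=True concatenates the shards along axis_idx instead of stacking
    # them on a new leading axis.
    shard = jax.lax.all_gather(shard, axis_name, axis=axis_idx, tiled=True)
  return shard

assert gather_qvalue(qvalue).shape == (4, 8, 8)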
@@ -167,7 +168,7 @@ def _gmm_bwd(
     interpret: bool,
     quantization_rule: qwix.QtRule | None,
     use_tokamax_backend: bool,
-    is_fsdp_shard_on_exp: bool,
+    weight_gather_axes: List[Tuple[str, int]] | None,
     residual: tuple[
         jnp.ndarray | qpl.QArray,
         jnp.ndarray | qpl.QArray,
@@ -241,8 +242,10 @@ def _gmm_bwd(
         num_actual_groups=num_actual_groups,
         interpret=interpret,
     )
-    if quantization_rule and quantization_rule.bwd_qtype and is_fsdp_shard_on_exp:
-      drhs = jax.lax.psum_scatter(drhs, "fsdp", scatter_dimension=0, tiled=True)
+    if quantization_rule and quantization_rule.bwd_qtype and weight_gather_axes:
+      # Scatter back in reverse order of gather
+      for axis_name, axis_idx in reversed(weight_gather_axes):
+        drhs = jax.lax.psum_scatter(drhs, axis_name, scatter_dimension=axis_idx, tiled=True)
   else:
     dlhs = backend.gmm(
         dlhs_dout,
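
The backward hunk mirrors the forward loop: the gradient of a tiled all_gather along a mesh axis is a tiled psum_scatter along the same axis, so undoing the gathers in reverse order restores the weight gradient's original sharded layout. A minimal sketch under the same illustrative mesh and shapes as the example above (the scatter_weight_grad helper is hypothetical and would run inside a shard_map context where the axis names are bound):

import jax

def scatter_weight_grad(drhs, weight_gather_axes):
  # drhs: per-device gradient of the gathered weight, e.g. [4, 8, 8].
  for axis_name, axis_idx in reversed(weight_gather_axes):
    # After ("fsdp_transpose", 2): [4, 8, 4]; after ("fsdp", 0): [2, 8, 4],
    # matching the original sharded qvalue block.
    drhs = jax.lax.psum_scatter(drhs, axis_name, scatter_dimension=axis_idx, tiled=True)
  return drhs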

src/MaxText/layers/moe.py

Lines changed: 31 additions & 9 deletions
@@ -35,6 +35,7 @@
 from MaxText.layers import attentions, linears, nnx_wrappers, quantizations
 from MaxText.layers.initializers import NdInitializer, default_bias_init, nd_dense_init, variable_to_logically_partitioned
 import numpy as np
+import qwix.pallas as qpl
 import tokamax
 
 set_xla_metadata = xla_metadata.set_xla_metadata
@@ -792,7 +793,7 @@ def sparse_matmul(
   ):
     """Perform sparse matrix multiplication of inputs and Experts."""
 
-    def gmm(inputs, kernel, tiling, group_sizes, expert_assignments):
+    def gmm(inputs, kernel, tiling, group_sizes, expert_assignments, weight_gather_axes):
       pad_length = self.config.wi_tile_fwd_batch_seq
       hs_shape = inputs.shape
       # pad length is the 1st dimension of tiling size in gmm call
@@ -830,7 +831,7 @@ def gmm(inputs, kernel, tiling, group_sizes, expert_assignments):
             rhs_quantize_dtype=rhs_quantize_dtype,
             use_qwix_quantization=self.config.use_qwix_quantization,
             use_tokamax_backend=self.config.use_tokamax_gmm,
-            is_fsdp_shard_on_exp=self.config.fsdp_shard_on_exp,
+            weight_gather_axes=weight_gather_axes,
         )
       else:
         output = tokamax.ragged_dot(
@@ -853,7 +854,7 @@ def gmm(inputs, kernel, tiling, group_sizes, expert_assignments):
             rhs_quantize_dtype=rhs_quantize_dtype,
             use_qwix_quantization=self.config.use_qwix_quantization,
             use_tokamax_backend=self.config.use_tokamax_gmm,
-            is_fsdp_shard_on_exp=self.config.fsdp_shard_on_exp,
+            weight_gather_axes=weight_gather_axes,
         )
       else:
         rhs_inputs = kernel
@@ -935,12 +936,15 @@ def gmm(inputs, kernel, tiling, group_sizes, expert_assignments):
 
     # w0, w1, wo needs to be un sharded on fsdp / fsdp_transpose axis, so use
    # mlp_no_fsdp axis
+    weight_gather = False
     if self.config.fsdp_shard_on_exp:
-      if self.config.quantization:
-        # special sharding when quantization is enabled with fsdp_shard_on_exp
+      quantization_rule = qpl.get_current_rule("gmm")
+      if quantization_rule and quantization_rule.weight_calibration_method.startswith("fixed"):
+        # special sharding when using static scaling for weights in quantization with fsdp_shard_on_exp
         w0_pspec = nn.logical_to_mesh_axes(self.wi_kernel_axes)
         w1_pspec = nn.logical_to_mesh_axes(self.wi_kernel_axes)
         wo_pspec = nn.logical_to_mesh_axes(self.wo_kernel_axes)
+        weight_gather = True
       else:
         # special sharding for dsv3 to remove overhead between gmm/AG
         w0_pspec = nn.logical_to_mesh_axes(("embed_tensor_transpose", None, "mlp_no_fsdp"))
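
Put differently, with fsdp_shard_on_exp the layer now keys off the active qwix rule instead of the generic quantization flag: only a "fixed*" (static-scale) weight calibration keeps the fsdp-sharded kernel axes and lets the kernel gather the quantized values itself. A condensed sketch of that decision, reusing the names from the hunk above (not a verbatim excerpt):

# Condensed sketch of the branch above.
quantization_rule = qpl.get_current_rule("gmm")
static_weight_scales = bool(
    quantization_rule
    and quantization_rule.weight_calibration_method.startswith("fixed")
)
# Gather inside the gmm kernel only when the weights stay sharded on the
# fsdp / fsdp_transpose axes and use static (fixed) scales.
weight_gather = self.config.fsdp_shard_on_exp and static_weight_scales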
@@ -1069,7 +1073,25 @@ def wrapper(x, logits, pre_bias_logits, w0, w1, wo, w0_bias, w1_bias, wo_bias, r
 
     if self.config.mlp_bias:
       w0_bias, w1_bias, wo_bias = self.transform_bias(selected_experts, w0_bias, w1_bias, wo_bias)
-
+    def get_active_sharding_axes(pspec_dim_axes, tensor_dim_index):
+      if pspec_dim_axes is None: return []
+      axes = (pspec_dim_axes,) if isinstance(pspec_dim_axes, str) else pspec_dim_axes
+      active = []
+      for ax in axes:
+        if ax and self.mesh.shape.get(ax, 1) > 1:
+          active.append((ax, tensor_dim_index))
+      return active
+    wi_gather_axes = []
+    wo_gather_axes = []
+
+    if weight_gather:
+      # wi [Experts, In, Hidden] -> Gather Exp(0) and Hidden(2)
+      wi_gather_axes.extend(get_active_sharding_axes(w0_pspec[0], 0))
+      wi_gather_axes.extend(get_active_sharding_axes(w0_pspec[2], 2))
+
+      # wo [Experts, Hidden, Out] -> Gather Exp(0) and Hidden(1)
+      wo_gather_axes.extend(get_active_sharding_axes(wo_pspec[0], 0))
+      wo_gather_axes.extend(get_active_sharding_axes(wo_pspec[1], 1))
     gmm_fn = functools.partial(
         gmm,
         group_sizes=group_sizes,
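
A self-contained sketch of what the new get_active_sharding_axes helper computes, using a hypothetical mesh shape and partition spec (real specs come from nn.logical_to_mesh_axes and the configured sharding rules; the axis names and sizes here are assumptions):

from jax.sharding import PartitionSpec as P

mesh_shape = {"fsdp": 128, "fsdp_transpose": 2, "tensor": 1}  # hypothetical

def get_active_sharding_axes(pspec_dim_axes, tensor_dim_index):
  # Same logic as the helper above, with self.mesh.shape replaced by the dict.
  if pspec_dim_axes is None:
    return []
  axes = (pspec_dim_axes,) if isinstance(pspec_dim_axes, str) else pspec_dim_axes
  return [(ax, tensor_dim_index) for ax in axes if ax and mesh_shape.get(ax, 1) > 1]

# wi kernel [experts, emb, mlp]; a single dim may name a tuple of mesh axes.
w0_pspec = P("fsdp", None, ("fsdp_transpose", "tensor"))

wi_gather_axes = []
wi_gather_axes.extend(get_active_sharding_axes(w0_pspec[0], 0))
wi_gather_axes.extend(get_active_sharding_axes(w0_pspec[2], 2))

# "tensor" is skipped because its mesh size is 1.
print(wi_gather_axes)  # [('fsdp', 0), ('fsdp_transpose', 2)]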
@@ -1097,22 +1119,22 @@ def wrapper(x, logits, pre_bias_logits, w0, w1, wo, w0_bias, w1_bias, wo_bias, r
         self.config.wo_tile_drhs_embed_dim,
         self.config.wo_tile_drhs_mlp_dim,
     )
-    layer_w0 = gmm_fn(x, w0, tiling=wi_tile_size)
+    layer_w0 = gmm_fn(x, w0, tiling=wi_tile_size, weight_gather_axes=wi_gather_axes)
     if self.get_tensor_transpose_parallelism_size() > 1:
       layer_w0 = jax.lax.psum(layer_w0, "tensor_transpose")
     if self.config.mlp_bias:
       layer_w0 = layer_w0 + w0_bias
     layer_w0 = adc.checkpoint_name(layer_w0, "mlpwi_0")
 
-    layer_w1 = gmm_fn(x, w1, tiling=wi_tile_size)
+    layer_w1 = gmm_fn(x, w1, tiling=wi_tile_size, weight_gather_axes=wi_gather_axes)
     if self.get_tensor_transpose_parallelism_size() > 1:
       layer_w1 = jax.lax.psum(layer_w1, "tensor_transpose")
     if self.config.mlp_bias:
       layer_w1 = layer_w1 + w1_bias
     layer_w1 = adc.checkpoint_name(layer_w1, "mlpwi_1")
     intermediate_layer = self.apply_ffn_activation(layer_w0, layer_w1)
 
-    intermediate_output = gmm_fn(intermediate_layer, wo, tiling=wo_tile_size)
+    intermediate_output = gmm_fn(intermediate_layer, wo, tiling=wo_tile_size, weight_gather_axes=wo_gather_axes)
     if self.get_tensor_parallelism_size() > 1:
       intermediate_output = jax.lax.psum_scatter(intermediate_output, "tensor", scatter_dimension=1, tiled=True)
     if self.config.mlp_bias:
