Skip to content

Commit 863d2a3

Browse files
Merge pull request #3081 from AI-Hypercomputer:amandaliang
PiperOrigin-RevId: 865988464
2 parents 0f85477 + d5b9a6c commit 863d2a3

4 files changed

Lines changed: 49 additions & 8 deletions

File tree

src/MaxText/configs/base.yml

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -203,6 +203,14 @@ wo_tile_dlhs_mlp_dim: 1024
203203
wo_tile_drhs_batch_seq: 512
204204
wo_tile_drhs_embed_dim: 1024
205205
wo_tile_drhs_mlp_dim: 1024
206+
207+
wi_tile_fwd_buffer_count: 2
208+
wi_tile_dlhs_buffer_count: 2
209+
wi_tile_drhs_buffer_count: 2
210+
wo_tile_fwd_buffer_count: 2
211+
wo_tile_dlhs_buffer_count: 2
212+
wo_tile_drhs_buffer_count: 2
213+
206214
norm_topk_prob: false # boolean to enable the top-k probability normalization. qwen3-specific normalization of router weights.
207215

208216
# how the expert axis is used to shard attention weights and activations

src/MaxText/configs/types.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -661,6 +661,13 @@ class MoEKernels(BaseModel):
661661
wo_tile_drhs_embed_dim: int = Field(1024, description="bwd pass drhs tiling dimension for embedding in GMM for wo.")
662662
wo_tile_drhs_mlp_dim: int = Field(1024, description="bwd pass drhs tiling dimension for MLP in GMM for wo.")
663663

664+
wi_tile_fwd_buffer_count: int = Field(2, description="forward pass tiling buffer count in GMM for wi.")
665+
wi_tile_dlhs_buffer_count: int = Field(2, description="bwd pass dlhs tiling buffer count in GMM for wi.")
666+
wi_tile_drhs_buffer_count: int = Field(2, description="bwd pass drhs tiling buffer count in GMM for wi.")
667+
wo_tile_fwd_buffer_count: int = Field(2, description="forward pass tiling buffer count in GMM for wo.")
668+
wo_tile_dlhs_buffer_count: int = Field(2, description="bwd pass dlhs tiling buffer count in GMM for wo.")
669+
wo_tile_drhs_buffer_count: int = Field(2, description="bwd pass drhs tiling buffer count in GMM for wo.")
670+
664671

665672
class DeepSeekMoE(BaseModel):
666673
"""Configuration specific to DeepSeek-style MoE layers."""

src/MaxText/layers/moe.py

Lines changed: 25 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -877,7 +877,7 @@ def sparse_matmul(
877877
):
878878
"""Perform sparse matrix multiplication of inputs and Experts."""
879879

880-
def gmm(inputs, kernel, tiling, group_sizes, expert_assignments, weight_gather_axes):
880+
def gmm(inputs, kernel, tiling, group_sizes, expert_assignments, weight_gather_axes, input_buffer_count):
881881
pad_length = self.config.wi_tile_fwd_batch_seq
882882
hs_shape = inputs.shape
883883
# pad length is the 1st dimension of tiling size in gmm call
@@ -916,6 +916,7 @@ def gmm(inputs, kernel, tiling, group_sizes, expert_assignments, weight_gather_a
916916
use_qwix_quantization=self.config.use_qwix_quantization,
917917
use_tokamax_backend=self.config.use_tokamax_gmm,
918918
weight_gather_axes=weight_gather_axes,
919+
input_buffer_count=input_buffer_count,
919920
)
920921
else:
921922
output = tokamax.ragged_dot(
@@ -1220,22 +1221,42 @@ def get_active_sharding_axes(pspec_dim_axes, tensor_dim_index):
12201221
self.config.wo_tile_drhs_embed_dim,
12211222
self.config.wo_tile_drhs_mlp_dim,
12221223
)
1223-
layer_w0 = gmm_fn(x, w0, tiling=wi_tile_size, weight_gather_axes=wi_gather_axes)
1224+
wi_input_buffer_count = (
1225+
self.config.wi_tile_fwd_buffer_count,
1226+
self.config.wi_tile_dlhs_buffer_count,
1227+
self.config.wi_tile_drhs_buffer_count,
1228+
)
1229+
wo_input_buffer_count = (
1230+
self.config.wo_tile_fwd_buffer_count,
1231+
self.config.wo_tile_dlhs_buffer_count,
1232+
self.config.wo_tile_drhs_buffer_count,
1233+
)
1234+
layer_w0 = gmm_fn(
1235+
x, w0, tiling=wi_tile_size, weight_gather_axes=wi_gather_axes, input_buffer_count=wi_input_buffer_count
1236+
)
12241237
if self.get_tensor_transpose_parallelism_size() > 1:
12251238
layer_w0 = jax.lax.psum(layer_w0, "tensor_transpose")
12261239
if self.config.mlp_bias:
12271240
layer_w0 = layer_w0 + w0_bias
12281241
layer_w0 = adc.checkpoint_name(layer_w0, "mlpwi_0")
12291242

1230-
layer_w1 = gmm_fn(x, w1, tiling=wi_tile_size, weight_gather_axes=wi_gather_axes)
1243+
layer_w1 = gmm_fn(
1244+
x, w1, tiling=wi_tile_size, weight_gather_axes=wi_gather_axes, input_buffer_count=wi_input_buffer_count
1245+
)
12311246
if self.get_tensor_transpose_parallelism_size() > 1:
12321247
layer_w1 = jax.lax.psum(layer_w1, "tensor_transpose")
12331248
if self.config.mlp_bias:
12341249
layer_w1 = layer_w1 + w1_bias
12351250
layer_w1 = adc.checkpoint_name(layer_w1, "mlpwi_1")
12361251
intermediate_layer = self.apply_ffn_activation(layer_w0, layer_w1)
12371252

1238-
intermediate_output = gmm_fn(intermediate_layer, wo, tiling=wo_tile_size, weight_gather_axes=wo_gather_axes)
1253+
intermediate_output = gmm_fn(
1254+
intermediate_layer,
1255+
wo,
1256+
tiling=wo_tile_size,
1257+
weight_gather_axes=wo_gather_axes,
1258+
input_buffer_count=wo_input_buffer_count,
1259+
)
12391260
if self.get_tensor_parallelism_size() > 1:
12401261
intermediate_output = jax.lax.psum_scatter(
12411262
intermediate_output, self._tensor_parallelism_name, scatter_dimension=1, tiled=True

src/maxtext/kernels/megablox/ops.py

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,7 @@ def gmm(
4242
use_qwix_quantization: bool = False,
4343
use_tokamax_backend: bool = False,
4444
weight_gather_axes: List[Tuple[str, int]] | None = None,
45+
input_buffer_count: tuple[int, int, int] = (2, 2, 2),
4546
):
4647
"""Grouped matrix multiplication operation."""
4748
quantization_rule = None
@@ -61,14 +62,15 @@ def gmm(
6162
)
6263

6364
gmm_fwd_bwd = lambda *args: _gmm_fwd(*args)[0] # pylint: disable=C3001
64-
gmm_fwd_bwd = jax.custom_vjp(gmm_fwd_bwd, nondiff_argnums=(3, 4, 7, 8, 9, 10, 11))
65+
gmm_fwd_bwd = jax.custom_vjp(gmm_fwd_bwd, nondiff_argnums=(3, 4, 5, 8, 9, 10, 11, 12))
6566
gmm_fwd_bwd.defvjp(_gmm_fwd, functools.partial(_gmm_bwd, lhs.dtype, rhs.dtype))
6667
return gmm_fwd_bwd(
6768
lhs,
6869
rhs,
6970
group_sizes,
7071
preferred_element_type,
7172
tiling,
73+
input_buffer_count,
7274
group_offset,
7375
existing_out,
7476
transpose_rhs,
@@ -85,6 +87,7 @@ def _gmm_fwd(
8587
group_sizes: jnp.ndarray,
8688
preferred_element_type: jnp.dtype = jnp.float32,
8789
tiling: tuple[int, int, int, int, int, int, int, int, int] = (128, 128, 128, 128, 128, 128, 128, 128, 128),
90+
input_buffer_count: tuple[int, int, int] = (2, 2, 2),
8891
group_offset: jnp.ndarray | None = None,
8992
existing_out: jnp.ndarray | None = None,
9093
transpose_rhs: bool = False,
@@ -125,9 +128,7 @@ def _gmm_fwd(
125128
# QAG is only supported for following conditions
126129
if use_tokamax_backend:
127130
if quantization_rule and quantization_rule.bwd_qtype:
128-
if quantization_rule.weight_calibration_method.startswith(
129-
"fixed"
130-
) and isinstance(rhs, qpl.QArray):
131+
if quantization_rule.weight_calibration_method.startswith("fixed") and isinstance(rhs, qpl.QArray):
131132
if weight_gather_axes:
132133
for axis_name, axis_idx in weight_gather_axes:
133134
rhs_qvalue = jax.lax.all_gather(rhs.qvalue, axis_name, axis=axis_idx, tiled=True)
@@ -142,6 +143,7 @@ def _gmm_fwd(
142143
group_offset=group_offset,
143144
transpose_rhs=transpose_rhs,
144145
interpret=interpret,
146+
input_buffer_count=input_buffer_count[0],
145147
)
146148
else:
147149
out = backend.gmm(
@@ -163,6 +165,7 @@ def _gmm_bwd(
163165
rhs_dtype: jax.typing.DTypeLike,
164166
preferred_element_type: jnp.dtype,
165167
tiling: tuple[int, int, int, int, int, int, int, int, int],
168+
input_buffer_count: tuple[int, int, int],
166169
transpose_rhs: bool,
167170
interpret: bool,
168171
quantization_rule: qwix.QtRule | None,
@@ -229,6 +232,7 @@ def _gmm_bwd(
229232
group_offset=group_offset,
230233
transpose_rhs=not transpose_rhs,
231234
interpret=interpret,
235+
input_buffer_count=input_buffer_count[1],
232236
)
233237
drhs = tokamax_backend.tgmm(
234238
lhs=lhs.swapaxes(0, 1),
@@ -240,6 +244,7 @@ def _gmm_bwd(
240244
group_offset=group_offset,
241245
num_actual_groups=num_actual_groups,
242246
interpret=interpret,
247+
input_buffer_count=input_buffer_count[2],
243248
)
244249
if quantization_rule and quantization_rule.bwd_qtype and weight_gather_axes:
245250
# Scatter back in reverse order of gather

0 commit comments

Comments (0)