
Commit 44039d8

Merge pull request #3199 from AI-Hypercomputer:amandaliang
PiperOrigin-RevId: 875485665
2 parents: 8a0a215 + 3c4d81d

3 files changed: 32 additions & 17 deletions

File tree:

src/maxtext/configs/base.yml
src/maxtext/configs/types.py
src/maxtext/models/deepseek_batchsplit.py

src/maxtext/configs/base.yml

Lines changed: 2 additions & 0 deletions
@@ -214,6 +214,8 @@ wo_tile_drhs_buffer_count: 2
 wi_combine_scopes: False
 wo_combine_scopes: False
 
+merge_gating_gmm: False
+
 norm_topk_prob: false # boolean to enable the top-k probability normalization. qwen3-specific normalization of router weights.
 
 # how the expert axis is used to shard attention weights and activations
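For context: the new key defaults to False, so the merged gating GMM path is strictly opt-in and existing configs keep the two-kernel behavior. A minimal sketch of what the parsed value looks like, assuming PyYAML (not part of this change):

import yaml

# The relevant slice of base.yml after this commit; YAML resolves the
# capitalized "False" literal to a Python boolean.
snippet = """
wi_combine_scopes: False
wo_combine_scopes: False

merge_gating_gmm: False
"""

cfg = yaml.safe_load(snippet)
assert cfg["merge_gating_gmm"] is False  # merged gating GMM stays off by default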

src/maxtext/configs/types.py

Lines changed: 2 additions & 0 deletions
@@ -685,6 +685,8 @@ class MoEKernels(BaseModel):
   wi_combine_scopes: bool = Field(False, description="whether to use combine_scopes features for tgmm for wi.")
   wo_combine_scopes: bool = Field(False, description="whether to use combine_scopes features for tgmm for wo.")
 
+  merge_gating_gmm: bool = Field(False, description="whether to merge the two gating gmm kernels into one.")
+
 
 class DeepSeekMoE(BaseModel):
   """Configuration specific to DeepSeek-style MoE layers."""

src/maxtext/models/deepseek_batchsplit.py

Lines changed: 28 additions & 17 deletions
@@ -887,23 +887,34 @@ def get_active_sharding_axes(pspec_dim_axes, tensor_dim_index):
 
     wo_gather_axes.extend(get_active_sharding_axes(wo_pspec[0], 0))
     wo_gather_axes.extend(get_active_sharding_axes(wo_pspec[1], 1))
-
-    layer_w0 = gmm_fn(
-        x,
-        w0,
-        tiling=wi_tile_size,
-        weight_gather_axes=wi_gather_axes,
-        input_buffer_count=wi_input_buffer_count,
-        combine_scopes=wi_combine_scopes,
-    )
-    layer_w1 = gmm_fn(
-        x,
-        w1,
-        tiling=wi_tile_size,
-        weight_gather_axes=wi_gather_axes,
-        input_buffer_count=wi_input_buffer_count,
-        combine_scopes=wi_combine_scopes,
-    )
+    if config.merge_gating_gmm:
+      w01 = jnp.concatenate([w0, w1], axis=-1)
+      layer_w01 = gmm_fn(
+          x,
+          w01,
+          tiling=wi_tile_size,
+          weight_gather_axes=wi_gather_axes,
+          input_buffer_count=wi_input_buffer_count,
+          combine_scopes=wi_combine_scopes,
+      )
+      layer_w0, layer_w1 = jnp.split(layer_w01, 2, axis=-1)
+    else:
+      layer_w0 = gmm_fn(
+          x,
+          w0,
+          tiling=wi_tile_size,
+          weight_gather_axes=wi_gather_axes,
+          input_buffer_count=wi_input_buffer_count,
+          combine_scopes=wi_combine_scopes,
+      )
+      layer_w1 = gmm_fn(
+          x,
+          w1,
+          tiling=wi_tile_size,
+          weight_gather_axes=wi_gather_axes,
+          input_buffer_count=wi_input_buffer_count,
+          combine_scopes=wi_combine_scopes,
+      )
     layer_w0 = jax.ad_checkpoint.checkpoint_name(layer_w0, "mlpwi_0")
     layer_w1 = jax.ad_checkpoint.checkpoint_name(layer_w1, "mlpwi_1")
     intermediate_layer = jax.nn.silu(layer_w0) * layer_w1
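The merged branch works because matrix multiplication distributes over concatenation of the weight's output axis: x @ concat([w0, w1], axis=-1) == concat([x @ w0, x @ w1], axis=-1), so one fused GMM followed by a split reproduces the two-kernel outputs. A minimal JAX sketch, with plain matmuls standing in for gmm_fn and hypothetical toy shapes:

import jax
import jax.numpy as jnp

k0, k1, kx = jax.random.split(jax.random.PRNGKey(0), 3)
x = jax.random.normal(kx, (8, 16))    # hypothetical shapes: tokens x model_dim
w0 = jax.random.normal(k0, (16, 32))  # first gating projection
w1 = jax.random.normal(k1, (16, 32))  # second gating projection

# Two-kernel path: one matmul per gating projection.
y0, y1 = x @ w0, x @ w1

# Merged path, mirroring the merge_gating_gmm branch: concatenate the
# weights on the output axis, run one matmul, split the result in half.
y01 = x @ jnp.concatenate([w0, w1], axis=-1)
m0, m1 = jnp.split(y01, 2, axis=-1)

assert jnp.allclose(y0, m0) and jnp.allclose(y1, m1)

# The downstream SwiGLU-style activation is therefore unchanged.
assert jnp.allclose(jax.nn.silu(y0) * y1, jax.nn.silu(m0) * m1)

The likely motivation is launching one larger grouped matmul instead of two smaller ones (fewer kernel launches, better tile utilization); the cost is the jnp.concatenate on the weights, which materializes a fused buffer unless the compiler fuses it away.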
