
Commit e986f24

Update a sharding config
1 parent 4799cef commit e986f24

6 files changed

Lines changed: 15 additions & 15 deletions


docs/reference/core_concepts/moe_configuration.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -97,7 +97,7 @@ Dropping:
 
 `moe_fsdp_use_two_stage_all_gather`: If enabled, split the All-Gather operation for MoE weights into two separate stages when using FSDP/FSDP-transpose sharding. This is preferred when 3D All-Gather support is unavailable.
 
-`fsdp_shard_on_exp`: If enabled, shard the expert dimension of the MLP weights on the FSDP axis, and recommended when num_experts is a multiple of fsdp_parallelism.
+`shard_exp_on_fsdp`: If enabled, shard the expert dimension of the MLP weights on the FSDP axis, and recommended only when num_experts is a multiple of fsdp_parallelism.
 
 ## 3. Performance Tuning
 These parameters provide granular control over the tiling dimensions for sparse matmul Pallas kernel.
```
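
For concreteness, a small arithmetic sketch of the divisibility recommendation; the values below are illustrative, not from the commit:

```python
# With the expert dimension sharded on the FSDP axis, every shard owns
# num_experts / fsdp_parallelism whole experts, so the division must be exact.
num_experts = 256
fsdp_parallelism = 8
assert num_experts % fsdp_parallelism == 0
experts_per_shard = num_experts // fsdp_parallelism  # 32 experts per shard
```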

src/MaxText/configs/base.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -214,7 +214,7 @@ expert_shard_attention_option: "fsdp"
 moe_fsdp_use_two_stage_all_gather: false
 # Shard the expert dimension of the MLP weights on the FSDP axis.
 # This configuration is recommended only when num_experts is a multiple of fsdp_parallelism
-fsdp_shard_on_exp: False
+shard_exp_on_fsdp: False
 # use fsdp and fsdp_transpose axes for sharding the moe weights
 use_2d_fsdp_sharding: False
```

src/MaxText/configs/types.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -570,10 +570,10 @@ class MoEGeneral(BaseModel):
       False,
       description="Use two separate All-Gather calls for MoE weights sharded on both FSDP and FSDP-transpose.",
   )
-  fsdp_shard_on_exp: bool = Field(
+  shard_exp_on_fsdp: bool = Field(
       False,
       description="Shard the expert dimension of the MLP weights on the FSDP axis, "
-      "and recommended when num_experts is a multiple of fsdp_parallelism",
+      "and recommended only when num_experts is a multiple of fsdp_parallelism",
   )
   use_2d_fsdp_sharding: bool = Field(
       False,
```
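
As a minimal standalone sketch (not MaxText's full `MoEGeneral` model), the renamed flag behaves like any boolean pydantic field with a `False` default:

```python
from pydantic import BaseModel, Field

class MoEGeneralSketch(BaseModel):
  """Illustrative subset of the config model; only the renamed field is shown."""
  shard_exp_on_fsdp: bool = Field(
      False,
      description="Shard the expert dimension of the MLP weights on the FSDP axis, "
      "and recommended only when num_experts is a multiple of fsdp_parallelism",
  )

print(MoEGeneralSketch().shard_exp_on_fsdp)                         # False
print(MoEGeneralSketch(shard_exp_on_fsdp=True).shard_exp_on_fsdp)   # True
```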

src/MaxText/layers/moe.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -342,7 +342,7 @@ def __init__(
     self.quant = quant
     self.rngs = rngs
 
-    if self.config.fsdp_shard_on_exp:
+    if self.config.shard_exp_on_fsdp:
       # special sharding for dsv3
       self.wi_kernel_axes = ("embed_no_exp", None, "mlp")
       self.wo_kernel_axes = ("embed_no_exp", "mlp", None)
@@ -1012,10 +1012,10 @@ def gmm(inputs, kernel, tiling, group_sizes, expert_assignments, weight_gather_a
   # w0, w1, wo needs to be un sharded on fsdp / fsdp_transpose axis, so use
   # mlp_no_fsdp axis
   weight_gather = False
-  if self.config.fsdp_shard_on_exp:
+  if self.config.shard_exp_on_fsdp:
     quantization_rule = qpl.get_current_rule("gmm")
     if quantization_rule and quantization_rule.weight_calibration_method.startswith("fixed"):
-      # special sharding when using static scaling for weights in quantization with fsdp_shard_on_exp
+      # special sharding when using static scaling for weights in quantization with shard_exp_on_fsdp
       w0_pspec = self._logical_to_mesh_axes(self.wi_kernel_axes)
       w1_pspec = self._logical_to_mesh_axes(self.wi_kernel_axes)
       wo_pspec = self._logical_to_mesh_axes(self.wo_kernel_axes)
```
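
A minimal JAX sketch of the sharding choice the flag selects, assuming a hypothetical 1D mesh and made-up shapes (this is not MaxText's actual helper code): the expert (leading) dimension of an MoE kernel is placed on the `fsdp` mesh axis instead of the embed dimension.

```python
import jax
import jax.numpy as jnp
from jax.experimental import mesh_utils
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

# Hypothetical 1D mesh whose single axis plays the role of MaxText's 'fsdp' axis.
mesh = Mesh(mesh_utils.create_device_mesh((jax.device_count(),)), ("fsdp",))

num_experts = 8 * jax.device_count()    # must be a multiple of the fsdp axis size
emb, mlp = 4 * jax.device_count(), 32   # illustrative sizes
w_i = jnp.zeros((num_experts, emb, mlp))  # (expert, embed, mlp) MoE kernel

# Default FSDP sharding: split the embed dimension across the fsdp axis.
w_default = jax.device_put(w_i, NamedSharding(mesh, P(None, "fsdp", None)))
# shard_exp_on_fsdp: split the expert dimension instead; embed stays whole.
w_exp = jax.device_put(w_i, NamedSharding(mesh, P("fsdp", None, None)))
print(w_exp.sharding)
```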

src/MaxText/pyconfig_deprecated.py

Lines changed: 6 additions & 6 deletions
```diff
@@ -305,7 +305,7 @@ def validate_keys(keys):
   validate_mlp_dim(keys)
   validate_sparse_matmul_parallelism(keys)
   validate_ring_of_experts_parallelism(keys)
-  validate_shard_fsdp_on_expert_parallelism(keys)
+  validate_shard_expert_on_fsdp(keys)
   validate_ragged_dot(keys)
   validate_deepseek_moe(keys)
   validate_gpt_oss_moe(keys)
@@ -1212,12 +1212,12 @@ def validate_ring_of_experts_parallelism(raw_keys):
     raise ValueError("Ring-of-experts requires expert-parallelism to be enabled.")
 
 
-def validate_shard_fsdp_on_expert_parallelism(raw_keys):
-  if raw_keys["fsdp_shard_on_exp"] and raw_keys["num_experts"] % raw_keys["ici_fsdp_parallelism"] != 0:
-    raise ValueError("fsdp_shard_on_exp requires num_experts to be divisible by ici_fsdp_parallelism.")
-  if raw_keys["fsdp_shard_on_exp"] and (using_tensor_parallelism(raw_keys) or using_expert_parallelism(raw_keys)):
+def validate_shard_expert_on_fsdp(raw_keys):
+  if raw_keys["shard_exp_on_fsdp"] and raw_keys["num_experts"] % raw_keys["ici_fsdp_parallelism"] != 0:
+    raise ValueError("shard_exp_on_fsdp requires num_experts to be divisible by ici_fsdp_parallelism.")
+  if raw_keys["shard_exp_on_fsdp"] and (using_tensor_parallelism(raw_keys) or using_expert_parallelism(raw_keys)):
     raise ValueError(
-        "fsdp_shard_on_exp requires ici_expert_parallelism = 1 and ici_tensor_parallelism/ici_tensor_transpose_parallelism = 1."
+        "shard_exp_on_fsdp requires ici_expert_parallelism = 1 and ici_tensor_parallelism/ici_tensor_transpose_parallelism = 1."
     )
```
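
A runnable sketch of exercising the divisibility rule above with hand-made `raw_keys` dicts; only the relevant keys are filled in, and the tensor/expert-parallelism check is omitted because it depends on helpers not shown here.

```python
def validate_shard_expert_on_fsdp(raw_keys):
  # Same divisibility rule as the validator above, in isolation.
  if raw_keys["shard_exp_on_fsdp"] and raw_keys["num_experts"] % raw_keys["ici_fsdp_parallelism"] != 0:
    raise ValueError("shard_exp_on_fsdp requires num_experts to be divisible by ici_fsdp_parallelism.")

validate_shard_expert_on_fsdp({"shard_exp_on_fsdp": True, "num_experts": 256, "ici_fsdp_parallelism": 8})  # passes
try:
  validate_shard_expert_on_fsdp({"shard_exp_on_fsdp": True, "num_experts": 6, "ici_fsdp_parallelism": 4})
except ValueError as err:
  print(err)  # 6 is not divisible by 4
```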

tests/check_qwen3_next_vs_reference.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -648,7 +648,7 @@ def setUp(self):
         "num_experts_per_tok=2",
         "base_moe_mlp_dim=256",  # moe_mlp_dim will be calculated from this
         "norm_topk_prob=True",
-        "fsdp_shard_on_exp=False",
+        "shard_exp_on_fsdp=False",
         "mlp_activations=['silu', 'linear']",
         "dropout_rate=0.0",
         # Force the test to use the 'dense_matmul' path in the MoE layer,
@@ -1103,7 +1103,7 @@ def _run_full_attention_jax_vs_pytorch_attention(self, attention_type):
         "num_experts_per_tok=2",
         "base_moe_mlp_dim=256",  # moe_mlp_dim will be calculated from this
         "norm_topk_prob=True",
-        "fsdp_shard_on_exp=False",
+        "shard_exp_on_fsdp=False",
         "mlp_activations=['silu', 'linear']",
         "dropout_rate=0.0",
         "sparse_matmul=False",
```
