
Commit 102af23

Merge pull request #3141 from AI-Hypercomputer:mohit/attn_expert_submit

PiperOrigin-RevId: 879321135

2 parents: d14f70d + 0b7666f

7 files changed: 76 additions & 36 deletions

.gitignore

Lines changed: 3 additions & 0 deletions

@@ -148,3 +148,6 @@ dmypy.json
 # Gemini CLI
 .gemini/
 gha-creds-*.json
+
+# vscode workspace
+maxtext.code-workspace

src/maxtext/configs/inference/vllm.yml

Lines changed: 12 additions & 11 deletions

@@ -25,7 +25,7 @@ weight_dtype: bfloat16
 
 
 # -------------- Logical Axis Rules --------------
-mesh_axes: ['data', 'attn_dp', 'model', 'expert']
+mesh_axes: ['data', 'attn_dp', 'model', 'expert', 'attn_dp_expert']
 logical_axis_rules: [
   ['activation_batch', ['expert']],
   ['activation_batch_no_exp', []],
@@ -37,37 +37,38 @@ logical_axis_rules: [
   ['activation_attn_length_no_exp', []],
   ['activation_length', ['data', 'expert']],
   ['activation_length_no_exp', 'data'],
-  ['activation_q_length', ['expert']],
+  ['activation_q_length', ['expert', 'attn_dp_expert']],
   ['activation_attn_embed', 'model'],
   ['activation_embed', ['model', 'attn_dp']],
   ['activation_mlp', ['model', 'attn_dp']],
   ['activation_kv', ['model']],
-  ['activation_prefill_kv_batch', ['expert']],
-  ['activation_kv_batch', ['expert']],
+  ['activation_prefill_kv_batch', ['expert', 'attn_dp_expert']],
+  ['activation_kv_batch', ['expert', 'attn_dp_expert']],
   ['activation_kv_batch_no_exp', []],
   ['activation_kv_head_dim', ['model']],
   ['activation_vocab', ['model', 'attn_dp']],
   ['activation_norm_length', []],
-  ['activation_exp', ['expert']],
-  ['decode_batch', ['expert']],
+  ['activation_exp', ['expert', 'attn_dp_expert']],
+  ['decode_batch', ['expert', 'attn_dp_expert']],
   ['decode_length', []],
   ['mlp', ['model', 'attn_dp']],
   ['mlp_no_fsdp', ['model', 'attn_dp']],
+  ['moe_mlp', ['model', 'attn_dp']],
   ['vocab', ['model', 'attn_dp']],
   ['heads', ['model']],
   ['q_heads', ['model']],
   ['kv_heads', ['model']],
   ['kv_head_dim', []],
   ['kv', []],
-  ['embed', ['expert']],
+  ['embed', ['expert', 'attn_dp_expert']],
   ['embed_tensor_transpose', ['attn_dp', 'model']],
   ['embed_no_exp', []],
-  ['q_lora', ['expert']],
-  ['kv_lora', ['expert']],
+  ['q_lora', ['expert', 'attn_dp_expert']],
+  ['kv_lora', ['expert', 'attn_dp_expert']],
   ['norm', []],
   ['cache_heads', ['model']],
-  ['exp', ['expert']],
+  ['exp', ['expert', 'attn_dp_expert']],
   ['paged_kv_heads', ['model']],
 ]
-data_sharding: [['data', 'attn_dp', 'model', 'expert']]
+data_sharding: [['data', 'attn_dp', 'model', 'expert', 'attn_dp_expert']]
 input_data_sharding_logical_axes: ['activation_embed_and_logits_batch']
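
Each entry in logical_axis_rules maps a logical tensor dimension to one or more mesh axes, and the new attn_dp_expert axis now appears wherever expert-parallel sharding applies. Below is a minimal sketch of how such a rule table resolves to a JAX PartitionSpec; the mesh sizes and the to_pspec helper are hypothetical, for illustration only (MaxText has its own resolver):

# A minimal sketch of how a logical-to-mesh rule resolves to a PartitionSpec.
# Assumes 4 devices; axis sizes and the to_pspec helper are hypothetical,
# for illustration only.
import jax
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec

devices = np.array(jax.devices()[:4]).reshape(1, 1, 2, 1, 2)
mesh = Mesh(devices, axis_names=("data", "attn_dp", "model", "expert", "attn_dp_expert"))

# A few rules copied from vllm.yml: logical dimension name -> mesh axes.
rules = {
    "activation_q_length": ("expert", "attn_dp_expert"),
    "embed": ("expert", "attn_dp_expert"),
    "heads": ("model",),
}

def to_pspec(*logical_names):
  # Unknown logical names fall back to None, i.e. replicated.
  return PartitionSpec(*(rules.get(name) for name in logical_names))

sharding = NamedSharding(mesh, to_pspec("activation_q_length", "heads"))
print(sharding.spec)  # PartitionSpec(('expert', 'attn_dp_expert'), ('model',))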

src/maxtext/configs/post_train/rl.yml

Lines changed: 1 addition & 0 deletions

@@ -27,6 +27,7 @@ num_samplers_slices: -1
 # replicas in rollout. If not specified, rollout_tensor_parallelism will be auto-determined.
 rollout_data_parallelism: -1
 rollout_tensor_parallelism: -1
+rollout_expert_parallelism: 1
 
 # ====== Reproducibility ======
 data_shuffle_seed: 42
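
The three rollout knobs must multiply to the number of sampler devices, with -1 meaning auto-derived from the other two (see the train_rl.py change below). A hypothetical override:

# Hypothetical rl.yml override: on 8 sampler devices, pin TP and EP and let
# DP be auto-derived (8 / (2 * 2) = 2), since tp * dp * ep must equal the
# sampler device count.
rollout_tensor_parallelism: 2
rollout_expert_parallelism: 2
rollout_data_parallelism: -1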

src/maxtext/configs/types.py

Lines changed: 3 additions & 0 deletions

@@ -1585,6 +1585,7 @@ class RLHardware(BaseModel):
       -1,
       description="Tensor parallelism per replica for rollout. If not specified, it will be auto-determined.",
   )
+  rollout_expert_parallelism: int = Field(1, description="Expert parallelism per replica for rollout")
 
 
 class VLLM(BaseModel):
@@ -2573,6 +2574,7 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
         "expert": self.ici_expert_parallelism,
         "autoregressive": self.ici_autoregressive_parallelism,
         "attn_dp": 1,  # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
+        "attn_dp_expert": 1,  # initialized to 1, vLLM will auto calculate this value based on EP
     }
     self.ici_parallelism = [ici_map[axis] for axis in self.mesh_axes]
 
@@ -2592,6 +2594,7 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
         "expert": self.dcn_expert_parallelism,
         "autoregressive": self.dcn_autoregressive_parallelism,
         "attn_dp": 1,  # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
+        "attn_dp_expert": 1,  # initialized to 1, vLLM will auto calculate this value based on EP
     }
     self.dcn_parallelism = [dcn_map[axis] for axis in self.mesh_axes]
 
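The ici/dcn maps are indexed by mesh_axes to build the parallelism vectors, so any axis named in mesh_axes must have an entry; the new attn_dp_expert entry defaults to 1 and is later recalculated by vLLM. A toy reproduction (sizes hypothetical; the real maps carry more axes):

# Toy reproduction of how ici_parallelism is assembled from mesh_axes.
# Sizes are hypothetical; in MaxText they come from config fields, and the
# real map carries more axes (fsdp, sequence, autoregressive, ...).
mesh_axes = ["data", "attn_dp", "model", "expert", "attn_dp_expert"]
ici_map = {
    "data": 1,
    "attn_dp": 1,         # placeholder; vLLM recalculates from TP and num_kv_heads
    "model": 4,
    "expert": 2,
    "attn_dp_expert": 1,  # placeholder; vLLM recalculates from EP
}
# Any axis named in mesh_axes without a map entry would raise a KeyError.
ici_parallelism = [ici_map[axis] for axis in mesh_axes]
print(ici_parallelism)  # [1, 1, 4, 2, 1]
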
src/maxtext/inference/vllm_decode.py

Lines changed: 1 addition & 0 deletions

@@ -75,6 +75,7 @@ def decode_with_vllm(config: Config) -> None:
       "hf_config_path": config.vllm_hf_config_path,
       "hf_overrides": config.vllm_hf_overrides,
       "gpu_memory_utilization": config.hbm_utilization_vllm,
+      "async_scheduling": config.async_scheduling,
       "additional_config": {
           "maxtext_config": {
               "model_name": config.model_name,

src/maxtext/layers/moe.py

Lines changed: 18 additions & 13 deletions

@@ -368,6 +368,11 @@ def __init__(
     else:
       self._tensor_parallelism_name = "tensor"
 
+    if self.config.attention == "vllm_rpa":
+      self._expert_parallelism_name = "attn_dp_expert"
+    else:
+      self._expert_parallelism_name = "expert"
+
     self.gate = GateLogit(
         in_features_shape=self.config.emb_dim,
         out_features_shape=self.num_experts,
@@ -467,7 +472,7 @@ def _logical_to_mesh_axes(self, logical_name):
     return logical_to_mesh_axes(logical_name, mesh=self.mesh, rules=logical_rules)
 
   def get_expert_parallelism_size(self):
-    return self.mesh.shape.get("expert", 1)
+    return self.mesh.shape.get(self._expert_parallelism_name, 1)
 
   def get_tensor_parallelism_size(self):
     if isinstance(self._tensor_parallelism_name, tuple):
@@ -494,8 +499,8 @@ def get_topk(self, gate_logits, pre_bias_logits, rngs=None):
     if self.config.use_random_routing:
       if rngs is None:
         raise ValueError("The random key cannot be None for random routing.")
-      # Reuse the 'dropout' RNG stream to ensure random routing
-      rng = rngs.dropout()
+      # Reuse the 'params' RNG stream to ensure random routing
+      rng = rngs.params()
       top_k_weights, top_k_indices = random_routing(rng, gate_logits, self.num_experts_per_tok)
       return top_k_weights, top_k_indices
 
@@ -1002,7 +1007,7 @@ def gmm(
     # batch_size=1 while decode can have batch_size > 1.
     try:
       is_batch_sharded_by_expert = (
-          "expert"
+          self._expert_parallelism_name
           in tuple(
               filter(
                   lambda tup: tup[0] == "activation_batch",
@@ -1094,10 +1099,9 @@ def gmm(
     )
     def wrapper(x, logits, pre_bias_logits, w0, w1, wo, w0_bias, w1_bias, wo_bias, rngs):
       batch_size, sequence_length, _ = x.shape
-      expert_axis_name = "expert"
       num_expert_parallelism = self.get_expert_parallelism_size()
       if num_expert_parallelism > 1:
-        expert_shard_id = jax.lax.axis_index(expert_axis_name)
+        expert_shard_id = jax.lax.axis_index(self._expert_parallelism_name)
       else:
         expert_shard_id = 0
       num_expert_parallelism = self.get_expert_parallelism_size()
@@ -1107,7 +1111,8 @@ def wrapper(x, logits, pre_bias_logits, w0, w1, wo, w0_bias, w1_bias, wo_bias, r
 
       # Duplicate inputs to all expert shards.
       x, logits, pre_bias_logits = tuple(
-          jax.lax.all_gather(z, axis_name=expert_axis_name, tiled=True) for z in (x, logits, pre_bias_logits)
+          jax.lax.all_gather(z, axis_name=self._expert_parallelism_name, tiled=True)
+          for z in (x, logits, pre_bias_logits)
       )
 
       # "Route" tokens within each shard.
@@ -1131,7 +1136,7 @@ def wrapper(x, logits, pre_bias_logits, w0, w1, wo, w0_bias, w1_bias, wo_bias, r
       )
 
       if num_expert_parallelism > 1:
-        batch_axis = "expert" if is_batch_sharded_by_expert else "data"
+        batch_axis = self._expert_parallelism_name if is_batch_sharded_by_expert else "data"
         # get group sizes for all shards
         local_expert_size = self.config.num_experts // num_expert_parallelism
         reshaped_group_sizes = jnp.sum(group_sizes.reshape(-1, local_expert_size), axis=1)
@@ -1163,9 +1168,9 @@ def wrapper(x, logits, pre_bias_logits, w0, w1, wo, w0_bias, w1_bias, wo_bias, r
               send_sizes,
               output_offsets,
               recv_sizes,
-              axis_name=expert_axis_name,
+              axis_name=self._expert_parallelism_name,
           )
-          global_group_sizes = jax.lax.all_gather(group_sizes, axis_name=expert_axis_name)
+          global_group_sizes = jax.lax.all_gather(group_sizes, axis_name=self._expert_parallelism_name)
           x, local_sorted_indices, group_sizes, selected_experts = RoutedMoE.local_permute(
               x,
               global_group_sizes,
@@ -1310,7 +1315,7 @@ def get_active_sharding_axes(pspec_dim_axes, tensor_dim_index):
 
         # Sum up the partial outputs across the expert shards.
         output = jnp.reshape(output, (-1, sequence_length, self.config.emb_dim))
-        output = jax.lax.psum_scatter(output, expert_axis_name, scatter_dimension=0, tiled=True)
+        output = jax.lax.psum_scatter(output, self._expert_parallelism_name, scatter_dimension=0, tiled=True)
 
       else:
         if num_expert_parallelism > 1:
@@ -1343,7 +1348,7 @@ def get_active_sharding_axes(pspec_dim_axes, tensor_dim_index):
                 send_sizes,
                 output_offsets,
                 recv_sizes,
-                axis_name=expert_axis_name,
+                axis_name=self._expert_parallelism_name,
             )
           else:
             # If batch is replicated across EP shards then each shard should send
@@ -1363,7 +1368,7 @@ def get_active_sharding_axes(pspec_dim_axes, tensor_dim_index):
                 send_sizes,
                 output_offsets,
                 recv_sizes,
-                axis_name=expert_axis_name,
+                axis_name=self._expert_parallelism_name,
             )
 
       output = self.unpermute(
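
The common thread in these hunks: every collective that used the literal axis name "expert" now takes self._expert_parallelism_name, so the same code path runs over attn_dp_expert when attention is vllm_rpa. A minimal shard_map sketch of an axis-name-parameterized psum_scatter (two devices assumed; the mesh and function names are illustrative, not MaxText's):

# Minimal sketch: the same collective runs over whichever named mesh axis is
# passed in, which is what parameterizing self._expert_parallelism_name buys.
# Assumes 2 devices; names here are illustrative, not MaxText's.
from functools import partial

import jax
import jax.numpy as jnp
import numpy as np
from jax.experimental.shard_map import shard_map
from jax.sharding import Mesh, PartitionSpec as P

mesh = Mesh(np.array(jax.devices()[:2]), axis_names=("attn_dp_expert",))

def sum_partials(axis_name, partial_out):
  # Each shard holds a partial expert output; psum_scatter sums the partials
  # and leaves every shard with its own slice of the result.
  return jax.lax.psum_scatter(partial_out, axis_name, scatter_dimension=0, tiled=True)

f = shard_map(
    partial(sum_partials, "attn_dp_expert"),
    mesh=mesh,
    in_specs=P(),                   # input replicated across the expert axis
    out_specs=P("attn_dp_expert"),  # output scattered along it
)
print(f(jnp.ones((4, 8))).shape)  # (4, 8) globally, (2, 8) per shard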

src/maxtext/trainers/post_train/rl/train_rl.py

Lines changed: 38 additions & 12 deletions

@@ -228,31 +228,56 @@ def setup_configs_and_devices(argv: list[str]):
   return trainer_config, sampler_config, trainer_devices, sampler_devices
 
 
-def get_rollout_kwargs_for_data_parallelism(sampler_config, num_sampler_devices):
+def get_rollout_kwargs_for_parallelism(sampler_config, num_sampler_devices):
   """Get rollout kwargs for vLLM rollout when using data parallelism."""
   dp = sampler_config.rollout_data_parallelism
-  if dp == -1:
-    return {}
-
-  rollout_kwargs = {}
   tp = sampler_config.rollout_tensor_parallelism
+  ep = sampler_config.rollout_expert_parallelism
+
+  # -1 means "auto-derive from the other two". At most one can be -1.
+  num_auto = sum(1 for x in [tp, dp, ep] if x == -1)
+  if num_auto > 1:
+    raise ValueError(
+        "At most one of rollout_tensor_parallelism, rollout_data_parallelism, "
+        "rollout_expert_parallelism can be -1 (auto-derived)."
+    )
 
-  if tp == -1:
-    if num_sampler_devices % dp != 0:
+  if dp == -1:
+    if num_sampler_devices % (tp * ep) != 0:
       raise ValueError(
           f"num_sampler_devices({num_sampler_devices}) must be divisible by "
-          f"rollout_data_parallelism({dp}) "
+          f"rollout_tensor_parallelism({tp}) * rollout_expert_parallelism({ep}) "
+          f"when rollout_data_parallelism is -1."
+      )
+    dp = num_sampler_devices // tp // ep
+  elif tp == -1:
+    if num_sampler_devices % (dp * ep) != 0:
+      raise ValueError(
+          f"num_sampler_devices({num_sampler_devices}) must be divisible by "
+          f"rollout_data_parallelism({dp}) * rollout_expert_parallelism({ep}) "
           f"when rollout_tensor_parallelism is -1."
       )
-    tp = num_sampler_devices // dp
-  elif tp * dp != num_sampler_devices:
+    tp = num_sampler_devices // dp // ep
+  elif ep == -1:
+    if num_sampler_devices % (tp * dp) != 0:
+      raise ValueError(
+          f"num_sampler_devices({num_sampler_devices}) must be divisible by "
+          f"rollout_tensor_parallelism({tp}) * rollout_data_parallelism({dp}) "
+          f"when rollout_expert_parallelism is -1."
+      )
+    ep = num_sampler_devices // tp // dp
+  elif tp * dp * ep != num_sampler_devices:
     raise ValueError(
         f"rollout_tensor_parallelism({tp}) * "
-        f"rollout_data_parallelism({dp}) "
+        f"rollout_data_parallelism({dp}) * "
+        f"rollout_expert_parallelism({ep}) "
        f"!= len(sampler_devices)({num_sampler_devices})"
    )
+
+  rollout_kwargs = {}
   rollout_kwargs["tensor_parallel_size"] = tp
   rollout_kwargs["data_parallel_size"] = dp
+  rollout_kwargs["expert_parallel_size"] = ep
 
   return rollout_kwargs
 
@@ -544,13 +569,14 @@ def _filter_long_prompts(x):
         rollout_vllm_async_scheduling=trainer_config.async_scheduling,
         rollout_vllm_kwargs={
             "hf_overrides": trainer_config.vllm_hf_overrides,
+            "enable_expert_parallel": sampler_config.rollout_expert_parallelism > 1,
         },
         rollout_vllm_sampling_kwargs={
             "stop": trainer_config.stop_strings,
            "detokenize": trainer_config.stop_strings is not None,
            "include_stop_str_in_output": trainer_config.stop_strings is not None,
         },
-        **get_rollout_kwargs_for_data_parallelism(sampler_config, len(sampler_devices)),
+        **get_rollout_kwargs_for_parallelism(sampler_config, len(sampler_devices)),
     ),
 )
 grpo_config = GrpoConfig(
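
A standalone sketch of the new tp/dp/ep derivation, simplified from the diff above (plain ints instead of config objects, error messages collapsed):

# Standalone sketch of the tp/dp/ep derivation, simplified from the diff.
def derive_rollout_parallelism(tp, dp, ep, num_devices):
  # At most one of the three may be -1 (auto-derived from the other two).
  if sum(x == -1 for x in (tp, dp, ep)) > 1:
    raise ValueError("At most one of tp, dp, ep can be -1.")
  if dp == -1:
    dp = num_devices // (tp * ep)
  elif tp == -1:
    tp = num_devices // (dp * ep)
  elif ep == -1:
    ep = num_devices // (tp * dp)
  # Floor division above makes this final check catch non-divisible cases too.
  if tp * dp * ep != num_devices:
    raise ValueError(f"tp({tp}) * dp({dp}) * ep({ep}) != {num_devices}")
  return {"tensor_parallel_size": tp, "data_parallel_size": dp, "expert_parallel_size": ep}

print(derive_rollout_parallelism(tp=2, dp=-1, ep=2, num_devices=8))
# {'tensor_parallel_size': 2, 'data_parallel_size': 2, 'expert_parallel_size': 2}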
