Make batch split factor configurable.

Kevin Wang · Google-ML-Automation · commit f62ee44b1f3f · 2026-02-06T14:38:33.000-08:00
PiperOrigin-RevId: 866619829
diff --git a/src/MaxText/configs/base.yml b/src/MaxText/configs/base.yml
@@ -240,6 +240,7 @@ topk_routing_group: -1 # number of top groups to route inputs. For EP,
 # Splits the batch to allow for better scheduling when using expert parallelism by overlapping the
 # all-to-all communication with compute. Currently only implemented with DeepSeek sparse layers.
 use_batch_split_schedule: False # a flag if splitting batch into micro-batches to hide communications that yields performance benefits.
+batch_split_factor: 1 # the factor by which to split the batch. Only used if use_batch_split_schedule is True.
 
 # For complex architectures like llama4 there are repeated sets of
 # inhomogeneous layers. E.g. maverick uses [dense+rope, moe+rope, dense+rope, moe+nope]
diff --git a/src/MaxText/configs/types.py b/src/MaxText/configs/types.py
@@ -692,6 +692,10 @@ class DeepSeekMoE(BaseModel):
       False,
       description="Whether to split batch into micro-batches to hide communications that yields performance benefits.",
   )
+  batch_split_factor: int = Field(
+      1,
+      description="Factor by which to split the batch into micro-batches. Only used if use_batch_split_schedule is True.",
+  )
 
 
 class Qwen3Next(BaseModel):
diff --git a/src/MaxText/layers/deepseek_batchsplit.py b/src/MaxText/layers/deepseek_batchsplit.py
@@ -71,7 +71,8 @@ def fetch_weights(params, dtype):
 @jax.named_scope("deepseek_batchsplit_split")
 def split(x, split_factor=2):
   """Splits the input into `split_factor` parts along the batch dimension."""
-
+  if split_factor == 1:
+    return [x]
   if x is None:
     return [None] * split_factor
   else:
@@ -80,8 +81,10 @@ def split(x, split_factor=2):
 
 
 @jax.named_scope("deepseek_batchsplit_merge")
-def merge(x):
+def merge(x, split_factor=2):
   """Merges the input microbatches back into a single tensor."""
+  if split_factor == 1:
+    return x[0]
   x = jnp.stack(x, axis=1)
   return jnp.reshape(x, (-1,) + x.shape[2:])
 
@@ -104,13 +107,13 @@ def batch_split_schedule(
       None,
   )
   xs = jax.shard_map(
-      split,
+      functools.partial(split, split_factor=cfg.batch_split_factor),
       mesh=mesh,
       in_specs=activation_pspec,
-      out_specs=[activation_pspec, activation_pspec],
+      out_specs=[activation_pspec] * cfg.batch_split_factor,
   )(inputs)
-  dpos = split(positions)
-  dseg = split(segment_ids)
+  dpos = split(positions, split_factor=cfg.batch_split_factor)
+  dseg = split(segment_ids, split_factor=cfg.batch_split_factor)
   xs = [with_data_parallel_constraint(x, mesh) for x in xs]
   xs = jax.ad_checkpoint.checkpoint_name(xs, "decoder_layer_input")
 
@@ -186,9 +189,9 @@ def batch_split_schedule(
       dtype=cfg.dtype,
   )
   xs = jax.shard_map(
-      merge,
+      functools.partial(merge, split_factor=cfg.batch_split_factor),
       mesh=mesh,
-      in_specs=([activation_pspec, activation_pspec],),
+      in_specs=([activation_pspec] * cfg.batch_split_factor,),
       out_specs=activation_pspec,
   )(xs)
   return xs

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -692,6 +692,10 @@ class DeepSeekMoE(BaseModel):` |
| `692` | `692` | `False,` |
| `693` | `693` | `description="Whether to split batch into micro-batches to hide communications that yields performance benefits.",` |
| `694` | `694` | `)` |
| | `695` | `+ batch_split_factor: int = Field(` |
| | `696` | `+ 1,` |
| | `697` | `+ description="Factor by which to split the batch into micro-batches. Only used if use_batch_split_schedule is True.",` |
| | `698` | `+ )` |
| `695` | `699` | |
| `696` | `700` | |
| `697` | `701` | `class Qwen3Next(BaseModel):` |