Skip to content

Commit 519ff77

Browse files
Kevin Wang authored and Google-ML-Automation committed
Refactor execution of batch-split deepseek sparse layers in Decoder to use pure JAX.
PiperOrigin-RevId: 875276104
1 parent 46ec3af commit 519ff77

3 files changed

Lines changed: 126 additions & 29 deletions

File tree

src/maxtext/layers/decoders.py

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
from maxtext.layers.quantizations import AqtQuantization as Quant
4242
from maxtext.models import (
4343
deepseek,
44+
deepseek_batchsplit,
4445
gemma,
4546
gemma2,
4647
gemma3,
@@ -865,15 +866,32 @@ def __call__(
865866
moe_layer = RemattedBlockLayers[1]
866867
moe_layer.__call__ = functools.partial(moe_layer.__call__, **layer_call_kwargs)
867868
num_moe_layers = cfg.num_decoder_layers - cfg.first_num_dense_layers
868-
y, _ = self.scan_decoder_layers(
869-
cfg,
870-
moe_layer,
871-
num_moe_layers,
872-
"moe_layers",
873-
mesh,
874-
in_axes_tuple=(nn.broadcast,) * len(broadcast_args),
875-
model_mode=model_mode,
876-
)(y, *broadcast_args)
869+
870+
# If batch-split schedule is used and initialization is complete,
871+
# as detected by immutable params, use deepseek_batchsplit custom
872+
# scan with initialized parameters.
873+
if cfg.use_batch_split_schedule and not self.is_mutable_collection("params"):
874+
y = deepseek_batchsplit.scan_batch_split_layers(
875+
y,
876+
self.variables["params"]["moe_layers"],
877+
decoder_positions,
878+
decoder_segment_ids,
879+
model_mode=model_mode,
880+
mesh=mesh,
881+
quant=self.quant,
882+
cfg=cfg,
883+
policy=policy,
884+
)
885+
else:
886+
y, _ = self.scan_decoder_layers(
887+
cfg,
888+
moe_layer,
889+
num_moe_layers,
890+
"moe_layers",
891+
mesh,
892+
in_axes_tuple=(nn.broadcast,) * len(broadcast_args),
893+
model_mode=model_mode,
894+
)(y, *broadcast_args)
877895
elif cfg.decoder_block == DecoderBlockType.GEMMA3:
878896
y = self._apply_gemma3_scanned_blocks(
879897
y,

src/maxtext/models/deepseek.py

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,11 @@
1616
# pylint: disable=arguments-differ
1717
# pylint: disable=no-name-in-module
1818

19+
import functools
1920
from typing import Optional
2021

2122
from flax import nnx
23+
import jax
2224
from jax.ad_checkpoint import checkpoint_name
2325
import jax.numpy as jnp
2426
from jax.sharding import Mesh
@@ -58,7 +60,6 @@ def __init__(
5860
rngs: nnx.Rngs,
5961
quant: Optional[quantizations.AqtQuantization] = None,
6062
) -> None:
61-
6263
self.config = config
6364
self.model_mode = model_mode
6465
self.mesh = mesh
@@ -350,7 +351,6 @@ def __init__(
350351
rngs: nnx.Rngs,
351352
quant: Optional[quantizations.AqtQuantization] = None,
352353
) -> None:
353-
354354
super().__init__(config, model_mode, mesh, rngs, quant)
355355
self.DeepSeekMoeBlock_0 = moe.RoutedAndSharedMoE(
356356
config=self.config,
@@ -380,18 +380,48 @@ def __call__(
380380
if isinstance(inputs, tuple):
381381
inputs = inputs[0]
382382

383-
# If using batch split schedule, call the batch split version of the layer.
383+
# This code should only be traced during initialization when using
384+
# batch-split schedule. It is never run during model execution, since
385+
# `Decoder` directly calls `batch_split_schedule` during execution.
386+
# That is also why we can split/merge activations here as well as
387+
# in `Decoder`, since they will never be executed together.
384388
if self.config.use_batch_split_schedule:
389+
activation_pspec = jax.sharding.PartitionSpec(
390+
("data", "fsdp", "fsdp_transpose", "expert", "context"),
391+
None,
392+
None,
393+
)
394+
inputs = jax.shard_map(
395+
functools.partial(
396+
deepseek_batchsplit.split,
397+
split_factor=self.config.batch_split_factor,
398+
),
399+
mesh=self.mesh,
400+
in_specs=activation_pspec,
401+
out_specs=[activation_pspec] * self.config.batch_split_factor,
402+
)(inputs)
403+
dpos = deepseek_batchsplit.split(decoder_positions, self.config.batch_split_factor)
404+
dseg = deepseek_batchsplit.split(decoder_segment_ids, self.config.batch_split_factor)
405+
weights = deepseek_batchsplit.fetch_weights(nnx.to_pure_dict(nnx.state(self, nnx.Param)), self.config.dtype)
385406
outputs = deepseek_batchsplit.batch_split_schedule(
386407
inputs,
387-
nnx.to_pure_dict(nnx.state(self, nnx.Param)),
388-
decoder_positions,
389-
decoder_segment_ids,
408+
weights,
409+
dpos,
410+
dseg,
390411
model_mode=model_mode,
391412
mesh=self.mesh,
392413
quant=self.quant,
393414
cfg=self.config,
394415
)
416+
outputs = jax.shard_map(
417+
functools.partial(
418+
deepseek_batchsplit.merge,
419+
split_factor=self.config.batch_split_factor,
420+
),
421+
mesh=self.mesh,
422+
in_specs=([activation_pspec] * self.config.batch_split_factor,),
423+
out_specs=activation_pspec,
424+
)(outputs)
395425
return outputs, None
396426

397427
x = self.with_logical_constraint(inputs)

src/maxtext/models/deepseek_batchsplit.py

Lines changed: 63 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,9 @@ def _q_psum_scatter_bwd(
106106
def fetch_weights(params, dtype):
107107
"""Fetches weights from params in the proper format for batch-split schedule."""
108108
return jax.tree.map(
109-
lambda x: jnp.asarray(x[...], dtype),
109+
# If x is a LogicallyPartitioned array, then x.value is the underlying
110+
# array. If not, use the array directly.
111+
lambda x: jnp.asarray(getattr(x, "value", x)[...], dtype),
110112
(
111113
(
112114
(
@@ -165,7 +167,7 @@ def merge(x, split_factor=2):
165167
return jnp.reshape(x, (-1,) + x.shape[2:])
166168

167169

168-
def batch_split_schedule(
170+
def scan_batch_split_layers(
169171
inputs,
170172
params,
171173
positions,
@@ -175,22 +177,75 @@ def batch_split_schedule(
175177
mesh,
176178
quant,
177179
cfg,
180+
policy,
178181
):
179-
"""Applies the DeepSeek MoE layer with batch-split schedule."""
182+
"""Scans the layers with batch-split schedule."""
183+
184+
def batch_split_scan_fn(inputs, weights, dpos, dseg):
185+
xs = batch_split_schedule(
186+
inputs,
187+
weights,
188+
dpos,
189+
dseg,
190+
model_mode=model_mode,
191+
mesh=mesh,
192+
quant=quant,
193+
cfg=cfg,
194+
)
195+
return xs, None
196+
197+
batch_split_scan_fn_checkpointed = jax.checkpoint(
198+
batch_split_scan_fn,
199+
# No need to prevent CSE inside scan.
200+
prevent_cse=False,
201+
policy=policy,
202+
)
203+
weights = fetch_weights(params, cfg.dtype)
204+
# `jax.lax.scan` expects the leading dimension of weights to be the scan
205+
# dimension, but the weights are initialized/loaded with the param scan
206+
# axis as the scan dimension, so swap the axes.
207+
weights = jax.tree.map(lambda x: jnp.swapaxes(x, 0, cfg.param_scan_axis), weights)
208+
180209
activation_pspec = jax.sharding.PartitionSpec(
181210
("data", "fsdp", "fsdp_transpose", "expert", "context"),
182211
None,
183212
None,
184213
)
185-
xs = jax.shard_map(
214+
inputs = jax.shard_map(
186215
functools.partial(split, split_factor=cfg.batch_split_factor),
187216
mesh=mesh,
188217
in_specs=activation_pspec,
189218
out_specs=[activation_pspec] * cfg.batch_split_factor,
190219
)(inputs)
191220
dpos = split(positions, split_factor=cfg.batch_split_factor)
192221
dseg = split(segment_ids, split_factor=cfg.batch_split_factor)
193-
xs = [with_data_parallel_constraint(x, mesh) for x in xs]
222+
outputs, _ = jax.lax.scan(
223+
functools.partial(batch_split_scan_fn_checkpointed, dpos=dpos, dseg=dseg),
224+
inputs,
225+
weights,
226+
)
227+
outputs = jax.shard_map(
228+
functools.partial(merge, split_factor=cfg.batch_split_factor),
229+
mesh=mesh,
230+
in_specs=([activation_pspec] * cfg.batch_split_factor,),
231+
out_specs=activation_pspec,
232+
)(outputs)
233+
return outputs
234+
235+
236+
def batch_split_schedule(
237+
inputs,
238+
weights,
239+
positions,
240+
segment_ids,
241+
*,
242+
model_mode,
243+
mesh,
244+
quant,
245+
cfg,
246+
):
247+
"""Applies the DeepSeek MoE layer with batch-split schedule."""
248+
xs = [with_data_parallel_constraint(x, mesh) for x in inputs]
194249
xs = jax.ad_checkpoint.checkpoint_name(xs, "decoder_layer_input")
195250

196251
attn_op = attention_op.AttentionOp(
@@ -207,12 +262,12 @@ def batch_split_schedule(
207262
dtype=cfg.dtype,
208263
attention_type=cfg.attention_type,
209264
)
210-
norm_mla_ws, moe_ws = fetch_weights(params, cfg.dtype)
265+
norm_mla_ws, moe_ws = weights
211266
xs = mla_with_norms(
212267
xs,
213268
norm_mla_ws,
214-
dpos,
215-
dseg,
269+
positions,
270+
segment_ids,
216271
mesh=mesh,
217272
model_mode=model_mode,
218273
attn_op=attn_op,
@@ -242,12 +297,6 @@ def batch_split_schedule(
242297
use_gather_mosaic_kernel=False,
243298
config=cfg,
244299
)
245-
xs = jax.shard_map(
246-
functools.partial(merge, split_factor=cfg.batch_split_factor),
247-
mesh=mesh,
248-
in_specs=([activation_pspec] * cfg.batch_split_factor,),
249-
out_specs=activation_pspec,
250-
)(xs)
251300
return xs
252301

253302

0 commit comments

Comments (0)