
Commit c231424

add new pipeline weight prefetching config
1 parent 92e1117 commit c231424

8 files changed

Lines changed: 212 additions & 69 deletions


src/maxtext/configs/base.yml

Lines changed: 4 additions & 1 deletion
@@ -275,6 +275,9 @@ pipeline_parallel_layers: -1 # Pipeline only this number of layers - for the rem
 # PP degree divides the number of layers.
 # By default (when set to -1) we pipeline all of the decoder layers.

+# Pipeline weight prefetching is an advanced optimization for SPMD pipeline parallelism.
+# When enabled, the weight all-gathers needed by upcoming microbatches are prefetched ahead of the computation, reducing exposed collective time.
+use_pipeline_weight_prefetching: False

 # num_pipeline_microbatches must be a multiple of the number of pipeline stages. By default it is set to the number of stages.
 # Note the microbatch_size is given by global_batch_size / num_pipeline_microbatches, where global_batch_size = per_device_batch_size * num_devices
@@ -923,7 +926,7 @@ xprof_e2e_enable_fw_power_level_event: False
 xprof_e2e_enable_fw_thermal_event: False
 profile_power_events: False # Set to True to enable TPU-specific power/thermal profiling events. Defaults to False to avoid breaking GPU xplane tracing.

-log_config: False # Prints the config (after defaults have been set by pyconfig logic)
+log_config: True # Prints the config (after defaults have been set by pyconfig logic)
 debug_sharding: False # Prints model weights sharding info

 # Checkpoint Structured logging
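The two comment lines added above are the only description of the feature rendered in this commit, so here is a rough, hedged illustration of the pattern they describe. This is not MaxText's implementation and every name in it is invented: the scan loop carries the next iteration's weights in its state, so under jit the fetch of the upcoming weights can overlap with the current microbatch's compute.

# Minimal sketch (hypothetical names, not MaxText code) of weight prefetching
# in a pipelined scan loop: `all_weights` stacks one weight matrix per
# circular repeat, and the "prefetch" is selecting the next iteration's
# weights before computing, so XLA can overlap the gather with the matmul.
import jax
import jax.numpy as jnp


def pipeline_step(carry, _):
  x, prefetched_w, all_weights, i = carry
  # Start fetching the weights for iteration i + 1 before computing with the
  # weights fetched on the previous iteration.
  next_w = all_weights[(i + 1) % all_weights.shape[0]]
  x = jnp.tanh(x @ prefetched_w)  # compute for iteration i
  return (x, next_w, all_weights, i + 1), None


@jax.jit
def run_pipeline(x, all_weights):
  init = (x, all_weights[0], all_weights, jnp.array(0))
  (x, _, _, _), _ = jax.lax.scan(
      pipeline_step, init, None, length=all_weights.shape[0]
  )
  return x


print(run_pipeline(jnp.ones((4, 8)), jnp.ones((3, 8, 8))).shape)  # (4, 8)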

src/maxtext/configs/types.py

Lines changed: 14 additions & 0 deletions
@@ -840,6 +840,9 @@ class IciParallelism(BaseModel):
 class PipelineParallelism(BaseModel):
   """Configuration for pipeline parallelism."""

+  use_pipeline_weight_prefetching: bool = Field(
+      False, description="Enable weight prefetching for circular pipeline parallelism."
+  )
   num_layers_per_pipeline_stage: int = Field(1, description="Number of layers to place on each pipeline stage.")
   num_pipeline_repeats: int = Field(
       -1,
@@ -2237,6 +2240,17 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
     )
     self.num_pipeline_repeats = num_pipeline_repeats

+    if self.use_pipeline_weight_prefetching:
+      assert self.num_pipeline_repeats > 1, "Pipeline weight prefetching only supports circular pipelines."
+      assert (
+          self.num_layers_per_pipeline_stage == 1
+      ), "Pipeline weight prefetching currently only supports one layer per pipeline stage."
+      assert (
+          not self.pipeline_delay_activation_forwarding
+      ), "Pipeline weight prefetching does not support delayed activation forwarding."
+      assert not self.quantization, "Quantization is currently not supported with pipeline weight prefetching."
+      assert not self.scan_layers_per_stage, "Pipeline weight prefetching currently does not support scanned layers."
+
     assert (num_stages * self.num_pipeline_repeats * self.num_layers_per_pipeline_stage) == (
         self.pipeline_parallel_layers
     ), (
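For reference, here is an illustrative set of flag values that passes every check in the block above. It is a sketch, not taken from a real config file.

# Illustrative flag values satisfying the new validation block: a circular
# schedule (num_pipeline_repeats > 1), one layer per stage, no delayed
# activation forwarding, no quantization, no per-stage layer scanning.
valid_prefetching_settings = {
    "use_pipeline_weight_prefetching": True,
    "num_pipeline_repeats": 4,  # > 1, i.e. a circular pipeline
    "num_layers_per_pipeline_stage": 1,
    "pipeline_delay_activation_forwarding": False,
    "quantization": "",  # falsy, so the quantization assert passes
    "scan_layers_per_stage": False,
}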

src/maxtext/layers/decoders.py

Lines changed: 1 addition & 8 deletions
@@ -796,7 +796,7 @@ def __call__(
     if cfg.using_pipeline_parallelism:
       logical_partition_spec = (
           self.pipeline_module.get_weight_sharding(y, decoder_segment_ids, decoder_positions, deterministic, model_mode)
-          if cfg.quantization == ""
+          if cfg.pipeline_fsdp_ag_once or cfg.use_pipeline_weight_prefetching
           else None
       )
     if cfg.decoder_block == DecoderBlockType.DEEPSEEK:
@@ -1086,13 +1086,6 @@ def __call__(

     else:
       logits = self.apply_output_head(shared_embedding, hidden_state, deterministic, model_mode)
-      logits = sharding.maybe_shard_with_logical(
-          logits,
-          ("activation_embed_and_logits_batch", "activation_length_no_exp", "activation_vocab"),
-          mesh=self.mesh,
-          shard_mode=self.config.shard_mode,
-          debug_sharding=self.config.debug_sharding,
-      )

     # The API of the Decoder is now a tuple, providing both the main output
     # and the raw hidden state needed for auxiliary tasks.

src/maxtext/layers/pipeline.py

Lines changed: 139 additions & 54 deletions
Large diffs are not rendered by default.

src/maxtext/models/deepseek_batchsplit.py

Lines changed: 1 addition & 1 deletion
@@ -809,7 +809,7 @@ def gmm(
         group_sizes,
         representative_value=max_utils.generate_representative_group_sizes(inputs.shape[0], kernel.shape[0]),
     )
-    if config.use_qwix_quantization or config.using_pipeline_parallelism:
+    if config.use_qwix_quantization or (config.using_pipeline_parallelism and config.use_pipeline_weight_prefetching):
       output = megablox.gmm(
           lhs=inputs,
           rhs=kernel,

src/maxtext/utils/pipeline_utils.py

Lines changed: 1 addition & 1 deletion
@@ -255,7 +255,7 @@ def create_run_scannable(
   """Creates a scannable function for pipeline loop iterations."""

   def run_scannable(model, loop_state):
-    loop_state["bsw"] = model.bsw_all_gather_over_fsdp(
+    loop_state["bsw"] = model.weight_prefetching(
         loop_state["weights"], physical_partition_spec, loop_state["loop_iteration"]
     )
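This call site is the only rendered piece of the new prefetching path (the pipeline.py diff above is collapsed), so the following is only a guess at the shape of the pattern, inferred from this one line: the scan body refreshes the gathered weights ("bsw") in the carried loop state before the stage computation consumes them. All helper names below are stand-ins; in MaxText, model.weight_prefetching also handles the FSDP all-gather and the physical_partition_spec.

# Hedged, self-contained sketch of the loop pattern at the call site above.
import jax
import jax.numpy as jnp


def weight_prefetching(stacked_weights, loop_iteration):
  # Stand-in for model.weight_prefetching(weights, spec, loop_iteration):
  # pick the weights this iteration needs from a stacked buffer.
  return stacked_weights[loop_iteration % stacked_weights.shape[0]]


def run_scannable(loop_state, _):
  # Refresh the gathered ("bsw") weights first, as in the diff above.
  loop_state["bsw"] = weight_prefetching(
      loop_state["weights"], loop_state["loop_iteration"]
  )
  loop_state["activations"] = jnp.tanh(
      loop_state["activations"] @ loop_state["bsw"]
  )
  loop_state["loop_iteration"] = loop_state["loop_iteration"] + 1
  return loop_state, None


weights = jnp.ones((4, 8, 8))
state = {
    "weights": weights,
    "bsw": weights[0],
    "activations": jnp.ones((2, 8)),
    "loop_iteration": jnp.array(0),
}
state, _ = jax.lax.scan(run_scannable, state, None, length=4)
print(state["activations"].shape)  # (2, 8)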

tests/unit/pipeline_parallelism_test.py

Lines changed: 52 additions & 0 deletions
@@ -278,6 +278,24 @@ def test_circular_ag_once(self):
     )
     self.assert_pipeline_same_output_and_grad(config)

+  @pytest.mark.tpu_only
+  def test_circular_pipeline_prefetching(self):
+    # 2 stages, 8 microbatches, pipeline weight prefetching enabled
+    config = pyconfig.initialize(
+        [sys.argv[0], get_test_config_path()],
+        enable_checkpointing=False,
+        enable_goodput_recording=False,
+        run_name="circular_prefetching",
+        max_target_length=128,
+        base_emb_dim=28,
+        ici_pipeline_parallelism=2,
+        base_num_decoder_layers=8,
+        num_pipeline_microbatches=8,
+        per_device_batch_size=4,
+        use_pipeline_weight_prefetching=True,
+    )
+    self.assert_pipeline_same_output_and_grad(config)
+
   @pytest.mark.tpu_only
   def test_non_circular_same_output_and_grad(self):
     # 4 stages, 4 layers (no circular repeats, 1 layer per stage), 4 microbatches
@@ -326,6 +344,40 @@ def test_full_train_circular(self):
         ]
     )

+  @pytest.mark.integration_test
+  @pytest.mark.tpu_only
+  def test_full_train_circular_pipeline_prefetching(self):
+    # Run a full train.py call with 2 stages, 32 layers (1 layer per stage, 16 circular repeats),
+    # 4 microbatches, and pipeline weight prefetching enabled
+    train_main(
+        [
+            None,
+            get_test_config_path(),
+            f"base_output_directory={self.base_output_directory}",
+            "run_name=runner_pipeline_parallelism_test",
+            f"dataset_path={self.dataset_path}",
+            "base_emb_dim=28",
+            "base_num_query_heads=4",
+            "base_num_kv_heads=4",
+            "base_mlp_dim=32",
+            "base_num_decoder_layers=32",
+            "head_dim=128",
+            "per_device_batch_size=2",
+            "max_target_length=1024",
+            "vocab_size=32",
+            "dataset_type=synthetic",
+            "steps=3",
+            "enable_checkpointing=False",
+            "enable_goodput_recording=False",
+            "ici_pipeline_parallelism=2",
+            "num_layers_per_pipeline_stage=1",
+            "num_pipeline_microbatches=4",
+            "use_pipeline_weight_prefetching=True",
+            rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizers', 'tokenizer.llama2')}",
+            "scan_layers_per_stage=False",  # We see better performance only scanning the pipeline iterations.
+        ]
+    )
+
   @pytest.mark.tpu_only
   def test_delay_activation_forwarding_same_output_and_grad(self):
     # 4 stages, delayed activation forwarding, 8 layers (2 repeats, 1 layer per stage), 8 microbatches

tests/unit/train_compile_test.py

Lines changed: 0 additions & 4 deletions
@@ -860,7 +860,6 @@ def test_circular_pipeline_compile_pp_fsdp_fsdpt_ep_ds3(self):
         "use_random_routing=true",
         "allow_split_physical_axes=true",
         "max_target_length=4096",
-        "remat_policy=custom",
       )
     )

@@ -881,10 +880,8 @@ def test_circular_pipeline_compile_pp_fsdp_tp_ds3(self):
         "pipeline_parallel_layers=56",
         "num_pipeline_microbatches=16",
         "model_name=deepseek3-671b",
-        "ici_expert_parallelism=4",
         "allow_split_physical_axes=true",
         "max_target_length=4096",
-        "remat_policy=custom",
       )
     )

@@ -910,7 +907,6 @@
         "use_random_routing=false",
         "allow_split_physical_axes=true",
         "max_target_length=4096",
-        "remat_policy=custom",
       )
     )