Skip to content

Commit 1ce4481

Browse files
gagikaNuojCheng
authored and committed
Enable grain input pipeline save and restore for distillation.
Simple fix on debug sharding log; add all-gather insertion per repeat; working all-gather insertion (clean version); fsdp+pp bug free; add bsw checkpoint; split bsw all-gather into two; add custom vjp.
1 parent 2f8c473 commit 1ce4481

5 files changed

Lines changed: 425 additions & 158 deletions

File tree

src/maxtext/configs/base.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,7 @@ pipeline_fsdp_ag_once: False # If set to true then all gather all of the weights
299299
# It may be useful to do the reverse when the layers_per_stage is very large.
300300
# The below settings only have effect when using pipeline parallelism.
301301
scan_pipeline_iterations: True
302+
scan_pipeline_repeats: True
302303
scan_layers_per_stage: False
303304
set_remat_policy_on_pipeline_iterations: True
304305
set_remat_policy_on_layers_per_stage: False
@@ -922,7 +923,7 @@ xprof_e2e_enable_fw_power_level_event: False
922923
xprof_e2e_enable_fw_thermal_event: False
923924
profile_power_events: False # Set to True to enable TPU-specific power/thermal profiling events. Defaults to False to avoid breaking GPU xplane tracing.
924925

925-
log_config: True # Prints the config (after defaults have been set by pyconfig logic)
926+
log_config: False # Prints the config (after defaults have been set by pyconfig logic)
926927
debug_sharding: False # Prints model weights sharding info
927928

928929
# Checkpoint Structured logging

src/maxtext/configs/types.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -855,6 +855,7 @@ class PipelineParallelism(BaseModel):
855855
)
856856
pipeline_fsdp_ag_once: bool = Field(False, description="If True, all-gather FSDP weights once per pipeline repeat.")
857857
scan_pipeline_iterations: bool = Field(True, description="Use jax.lax.scan over pipeline iterations.")
858+
scan_pipeline_repeats: bool = Field(True, description="Use jax.lax.scan over pipeline repeats.")
858859
scan_layers_per_stage: bool = Field(False, description="Use jax.lax.scan over layers within a stage.")
859860
set_remat_policy_on_pipeline_iterations: bool = Field(True, description="Set remat policy on the pipeline scan.")
860861
set_remat_policy_on_layers_per_stage: bool = Field(False, description="Set remat policy on the inner layer scan.")

src/maxtext/layers/decoders.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -794,12 +794,9 @@ def __call__(
794794
model_mode,
795795
)
796796
if cfg.using_pipeline_parallelism:
797-
if cfg.pipeline_fsdp_ag_once:
798-
logical_partition_spec = self.pipeline_module.get_weight_sharding(
799-
y, decoder_segment_ids, decoder_positions, deterministic, model_mode
800-
)
801-
else:
802-
logical_partition_spec = None # This partition spec is only used for the fsdp_ag_once feature.
797+
logical_partition_spec = self.pipeline_module.get_weight_sharding(
798+
y, decoder_segment_ids, decoder_positions, deterministic, model_mode
799+
)
803800
if cfg.decoder_block == DecoderBlockType.DEEPSEEK:
804801
assert len(RemattedBlockLayers) == 2, "Scanned layers must have a length of 2 using deepseek."
805802
dense_layer = RemattedBlockLayers[0]
@@ -1087,6 +1084,13 @@ def __call__(
10871084

10881085
else:
10891086
logits = self.apply_output_head(shared_embedding, hidden_state, deterministic, model_mode)
1087+
logits = sharding.maybe_shard_with_logical(
1088+
logits,
1089+
("activation_embed_and_logits_batch", "activation_length_no_exp", "activation_vocab"),
1090+
mesh=self.mesh,
1091+
shard_mode=self.config.shard_mode,
1092+
debug_sharding=self.config.debug_sharding,
1093+
)
10901094

10911095
# The API of the Decoder is now a tuple, providing both the main output
10921096
# and the raw hidden state needed for auxiliary tasks.

0 commit comments

Comments (0)