Skip to content

Commit 8d4f13a

Browse files
Merge pull request #3397 from AI-Hypercomputer:chengnuojin-enable-pipeline-2dfsdp
PiperOrigin-RevId: 883239276
2 parents 00ef5de + 4bc91e0 commit 8d4f13a

1 file changed

Lines changed: 8 additions & 4 deletions

File tree

src/maxtext/layers/moe.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -896,10 +896,14 @@ def sparse_matmul(
896896
def gmm(
897897
inputs, kernel, tiling, group_sizes, expert_assignments, weight_gather_axes, input_buffer_count, combine_scopes
898898
):
899-
tokamax_group_sizes = tokamax.RaggedDotGroupSizes(
900-
group_sizes,
901-
max_utils.generate_representative_group_sizes(inputs.shape[0], kernel.shape[0]),
902-
)
899+
# TODO (b/491979205) pipeline fsdp ag per repeat fails tokamax gmm
900+
if self.config.using_pipeline_parallelism and self.config.pipeline_fsdp_ag_per_repeat:
901+
tokamax_group_sizes = group_sizes
902+
else:
903+
tokamax_group_sizes = tokamax.RaggedDotGroupSizes(
904+
group_sizes,
905+
max_utils.generate_representative_group_sizes(inputs.shape[0], kernel.shape[0]),
906+
)
903907
pad_length = self.config.wi_tile_fwd_batch_seq
904908
hs_shape = inputs.shape
905909
# pad length is the 1st dimension of tiling size in gmm call

0 commit comments

Comments
 (0)