Skip to content

Commit e1a2ba7

Browse files
Merge pull request #3438 from AI-Hypercomputer:chengnuojin-custom-logical
PiperOrigin-RevId: 886413492
2 parents 0efc6ca + 307bc11 commit e1a2ba7

8 files changed

Lines changed: 200 additions & 16 deletions

File tree

src/maxtext/configs/base.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,7 @@ internal_compile_num_devices: -1 # You must specify the number of devices when u
431431

432432
# Parallelism
433433
shard_mode: "auto" # can be either auto or explicit
434+
custom_mesh_and_rule: "" # replace the default mesh and logical axis rules by specifying a yml name under configs/custom_mesh_and_rule/.
434435
mesh_axes: ['diloco', 'data', 'stage', 'fsdp', 'fsdp_transpose', 'sequence', 'context', 'context_autoregressive', 'tensor', 'tensor_transpose', 'tensor_sequence', 'expert', 'autoregressive']
435436
logical_axis_rules: [
436437
['activation_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert']],
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This logical rule is designed to optimize pipeline parallelism for large-scale jobs.
# Key changes include removing expert weight sharding on the `q_lora` dimension, which
# is relatively small (e.g., 512 for DeepSeek), and limiting sharding strategies when
# EP x FSDP > 512.
#
# The `data` axis is preserved for two reasons: first, the pipeline stage acts as a
# data parallel (DP) domain externally, making the `data` axis a necessary reference;
# second, it may be required for DCN communication.
#
# Finally, the `tensor` axis is used to shard weights when `pipeline_fsdp_ag_once` or
# `pipeline_fsdp_ag_per_repeat` is enabled, ensuring we have sufficient memory to
# store prefetched weights.
mesh_axes: ['data', 'stage', 'fsdp', 'tensor', 'expert']
data_sharding: [['data', 'stage', 'fsdp', 'tensor', 'expert']]
logical_axis_rules: [
  ['activation_batch', ['data', 'fsdp', 'expert']],
  ['activation_batch_no_exp', ['data', 'fsdp']],
  ['activation_embed_and_logits_batch', ['data', 'stage', 'fsdp', 'expert']],
  ['activation_embed_and_logits_batch_sequence', ['data', 'stage', 'fsdp', 'expert']],
  ['activation_heads', ['tensor']],
  ['activation_kv_heads', ['tensor']],
  ['activation_length', ['expert']],
  ['activation_attn_length', ['expert']],
  ['activation_q_length', ['expert']],
  ['activation_attn_embed', ['tensor']],
  ['activation_embed', ['tensor']],
  ['activation_mlp', ['tensor']],
  ['activation_kv', ['tensor']],
  ['activation_prefill_kv_batch', ['data', 'fsdp', 'expert']],
  ['activation_kv_batch', ['data', 'fsdp', 'expert']],
  ['activation_kv_batch_no_exp', ['data', 'fsdp']],
  ['activation_kv_head_dim', ['tensor']],
  ['activation_vocab', ['tensor']],
  ['activation_stage', 'stage'],
  ['activation_exp', ['expert']],
  ['decode_batch', ['data', 'fsdp', 'expert']],
  ['mlp', ['tensor']],
  ['mlp_no_fsdp', ['tensor']],
  ['vocab', ['tensor']],
  ['heads', ['tensor']],
  ['q_heads', ['tensor']],
  ['kv_heads', ['tensor']],
  ['embed', ['fsdp', 'expert']],
  ['embed_no_exp', ['fsdp']],
  ['q_lora', ['fsdp']],
  ['kv_lora', ['fsdp']],
  ['norm', ['tensor']],
  ['layers', 'stage'],
  ['cache_heads', ['tensor']],
  ['exp', 'expert'],
  ['exp_with_fsdp', 'fsdp'],
  ['paged_kv_heads', ['tensor']],
  ['engram_dim', ['tensor']],
]
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This rule only uses FSDP. Pure FSDP is the go-to sharding strategy
# for small-scale training and this rule simplifies the overall configuration.
mesh_axes: ['fsdp']
data_sharding: [['fsdp']]
logical_axis_rules: [
  ['activation_batch', ['fsdp']],
  ['activation_batch_no_exp', ['fsdp']],
  ['activation_embed_and_logits_batch', ['fsdp']],
  ['activation_embed_and_logits_batch_sequence', ['fsdp']],
  ['activation_prefill_kv_batch', ['fsdp']],
  ['activation_kv_batch', ['fsdp']],
  ['activation_kv_batch_no_exp', ['fsdp']],
  ['decode_batch', ['fsdp']],
  ['embed', ['fsdp']],
  ['embed_no_exp', ['fsdp']],
  ['q_lora', ['fsdp']],
  ['kv_lora', ['fsdp']],
  ['exp_with_fsdp', 'fsdp'],
]

src/maxtext/configs/types.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from math import prod
2626
import os
2727
from tempfile import gettempdir
28+
import yaml
2829
from typing import Any, Literal, NewType, Optional
2930

3031
import jax
@@ -781,6 +782,7 @@ class HardwareAndMesh(BaseModel):
781782
description="Strategy for context parallelism ('all_gather' or 'ring').",
782783
)
783784
custom_mesh: str = Field("", description="Available options: ['hybrid_ring_64x4', 'hybrid_ring_32x8']")
785+
custom_mesh_and_rule: str = Field("", description="Customized mesh and logical rules for granularity.")
784786
allow_split_physical_axes: bool = Field(False, description="Allow splitting physical axes for device mesh creation.")
785787
enable_nnx: bool = Field(False, description="Whether to use NNX for model definition.")
786788
optimize_mesh_for_tpu_v6e: bool = Field(False, description="Apply transformations to the mesh for TPU v6e.")
@@ -1962,6 +1964,24 @@ def set_derived_and_validate_values(self) -> "MaxTextConfig":
19621964
Computes all derived values and runs all cross-field validations after initial parsing.
19631965
This logic is ported from the legacy pyconfig_deprecated.py system and adapted for Pydantic.
19641966
"""
1967+
if self.custom_mesh_and_rule:
1968+
custom_mesh_path = os.path.join(
1969+
os.path.dirname(os.path.abspath(__file__)),
1970+
"custom_mesh_and_rule",
1971+
f"{self.custom_mesh_and_rule}.yml",
1972+
)
1973+
if os.path.exists(custom_mesh_path):
1974+
with open(custom_mesh_path, "r") as f: # pylint: disable=unspecified-encoding
1975+
custom_mesh_config = yaml.safe_load(f)
1976+
if "mesh_axes" in custom_mesh_config:
1977+
self.mesh_axes = custom_mesh_config["mesh_axes"]
1978+
if "logical_axis_rules" in custom_mesh_config:
1979+
self.logical_axis_rules = custom_mesh_config["logical_axis_rules"]
1980+
if "data_sharding" in custom_mesh_config:
1981+
self.data_sharding = custom_mesh_config["data_sharding"]
1982+
else:
1983+
raise NotImplementedError(f"Custom mesh config file not found at {custom_mesh_path}")
1984+
19651985
# A. SET RUN NAME AND PATHS
19661986
# If run_name is not set, generate one from the JOBSET_NAME environment variable (if available)
19671987
# or create one from the model name and a timestamp.

src/maxtext/layers/attention_op.py

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1184,7 +1184,7 @@ def tpu_flash_attention(
11841184
global_k_layout = self.config.sa_k_layout
11851185
global_v_layout = self.config.sa_v_layout
11861186

1187-
devices_in_data_fsdp = self.mesh.shape["data"] * self.mesh.shape["fsdp"]
1187+
devices_in_data_fsdp = self.mesh.shape.get("data", 1) * self.mesh.shape.get("fsdp", 1)
11881188
assert (query.shape[0] / devices_in_data_fsdp).is_integer(), (
11891189
"Batch dimension should be shardable among the devices in data and fsdp"
11901190
" axis"
@@ -1284,22 +1284,17 @@ def create_sa_config(config, query, key, attn_logits_soft_cap):
12841284
jax.jit,
12851285
static_argnames=[
12861286
"single_head_mask",
1287-
"shard_head_size",
12881287
],
12891288
)
1290-
def wrap_splash_kernel(single_head_mask, shard_head_size=1):
1289+
def wrap_splash_kernel(single_head_mask):
12911290
splash_kernel = tokamax_splash_kernel.make_splash_mha(
12921291
mask=single_head_mask,
12931292
config=sa_config,
12941293
q_seq_shards=cp_size, # axis for sequence sharding,
12951294
)
12961295
return splash_kernel
12971296

1298-
logical_axis_rules_head = np.array(
1299-
[self.mesh.shape[physical_axes] for physical_axes in dict(self.config.logical_axis_rules)[HEAD]]
1300-
)
1301-
shard_head_size = np.prod(logical_axis_rules_head)
1302-
splash_kernel = wrap_splash_kernel(single_head_mask, int(shard_head_size))
1297+
splash_kernel = wrap_splash_kernel(single_head_mask)
13031298
if self.config.expert_shard_attention_option == EP_AS_CONTEXT:
13041299
segment_axis_names_splash_kernel = self._logical_to_mesh_axes((Q_LENGTH,))
13051300
else:
@@ -1331,11 +1326,10 @@ def wrap_splash_kernel(multi_head_mask, shard_head_size=1):
13311326
)
13321327
return splash_kernel
13331328

1334-
logical_axis_rules_head = np.array(
1335-
[self.mesh.shape[physical_axes] for physical_axes in dict(self.config.logical_axis_rules)[HEAD]]
1336-
)
1337-
shard_head_size = np.prod(logical_axis_rules_head)
1338-
splash_kernel = wrap_splash_kernel(multi_head_mask, int(shard_head_size))
1329+
head_physical_axes = logical_to_mesh_axes((HEAD,), self.mesh)[0]
1330+
head_physical_axes = (head_physical_axes,) if isinstance(head_physical_axes, str) else (head_physical_axes or ())
1331+
shard_head_size = math.prod(self.mesh.shape.get(ax, 1) for ax in head_physical_axes)
1332+
splash_kernel = wrap_splash_kernel(multi_head_mask, shard_head_size)
13391333
named_sharding = jax.sharding.NamedSharding(self.mesh, axis_names_splash_kernel)
13401334
segment_axis_names_splash_kernel = splash_kernel.manual_sharding_spec(named_sharding)
13411335

src/maxtext/utils/sharding.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,9 +151,9 @@ def remove_size_one_mesh_axis(spec, mesh):
151151
if s is None or s == P.UNCONSTRAINED:
152152
new_spec.append(s) # type: ignore
153153
elif isinstance(s, tuple):
154-
new_spec.append(tuple(i for i in s if mesh.shape[i] != 1))
154+
new_spec.append(tuple(i for i in s if mesh.shape.get(i, 1) != 1))
155155
else:
156-
new_spec.append(None if mesh.shape[s] == 1 else s) # type: ignore
156+
new_spec.append(None if mesh.shape.get(s, 1) == 1 else s) # type: ignore
157157
return P(*new_spec, unreduced=spec.unreduced, reduced=spec.reduced)
158158

159159

src/maxtext/utils/train_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ def setup_train_loop(config, recorder, devices=None):
204204
data_iterator, eval_data_iterator = create_data_iterator(config, mesh)
205205
rampup_manager = create_rampup_manager(config, checkpoint_manager)
206206
data_loader = create_dataloader(config, mesh, data_iterator, recorder, rampup_manager)
207-
context_parallel_size = mesh.shape["context"]
207+
context_parallel_size = mesh.shape.get("context", 1)
208208
# Check if context parallelism is being used with sequence packing
209209
if context_parallel_size > 1 and config.packing and config.dataset_type != "synthetic":
210210
raise ValueError(
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Tests for custom mesh and sharding rule configurations in model compilation.
16+
17+
This module verifies that `train_compile.py` correctly processes the
18+
`custom_mesh_and_rule` flag. It ensures that user-defined hardware
19+
meshes and parallelization strategies compile successfully prior to execution.
20+
"""
21+
22+
import unittest
23+
24+
import pytest
25+
26+
from maxtext.trainers.pre_train.train_compile import main as train_compile_main
27+
from tests.utils.test_helpers import get_test_config_path
28+
29+
30+
@pytest.mark.tpu_backend
31+
class CustomMeshAndRuleTest(unittest.TestCase):
32+
"""Tests for custom_mesh functionality in train_compile.py"""
33+
34+
@pytest.mark.cpu_only
35+
def test_pure_fsdp(self):
36+
"""Test compiling with a pure FSDP custom mesh."""
37+
train_compile_main(
38+
(
39+
"",
40+
get_test_config_path(),
41+
"compile_topology=v4-8",
42+
"compile_topology_num_slices=1",
43+
"base_emb_dim=256",
44+
"base_mlp_dim=256",
45+
"base_num_decoder_layers=1",
46+
"custom_mesh_and_rule=pure-fsdp",
47+
)
48+
)
49+
50+
@pytest.mark.cpu_only
51+
def test_ds3_large_pp(self):
52+
"""Test compiling deepseek3-tiny with the pipeline-large-moe custom mesh."""
53+
train_compile_main(
54+
(
55+
"",
56+
get_test_config_path(),
57+
"compile_topology=v5p-32",
58+
"compile_topology_num_slices=1",
59+
"ici_fsdp_transpose_parallelism=2",
60+
"ici_expert_parallelism=2",
61+
"model_name=deepseek3-tiny",
62+
"override_model_config=true",
63+
"base_emb_dim=256",
64+
"base_mlp_dim=256",
65+
"base_num_decoder_layers=4",
66+
"custom_mesh_and_rule=pipeline-large-moe",
67+
)
68+
)

0 commit comments

Comments
 (0)