
Commit d272058

gobbleturk authored and Google-ML-Automation committed
Add internal functionality for train_compile
PiperOrigin-RevId: 878559828
1 parent 441bc95 commit d272058

3 files changed: 46 additions & 22 deletions

src/maxtext/configs/base.yml

Lines changed: 4 additions & 0 deletions
@@ -413,6 +413,10 @@ jax_cache_dir: "~/jax_cache"
 # Hardware
 hardware: 'tpu' # Supported hardware types are 'tpu', 'gpu', 'gpu_multiprocess' and 'cpu'
 
+# internal_compile allows bypassing open-source topology name mappings when using internal topologies directly via get_topology_desc.
+internal_compile: False
+internal_compile_num_devices: -1 # You must specify the number of devices when using internal_compile.
+
 # Parallelism
 shard_mode: "auto" # can be either auto or explicit
 mesh_axes: ['diloco', 'data', 'stage', 'fsdp', 'fsdp_transpose', 'sequence', 'context', 'context_autoregressive', 'tensor', 'tensor_transpose', 'tensor_sequence', 'expert', 'autoregressive']
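
A hypothetical invocation using the new keys, assuming MaxText's usual key=value config overrides (the module path, device count, and topology placeholder below are illustrative, not from this commit):

python3 -m maxtext.trainers.pre_train.train_compile src/maxtext/configs/base.yml \
  internal_compile=true internal_compile_num_devices=256 \
  compile_topology=<internal-topology-name> compile_topology_num_slices=1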

src/maxtext/configs/types.py

Lines changed: 7 additions & 0 deletions
@@ -795,6 +795,8 @@ class LayoutAndSharding(BaseModel):
       description="Allowed percentage of non-sharded parameters.",
   )
   shard_optimizer_over_data: bool = Field(False, description="Enable ZeRO-1 optimizer sharding over the data axis.")
+  internal_compile: bool = Field(False, description="Use internal_compile to bypass open-source topology mappings.")
+  internal_compile_num_devices: int = Field(-1, description="Number of devices when using internal_compile.")
 
 
 class DcnParallelism(BaseModel):

@@ -2064,6 +2066,11 @@ def validate_and_set_hlo_dump_defaults():
   # E. HARDWARE-DEPENDENT CALCULATIONS
   def get_num_target_devices():
     """Get the number of devices for the target topology, handling AOT compilation and single-controller modes."""
+    if self.internal_compile:
+      if self.internal_compile_num_devices <= 0:
+        raise ValueError("Set internal_compile_num_devices to a positive integer.")
+      # User bypassing topology mappings should supply explicit device count
+      return self.internal_compile_num_devices
     if self.compile_topology:
       spec = accelerator_to_spec_map.get_system_characteristics(self.compile_topology)
       return int(spec.devices_per_slice * self.compile_topology_num_slices)
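
A minimal sketch of how the new guard behaves, using a stripped-down pydantic model (field names and defaults are from the diff; the enclosing config class and the topology-spec fallback are simplified away):

from pydantic import BaseModel, Field


class CompileOptions(BaseModel):
  """Stand-in for the real config model; only the new fields are shown."""

  internal_compile: bool = Field(False, description="Use internal_compile to bypass open-source topology mappings.")
  internal_compile_num_devices: int = Field(-1, description="Number of devices when using internal_compile.")


def get_num_target_devices(cfg: CompileOptions) -> int:
  # Mirrors the added branch: internal_compile short-circuits the topology lookup.
  if cfg.internal_compile:
    if cfg.internal_compile_num_devices <= 0:
      raise ValueError("Set internal_compile_num_devices to a positive integer.")
    return cfg.internal_compile_num_devices
  raise NotImplementedError("topology-spec path elided; see the diff above")


print(get_num_target_devices(CompileOptions(internal_compile=True, internal_compile_num_devices=256)))  # -> 256

With internal_compile=True and the default internal_compile_num_devices=-1, the call raises, which is the failure mode the -1 sentinel is meant to surface early.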

src/maxtext/trainers/pre_train/train_compile.py

Lines changed: 35 additions & 22 deletions
@@ -60,22 +60,27 @@ def validate_config(config):
 
 def get_topology_mesh(config):
   """Get the target hardware devices, and create configured mesh with them"""
-  target_hardware = accelerator_to_spec_map.get_system_characteristics(config.compile_topology)
-  if target_hardware.platform == "gpu":
-    # Disable sharded autotuning. This is an optimization to distribute
-    # autotuning across the fleet, but can cause hangs with AoT compilation.
-    os.environ["XLA_FLAGS"] = os.environ.get("XLA_FLAGS", "") + " --xla_gpu_shard_autotuning=false"
-    jax.config.update("mock_num_gpu_processes", config.compile_topology_num_slices)
-    topology_devices = jax.devices()
-  else:
+  if config.internal_compile:
     topology_devices = get_topology_desc(
-        platform=target_hardware.platform,
-        topology_name=target_hardware.topology_name,
-        chip_config_name=target_hardware.chip_config_name,
-        chips_per_host_bounds=target_hardware.chips_per_host_bounds,
-        num_slices=config.compile_topology_num_slices,
-        wrap=target_hardware.wrap,
+        platform="tpu", topology_name=config.compile_topology, num_slices=config.compile_topology_num_slices
     ).devices
+  else:
+    target_hardware = accelerator_to_spec_map.get_system_characteristics(config.compile_topology)
+    if target_hardware.platform == "gpu":
+      # Disable sharded autotuning. This is an optimization to distribute
+      # autotuning across the fleet, but can cause hangs with AoT compilation.
+      os.environ["XLA_FLAGS"] = os.environ.get("XLA_FLAGS", "") + " --xla_gpu_shard_autotuning=false"
+      jax.config.update("mock_num_gpu_processes", config.compile_topology_num_slices)
+      topology_devices = jax.devices()
+    else:
+      topology_devices = get_topology_desc(
+          platform=target_hardware.platform,
+          topology_name=target_hardware.topology_name,
+          chip_config_name=target_hardware.chip_config_name,
+          chips_per_host_bounds=target_hardware.chips_per_host_bounds,
+          num_slices=config.compile_topology_num_slices,
+          wrap=target_hardware.wrap,
+      ).devices
   if config.shard_mode == ShardMode.EXPLICIT:
     jax.config.update("jax_remove_size_one_mesh_axis_from_type", True)
   topology_device_mesh = maxtext_utils.create_device_mesh(config, topology_devices)
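
For context, get_topology_desc (from jax.experimental.topologies) describes a device topology without any attached hardware, which is what makes ahead-of-time compilation possible. A minimal sketch of that workflow under assumed names (the "v4:2x2x1" topology string and single "data" mesh axis are illustrative, not from this commit):

import jax
import jax.numpy as jnp
import numpy as np
from jax.experimental.topologies import get_topology_desc
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

# Describe an abstract TPU topology; no TPUs need to be attached.
topo = get_topology_desc(platform="tpu", topology_name="v4:2x2x1")
mesh = Mesh(np.array(topo.devices), ("data",))
x_spec = jax.ShapeDtypeStruct((8, 128), jnp.float32, sharding=NamedSharding(mesh, P("data")))

# Lower and compile ahead of time against the abstract devices.
compiled = jax.jit(lambda x: x * 2).lower(x_spec).compile()
print(compiled.memory_analysis())

The internal_compile branch above feeds config.compile_topology straight to this call instead of first translating it through accelerator_to_spec_map, which is exactly the bypass the new flag enables.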
@@ -174,10 +179,14 @@ def is_oom(argv: Sequence[str]) -> bool:
   data_sharding = sharding.get_input_data_sharding(config, topology_mesh)
 
   # Get function to compile and shardings
-  func_to_compile, in_shard, out_shard, static_argnums, donate_argnums = (
-      maxtext_utils.get_functional_train_with_signature(
-          train.train_step, data_sharding, state_mesh_shardings, model, config
-      )
+  (
+      func_to_compile,
+      in_shard,
+      out_shard,
+      static_argnums,
+      donate_argnums,
+  ) = maxtext_utils.get_functional_train_with_signature(
+      train.train_step, data_sharding, state_mesh_shardings, model, config
   )
 
   try:
@@ -255,10 +264,14 @@ def main(argv: Sequence[str]) -> None:
     donate_argnums = 0
   else:
     # Get function to compile and shardings
-    func_to_compile, in_shard, out_shard, static_argnums, donate_argnums = (
-        maxtext_utils.get_functional_train_with_signature(
-            train.train_step, data_sharding, state_mesh_shardings, model, config
-        )
+    (
+        func_to_compile,
+        in_shard,
+        out_shard,
+        static_argnums,
+        donate_argnums,
+    ) = maxtext_utils.get_functional_train_with_signature(
+        train.train_step, data_sharding, state_mesh_shardings, model, config
     )
 
     # print weights sharding info under debug sharding mode
