AI-Hypercomputer
diff --git a/‎docs/run_maxtext/decoupled_mode.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/run_maxtext/decoupled_mode.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/maxtext/configs/base.yml‎
Lines changed: 4 additions & 0 deletions b/‎src/maxtext/configs/base.yml‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎tests/integration/checkpointing_test.py‎
Lines changed: 6 additions & 6 deletions b/‎tests/integration/checkpointing_test.py‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎tests/integration/smoke/train_gpu_smoke_test.py‎
Lines changed: 3 additions & 3 deletions b/‎tests/integration/smoke/train_gpu_smoke_test.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎tests/integration/train_tests.py‎
Lines changed: 25 additions & 17 deletions b/‎tests/integration/train_tests.py‎
Lines changed: 25 additions & 17 deletions
diff --git a/‎tests/unit/attention_test.py‎
Lines changed: 4 additions & 5 deletions b/‎tests/unit/attention_test.py‎
Lines changed: 4 additions & 5 deletions
diff --git a/‎tests/unit/data_loader_test.py‎
Lines changed: 2 additions & 4 deletions b/‎tests/unit/data_loader_test.py‎
Lines changed: 2 additions & 4 deletions
diff --git a/‎tests/unit/engram_vs_reference_test.py‎
Lines changed: 2 additions & 3 deletions b/‎tests/unit/engram_vs_reference_test.py‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎tests/unit/maxtext_utils_test.py‎
Lines changed: 4 additions & 3 deletions b/‎tests/unit/maxtext_utils_test.py‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎tests/unit/mhc_test.py‎
Lines changed: 4 additions & 3 deletions b/‎tests/unit/mhc_test.py‎
Lines changed: 4 additions & 3 deletions
@@ -28,7 +28,7 @@ When enabled:
 - Import-time safety is preserved by lightweight stubs returned from `decouple.py` (so modules import cleanly); only active use of missing functionality raises.
 - Conditionally replaces dataset paths in certain tests to point at minimal local datasets.
 - Uses a local base output directory (users can override with `LOCAL_BASE_OUTPUT`).
-- All tests that previously hard-coded `configs/base.yml` now use the helper `get_test_config_path()` from `tests/utils/test_utils.py`. This helper ensures usage of `decoupled_base_test.yml`.
+- Many tests use the helper `get_test_config_path()` from `tests/utils/test_helpers.py`. In decoupled mode, this helper selects `src/maxtext/configs/decoupled_base_test.yml` instead of `src/maxtext/configs/base.yml`.
 
 Minimal datasets included (checked into the repo):
 
 
@@ -14,6 +14,10 @@
 
 # This sentinel is a reminder to choose a real run name.
 # If there is already a checkpoint under this run, that checkpoint will auto-resume.
+#
+# NOTE: Not every unit/integration test in MaxText loads this file directly.
+# In decoupled mode (DECOUPLE_GCLOUD=TRUE), tests that resolve their config path via
+# `get_test_config_path()` in `tests/utils/test_helpers.py` may load `decoupled_base_test.yml` instead of `base.yml`.
 run_name: ""
 
 model_name: "default" # override config settings to match a specific model. other than the override, nothing should use this!
 
@@ -30,13 +30,16 @@
 from math import isclose
 import os.path
 
-import jax
 import pytest
 
 from maxtext.common.gcloud_stub import is_decoupled
 from maxtext.trainers.pre_train.train import main as train_main
 from maxtext.utils.globals import MAXTEXT_PKG_DIR
-from tests.utils.test_helpers import get_test_config_path, get_test_base_output_directory
+from tests.utils.test_helpers import (
+    get_test_config_path,
+    get_test_base_output_directory,
+    get_decoupled_parallelism_overrides,
+)
 
 
 def get_checkpointing_command(run_date, hardware, steps, metrics_file, attention_type, dataset_type, dataset_path):
@@ -72,10 +75,7 @@ def get_checkpointing_command(run_date, hardware, steps, metrics_file, attention
 
   extra_parallelism = []
   if is_decoupled():  # Match device topology in decoupled/local mode
-    try:
-      extra_parallelism.append(f"ici_fsdp_parallelism={jax.device_count()}")
-    except Exception as e:  # pragma: no cover - defensive  # pylint: disable=broad-exception-caught
-      print(f"Warning: unable to determine jax.device_count(): {e}")
+    extra_parallelism.extend(get_decoupled_parallelism_overrides(as_argv=True))
 
   return (
       [
 
@@ -20,8 +20,8 @@
 
 from maxtext.common.gcloud_stub import is_decoupled
 from maxtext.trainers.pre_train.train import main as train_main
-from maxtext.utils.globals import MAXTEXT_ASSETS_ROOT, MAXTEXT_PKG_DIR
-from tests.utils.test_helpers import get_test_dataset_path, get_test_base_output_directory
+from maxtext.utils.globals import MAXTEXT_ASSETS_ROOT
+from tests.utils.test_helpers import get_test_dataset_path, get_test_base_output_directory, get_test_config_path
 
 
 class Train(unittest.TestCase):
@@ -43,7 +43,7 @@ def test_tiny_config(self):
     train_main(
         [
             None,
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "gpu", "gpu_smoke_test.yml"),
+            get_test_config_path("gpu/gpu_smoke_test.yml"),
             # pylint: disable=f-string-without-interpolation
             f"base_output_directory={self.base_output_directory}",
             "run_name=runner_test",
 
@@ -22,7 +22,13 @@
 from maxtext.common.gcloud_stub import is_decoupled
 from maxtext.trainers.pre_train.train import main as train_main
 from maxtext.utils.globals import MAXTEXT_ASSETS_ROOT
-from tests.utils.test_helpers import get_test_config_path, get_test_dataset_path, get_test_base_output_directory
+from tests.utils.test_helpers import (
+    get_test_config_path,
+    get_test_dataset_path,
+    get_test_base_output_directory,
+    get_decoupled_parallelism_overrides,
+    is_rocm_backend,
+)
 
 
 class TrainTests(unittest.TestCase):
@@ -37,9 +43,9 @@ class TrainTests(unittest.TestCase):
   _fsdp_tp4_override = []
   if decoupled:
     if dev_count >= 4 and dev_count % 4 == 0:
-      _fsdp_tp4_override = [f"ici_fsdp_parallelism={dev_count // 4}"]
+      _fsdp_tp4_override = get_decoupled_parallelism_overrides(fsdp_parallelism=dev_count // 4, as_argv=True)
     elif dev_count < 4:
-      _fsdp_tp4_override = [f"ici_fsdp_parallelism={dev_count}"]
+      _fsdp_tp4_override = get_decoupled_parallelism_overrides(fsdp_parallelism=dev_count, as_argv=True)
 
   CONFIGS = {
       "base": [  # short test for train.py with TFDS c4
@@ -53,7 +59,7 @@ class TrainTests(unittest.TestCase):
           "enable_goodput_recording=False",
           rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizers', 'tokenizer.llama2')}",
       ]
-      + ([f"ici_fsdp_parallelism={dev_count}"] if decoupled else []),
+      + get_decoupled_parallelism_overrides(fsdp_parallelism=dev_count, as_argv=True),
       "synthetic": [  # tests base config with synthetic dataset
           None,
           get_test_config_path(),
@@ -66,7 +72,7 @@ class TrainTests(unittest.TestCase):
           "dataset_type=synthetic",
           rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizers', 'tokenizer.llama2')}",
       ]
-      + ([f"ici_fsdp_parallelism={dev_count}"] if decoupled else []),
+      + get_decoupled_parallelism_overrides(fsdp_parallelism=dev_count, as_argv=True),
       "pdb_lt_1": [  # tests base config with per_device_batch_size < 1
           None,
           get_test_config_path(),
@@ -80,7 +86,7 @@ class TrainTests(unittest.TestCase):
           "ici_tensor_parallelism=4",
           rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizers', 'tokenizer.llama2')}",
       ]
-      + ([f"ici_fsdp_parallelism={dev_count}"] if decoupled else []),
+      + get_decoupled_parallelism_overrides(fsdp_parallelism=dev_count, as_argv=True),
       "tp_transpose": [  # tests base config with ici_tensor_transpose_parallelism=4
           None,
           get_test_config_path(),
@@ -92,7 +98,7 @@ class TrainTests(unittest.TestCase):
           "enable_goodput_recording=False",
           rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizers', 'tokenizer.llama2')}",
       ]
-      + ([f"ici_fsdp_parallelism={dev_count}"] if decoupled else []),
+      + get_decoupled_parallelism_overrides(fsdp_parallelism=dev_count, as_argv=True),
       "int8": [  # tests base config with int8
           None,
           get_test_config_path(),
@@ -105,7 +111,7 @@ class TrainTests(unittest.TestCase):
           "enable_goodput_recording=False",
           rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizers', 'tokenizer.llama2')}",
       ]
-      + ([f"ici_fsdp_parallelism={dev_count}"] if decoupled else []),
+      + get_decoupled_parallelism_overrides(fsdp_parallelism=dev_count, as_argv=True),
       "fp8": [  # tests base config with fp8
           None,
           get_test_config_path(),
@@ -118,7 +124,7 @@ class TrainTests(unittest.TestCase):
           "enable_goodput_recording=False",
           rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizers', 'tokenizer.llama2')}",
       ]
-      + ([f"ici_fsdp_parallelism={dev_count}"] if decoupled else []),
+      + get_decoupled_parallelism_overrides(fsdp_parallelism=dev_count, as_argv=True),
       "nanoo_fp8": [  # tests base config with nanoo_fp8
           None,
           get_test_config_path(),
@@ -131,7 +137,7 @@ class TrainTests(unittest.TestCase):
           "enable_goodput_recording=False",
           rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizers', 'tokenizer.llama2')}",
       ]
-      + ([f"ici_fsdp_parallelism={dev_count}"] if decoupled else []),
+      + get_decoupled_parallelism_overrides(fsdp_parallelism=dev_count, as_argv=True),
       "te_fp8_delayedscaling": [  # tests base config with te_fp8_delayedscaling
           None,
           get_test_config_path(),
@@ -144,7 +150,7 @@ class TrainTests(unittest.TestCase):
           "enable_goodput_recording=False",
           rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizers', 'tokenizer.llama2')}",
       ]
-      + ([f"ici_fsdp_parallelism={dev_count}"] if decoupled else []),
+      + get_decoupled_parallelism_overrides(fsdp_parallelism=dev_count, as_argv=True),
       "te_fp8_currentscaling": [  # tests base config with te_fp8_currentscaling
           None,
           get_test_config_path(),
@@ -157,7 +163,7 @@ class TrainTests(unittest.TestCase):
           "enable_goodput_recording=False",
           rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizers', 'tokenizer.llama2')}",
       ]
-      + ([f"ici_fsdp_parallelism={dev_count}"] if decoupled else []),
+      + get_decoupled_parallelism_overrides(fsdp_parallelism=dev_count, as_argv=True),
       "te_mxfp8": [  # tests base config with te_mxfp8
           None,
           get_test_config_path(),
@@ -170,7 +176,7 @@ class TrainTests(unittest.TestCase):
           "enable_goodput_recording=False",
           rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizers', 'tokenizer.llama2')}",
       ]
-      + ([f"ici_fsdp_parallelism={dev_count}"] if decoupled else []),
+      + get_decoupled_parallelism_overrides(fsdp_parallelism=dev_count, as_argv=True),
       "dropout": [  # tests base config with dropout
           None,
           get_test_config_path(),
@@ -185,7 +191,7 @@ class TrainTests(unittest.TestCase):
           "dropout_rate=0.02",
           rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizers', 'tokenizer.llama2')}",
       ]
-      + ([f"ici_fsdp_parallelism={dev_count}"] if decoupled else []),
+      + get_decoupled_parallelism_overrides(fsdp_parallelism=dev_count, as_argv=True),
       "hf_input_pipeline": [  # test for train.py with TFDS c4, using HF input pipeline
           None,
           get_test_config_path(),
@@ -199,7 +205,7 @@ class TrainTests(unittest.TestCase):
           f"hf_train_files={dataset_path}/hf/c4/c4-train-00000-of-01637.parquet",
           "tokenizer_path=google-t5/t5-large",
       ]
-      + ([f"ici_fsdp_parallelism={dev_count}"] if decoupled else []),
+      + get_decoupled_parallelism_overrides(fsdp_parallelism=dev_count, as_argv=True),
   }
 
   @pytest.mark.integration_test
@@ -427,7 +433,7 @@ def test_gpu_optimizer_offload(self):
         "enable_goodput_recording=False",
         rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizers', 'tokenizer.llama2')}",
     ]
-    train_main(optimizer_offload + ([f"ici_fsdp_parallelism={self.dev_count}"] if self.decoupled else []))
+    train_main(optimizer_offload + get_decoupled_parallelism_overrides(fsdp_parallelism=self.dev_count, as_argv=True))
 
   @pytest.mark.integration_test
   @pytest.mark.gpu_only
@@ -448,7 +454,7 @@ def test_gpu_parameter_offload(self):
         "enable_goodput_recording=False",
         rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizers', 'tokenizer.llama2')}",
     ]
-    train_main(parameter_offload + ([f"ici_fsdp_parallelism={self.dev_count}"] if self.decoupled else []))
+    train_main(parameter_offload + get_decoupled_parallelism_overrides(fsdp_parallelism=self.dev_count, as_argv=True))
 
   @pytest.mark.gpu_only
   def test_gpu_cudnn_flash_jax(self):
@@ -567,6 +573,8 @@ def test_gpu_packed_attention(self):
   @pytest.mark.gpu_only
   @pytest.mark.skip(reason="b/489133823. Previously transient in b/462548581.")
   def test_gpu_ring_attention(self):
+    if is_rocm_backend():
+      pytest.skip("TE ring attention context parallelism not supported on ROCm.")
     os.environ["NVTE_FUSED_ATTN"] = "1"  # Enable fused attention
     os.environ["NVTE_FUSED_RING_ATTENTION_USE_SCAN"] = "0"  # Disable scan for ring attention
     ring_attention = [  # tests base config on GPU with ring attention
 
@@ -44,7 +44,7 @@
 import pytest
 
 from tests.utils import attention_test_util
-from tests.utils.test_helpers import get_test_config_path
+from tests.utils.test_helpers import get_test_config_path, get_decoupled_parallelism_overrides
 
 
 class BidirectionalBlockMaskTest(unittest.TestCase):
@@ -290,7 +290,7 @@ def setUp(self):
     """Initializes the configuration for each test"""
     super().setUp()
     # Conditionally set ici_fsdp_parallelism to match device count in decoupled mode
-    extra_args = {"ici_fsdp_parallelism": jax.device_count()} if is_decoupled() else {}
+    extra_args = get_decoupled_parallelism_overrides()
     if not is_decoupled():
       jax.config.update("jax_remove_size_one_mesh_axis_from_type", True)
     config = pyconfig.initialize(
@@ -1335,7 +1335,7 @@ def test_projection_initialization(self):
     # Create a copy of the arguments and override the attention_type for the base model
     attention_config_args = self.config_arguments.copy()
     attention_config_args["attention_type"] = AttentionType.GLOBAL.value
-    extra_args = {"ici_fsdp_parallelism": jax.device_count()} if is_decoupled() else {}
+    extra_args = get_decoupled_parallelism_overrides()
     attention_cfg = pyconfig.initialize(
         [sys.argv[0], get_test_config_path()],
         **attention_config_args,
@@ -1371,10 +1371,9 @@ def test_projection_initialization(self):
 
     # 3. Initialize the MLA layer
     mla_config_args = self.config_arguments.copy()
-    mla_extra_args = {"ici_fsdp_parallelism": jax.device_count()} if is_decoupled() else {}
+    mla_extra_args = get_decoupled_parallelism_overrides()
     mla_config_args.update(mla_extra_args)
     _, mla_layer = self.init_mla(mla_config_args, rope_type="default")
-    _, mla_layer = self.init_mla(self.config_arguments, rope_type="default")
 
     # 4. Assert that the MLA layer DOES NOT HAVE the base projections
     self.assertFalse(hasattr(mla_layer, "query"), "MLA should not have 'query' projection.")
 
@@ -30,7 +30,7 @@
 from maxtext.utils.maxtext_utils import create_device_mesh
 from maxtext.common.gcloud_stub import is_decoupled
 from maxtext.utils.rampup_batch import RampupBatchManager
-from tests.utils.test_helpers import get_test_config_path
+from tests.utils.test_helpers import get_test_config_path, get_decoupled_parallelism_overrides
 
 
 class DataLoaderTest(unittest.TestCase):
@@ -63,9 +63,7 @@ def get_test_config(self, reuse_example_batch, **kwargs):
     # In decoupled mode, adapt mesh/ICI parallelism so that the
     # product of ICI parallelism matches the available devices for
     # this test only.
-    if is_decoupled():
-      args.setdefault("mesh_axes", ["data"])
-      args.setdefault("ici_data_parallelism", -1)
+    args.update(get_decoupled_parallelism_overrides(include_mesh_defaults=True))
 
     return pyconfig.initialize(
         [None, get_test_config_path()],
 
@@ -28,7 +28,6 @@
 from typing import List
 from dataclasses import dataclass, field
 import math
-import os
 import unittest
 from absl.testing import parameterized
 
@@ -52,7 +51,7 @@
 from maxtext.layers.engram import ShortConv as ShortConvJAX
 from maxtext.layers.engram import Engram as EngramJAX
 from maxtext.utils import maxtext_utils
-from maxtext.utils.globals import MAXTEXT_PKG_DIR
+from tests.utils.test_helpers import get_test_config_path
 
 
 def setUpModule():
@@ -470,7 +469,7 @@ def init_torch_weights(module, std=1):
 def get_cfg_and_mesh(config):
   """Returns MaxText configuration and mesh."""
   cfg = pyconfig.initialize(
-      [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+      [None, get_test_config_path()],
       run_name="",
       enable_checkpointing=False,
       model_name="default",
 
@@ -28,7 +28,6 @@
 import jax.numpy as jnp
 from jax.sharding import Mesh, NamedSharding, PartitionSpec
 from maxtext.configs import pyconfig
-from maxtext.common.gcloud_stub import is_decoupled
 from maxtext.common.common_types import MODEL_MODE_TRAIN
 from maxtext.inference import inference_utils
 from maxtext.layers import quantizations
@@ -37,7 +36,7 @@
 from maxtext.utils import maxtext_utils
 from maxtext.utils import sharding
 from maxtext.utils.sharding import assert_params_sufficiently_sharded, get_formatted_sharding_annotations
-from tests.utils.test_helpers import get_test_config_path
+from tests.utils.test_helpers import get_test_config_path, get_decoupled_parallelism_overrides
 import numpy as np
 import optax
 
@@ -347,7 +346,7 @@ class MaxUtilsInitTransformerState(unittest.TestCase):
 
   def setUp(self):
     # Conditionally set ici_fsdp_parallelism to match device count in decoupled mode
-    extra_args = {"ici_fsdp_parallelism": jax.device_count()} if is_decoupled() else {}
+    extra_args = get_decoupled_parallelism_overrides()
     self.config = pyconfig.initialize([None, get_test_config_path()], enable_checkpointing=False, **extra_args)
     devices_array = maxtext_utils.create_device_mesh(self.config)
     self.mesh = Mesh(devices_array, self.config.mesh_axes)
@@ -913,8 +912,10 @@ class TestGetAbstractState(unittest.TestCase):
   """Test class for get_abstract_state."""
 
   def setUp(self):
+    extra_args = get_decoupled_parallelism_overrides()
     self.config = pyconfig.initialize(
         [None, get_test_config_path()],
+        **extra_args,
         enable_checkpointing=False,
         model_name="llama3.1-8b",
         per_device_batch_size=1,
 
@@ -14,7 +14,6 @@
 
 """Test for DeepSeek Manifold-Constrained Hyper Connections (mHC)."""
 
-import os.path
 import unittest
 import pytest
 
@@ -26,12 +25,12 @@
 import numpy as np
 
 from maxtext.configs import pyconfig
-from maxtext.utils.globals import MAXTEXT_PKG_DIR
 from maxtext.common.common_types import HyperConnectionType
 from maxtext.layers import attention_mla, linears, mhc, moe
 from maxtext.layers.initializers import nd_dense_init
 from maxtext.layers.normalizations import RMSNorm
 from maxtext.utils import maxtext_utils
+from tests.utils.test_helpers import get_test_config_path, get_decoupled_parallelism_overrides
 
 
 class TestExpandReduce(unittest.TestCase):
@@ -92,8 +91,10 @@ class TestMHC(unittest.TestCase):
 
   def setUp(self):
     self.dim = 16
+    extra_args = get_decoupled_parallelism_overrides()
     self.config = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [None, get_test_config_path()],
+        **extra_args,
         run_name="test_mhc",
         enable_checkpointing=False,
         model_name="deepseek-custom",