Skip to content

Commit 05bf24c

Browse files
Merge pull request #2938 from AI-Hypercomputer:xibin/nnx
PiperOrigin-RevId: 856729133
2 parents d4fe93a + 1cdddd3 commit 05bf24c

3 files changed

Lines changed: 19 additions & 10 deletions

File tree

src/MaxText/maxtext_utils.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -764,8 +764,12 @@ def init_initial_state(model, tx, config, is_training, key):
     image_shape = multimodal_utils.get_dummy_image_shape_for_init(
         config.model_name, batch_size=config.micro_batch_size_to_train_on
     )
+    # Split the master key into independent keys for each RNG collection
+    # Reference: https://flax-linen.readthedocs.io/en/latest/guides/flax_fundamentals/rng_guide.html
+    params_key, dropout_key, aqt_key = jax.random.split(key, 3)
+
     model_vars = model.init(
-        {"params": key, "dropout": key, "aqt": key},
+        {"params": params_key, "dropout": dropout_key, "aqt": aqt_key},
         np.ones(input_shape, dtype=jnp.int32),
         np.ones(input_shape, dtype=jnp.int32),
         encoder_images=np.ones(image_shape, dtype=jnp.int32) if config.use_multimodal else None,

tests/data_loader_test.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ def setUp(self):
         enable_rampup_batch_size=True,
         per_device_batch_size_start=1.0,
         per_device_batch_size_increment=1.0,
-        global_rampup_samples=60,
+        # global_rampup_samples: (rampup increment number) * (Samples for initial 5 steps)
+        global_rampup_samples=3 * (1 * jax.device_count() * 5),
     )
     self.mesh = Mesh(create_device_mesh(self.config), self.config.mesh_axes)
     self.mock_data_iterator = MagicMock()
@@ -140,8 +141,10 @@ def test_rampup_data_loader(self):

     # Expected batch sizes based on test config.
     # The end global batch size is self.num_devices * per_device_batch_size
-    # The rampup should be: 5 steps of size 4, 3 steps of size 8, 2 steps of size 12, then size 16.
-    expected_batch_sizes = [4, 4, 4, 4, 4, 8, 8, 8, 12, 12, 16, 16]
+    # The rampup of per_device_batch_size should be:
+    # 5 steps of size 1, 3 steps of size 2, 2 steps of size 3, then size 4.
+    multipliers = [1] * 5 + [2] * 3 + [3] * 2 + [4] * 2
+    expected_batch_sizes = [m * self.config_rampup.num_target_devices for m in multipliers]
     for i, expected_size in enumerate(expected_batch_sizes):
       batch = data_loader.load_next_batch(rampup_manager=rampup_manager)
       expected_shape = (expected_size, self.config_rampup.max_target_length)
@@ -168,8 +171,10 @@ def test_rampup_data_loader_from_checkpointing(self):

     # Expected batch sizes based on test config.
     # The end global batch size is self.num_devices * per_device_batch_size
-    # The rampup should be: 3 steps of size 8, 2 steps of size 12, then size 16.
-    expected_batch_sizes = [8, 8, 8, 12, 12, 16, 16]
+    # The rampup of per_device_batch_size should be:
+    # 3 steps of size 2, 2 steps of size 3, then size 4.
+    multipliers = [2] * 3 + [3] * 2 + [4] * 2
+    expected_batch_sizes = [m * self.config_rampup.num_target_devices for m in multipliers]
     for i, expected_size in enumerate(expected_batch_sizes):
       batch = data_loader.load_next_batch(rampup_manager=rampup_manager)
       expected_shape = (expected_size, self.config_rampup.max_target_length)

tests/maxtext_utils_test.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -405,7 +405,7 @@ def test_multi_axis_sharding_pass(self):
     multi-dimensional mesh passes the assertion.
     """
     # Create a mesh shape for a 5D mesh.
-    devices = np.array(jax.devices()).reshape((4, 1, 1, 1, 1))
+    devices = np.array(jax.devices()).reshape((jax.device_count(), 1, 1, 1, 1))
     mesh = Mesh(devices, self.mesh_axes)

     # Shard across multiple axes, including the valid 'fsdp' axis.
@@ -420,7 +420,7 @@ def test_multi_axis_not_sharded_fails(self):
     Tests that a tensor on a complex mesh fails if it's not sharded along any
     of the primary valid axes (like 'fsdp').
     """
-    devices = np.array(jax.devices()).reshape((4, 1, 1, 1, 1))
+    devices = np.array(jax.devices()).reshape((jax.device_count(), 1, 1, 1, 1))
     mesh = Mesh(devices, self.mesh_axes)
     pspec = PartitionSpec(("sequence", "context"), "stage", "tensor", None)
     params = {"complex_layer": jax.device_put(jnp.ones((8, 8, 2, 2)), NamedSharding(mesh, pspec))}
@@ -432,7 +432,7 @@ def test_multi_axis_mixed_sharding_fails(self):
     """
     Tests that a mix of sharded (correctly) and unsharded tensors on a complex mesh fails.
     """
-    devices = np.array(jax.devices()).reshape((4, 1, 1, 1, 1))
+    devices = np.array(jax.devices()).reshape((jax.device_count(), 1, 1, 1, 1))
     mesh = Mesh(devices, self.mesh_axes)
     sharded_pspec = PartitionSpec(("fsdp", "sequence"), "stage", ("tensor"), None)
     sharded_param = jax.device_put(jnp.ones((8, 8, 2, 2)), NamedSharding(mesh, sharded_pspec))
@@ -459,7 +459,7 @@ def setUp(self):
       self.skipTest("This test suite requires at least 4 TPU devices")

     self.mesh_axes = ("fsdp", "sequence", "tensor", "stage", "context")
-    devices = np.array(jax.devices()).reshape((4, 1, 1, 1, 1))
+    devices = np.array(jax.devices()).reshape((jax.device_count(), 1, 1, 1, 1))
     self.mesh = Mesh(devices, self.mesh_axes)

   def test_multi_axis_mixed_formating(self):

0 commit comments

Comments (0)