Migrate test modules to NNX

hsuan-lun-chiang · hsuan-lun-chiang · commit 1a50f57e451f · 2025-11-26T02:34:30.000Z
diff --git a/tests/maxtext_utils_test.py b/tests/maxtext_utils_test.py
@@ -27,8 +27,8 @@
 
 from flax import linen as nn
 from flax.core.scope import FrozenVariableDict
-from flax.linen import Dense
 from flax.training import train_state
+from flax import nnx
 
 import optax
 
@@ -156,39 +156,61 @@ def test_init_training_state(self):
     )
 
 
-class ModelWithMultipleCollections(nn.Module):
+@nnx.register_variable_name("special_variables")
+class SpecialVariables(nnx.Variable):
+  pass
+
+
+class ModelWithMultipleCollections(nnx.Module):
   """
   A simple model that has variables in multiple collections - "params" and "special_variables"
   """
 
-  dense: Dense = nn.Dense(4)
-
-  def setup(self):
-    self.kernel = self.variable("special_variables", "my_first_kernel", lambda: jnp.ones((4, 5)))
+  def __init__(self, input_dim: int, rngs: nnx.Rngs | None = None):
+    self.dense = nnx.Linear(input_dim, 4, rngs=rngs)
+    self.my_first_kernel = SpecialVariables(jnp.ones((4, 5)))
 
   def __call__(self, x, y, encoder_images=None, nnx_method=None, model_mode=None):
     x = self.dense(x)
-    x = x @ self.kernel.value
+    x = x @ self.my_first_kernel
     return x
 
 
+class TrainState(train_state.TrainState):
+  other_variables: nnx.State
+
+
 class MaxUtilsInitStateWithMultipleCollections(unittest.TestCase):
   """test class for multiple collection state in maxutils"""
 
   def setUp(self):
     self.config = pyconfig.initialize(
         [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")], enable_checkpointing=False
     )
-    self.model = ModelWithMultipleCollections()
-    self.key1, self.key2, self.key3 = random.split(random.key(0), num=3)
-    self.input = random.normal(self.key1, (self.config.global_batch_size_to_load, self.config.max_target_length))
-    self.params = self.model.init(self.key2, self.input, self.input)
+    self.model = ModelWithMultipleCollections(self.config.max_target_length, nnx.Rngs(0))
+    self.key = random.key(0)
     self.tx = optax.adam(learning_rate=0.001)
 
   def _test_init_initial_state_driver(self, is_training):
     """test initiating of the initial state driver"""
-    state_under_test = maxtext_utils.init_initial_state(self.model, self.tx, self.config, is_training, self.key3)
-    self.assertEqual(state_under_test.apply_fn, self.model.apply)
+    if is_training:
+      self.model.train()
+    else:
+      self.model.eval()
+
+    graphdef, params, other_variables = nnx.split(self.model, nnx.Param, ...)
+
+    state_under_test = None
+    if is_training:
+      state_under_test = TrainState.create(
+          apply_fn=graphdef.apply, params=params, other_variables=other_variables, tx=self.tx
+      )
+    else:
+      state_under_test = TrainState(
+          step=0, apply_fn=graphdef.apply, params=params, other_variables=other_variables, tx=None, opt_state={}
+      )
+
+    self.assertEqual(state_under_test.apply_fn, graphdef.apply)
     if is_training:
       self.assertEqual(state_under_test.tx, self.tx)
       self.assertNotEqual(state_under_test.opt_state, {})
@@ -197,11 +219,11 @@ def _test_init_initial_state_driver(self, is_training):
       self.assertEqual(state_under_test.opt_state, {})
     self.assertEqual(
         max_utils.calculate_num_params_from_pytree(state_under_test.params),
-        max_utils.calculate_num_params_from_pytree(self.params),
+        max_utils.calculate_num_params_from_pytree(params),
     )
-    self.assertEqual(len(self.params), len(state_under_test.params))
-    self.assertIn("special_variables", state_under_test.params)
-    self.assertIn("params", state_under_test.params)
+    self.assertEqual(len(params), len(state_under_test.params))
+    self.assertIsInstance(state_under_test.other_variables["my_first_kernel"], SpecialVariables)
+    self.assertTrue(hasattr(state_under_test, "params"))
 
   def test_initial_train_state(self):
     self._test_init_initial_state_driver(True)
diff --git a/tests/multi_token_prediction_test.py b/tests/multi_token_prediction_test.py
@@ -19,7 +19,7 @@
 import jax
 import jax.numpy as jnp
 from jax.sharding import Mesh
-from flax import linen as nn
+from flax import nnx
 
 from MaxText.common_types import Config
 from MaxText import max_logging, pyconfig
@@ -29,7 +29,7 @@
 from MaxText.layers import multi_token_prediction  # The class under test
 from MaxText.layers import embeddings
 from MaxText.common_types import MODEL_MODE_TRAIN
-
+from MaxText.layers import nnx_wrappers
 
 TEST_LAYER_NUM = 1
 
@@ -122,29 +122,35 @@ def test_multi_token_prediction_layer_output(self):
 
 
 # A lightweight wrapper model for robustly testing the MTPBlock.
-class MTPBlockTestModel(nn.Module):
+class MTPBlockTestModel(nnx.Module):
   """A lightweight wrapper model for testing the MTPBlock."""
 
-  config: Config
-  mesh: Mesh
-
-  def setup(self):
+  def __init__(
+      self,
+      config: Config,
+      mesh: Mesh,
+      rngs: nnx.Rngs | None = None,
+  ):
+    self.config = config
+    self.mesh = mesh
     """Initializes the MTP block and its dependencies for the test."""
-    self.shared_embedding = embeddings.embed_as_linen(
-        mesh=self.mesh,
+    self.shared_embedding = embeddings.Embed(
         num_embeddings=self.config.vocab_size,
         num_features=self.config.base_emb_dim,
         config=self.config,
-        name="shared_embedding",
+        mesh=self.mesh,
+        rngs=rngs,
     )
-    self.decoder = Decoder(config=self.config, mesh=self.mesh, name="decoder_for_mtp")
-    self.mtp_block = multi_token_prediction.MultiTokenPredictionBlock(
+    decoder_for_mtp = Decoder(config=self.config, mesh=self.mesh, name="decoder_for_mtp")
+
+    self.multi_token_prediction_block = multi_token_prediction.MultiTokenPredictionBlock(
         config=self.config,
         mesh=self.mesh,
         name="mtp_block",
         transformer_layer_module=DecoderLayer,
-        decoder=self.decoder,
+        decoder=decoder_for_mtp,
     )
+    self.mtp_block = nnx_wrappers.ToNNX(self.multi_token_prediction_block, rngs=nnx.Rngs(params=0))
 
   def __call__(
       self,
@@ -156,6 +162,7 @@ def __call__(
       decoder_segment_ids,
       model_mode,
       deterministic,
+      mutable=None,
   ):
     return self.mtp_block(
         self.shared_embedding,
@@ -167,6 +174,7 @@ def __call__(
         decoder_segment_ids,
         model_mode,
         deterministic,
+        mutable=mutable,
     )
 
 
@@ -181,6 +189,7 @@ def setUp(self):
         skip_jax_distributed_system=True,
         mtp_num_layers=2,
     )
+    self.nnx_rngs = nnx.Rngs(params=0)
     self.rng = jax.random.PRNGKey(43)
     devices_array = maxtext_utils.create_device_mesh(self.cfg)
     self.mesh = Mesh(devices_array, self.cfg.mesh_axes)
@@ -195,23 +204,11 @@ def setUp(self):
     self.position_ids = jnp.arange(self.seq_len, dtype=jnp.int32).reshape(1, -1)
     self.decoder_segment_ids = jnp.ones((self.batch_size, self.seq_len), dtype=jnp.int32)
 
-    self.test_model = MTPBlockTestModel(config=self.cfg, mesh=self.mesh)
-    self.variables = self.test_model.init(
-        {"params": self.init_rng, "dropout": self.init_rng},
-        self.main_hidden_state,
-        self.input_ids,
-        self.target_ids,
-        self.target_mask,
-        self.position_ids,
-        self.decoder_segment_ids,
-        model_mode=MODEL_MODE_TRAIN,
-        deterministic=True,
-    )
+    self.test_model = MTPBlockTestModel(config=self.cfg, mesh=self.mesh, rngs=self.nnx_rngs)
 
   def test_sow_functionality(self):
     """Verifies that the block correctly sows losses and weights."""
-    _, captured_vars = self.test_model.apply(
-        self.variables,
+    self.test_model(
         self.main_hidden_state,
         self.input_ids,
         self.target_ids,
@@ -222,25 +219,24 @@ def test_sow_functionality(self):
         model_mode=MODEL_MODE_TRAIN,
         mutable=["mtp_losses"],
     )
-    self.assertIn("mtp_losses", captured_vars)
-    sown_data = maxtext_utils.get_nested_value(captured_vars, ("mtp_losses", "mtp_block"), {})
-    self.assertIn("losses", sown_data)
-    self.assertEqual(len(sown_data["losses"]), self.cfg.mtp_num_layers)
+    self.assertTrue(hasattr(self.test_model.mtp_block, "losses"))
+    mtp_loss = self.test_model.mtp_block.losses
+    self.assertTrue(type(mtp_loss).__name__, "mtp_losses")
+    self.assertEqual(len(mtp_loss), self.cfg.mtp_num_layers)
 
   def test_no_sow_during_init(self):
     """Verifies no losses are sown during model initialization."""
     # `self.variables` was created by `.init()`. We inspect it to ensure
     # our `if not self.is_initializing()` check worked.
-    self.assertNotIn("mtp_losses", self.variables)
+    self.assertFalse(hasattr(self.test_model.mtp_block, "losses"))
 
   def test_loss_aggregation_logic(self):
     """
     Tests the full 'sow and reap' cycle, mimicking the logic from train.py
     to ensure the final loss calculation is correct.
     """
     # 1. Run the forward pass and capture the sown variables.
-    _, captured_vars = self.test_model.apply(
-        self.variables,
+    self.test_model(
         self.main_hidden_state,
         self.input_ids,
         self.target_ids,
@@ -250,26 +246,21 @@ def test_loss_aggregation_logic(self):
         deterministic=False,
         mutable=["mtp_losses"],
         model_mode=MODEL_MODE_TRAIN,
-        rngs={"dropout": self.rng},
     )
 
     # This section of the test now *becomes* the logic from train.py
     # -------------------------------------------------------------
     final_loss_for_gradient = 100.0  # A dummy main loss
     mtp_loss_for_logging = 0.0
 
-    # 2. Define the exact path to retrieve the sown variables.
-    losses_path = ("mtp_losses", "mtp_block", "losses")
-    weights_path = ("mtp_losses", "mtp_block", "weights")
-
-    # 3. Use the standard utility to get the data.
-    mtp_losses = maxtext_utils.get_nested_value(captured_vars, losses_path, default=())
-    mtp_weights = maxtext_utils.get_nested_value(captured_vars, weights_path, default=())
+    # 2. Get the weight and losses.
+    mtp_losses = self.test_model.mtp_block.losses.value
+    mtp_weights = self.test_model.mtp_block.weights.value
 
-    # 4. Perform the aggregation logic exactly as in `loss_fn`.
+    # 3. Perform the aggregation logic exactly as in `loss_fn`.
     if mtp_losses:
-      sum_of_all_mtp_losses = jnp.sum(jnp.array(mtp_losses))
-      sum_of_all_mtp_weights = jnp.sum(jnp.array(mtp_weights))
+      sum_of_all_mtp_losses = jnp.sum(jnp.array(mtp_losses)).item()
+      sum_of_all_mtp_weights = jnp.sum(jnp.array(mtp_weights)).item()
 
       self.assertGreater(sum_of_all_mtp_weights, 0)
 
@@ -280,7 +271,7 @@ def test_loss_aggregation_logic(self):
       mtp_loss_for_logging = scaled_mtp_loss
     # -------------------------------------------------------------
 
-    # 5. Assert that the final values are correct.
+    # 4. Assert that the final values are correct.
     # The final loss should have increased from its base value.
     self.assertGreater(final_loss_for_gradient, 100.0)
     # The logged MTP loss should be a valid, positive number.
diff --git a/tests/quantizations_test.py b/tests/quantizations_test.py