
Commit e091675

Merge pull request #3280 from AI-Hypercomputer:weight_decay
PiperOrigin-RevId: 877653152
2 parents 9f98518 + daaa03c

4 files changed: 172 additions & 11 deletions


src/maxtext/configs/base.yml

Lines changed: 1 addition & 0 deletions
@@ -779,6 +779,7 @@ adam_b2: 0.95 # Exponential decay rate to track the second moment of past gradients.
 adam_eps: 1.e-8 # A small constant applied to denominator outside of the square root.
 adam_eps_root: 0. # A small constant applied to denominator inside the square root.
 adam_weight_decay: 0.1 # AdamW Weight decay
+adamw_mask: [] # List of parameter names/patterns to exclude from weight decay in AdamW, like ['bias', '.*norm', '.*ln.*'].
 mu_dtype: "" # data type to store "mu" of AdamW tracking the first moment. Inherits from weight_dtype if unset.
 # Setting nu_dtype is not yet supported by optax, instead nu_dtype is always inherited from weights.
 # See b/399961932 for more.
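
Usage note: the new flag is exercised the same way the tests in this commit do it, through a pyconfig override. A minimal sketch (the run name and override values below are illustrative, not part of the commit):

from maxtext.configs import pyconfig
from maxtext.optimizers import optimizers

# Exclude biases and any parameter whose path matches '.*norm' from weight decay.
argv = ["", "src/maxtext/configs/base.yml", "run_name=demo", "adamw_mask=['bias', '.*norm']"]
config = pyconfig.initialize(argv)
mask_fn = optimizers.get_adamw_mask(config)  # returns None when adamw_mask is empty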

src/maxtext/configs/types.py

Lines changed: 6 additions & 0 deletions
@@ -1175,6 +1175,12 @@ class AdamW(BaseModel):
       description="A small constant for numerical stability (epsilon), applied inside of the square root.",
   )
   adam_weight_decay: float = Field(0.1, description="Weight decay regularization.")
+  adamw_mask: list[str] = Field(
+      default_factory=list,
+      description=(
+          "List of parameter names/patterns to exclude from weight decay in AdamW, like ['bias', '.*norm', '.*ln.*']"
+      ),
+  )
   mu_dtype: str = Field(
       "",
       description="Data type for 'mu' (first moment) in AdamW. Inherits from weight_dtype if empty.",

src/maxtext/optimizers/optimizers.py

Lines changed: 29 additions & 1 deletion
@@ -15,6 +15,7 @@
 # pylint: disable=bare-except, consider-using-generator, too-many-positional-arguments
 """ Utils that are only interesting to MaxText. """
 
+import re
 import jax
 import jax.numpy as jnp
 
@@ -23,6 +24,26 @@
 from maxtext.utils.muon_utils import get_muon_weight_dimension_numbers
 
 
+def get_adamw_mask(config):
+  """Create a mask function for AdamW optimizer to exclude certain parameters from weight decay."""
+  if not getattr(config, "adamw_mask", None):
+    return None
+
+  compiled_patterns = [re.compile(pattern) for pattern in config.adamw_mask]
+
+  def mask_fn(params):
+    def _is_decayed(path, _):
+      # Join path keys into a single string for pattern matching (e.g., "layer1/bias").
+      path_str = "/".join(str(getattr(p, "key", getattr(p, "idx", getattr(p, "name", p)))) for p in path)
+      # If any pattern in adamw_mask matches the path, exclude from weight decay (return False).
+      # Otherwise, apply weight decay (return True).
+      return not any(pattern.search(path_str) for pattern in compiled_patterns)
+
+    return jax.tree_util.tree_map_with_path(_is_decayed, params)
+
+  return mask_fn
+
+
 def get_optimizer(config, learning_rate_schedule, model=None):
   """Create optimizer."""
   if config.opt_type == "adamw":
@@ -35,6 +56,7 @@ def get_optimizer(config, learning_rate_schedule, model=None):
         eps_root=config.adam_eps_root,
         weight_decay=config.adam_weight_decay,
         mu_dtype=config.mu_dtype,
+        mask=get_adamw_mask(config),
     )
   elif config.opt_type == "adam_pax":
     return adam_pax(
@@ -44,6 +66,7 @@ def get_optimizer(config, learning_rate_schedule, model=None):
         epsilon=config.adam_eps,
         epsilon_root=config.adam_eps_root,
         weight_decay=config.adam_weight_decay,
+        mask=get_adamw_mask(config),
    )
  elif config.opt_type == "sgd":
    return optax.sgd(learning_rate_schedule)
@@ -81,6 +104,7 @@ def adam_pax(
     epsilon: float,
     epsilon_root: float,
     weight_decay: float,
+    mask=None,
 ) -> optax.GradientTransformation:
   """Standard Adam optimizer that supports weight decay.
 
@@ -162,7 +186,11 @@ def _update_momentum(update, mu, nu):
     updates = jax.tree_util.tree_map(lambda mu, nu: mu / (jnp.sqrt(nu + epsilon_root) + epsilon), mu, nu)
 
     if weight_decay > 0:
-      updates = jax.tree_util.tree_map(lambda x, v: x + weight_decay * v, updates, params)
+      if mask is not None:
+        mask_tree = mask(params) if callable(mask) else mask
+        updates = jax.tree_util.tree_map(lambda x, v, m: x + weight_decay * v if m else x, updates, params, mask_tree)
+      else:
+        updates = jax.tree_util.tree_map(lambda x, v: x + weight_decay * v, updates, params)
 
     step_size = -1.0 * learning_rate_fn(count)
     # Finally, fold in step size.
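
To make the mask semantics concrete, here is a self-contained sketch of the path matching that get_adamw_mask performs (the toy parameter tree and patterns are illustrative): True marks a leaf that receives weight decay, False marks one that is skipped.

import re
import jax

patterns = [re.compile(p) for p in ["bias", ".*norm"]]

def is_decayed(path, _):
  # Mirror get_adamw_mask: join pytree keys into "layer1/bias"-style path strings.
  path_str = "/".join(str(getattr(p, "key", getattr(p, "idx", getattr(p, "name", p)))) for p in path)
  return not any(pat.search(path_str) for pat in patterns)

params = {"layer1": {"kernel": 1.0, "bias": 0.0}, "layer_norm": {"scale": 1.0}}
print(jax.tree_util.tree_map_with_path(is_decayed, params))
# {'layer1': {'bias': False, 'kernel': True}, 'layer_norm': {'scale': False}}

Note that optax.adamw already accepts such a mask (a boolean pytree, or a callable that produces one from the params), which is why get_optimizer can pass get_adamw_mask(config) straight through; adam_pax gains the equivalent behavior via its new mask parameter and the masked tree_map shown in the hunk above.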
Lines changed: 136 additions & 10 deletions
@@ -1,4 +1,4 @@
-# Copyright 2023–2025 Google LLC
+# Copyright 2023–2026 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,19 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
-"""Unit tests for Muon dimension number generation.
-
-This suite verifies that the automatically generated Muon dimension numbers
-for various models match their hardcoded reference values.
-python3 -m pytest -v --pyargs tests.muon_test -rP -s
-"""
-
+""" Unit tests for all optimizers. """
+import re
 import unittest
+from unittest.mock import patch
+import jax
+
+import pytest
 from absl.testing import parameterized
 from optax.contrib import MuonDimensionNumbers as mdn
+
+from maxtext.configs import pyconfig
+from maxtext.optimizers import optimizers
+from maxtext.utils import maxtext_utils
 from maxtext.utils.muon_utils import get_model_mdn
-import pytest
+from tests.utils.test_helpers import get_test_config_path
+from typing import NamedTuple
+
 
 # deepseek2, specific: q_lora_rank=0
 # applicable: deepseek2-16, but not deepseek2-236b (q_lora_rank=1536)
@@ -214,6 +218,11 @@
 
 
 class MuonDimensionTest(parameterized.TestCase):
+  """Unit tests for Muon dimension number generation.
+
+  This suite verifies that the automatically generated Muon dimension numbers
+  for various models match their hardcoded reference values.
+  """
 
   @parameterized.named_parameters(
       ("deepseek2-16b", "deepseek2-16b", DEEPSEEK2_DIMENSION_NUMBER),
@@ -236,5 +245,122 @@ def test_model_integration(self, model_name, expected_output):
     self.assertEqual(actual_output, expected_output)
 
 
+class AdamWMaskTest(parameterized.TestCase):
+  """Tests for the AdamW mask functionality."""
+
+  def test_get_adamw_mask_with_empty_mask(self):
+    """Directly test the get_adamw_mask function with an empty list."""
+    # Case 1: No mask in config (empty list)
+    argv = ["", get_test_config_path(), "run_name=test", "adamw_mask=[]"]
+    config = pyconfig.initialize(argv)
+    mask_fn = optimizers.get_adamw_mask(config)
+    self.assertIsNone(mask_fn)
+
+  def test_get_adamw_mask_with_valid_mask(self):
+    """Directly test the get_adamw_mask function with a valid mask."""
+    # Case 2: Mask in config
+    argv = ["", get_test_config_path(), "run_name=test", "adamw_mask=['bias', '.*norm', '.*ln.*']"]
+    config = pyconfig.initialize(argv)
+    mask_fn = optimizers.get_adamw_mask(config)
+    self.assertTrue(callable(mask_fn))
+
+    params = {"layer1": {"kernel": 1, "bias": 2}, "layer2": {"layer_norm": {"scale": 3}}, "layer3": {"ln": {"scale": 4}}}
+    mask = mask_fn(params)
+    self.assertTrue(mask["layer1"]["kernel"])
+    self.assertFalse(mask["layer1"]["bias"])
+    self.assertFalse(mask["layer2"]["layer_norm"]["scale"])
+    self.assertFalse(mask["layer3"]["ln"]["scale"])
+
+  def test_get_adamw_mask_with_invalid_mask(self):
+    """Test that an invalid regex in the mask config raises an error when used."""
+    # Create a config with an invalid regex (unbalanced bracket)
+    argv = ["", get_test_config_path(), "run_name=test", "adamw_mask=['[']"]
+    config = pyconfig.initialize(argv)
+
+    # Building the mask should raise re.error when the invalid regex is compiled.
+    with self.assertRaises(re.error):
+      optimizers.get_adamw_mask(config)
+
+  def test_get_adamw_mask_with_getattrkey(self):
+    """Test that get_adamw_mask correctly handles GetAttrKey (e.g. from NamedTuples)."""
+
+    class MyParams(NamedTuple):
+      kernel: jax.Array
+      bias: jax.Array
+
+    argv = ["", get_test_config_path(), "run_name=test", "adamw_mask=['bias']"]
+    config = pyconfig.initialize(argv)
+    mask_fn = optimizers.get_adamw_mask(config)
+
+    params = MyParams(kernel=jax.numpy.ones((2, 2)), bias=jax.numpy.zeros((2,)))
+    mask = mask_fn(params)
+
+    self.assertTrue(mask.kernel)
+    self.assertFalse(mask.bias)
+
+  @parameterized.named_parameters(
+      ("adamw", "adamw", "maxtext.optimizers.optimizers.optax.adamw"),
+      ("adam_pax", "adam_pax", "maxtext.optimizers.optimizers.adam_pax"),
+  )
+  def test_optimizer_with_mask(self, opt_type, mock_path):
+    """Test that the optimizer receives the mask function from config and that it works as expected."""
+    # Create a config with a mask list that includes regex patterns
+    argv = [
+        "",
+        get_test_config_path(),
+        "run_name=test",
+        "adamw_mask=['bias', 'layer_norm', 'layer1/.*kernel']",
+        f"opt_type={opt_type}",
+    ]
+    config = pyconfig.initialize(argv)
+    learning_rate_schedule = maxtext_utils.create_learning_rate_schedule(config)
+
+    with patch(mock_path) as mock_opt:
+      # Call get_optimizer
+      optimizers.get_optimizer(config, learning_rate_schedule)
+
+      # Check that the optimizer was called with a mask function
+      mock_opt.assert_called_once()
+      _, kwargs = mock_opt.call_args
+      mask_fn = kwargs["mask"]
+
+      # Verify that mask_fn is not None
+      self.assertIsNotNone(mask_fn)
+
+      # Test the behavior of mask_fn
+      params = {"layer1": {"kernel": 1, "bias": 2}, "layer2": {"layer_norm": {"scale": 3}}, "layer3": [4, 5]}
+
+      mask = mask_fn(params)
+
+      # kernel in layer1 should be False because of 'layer1/.*kernel'
+      self.assertFalse(mask["layer1"]["kernel"])
+      # bias in layer1 should be False because of 'bias'
+      self.assertFalse(mask["layer1"]["bias"])
+      # layer_norm should be False because of 'layer_norm'
+      self.assertFalse(mask["layer2"]["layer_norm"]["scale"])
+      # layer3 elements should be True
+      self.assertTrue(mask["layer3"][0])
+      self.assertTrue(mask["layer3"][1])
+
+  @parameterized.named_parameters(
+      ("adamw", "adamw", "maxtext.optimizers.optimizers.optax.adamw"),
+      ("adam_pax", "adam_pax", "maxtext.optimizers.optimizers.adam_pax"),
+  )
+  def test_optimizer_without_mask(self, opt_type, mock_path):
+    """Test that the optimizer receives None for mask when adamw_mask is not set."""
+    argv = ["", get_test_config_path(), "run_name=test", f"opt_type={opt_type}"]
+    config = pyconfig.initialize(argv)
+    learning_rate_schedule = maxtext_utils.create_learning_rate_schedule(config)
+
+    with patch(mock_path) as mock_opt:
+      # Call get_optimizer
+      optimizers.get_optimizer(config, learning_rate_schedule)
+
+      # Check that the optimizer was called with mask=None
+      mock_opt.assert_called_once()
+      _, kwargs = mock_opt.call_args
+      self.assertIsNone(kwargs["mask"])
+
+
 if __name__ == "__main__":
   unittest.main()
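
To run the expanded suite locally, the command from the removed module docstring should still apply, assuming the test module keeps its tests.muon_test path (this diff does not show a rename):

python3 -m pytest -v --pyargs tests.muon_test -rP -s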
