
Commit 25b5de7

Add support for z-loss in pre-training
1 parent fcaecd2 · commit 25b5de7

6 files changed

Lines changed: 159 additions & 20 deletions


src/maxtext/configs/base.yml

Lines changed: 1 addition & 0 deletions
@@ -334,6 +334,7 @@ sliding_window_size: 0
 chunk_attn_window_size: 0
 attn_logits_soft_cap: 0.0
 final_logits_soft_cap: 0.0
+z_loss_multiplier: 0.0
 use_post_attn_norm: False
 use_post_ffw_norm: False
 mla_naive_kvcache: True

src/maxtext/configs/types.py

Lines changed: 1 addition & 0 deletions
@@ -462,6 +462,7 @@ class Logits(BaseModel):
       None,
       description="Soft-cap value for the final logits. None or 0.0 means no cap.",
   )
+  z_loss_multiplier: float = Field(0.0, description="The multiplier for the z-loss (e.g., 1e-4). 0.0 to disable.")


 class Attention(BaseModel):
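
The new multiplier controls the z-loss regularizer: on top of the cross-entropy term, each token pays a penalty proportional to the squared log of its softmax partition function, which keeps the logits from drifting to large magnitudes. The real computation lives in max_utils.cross_entropy_with_logits; the sketch below only illustrates the quantity the multiplier scales, using the same formula the new unit test checks (standalone JAX, illustrative names, not the MaxText implementation).

import jax
import jax.numpy as jnp

def z_loss_term(logits: jnp.ndarray, z_loss_multiplier: float) -> jnp.ndarray:
  """Per-token z-loss penalty: multiplier * log(Z)^2, with Z the softmax normalizer."""
  # Reduce over the vocabulary axis; one penalty value remains per token position.
  log_z = jax.scipy.special.logsumexp(logits, axis=-1)
  return z_loss_multiplier * jnp.square(log_z)

# With z_loss_multiplier=0.0 the penalty vanishes, so the default leaves training unchanged.
penalty = z_loss_term(jnp.array([[1.0, 2.0, 3.0]]), 1e-4)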

src/maxtext/trainers/pre_train/train.py

Lines changed: 29 additions & 3 deletions
@@ -136,20 +136,32 @@ def loss_fn(model, config, data, dropout_rng, params, is_train=True):
     if config.num_vocab_tiling > 1:
       hidden_state_key = ("intermediates", "decoder", "hidden_states")
       hidden_states = maxtext_utils.get_nested_value(intermediate_outputs, hidden_state_key)[0]
-      total_loss = vocab_tiling_linen_loss(hidden_states, data, config, model, params, is_train)
+      total_loss, total_z_loss = vocab_tiling_linen_loss(hidden_states, data, config, model, params, is_train)
     else:
       one_hot_targets = jax.nn.one_hot(data["targets"], config.vocab_size)
-      xent, _ = max_utils.cross_entropy_with_logits(logits, one_hot_targets)
+      xent, z_loss = max_utils.cross_entropy_with_logits(logits, one_hot_targets, z_loss=config.z_loss_multiplier)
+
       xent = sharding.maybe_shard_with_logical(
           xent,
           ("activation_embed_and_logits_batch", "activation_length"),
           model.mesh,
           config.shard_mode,
           debug_sharding=config.debug_sharding,
       )
+      z_loss = sharding.maybe_shard_with_logical(
+          z_loss,
+          ("activation_embed_and_logits_batch", "activation_length"),
+          model.mesh,
+          config.shard_mode,
+          debug_sharding=config.debug_sharding,
+      )
+
       # Mask out paddings at the end of each example.
       xent = xent * (data["targets_segmentation"] != 0)
+      z_loss = z_loss * (data["targets_segmentation"] != 0)
+
       total_loss = jnp.sum(xent)
+      total_z_loss = jnp.sum(z_loss)
   else:
     # Flax NNX model
     logits = model(
@@ -164,11 +176,17 @@ def loss_fn(model, config, data, dropout_rng, params, is_train=True):
     )
     intermediate_outputs = {}
     one_hot_targets = jax.nn.one_hot(data["targets"], config.vocab_size)
-    xent, _ = max_utils.cross_entropy_with_logits(logits, one_hot_targets)
+    xent, z_loss = max_utils.cross_entropy_with_logits(logits, one_hot_targets, z_loss=config.z_loss_multiplier)
+
     xent = nn.with_logical_constraint(xent, ("activation_embed_and_logits_batch", "activation_length"))
+    z_loss = nn.with_logical_constraint(z_loss, ("activation_embed_and_logits_batch", "activation_length"))
+
     # Mask out paddings at the end of each example.
     xent = xent * (data["targets_segmentation"] != 0)
+    z_loss = z_loss * (data["targets_segmentation"] != 0)
+
     total_loss = jnp.sum(xent)
+    total_z_loss = jnp.sum(z_loss)

   total_weights = jnp.sum(data["targets_segmentation"] != 0)
   # If gradient accumulation is enabled, we don't need to divide total_loss
@@ -188,6 +206,9 @@ def loss_fn(model, config, data, dropout_rng, params, is_train=True):
   # updates and scaling internally.
   loss = total_loss / (total_weights + EPS)

+  # We keep z-loss normalized by total_weights.
+  total_z_loss = total_z_loss / (total_weights + EPS)
+
   # Calculate and Add MTP Loss
   mtp_loss = 0.0
   if config.mtp_num_layers > 0 and is_train:
@@ -230,6 +251,7 @@ def loss_fn(model, config, data, dropout_rng, params, is_train=True):
   aux = {
       "intermediate_outputs": intermediate_outputs,
       "total_loss": total_loss,
+      "z_loss": total_z_loss,
       "total_weights": total_weights,
       "moe_lb_loss": moe_lb_loss,
       "moe_bias_updates": moe_bias_updates,
@@ -302,6 +324,7 @@ def train_step(model, config, state_mesh_shardings, params_shardings, state, dat
   intermediate_outputs = aux["intermediate_outputs"]
   total_weights = aux["total_weights"]
   moe_lb_loss = aux["moe_lb_loss"]
+  z_loss = aux["z_loss"]
   moe_bias_updates = aux["moe_bias_updates"]
   mtp_loss = aux["mtp_loss"]

@@ -345,6 +368,7 @@ def move(path, value):

   scalar_metrics = {
       "learning/loss": loss,
+      "learning/z_loss": z_loss,
       "learning/moe_lb_loss": moe_lb_loss,
       "learning/mtp_loss": mtp_loss,
       "learning/total_weights": total_weights,
@@ -395,12 +419,14 @@ def eval_step(model, config, state, data, dropout_rng):
   mtp_acceptance_rate = calculate_mtp_acceptance_rate(aux["intermediate_outputs"], config)

   total_loss = aux["total_loss"]
+  z_loss = aux["z_loss"]
   total_weights = aux["total_weights"]
   moe_lb_loss = aux["moe_lb_loss"]
   mtp_loss = aux["mtp_loss"]
   metrics = {
       "scalar": {
           "evaluation/loss": loss,
+          "evaluation/z_loss": z_loss,
           "evaluation/total_loss": total_loss,
           "evaluation/total_weights": total_weights,
           "evaluation/moe_lb_loss": moe_lb_loss,

src/maxtext/utils/vocabulary_tiling.py

Lines changed: 25 additions & 14 deletions
@@ -52,7 +52,7 @@ def vocab_tiling_linen_loss(
     params: The model parameters.
     is_train: A boolean indicating if the model is in training mode.
   Returns:
-    The total cross-entropy loss computed via vocab tiling.
+    A tuple of (total_loss, total_z_loss) computed via vocab tiling.
   """
   labels = data["targets"]
   segmentation = data["targets_segmentation"]
@@ -112,8 +112,8 @@ def chunked_cross_entropy_loss(gathered_params, hidden_states, labels, segmentat
     """
     Calculates the total cross-entropy loss using vocab tiling.
     """
-    total_loss, _ = _chunked_cross_entropy_loss_fwd(gathered_params, hidden_states, labels, segmentation)
-    return total_loss
+    (total_loss, total_z_loss), _ = _chunked_cross_entropy_loss_fwd(gathered_params, hidden_states, labels, segmentation)
+    return total_loss, total_z_loss

   def _chunked_cross_entropy_loss_fwd(gathered_params, hidden_states, labels, segmentation):
     batch_size, seq_len, emb_dim = hidden_states.shape
@@ -126,7 +126,8 @@ def _chunked_cross_entropy_loss_fwd(gathered_params, hidden_states, labels, segm
     reshaped_segmentation = _reshape(segmentation, (config.num_vocab_tiling, vocab_tile_size), reshaped_data_spec)

     # Scan body accumulates loss from each tile given chunked hidden states and labels
-    def _fwd_scan_body(loss_accumulator, chunk_data):
+    def _fwd_scan_body(accumulators, chunk_data):
+      loss_accumulator, z_loss_accumulator = accumulators
       hidden_chunk, label_chunk, segmentation_chunk = chunk_data
       hidden_chunk = _maybe_shard_with_name(hidden_chunk, chunked_hidden_spec)
       label_chunk = _maybe_shard_with_name(label_chunk, chunked_data_spec)
@@ -141,14 +142,20 @@ def _fwd_scan_body(loss_accumulator, chunk_data):
       )
       chunk_logits = _maybe_shard_with_name(chunk_logits, chunked_logits_spec)
       one_hot_label_chunk = jax.nn.one_hot(label_chunk, config.vocab_size)
-      chunk_xent, _ = max_utils.cross_entropy_with_logits(chunk_logits, one_hot_label_chunk)
+      chunk_xent, chunk_z_loss = max_utils.cross_entropy_with_logits(
+          chunk_logits, one_hot_label_chunk, z_loss=config.z_loss_multiplier
+      )
+
       masked_xent = jnp.sum(chunk_xent * (segmentation_chunk != 0))
+      masked_z_loss = jnp.sum(chunk_z_loss * (segmentation_chunk != 0))
+
       loss_accumulator += masked_xent
-      return loss_accumulator, None
+      z_loss_accumulator += masked_z_loss
+      return (loss_accumulator, z_loss_accumulator), None

-    initial_loss = 0.0
-    total_loss, _ = jax.lax.scan(
-        _fwd_scan_body, initial_loss, (reshaped_hidden_states, reshaped_labels, reshaped_segmentation)
+    initial_acc = (0.0, 0.0)
+    (total_loss, total_z_loss), _ = jax.lax.scan(
+        _fwd_scan_body, initial_acc, (reshaped_hidden_states, reshaped_labels, reshaped_segmentation)
     )
     residuals = (
         gathered_params,
@@ -160,9 +167,13 @@ def _fwd_scan_body(loss_accumulator, chunk_data):
         emb_dim,
     )

-    return total_loss, residuals
+    return (total_loss, total_z_loss), residuals
+
+  def _chunked_cross_entropy_loss_bwd(residuals, cotangents):
+    # Unpack the cotangents tuple. We ignore the z_loss cotangent since the gradients
+    # of the z_loss term are already factored into the loss_cotangent.
+    loss_cotangent, _ = cotangents

-  def _chunked_cross_entropy_loss_bwd(residuals, loss_cotangent):
     gathered_params, reshaped_hidden_states, reshaped_labels, reshaped_segmentation, batch_size, seq_len, emb_dim = (
         residuals
     )
@@ -176,7 +187,7 @@ def _single_chunk_loss_fn(input_params, input_hidden_chunk, input_label_chunk, i
       )
       chunk_logits = _maybe_shard_with_name(chunk_logits, chunked_logits_spec)
       one_hot_label_chunk = jax.nn.one_hot(input_label_chunk, config.vocab_size)
-      xent, _ = max_utils.cross_entropy_with_logits(chunk_logits, one_hot_label_chunk)
+      xent, _ = max_utils.cross_entropy_with_logits(chunk_logits, one_hot_label_chunk, z_loss=config.z_loss_multiplier)
       return jnp.sum(xent * (input_segmentation_chunk != 0))

     def _bwd_scan_body(grad_params_acc, chunk_data):
@@ -228,11 +239,11 @@ def _bwd_scan_body(grad_params_acc, chunk_data):

   chunked_cross_entropy_loss.defvjp(_chunked_cross_entropy_loss_fwd, _chunked_cross_entropy_loss_bwd)

-  total_loss = chunked_cross_entropy_loss(
+  total_loss, total_z_loss = chunked_cross_entropy_loss(
       gathered_params,
       hidden_states,
       labels,
       segmentation,
   )

-  return total_loss
+  return total_loss, total_z_loss
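
Since chunked_cross_entropy_loss now returns a (loss, z_loss) pair, its custom VJP receives a matching pair of cotangents, and the backward pass drops the z-loss cotangent because, per the comment above, that term's gradient is already folded into the cross-entropy path. A toy illustration of the tuple-cotangent pattern in isolation (standalone JAX, not the MaxText loss itself):

import jax
import jax.numpy as jnp

@jax.custom_vjp
def loss_pair(x):
  # Primal: a differentiated loss plus a monitored auxiliary value.
  return jnp.sum(x * x), jnp.sum(x)

def loss_pair_fwd(x):
  return loss_pair(x), x  # second element is the residual for the backward pass

def loss_pair_bwd(residual_x, cotangents):
  # Cotangents arrive as a tuple matching the primal outputs; the auxiliary
  # cotangent is ignored, mirroring _chunked_cross_entropy_loss_bwd above.
  loss_ct, _ = cotangents
  return (loss_ct * 2.0 * residual_x,)

loss_pair.defvjp(loss_pair_fwd, loss_pair_bwd)

grad = jax.grad(lambda x: loss_pair(x)[0])(jnp.ones(3))  # -> [2., 2., 2.]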

tests/unit/max_utils_test.py

Lines changed: 29 additions & 0 deletions
@@ -86,6 +86,35 @@ def test_t5x_cross_entropy(self):
     # Compare results
     self.assertTrue(jax.numpy.allclose(optax_xent, t5x_xent, rtol=1e-05, atol=1e-08, equal_nan=False))

+  def test_cross_entropy_with_z_loss(self):
+    """Tests the exact mathematical output of the z-loss penalty."""
+    # Logits of shape [2, 2, 3] to test across batch and sequence dimensions
+    logits = jnp.array([[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], [[7.0, 8.0, 9.0], [1.0, 1.0, 1.0]]])
+    # Target indices: [2, 1], [0, 2]
+    targets = jnp.array([[2, 1], [0, 2]])
+    one_hot_targets = jax.nn.one_hot(targets, 3)
+
+    z_loss_multiplier = 1e-4
+
+    # 1. Run without z-loss
+    total_loss_no_z, _ = max_utils.cross_entropy_with_logits(logits, one_hot_targets, z_loss=0.0)
+
+    # 2. Run with z-loss
+    total_loss_with_z, z_loss_only = max_utils.cross_entropy_with_logits(
+        logits, one_hot_targets, z_loss=z_loss_multiplier
+    )
+
+    # 3. Calculate expected z-loss manually
+    # Expected log_z = log(sum(exp(logits), axis=-1))
+    expected_log_z = jax.scipy.special.logsumexp(logits, axis=-1)
+    expected_z_loss = z_loss_multiplier * jnp.square(expected_log_z)
+
+    # Compare isolated z_loss component
+    self.assertTrue(jnp.allclose(z_loss_only, expected_z_loss, rtol=1e-5, atol=1e-8))
+
+    # Compare total loss aggregation
+    self.assertTrue(jnp.allclose(total_loss_with_z, total_loss_no_z + z_loss_only, rtol=1e-5, atol=1e-8))
+

 class MaxUtilsCustomMesh(unittest.TestCase):
   """Tests for the is_valid_custom_mesh function in max_utils.py"""

tests/unit/tiling_test.py

Lines changed: 74 additions & 3 deletions
@@ -19,10 +19,13 @@
 """

 import unittest
+import pytest
+
 from flax import linen as nn
 import jax
 import jax.numpy as jnp
 from jax.sharding import Mesh
+
 from maxtext.configs import pyconfig
 from maxtext.common.common_types import Config
 from maxtext.common.common_types import MODEL_MODE_TRAIN
@@ -31,8 +34,8 @@
 from maxtext.utils import max_utils
 from maxtext.utils import maxtext_utils
 from maxtext.utils.vocabulary_tiling import vocab_tiling_linen_loss
+
 from tests.utils.test_helpers import get_test_config_path
-import pytest


 def compute_loss_linen(intermediate_outputs, logits, data, config, model, params, is_train):
@@ -42,10 +45,10 @@ def compute_loss_linen(intermediate_outputs, logits, data, config, model, params
   if config.num_vocab_tiling > 1:
     hidden_state_key = ("intermediates", "decoder", "hidden_states")
     hidden_states = maxtext_utils.get_nested_value(intermediate_outputs, hidden_state_key)[0]
-    total_loss = vocab_tiling_linen_loss(hidden_states, data, config, model, params, is_train)
+    total_loss, _ = vocab_tiling_linen_loss(hidden_states, data, config, model, params, is_train)
   else:
     one_hot_targets = jax.nn.one_hot(data["targets"], config.vocab_size)
-    xent, _ = max_utils.cross_entropy_with_logits(logits, one_hot_targets)
+    xent, _ = max_utils.cross_entropy_with_logits(logits, one_hot_targets, z_loss=config.z_loss_multiplier)
     xent = nn.with_logical_constraint(xent, ("activation_embed_and_logits_batch", "activation_length"))
     # Mask out paddings at the end of each example.
     xent = xent * (data["targets_segmentation"] != 0)
@@ -186,6 +189,74 @@ def test_gradient_accumulation(self):
         "Gradients of embedding table do not match for GA.",
     )

+  @pytest.mark.tpu_only
+  def test_vocab_tiling_gradient_with_z_loss(self):
+    """
+    Tests loss and gradient correctness when z-loss is enabled, comparing
+    standard computation vs. vocabulary tiling computation.
+    """
+    cfg_non_tiling = pyconfig.initialize(
+        self.base_config,
+        run_name="grad_test_z_loss_no_tiling",
+        enable_checkpointing=False,
+        enable_dropout=False,
+        max_target_length=self.seq_len,
+        per_device_batch_size=self.batch_size,
+        logits_via_embedding=False,
+        base_num_decoder_layers=0,
+        dtype="float32",
+        matmul_precision="high",
+        num_vocab_tiling=1,
+        z_loss_multiplier=1e-4,  # Enable z-loss
+    )
+    quant_non_tiling = quantizations.configure_quantization(cfg_non_tiling)
+    devices_array_non_tiling = maxtext_utils.create_device_mesh(cfg_non_tiling)
+    mesh_non_tiling = Mesh(devices_array_non_tiling, cfg_non_tiling.mesh_axes)
+    model_non_tiling = models.transformer_as_linen(
+        cfg_non_tiling, mesh=mesh_non_tiling, quant=quant_non_tiling, model_mode=MODEL_MODE_TRAIN
+    )
+
+    rng_model, rng_targets = jax.random.split(self.rng)
+
+    params = model_non_tiling.init(
+        {"params": rng_model, "dropout": rng_model},
+        self.dummy_inputs,
+        self.dummy_inputs,
+    )
+
+    data = {
+        "targets": jax.random.randint(rng_targets, (self.batch_size, self.seq_len), 0, cfg_non_tiling.vocab_size),
+        "targets_segmentation": jnp.ones((self.batch_size, self.seq_len)),
+    }
+
+    loss_non_tiling, grads_non_tiling = self.get_grads(cfg_non_tiling, params, data)
+
+    cfg_tiling = pyconfig.initialize(
+        self.base_config,
+        run_name="grad_test_z_loss_with_tiling",
+        enable_checkpointing=False,
+        enable_dropout=False,
+        max_target_length=self.seq_len,
+        per_device_batch_size=self.batch_size,
+        logits_via_embedding=False,
+        base_num_decoder_layers=0,
+        dtype="float32",
+        matmul_precision="high",
+        num_vocab_tiling=4,
+        z_loss_multiplier=1e-4,  # Enable z-loss
+    )
+    loss_tiling, grads_tiling = self.get_grads(cfg_tiling, params, data)
+
+    # Loss correctness test
+    assert jnp.allclose(loss_non_tiling, loss_tiling, rtol=self.rtol), "Losses do not match when z-loss is enabled."
+
+    # Gradient correctness test
+    self.assert_pytrees_all_close(
+        grads_non_tiling,
+        grads_tiling,
+        "Gradients do not match for vocab tiling when z-loss is enabled.",
+    )
+
   @pytest.mark.tpu_only
   def test_vocab_tiling_gradient_non_tied_embedding(self):
     """
