
Commit 992fd41

gagika authored and Google-ML-Automation committed
PR #2968: Implement QK-Clip (Muon-Clip) for MLA attention
Imported from GitHub PR #2968

# Description

Implements QK-Clip, a training-stabilization technique for MLA attention models, as described in the Kimi K2 Technical Report.

**Changes:**

* **Core Logic:** Added `src/MaxText/utils/qk_clip_utils.py` containing `apply_qk_clip` and `calculate_max_logit_metric`.
* **Layers:** Updated `AttentionOp` to `sow` max-logit statistics and `AttentionMLA` to enable this when configured.
* **Training:** Integrated the clipping step and `max_logits` metric reporting into `src/MaxText/train.py`.
* **Tests:** Added `tests/qk_clip_test.py`.

**Context:**

QK-Clip mitigates training instability by preventing attention logits from growing excessively. This implementation:

1. Calculates the global max logit ($S_{max}$) using a GSPMD-compatible `jnp.max`.
2. Computes the per-head scaling factor $\gamma = \min(1, \tau / S_{max})$.
3. Scales $W_q$ and $W_k$ while explicitly leaving the shared rotary keys ($k^R$) and values ($W_v$) untouched.
4. Leverages Flax `sow` to pass statistics from the layers to the training loop efficiently.

# Tests

* **Unit Tests:** Ran `python3 tests/qk_clip_test.py`. Verified:
  * Correct scaling of $W_q$ and $W_k$.
  * Heads below the threshold are not clipped.
  * Shared keys and values remain untouched.
  * Global `max_logits` metric calculation.
  * Error handling for non-MLA attention types.
* **Integration:** Verified that `train_step` executes without shape mismatches or runtime errors.

# Checklist

Before submitting this PR, please make sure (put X in square brackets):

- [x] I have performed a self-review of my code. For an optional AI review, add the `gemini-review` label.
- [x] I have necessary comments in my code, particularly in hard-to-understand areas.
- [x] I have run end-to-end tests and provided workload links above if applicable.
- [x] I have made or will make corresponding changes to the doc if needed, including adding new documentation pages to the relevant Table of Contents (toctree directive) as explained in [our documentation](https://maxtext.readthedocs.io/en/latest/development.html#adding-new-documentation-files).

Copybara import of the project:

--
0db9d22 by Gagik Amirkhanyan <agagik@google.com>:

Implement QK-Clip (Muon-Clip) functionality

add tests for QK-Clip logic
--

Merging this change closes #2968

COPYBARA_INTEGRATE_REVIEW=#2968 from AI-Hypercomputer:agagik-qk-clip 0db9d22
PiperOrigin-RevId: 874946094
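To make the clipping rule concrete, here is a minimal JAX sketch of steps 1–3 with toy numbers (the shapes and values are illustrative; the variable names follow `qk_clip_utils.py`):

```python
import jax.numpy as jnp

# Per-head max attention logits from the forward pass, shape [batch, heads].
max_logits = jnp.array([[120.0, 50.0],
                        [90.0, 30.0]])
tau = 100.0  # qk_clip_threshold

s_max = jnp.max(max_logits, axis=0)             # S_max per head: [120., 50.]
gamma = jnp.minimum(1.0, tau / (s_max + 1e-6))  # min(1, tau/S_max): [~0.83, 1.]

# Head 0 exceeded tau, so its query/key up-projections get rescaled; head 1
# is left alone. W_qc and W_kc each take sqrt(gamma) (so their product in the
# logit scales by gamma), W_qr takes the full gamma, and the shared rotary
# key k^R and the value weights W_v are never scaled.
print(s_max, gamma)
```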
1 parent fc865f4 commit 992fd41

7 files changed

Lines changed: 836 additions & 51 deletions


src/MaxText/utils/qk_clip_utils.py

Lines changed: 160 additions & 0 deletions
@@ -0,0 +1,160 @@
# Copyright 2023–2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utilities for QK-Clip (Muon Clip)."""

import jax
import jax.numpy as jnp


def _get_key_name(k):
  """Helper to unwrap JAX path keys."""
  if hasattr(k, "key"):
    return k.key
  if hasattr(k, "idx"):
    return k.idx
  return k


def calculate_max_logit_metric(intermediate_outputs):
  """Extracts and computes the global maximum logit from intermediate outputs.

  Args:
    intermediate_outputs: A pytree containing model intermediates, potentially
      including 'max_logits' sowed by Attention layers.

  Returns:
    The global maximum logit scalar, or None if no logits were found.
  """
  all_max_logits = []

  def extract_logits(path, val):
    # 'sow' stores values in a tuple/list. tree_map descends into it.
    # The path to the leaf array will look like: (..., 'max_logits', 0),
    # so we check whether the parent key (path[-2]) is 'max_logits'.
    if len(path) >= 2:
      parent_key = _get_key_name(path[-2])
      if parent_key == "max_logits":
        all_max_logits.append(val)

  jax.tree_util.tree_map_with_path(extract_logits, intermediate_outputs)

  if not all_max_logits:
    return None

  return jnp.max(jnp.stack(all_max_logits))


def apply_qk_clip(state, intermediate_outputs, config):
  """Applies QK-Clip to MLA weights based on max_logits.

  Iterates over parameters. If a parameter belongs to an MLA attention layer,
  it finds the corresponding max_logits statistics from intermediate_outputs,
  calculates the clipping factor, and applies it to W_q and W_k components.

  Args:
    state: The current training state containing model parameters.
    intermediate_outputs: A dictionary of intermediate outputs from the model
      forward pass. It is expected to contain 'max_logits' entries sowed by
      Attention layers if QK-Clip is enabled.
    config: The model configuration object, containing QK-Clip hyperparameters
      (e.g. qk_clip_threshold, qk_nope_head_dim) and attention_type.

  Returns:
    A new training state with updated (clipped) parameters.

  Raises:
    ValueError: If the configured attention_type is not 'mla'.
  """
  if getattr(config, "attention_type", None) != "mla":
    raise ValueError(
        f"QK-Clip is only supported for MLA attention (attention_type='mla'). "
        f"Current configuration: {getattr(config, 'attention_type', 'None')}"
    )

  tau = float(config.qk_clip_threshold)

  def clip_mla_weights(path, param):
    """Applies QK-Clip to a single parameter if it's an MLA projection weight.

    Args:
      path: A tuple of JAX Key objects representing the hierarchy path to the
        parameter in the state PyTree.
      param: The actual JAX array (weight tensor) at the given path.

    Returns:
      The scaled parameter if it is an MLA projection ('wq_b' or 'wkv_b'),
      otherwise the original parameter.
    """
    # Skip irrelevant weights (embeddings, norms, etc.).
    # We only care about specific MLA projection matrices ('wq_b', 'wkv_b').
    if len(path) < 2:
      return param

    layer_name = _get_key_name(path[-2])
    if layer_name not in ("wq_b", "wkv_b"):
      return param

    # Search for max_logits in intermediate_outputs.
    curr = intermediate_outputs.get("intermediates", intermediate_outputs)
    for node in path[:-2]:
      key = _get_key_name(node)
      if isinstance(curr, dict) and key in curr:
        curr = curr[key]
      else:
        return param  # Path not found in intermediates, skip.

    if not isinstance(curr, dict) or "max_logits" not in curr:
      return param

    # max_logits was sowed as a tuple (array,) with shape [batch, num_heads].
    max_logits_sowed = curr["max_logits"]
    if not max_logits_sowed:
      return param

    max_logits_batch = max_logits_sowed[0]

    # Calculate S_max per head: the global maximum across the batch dimension.
    # Result shape: [num_heads].
    s_max = jnp.max(max_logits_batch, axis=0)

    # Calculate the scaling factor gamma = tau / s_max; clip only if
    # s_max > tau.
    scale = jnp.minimum(1.0, tau / (s_max + 1e-6))

    # Apply QK clipping based on weight type.
    if layer_name == "wq_b":
      # MLA up-projection for the query: [rank, heads, q_head_dim].
      qk_nope = config.qk_nope_head_dim
      w_qc = param[..., :qk_nope]
      w_qr = param[..., qk_nope:]
      scale_b = scale[None, :, None]  # Broadcast: [1, heads, 1]
      w_qc_new = w_qc * jnp.sqrt(scale_b)
      w_qr_new = w_qr * scale_b
      return jnp.concatenate([w_qc_new, w_qr_new], axis=-1)

    elif layer_name == "wkv_b":
      # MLA up-projection for key/value: [rank, heads, kv_head_dim].
      qk_nope = config.qk_nope_head_dim
      w_kc = param[..., :qk_nope]
      w_v = param[..., qk_nope:]
      scale_b = scale[None, :, None]
      w_kc_new = w_kc * jnp.sqrt(scale_b)
      return jnp.concatenate([w_kc_new, w_v], axis=-1)

    return param

  # Apply the transformation to all parameters.
  new_params = jax.tree_util.tree_map_with_path(clip_mla_weights, state.params)
  return state.replace(params=new_params)
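For context, here is a small self-contained sketch of how the metric helper consumes sown intermediates. The nested layer names (`decoder`, `layers_0`, `attention`) are illustrative stand-ins for whatever paths the model actually sows under, and the import path assumes the repo layout above:

```python
import jax.numpy as jnp
from MaxText.utils import qk_clip_utils  # path per this PR's file layout

# Minimal stand-in for the pytree returned by
# model.apply(..., mutable=["intermediates"]): Flax `sow` stores each
# recorded value as a tuple under its name.
intermediate_outputs = {
    "intermediates": {
        "decoder": {
            "layers_0": {"attention": {"max_logits": (jnp.array([[120.0, 50.0]]),)}},
            "layers_1": {"attention": {"max_logits": (jnp.array([[90.0, 30.0]]),)}},
        }
    }
}

# The helper walks the pytree, collects every 'max_logits' leaf, and
# reduces them to a single scalar for metric reporting.
print(qk_clip_utils.calculate_max_logit_metric(intermediate_outputs))  # 120.0
```

`apply_qk_clip(state, intermediate_outputs, config)` then reuses the same pytree, matching each `wq_b`/`wkv_b` parameter against the `max_logits` sown at the corresponding layer path.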

src/maxtext/configs/base.yml

Lines changed: 4 additions & 0 deletions
@@ -355,6 +355,10 @@ qk_nope_head_dim: 128
 qk_rope_head_dim: 64
 v_head_dim: 128

+# QK-Clip (Muon Clip) Configuration
+use_qk_clip: False # Enable QK-Clip (supported in MLA with DotProduct or Tokamax Splash)
+qk_clip_threshold: 100.0 # Threshold for clipping (tau in the paper)
+
 # Combine matmuls for QKV and MLP
 fused_qkv: False
 fused_mlp: False
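Assuming this PR keeps MaxText's usual `key=value` command-line overrides, the feature can be switched on per run without editing base.yml, e.g. by appending `use_qk_clip=True qk_clip_threshold=100.0` to the training invocation.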

src/maxtext/configs/types.py

Lines changed: 14 additions & 0 deletions
@@ -497,6 +497,8 @@ class Attention(BaseModel):
   use_tokamax_splash: bool = Field(False, description="Whether to use tokamax splash attention.")
   use_jax_splash: bool = Field(False, description="Whether to use jax splash attention.")
   force_q_layout: bool = Field(False, description="Force the Q layout")
+  use_qk_clip: bool = Field(False, description="Whether to use QK-Clip (MuonClip) for training stability.")
+  qk_clip_threshold: float = Field(100.0, description="Threshold for QK-Clip (tau).")


 class MoBa(BaseModel):
@@ -2410,6 +2412,18 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
     if self.force_q_layout and not self.use_jax_splash:
       raise ValueError("`force_q_layout` can only be true if `use_jax_splash` is also true.")

+    if self.use_qk_clip and self.attention_type != "mla":
+      raise ValueError(
+          f"QK-Clip is only supported when attention_type='mla', but found attention_type='{self.attention_type}'."
+      )
+
+    if self.use_qk_clip and self.attn_logits_soft_cap is not None:
+      raise ValueError(
+          "QK-Clip monitors raw dot products, but attn_logits_soft_cap is enabled. "
+          "Recording pre-cap max_logits is not fully supported yet. "
+          "Please disable attn_logits_soft_cap when using use_qk_clip."
+      )
+
     # I. FINAL TYPE CONVERSIONS AND DERIVED LISTS
     # Create the ici_parallelism and dcn_parallelism lists for legacy compatibility.
     if self.using_pipeline_parallelism and self.mesh_axes and self.mesh_axes[0] == "stage":

src/maxtext/layers/attention_mla.py

Lines changed: 13 additions & 2 deletions
@@ -1030,15 +1030,26 @@ def __call__(
         attention_mask=attention_mask,
     )

+    # Check if we need QK Clip stats
+    use_qk_clip = self.model_mode == MODEL_MODE_TRAIN and self.config.use_qk_clip
+
     if self.config.attention == "paged" and model_mode != MODEL_MODE_TRAIN:
       unnormalized_out, _, exp_sum = self.ds_paged_attention_op(
           query, key, value, decoder_segment_ids, model_mode, previous_chunk, slot=slot, page_state=page_state
       )
       unnormalized_out = unnormalized_out[..., : self.v_head_dim]
       out = unnormalized_out / (exp_sum + 1e-9) if exp_sum is not None else unnormalized_out
     else:
-      # Pass the index_mask to the Attention Op
-      out = self.attention_op(query, key, value, decoder_segment_ids, model_mode, cached_values, index_mask=index_mask)
+      out = self.attention_op(
+          query,
+          key,
+          value,
+          decoder_segment_ids,
+          model_mode,
+          cached_values,
+          index_mask=index_mask,
+          record_max_logits=use_qk_clip,
+      )

     out = jax.ad_checkpoint.checkpoint_name(out, "attention_out")
     if model_mode == MODEL_MODE_TRAIN and self.config.expert_shard_attention_option == EP_AS_CONTEXT: