Commit 5a4a9c3
Merge pull request #3218 from AI-Hypercomputer:cos_loss
PiperOrigin-RevId: 875535755
2 parents 44039d8 + 5d53a91 commit 5a4a9c3

9 files changed

Lines changed: 244 additions & 167 deletions

src/maxtext/configs/base.yml

Lines changed: 8 additions & 0 deletions
@@ -1122,3 +1122,11 @@ engram_vocab_bases: []
 engram_kernel_size: 4
 # The seed for Engram hash mapping.
 engram_seed: 0
+
+##### Distillation parameters
+distill_alpha: 0.5
+distill_temperature: 1.0
+# distill_beta is used for a cosine similarity loss between intermediate activations of out_proj in the teacher/student models.
+# A value of 0.0 disables this feature.
+distill_beta: 0.0
+distill_layer_indices: None
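
Taken together, these knobs control a weighted sum of three terms: distill_alpha trades off the soft (temperature-scaled KL) loss against the hard cross-entropy loss, and distill_beta scales the cosine feature loss on the layers named by distill_layer_indices. A rough illustrative sketch of that relationship (not code from this commit; the actual loss terms are computed in distillation_utils.py further below):

def combined_distillation_loss(soft_loss, hard_loss, feature_loss, distill_alpha=0.5, distill_beta=0.0):
  # soft_loss is computed from logits softened by distill_temperature;
  # distill_beta == 0.0 disables the cosine feature term entirely.
  return distill_alpha * soft_loss + (1.0 - distill_alpha) * hard_loss + distill_beta * feature_loss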

src/maxtext/configs/types.py

Lines changed: 9 additions & 0 deletions
@@ -1057,6 +1057,8 @@ class Distillation(BaseModel):
   # --- Loss Params ---
   distill_alpha: float = Field(0.5, description="Weight for the distillation loss component.")
   distill_temperature: float = Field(1.0, description="Temperature for distillation softening.")
+  distill_beta: float = Field(0.0, description="Weight for the feature loss component. Use 0.0 to disable")
+  distill_layer_indices: None | list = Field(None, description="Feature indices for feature loss.")


 class TrainingLoop(BaseModel):
@@ -2010,6 +2012,13 @@ def validate_and_set_hlo_dump_defaults():
     # Validate and initiate hlo dump related configs
     validate_and_set_hlo_dump_defaults()

+    # Validate nnx sow incompatibility
+    if self.distill_beta > 0.0:
+      if not self.scan_layers:
+        raise ValueError("a value of self.distill_beta > 0.0 requires self.scan_layers = True")
+      if not self.enable_nnx:
+        raise ValueError("a value of self.distill_beta > 0.0 requires self.enable_nnx = True")
+
     # D. CALCULATE MODEL DIMENSIONS from global_parameter_scale
     # This allows scaling the model size up or down easily with a single power-of-two factor.
     emb_scale, num_head_scale, mlp_dim_scale, layer_scale = get_individual_scales(self.global_parameter_scale)
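
The new fields and the fail-fast guard can be exercised in isolation. A minimal sketch using plain pydantic (it only mirrors the field declarations above; the real MaxText Config wiring and where scan_layers/enable_nnx live are not shown):

from pydantic import BaseModel, Field

class DistillationSketch(BaseModel):
  distill_alpha: float = Field(0.5)
  distill_temperature: float = Field(1.0)
  distill_beta: float = Field(0.0)
  distill_layer_indices: None | list = Field(None)

cfg = DistillationSketch(distill_beta=0.1, distill_layer_indices=[3, 7])
# Mirror of the guard added above: the feature loss needs scanned layers and NNX.
scan_layers, enable_nnx = True, True
if cfg.distill_beta > 0.0 and not (scan_layers and enable_nnx):
  raise ValueError("distill_beta > 0.0 requires scan_layers=True and enable_nnx=True")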

src/maxtext/layers/attentions.py

Lines changed: 2 additions & 0 deletions
@@ -1162,5 +1162,7 @@ def __call__(
     out = out.reshape(batch_size, seq_len, self.config.num_query_heads * self.config.head_dim)
     out = out * jax.nn.sigmoid(gate)
     out = self.out_projection(out, out_sharding=out_sharding)
+    if self.config.distill_beta > 0.0:
+      self.sow(nnx.Intermediate, "out_projection_activations", out)
     out = checkpoint_name(out, "out_proj")
     return out, kv_cache
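
For readers unfamiliar with sow: in Flax NNX it records a value on the module as an nnx.Intermediate variable without changing the function's return value. A small, self-contained sketch with a toy module and made-up shapes (not MaxText code; attribute access on the sowed variable assumes a recent flax.nnx API):

from flax import nnx
import jax.numpy as jnp

class TinyBlock(nnx.Module):
  def __init__(self, din, dout, rngs: nnx.Rngs):
    self.proj = nnx.Linear(din, dout, rngs=rngs)

  def __call__(self, x):
    y = self.proj(x)
    # Analogous to "out_projection_activations" above: stash the projection output.
    self.sow(nnx.Intermediate, "proj_activations", y)
    return y

block = TinyBlock(4, 8, rngs=nnx.Rngs(0))
_ = block(jnp.ones((2, 3, 4)))
# Sowed values accumulate as a tuple on the module; the last entry is the newest.
print(block.proj_activations.value[-1].shape)  # (2, 3, 8)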

src/maxtext/models/models.py

Lines changed: 7 additions & 0 deletions
@@ -477,6 +477,12 @@ def __call__(
     if audio_embeddings is not None:
       audio_masks = mm_processor.get_bidirectional_mask_audio(self.config, decoder_input_tokens)

+    mutable_collections = []
+    if self.config.record_internal_nn_metrics:
+      mutable_collections.append("intermediates")
+    if self.config.distill_beta > 0.0 and "intermediates" not in mutable_collections:
+      mutable_collections.append("intermediates")
+
     logits, hidden_state, kv_caches = self.decoder(
         shared_embedding=self.token_embedder,
         decoder_input_tokens=decoder_input_tokens,
@@ -495,6 +501,7 @@ def __call__(
         kv_caches=kv_caches,
         attention_metadata=attention_metadata,
         deepstack_visual_embeds=deepstack_visual_embeds,
+        mutable=mutable_collections,
     )

     # Materialize hidden state when vocab tiling is enabled
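
The mutable=["intermediates"] plumbing follows the Flax Linen convention that sowed collections must be marked mutable during apply, otherwise the recorded values are dropped. A small standalone Linen sketch of that behavior (illustrative only, not MaxText code; the decoder call above threads the same flag through):

import jax
import jax.numpy as jnp
import flax.linen as nn

class Tiny(nn.Module):
  @nn.compact
  def __call__(self, x):
    y = nn.Dense(8)(x)
    self.sow("intermediates", "dense_out", y)  # only kept if the collection is mutable
    return y

m = Tiny()
variables = m.init(jax.random.PRNGKey(0), jnp.ones((2, 4)))
out, state = m.apply({"params": variables["params"]}, jnp.ones((2, 4)), mutable=["intermediates"])
print(state["intermediates"]["dense_out"][0].shape)  # (2, 8)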

src/maxtext/trainers/post_train/distillation/distillation_utils.py

Lines changed: 75 additions & 5 deletions
@@ -18,7 +18,7 @@
 model structures with Tunix's training interfaces.
 """

-from typing import Any, Iterator
+from typing import Any, Iterator, Optional, List, Callable

 import flax
 from flax import nnx
@@ -110,9 +110,52 @@ def __next__(self) -> MaxTextTrainingInput:
 # -----------------------------------------------------------------------------
 # Distillation Strategy
 # -----------------------------------------------------------------------------
-class MonitoredLogitStrategy(logit.LogitStrategy):
+class CombinedDistillationStrategy(logit.LogitStrategy):
   """Logit Strategy that returns detailed metrics for TensorBoard."""

+  def __init__(
+      self,
+      student_forward_fn: Callable[..., jax.Array],
+      teacher_forward_fn: Callable[..., jax.Array],
+      labels_fn: Callable[..., jax.Array],
+      temperature: float = 2.0,
+      alpha: float = 0.5,
+      beta_feature: float = 0.0,
+      layer_indices: Optional[List[int]] = None,
+      feature_loss_fn: Callable[[jax.Array, jax.Array], jax.Array] | None = None,
+      cosine_distance_axis: int | tuple[int, ...] = -1,
+  ):
+    """Initializes the Combined strategy using tunix logit.LogitStrategy.
+
+    Args:
+      student_forward_fn: Inherited from `logit.LogitStrategy`. Function to compute student model outputs.
+      teacher_forward_fn: Inherited from `logit.LogitStrategy`. Function to compute teacher model outputs.
+      labels_fn: Inherited from `logit.LogitStrategy`. Function to compute labels from model inputs.
+      temperature: Inherited from `logit.LogitStrategy`. Temperature for softening probabilities (> 0).
+      alpha: Inherited from `logit.LogitStrategy`. Weight to balance distillation loss and task loss (0.0 to 1.0).
+      beta_feature: Weight to balance feature loss (0.0 to 1.0). 0.0 disables feature loss.
+      layer_indices: Layer indices to apply feature loss.
+      feature_loss_fn: A function that takes two jax.Arrays (student_map,
+        teacher_map) and returns a scalar loss. Defaults to Cosine Distance.
+      cosine_distance_axis: The axis to use for cosine distance computation if
+        feature_loss_fn is not provided. Defaults to -1.
+    """
+    super().__init__(
+        student_forward_fn=student_forward_fn,
+        teacher_forward_fn=teacher_forward_fn,
+        labels_fn=labels_fn,
+        temperature=temperature,
+        alpha=alpha,
+    )
+    self.beta_feature = beta_feature
+    self.layer_indices = jnp.array(layer_indices) if layer_indices is not None else None
+
+    self.feature_loss_fn = feature_loss_fn
+    if feature_loss_fn is None:
+      self.feature_loss_fn = lambda student_features, teacher_features: jnp.mean(
+          optax.cosine_distance(student_features, teacher_features, axis=cosine_distance_axis)
+      )
+
   def compute_loss(
       self,
       student_output: jax.Array,
@@ -123,8 +166,18 @@ def compute_loss(
     # Calculate Distillation Loss (KL Divergence)
     # Scale logits by temperature T for soft targets
     # We use explicit float32 casting for stability in loss calculation
-    s_logits = student_output.astype(jnp.float32)
-    t_logits = teacher_output.astype(jnp.float32)
+    s_logits = student_output[0].astype(jnp.float32)
+    t_logits = teacher_output[0].astype(jnp.float32)
+
+    # Shape: [num_layers, batch, seq, hidden_dim]
+    s_features = student_output[-1]
+    t_features = teacher_output[-1]
+
+    if (s_features is None or t_features is None) and self.beta_feature > 0.0:
+      raise ValueError(
+          "Features extracted from student or teacher model are None, but distill_beta > 0.0. "
+          "Ensure the model architecture supports feature extraction (e.g., 'out_projection_activations' is sowed)."
+      )

     log_student_probs_temp = jax.nn.log_softmax(s_logits / self.temperature, axis=-1)
     teacher_probs_temp = jax.nn.softmax(t_logits / self.temperature, axis=-1)
@@ -144,14 +197,31 @@ def compute_loss(
     teacher_hard_loss = jnp.mean(ce_loss_teacher)

     # 3. Combine losses
-    total_loss = (self.alpha * soft_loss) + ((1.0 - self.alpha) * hard_loss)
+    base_logit_loss = (self.alpha * soft_loss) + ((1.0 - self.alpha) * hard_loss)
+
+    feature_loss = 0.0
+    if self.beta_feature > 0.0:
+
+      if self.layer_indices is not None:
+        # jnp.take slices along axis=0 (the layer dimension)
+        s_features_sliced = jnp.take(s_features, self.layer_indices, axis=0)
+        t_features_sliced = jnp.take(t_features, self.layer_indices, axis=0)
+      else:
+        s_features_sliced = s_features
+        t_features_sliced = t_features
+
+      feature_loss = self.beta_feature * self.feature_loss_fn(s_features_sliced, t_features_sliced)
+
+    total_loss = base_logit_loss + feature_loss

     # 4. Return Loss AND Metrics
     metrics = {
         "distill/soft_loss": soft_loss,
         "distill/hard_loss": hard_loss,
         "distill/kl_div": jnp.mean(kl_div),
         "distill/teacher_loss": teacher_hard_loss,
+        "distill/out_proj_feature_loss": feature_loss,
+        "distill/total_loss": total_loss,
     }
     return total_loss, metrics
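
To make the feature-loss path concrete, here is a toy, self-contained sketch of the slicing and default cosine-distance reduction used above (shapes and values are made up; layer_indices=[3, 7] is only an example of distill_layer_indices):

import jax.numpy as jnp
import optax

# Stacked per-layer out_proj activations: [num_layers, batch, seq, hidden]
s_features = jnp.ones((8, 2, 16, 64))
t_features = jnp.full((8, 2, 16, 64), 0.5)
layer_indices = jnp.array([3, 7])

# Select the distilled layers along axis 0, exactly as compute_loss does.
s_sliced = jnp.take(s_features, layer_indices, axis=0)
t_sliced = jnp.take(t_features, layer_indices, axis=0)

# Default feature loss: mean cosine distance over the last (hidden) axis.
feature_loss = jnp.mean(optax.cosine_distance(s_sliced, t_sliced))
print(feature_loss)  # 0.0 here, since the toy student/teacher features are parallel vectors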

src/maxtext/trainers/post_train/distillation/train_distill.py

Lines changed: 9 additions & 3 deletions
@@ -135,14 +135,16 @@ def model_forward_fn(model, input_tokens, positions, attention_mask, decoder_seg
     del kwargs  # Unused
     del attention_mask  # Unused
     del cache  # Unused
-
     logits = model(
         decoder_input_tokens=input_tokens,
         decoder_positions=positions,
         decoder_segment_ids=decoder_segment_ids,
         enable_dropout=config.enable_dropout,
     )
-    return logits
+    hidden_features = None
+    if config.distill_beta > 0.0:
+      hidden_features = maxtext_utils.get_intermediate_value(model, "out_projection_activations", clear=True)
+    return logits, hidden_features

   return model_forward_fn

@@ -356,14 +358,18 @@ def labels_fn(targets, **kwargs):
   teacher_forward_fn = create_forward_fn(teacher_config)

   # Use Monitored strategy from Utils
-  strategy = distillation_utils.MonitoredLogitStrategy(
+  strategy = distillation_utils.CombinedDistillationStrategy(
       student_forward_fn=student_forward_fn,
       teacher_forward_fn=teacher_forward_fn,
       labels_fn=labels_fn,
       temperature=student_config.distill_temperature,
       alpha=student_config.distill_alpha,
+      beta_feature=student_config.distill_beta,
+      layer_indices=student_config.distill_layer_indices,
   )

+  student_model, teacher_model = strategy.pre_process_models(student_model, teacher_model)
+
   # 4. Optimizer & Config
   optimizer = get_distillation_optimizer(student_config, student_config.steps)
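
The net effect of these two hunks is a new forward-function contract: each forward pass now returns a (logits, hidden_features) pair, which CombinedDistillationStrategy.compute_loss unpacks as student_output[0] and student_output[-1]. A tiny illustrative sketch of that contract (fake shapes, no real model involved):

import jax.numpy as jnp

def fake_forward_fn(distill_beta=0.0):
  logits = jnp.zeros((2, 16, 32))                                        # [batch, seq, vocab]
  features = jnp.zeros((4, 2, 16, 8)) if distill_beta > 0.0 else None    # [layers, batch, seq, hidden]
  return logits, features

student_output = fake_forward_fn(distill_beta=0.1)
s_logits, s_features = student_output[0], student_output[-1]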

src/maxtext/utils/maxtext_utils.py

Lines changed: 28 additions & 0 deletions
@@ -897,6 +897,34 @@ def get_nested_value(dictionary, nested_key, default=None):
   return current_level


+def get_intermediate_value(model, nested_key, default=None, clear=False):
+  """
+  Retrieves an intermediate value from an NNX model. This function has context about
+  where the intermediate value is located.
+
+  Args:
+    model: The NNX model.
+    nested_key: A string representing the nested key, e.g., hidden_states_norm_out.
+    default: The value to return if the nested key is not found.
+    clear: Clears the intermediate value from the model.
+
+  Returns:
+    The value associated with the nested key, or the default value if not found.
+  """
+  intermediate_value = default
+  match nested_key:
+    case "out_projection_activations":
+      if nested_key in model.decoder.layers["self_attention"]:
+        intermediate_value = model.decoder.layers["self_attention"][nested_key].get_value()[-1]
+        if clear:
+          del model.decoder.layers["self_attention"][nested_key]
+    case _:
+      # Default case to handle any unknown nested keys
+      raise ValueError(f"Incorrect nested_key: {nested_key}")
+
+  return intermediate_value
+
+
 def update_state_param(state, target_path, value):
   """
   Updates a specific parameter in state.params at the given path.
