
Commit 5c76bdd

Merge pull request #3260 from AI-Hypercomputer:cleanup_distillation_classes

PiperOrigin-RevId: 875907919
2 parents f255755 + bc7b717, commit 5c76bdd

3 files changed: 58 additions & 20 deletions

File tree

- src/maxtext/trainers/post_train/distillation/distillation_utils.py
- src/maxtext/trainers/post_train/distillation/train_distill.py
- tests/unit/train_distill_test.py

File: src/maxtext/trainers/post_train/distillation/distillation_utils.py
Lines changed: 23 additions & 10 deletions
```diff
@@ -40,6 +40,16 @@
 # -----------------------------------------------------------------------------
 
 
+@flax.struct.dataclass(frozen=True)
+class DistillationForwardOutput:
+  """Dataclass to carry MaxText-specific output fields."""
+
+  #: logits
+  logits: jax.Array = None
+  #: out_projection_activations
+  out_projection_activations: jax.Array = None
+
+
 @flax.struct.dataclass(frozen=True)
 class MaxTextTrainingInput(distillation_trainer.TrainingInput):
   """Extended TrainingInput dataclass to carry MaxText-specific fields."""
```
```diff
@@ -115,8 +125,8 @@ class CombinedDistillationStrategy(logit.LogitStrategy):
 
   def __init__(
       self,
-      student_forward_fn: Callable[..., jax.Array],
-      teacher_forward_fn: Callable[..., jax.Array],
+      student_forward_fn: Callable[..., DistillationForwardOutput],
+      teacher_forward_fn: Callable[..., DistillationForwardOutput],
       labels_fn: Callable[..., jax.Array],
       temperature: float = 2.0,
       alpha: float = 0.5,
@@ -158,20 +168,20 @@ def __init__(
 
   def compute_loss(
       self,
-      student_output: jax.Array,
-      teacher_output: jax.Array,
+      student_output: DistillationForwardOutput,
+      teacher_output: DistillationForwardOutput,
       labels: jax.Array,
   ) -> tuple[jax.Array, dict[str, jax.Array]]:
     """Computes Loss and Auxiliary Metrics."""
     # Calculate Distillation Loss (KL Divergence)
     # Scale logits by temperature T for soft targets
     # We use explicit float32 casting for stability in loss calculation
-    s_logits = student_output[0].astype(jnp.float32)
-    t_logits = teacher_output[0].astype(jnp.float32)
+    s_logits = student_output.logits.astype(jnp.float32)
+    t_logits = teacher_output.logits.astype(jnp.float32)
 
     # Shape: [num_layers, batch, seq, hidden_dim]
-    s_features = student_output[-1]
-    t_features = teacher_output[-1]
+    s_features = student_output.out_projection_activations
+    t_features = teacher_output.out_projection_activations
 
     if (s_features is None or t_features is None) and self.beta_feature > 0.0:
       raise ValueError(
@@ -210,6 +220,9 @@ def compute_loss(
       s_features_sliced = s_features
       t_features_sliced = t_features
 
+    s_features_sliced = s_features_sliced.astype(jnp.float32)
+    t_features_sliced = t_features_sliced.astype(jnp.float32)
+
     feature_loss = self.beta_feature * self.feature_loss_fn(s_features_sliced, t_features_sliced)
 
     total_loss = base_logit_loss + feature_loss
```
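To make the combined quantities concrete, here is an illustrative stand-in, assuming a conventional temperature-scaled KL divergence for the logit term and a mean-squared-error `feature_loss_fn`. The actual `base_logit_loss` comes from the parent `LogitStrategy` and the real feature loss is configurable, so treat this as a sketch of the math, not the implementation:

```python
import jax
import jax.numpy as jnp


def sketch_combined_loss(s_logits, t_logits, s_feats, t_feats,
                         temperature=2.0, beta_feature=0.1):
  """Temperature-scaled KL on logits plus an MSE feature term (illustrative)."""
  # Explicit float32 casting for stability, matching the diff.
  s_logits = s_logits.astype(jnp.float32)
  t_logits = t_logits.astype(jnp.float32)

  # Soft targets at temperature T.
  t_probs = jax.nn.softmax(t_logits / temperature, axis=-1)
  s_log_probs = jax.nn.log_softmax(s_logits / temperature, axis=-1)

  # KL(teacher || student); the T**2 factor restores gradient scale.
  kl = jnp.sum(t_probs * (jnp.log(t_probs + 1e-9) - s_log_probs), axis=-1)
  base_logit_loss = (temperature**2) * jnp.mean(kl)

  # Feature term on float32-cast activations, as the new hunk enforces.
  feature_loss = beta_feature * jnp.mean(
      (s_feats.astype(jnp.float32) - t_feats.astype(jnp.float32)) ** 2
  )
  return base_logit_loss + feature_loss
```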
```diff
@@ -227,13 +240,13 @@ def compute_loss(
 
   def compute_eval_loss(
       self,
-      student_output: jax.Array,
+      student_output: DistillationForwardOutput,
       labels: jax.Array,
   ) -> tuple[jax.Array, dict[str, jax.Array]]:
     """Computes Eval Loss and returns empty aux dict (required for consistency)."""
     # Parent logic for task loss
     # We re-implement simple CE here to ensure float32 casting
-    s_logits = student_output.astype(jnp.float32)
+    s_logits = student_output.logits.astype(jnp.float32)
     ce_loss = optax.softmax_cross_entropy(logits=s_logits, labels=labels)
     task_loss = jnp.mean(ce_loss)
 
```
File: src/maxtext/trainers/post_train/distillation/train_distill.py
Lines changed: 12 additions & 6 deletions
```diff
@@ -33,7 +33,7 @@
 a standard interface (call signature) that the Tunix `DistillationTrainer` expects.
 """
 
-from typing import Sequence
+from typing import Sequence, Callable
 from absl import app
 from flax import nnx
 from flax.linen import partitioning as nn_partitioning
@@ -119,7 +119,7 @@ def optimizer_factory(learning_rate):
   return optimizer
 
 
-def create_forward_fn(config: pyconfig.HyperParameters):
+def create_forward_fn(config: pyconfig.HyperParameters) -> Callable[..., distillation_utils.DistillationForwardOutput]:
   """Creates a forward function closure that binds the specific model configuration.
 
   Args:
@@ -130,7 +130,9 @@ def create_forward_fn(config: pyconfig.HyperParameters):
     Tunix `LogitStrategy` and handles the MaxText-specific forward call.
   """
 
-  def model_forward_fn(model, input_tokens, positions, attention_mask, decoder_segment_ids=None, cache=None, **kwargs):
+  def model_forward_fn(
+      model, input_tokens, positions, attention_mask, decoder_segment_ids=None, cache=None, **kwargs
+  ) -> distillation_utils.DistillationForwardOutput:
     """Forward pass wrapper adapted for raw MaxText models."""
     del kwargs  # Unused
     del attention_mask  # Unused
@@ -141,10 +143,14 @@ def model_forward_fn(model, input_tokens, positions, attention_mask, decoder_seg
         decoder_segment_ids=decoder_segment_ids,
         enable_dropout=config.enable_dropout,
     )
-    hidden_features = None
+    out_projection_activations = None
     if config.distill_beta > 0.0:
-      hidden_features = maxtext_utils.get_intermediate_value(model, "out_projection_activations", clear=True)
-    return logits, hidden_features
+      out_projection_activations = maxtext_utils.get_intermediate_value(model, "out_projection_activations", clear=True)
+
+    retval = distillation_utils.DistillationForwardOutput(
+        logits=logits, out_projection_activations=out_projection_activations
+    )
+    return retval
 
   return model_forward_fn
 
```
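A hedged usage sketch of the closure (the `batch` keys and trainer wiring here are assumptions, not part of this diff): callers now read named fields from the returned `DistillationForwardOutput` instead of unpacking a positional tuple, which is what let `compute_loss` drop the `student_output[0]` / `student_output[-1]` indexing above.

```python
# Hypothetical call site; only the function names come from the diff.
forward_fn = create_forward_fn(config)

output = forward_fn(
    model,
    input_tokens=batch["inputs"],
    positions=batch["positions"],
    attention_mask=None,  # accepted but deleted inside the wrapper
    decoder_segment_ids=batch.get("segment_ids"),
)

logits = output.logits
# None unless config.distill_beta > 0.0 captured the intermediates.
features = output.out_projection_activations
```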

File: tests/unit/train_distill_test.py
Lines changed: 23 additions & 4 deletions
```diff
@@ -169,15 +169,21 @@ def test_monitored_strategy(self):
 
     # Dummy inputs (batch=1, seq=2, vocab=4)
     # Note: Shapes must align for broadcasting
-    student_logits = (jnp.array([[[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]]]) * 10, jnp.ones((32, 1, 1, 8)))
-    teacher_logits = (jnp.array([[[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]]]) * 10, jnp.ones((32, 1, 1, 8)))
+    student_output = distillation_utils.DistillationForwardOutput(
+        logits=jnp.array([[[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]]]) * 10,
+        out_projection_activations=jnp.ones((32, 1, 1, 8)),
+    )
+    teacher_output = distillation_utils.DistillationForwardOutput(
+        logits=jnp.array([[[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]]]) * 10,
+        out_projection_activations=jnp.ones((32, 1, 1, 8)),
+    )
 
     # Labels must be One-Hot Encoded to match logits shape (1, 2, 4)
     labels_indices = jnp.array([[0, 1]])
     labels = jax.nn.one_hot(labels_indices, 4)
 
     # Run calculation
-    _, metrics = strategy.compute_loss(student_logits, teacher_logits, labels)
+    _, metrics = strategy.compute_loss(student_output, teacher_output, labels)
 
     # Verify structure
     self.assertIsInstance(metrics, dict)
@@ -203,7 +209,20 @@ def test_strategy_compute_eval_loss(self):
     strategy = distillation_utils.CombinedDistillationStrategy(
         student_forward_fn=mock.Mock(), teacher_forward_fn=mock.Mock(), labels_fn=mock.Mock(), temperature=1.0, alpha=0.5
     )
-    logits = jnp.array([[[10.0, 0.0]]])
+    # Case where feature loss is enabled
+    logits = distillation_utils.DistillationForwardOutput(
+        logits=jnp.array([[[10.0, 0.0]]]), out_projection_activations=np.ones((32, 1, 1, 8))
+    )
+    labels = jnp.array([[[1.0, 0.0]]])
+
+    loss, aux = strategy.compute_eval_loss(logits, labels)
+    self.assertTrue(isinstance(loss, jax.Array))
+    self.assertEqual(aux, {})
+
+    # Case where feature loss is disabled.
+    logits = distillation_utils.DistillationForwardOutput(
+        logits=jnp.array([[[10.0, 0.0]]]), out_projection_activations=None
+    )
     labels = jnp.array([[[1.0, 0.0]]])
 
     loss, aux = strategy.compute_eval_loss(logits, labels)
```
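As a sanity check on what the new eval-path cases assert (a sketch using only `jax.numpy` and `optax`): with logits `[[[10.0, 0.0]]]` and one-hot labels `[[[1.0, 0.0]]]`, the cross entropy is `log(1 + e^(-10)) ≈ 4.54e-05`, and it is identical in both cases because `compute_eval_loss` only ever reads `student_output.logits`:

```python
import jax.numpy as jnp
import optax

logits = jnp.array([[[10.0, 0.0]]], dtype=jnp.float32)
labels = jnp.array([[[1.0, 0.0]]], dtype=jnp.float32)

# The same computation compute_eval_loss performs on student_output.logits.
ce_loss = optax.softmax_cross_entropy(logits=logits, labels=labels)
print(jnp.mean(ce_loss))  # ~4.54e-05, i.e. log(1 + exp(-10))
```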
