Skip to content

Commit b6bea94

Browse files
committed
added a unit test + format
1 parent 0d80d3d commit b6bea94

2 files changed

Lines changed: 63 additions & 4 deletions

File tree

src/maxtext/trainers/post_train/distillation/train_distill.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -131,8 +131,7 @@ def create_forward_fn(config: pyconfig.HyperParameters) -> Callable[..., distill
131131
"""
132132

133133
def model_forward_fn(
134-
model, input_tokens, positions, attention_mask, decoder_segment_ids=None, cache=None,
135-
**kwargs
134+
model, input_tokens, positions, attention_mask, decoder_segment_ids=None, cache=None, **kwargs
136135
) -> distillation_utils.DistillationForwardOutput:
137136
"""Forward pass wrapper adapted for raw MaxText models."""
138137
del attention_mask # Unused
@@ -215,7 +214,7 @@ def _train_step(self, model, optimizer, inputs):
215214

216215
batch = self.gen_model_input_fn(inputs)
217216

218-
def loss_wrapper(student, teacher, batch):
217+
def loss_wrapper(student, teacher, batch):
219218
if "teacher_output" in batch:
220219
teacher_output = batch["teacher_output"]
221220
else:
@@ -440,7 +439,7 @@ def train_distill(student_config: pyconfig.HyperParameters, teacher_config: pyco
440439

441440
# 3. Define Distillation Strategy
442441
def labels_fn(targets, targets_segmentation=None, **kwargs):
443-
"""Converts integer targets to masked one-hot vectors for hard label loss."""
442+
"""Converts integer targets to masked one-hot vectors for hard label loss."""
444443
del kwargs # Unused
445444
one_hot = jax.nn.one_hot(targets, student_config.vocab_size)
446445
mask = jnp.not_equal(targets, pad_id).astype(one_hot.dtype)[..., None]

tests/unit/train_distill_test.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,66 @@ def test_train_step_calls_teacher_forward_when_output_missing(self, mock_value_a
249249
self.assertEqual(loss, mock_loss)
250250
self.assertEqual(aux, mock_aux)
251251

252+
@mock.patch("maxtext.trainers.post_train.distillation.train_distill.jax.tree.map")
253+
@mock.patch("maxtext.trainers.post_train.distillation.train_distill.nnx.value_and_grad")
254+
def test_train_step_passes_targets_segmentation(self, mock_value_and_grad, mock_tree_map):
255+
"""Verifies strategy callbacks receive decoder_target_tokens and decoder_target_mask."""
256+
# 1. Initialize Trainer
257+
# pylint: disable=no-value-for-parameter
258+
trainer = train_distill.MaxTextDistillationTrainer.__new__(train_distill.MaxTextDistillationTrainer)
259+
trainer.strategy = mock.Mock()
260+
261+
# 2. Setup Batch WITH targets_segmentation
262+
mock_targets_segmentation = jnp.array([[1, 1, 0]])
263+
mock_batch = {
264+
"input_tokens": mock.Mock(),
265+
"positions": mock.Mock(),
266+
"attention_mask": mock.Mock(),
267+
"decoder_segment_ids": mock.Mock(),
268+
"targets": mock.Mock(),
269+
"targets_segmentation": mock_targets_segmentation,
270+
}
271+
trainer.gen_model_input_fn = mock.Mock(return_value=mock_batch)
272+
273+
# 3. Setup Models & Inputs
274+
teacher_model, student_model = mock.Mock(), mock.Mock()
275+
model_bundle = train_distill.ModelBundle(teacher_model=teacher_model, student_model=student_model)
276+
optimizer, inputs = mock.Mock(), mock.Mock()
277+
278+
# 4. Configure mocked nnx.value_and_grad
279+
mock_grad_fn = mock.Mock(return_value=((mock.Mock(), mock.Mock()), mock.Mock()))
280+
mock_value_and_grad.return_value = mock_grad_fn
281+
282+
# 5. Execute outer function & trigger inner loss_wrapper
283+
trainer._train_step(model_bundle, optimizer, inputs)
284+
loss_wrapper = mock_value_and_grad.call_args[0][0]
285+
loss_wrapper(student_model, teacher_model, mock_batch)
286+
287+
# 6. Assertions
288+
trainer.strategy.labels_fn.assert_called_once_with(
289+
mock_batch["targets"], targets_segmentation=mock_targets_segmentation
290+
)
291+
trainer.strategy.student_forward_fn.assert_called_once_with(
292+
model=student_model,
293+
input_tokens=mock_batch["input_tokens"],
294+
positions=mock_batch["positions"],
295+
attention_mask=mock_batch["attention_mask"],
296+
decoder_segment_ids=mock_batch["decoder_segment_ids"],
297+
decoder_target_tokens=mock_batch["targets"],
298+
decoder_target_mask=mock_targets_segmentation,
299+
cache=None,
300+
)
301+
trainer.strategy.teacher_forward_fn.assert_called_once_with(
302+
model=teacher_model,
303+
input_tokens=mock_batch["input_tokens"],
304+
positions=mock_batch["positions"],
305+
attention_mask=mock_batch["attention_mask"],
306+
decoder_segment_ids=mock_batch["decoder_segment_ids"],
307+
decoder_target_tokens=mock_batch["targets"],
308+
decoder_target_mask=mock_targets_segmentation,
309+
cache=None,
310+
)
311+
252312
def test_optimizer_factory(self):
253313
"""Verifies the optimizer factory injects hyperparams and handles configs."""
254314
# Mock config

0 commit comments

Comments (0)