Skip to content

Commit 27b3790

Browse files
Merge pull request #3332 from AI-Hypercomputer:vladk/distill-sft-hf
PiperOrigin-RevId: 881116982
2 parents d2c172a + bf2ead8 commit 27b3790

7 files changed

Lines changed: 131 additions & 15 deletions

File tree

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Copyright 2023–2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Soft Distillation Configuration
16+
17+
# Inherit MaxText defaults
18+
base_config: "post_train/distillation.yml"
19+
20+
use_sft: True
21+
sft_train_on_completion_only: True
22+
23+
# --- Dataset & Tokenizer ---
24+
hf_path: "HuggingFaceH4/ultrachat_200k"
25+
dataset_type: "hf"
26+
27+
# chat_template is required for sft mode & HF pipeline. Many tokenizers already provide it.
28+
# Some non-instruct versions of HF tokenizers have no chat template defined, so one has to specify it here
29+
chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = '<|begin_of_text|>' + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
30+
train_split: "train_sft"
31+
eval_split: "test_sft"
32+
train_data_columns: ["messages"]
33+
eval_data_columns: ["messages"]

src/maxtext/configs/post_train/distillation.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ per_device_batch_size: 2
5151
gradient_accumulation_steps: 8
5252

5353
# --- Learning Rate Schedule ---
54-
learning_rate: 2.0e-4
54+
learning_rate: 2.0e-4
5555
learning_rate_schedule_steps: 200000
5656
warmup_steps_fraction: 0.1
57-
cosine_learning_rate_final_fraction: 0.1
57+
learning_rate_final_fraction: 0.1

src/maxtext/configs/types.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import datetime
2020
import enum
2121
from enum import Enum
22+
from jinja2 import Environment, TemplateSyntaxError
2223
import logging
2324
import math
2425
from math import prod
@@ -926,6 +927,9 @@ class Tokenizer(BaseModel):
926927
tokenizer_type: TokenizerType = Field(TokenizerType.SENTENCEPIECE, description="The type of tokenizer.")
927928
use_chat_template: bool = Field(False, description="Whether to use the chat template for tokenization.")
928929
chat_template_path: str = Field("", description="Path to chat template json file.")
930+
chat_template: str = Field(
931+
"", description="Chat template to use with HF tokenizers. It should be a valid Jinja2-formatted template."
932+
)
929933
tokenize_train_data: bool = Field(True, description="If False, assumes the training dataset is pre-tokenized.")
930934
tokenize_eval_data: bool = Field(True, description="If False, assumes the evaluation dataset is pre-tokenized.")
931935
add_bos: bool = Field(True, description="Whether to add a beginning-of-sentence token.")
@@ -1991,6 +1995,15 @@ def set_derived_and_validate_values(self) -> "MaxTextConfig":
19911995
)
19921996
self.tokenizer_path = tokenizer_path
19931997

1998+
# validate chat_template format if defined
1999+
chat_template = getattr(self, "chat_template", "")
2000+
if chat_template:
2001+
try:
2002+
env = Environment()
2003+
env.parse(chat_template)
2004+
except TemplateSyntaxError as e:
2005+
raise ValueError(f"Specified chat_template is invalid: {e}") from e
2006+
19942007
# C. SET PRIMARY DEPENDENCIES & DEFAULTS
19952008
# If learning_rate_schedule_steps is -1, it defaults to the total number of training steps.
19962009
if self.learning_rate_schedule_steps == -1:

src/maxtext/input_pipeline/hf_data_processing.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414

1515
"""Input pipeline using Huggingface datasets."""
1616

17+
from typing import Optional
18+
1719
import ml_collections
1820

1921
import jax
@@ -213,6 +215,7 @@ def preprocessing_pipeline(
213215
grain_worker_count=1, # only support 0 or 1
214216
max_segments_per_seq=None,
215217
num_epoch=1,
218+
chat_template: Optional[str] = None,
216219
):
217220
"""pipeline for preprocessing HF dataset"""
218221
import datasets # pylint: disable=import-outside-toplevel
@@ -242,19 +245,22 @@ def preprocessing_pipeline(
242245
token=hf_access_token,
243246
)
244247

248+
dataset = dataset.select_columns(data_column_names)
249+
245250
if use_sft:
246-
dataset = dataset.select_columns(data_column_names)
251+
if chat_template:
252+
tokenizer.chat_template = chat_template
247253

248254
supported_columns = [["prompt", "completion"], ["messages"], ["question", "answer"]]
249255
assert any(
250256
set(data_column_names) == set(supported) for supported in supported_columns
251257
), f"Dataset column names mismatch. Expected columns to match one of {supported_columns}, but got {data_column_names}"
252258

253259
# convert instruction dataset to conversational format
260+
# currently only works for Q&A datasets
254261
dataset, data_column_names = instruction_data_processing.convert_to_conversational_format(
255262
dataset=dataset, data_columns=data_column_names, chat_template_path=chat_template_path
256263
)
257-
258264
assert input_pipeline_utils.is_conversational(
259265
dataset.features, data_column_names
260266
), "Dataset is not in conversational format."
@@ -276,8 +282,6 @@ def preprocessing_pipeline(
276282
input_pipeline_utils.apply_chat_template,
277283
fn_kwargs={"tokenizer_model": tokenizer, "data_column_name": data_column_names[0]},
278284
)
279-
else:
280-
dataset = dataset.select_columns(data_column_names)
281285

282286
pad_id = _get_pad_id(tokenizer)
283287

@@ -426,6 +430,7 @@ def make_hf_train_iterator(
426430
chat_template_path=config.chat_template_path,
427431
max_segments_per_seq=config.max_segments_per_seq,
428432
num_epoch=config.num_epoch,
433+
chat_template=config.chat_template,
429434
)
430435
return train_iter
431436

@@ -482,5 +487,6 @@ def make_hf_eval_iterator(
482487
sft_train_on_completion_only=config.sft_train_on_completion_only,
483488
chat_template_path=config.chat_template_path,
484489
max_segments_per_seq=config.max_segments_per_seq,
490+
chat_template=config.chat_template,
485491
)
486492
return eval_iter

src/maxtext/trainers/post_train/distillation/distillation_utils.py

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ class MaxTextTrainingInput(distillation_trainer.TrainingInput):
6060
decoder_segment_ids: jax.Array = None
6161
#: Ground truth target tokens (used for loss calculation and logging).
6262
targets: jax.Array = None
63+
#: Position indices for the target tokens.
64+
targets_position: jax.Array = None
65+
#: Segment IDs for packed target tokens.
66+
targets_segmentation: jax.Array = None
6367

6468

6569
# -----------------------------------------------------------------------------
@@ -106,6 +110,11 @@ def __next__(self) -> MaxTextTrainingInput:
106110
input_mask = jnp.ones_like(batch["inputs"], dtype=bool)
107111
seg_ids = None
108112

113+
# If in SFT-mode, 'targets' contains prompts which should be masked out when computing the loss.
114+
# When packing is used, the targets_segmentation mask is expected to be a combined target+packing mask
115+
targets_segmentation = batch.get("targets_segmentation", jnp.ones_like(batch["targets"]))
116+
targets_position = batch.get("targets_position", batch.get("inputs_position"))
117+
109118
# pylint: disable=unexpected-keyword-arg
110119
return MaxTextTrainingInput(
111120
input_tokens=batch["inputs"],
@@ -114,6 +123,8 @@ def __next__(self) -> MaxTextTrainingInput:
114123
positions=batch["inputs_position"],
115124
decoder_segment_ids=seg_ids,
116125
targets=batch["targets"],
126+
targets_position=targets_position,
127+
targets_segmentation=targets_segmentation,
117128
)
118129

119130

@@ -134,6 +145,7 @@ def __init__(
134145
layer_indices: Optional[List[int]] = None,
135146
feature_loss_fn: Callable[[jax.Array, jax.Array], jax.Array] | None = None,
136147
cosine_distance_axis: int | tuple[int, ...] = -1,
148+
sft_mode: bool = False,
137149
):
138150
"""Initializes the Combined strategy using tunix logit.LogitStrategy.
139151
@@ -165,6 +177,7 @@ def __init__(
165177
self.feature_loss_fn = lambda student_features, teacher_features: jnp.mean(
166178
optax.cosine_distance(student_features, teacher_features, axis=cosine_distance_axis)
167179
)
180+
self.sft_mode = sft_mode
168181

169182
def compute_loss(
170183
self,
@@ -192,19 +205,23 @@ def compute_loss(
192205
log_student_probs_temp = jax.nn.log_softmax(s_logits / self.temperature, axis=-1)
193206
teacher_probs_temp = jax.nn.softmax(t_logits / self.temperature, axis=-1)
194207

208+
# labels are expected to have all SFT masks applied by this point
209+
labels_mask = jnp.any(labels != 0, axis=-1, keepdims=True) if self.sft_mode else None
210+
mean_mask = jnp.squeeze(labels_mask, axis=-1) if labels_mask is not None else None
211+
195212
# KL(Teacher || Student)
196-
kl_div = optax.kl_divergence(log_student_probs_temp, teacher_probs_temp)
213+
kl_div = optax.kl_divergence(log_student_probs_temp, teacher_probs_temp, where=labels_mask)
197214

198215
# Scale gradients by T^2 (Hinton et al.)
199-
soft_loss = jnp.mean(kl_div) * (self.temperature**2)
216+
soft_loss = jnp.mean(kl_div, where=mean_mask) * (self.temperature**2)
200217

201218
# 1. Student Hard Loss (Existing)
202-
ce_loss_student = optax.softmax_cross_entropy(logits=s_logits, labels=labels)
203-
hard_loss = jnp.mean(ce_loss_student)
219+
ce_loss_student = optax.softmax_cross_entropy(logits=s_logits, labels=labels, where=labels_mask)
220+
hard_loss = jnp.mean(ce_loss_student, where=mean_mask)
204221

205222
# 2. Teacher Hard Loss (For Verification)
206-
ce_loss_teacher = optax.softmax_cross_entropy(logits=t_logits, labels=labels)
207-
teacher_hard_loss = jnp.mean(ce_loss_teacher)
223+
ce_loss_teacher = optax.softmax_cross_entropy(logits=t_logits, labels=labels, where=labels_mask)
224+
teacher_hard_loss = jnp.mean(ce_loss_teacher, where=mean_mask)
208225

209226
# 3. Combine losses
210227
base_logit_loss = (self.alpha * soft_loss) + ((1.0 - self.alpha) * hard_loss)

src/maxtext/trainers/post_train/distillation/train_distill.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -134,14 +134,15 @@ def model_forward_fn(
134134
model, input_tokens, positions, attention_mask, decoder_segment_ids=None, cache=None, **kwargs
135135
) -> distillation_utils.DistillationForwardOutput:
136136
"""Forward pass wrapper adapted for raw MaxText models."""
137-
del kwargs # Unused
138137
del attention_mask # Unused
139138
del cache # Unused
140139
logits = model(
141140
decoder_input_tokens=input_tokens,
142141
decoder_positions=positions,
143142
decoder_segment_ids=decoder_segment_ids,
144143
enable_dropout=config.enable_dropout,
144+
decoder_target_tokens=kwargs.get("targets", None),
145+
decoder_target_mask=kwargs.get("targets_segmentation", None),
145146
)
146147
out_projection_activations = None
147148
if config.distill_beta > 0.0:
@@ -213,6 +214,8 @@ def _prepare_inputs(
213214
positions=input_data.positions,
214215
decoder_segment_ids=input_data.decoder_segment_ids,
215216
targets=input_data.targets,
217+
targets_position=input_data.targets_position,
218+
targets_segmentation=input_data.targets_segmentation,
216219
)
217220

218221
def _post_process_train_step(self, aux: dict[str, jax.Array]) -> None:
@@ -352,11 +355,13 @@ def train_distill(student_config: pyconfig.HyperParameters, teacher_config: pyco
352355
teacher_model = get_maxtext_model(teacher_config, mesh)
353356

354357
# 3. Define Distillation Strategy
355-
def labels_fn(targets, **kwargs):
358+
def labels_fn(targets, targets_segmentation=None, **kwargs):
356359
"""Converts integer targets to masked one-hot vectors for hard label loss."""
357360
del kwargs # Unused
358361
one_hot = jax.nn.one_hot(targets, student_config.vocab_size)
359362
mask = jnp.not_equal(targets, pad_id).astype(one_hot.dtype)[..., None]
363+
if targets_segmentation is not None:
364+
mask = mask * (targets_segmentation != 0)[..., None]
360365
return one_hot * mask
361366

362367
# Both Student and Teacher use the same forward logic via the adapter
@@ -372,6 +377,7 @@ def labels_fn(targets, **kwargs):
372377
alpha=student_config.distill_alpha,
373378
beta_feature=student_config.distill_beta,
374379
layer_indices=student_config.distill_layer_indices,
380+
sft_mode=student_config.use_sft,
375381
)
376382

377383
student_model, teacher_model = strategy.pre_process_models(student_model, teacher_model)
@@ -436,6 +442,8 @@ def labels_fn(targets, **kwargs):
436442
"attention_mask": batch.input_mask,
437443
"decoder_segment_ids": batch.decoder_segment_ids,
438444
"targets": batch.targets, # Passed to strategy (labels_fn)
445+
"targets_position": batch.targets_position, # Passed to strategy (labels_fn)
446+
"targets_segmentation": batch.targets_segmentation, # Passed to strategy (labels_fn)
439447
"cache": None,
440448
}
441449
)

tests/unit/train_distill_test.py

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,27 @@ def test_maxtext_to_tunix_iterator(self):
7676
expected_mask = dummy_batch["inputs_segmentation"] != 0
7777
np.testing.assert_array_equal(tunix_input.input_mask, expected_mask)
7878

79+
def test_maxtext_to_tunix_iterator_sft(self):
80+
"""Verifies SFT-related fields are handled correctly."""
81+
# 1. Create a dummy batch with SFT fields
82+
dummy_batch_sft = {
83+
"inputs": np.array([[10, 11]]),
84+
"inputs_position": np.array([[0, 1]]),
85+
"targets": np.array([[11, 12]]),
86+
"targets_position": np.array([[100, 101]]), # Custom position
87+
"targets_segmentation": np.array([[0, 1]]), # Custom segmentation (mask)
88+
}
89+
dummy_iter_sft = iter([dummy_batch_sft])
90+
91+
# 2. Initialize Adapter and get output
92+
adapter_sft = distillation_utils.MaxTextToTunixIterator(dummy_iter_sft)
93+
tunix_input_sft = next(adapter_sft)
94+
95+
# 3. Verify SFT fields are passed through
96+
self.assertIsInstance(tunix_input_sft, distillation_utils.MaxTextTrainingInput)
97+
np.testing.assert_array_equal(tunix_input_sft.targets_position, dummy_batch_sft["targets_position"])
98+
np.testing.assert_array_equal(tunix_input_sft.targets_segmentation, dummy_batch_sft["targets_segmentation"])
99+
79100
def test_maxtext_to_tunix_iterator_packed_fallback(self):
80101
"""Verifies fallback behavior when segmentation is missing."""
81102
dummy_batch = {
@@ -161,6 +182,12 @@ def test_optimizer_factory(self):
161182
train_distill.get_distillation_optimizer(config, max_train_steps=100)
162183

163184
def test_monitored_strategy(self):
185+
self._test_monitored_strategy(False)
186+
187+
def test_monitored_strategy_sft(self):
188+
self._test_monitored_strategy(True)
189+
190+
def _test_monitored_strategy(self, sft_mode: bool):
164191
"""Verifies the strategy calculates metrics and returns the correct tuple."""
165192
strategy = distillation_utils.CombinedDistillationStrategy(
166193
student_forward_fn=lambda m, **k: None,
@@ -170,6 +197,7 @@ def test_monitored_strategy(self):
170197
alpha=0.5,
171198
beta_feature=1.0,
172199
layer_indices=None,
200+
sft_mode=sft_mode,
173201
)
174202

175203
# Dummy inputs (batch=1, seq=2, vocab=4)
@@ -210,9 +238,17 @@ def test_monitored_strategy(self):
210238
self.assertLess(metrics["distill/out_proj_feature_loss"], 1e-5)
211239

212240
def test_strategy_compute_eval_loss(self):
241+
self._verify_strategy_compute_eval_loss(sft_mode=False)
242+
243+
def _verify_strategy_compute_eval_loss(self, sft_mode):
213244
"""Covers MonitoredLogitStrategy.compute_eval_loss."""
214245
strategy = distillation_utils.CombinedDistillationStrategy(
215-
student_forward_fn=mock.Mock(), teacher_forward_fn=mock.Mock(), labels_fn=mock.Mock(), temperature=1.0, alpha=0.5
246+
student_forward_fn=mock.Mock(),
247+
teacher_forward_fn=mock.Mock(),
248+
labels_fn=mock.Mock(),
249+
temperature=1.0,
250+
alpha=0.5,
251+
sft_mode=sft_mode,
216252
)
217253
# Case where feature loss is enabled
218254
logits = distillation_utils.DistillationForwardOutput(
@@ -234,6 +270,9 @@ def test_strategy_compute_eval_loss(self):
234270
self.assertTrue(isinstance(loss, jax.Array))
235271
self.assertEqual(aux, {})
236272

273+
def test_strategy_compute_eval_loss_sft(self):
274+
self._verify_strategy_compute_eval_loss(sft_mode=True)
275+
237276
def test_setup_pipeline_grain_enabled(self):
238277
"""Covers _setup_and_restore_input_pipeline when Grain IS detected."""
239278
mock_trainer = mock.Mock()

0 commit comments

Comments
 (0)