# Copyright 2023–2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Soft Distillation Configuration

# Inherit MaxText defaults
base_config: "base.yml"

# --- Student Specifics ---
# These are passed as kwargs to the Student config initialization
student_overrides:
  model_name: "llama3.1-8b"

# --- Teacher Specifics ---
# These are passed as kwargs to the Teacher config initialization
teacher_overrides:
  model_name: "llama3.1-8b"

# --- Distillation Loss ---
distill_alpha: 0.5
distill_temperature: 1.0
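# How these two knobs are conventionally combined (assumed here for
# illustration; the exact objective lives in the training code): a standard
# Hinton-style soft-distillation loss is
#   loss = alpha * CE(student_logits, labels)
#        + (1 - alpha) * T^2 * KL(softmax(teacher_logits / T) || softmax(student_logits / T))
# where alpha = distill_alpha and T = distill_temperature. alpha = 0.5 weights
# the hard-label and teacher terms equally, and T = 1.0 leaves both
# distributions unsoftened.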

# --- Dataset & Tokenizer ---
hf_path: "OptimalScale/ClimbMix"
dataset_type: "hf"
tokenizer_path: "meta-llama/Llama-3.1-8B"
tokenizer_type: "huggingface"

max_target_length: 2048

# --- Training Loop ---
steps: 200000
checkpoint_period: 2000
log_period: 10
save_checkpoint_on_completion: True

# --- Batch Size Strategy ---
# Global Batch Size = per_device_batch_size * num_devices * gradient_accumulation_steps
per_device_batch_size: 2
gradient_accumulation_steps: 8
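# Worked example (the device count is illustrative, not set in this file):
# on 64 chips, 2 per device * 64 devices * 8 accumulation steps
# = 1,024 sequences per optimizer step, i.e. 1,024 * 2,048 ≈ 2.1M tokens
# per step at max_target_length = 2048.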

# --- Learning Rate Schedule ---
learning_rate: 2.0e-4
learning_rate_schedule_steps: 200000
warmup_steps_fraction: 0.1
cosine_learning_rate_final_fraction: 0.1
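# With these values, and assuming MaxText's usual linear-warmup-then-cosine
# schedule: warmup lasts 0.1 * 200,000 = 20,000 steps up to the 2.0e-4 peak,
# after which cosine decay takes the rate down to 0.1 * 2.0e-4 = 2.0e-5
# by step 200,000.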