@@ -307,7 +307,7 @@ def _prepare_inputs(
         targets_position=input_data.targets_position,
         targets_segmentation=input_data.targets_segmentation,
         top_k_logits=input_data.top_k_logits,
-        top_k_indices=input_data.top_k_indices
+        top_k_indices=input_data.top_k_indices,
     )
 
   def _post_process_train_step(self, aux: dict[str, jax.Array]) -> None:
@@ -406,7 +406,12 @@ def get_maxtext_model(config: pyconfig.HyperParameters, mesh: jax.sharding.Mesh)
 # -----------------------------------------------------------------------------
 
 
-def train_distill(student_config: pyconfig.HyperParameters, teacher_config: pyconfig.HyperParameters, is_offline: bool = False, offline_data_dir: str | None = None) -> None:
+def train_distill(
+    student_config: pyconfig.HyperParameters,
+    teacher_config: pyconfig.HyperParameters,
+    is_offline: bool = False,
+    offline_data_dir: str | None = None,
+) -> None:
   """Main distillation training loop.
 
   Orchestrates the loading of both student and teacher models, configures the
@@ -550,29 +555,23 @@ def custom_gen_model_input_fn(batch):
         "targets_segmentation": batch.targets_segmentation,
         "cache": None,
     }
-    
+
     # If we are in online mode then we exit
     if getattr(batch, "top_k_logits", None) is None:
       return inputs_dict
 
     # Scatter the offline arrays into a dense tensor of -10000s
     dense_shape = batch.input_tokens.shape + (student_config.vocab_size,)
     dense_logits = jnp.full(dense_shape, -10000.0, dtype=jnp.float32)
-    dense_logits = jnp.put_along_axis(
-        dense_logits,
-        batch.top_k_indices,
-        batch.top_k_logits,
-        axis=-1,
-        inplace=False
-    )
-
+    dense_logits = jnp.put_along_axis(dense_logits, batch.top_k_indices, batch.top_k_logits, axis=-1, inplace=False)
+
     # Inject it as teacher_output so the trainer skips the teacher forward pass
     inputs_dict["teacher_output"] = distillation_utils.DistillationForwardOutput(
         logits=dense_logits, out_projection_activations=None
     )
-    
+
     return inputs_dict
-  
+
   trainer = trainer.with_gen_model_input_fn(custom_gen_model_input_fn)
 
   # 9. Create Iterator Wrappers (Use Utils)
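For readers skimming the hunk above, here is a minimal self-contained sketch of the top-k scatter (toy shapes; everything except jnp.put_along_axis is illustrative, not taken from the PR): the dense tensor starts at -10000.0 so vocab slots outside the stored top-k carry roughly zero probability mass after softmax, and the saved teacher logits are scattered back to their original vocab indices.

    import jax.numpy as jnp

    # Toy shapes standing in for batch.input_tokens.shape + (vocab_size,).
    batch, seq, vocab, k = 2, 4, 8, 3
    top_k_logits = jnp.arange(batch * seq * k, dtype=jnp.float32).reshape(batch, seq, k)
    top_k_indices = jnp.broadcast_to(jnp.array([1, 4, 6]), (batch, seq, k))

    # Slots outside the top-k stay at -10000.0, i.e. ~zero mass after softmax.
    dense = jnp.full((batch, seq, vocab), -10000.0, dtype=jnp.float32)
    # JAX arrays are immutable, so inplace=False is required; a new array is returned.
    dense = jnp.put_along_axis(dense, top_k_indices, top_k_logits, axis=-1, inplace=False)
    assert float(dense[0, 0, 1]) == float(top_k_logits[0, 0, 0])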
@@ -635,7 +634,7 @@ def main(argv: Sequence[str], local_args) -> None:
   student_config = pyconfig.initialize(argv, **student_overrides)
 
   # 3. Initialize TEACHER Config
-  # We isolate the Teacher from Student CLI arguments (like pruning params).  
+  # We isolate the Teacher from Student CLI arguments (like pruning params).
   teacher_overrides = global_config.teacher_overrides
 
   # Ensure load_parameters_path is set in overrides
@@ -668,7 +667,7 @@ def main(argv: Sequence[str], local_args) -> None:
668667 default = None ,
669668 help = "GCS or local path to the pre-generated ArrayRecord teacher data." ,
670669 )
671-
670+
672671 # parse_known_args separates our custom flags from MaxText's standard args
673672 local_arg , remaining_args = parser .parse_known_args ()
674673
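As a side note, a minimal sketch of the parse_known_args pattern in the last hunk; the flag name is inferred from the offline_data_dir parameter and should be treated as an assumption, not the PR's exact CLI:

    import argparse

    parser = argparse.ArgumentParser()
    # Hypothetical flag mirroring the help text shown in the diff above.
    parser.add_argument(
        "--offline_data_dir",
        type=str,
        default=None,
        help="GCS or local path to the pre-generated ArrayRecord teacher data.",
    )
    # Our custom flags are consumed here; anything MaxText understands is left
    # untouched in remaining_args and can be forwarded to pyconfig.initialize.
    local_arg, remaining_args = parser.parse_known_args()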