
Commit 1d29ce1

Merge pull request #2989 from AI-Hypercomputer:docs_fix
PiperOrigin-RevId: 859721293
2 parents: 8204907 + f02444f

4 files changed: 11 additions & 3 deletions


docs/conf.py

Lines changed: 2 additions & 0 deletions
@@ -97,6 +97,7 @@
     "cloud_tpu_diagnostics",
     "google_cloud_mldiagnostics",
     "jetstream",
+    "librosa",
     "ml_goodput_measurement",
     "pathwaysutils",
     "safetensors",

@@ -178,6 +179,7 @@ def run_apidoc(_):
       os.path.join(MAXTEXT_REPO_ROOT, "src", "MaxText", "scratch_code"),
       os.path.join(MAXTEXT_REPO_ROOT, "src", "MaxText", "utils", "ckpt_conversion"),
       os.path.join(MAXTEXT_REPO_ROOT, "src", "MaxText", "rl"),
+      os.path.join(MAXTEXT_REPO_ROOT, "src", "MaxText", "multimodal_utils.py"),
   ]

   # Run the command and check for errors
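The first hunk adds "librosa" to a list of module names in docs/conf.py that, by all appearances, feeds Sphinx's import mocking for the docs build (the variable name is not shown in the hunk, so that is an assumption). A minimal sketch of what such mocking buys, using `unittest.mock` directly to illustrate the idea:

```python
# Sketch: register a stand-in so "import librosa" succeeds even when the
# real package is not installed in the docs-build environment. Sphinx's
# autodoc mocking performs an equivalent substitution internally.
import sys
from unittest import mock

sys.modules["librosa"] = mock.MagicMock()

import librosa  # resolves to the MagicMock registered above

# Attribute access and calls on the mock succeed, which is all a docs build
# needs to import modules that merely reference librosa at module level.
duration = librosa.get_duration(path="clip.wav")  # no real I/O happens
print(type(duration).__name__)  # prints "MagicMock"
```

The second hunk excludes `multimodal_utils.py` from apidoc generation in the same spirit: keep the docs build from touching modules it cannot (or should not) process.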

docs/tutorials/posttraining/sft.md

Lines changed: 6 additions & 0 deletions
@@ -15,6 +15,7 @@
 -->
 
 # SFT on single-host TPUs
+
 Supervised fine-tuning (SFT) is a process where a pre-trained large language model is fine-tuned on a labeled dataset to adapt the model to perform better on specific tasks.
 
 This tutorial demonstrates step-by-step instructions for setting up the environment and then training the model on a Hugging Face dataset using SFT.

@@ -64,16 +65,19 @@ export TRAIN_DATA_COLUMNS=<data columns to train on> # e.g., ['messages']
 ```
 
 ## Get your model checkpoint
+
 This section explains how to prepare your model checkpoint for use with MaxText. You have two options: using an existing MaxText checkpoint or converting a Hugging Face checkpoint.
 
 ### Option 1: Using an existing MaxText checkpoint
+
 If you already have a MaxText-compatible model checkpoint, simply set the following environment variable and move on to the next section.
 
 ```sh
 export PRE_TRAINED_MODEL_CKPT_PATH=<gcs path for MaxText checkpoint> # e.g., gs://my-bucket/my-model-checkpoint/0/items
 ```
 
 ### Option 2: Converting a Hugging Face checkpoint
+
 If your model checkpoint is from Hugging Face, you need to run a conversion script to make it MaxText-compatible.
 
 1. **Set the Output Path:** First, define where the converted MaxText checkpoint will be saved. For example:

@@ -101,6 +105,7 @@ export PRE_TRAINED_MODEL_CKPT_PATH=${PRE_TRAINED_MODEL_CKPT_DIRECTORY}/0/items
 ```
 
 ## Run SFT on Hugging Face Dataset
+
 Now you are ready to run SFT using the following command:
 
 ```sh

@@ -118,4 +123,5 @@ python3 -m MaxText.sft.sft_trainer src/MaxText/configs/sft.yml \
   train_data_columns=${TRAIN_DATA_COLUMNS} \
   profiler=xplane
 ```
+
 Your fine-tuned model checkpoints will be saved here: `$BASE_OUTPUT_DIRECTORY/$RUN_NAME/checkpoints`.

src/MaxText/maxengine.py

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@
 """Implementation of Engine API for MaxText."""
 
 from collections import defaultdict
-from typing import Any, Callable
+from typing import Any, Callable, Union
 import functools
 import os.path
 import uuid

@@ -102,7 +102,7 @@ class MaxEngine(engine_api.Engine):
   JetStream efficient serving infrastructure.
   """
 
-  def __init__(self, config: Any, devices: config_lib.Devices | None = None):
+  def __init__(self, config: Any, devices: Union[config_lib.Devices, None] = None):
     self.config = config
 
     # Mesh definition
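The maxengine.py hunk swaps the PEP 604 union syntax (`config_lib.Devices | None`) for `typing.Union`, which is also valid as a runtime annotation on Python versions before 3.10. A minimal sketch of the equivalence, with a hypothetical `Devices` stand-in since `config_lib` is not shown in the diff:

```python
from typing import Optional, Union


class Devices:  # hypothetical stand-in for config_lib.Devices
  pass


def init(devices: Union[Devices, None] = None) -> str:
  # Mirrors the __init__ signature pattern in the hunk above.
  return "default devices" if devices is None else "explicit devices"


# Union[X, None], Optional[X], and (on Python 3.10+) X | None all denote
# the same type.
assert Union[Devices, None] == Optional[Devices]
print(init())  # prints "default devices"
```

The behavior is identical; the rewrite only widens the range of Python versions on which the annotation can be evaluated.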

src/MaxText/multimodal/utils.py

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@
 class PreprocessorOutput:
   """Holds the output of an image preprocessor.
 
-  Attributes:
+  Args:
     pixel_values: A JAX array containing the processed image pixel data.
       The shape and format depend on the specific model and
       preprocessing steps (e.g., [H, W, C] for Gemma3 or
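This hunk renames the docstring section from "Attributes:" to "Args:". In Google docstring style, "Args:" documents parameters and "Attributes:" documents instance attributes; for a simple output container the two largely coincide. A hypothetical sketch of the pattern (the real `PreprocessorOutput` may have more fields than the hunk shows, and its use of `@dataclass` is an assumption):

```python
from dataclasses import dataclass, field


@dataclass
class PreprocessorOutput:
  """Holds the output of an image preprocessor.

  Args:
    pixel_values: Processed image pixel data, e.g. shaped [H, W, C].
  """

  # Plain list used here for a self-contained sketch; the real field holds
  # a JAX array.
  pixel_values: list = field(default_factory=list)


out = PreprocessorOutput(pixel_values=[[0.0, 0.5, 1.0]])
print(len(out.pixel_values))  # prints 1
```

With a dataclass, the constructor parameters are exactly the fields, which is why either section header can plausibly describe them.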
