# Reinforcement Learning on single-host TPUs

This tutorial provides step-by-step instructions for setting up the environment and then training the Llama 3.1 8B-IT model on the GSM8K math reasoning dataset using a single-host TPU VM such as `v6e-8` or `v5p-8`.

We utilize two RL algorithms, implemented via the Tunix library, to enhance the model's reasoning capabilities:

- **Group Relative Policy Optimization (GRPO)**: GRPO is an RL algorithm designed to enhance the reasoning abilities of LLMs. It is a variant of Proximal Policy Optimization (PPO) that reduces memory usage by eliminating the need for a separate value function model. GRPO works by generating multiple responses for a given prompt, evaluating these responses using a reward model, and then calculating a relative advantage based on the group's performance to update the policy.
- **Group Sequence Policy Optimization (GSPO)**: GSPO is an RL algorithm that improves the training efficiency and performance of LLMs by using sequence-level importance ratios and operations. GSPO defines the importance ratio based on sequence likelihood and performs sequence-level clipping, rewarding, and optimization. The core quantity behind each objective is sketched after this list.
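
For reference, here is the core quantity behind each objective, following the respective papers. GRPO's group-relative advantage for response $i$ in a group of $G$ sampled responses with rewards $r_1, \dots, r_G$ is

$$
\hat{A}_i = \frac{r_i - \operatorname{mean}(\{r_1, \dots, r_G\})}{\operatorname{std}(\{r_1, \dots, r_G\})},
$$

while GSPO swaps token-level ratios for a length-normalized, sequence-level importance ratio for response $y_i$ given prompt $x$:

$$
s_i(\theta) = \left( \frac{\pi_\theta(y_i \mid x)}{\pi_{\theta_{\mathrm{old}}}(y_i \mid x)} \right)^{1/|y_i|}.
$$

These are only the defining quantities; the full clipped objectives appear in the GRPO and GSPO papers.
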
For efficient model inference and response generation during this process, we rely on the vLLM library.

Let's get started!

## Create virtual environment and Install MaxText dependencies
If you have already completed the [MaxText installation](../../install_maxtext.md), you can skip to the next section to install the post-training dependencies. Otherwise, please install `MaxText` using the following commands before proceeding.

> **Caution:** RL in MaxText is currently broken with PyPI releases of post-training dependencies. We are working on fixing this and recommend following [Option 2: From Github](#option-2-from-github) in the meantime.

Next, run the following bash script to get all the necessary installations inside the virtual environment (e.g., `maxtext_venv`). This will take a few minutes. Follow the installation logs and look out for any issues!

Primarily, it installs `Tunix` and `vllm-tpu`, which comprises [vllm](https://github.com/vllm-project/vllm) and [tpu-inference](https://github.com/vllm-project/tpu-inference), thereby providing TPU inference for vLLM with unified JAX and PyTorch support.
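
If you just want to see the shape of the PyPI route, here is a minimal sketch. The distribution names are assumptions (`google-tunix` for Tunix; `vllm-tpu` as named above), and per the caution earlier this route is currently broken, so prefer the repo's install script or Option 2:

```bash
# Illustrative only -- the package names here are assumptions, the repo's
# install script is the source of truth, and PyPI releases are currently
# broken for RL in MaxText (see the caution above).
pip install google-tunix vllm-tpu
```
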
### Option 2: From Github
You can also git clone [tunix](https://github.com/google/tunix) locally and install it using the instructions [here](https://github.com/google/tunix?tab=readme-ov-file#installation). Similarly, install [vllm](https://github.com/vllm-project/vllm) and [tpu-inference](https://github.com/vllm-project/tpu-inference) from source following the instructions [here](https://docs.vllm.ai/projects/tpu/en/latest/getting_started/installation/#install-from-source).
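
As a starting point, the clone step might look like the sketch below; the actual build and install of each project should follow its linked instructions:

```bash
# Clone the source repos side by side; install each one per its own docs.
git clone https://github.com/google/tunix.git
git clone https://github.com/vllm-project/vllm.git
git clone https://github.com/vllm-project/tpu-inference.git
```
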
1. Within the workflow run, find and click on the `maxtext_jupyter_notebooks (py312)` job, then expand the `run` job.
1. Locate the `Record Commit IDs` step. The commit SHAs for `maxtext`, `tunix`, `tpu-inference`, and `vllm` that were used in that successful run are listed in the logs of this step.
1. Prior to installation, ensure that the `maxtext`, `tunix`, `vllm`, and `tpu-inference` repositories are synchronized to the specific commits recorded in the CI logs. For each repository, run `git checkout <commit_id>` to switch to the correct commit, as in the sketch below.
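
For example, assuming the repositories sit side by side and using placeholder commit IDs (the real values come from the CI logs):

```bash
# Pin each repo to the commit recorded in the CI logs.
# The <..._commit_id> values are placeholders, not real commits.
(cd maxtext && git checkout <maxtext_commit_id>)
(cd tunix && git checkout <tunix_commit_id>)
(cd vllm && git checkout <vllm_commit_id>)
(cd tpu-inference && git checkout <tpu_inference_commit_id>)
```
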
## Set up environment variables
### Option 1: Using an existing MaxText checkpoint
If you already have a MaxText-compatible model checkpoint, simply set the following environment variable and move on to the next section.

```bash
export MAXTEXT_CKPT_PATH=<gcs path for MaxText checkpoint>  # e.g., gs://my-bucket/my-model-checkpoint/0/items
```
### Option 2: Converting from a Hugging Face checkpoint

Otherwise, you can convert a Hugging Face checkpoint to MaxText format using the `src/MaxText/utils/ckpt_conversion/to_maxtext.py` script. This is useful if you have a pre-trained model from Hugging Face that you want to use with MaxText.

First, ensure you have the necessary dependencies installed. Then, run the conversion script on a CPU machine. For large models, it is recommended to use the `--lazy_load_tensors` flag to reduce memory usage during conversion. This command will download the Hugging Face model and convert it to the MaxText format, saving it to the specified GCS bucket.
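
As a sketch only, the invocation takes roughly the following shape. The script path and `--lazy_load_tensors` come from this tutorial; every other argument name and value is a hypothetical placeholder, so check the script's `--help` for the actual interface:

```bash
# Hypothetical sketch -- only the script path and --lazy_load_tensors are
# confirmed above; the remaining arguments are illustrative placeholders.
python3 src/MaxText/utils/ckpt_conversion/to_maxtext.py \
  model_name=<model name> \
  hf_access_token=<your HF token> \
  base_output_directory=<gcs output path> \
  --lazy_load_tensors
```
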
The converted checkpoint will be saved at the following location. Set this environment variable to use it in the following GRPO/GSPO training sessions:
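
A placeholder sketch of that export, mirroring the earlier example path (the actual location comes from the conversion script's output):

```bash
# Assumed path shape -- confirm the real location from the conversion output.
export MAXTEXT_CKPT_PATH=<converted checkpoint path>  # e.g., gs://my-bucket/my-model-checkpoint/0/items
```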