Skip to content

Commit d3ebd81

Browse files
author
Charles Li
committed
Revert "Add run_tests_coordinator"
This reverts commit 8f501bd.
1 parent 4ea0cdc commit d3ebd81

8 files changed

Lines changed: 156 additions & 268 deletions

.github/workflows/build_and_push_docker_image.yml

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,10 @@ on:
4141
required: false
4242
type: string
4343
default: ''
44+
is_post_training:
45+
required: false
46+
type: boolean
47+
default: false
4448

4549
permissions:
4650
contents: read
@@ -78,9 +82,7 @@ jobs:
7882
ref: ${{ inputs.maxtext_sha }}
7983

8084
- name: Checkout post-training dependencies
81-
if: |
82-
steps.check.outputs.should_run == 'true' &&
83-
contains(inputs.image_name, 'post_training_nightly')
85+
if: steps.check.outputs.should_run == 'true' && inputs.image_name == 'maxtext_post_training_nightly'
8486
run: |
8587
git clone https://github.com/google/tunix.git ./tunix
8688
git clone https://github.com/vllm-project/vllm.git ./vllm
@@ -108,7 +110,8 @@ jobs:
108110
push: true
109111
context: .
110112
file: ${{ inputs.dockerfile }}
111-
tags: gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:${{ inputs.image_date }}-build-${{ github.run_id }}
113+
tags: gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:latest
114+
cache-from: type=gha
112115
outputs: type=image,compression=zstd,force-compression=true
113116
build-args: |
114117
DEVICE=${{ inputs.device }}
@@ -123,19 +126,23 @@ jobs:
123126
shell: bash
124127
run: |
125128
SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}"
126-
TEMP_IMG="$SOURCE_IMAGE:${{ inputs.image_date }}-build-${{ github.run_id }}"
129+
130+
# Add date tag
131+
gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:${{ inputs.image_date }}" --quiet
127132
128133
# Convert date to YYYYMMDD format
129134
clean_date=$(echo "${{ inputs.image_date }}" | sed 's/[-:]//g' | cut -c1-8)
130135
131136
# Add MaxText tag
132137
maxtext_hash=$(git rev-parse --short HEAD)
133-
gcloud container images add-tag "$TEMP_IMG" "$SOURCE_IMAGE:maxtext_${maxtext_hash}_${clean_date}" --quiet
138+
gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:maxtext_${maxtext_hash}_${clean_date}" --quiet
134139
135140
# Add post-training dependencies tags
136-
for dir in tunix vllm tpu-inference; do
137-
if [ -d "./$dir" ]; then
138-
dir_hash=$(git -C "$dir" rev-parse --short HEAD)
139-
gcloud container images add-tag "$TEMP_IMG" "$SOURCE_IMAGE:${dir}_${dir_hash}_${clean_date}" --quiet
140-
fi
141-
done
141+
if [ "${{ inputs.is_post_training }}" == "true" ]; then
142+
for dir in tunix vllm tpu-inference; do
143+
if [ -d "./$dir" ]; then
144+
dir_hash=$(git -C "$dir" rev-parse --short HEAD)
145+
gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:${dir}_${dir_hash}_${clean_date}" --quiet
146+
fi
147+
done
148+
fi

.github/workflows/build_and_test_maxtext.yml

Lines changed: 112 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -113,47 +113,72 @@ jobs:
113113
with:
114114
device_type: tpu
115115
device_name: v6e-4
116-
base_image: maxtext-unit-test-tpu:${{ matrix.image_type }}
116+
image_type: ${{ matrix.image_type }}
117117
cloud_runner: linux-x86-ct6e-180-4tpu
118118
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
119119
secrets:
120120
HF_TOKEN: ${{ secrets.HF_TOKEN }}
121121

122-
tpu-tests:
123-
needs: [build_and_upload_maxtext_package]
122+
maxtext_cpu_unit_tests:
123+
needs: build_and_upload_maxtext_package
124124
if: needs.doc_only_check.outputs.run_tests == 'true'
125+
uses: ./.github/workflows/run_tests_against_package.yml
125126
strategy:
126-
fail-fast: false
127-
matrix:
128-
flavor: [tpu-unit, tpu-integration]
129-
uses: ./.github/workflows/run_tests_coordinator.yml
127+
fail-fast: false # don't cancel all jobs on failure
128+
matrix:
129+
image_type: ["py312"]
130+
worker_group: [1, 2]
130131
with:
131-
flavor: ${{ matrix.flavor }}
132-
base_image: maxtext-unit-test-tpu:py312
132+
device_type: cpu
133+
device_name: X64
134+
cloud_runner: linux-x86-n2-16
135+
image_type: ${{ matrix.image_type }}
136+
pytest_marker: 'cpu_only'
137+
xla_python_client_mem_fraction: 0.75
138+
tf_force_gpu_allow_growth: false
139+
container_resource_option: "--privileged"
133140
is_scheduled_run: ${{ github.event_name == 'schedule' }}
141+
worker_group: ${{ matrix.worker_group }}
142+
total_workers: 2
134143
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
135144

136-
gpu-tests:
137-
needs: [build_and_upload_maxtext_package]
145+
maxtext_tpu_unit_tests:
146+
needs: build_and_upload_maxtext_package
138147
if: needs.doc_only_check.outputs.run_tests == 'true'
148+
uses: ./.github/workflows/run_tests_against_package.yml
139149
strategy:
140-
fail-fast: false
141-
matrix:
142-
flavor: [gpu-unit, gpu-integration]
143-
uses: ./.github/workflows/run_tests_coordinator.yml
150+
fail-fast: false
151+
matrix:
152+
image_type: ["py312"]
144153
with:
145-
flavor: ${{ matrix.flavor }}
146-
base_image: maxtext-unit-test-cuda12:py312
154+
device_type: tpu
155+
device_name: v6e-4
156+
image_type: ${{ matrix.image_type }}
157+
cloud_runner: linux-x86-ct6e-180-4tpu
158+
pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
159+
xla_python_client_mem_fraction: 0.75
160+
tf_force_gpu_allow_growth: false
161+
container_resource_option: "--privileged"
147162
is_scheduled_run: ${{ github.event_name == 'schedule' }}
148163
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
149164

150-
cpu-tests:
151-
needs: [build_and_upload_maxtext_package]
165+
maxtext_tpu_integration_tests:
166+
needs: build_and_upload_maxtext_package
152167
if: needs.doc_only_check.outputs.run_tests == 'true'
153-
uses: ./.github/workflows/run_tests_coordinator.yml
168+
uses: ./.github/workflows/run_tests_against_package.yml
169+
strategy:
170+
fail-fast: false
171+
matrix:
172+
image_type: ["py312"]
154173
with:
155-
flavor: cpu-unit
156-
base_image: maxtext-unit-test-tpu:py312
174+
device_type: tpu
175+
device_name: v6e-4
176+
image_type: ${{ matrix.image_type }}
177+
cloud_runner: linux-x86-ct6e-180-4tpu
178+
pytest_marker: 'not cpu_only and not gpu_only and integration_test'
179+
xla_python_client_mem_fraction: 0.75
180+
tf_force_gpu_allow_growth: false
181+
container_resource_option: "--privileged"
157182
is_scheduled_run: ${{ github.event_name == 'schedule' }}
158183
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
159184

@@ -163,12 +188,14 @@ jobs:
163188
uses: ./.github/workflows/run_pathways_tests.yml
164189
strategy:
165190
fail-fast: false
191+
matrix:
192+
image_type: ["py312"]
166193
with:
167194
device_type: tpu
168195
device_name: v6e-4
169-
base_image: maxtext-unit-test-tpu:py312
196+
image_type: ${{ matrix.image_type }}
170197
cloud_runner: linux-x86-ct6e-180-4tpu
171-
pytest_marker: 'not cpu_only and not gpu_only and not integration_test and not post_training'
198+
pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
172199
xla_python_client_mem_fraction: 0.75
173200
tf_force_gpu_allow_growth: false
174201
container_resource_option: "--privileged"
@@ -181,38 +208,85 @@ jobs:
181208
uses: ./.github/workflows/run_pathways_tests.yml
182209
strategy:
183210
fail-fast: false
211+
matrix:
212+
image_type: ["py312"]
184213
with:
185214
device_type: tpu
186215
device_name: v6e-4
187-
base_image: maxtext-unit-test-tpu:py312
216+
image_type: ${{ matrix.image_type }}
188217
cloud_runner: linux-x86-ct6e-180-4tpu
189-
pytest_marker: 'not cpu_only and not gpu_only and integration_test and not post_training'
218+
pytest_marker: 'not cpu_only and not gpu_only and integration_test'
190219
xla_python_client_mem_fraction: 0.75
191220
tf_force_gpu_allow_growth: false
192221
container_resource_option: "--privileged"
193222
is_scheduled_run: ${{ github.event_name == 'schedule' }}
194223
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
195224

225+
maxtext_gpu_unit_tests:
226+
needs: build_and_upload_maxtext_package
227+
if: needs.doc_only_check.outputs.run_tests == 'true'
228+
uses: ./.github/workflows/run_tests_against_package.yml
229+
strategy:
230+
fail-fast: false
231+
matrix:
232+
image_type: ["py312"]
233+
cuda: ["cuda12"]
234+
with:
235+
device_type: ${{ matrix.cuda }}
236+
device_name: a100-40gb-4
237+
image_type: ${{ matrix.image_type }}
238+
cloud_runner: linux-x86-a2-48-a100-4gpu
239+
pytest_marker: 'not cpu_only and not tpu_only and not integration_test'
240+
xla_python_client_mem_fraction: 0.65
241+
tf_force_gpu_allow_growth: true
242+
container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
243+
is_scheduled_run: ${{ github.event_name == 'schedule' }}
244+
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
245+
246+
maxtext_gpu_integration_tests:
247+
needs: build_and_upload_maxtext_package
248+
if: needs.doc_only_check.outputs.run_tests == 'true'
249+
uses: ./.github/workflows/run_tests_against_package.yml
250+
strategy:
251+
fail-fast: false
252+
matrix:
253+
image_type: ["py312"]
254+
cuda: ["cuda12"]
255+
with:
256+
device_type: ${{ matrix.cuda }}
257+
device_name: a100-40gb-4
258+
image_type: ${{ matrix.image_type }}
259+
cloud_runner: linux-x86-a2-48-a100-4gpu
260+
pytest_marker: 'not cpu_only and not tpu_only and integration_test'
261+
xla_python_client_mem_fraction: 0.65
262+
tf_force_gpu_allow_growth: true
263+
container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
264+
is_scheduled_run: ${{ github.event_name == 'schedule' }}
265+
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
266+
196267
all_tests_passed:
197268
name: All Required Tests Passed
198-
needs: [tpu-tests, gpu-tests, cpu-tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]
269+
needs: [doc_only_check, build_and_upload_maxtext_package, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests]
199270
if: always()
200271
runs-on: ubuntu-latest
201272
steps:
202273
- name: Check test results
203274
run: |
275+
# If doc-only, all tests should be skipped
204276
if [ "${{ needs.doc_only_check.outputs.run_tests }}" == "false" ]; then
205277
echo "Documentation-only changes detected, tests were skipped"
206278
exit 0
207279
fi
208280
209281
# Otherwise, check that build and all tests passed or were skipped
210282
echo "Build result: ${{ needs.build_and_upload_maxtext_package.result }}"
211-
echo "TPU Tests (Matrix) result: ${{ needs.tpu-tests.result }}"
212-
echo "GPU Tests (Matrix) result: ${{ needs.gpu-tests.result }}"
213-
echo "CPU Tests (Matrix) result: ${{ needs.cpu-tests.result }}"
214-
echo "Pathways Unit result: ${{ needs.maxtext_tpu_pathways_unit_tests.result }}"
215-
echo "Pathways Integration result: ${{ needs.maxtext_tpu_pathways_integration_tests.result }}"
283+
echo "CPU tests: ${{ needs.maxtext_cpu_unit_tests.result }}"
284+
echo "TPU tests: ${{ needs.maxtext_tpu_unit_tests.result }}"
285+
echo "TPU integration: ${{ needs.maxtext_tpu_integration_tests.result }}"
286+
echo "TPU pathways: ${{ needs.maxtext_tpu_pathways_unit_tests.result }}"
287+
echo "TPU pathways integration: ${{ needs.maxtext_tpu_pathways_integration_tests.result }}"
288+
echo "GPU tests: ${{ needs.maxtext_gpu_unit_tests.result }}"
289+
echo "GPU integration: ${{ needs.maxtext_gpu_integration_tests.result }}"
216290
217291
# Fail only if any job failed or was cancelled (skipped is OK)
218292
if [ "${{ contains(needs.*.result, 'failure') }}" == "true" ] || [ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]; then
@@ -249,14 +323,14 @@ jobs:
249323
250324
notify_failure:
251325
name: Notify failed build # creates an issue or modifies last open existing issue for failed build
252-
needs: [tpu-tests, gpu-tests, cpu-tests, maxtext_jupyter_notebooks, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]
326+
needs: [maxtext_jupyter_notebooks, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests]
253327
if: ${{ always() }}
254328
runs-on: ubuntu-latest
255329
permissions:
256330
issues: write
257331
steps:
258-
- name: Check whether one of the jobs failed
259-
if: ${{ contains(needs.*.result, 'failure') && github.event_name == 'schedule' }}
260-
uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
261-
with:
262-
github-token: ${{ secrets.GITHUB_TOKEN }}
332+
- name: Check whether one of the jobs failed
333+
if: ${{ contains(needs.*.result, 'failure') && github.event.pull_request == null && github.event_name != 'workflow_dispatch' }}
334+
uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
335+
with:
336+
github-token: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/run_jupyter_notebooks.yml

Lines changed: 10 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -25,20 +25,15 @@ on:
2525
device_name:
2626
required: true
2727
type: string
28-
base_image:
29-
required: true
28+
image_type:
29+
required: false
3030
type: string
3131
cloud_runner:
3232
required: false
3333
type: string
3434
maxtext_sha:
35-
required: false
35+
required: true
3636
type: string
37-
# Flag to skip source checkout and wheel installation
38-
maxtext_installed:
39-
required: false
40-
type: boolean
41-
default: false
4237
secrets:
4338
HF_TOKEN:
4439
required: true
@@ -49,20 +44,17 @@ jobs:
4944
run:
5045
runs-on: ${{ inputs.cloud_runner != '' && inputs.cloud_runner || fromJson(format('["self-hosted", "{0}", "{1}"]', inputs.device_type, inputs.device_name)) }}
5146
container:
52-
image: gcr.io/tpu-prod-env-multipod/${{ inputs.base_image }}
47+
image: gcr.io/tpu-prod-env-multipod/maxtext-unit-test-${{ inputs.device_type == 'cpu' && 'tpu' || inputs.device_type }}:${{ inputs.image_type != '' && inputs.image_type }}
5348
steps:
5449
- name: Checkout MaxText
55-
if: ${{ !inputs.maxtext_installed }}
5650
uses: actions/checkout@v5
5751
with:
5852
ref: ${{ inputs.maxtext_sha }}
5953
- name: Download the MaxText wheel
60-
if: ${{ !inputs.maxtext_installed }}
6154
uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0
6255
with:
6356
name: maxtext-wheel
6457
- name: Install MaxText and Dependencies
65-
if: ${{ !inputs.maxtext_installed }}
6658
shell: bash
6759
run: |
6860
python3 -m uv venv --seed
@@ -73,6 +65,10 @@ jobs:
7365
uv pip install ${maxtext_wheel}[${MAXTEXT_PACKAGE_EXTRA}] --resolution=lowest
7466
uv pip install -r src/install_maxtext_extra_deps/extra_deps_from_github.txt
7567
68+
# Install dependencies for running notebooks
69+
uv pip install papermill ipykernel ipywidgets
70+
.venv/bin/python3 -m ipykernel install --user --name maxtext_venv
71+
7672
# Install Tunix for post-training notebooks
7773
git clone https://github.com/google/tunix
7874
uv pip install ./tunix
@@ -94,25 +90,12 @@ jobs:
9490
PYTHONPATH: "${{ github.workspace }}/src"
9591
HF_TOKEN: ${{ secrets.HF_TOKEN }}
9692
run: |
97-
if [ "${{ inputs.maxtext_installed }}" == "true" ]; then
98-
# Move to the directory where code is baked into the image. See the Dockerfile.
99-
# This is necessary because GHA sets an empty workspace by default.
100-
cd /deps
101-
PYTHON_EXE="python3"
102-
PAPERMILL_EXE="papermill"
103-
else
104-
PYTHON_EXE=".venv/bin/python3"
105-
PAPERMILL_EXE=".venv/bin/papermill"
106-
fi
93+
source .venv/bin/activate
10794
10895
export MAXTEXT_REPO_ROOT=$(pwd)
10996
export MAXTEXT_PKG_DIR=$(pwd)/src/maxtext
11097
export MAXTEXT_NOTEBOOKS_ROOT="$MAXTEXT_REPO_ROOT/src/maxtext/examples"
11198
112-
# Install dependencies for running notebooks
113-
$PYTHON_EXE -m pip install papermill ipykernel ipywidgets
114-
$PYTHON_EXE -m ipykernel install --user --name maxtext_venv
115-
11699
for notebook in "$MAXTEXT_NOTEBOOKS_ROOT"/{sft,rl}*.ipynb; do
117100
filename=$(basename "$notebook")
118101
output_name="${filename%.ipynb}_output.ipynb"
@@ -121,7 +104,7 @@ jobs:
121104
echo "Running $filename ..."
122105
echo "------------------------------------------------------"
123106
124-
$PAPERMILL_EXE "$notebook" "$output_name" -k maxtext_venv
107+
papermill "$notebook" "$output_name" -k maxtext_venv
125108
done
126109
- name: Record Commit IDs
127110
shell: bash

0 commit comments

Comments
 (0)