Skip to content

Commit bc0eca3

Browse files
Merge pull request #2924 from AI-Hypercomputer:xibin/ci
PiperOrigin-RevId: 868178412
2 parents d71e529 + 8f501bd commit bc0eca3

10 files changed

Lines changed: 417 additions & 161 deletions

.github/workflows/UploadDockerImages.yml

Lines changed: 56 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
# This workflow builds and pushes MaxText images for both TPU and GPU devices.
1616
# It runs automatically daily at 12am UTC, on Pull Requests, or manually via Workflow Dispatch.
1717

18-
name: Build Images
18+
name: Build and Test Images
1919

2020
on:
2121
schedule:
@@ -32,6 +32,11 @@ on:
3232
- all
3333
- tpu
3434
- gpu
35+
for_dev_test:
36+
description: 'For development testing purposes. A -test suffix will be added to all image names.'
37+
required: false
38+
type: boolean
39+
default: false
3540

3641
permissions:
3742
contents: read
@@ -42,6 +47,7 @@ jobs:
4247
outputs:
4348
maxtext_sha: ${{ steps.vars.outputs.maxtext_sha }}
4449
image_date: ${{ steps.vars.outputs.image_date }}
50+
image_suffix: ${{ steps.vars.outputs.image_suffix }}
4551
steps:
4652
- name: Checkout MaxText
4753
uses: actions/checkout@v5
@@ -55,6 +61,13 @@ jobs:
5561
# Image date
5662
echo "image_date=$(date +%Y-%m-%d)" >> $GITHUB_OUTPUT
5763
64+
# Image suffix: "-test" when for_dev_test is true, otherwise an empty string
65+
if [[ "${{ github.event.inputs.for_dev_test }}" == "true" ]]; then
66+
echo "image_suffix=-test" >> $GITHUB_OUTPUT
67+
else
68+
echo "image_suffix=" >> $GITHUB_OUTPUT
69+
fi
70+
5871
tpu-pre-training:
5972
name: ${{ matrix.image_name }}
6073
needs: setup
@@ -72,7 +85,7 @@ jobs:
7285
dockerfile: ./dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
7386
uses: ./.github/workflows/build_and_push_docker_image.yml
7487
with:
75-
image_name: ${{ matrix.image_name }}
88+
image_name: ${{ matrix.image_name }}${{ needs.setup.outputs.image_suffix }}
7689
device: ${{ matrix.device }}
7790
build_mode: ${{ matrix.build_mode }}
7891
dockerfile: ${{ matrix.dockerfile }}
@@ -96,14 +109,13 @@ jobs:
96109
dockerfile: ./dependencies/dockerfiles/maxtext_post_training_local_dependencies.Dockerfile
97110
uses: ./.github/workflows/build_and_push_docker_image.yml
98111
with:
99-
image_name: ${{ matrix.image_name }}
112+
image_name: ${{ matrix.image_name }}${{ needs.setup.outputs.image_suffix }}
100113
device: ${{ matrix.device }}
101114
build_mode: ${{ matrix.build_mode }}
102115
dockerfile: ${{ matrix.dockerfile }}
103116
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
104117
image_date: ${{ needs.setup.outputs.image_date }}
105118
base_image: gcr.io/tpu-prod-env-multipod/maxtext_jax_stable:${{ needs.setup.outputs.image_date }}
106-
is_post_training: true
107119

108120
gpu-pre-training:
109121
name: ${{ matrix.image_name }}
@@ -122,9 +134,48 @@ jobs:
122134
dockerfile: ./dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
123135
uses: ./.github/workflows/build_and_push_docker_image.yml
124136
with:
125-
image_name: ${{ matrix.image_name }}
137+
image_name: ${{ matrix.image_name }}${{ needs.setup.outputs.image_suffix }}
126138
device: ${{ matrix.device }}
127139
build_mode: ${{ matrix.build_mode }}
128140
dockerfile: ${{ matrix.dockerfile }}
129141
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
130142
image_date: ${{ needs.setup.outputs.image_date }}
143+
144+
# TEST JOBS
145+
pre-training-tpu-tests:
146+
needs: [setup, tpu-pre-training]
147+
strategy:
148+
fail-fast: false
149+
matrix:
150+
image: [maxtext_jax_stable, maxtext_jax_nightly]
151+
uses: ./.github/workflows/test_and_tag_docker_image.yml
152+
with:
153+
image_name: ${{ matrix.image }}${{ needs.setup.outputs.image_suffix }}
154+
image_date: ${{ needs.setup.outputs.image_date }}
155+
test_mode: tpu-pre-training
156+
157+
post-training-tpu-tests:
158+
needs: [setup, tpu-post-training]
159+
strategy:
160+
fail-fast: false
161+
matrix:
162+
image: [maxtext_post_training_stable, maxtext_post_training_nightly]
163+
uses: ./.github/workflows/test_and_tag_docker_image.yml
164+
with:
165+
image_name: ${{ matrix.image }}${{ needs.setup.outputs.image_suffix }}
166+
image_date: ${{ needs.setup.outputs.image_date }}
167+
test_mode: tpu-post-training
168+
169+
170+
pre-training-gpu-tests:
171+
needs: [setup, gpu-pre-training]
172+
strategy:
173+
fail-fast: false
174+
matrix:
175+
image: [maxtext_gpu_jax_stable, maxtext_gpu_jax_nightly]
176+
uses: ./.github/workflows/test_and_tag_docker_image.yml
177+
with:
178+
image_name: ${{ matrix.image }}${{ needs.setup.outputs.image_suffix }}
179+
image_date: ${{ needs.setup.outputs.image_date }}
180+
test_mode: gpu-pre-training
181+

.github/workflows/build_and_push_docker_image.yml

Lines changed: 12 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,6 @@ on:
4141
required: false
4242
type: string
4343
default: ''
44-
is_post_training:
45-
required: false
46-
type: boolean
47-
default: false
4844

4945
permissions:
5046
contents: read
@@ -82,7 +78,9 @@ jobs:
8278
ref: ${{ inputs.maxtext_sha }}
8379

8480
- name: Checkout post-training dependencies
85-
if: steps.check.outputs.should_run == 'true' && inputs.image_name == 'maxtext_post_training_nightly'
81+
if: |
82+
steps.check.outputs.should_run == 'true' &&
83+
contains(inputs.image_name, 'post_training_nightly')
8684
run: |
8785
git clone https://github.com/google/tunix.git ./tunix
8886
git clone https://github.com/vllm-project/vllm.git ./vllm
@@ -110,8 +108,7 @@ jobs:
110108
push: true
111109
context: .
112110
file: ${{ inputs.dockerfile }}
113-
tags: gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:latest
114-
cache-from: type=gha
111+
tags: gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:${{ inputs.image_date }}-build-${{ github.run_id }}
115112
outputs: type=image,compression=zstd,force-compression=true
116113
build-args: |
117114
DEVICE=${{ inputs.device }}
@@ -126,23 +123,19 @@ jobs:
126123
shell: bash
127124
run: |
128125
SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}"
129-
130-
# Add date tag
131-
gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:${{ inputs.image_date }}" --quiet
126+
TEMP_IMG="$SOURCE_IMAGE:${{ inputs.image_date }}-build-${{ github.run_id }}"
132127
133128
# Convert date to YYYYMMDD format
134129
clean_date=$(echo "${{ inputs.image_date }}" | sed 's/[-:]//g' | cut -c1-8)
135130
136131
# Add MaxText tag
137132
maxtext_hash=$(git rev-parse --short HEAD)
138-
gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:maxtext_${maxtext_hash}_${clean_date}" --quiet
133+
gcloud container images add-tag "$TEMP_IMG" "$SOURCE_IMAGE:maxtext_${maxtext_hash}_${clean_date}" --quiet
139134
140135
# Add post-training dependency tags (one per dependency directory that exists)
141-
if [ "${{ inputs.is_post_training }}" == "true" ]; then
142-
for dir in tunix vllm tpu-inference; do
143-
if [ -d "./$dir" ]; then
144-
dir_hash=$(git -C "$dir" rev-parse --short HEAD)
145-
gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:${dir}_${dir_hash}_${clean_date}" --quiet
146-
fi
147-
done
148-
fi
136+
for dir in tunix vllm tpu-inference; do
137+
if [ -d "./$dir" ]; then
138+
dir_hash=$(git -C "$dir" rev-parse --short HEAD)
139+
gcloud container images add-tag "$TEMP_IMG" "$SOURCE_IMAGE:${dir}_${dir_hash}_${clean_date}" --quiet
140+
fi
141+
done

0 commit comments

Comments
 (0)