Skip to content

Commit 691e9ad

Browse files
committed
Build post-training docker images in github workflow daily
1 parent b2b7d8f commit 691e9ad

2 files changed

Lines changed: 212 additions & 78 deletions

File tree

.github/workflows/UploadDockerImages.yml

Lines changed: 72 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -33,18 +33,35 @@ on:
3333
- tpu
3434
- gpu
3535

36+
permissions:
37+
contents: read
38+
3639
jobs:
37-
build:
38-
name: Build ${{ matrix.device }}-${{ matrix.build_mode }} Image
39-
runs-on: linux-x86-n2-16-buildkit
40-
container: google/cloud-sdk:524.0.0
40+
setup:
  # Resolves the values every downstream image build shares, so that all
  # matrix jobs build from the same commit and tag with the same date.
  runs-on: ubuntu-latest
  outputs:
    # Full SHA of the checked-out MaxText commit.
    maxtext_sha: ${{ steps.vars.outputs.maxtext_sha }}
    # Date string (YYYY-MM-DD) taken from the runner's clock.
    image_date: ${{ steps.vars.outputs.image_date }}
  steps:
    - name: Checkout MaxText
      uses: actions/checkout@v5
      with:
        # This job only reads the commit SHA; do not leave the token in
        # the local git config.
        persist-credentials: false

    - name: Get metadata
      id: vars
      run: |
        # MaxText SHA
        echo "maxtext_sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"

        # Image date
        echo "image_date=$(date +%Y-%m-%d)" >> "$GITHUB_OUTPUT"
4157
42-
# Use Github Actions matrix to run image builds in parallel
58+
tpu-pre-training:
59+
name: ${{ matrix.image_name }}
60+
needs: setup
4361
strategy:
4462
fail-fast: false
4563
matrix:
4664
include:
47-
# TPU Image Builds
4865
- device: tpu
4966
build_mode: stable
5067
image_name: maxtext_jax_stable
@@ -53,7 +70,47 @@ jobs:
5370
build_mode: nightly
5471
image_name: maxtext_jax_nightly
5572
dockerfile: ./dependencies/dockerfiles/maxtext_dependencies.Dockerfile
56-
# GPU Image Builds
73+
uses: ./.github/workflows/build_and_push_docker_image.yml
74+
with:
75+
image_name: ${{ matrix.image_name }}
76+
device: ${{ matrix.device }}
77+
build_mode: ${{ matrix.build_mode }}
78+
dockerfile: ${{ matrix.dockerfile }}
79+
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
80+
image_date: ${{ needs.setup.outputs.image_date }}
81+
82+
tpu-post-training:
83+
name: ${{ matrix.image_name }}
84+
needs: [setup, tpu-pre-training]
85+
strategy:
86+
fail-fast: false
87+
matrix:
88+
include:
89+
- device: tpu
90+
build_mode: post-training
91+
image_name: maxtext_post_training_stable
92+
dockerfile: ./dependencies/dockerfiles/maxtext_post_training_dependencies.Dockerfile
93+
- device: tpu
94+
build_mode: post-training
95+
image_name: maxtext_post_training_nightly
96+
dockerfile: ./dependencies/dockerfiles/maxtext_post_training_local_dependencies.Dockerfile
97+
uses: ./.github/workflows/build_and_push_docker_image.yml
98+
with:
99+
image_name: ${{ matrix.image_name }}
100+
device: ${{ matrix.device }}
101+
build_mode: ${{ matrix.build_mode }}
102+
dockerfile: ${{ matrix.dockerfile }}
103+
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
104+
image_date: ${{ needs.setup.outputs.image_date }}
105+
is_post_training: true
106+
107+
gpu-pre-training:
108+
name: ${{ matrix.image_name }}
109+
needs: setup
110+
strategy:
111+
fail-fast: false
112+
matrix:
113+
include:
57114
- device: gpu
58115
build_mode: stable
59116
image_name: maxtext_gpu_jax_stable
@@ -62,74 +119,11 @@ jobs:
62119
build_mode: nightly
63120
image_name: maxtext_gpu_jax_nightly
64121
dockerfile: ./dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
65-
66-
if: >
67-
github.event_name == 'schedule' ||
68-
github.event_name == 'pull_request' ||
69-
github.event_name == 'workflow_dispatch' && (
70-
github.event.inputs.target_device == 'all' ||
71-
github.event.inputs.target_device == 'tpu' ||
72-
github.event.inputs.target_device == 'gpu'
73-
)
74-
75-
# Setup for GKE runners per b/412986220#comment82 and b/412986220#comment90
76-
steps:
77-
- name: Check if build should run
78-
id: check
79-
shell: bash
80-
run: |
81-
if [[ "${{ github.event_name }}" == "workflow_dispatch" && "${{ github.event.inputs.target_device }}" != "all" && "${{ github.event.inputs.target_device }}" != "${{ matrix.device }}" ]]; then
82-
echo "should_run=false" >> $GITHUB_OUTPUT
83-
echo "Skipping build for device: ${{ matrix.device }} in ${{ matrix.build_mode }} mode."
84-
else
85-
echo "should_run=true" >> $GITHUB_OUTPUT
86-
echo "Building for device: ${{ matrix.device }} in ${{ matrix.build_mode }} mode."
87-
fi
88-
89-
- name: Checkout git repository
90-
uses: actions/checkout@v5
91-
if: steps.check.outputs.should_run == 'true'
92-
93-
- name: Mark git repository as safe
94-
if: steps.check.outputs.should_run == 'true'
95-
run: git config --global --add safe.directory ${GITHUB_WORKSPACE}
96-
97-
- name: Configure Docker
98-
if: steps.check.outputs.should_run == 'true'
99-
run: gcloud auth configure-docker us-docker.pkg.dev,gcr.io -q
100-
101-
- name: Set up Docker BuildX
102-
uses: docker/setup-buildx-action@v3.11.1
103-
if: steps.check.outputs.should_run == 'true'
104-
with:
105-
driver: remote
106-
endpoint: tcp://localhost:1234
107-
108-
# Env variables to be passed to Dockerfile
109-
- name: Get metadata
110-
id: vars
111-
if: steps.check.outputs.should_run == 'true'
112-
run: |
113-
echo "commit_hash=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
114-
echo "image_date=$(date +%Y-%m-%d)" >> $GITHUB_OUTPUT
115-
116-
# Docker BuildX command config
117-
- name: Build and Push Docker Image
118-
uses: docker/build-push-action@v6
119-
if: steps.check.outputs.should_run == 'true'
120-
with:
121-
push: true
122-
context: .
123-
file: ${{ matrix.dockerfile }}
124-
tags: |
125-
gcr.io/tpu-prod-env-multipod/${{ matrix.image_name }}:maxtext_${{ steps.vars.outputs.commit_hash }}
126-
gcr.io/tpu-prod-env-multipod/${{ matrix.image_name }}:${{ steps.vars.outputs.image_date }}
127-
gcr.io/tpu-prod-env-multipod/${{ matrix.image_name }}:latest
128-
cache-from: type=gha
129-
cache-to: type=gha,mode=max
130-
provenance: false
131-
build-args: |
132-
DEVICE=${{ matrix.device }}
133-
MODE=${{ matrix.build_mode }}
134-
JAX_VERSION=NONE
135-
LIBTPU_GCS_PATH=NONE
122+
uses: ./.github/workflows/build_and_push_docker_image.yml
123+
with:
124+
image_name: ${{ matrix.image_name }}
125+
device: ${{ matrix.device }}
126+
build_mode: ${{ matrix.build_mode }}
127+
dockerfile: ${{ matrix.dockerfile }}
128+
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
129+
image_date: ${{ needs.setup.outputs.image_date }}
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
# Copyright 2025 Google LLC
2+
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# This workflow will build and push MaxText Docker image to GCR.
16+
17+
name: Build and Push MaxText Docker Images

on:
  workflow_call:
    inputs:
      image_name:
        description: Image repository name under gcr.io/tpu-prod-env-multipod.
        required: true
        type: string
      device:
        description: >-
          Target device; passed as the DEVICE build arg and compared with the
          workflow_dispatch target_device input.
        required: true
        type: string
      build_mode:
        description: Build mode; passed as the MODE build arg.
        required: true
        type: string
      dockerfile:
        description: Path of the Dockerfile to build.
        required: true
        type: string
      maxtext_sha:
        description: MaxText commit to check out.
        required: true
        type: string
      image_date:
        description: Date string used for the date tag and the BASEIMAGE tag.
        required: true
        type: string
      is_post_training:
        description: When true, also tag the image with dependency commit hashes.
        required: false
        type: boolean
        default: false

permissions:
  contents: read

jobs:
  build_and_push:
    runs-on: linux-x86-n2-16-buildkit
    container: google/cloud-sdk:524.0.0
    if: >
      github.event_name == 'schedule' ||
      github.event_name == 'pull_request' ||
      github.event_name == 'workflow_dispatch' && (
        github.event.inputs.target_device == 'all' ||
        github.event.inputs.target_device == 'tpu' ||
        github.event.inputs.target_device == 'gpu'
      )
    steps:
      # On manual dispatch, only build images whose device matches the
      # requested target_device (or everything when it is "all").
      - name: Check if build should run
        id: check
        shell: bash
        # Expressions are passed through the environment rather than being
        # spliced into the script text, so their values are never parsed as
        # shell code.
        env:
          EVENT_NAME: ${{ github.event_name }}
          TARGET_DEVICE: ${{ github.event.inputs.target_device }}
          IMAGE_NAME: ${{ inputs.image_name }}
          DEVICE: ${{ inputs.device }}
          BUILD_MODE: ${{ inputs.build_mode }}
        run: |
          if [[ "$EVENT_NAME" == "workflow_dispatch" && "$TARGET_DEVICE" != "all" && "$TARGET_DEVICE" != "$DEVICE" ]]; then
            echo "should_run=false" >> "$GITHUB_OUTPUT"
            echo "Skipping $IMAGE_NAME build for device: $DEVICE in $BUILD_MODE mode."
          else
            echo "should_run=true" >> "$GITHUB_OUTPUT"
            echo "Building $IMAGE_NAME for device: $DEVICE in $BUILD_MODE mode."
          fi

      - name: Checkout MaxText
        uses: actions/checkout@v5
        if: steps.check.outputs.should_run == 'true'
        with:
          # This ensures that every job clones the exact same commit as "setup" job
          ref: ${{ inputs.maxtext_sha }}
          # The workspace is used as the Docker build context below; keep the
          # token out of .git/config so it cannot end up in an image layer.
          persist-credentials: false

      # The nightly post-training image is built against local clones of
      # its dependencies.
      - name: Checkout post-training dependencies
        if: steps.check.outputs.should_run == 'true' && inputs.image_name == 'maxtext_post_training_nightly'
        run: |
          git clone https://github.com/google/tunix.git ./tunix
          git clone https://github.com/vllm-project/vllm.git ./vllm
          git clone https://github.com/vllm-project/tpu-inference.git ./tpu-inference

      # Needed so the later `git rev-parse` calls work inside the container.
      - name: Mark git repositories as safe
        run: git config --global --add safe.directory '*'
        if: steps.check.outputs.should_run == 'true'

      - name: Configure Docker
        run: gcloud auth configure-docker us-docker.pkg.dev,gcr.io -q
        if: steps.check.outputs.should_run == 'true'

      - name: Set up Docker BuildX
        uses: docker/setup-buildx-action@v3.11.1
        if: steps.check.outputs.should_run == 'true'
        with:
          driver: remote
          endpoint: tcp://localhost:1234

      - name: Build and push Docker image
        id: build
        uses: docker/build-push-action@v6
        if: steps.check.outputs.should_run == 'true'
        with:
          push: true
          context: .
          file: ${{ inputs.dockerfile }}
          tags: gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:latest
          cache-from: type=gha
          outputs: type=image,compression=zstd,force-compression=true
          build-args: |
            DEVICE=${{ inputs.device }}
            MODE=${{ inputs.build_mode }}
            JAX_VERSION=NONE
            LIBTPU_GCS_PATH=NONE
            BASEIMAGE=gcr.io/tpu-prod-env-multipod/maxtext_jax_stable:${{ inputs.image_date }}

      - name: Add tags to Docker image
        if: steps.check.outputs.should_run == 'true'
        shell: bash
        env:
          SOURCE_IMAGE: gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}
          IMAGE_DATE: ${{ inputs.image_date }}
          IMAGE_DIGEST: ${{ steps.build.outputs.digest }}
          IS_POST_TRAINING: ${{ inputs.is_post_training }}
        run: |
          # Tag the image that was just pushed by digest: ":latest" is mutable
          # and may already point at another run's image. Fall back to
          # ":latest" only if the build step reported no digest.
          if [[ -n "$IMAGE_DIGEST" ]]; then
            source_ref="$SOURCE_IMAGE@$IMAGE_DIGEST"
          else
            source_ref="$SOURCE_IMAGE:latest"
          fi

          # Add date tag
          gcloud container images add-tag "$source_ref" "$SOURCE_IMAGE:$IMAGE_DATE" --quiet

          # Add MaxText tag
          maxtext_hash=$(git rev-parse --short HEAD)
          gcloud container images add-tag "$source_ref" "$SOURCE_IMAGE:maxtext_${maxtext_hash}" --quiet

          # Add post-training dependencies tags
          if [[ "$IS_POST_TRAINING" == "true" ]]; then
            for dir in tunix vllm tpu-inference; do
              # Only dependencies that were cloned into the workspace get a tag.
              if [[ -d "./$dir" ]]; then
                dir_hash=$(git -C "$dir" rev-parse --short HEAD)
                gcloud container images add-tag "$source_ref" "$SOURCE_IMAGE:${dir}_${dir_hash}" --quiet
              fi
            done
          fi

0 commit comments

Comments
 (0)