-
Notifications
You must be signed in to change notification settings - Fork 69
119 lines (101 loc) · 3.96 KB
/
UnitTests.yml
File metadata and controls
119 lines (101 loc) · 3.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This workflow runs a small MaxText training workload on a GPU runner using a
# custom Docker image with all code and dependencies pre-installed, and a
# pinned version of Transformer Engine.
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
name: MaxText Custom Image with Pinned TE

# Trigger on PRs, pushes to main, and manual dispatch.
on:
  pull_request:
  push:
    branches: ["main"]
  workflow_dispatch:

jobs:
  maxtext_training_workload:
    name: "Run MaxText Training Workload"
    # Self-hosted GPU runner label (8x H100 A3-Mega machine).
    runs-on: ["linux-x86-a3-megagpu-h100-8gpu"]
    container:
      # Custom image with MaxText code + dependencies (incl. pinned TE) baked in.
      image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/maxtext-gpu-custom:latest
    steps:
      # Log the pre-installed package set for debuggability of CI failures.
      - name: List Installed Libraries
        run: |
          echo "--- Installed Python packages ---"
          pip freeze
      - name: Run MaxText Training
        # MaxText source is baked into the image at /deps/src.
        working-directory: /deps/src
        env:
          NVTE_FRAMEWORK: jax
          # Quoted so YAML keeps these as strings rather than bool/int;
          # env vars always reach the process as text anyway.
          TF_FORCE_GPU_ALLOW_GROWTH: "true"
          NVTE_FUSED_ATTN: "1"
        # Short 5-step synthetic-data smoke run with TE cuDNN flash attention;
        # checkpointing disabled since the output is throwaway CI state.
        run: |
          python MaxText/train.py MaxText/configs/base.yml \
            run_name="maxtext-ci-test-${{ github.run_id }}" \
            steps=5 \
            enable_checkpointing=false \
            attention='cudnn_flash_te' \
            dataset_type='synthetic'
# name: SDXL Workload Training on GPU
# on:
# pull_request:
# push:
# branches: [ "main" ]
# workflow_dispatch:
# jobs:
# sdxl_training_workload:
# name: "Run SDXL Training Workload"
# # IMPORTANT: Replace with the label for your specific GPU runner if different
# runs-on: ["linux-x86-a3-megagpu-h100-8gpu"]
# container:
# image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/gpu:jax0.7.2-cuda12.9-rev1
# steps:
# - name: Verify Environment
# run: |
# echo "--- Verifying free space ---"
# free -h
# echo "--- Verifying shared memory size ---"
# df -h /dev/shm
# - name: Checkout Repository
# uses: actions/checkout@v4
# - name: Install Dependencies
# run: |
# pip install -r requirements.txt
# pip uninstall -y tensorflow
# pip install tensorflow-cpu
# pip install --upgrade torch torchvision
# pip install .
# - name: List Installed Libraries
# run: |
# echo "--- Installed Python packages ---"
# pip freeze
# - name: Hugging Face Login
# run: huggingface-cli login --token ${{ secrets.HUGGINGFACE_TOKEN }}
# - name: Run SDXL Training
# env:
# NVTE_FRAMEWORK: jax
# TF_FORCE_GPU_ALLOW_GROWTH: "true"
# run: |
# python -m src.maxdiffusion.train_sdxl src/maxdiffusion/configs/base_xl.yml \
# run_name="sdxl-ci-test-${{ github.run_id }}" \
# output_dir="/tmp/sdxl-output/" \
# max_train_steps=5 \
# hardware=gpu \
# attention="cudnn_flash_te" \
# resolution=512 \
# per_device_batch_size=1 \
# train_new_unet=true \
# train_text_encoder=false \
# cache_latents_text_encoder_outputs=true