-
Notifications
You must be signed in to change notification settings - Fork 69
119 lines (101 loc) · 3.96 KB
/
UnitTests.yml
File metadata and controls
119 lines (101 loc) · 3.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This workflow runs a small MaxText training workload on a GPU runner using a
# custom Docker image with all code and dependencies pre-installed, and a
# pinned version of Transformer Engine.
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
name: MaxText Custom Image with Pinned TE

# Trigger on PRs, pushes to main, and manual dispatch.
on:
  pull_request:
  push:
    branches: ["main"]
  workflow_dispatch:

jobs:
  maxtext_training_workload:
    name: "Run MaxText Training Workload"
    # Self-hosted GPU runner label (8x H100 A3-Mega machine).
    runs-on: ["linux-x86-a3-megagpu-h100-8gpu"]
    container:
      # Custom image with MaxText code + dependencies (incl. pinned TE) baked in.
      image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/maxtext-gpu-custom:latest
    steps:
      # Log the pre-installed package set for debuggability of CI failures.
      - name: List Installed Libraries
        run: |
          echo "--- Installed Python packages ---"
          pip freeze
      - name: Run MaxText Training
        # MaxText source is baked into the image at /deps/src.
        working-directory: /deps/src
        env:
          NVTE_FRAMEWORK: jax
          # Quoted so YAML keeps these as strings rather than bool/int;
          # env vars always reach the process as text anyway.
          TF_FORCE_GPU_ALLOW_GROWTH: "true"
          NVTE_FUSED_ATTN: "1"
        # Short 5-step synthetic-data smoke run with TE cuDNN flash attention;
        # checkpointing disabled since the output is throwaway CI state.
        run: |
          python MaxText/train.py MaxText/configs/base.yml \
            run_name="maxtext-ci-test-${{ github.run_id }}" \
            steps=5 \
            enable_checkpointing=false \
            attention='cudnn_flash_te' \
            dataset_type='synthetic'
# name: SDXL Workload Training on GPU
# on:
# pull_request:
# push:
# branches: [ "main" ]
# workflow_dispatch:
# jobs:
# sdxl_training_workload:
# name: "Run SDXL Training Workload"
# # IMPORTANT: Replace with the label for your specific GPU runner if different
# runs-on: ["linux-x86-a3-megagpu-h100-8gpu"]
# container:
# image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/gpu:jax0.7.2-cuda12.9-rev1
# steps:
# - name: Verify Environment
# run: |
# echo "--- Verifying free space ---"
# free -h
# echo "--- Verifying shared memory size ---"
# df -h /dev/shm
# - name: Checkout Repository
# uses: actions/checkout@v4
# - name: Install Dependencies
# run: |
# pip install -r requirements.txt
# pip uninstall -y tensorflow
# pip install tensorflow-cpu
# pip install --upgrade torch torchvision
# pip install .
# - name: List Installed Libraries
# run: |
# echo "--- Installed Python packages ---"
# pip freeze
# - name: Hugging Face Login
# run: huggingface-cli login --token ${{ secrets.HUGGINGFACE_TOKEN }}
# - name: Run SDXL Training
# env:
# NVTE_FRAMEWORK: jax
# TF_FORCE_GPU_ALLOW_GROWTH: "true"
# run: |
# python -m src.maxdiffusion.train_sdxl src/maxdiffusion/configs/base_xl.yml \
# run_name="sdxl-ci-test-${{ github.run_id }}" \
# output_dir="/tmp/sdxl-output/" \
# max_train_steps=5 \
# hardware=gpu \
# attention="cudnn_flash_te" \
# resolution=512 \
# per_device_batch_size=1 \
# train_new_unet=true \
# train_text_encoder=false \
# cache_latents_text_encoder_outputs=true