1717
1818# This workflow will run a small SDXL training workload on a GPU runner.
1919
20+ # This workflow will run a small SDXL training workload on a GPU runner.
21+
2022name : SDXL Workload Training on GPU
2123
2224on :
@@ -31,21 +33,39 @@ jobs:
3133 # IMPORTANT: Replace with the label for your specific GPU runner if different
3234 runs-on : ["linux-x86-a3-megagpu-h100-8gpu"]
3335 container :
34- image : us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/gpu:jax0.7.2-cuda12.9-rev1
# An image reference takes exactly one tag (registry/path:tag); keep the pinned jax0.7.2-cuda12.9-rev1 tag only.
36+ image : us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/gpu:jax0.7.2-cuda12.9-rev1
3537
3638 steps :
# Step: prints memory usage with `free -h`, turns off any active swap, then allocates,
# formats and enables a 64G swap file at /swapfile and prints memory usage again.
# NOTE(review): every privileged command goes through sudo, and this job runs inside the
# image named under `container:` - confirm sudo exists there and that the container is
# permitted to manage swap, otherwise this step fails.
39+ - name : Create and Activate Swap File
40+ run : |
41+ echo "--- Verifying free space before changes ---"
42+ free -h
43+ echo "---"
44+ echo "Creating and activating a 64GB swap file..."
45+ # Deactivate any existing swap to be safe
46+ sudo swapoff -a
47+ # Allocate a 64GB file
48+ sudo fallocate -l 64G /swapfile
49+ # Set the correct permissions
50+ sudo chmod 600 /swapfile
51+ # Format the file as swap
52+ sudo mkswap /swapfile
53+ # Activate the swap file
54+ sudo swapon /swapfile
55+ echo "--- Swap file is now active ---"
56+ sudo swapon --show
57+ echo "--- Verifying free space after changes ---"
58+ free -h
59+
3760 - name : Checkout Repository
3861 uses : actions/checkout@v4
3962
# Step: replaces the full tensorflow package with tensorflow-cpu, installs requirements.txt,
# upgrades torch/torchvision, and finally installs the checked-out package with `pip install .`.
# NOTE(review): this change moves `pip install -r requirements.txt` to after the
# tensorflow-cpu swap; if requirements.txt (not visible here) pulls in tensorflow, the full
# package could be installed again - confirm the new order is intended.
4063 - name : Install Dependencies
4164 run : |
42- pip install -r requirements.txt
43- # Uninstall the full tensorflow package to prevent GPU conflicts
4465 pip uninstall -y tensorflow
45- # Install the CPU-only version of tensorflow
4666 pip install tensorflow-cpu
67+ pip install -r requirements.txt
4768 pip install --upgrade torch torchvision
48- # Install the maxdiffusion package to make it available for execution
4969 pip install .
5070
5171 - name : List Installed Libraries
0 commit comments