1616# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
1717
1818# This workflow will run a small SDXL training workload on a GPU runner.
19-
2019# This workflow will run a small SDXL training workload on a GPU runner.
2120
2221name : SDXL Workload Training on GPU
@@ -33,29 +32,15 @@ jobs:
3332 # IMPORTANT: Replace with the label for your specific GPU runner if different
3433 runs-on : ["linux-x86-a3-megagpu-h100-8gpu"]
3534 container :
36- image : us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/gpu:jax0.7.2-cuda12.9-rev1
35+ image : us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/gpu:jax0.7.2-cuda12.9-rev1
3736
3837 steps :
39- - name : Create and Activate Swap File
38+ - name : Verify Environment
4039 run : |
41- echo "--- Verifying free space before changes ---"
42- free -h
43- echo "---"
44- echo "Creating and activating a 64GB swap file..."
45- # Deactivate any existing swap to be safe
46- sudo swapoff -a
47- # Allocate a 64GB file
48- sudo fallocate -l 64G /swapfile
49- # Set the correct permissions
50- sudo chmod 600 /swapfile
51- # Format the file as swap
52- sudo mkswap /swapfile
53- # Activate the swap file
54- sudo swapon /swapfile
55- echo "--- Swap file is now active ---"
56- sudo swapon --show
57- echo "--- Verifying free space after changes ---"
40+ echo "--- Verifying free space ---"
5841 free -h
42+ echo "--- Verifying shared memory size ---"
43+ df -h /dev/shm
5944
6045 - name : Checkout Repository
6146 uses : actions/checkout@v4
0 commit comments