File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change 2222# This workflow will run a small MaxText training workload on a GPU runner
2323# using a custom Docker image with all code and dependencies pre-installed.
2424
25- name : MaxText Custom Image Workload
25+ # This workflow runs MaxText training with a pinned version of Transformer Engine.
26+
27+ name : MaxText Custom Image with Pinned TE
2628
2729on :
2830 pull_request :
@@ -35,18 +37,22 @@ jobs:
3537 name : " Run MaxText Training Workload"
3638 runs-on : ["linux-x86-a3-megagpu-h100-8gpu"]
3739 container :
38- # Use your custom image which contains the source code and dependencies.
3940 image : us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/maxtext-gpu-custom:latest
4041
4142 steps :
- name: Pin Transformer Engine Version
  # Replace whatever Transformer Engine build is present in the container
  # image with the exact release this workflow is meant to exercise.
  run: |
    echo "Uninstalling any existing Transformer Engine..."
    pip uninstall -y transformer-engine
    echo "Installing Transformer Engine version 2.6.0..."
    # Quote the requirement: left bare, the shell reads [jax] as a glob
    # character class and may rewrite (or reject) the argument before pip
    # ever sees it.
    pip install "transformer-engine[jax]==2.6.0"
49+
4250 - name : Run MaxText Training
43- # The Docker image's working directory is /deps, but the code is in /deps/src.
4451 working-directory : /deps/src
4552 env :
4653 NVTE_FRAMEWORK : jax
4754 TF_FORCE_GPU_ALLOW_GROWTH : " true"
4855 run : |
49- # Run the main training script from the /deps/src directory.
5056 python MaxText/train.py MaxText/configs/base.yml \
5157 run_name="maxtext-ci-test-${{ github.run_id }}" \
5258 steps=5 \
You can’t perform that action at this time.
0 commit comments