Test transformer-engine-cu13 package; comment out MaxText workload job

Rohan-Bierneni · Rohan-Bierneni · commit 4ac11bb6bd82 · 2025-09-30T02:49:52.000Z
diff --git a/.github/workflows/UnitTests.yml b/.github/workflows/UnitTests.yml
@@ -24,44 +24,44 @@ on:
   workflow_dispatch:
 
 jobs:
-  maxtext_workload:
-    name: "Run MaxText Workload"
-    # IMPORTANT: Replace with the label for your runner (e.g., v5p-8)
-    runs-on: ["linux-x86-a3-megagpu-h100-8gpu"]
-    container:
-      image: gcr.io/tpu-prod-env-multipod/maxtext_stable_stack_candidate_gpu:latest
-    steps:
-      - name: Checkout MaxText Repo
-        uses: actions/checkout@v4
-        with:
-          repository: AI-Hypercomputer/maxtext
-          path: maxtext
-          ref: rbierneni-test-gpu-run
+  # maxtext_workload:  # NOTE(review): entire job is commented out by this change; presumably temporary for the te-cu13 test - TODO confirm and re-enable or delete
+  #   name: "Run MaxText Workload"
+  #   # IMPORTANT: Replace with the label for your runner (e.g., v5p-8)
+  #   runs-on: ["linux-x86-a3-megagpu-h100-8gpu"]
+  #   container:
+  #     image: gcr.io/tpu-prod-env-multipod/maxtext_stable_stack_candidate_gpu:latest
+  #   steps:
+  #     - name: Checkout MaxText Repo
+  #       uses: actions/checkout@v4
+  #       with:
+  #         repository: AI-Hypercomputer/maxtext
+  #         path: maxtext
+  #         ref: rbierneni-test-gpu-run
       
-      - name: Print dependencies
-        run: |
-          pip uninstall -y transformer-engine transformer-engine-jax transformer-engine-cu12
-          pip install -U transformer-engine[jax]==2.6.0
-          # pip uninstall -y tensorflow
-          # pip install tensorflow-cpu
-          pip freeze
+  #     - name: Print dependencies
+  #       run: |
+  #         pip uninstall -y transformer-engine transformer-engine-jax transformer-engine-cu12
+  #         pip install -U transformer-engine[jax]==2.6.0
+  #         # pip uninstall -y tensorflow
+  #         # pip install tensorflow-cpu
+  #         pip freeze
 
-      - name: Run MaxText Training
-        run: |
-          # This command is adapted from your DAG for a single-slice configuration.
-          cd maxtext && \
-          pip install . --no-dependencies
+  #     - name: Run MaxText Training
+  #       run: |
+  #         # This command is adapted from your DAG for a single-slice configuration.
+  #         cd maxtext && \
+  #         pip install . --no-dependencies
 
-          export XLA_PYTHON_CLIENT_MEM_FRACTION=0.65
-          export TF_FORCE_GPU_ALLOW_GROWTH=true
+  #         export XLA_PYTHON_CLIENT_MEM_FRACTION=0.65
+  #         export TF_FORCE_GPU_ALLOW_GROWTH=true
 
-          python3 -m MaxText.train MaxText/configs/base.yml \
-            steps=2 \
-            enable_checkpointing=false \
-            attention=cudnn_flash_te \
-            dataset_type=synthetic \
-            run_name=rbierneni-test-maxtext-gpu \
-            base_output_directory=gs://rbierneni-multipod-dev/maxtext/${{ github.run_id }}
+  #         python3 -m MaxText.train MaxText/configs/base.yml \
+  #           steps=2 \
+  #           enable_checkpointing=false \
+  #           attention=cudnn_flash_te \
+  #           dataset_type=synthetic \
+  #           run_name=rbierneni-test-maxtext-gpu \
+  #           base_output_directory=gs://rbierneni-multipod-dev/maxtext/${{ github.run_id }}
 
   # STAGE 1: PULL MAXDIFFUSION IMAGE AND RUN WORKLOAD
   maxdiffusion_workload:
@@ -78,6 +78,8 @@ jobs:
         run: |
           # pip uninstall -y transformer-engine transformer-engine-jax transformer-engine-cu12
           pip install -U transformer-engine[jax]==2.6.0
+          pip uninstall -y transformer-engine-cu12  # drop the cu12 package before installing the cu13 one below
+          pip install transformer-engine-cu13  # NOTE(review): unpinned, unlike transformer-engine[jax]==2.6.0 above - confirm the resolved version is intended, or pin it
           # pip uninstall -y tensorflow
           # pip install tensorflow-cpu
           pip freeze