24 | 24 | workflow_dispatch: |
25 | 25 |
26 | 26 | jobs: |
27 | | - # maxtext_workload: |
28 | | - # name: "Run MaxText Workload" |
29 | | - # # IMPORTANT: Replace with the label for your runner (e.g., v5p-8) |
30 | | - # runs-on: ["linux-x86-a3-megagpu-h100-8gpu"] |
31 | | - # container: |
32 | | - # image: gcr.io/tpu-prod-env-multipod/maxtext_stable_stack_candidate_gpu:latest |
33 | | - # steps: |
34 | | - # - name: Checkout MaxText Repo |
35 | | - # uses: actions/checkout@v4 |
36 | | - # with: |
37 | | - # repository: AI-Hypercomputer/maxtext |
38 | | - # path: maxtext |
39 | | - # ref: rbierneni-test-gpu-run |
40 | | - |
41 | | - # - name: Print dependencies |
42 | | - # run: | |
43 | | - # pip uninstall -y transformer-engine transformer-engine-jax transformer-engine-cu12 |
44 | | - # pip install -U transformer-engine[jax]==2.6.0 |
45 | | - # # pip uninstall -y tensorflow |
46 | | - # # pip install tensorflow-cpu |
47 | | - # pip freeze |
48 | | - |
49 | | - # - name: Run MaxText Training |
50 | | - # run: | |
51 | | - # # This command is adapted from your DAG for a single-slice configuration. |
52 | | - # cd maxtext && \ |
53 | | - # pip install . --no-dependencies |
54 | | - |
55 | | - # export XLA_PYTHON_CLIENT_MEM_FRACTION=0.65 |
56 | | - # export TF_FORCE_GPU_ALLOW_GROWTH=true |
57 | | - |
58 | | - # python3 -m MaxText.train MaxText/configs/base.yml \ |
59 | | - # steps=2 \ |
60 | | - # enable_checkpointing=false \ |
61 | | - # attention=cudnn_flash_te \ |
62 | | - # dataset_type=synthetic \ |
63 | | - # run_name=rbierneni-test-maxtext-gpu \ |
64 | | - # base_output_directory=gs://rbierneni-multipod-dev/maxtext/${{ github.run_id }} |
65 | | - |
66 | | - # STAGE 1: PULL MAXDIFFUSION IMAGE AND RUN WORKLOAD |
67 | | - maxdiffusion_workload: |
68 | | - name: "Run MaxDiffusion Workload" |
| 27 | + maxtext_workload: |
| 28 | + name: "Run MaxText Workload" |
69 | 29 | # IMPORTANT: Replace with the label for your runner (e.g., v5p-8) |
70 | 30 | runs-on: ["linux-x86-a3-megagpu-h100-8gpu"] |
71 | 31 | container: |
72 | | - image: gcr.io/tpu-prod-env-multipod/maxdiffusion_stable_stack_candidate:jax0.7.2_cuda13_te2.6.0 |
| 32 | + image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/maxtext-gpu-custom:latest |
73 | 33 | steps: |
74 | | - - name: Checkout Repository |
| 34 | + - name: Checkout MaxText Repo |
75 | 35 | uses: actions/checkout@v4 |
76 | | - |
77 | | - - name: Check Host CUDA and GPU Environment |
78 | | - run: | |
79 | | - echo "--- Checking NVIDIA driver and supported CUDA version ---" |
80 | | - nvidia-smi || echo "nvidia-smi command not found. No GPU or NVIDIA driver detected." |
81 | | - |
82 | | - echo "" |
83 | | - echo "--- Checking for default CUDA toolkit installation ---" |
84 | | - ls -l /usr/local/ | grep cuda || echo "No default CUDA toolkit found in /usr/local/" |
85 | | - |
86 | | - echo "" |
87 | | - echo "--- Checking dynamic linker library path ---" |
88 | | - echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-'Not Set'}" |
89 | | -
| 36 | + with: |
| 37 | + repository: AI-Hypercomputer/maxtext |
| 38 | + path: maxtext |
| 39 | + ref: rbierneni-test-gpu-run |
| 40 | + |
90 | 41 | - name: Print dependencies |
91 | 42 | run: | |
92 | 43 | pip uninstall -y transformer-engine transformer-engine-jax transformer-engine-cu12 |
93 | | - # pip install transformer_engine[jax]==2.4.0 |
94 | | - # pip install -U transformer-engine[jax]==2.6.0 |
95 | | - # pip uninstall -y transformer-engine-cu12 |
96 | | - # pip install transformer-engine-cu13 |
| 44 | + pip install -U "transformer-engine[jax]==2.6.0" |
97 | 45 | # pip uninstall -y tensorflow |
98 | 46 | # pip install tensorflow-cpu |
99 | 47 | pip freeze |
100 | 48 |
101 | | - - name: Check devices |
102 | | - run: | |
103 | | - python -c "import jax; print(jax.devices())" |
104 | | - |
105 | | - - name: Run Conflict Verification Script |
| 49 | + - name: Run MaxText Training |
106 | 50 | run: | |
107 | | - # This command creates the file inside the runner |
108 | | - cat <<'EOF' > verify_conflict.py |
109 | | - print("--- PyTorch vs. JAX Conflict Test ---") |
110 | | -
111 | | - print("\nStep 1: Attempting to import torch...") |
112 | | - try: |
113 | | - import torch |
114 | | - print(f"Successfully imported torch version: {torch.__version__}") |
115 | | - print(f"Is PyTorch using CUDA? -> {torch.cuda.is_available()}") |
116 | | - except Exception as e: |
117 | | - print(f"Failed to import torch: {e}") |
118 | | -
119 | | - print("\nStep 2: Now, attempting to initialize JAX...") |
120 | | - try: |
121 | | - import jax |
122 | | - devices = jax.devices() |
123 | | - print("\n--- RESULT: SUCCESS ---") |
124 | | - print(f"JAX initialized correctly and found devices: {devices}") |
125 | | - except Exception as e: |
126 | | - print("\n--- RESULT: FAILURE ---") |
127 | | - print("JAX failed to initialize after PyTorch was imported.") |
128 | | - print(f"JAX Error: {e}") |
129 | | - EOF |
130 | | -
131 | | - # Now that the file exists, this command will work |
132 | | - python verify_conflict.py |
133 | | -
134 | | - # - name: Run MaxDiffusion Training |
135 | | - # run: | |
136 | | - # # This command is adapted from your DAG for a single-slice configuration. |
137 | | - # NVTE_FRAMEWORK=JAX NVTE_FUSED_ATTN=1 pip install . && \ |
138 | | - # python -m src.maxdiffusion.train_sdxl src/maxdiffusion/configs/base_xl.yml \ |
139 | | - # hardware=gpu \ |
140 | | - # train_new_unet=true \ |
141 | | - # train_text_encoder=false \ |
142 | | - # cache_latents_text_encoder_outputs=true \ |
143 | | - # per_device_batch_size=1 \ |
144 | | - # attention=dot_product \ |
145 | | - # activations_dtype=bfloat16 \ |
146 | | - # weights_dtype=bfloat16 \ |
147 | | - # max_train_steps=200 \ |
148 | | - # enable_profiler=True \ |
149 | | - # run_name=1slice-VGpuVersion.XPK_H100_a3-maxdiffusion-jax-stable-stack-2025-09-26-04-12-02 \ |
150 | | - # output_dir=gs://rbierneni-multipod-dev/${{ github.run_id }} |
151 | | - |
152 | | -# jobs: |
153 | | -# build: |
154 | | -# strategy: |
155 | | -# fail-fast: false |
156 | | -# matrix: |
157 | | -# tpu-type: ["v5p-8"] |
158 | | -# name: "TPU test (${{ matrix.tpu-type }})" |
159 | | -# runs-on: ["self-hosted","${{ matrix.tpu-type }}"] |
160 | | -# steps: |
161 | | -# - uses: actions/checkout@v4 |
162 | | -# - name: Set up Python 3.12 |
163 | | -# uses: actions/setup-python@v5 |
164 | | -# with: |
165 | | -# python-version: '3.12' |
166 | | -# - name: Install dependencies |
167 | | -# run: | |
168 | | -# pip install -e . |
169 | | -# pip uninstall jax jaxlib libtpu-nightly libtpu -y |
170 | | -# bash setup.sh MODE=stable |
171 | | -# export PATH=$PATH:$HOME/.local/bin |
172 | | -# pip install ruff |
173 | | -# pip install isort |
174 | | -# pip install pytest |
175 | | -# - name: Analysing the code with ruff |
176 | | -# run: | |
177 | | -# ruff check . |
178 | | -# - name: version check |
179 | | -# run: | |
180 | | -# python --version |
181 | | -# pip show jax jaxlib flax transformers datasets tensorflow tensorflow_datasets |
182 | | -# - name: PyTest |
183 | | -# run: | #--deselect=src/maxdiffusion/tests/input_pipeline_interface_test.py |
184 | | -# HF_HUB_CACHE=/mnt/disks/github-runner-disk/ HF_HOME=/mnt/disks/github-runner-disk/ TOKENIZERS_PARALLELISM=false python3 -m pytest --deselect=src/maxdiffusion/tests/ltx_transformer_step_test.py -x |
185 | | -# add_pull_ready: |
186 | | -# if: github.ref != 'refs/heads/main' |
187 | | -# permissions: |
188 | | -# checks: read |
189 | | -# pull-requests: write |
190 | | -# needs: build |
191 | | -# uses: ./.github/workflows/AddLabel.yml |
| 51 | + # This command is adapted from your DAG for a single-slice configuration. |
| 52 | + cd maxtext && \ |
| 53 | + pip install . --no-dependencies  # install the MaxText package without re-resolving its dependencies |
| 54 | +
| 55 | + export XLA_PYTHON_CLIENT_MEM_FRACTION=0.65  # cap JAX GPU memory preallocation at 65% |
| 56 | + export TF_FORCE_GPU_ALLOW_GROWTH=true  # let TF allocate GPU memory incrementally instead of all at once |
| 57 | + export NVTE_FUSED_ATTN=1  # enable Transformer Engine's cuDNN fused attention, used by attention=cudnn_flash_te |
| 58 | +
| 59 | + python3 -m MaxText.train MaxText/configs/base.yml \ |
| 60 | + steps=5 \ |
| 61 | + enable_checkpointing=false \ |
| 62 | + attention=cudnn_flash_te \ |
| 63 | + dataset_type=synthetic \ |
| 64 | + run_name=rbierneni-test-maxtext-gpu \ |
| 65 | + base_output_directory=gs://rbierneni-multipod-dev/maxtext/${{ github.run_id }} |
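Note on the removed diagnostics: the deleted "Check devices" and verify_conflict.py steps are still useful when this job cannot see the GPUs. Below is a minimal standalone sketch of the same sanity check, assuming the container image above ships a CUDA-enabled JAX and that the "Print dependencies" step pinned transformer-engine[jax]==2.6.0; the file name smoke_check.py and the 8-GPU assertion (inferred from the a3-megagpu-h100-8gpu runner label) are illustrative assumptions, not part of the workflow.

```python
# smoke_check.py -- illustrative pre-training sanity check, not part of the workflow.
# Assumes a CUDA-enabled JAX and transformer-engine[jax]==2.6.0, as installed above.
import jax
import transformer_engine

# Fail fast if the transformer-engine pin from "Print dependencies" did not take.
print(f"transformer-engine: {transformer_engine.__version__}")
assert transformer_engine.__version__.startswith("2.6"), "unexpected TE version"

# Fail fast if JAX fell back to CPU. The deleted verify_conflict.py script
# probed exactly this failure mode after an eager PyTorch import.
print(f"JAX backend: {jax.default_backend()}")
devices = jax.devices()
print(f"JAX devices: {devices}")
assert jax.default_backend() == "gpu", "JAX did not initialize the CUDA backend"

# The runner label suggests an a3-megagpu host with 8x H100 (assumption).
assert len(devices) == 8, f"expected 8 GPUs, found {len(devices)}"
```

Running it as its own step before "Run MaxText Training" surfaces environment problems before the training command starts, instead of burying them in the training logs.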