check if torch is causing the issue

Rohan-Bierneni · Rohan-Bierneni · commit 5f5749a9274d · 2025-09-30T09:27:50.000Z
diff --git a/.github/workflows/UnitTests.yml b/.github/workflows/UnitTests.yml
@@ -98,27 +98,28 @@ jobs:
           # pip install tensorflow-cpu
           pip freeze
 
-      - name: Check per_device_batch_size
+      - name: Check devices
         run: |
           python -c "import jax; print(jax.devices())"
+          python verify_conflict.sh  # Python source; this commit adds it under a .sh name
 
-      - name: Run MaxDiffusion Training
-        run: |
-          # This command is adapted from your DAG for a single-slice configuration.
-          NVTE_FRAMEWORK=JAX NVTE_FUSED_ATTN=1 pip install . && \
-            python -m src.maxdiffusion.train_sdxl src/maxdiffusion/configs/base_xl.yml \
-            hardware=gpu \
-            train_new_unet=true \
-            train_text_encoder=false \
-            cache_latents_text_encoder_outputs=true \
-            per_device_batch_size=1 \
-            attention=dot_product \
-            activations_dtype=bfloat16 \
-            weights_dtype=bfloat16 \
-            max_train_steps=200 \
-            enable_profiler=True \
-            run_name=1slice-VGpuVersion.XPK_H100_a3-maxdiffusion-jax-stable-stack-2025-09-26-04-12-02 \
-            output_dir=gs://rbierneni-multipod-dev/${{ github.run_id }}
+      # - name: Run MaxDiffusion Training
+      #   run: |
+      #     # This command is adapted from your DAG for a single-slice configuration.
+      #     NVTE_FRAMEWORK=JAX NVTE_FUSED_ATTN=1 pip install . && \
+      #       python -m src.maxdiffusion.train_sdxl src/maxdiffusion/configs/base_xl.yml \
+      #       hardware=gpu \
+      #       train_new_unet=true \
+      #       train_text_encoder=false \
+      #       cache_latents_text_encoder_outputs=true \
+      #       per_device_batch_size=1 \
+      #       attention=dot_product \
+      #       activations_dtype=bfloat16 \
+      #       weights_dtype=bfloat16 \
+      #       max_train_steps=200 \
+      #       enable_profiler=True \
+      #       run_name=1slice-VGpuVersion.XPK_H100_a3-maxdiffusion-jax-stable-stack-2025-09-26-04-12-02 \
+      #       output_dir=gs://rbierneni-multipod-dev/${{ github.run_id }}
 
 # jobs:
 #   build:
diff --git a/verify_conflict.sh b/verify_conflict.sh
@@ -0,0 +1,38 @@
+"""Check whether importing PyTorch first prevents JAX from initializing.
+
+Imports torch (if it can be imported), then initializes JAX and lists its
+devices. Exits with status 1 when JAX fails, so a CI step running this
+script is marked as failed instead of passing silently.
+"""
+import sys
+
+print("--- PyTorch vs. JAX Conflict Test ---")
+
+print("\nStep 1: Attempting to import torch...")
+torch_imported = False
+try:
+    import torch
+    torch_imported = True
+    print(f"Successfully imported torch version: {torch.__version__}")
+    # Reports whether this torch build can use CUDA (False is expected for a CPU-only build).
+    print(f"Is PyTorch using CUDA? -> {torch.cuda.is_available()}")
+except Exception as e:
+    print(f"Failed to import torch: {e}")
+
+
+print("\nStep 2: Now, attempting to initialize JAX...")
+try:
+    import jax
+    devices = jax.devices()
+except Exception as e:
+    print("\n--- RESULT: FAILURE ---")
+    if torch_imported:
+        print("JAX failed to initialize after PyTorch was imported.")
+    else:
+        print("JAX failed to initialize even though PyTorch could not be imported.")
+    print(f"JAX Error: {e}")
+    # Non-zero exit so the calling CI step fails rather than reporting success.
+    sys.exit(1)
+
+print("\n--- RESULT: SUCCESS ---")
+print(f"JAX initialized correctly and found devices: {devices}")