AI-Hypercomputer
diff --git a/‎.github/workflows/UploadDockerImages.yml‎
Lines changed: 13 additions & 0 deletions b/‎.github/workflows/UploadDockerImages.yml‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎.github/workflows/build_and_upload_images.sh‎
Lines changed: 3 additions & 1 deletion b/‎.github/workflows/build_and_upload_images.sh‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎end_to_end/tpu/eval_assert.py‎
Lines changed: 77 additions & 5 deletions b/‎end_to_end/tpu/eval_assert.py‎
Lines changed: 77 additions & 5 deletions
diff --git a/‎end_to_end/tpu/test_sdxl_training_loss.sh‎
100644100755 b/‎end_to_end/tpu/test_sdxl_training_loss.sh‎
100644100755
diff --git a/‎maxdiffusion_gpu_dependencies.Dockerfile‎
Lines changed: 1 addition & 3 deletions b/‎maxdiffusion_gpu_dependencies.Dockerfile‎
Lines changed: 1 addition & 3 deletions
diff --git a/‎requirements.txt‎
Lines changed: 2 additions & 2 deletions b/‎requirements.txt‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎requirements_with_jax_stable_stack.txt‎
Lines changed: 1 addition & 1 deletion b/‎requirements_with_jax_stable_stack.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎setup.py‎
Lines changed: 1 addition & 1 deletion b/‎setup.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎setup.sh‎
Lines changed: 4 additions & 4 deletions b/‎setup.sh‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎src/maxdiffusion/configs/base14.yml‎
Lines changed: 4 additions & 0 deletions b/‎src/maxdiffusion/configs/base14.yml‎
Lines changed: 4 additions & 0 deletions
@@ -35,3 +35,16 @@ jobs:
     - name: build maxdiffusion jax nightly image
       run: |
         bash .github/workflows/build_and_upload_images.sh CLOUD_IMAGE_NAME=maxdiffusion_jax_nightly MODE=nightly PROJECT=tpu-prod-env-multipod LOCAL_IMAGE_NAME=maxdiffusion_jax_nightly
+
+  # Builds and uploads the GPU (DEVICE=gpu) variants of the maxdiffusion dependency images.
+  build-gpu-image:
+    runs-on: ["self-hosted", "e2", "cpu"]
+    steps:
+    - uses: actions/checkout@v3
+    - name: Cleanup old docker images
+      run: docker system prune --all --force
+    # NOTE(review): BASEIMAGE points at the jax-stable-stack *tpu* image while DEVICE=gpu — confirm this is intended.
+    - name: build maxdiffusion jax stable stack gpu image
+      run: |
+        bash .github/workflows/build_and_upload_images.sh CLOUD_IMAGE_NAME=maxdiffusion_jax_stable_stack_gpu MODE=stable_stack PROJECT=tpu-prod-env-multipod LOCAL_IMAGE_NAME=maxdiffusion_jax_stable_stack_gpu BASEIMAGE=us-docker.pkg.dev/cloud-tpu-images/jax-stable-stack/tpu:latest DEVICE=gpu
+    # The local tag carries the _gpu suffix so it cannot be confused with the TPU nightly image.
+    - name: build maxdiffusion jax nightly gpu image
+      run: |
+        bash .github/workflows/build_and_upload_images.sh CLOUD_IMAGE_NAME=maxdiffusion_jax_nightly_gpu MODE=nightly PROJECT=tpu-prod-env-multipod LOCAL_IMAGE_NAME=maxdiffusion_jax_nightly_gpu DEVICE=gpu
@@ -34,13 +34,15 @@ for ARGUMENT in "$@"; do
     echo "$KEY"="$VALUE"
 done
 
+export DEVICE="${DEVICE:-tpu}"
+
 if [[ ! -v CLOUD_IMAGE_NAME ]] || [[ ! -v PROJECT ]] || [[ ! -v MODE ]] ; then
   echo "You must set CLOUD_IMAGE_NAME, PROJECT and MODE"
   exit 1
 fi
 
 gcloud auth configure-docker us-docker.pkg.dev --quiet
-bash docker_build_dependency_image.sh LOCAL_IMAGE_NAME=$LOCAL_IMAGE_NAME MODE=$MODE
+bash docker_build_dependency_image.sh LOCAL_IMAGE_NAME=$LOCAL_IMAGE_NAME MODE=$MODE DEVICE=$DEVICE
 image_date=$(date +%Y-%m-%d)
 
 # Upload only dependencies image
 
@@ -14,6 +14,15 @@
  limitations under the License.
  """
 
+"""
+Example to run
+python end_to_end/tpu/eval_assert.py avg_tflops metrics.txt 100
+python end_to_end/tpu/eval_assert.py avg_step_time metrics.txt 0.5 100
+python end_to_end/tpu/eval_assert.py final_loss metrics.txt 0.5 100
+"""
+
+
+
 # pylint: skip-file
 """Reads and asserts over target values"""
 from absl import app
@@ -34,26 +43,89 @@ def get_last_n_data(metrics_file, target, n=10):
   return last_n_data
 
 
-def test_final_loss(metrics_file, target_loss):
+def test_final_loss(metrics_file, target_loss, num_samples_str="10"):
+  """Asserts that the mean of the last N "learning/loss" values is below target_loss.
+
+  Args:
+    metrics_file: path of the metrics file; opened here and handed to get_last_n_data.
+    target_loss: exclusive upper bound for the mean loss; converted with float().
+    num_samples_str: number of trailing data points to average; converted with int().
+
+  Raises:
+    ValueError: if no "learning/loss" data points are returned for metrics_file.
+    AssertionError: if the mean loss is not below target_loss.
+  """
   target_loss = float(target_loss)
+  num_samples = int(num_samples_str)
   with open(metrics_file, "r", encoding="utf8") as _:
-    use_last_n_data = 10
-    last_n_data = get_last_n_data(metrics_file, "learning/loss", use_last_n_data)
+    last_n_data = get_last_n_data(metrics_file, "learning/loss", num_samples)
+    # Fail with a clear message instead of a ZeroDivisionError, matching the other tests in this file.
+    if not last_n_data:
+      raise ValueError(f"Metric 'learning/loss' not found or no data points in {metrics_file}.")
     avg_last_n_data = sum(last_n_data) / len(last_n_data)
     print(f"Mean of last {len(last_n_data)} losses is {avg_last_n_data}")
     print(f"Target loss is {target_loss}")
     assert avg_last_n_data < target_loss
     print("Final loss test passed.")
 
 
+def test_avg_step_time(metrics_file, max_avg_step_time_str, num_samples_str="10"):
+  """Tests if the average of the last N step times is below a maximum threshold.
+
+  Args:
+    metrics_file: metrics file handed to get_last_n_data.
+    max_avg_step_time_str: exclusive upper bound for the mean step time; converted with float().
+    num_samples_str: number of trailing data points to average; converted with int().
+
+  Raises:
+    ValueError: if get_last_n_data returns no "perf/step_time_seconds" data points.
+    AssertionError: if the mean step time is not below the threshold.
+  """
+  max_avg_step_time = float(max_avg_step_time_str)
+  num_samples = int(num_samples_str)
+  metric_key = "perf/step_time_seconds"
+  last_n_step_times = get_last_n_data(metrics_file, metric_key, num_samples)
+
+  # Guard the division below against an empty result.
+  if not last_n_step_times:
+    raise ValueError(f"Metric '{metric_key}' not found or no data points in {metrics_file}.")
+
+  avg_last_n_step_time = sum(last_n_step_times) / len(last_n_step_times)
+
+  print(f"Found {len(last_n_step_times)} data points for '{metric_key}'.")
+  print(f"Mean of last {len(last_n_step_times)} step times is {avg_last_n_step_time:.4f} s")
+
+  assert (
+      avg_last_n_step_time < max_avg_step_time
+  ), f"Average step time {avg_last_n_step_time:.4f}s is not less than target {max_avg_step_time}s."
+  print("Average step time test passed.")
+
+
+def test_avg_tflops(metrics_file, min_avg_tflops_str, num_samples_str="10"):
+  """Tests if the average of the last N TFLOPs/sec values is above a minimum threshold.
+
+  Args:
+    metrics_file: metrics file handed to get_last_n_data.
+    min_avg_tflops_str: exclusive lower bound for the mean TFLOPs/sec; converted with float().
+    num_samples_str: number of trailing data points to average; converted with int().
+
+  Raises:
+    ValueError: if get_last_n_data returns no "perf/per_device_tflops_per_sec" data points.
+    AssertionError: if the mean TFLOPs/sec is not above the threshold.
+  """
+  min_avg_tflops = float(min_avg_tflops_str)
+  num_samples = int(num_samples_str)
+  metric_key = "perf/per_device_tflops_per_sec"
+
+  last_n_tflops = get_last_n_data(metrics_file, metric_key, num_samples)
+
+  # Guard the division below against an empty result.
+  if not last_n_tflops:
+    raise ValueError(f"Metric '{metric_key}' not found or no data points in {metrics_file}.")
+
+  avg_last_n_tflops = sum(last_n_tflops) / len(last_n_tflops)
+
+  print(f"Found {len(last_n_tflops)} data points for '{metric_key}'.")
+  print(f"Mean of last {len(last_n_tflops)} steps TFLOPs/sec is {avg_last_n_tflops:.2f}")
+
+  assert (
+      avg_last_n_tflops > min_avg_tflops
+  ), f"Average TFLOPs/sec {avg_last_n_tflops:.2f} is not greater than target {min_avg_tflops}."
+  print("Average TFLOPs/sec test passed.")
+
+
 def main(argv: Sequence[str]) -> None:
+  """Runs the test scenario named in argv[1] with the remaining arguments.
+
+  Supported scenarios are final_loss, avg_step_time and avg_tflops. Each takes
+  <metrics_file> <threshold> and an optional [num_samples] that defaults to "10".
+
+  Raises:
+    ValueError: if no scenario is given, the scenario is not recognized, or
+      fewer than two scenario arguments are supplied.
+  """
+  if len(argv) < 2:
+    print("Usage: python script.py <test_scenario> [test_vars...]")
+    print("Available scenarios: final_loss, avg_step_time, avg_tflops")
+    raise ValueError("Test scenario not specified.")
 
   _, test_scenario, *test_vars = argv
 
   if test_scenario == "final_loss":
-    test_final_loss(*test_vars)
+    if len(test_vars) < 2:
+      raise ValueError("Usage: final_loss <metrics_file> <target_loss> [num_samples]")
+    metrics_file, target_loss, *num_samples_opt = test_vars
+    num_samples = num_samples_opt[0] if num_samples_opt else "10"
+    test_final_loss(metrics_file, target_loss, num_samples)
+  elif test_scenario == "avg_step_time":
+    if len(test_vars) < 2:
+      raise ValueError("Usage: avg_step_time <metrics_file> <max_avg_step_time> [num_samples]")
+    metrics_file, max_avg_step_time, *num_samples_opt = test_vars
+    num_samples = num_samples_opt[0] if num_samples_opt else "10"
+    test_avg_step_time(metrics_file, max_avg_step_time, num_samples)
+  elif test_scenario == "avg_tflops":
+    if len(test_vars) < 2:
+      raise ValueError("Usage: avg_tflops <metrics_file> <min_avg_tflops> [num_samples]")
+    metrics_file, min_avg_tflops, *num_samples_opt = test_vars
+    num_samples = num_samples_opt[0] if num_samples_opt else "10"
+    test_avg_tflops(metrics_file, min_avg_tflops, num_samples)
   else:
-    raise ValueError(f"Unrecognized test_scenario {test_scenario}")
+    raise ValueError(f"Unrecognized test_scenario '{test_scenario}'. Available: final_loss, avg_step_time, avg_tflops")
 
 
 if __name__ == "__main__":
 
@@ -22,8 +22,7 @@ RUN apt-get update && apt-get install -y google-cloud-sdk
 # Set environment variables for Google Cloud SDK
 ENV PATH="/usr/local/google-cloud-sdk/bin:${PATH}"
 
-# Upgrade libcusprase to work with Jax
-RUN apt-get update && apt-get install -y libcusparse-12-3
+
 
 ARG MODE
 ENV ENV_MODE=$MODE
@@ -46,5 +45,4 @@ RUN ls .
 RUN echo "Running command: bash setup.sh MODE=$ENV_MODE JAX_VERSION=$ENV_JAX_VERSION DEVICE=${ENV_DEVICE}"
 RUN --mount=type=cache,target=/root/.cache/pip bash setup.sh MODE=${ENV_MODE} JAX_VERSION=${ENV_JAX_VERSION} DEVICE=${ENV_DEVICE}
 
-
 WORKDIR /deps
@@ -19,14 +19,14 @@ Pillow
 pylint
 pyink
 pytest==8.2.2
-tensorflow==2.17.0
+tensorflow>=2.17.0
 tensorflow-datasets>=4.9.6
 ruff>=0.1.5,<=0.2
 git+https://github.com/mlperf/logging.git
 opencv-python-headless==4.10.0.84
 orbax-checkpoint==0.10.3
 tokenizers==0.21.0
-huggingface_hub==0.24.7
+huggingface_hub==0.30.2
 transformers==4.48.1
 einops==0.8.0
 sentencepiece
 
@@ -8,7 +8,7 @@ ftfy
 git+https://github.com/mlperf/logging.git
 google-cloud-storage==2.17.0
 grain-nightly==0.0.10
-huggingface_hub==0.24.7
+huggingface_hub==0.30.2
 jax>=0.4.30
 jaxlib>=0.4.30
 Jinja2
 
@@ -97,7 +97,7 @@
     "filelock",
     "flax>=0.4.1",
     "hf-doc-builder>=0.3.0",
-    "huggingface-hub==0.24.7",
+    "huggingface-hub==0.30.2",
     "requests-mock==1.10.0",
     "importlib_metadata",
     "invisible-watermark>=0.2.0",
 
@@ -55,6 +55,9 @@ if [[ -n $JAX_VERSION && ! ($MODE == "stable" || -z $MODE) ]]; then
   exit 1
 fi
 
+# Install dependencies from requirements.txt first
+pip3 install -U -r requirements.txt || echo "Failed to install dependencies in the requirements" >&2
+
 # Install JAX and JAXlib based on the specified mode
 if [[ "$MODE" == "stable" || ! -v MODE ]]; then
   # Stable mode
@@ -78,7 +81,7 @@ if [[ "$MODE" == "stable" || ! -v MODE ]]; then
         pip3 install "jax[cuda12]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
     fi
     export NVTE_FRAMEWORK=jax
-    pip3 install git+https://github.com/NVIDIA/TransformerEngine.git@stable
+    pip3 install transformer_engine[jax]==2.1.0
   fi
 
 elif [[ $MODE == "nightly" ]]; then
@@ -106,8 +109,5 @@ else
   exit 1
 fi
 
-# Install dependencies from requirements.txt
-pip3 install -U -r requirements.txt || echo "Failed to install dependencies in the requirements" >&2
-
 # Install maxdiffusion
 pip3 install -U . || echo "Failed to install maxdiffusion" >&2
@@ -19,6 +19,10 @@ metrics_file: "" # for testing, local file that stores scalar metrics. If empty,
 # If true save metrics such as loss and TFLOPS to GCS in {base_output_directory}/{run_name}/metrics/
 write_metrics: True
 gcs_metrics: True
+
+timing_metrics_file: "" # for testing, local file that stores function timing metrics such as state creation, compilation. If empty, no metrics are written.
+write_timing_metrics: True
+
 # If true save config to GCS in {base_output_directory}/{run_name}/
 save_config_to_gcs: False
 log_period: 10000000000  # Flushes Tensorboard