AI-Hypercomputer
diff --git a/.github/workflows/build_and_test_maxtext.yml b/.github/workflows/build_and_test_maxtext.yml
Lines changed: 0 additions & 2 deletions
diff --git a/.github/workflows/run_tests_against_package.yml b/.github/workflows/run_tests_against_package.yml
Lines changed: 1 addition & 1 deletion
diff --git a/RESTRUCTURE.md b/RESTRUCTURE.md
Lines changed: 11 additions & 2 deletions
diff --git a/docs/run_maxtext/decoupled_mode.md b/docs/run_maxtext/decoupled_mode.md
Lines changed: 1 addition & 1 deletion
diff --git a/end_to_end/tpu/llama3.1/70b/3_test_llama3.1_70b.sh b/end_to_end/tpu/llama3.1/70b/3_test_llama3.1_70b.sh
Lines changed: 1 addition & 1 deletion
diff --git a/end_to_end/tpu/llama3.1/8b/3_test_llama3.1_8b.sh b/end_to_end/tpu/llama3.1/8b/3_test_llama3.1_8b.sh
Lines changed: 1 addition & 1 deletion
diff --git a/end_to_end/tpu/mixtral/8x7b/2_test_mixtral.sh b/end_to_end/tpu/mixtral/8x7b/2_test_mixtral.sh
Lines changed: 1 addition & 1 deletion
diff --git a/local_datasets/get_minimal_hf_c4_parquet.py b/local_datasets/get_minimal_hf_c4_parquet.py
Lines changed: 2 additions & 2 deletions
diff --git a/pytest.ini b/pytest.ini
Lines changed: 15 additions & 7 deletions
diff --git a/tests/assets/logits_generation/generate_grpo_golden_logits.py b/tests/assets/logits_generation/generate_grpo_golden_logits.py
Lines changed: 1 addition & 1 deletion
@@ -231,7 +231,6 @@ jobs:
       image_type: ${{ matrix.image_type }}
       cloud_runner: linux-x86-a2-48-a100-4gpu
       pytest_marker: 'not cpu_only and not tpu_only and not integration_test'
-      pytest_addopts: '--ignore=tests/sft_hooks_test.py'
       xla_python_client_mem_fraction: 0.65
       tf_force_gpu_allow_growth: true
       container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
@@ -252,7 +251,6 @@ jobs:
       image_type: ${{ matrix.image_type }}
       cloud_runner: linux-x86-a2-48-a100-4gpu
       pytest_marker: 'not cpu_only and not tpu_only and integration_test'
-      pytest_addopts: '--ignore=tests/sft_hooks_test.py'
       xla_python_client_mem_fraction: 0.65
       tf_force_gpu_allow_growth: true
       container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
 
@@ -120,7 +120,7 @@ jobs:
             -v \
             -m "${FINAL_PYTEST_MARKER}" \
             --durations=0 \
-            --deselect "tests/tokenizer_test.py::TokenizerTest::test_detokenize" \
+            --deselect "tests/unit/tokenizer_test.py::TokenizerTest::test_detokenize" \
             --cov=MaxText \
             --cov-report=xml \
             --cov-report=term \
 
@@ -24,7 +24,7 @@ python3 -m tests.utils.forward_pass_logit_checker "${MAXTEXT_PKG_DIR:-${MAXTEXT_
 # If not, we can convert the checkpoint back from MaxText to Huggingface and compare with the original one
 JAX_PLATFORMS=cpu python3 -m MaxText.utils.ckpt_scripts.llama_mistral_mixtral_orbax_to_hf "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}/"configs/base.yml base_output_directory=gs://runner-maxtext-logs load_parameters_path=${CHECKPOINT_TPU_SCANNED}/0/items run_name=convert_to_hf model_name=${MODEL_SIZE} hf_model_path=$CHECKPOINT_TPU_CONVERTED_BACK
 
-python3 -m tests.hf_checkpoint_conversion_checker --original_ckpt=${CHECKPOINT_ORIGINAL} --converted_ckpt=${CHECKPOINT_TPU_CONVERTED_BACK}
+python3 -m tests.utils.hf_checkpoint_conversion_checker --original_ckpt=${CHECKPOINT_ORIGINAL} --converted_ckpt=${CHECKPOINT_TPU_CONVERTED_BACK}
 
 # If everything looks good, we move on to convert to the unrolled checkpoint for performant serving
 JAX_PLATFORMS=cpu python3 -m MaxText.generate_param_only_checkpoint "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}/"configs/base.yml async_checkpointing=false base_output_directory=${BASE_OUTPUT_PATH} load_parameters_path=${CHECKPOINT_TPU_SCANNED}/0/items run_name=${RUN_NAME} model_name=${MODEL_SIZE} force_unroll=true
 
@@ -30,7 +30,7 @@ python3 -m tests.utils.forward_pass_logit_checker "${MAXTEXT_PKG_DIR:-${MAXTEXT_
 # If not, we can convert the checkpoint back from MaxText to Huggingface and compare with the original one
 JAX_PLATFORMS=cpu python3 -m MaxText.utils.ckpt_scripts.llama_mistral_mixtral_orbax_to_hf "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}/"configs/base.yml base_output_directory=gs://runner-maxtext-logs load_parameters_path=${CHECKPOINT_TPU_SCANNED}/0/items run_name=convert_to_hf model_name=${MODEL_SIZE} hf_model_path=$CHECKPOINT_TPU_CONVERTED_BACK
 
-python3 -m tests.hf_checkpoint_conversion_checker --original_ckpt=${CHECKPOINT_ORIGINAL} --converted_ckpt=${CHECKPOINT_TPU_CONVERTED_BACK}
+python3 -m tests.utils.hf_checkpoint_conversion_checker --original_ckpt=${CHECKPOINT_ORIGINAL} --converted_ckpt=${CHECKPOINT_TPU_CONVERTED_BACK}
 
 # If everything looks good, we move on to convert to the unrolled checkpoint for performant serving
 JAX_PLATFORMS=cpu python3 -m MaxText.generate_param_only_checkpoint "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}/"configs/base.yml async_checkpointing=false base_output_directory=${BASE_OUTPUT_PATH} load_parameters_path=${CHECKPOINT_TPU_SCANNED}/0/items run_name=${RUN_NAME} model_name=${MODEL_SIZE} force_unroll=true
 
@@ -43,7 +43,7 @@ python3 -m MaxText.decode "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/Max
 # Test whether the forward pass logits match the golden logits - matmul implementation
 python3 -m tests.utils.forward_pass_logit_checker "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}/"configs/base.yml base_output_directory=${BASE_OUTPUT_PATH} load_parameters_path=${UNSCANNED_CKPT_PATH} run_name=matmul_forward_pass_test per_device_batch_size=1 model_name=mixtral-8x7b tokenizer_path="${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText/assets}}"/tokenizer.mistral-v1 ici_tensor_parallelism=1 ici_fsdp_parallelism=-1 max_prefill_predict_length=11 max_target_length=11 dtype=float32 megablox=False sparse_matmul=False scan_layers=false --token_size=4 --max_kl_div=3e-3
 
-# To repeat duplicate tests, we have MoE unit test to verify outputs matching for matmul, megablox, and ragged_dot implementation at https://github.com/AI-Hypercomputer/maxtext/blob/5c4090b8d5713a1a25cab85df89b0ec9c9862635/MaxText/tests/moe_test.py#L338-L411
+# To avoid repeating duplicate tests, we rely on the MoE unit test that verifies matching outputs for the matmul, megablox, and ragged_dot implementations at https://github.com/AI-Hypercomputer/maxtext/blob/5c4090b8d5713a1a25cab85df89b0ec9c9862635/MaxText/tests/moe_test.py#L338-L411
 
 # Run pre-training - megablox implementation
 python3 -m MaxText.train "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}/"configs/base.yml base_output_directory=${BASE_OUTPUT_PATH} dataset_path=${DATASET_PATH} run_name=megablox_pre_training per_device_batch_size=4 enable_checkpointing=false model_name=mixtral-8x7b ici_fsdp_parallelism=-1 steps=5 max_target_length=1024 async_checkpointing=false tokenizer_path="${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText/assets}}"/tokenizer.mistral-v1 attention=flash dtype=bfloat16 weight_dtype=bfloat16
 
@@ -16,8 +16,8 @@
 
 Fetch the first train & validation TFRecord 00000-of shard for a version and
 sample rows into two tiny parquet files with fixed output names for the usage
-in tests/grain_data_processing_test.py, tests/hf_data_processing_test.py,
-tests/train_tests.py:
+in tests/unit/grain_data_processing_test.py, tests/unit/hf_data_processing_test.py,
+tests/integration/train_tests.py:
     c4-train-00000-of-01637.parquet
     c4-validation-00000-of-01637.parquet
 """
 
@@ -5,13 +5,21 @@ testpaths =
 python_files = *_test.py *_tests.py
 addopts =
     -rf --import-mode=importlib --strict-markers
-    --ignore=tests/profiler_test.py
-    --ignore=tests/train_smoke_test.py
-    --ignore=tests/train_int8_smoke_test.py
-    --ignore=tests/train_gpu_smoke_test.py
-    --ignore=tests/train_using_ragged_dot_smoke_test.py
-    --ignore=tests/grpo_trainer_correctness_test.py
-    --ignore=tests/offline_engine_test.py
+    --ignore=tests/integration/grpo_trainer_correctness_test.py
+    --ignore=tests/integration/smoke/train_gpu_smoke_test.py
+    --ignore=tests/integration/smoke/train_int8_smoke_test.py
+    --ignore=tests/integration/smoke/train_smoke_test.py
+    --ignore=tests/integration/smoke/train_using_ragged_dot_smoke_test.py
+    --ignore=tests/unit/dequantize_mxfp4_test.py
+    --ignore=tests/unit/gemma3_layers_test.py
+    --ignore=tests/unit/gpt_vs_reference_test.py
+    --ignore=tests/unit/llama4_layers_test.py
+    --ignore=tests/unit/mla_vs_reference_test.py
+    --ignore=tests/unit/moba_vs_reference_test.py
+    --ignore=tests/unit/offline_engine_test.py
+    --ignore=tests/unit/profiler_test.py
+    --ignore=tests/unit/qwen3_embedding_vs_reference_test.py
+    --ignore=tests/unit/qwen3_next_vs_reference_test.py
 markers =
     tpu_only: marks tests to be run on TPUs only
     gpu_only: marks tests to be run on GPUs only
 
@@ -52,7 +52,7 @@
 from MaxText.globals import MAXTEXT_TEST_ASSETS_ROOT, MAXTEXT_PKG_DIR
 from MaxText.layers import models
 
-from tests.grpo_trainer_correctness_test import prepare_maxtext_inputs
+from tests.integration.grpo_trainer_correctness_test import prepare_maxtext_inputs
 
 
 class GRPOTest(unittest.TestCase):