Skip to content

Commit 086c50d

Browse files
Merge pull request #3394 from AI-Hypercomputer:post-training-ci-gate
PiperOrigin-RevId: 884691170
2 parents a3c19fd + 8c778eb commit 086c50d

15 files changed

Lines changed: 92 additions & 18 deletions

.github/workflows/build_and_test_maxtext.yml

Lines changed: 57 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ jobs:
133133
device_name: X64
134134
cloud_runner: linux-x86-n2-16
135135
image_type: ${{ matrix.image_type }}
136-
pytest_marker: 'cpu_only'
136+
pytest_marker: 'cpu_only and not post_training'
137137
xla_python_client_mem_fraction: 0.75
138138
tf_force_gpu_allow_growth: false
139139
container_resource_option: "--privileged"
@@ -155,7 +155,7 @@ jobs:
155155
device_name: v6e-4
156156
image_type: ${{ matrix.image_type }}
157157
cloud_runner: linux-x86-ct6e-180-4tpu
158-
pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
158+
pytest_marker: 'not cpu_only and not gpu_only and not integration_test and not post_training'
159159
xla_python_client_mem_fraction: 0.75
160160
tf_force_gpu_allow_growth: false
161161
container_resource_option: "--privileged"
@@ -175,7 +175,7 @@ jobs:
175175
device_name: v6e-4
176176
image_type: ${{ matrix.image_type }}
177177
cloud_runner: linux-x86-ct6e-180-4tpu
178-
pytest_marker: 'not cpu_only and not gpu_only and integration_test'
178+
pytest_marker: 'not cpu_only and not gpu_only and integration_test and not post_training'
179179
xla_python_client_mem_fraction: 0.75
180180
tf_force_gpu_allow_growth: false
181181
container_resource_option: "--privileged"
@@ -195,7 +195,7 @@ jobs:
195195
device_name: v6e-4
196196
image_type: ${{ matrix.image_type }}
197197
cloud_runner: linux-x86-ct6e-180-4tpu
198-
pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
198+
pytest_marker: 'not cpu_only and not gpu_only and not integration_test and not post_training'
199199
xla_python_client_mem_fraction: 0.75
200200
tf_force_gpu_allow_growth: false
201201
container_resource_option: "--privileged"
@@ -215,7 +215,7 @@ jobs:
215215
device_name: v6e-4
216216
image_type: ${{ matrix.image_type }}
217217
cloud_runner: linux-x86-ct6e-180-4tpu
218-
pytest_marker: 'not cpu_only and not gpu_only and integration_test'
218+
pytest_marker: 'not cpu_only and not gpu_only and integration_test and not post_training'
219219
xla_python_client_mem_fraction: 0.75
220220
tf_force_gpu_allow_growth: false
221221
container_resource_option: "--privileged"
@@ -236,13 +236,57 @@ jobs:
236236
device_name: a100-40gb-4
237237
image_type: ${{ matrix.image_type }}
238238
cloud_runner: linux-x86-a2-48-a100-4gpu
239-
pytest_marker: 'not cpu_only and not tpu_only and not integration_test'
239+
pytest_marker: 'not cpu_only and not tpu_only and not integration_test and not post_training'
240240
xla_python_client_mem_fraction: 0.65
241241
tf_force_gpu_allow_growth: true
242242
container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
243243
is_scheduled_run: ${{ github.event_name == 'schedule' }}
244244
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
245245

246+
# Post-training unit tests on CPU.
# Delegates to the reusable run_tests_against_package workflow, restricted to
# tests/post_training/unit and to tests carrying the `cpu_only` marker.
maxtext_post_training_cpu_unit_tests:
  needs: build_and_upload_maxtext_package
  # Only run when the doc-only check reports that tests are required.
  if: needs.doc_only_check.outputs.run_tests == 'true'
  strategy:
    fail-fast: false
    matrix:
      image_type:
        - "py312"
  uses: ./.github/workflows/run_tests_against_package.yml
  with:
    # Runner / device selection.
    cloud_runner: linux-x86-n2-16
    device_type: cpu
    device_name: X64
    image_type: ${{ matrix.image_type }}
    container_resource_option: '--privileged'
    # Test selection: marker expression plus the directory passed to pytest.
    pytest_marker: "cpu_only"
    pytest_addopts: "tests/post_training/unit"
    # Runtime tuning forwarded to the reusable workflow.
    xla_python_client_mem_fraction: 0.75
    tf_force_gpu_allow_growth: false
    is_scheduled_run: ${{ github.event_name == 'schedule' }}
    # Package under test and the extra requirements file installed before the tests run.
    maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
    extra_pip_deps_file: "src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt"
267+
268+
# Post-training unit tests on TPU (v6e-4).
# Delegates to the reusable run_tests_against_package workflow, restricted to
# tests/post_training/unit and to tests carrying the `tpu_only` marker.
maxtext_post_training_tpu_unit_tests:
  needs: build_and_upload_maxtext_package
  # Only run when the doc-only check reports that tests are required.
  if: needs.doc_only_check.outputs.run_tests == 'true'
  strategy:
    fail-fast: false
    matrix:
      image_type:
        - "py312"
  uses: ./.github/workflows/run_tests_against_package.yml
  with:
    # Runner / device selection.
    cloud_runner: linux-x86-ct6e-180-4tpu
    device_type: tpu
    device_name: v6e-4
    image_type: ${{ matrix.image_type }}
    container_resource_option: '--privileged'
    # Test selection: marker expression plus the directory passed to pytest.
    # NOTE(review): this selects only tests explicitly marked `tpu_only`; a test
    # in this directory without that marker is not picked up by this job.
    # Confirm that is intended.
    pytest_marker: "tpu_only"
    pytest_addopts: "tests/post_training/unit"
    # Runtime tuning forwarded to the reusable workflow.
    xla_python_client_mem_fraction: 0.75
    tf_force_gpu_allow_growth: false
    is_scheduled_run: ${{ github.event_name == 'schedule' }}
    # Package under test and the extra requirements file installed before the tests run.
    maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
    extra_pip_deps_file: "src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt"
289+
246290
maxtext_gpu_integration_tests:
247291
needs: build_and_upload_maxtext_package
248292
if: needs.doc_only_check.outputs.run_tests == 'true'
@@ -257,7 +301,7 @@ jobs:
257301
device_name: a100-40gb-4
258302
image_type: ${{ matrix.image_type }}
259303
cloud_runner: linux-x86-a2-48-a100-4gpu
260-
pytest_marker: 'not cpu_only and not tpu_only and integration_test'
304+
pytest_marker: 'not cpu_only and not tpu_only and integration_test and not post_training'
261305
xla_python_client_mem_fraction: 0.65
262306
tf_force_gpu_allow_growth: true
263307
container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
@@ -266,7 +310,7 @@ jobs:
266310

267311
all_tests_passed:
268312
name: All Required Tests Passed
269-
needs: [doc_only_check, build_and_upload_maxtext_package, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests]
313+
needs: [doc_only_check, build_and_upload_maxtext_package, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests, maxtext_post_training_cpu_unit_tests, maxtext_post_training_tpu_unit_tests]
270314
if: always()
271315
runs-on: ubuntu-latest
272316
steps:
@@ -287,6 +331,8 @@ jobs:
287331
echo "TPU pathways integration: ${NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT}"
288332
echo "GPU tests: ${NEEDS_MAXTEXT_GPU_UNIT_TESTS_RESULT}"
289333
echo "GPU integration: ${NEEDS_MAXTEXT_GPU_INTEGRATION_TESTS_RESULT}"
334+
echo "Post-training CPU tests: ${NEEDS_MAXTEXT_POST_TRAINING_CPU_UNIT_TESTS_RESULT}"
335+
echo "Post-training TPU tests: ${NEEDS_MAXTEXT_POST_TRAINING_TPU_UNIT_TESTS_RESULT}"
290336
291337
# Fail only if any job failed or was cancelled (skipped is OK)
292338
if [ "${{ contains(needs.*.result, 'failure') }}" == "true" ] || [ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]; then
@@ -305,6 +351,8 @@ jobs:
305351
NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_tpu_pathways_integration_tests.result }}
306352
NEEDS_MAXTEXT_GPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_gpu_unit_tests.result }}
307353
NEEDS_MAXTEXT_GPU_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_gpu_integration_tests.result }}
354+
NEEDS_MAXTEXT_POST_TRAINING_CPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_post_training_cpu_unit_tests.result }}
355+
NEEDS_MAXTEXT_POST_TRAINING_TPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_post_training_tpu_unit_tests.result }}
308356

309357
all_notebooks_passed:
310358
name: All Notebooks Passed
@@ -337,7 +385,7 @@ jobs:
337385

338386
notify_failure:
339387
name: Notify failed build # creates an issue or modifies last open existing issue for failed build
340-
needs: [maxtext_jupyter_notebooks, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests]
388+
needs: [maxtext_jupyter_notebooks, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests, maxtext_post_training_cpu_unit_tests, maxtext_post_training_tpu_unit_tests]
341389
if: ${{ always() }}
342390
runs-on: ubuntu-latest
343391
permissions:

.github/workflows/run_tests_against_package.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ on:
6161
maxtext_sha:
6262
required: true
6363
type: string
64+
# Optional path to a pip requirements file installed before the tests run.
# The empty default means no extra requirements file is installed.
extra_pip_deps_file:
  type: string
  required: false
  default: ""
6468

6569
permissions:
6670
contents: read
@@ -96,6 +100,12 @@ jobs:
96100
python3 --version
97101
python3 -m pip freeze
98102
uv pip install pytest-cov
103+
# Installs the caller-supplied requirements file into the project venv.
# Skipped when the caller leaves `extra_pip_deps_file` at its empty default.
- name: Install extra pip deps
  if: inputs.extra_pip_deps_file != ''
  shell: bash
  env:
    # Hand the input to the script through the environment instead of
    # interpolating `${{ ... }}` into the script text: the expression is
    # expanded before bash parses the script, so shell metacharacters in the
    # value would otherwise be executed as code.
    EXTRA_PIP_DEPS_FILE: ${{ inputs.extra_pip_deps_file }}
  run: |
    source .venv/bin/activate
    # Quoted so a path containing spaces or glob characters stays one argument.
    uv pip install -r "${EXTRA_PIP_DEPS_FILE}"
99109
- name: Copy test assets files
100110
run : gcloud storage cp gs://maxtext-test-assets/* tests/assets
101111
- name: Run Tests

pytest.ini

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ testpaths =
55
python_files = *_test.py *_tests.py
66
addopts =
77
-rf --import-mode=importlib --strict-markers
8-
--ignore=tests/integration/grpo_trainer_correctness_test.py
8+
--ignore=tests/post_training/integration/grpo_trainer_correctness_test.py
99
--ignore=tests/integration/smoke/train_gpu_smoke_test.py
1010
--ignore=tests/integration/smoke/train_int8_smoke_test.py
1111
--ignore=tests/integration/smoke/train_smoke_test.py
@@ -36,4 +36,5 @@ markers =
3636
e.g., end_to_end tests
3737
external_serving: JetStream / serving / decode server components
3838
external_training: goodput integrations
39+
post_training: marks tests for post-training code paths.
3940

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
google-tunix @ https://github.com/google/tunix/archive/336d102fe32ca0edbe42a8f66ff0fd533cebdf52.zip
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1+
-r extra_post_train_base_deps_from_github.txt
12
google-jetstream @ https://github.com/AI-Hypercomputer/JetStream/archive/29329e8e73820993f77cfc8efe34eb2a73f5de98.zip
2-
google-tunix @ https://github.com/google/tunix/archive/336d102fe32ca0edbe42a8f66ff0fd533cebdf52.zip
33
mlperf-logging @ https://github.com/mlcommons/logging/archive/38ab22670527888c8eb7825a4ece176fcc36a95d.zip
44
tpu-inference @ https://github.com/vllm-project/tpu-inference/archive/0cae84fc9a883ba1bde02d4f07930e6af9e92958.zip
55
vllm @ git+https://github.com/vllm-project/vllm@ee8a29511fc69e3f0f6291fa6ff1cf6e47f7750d

tests/integration/grpo_correctness.py renamed to tests/post_training/integration/grpo_correctness.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535

3636
from trl import GRPOConfig, GRPOTrainer
3737

38-
pytestmark = [pytest.mark.external_training] # uses pre-generated checkpoint
38+
pytestmark = [pytest.mark.external_training, pytest.mark.post_training] # uses pre-generated checkpoint
3939

4040

4141
class GRPOTest(unittest.TestCase):

tests/integration/grpo_trainer_correctness_test.py renamed to tests/post_training/integration/grpo_trainer_correctness_test.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from maxtext/tests/assets/logits_generation/generate_grpo_golden_logits.py
2323
2424
Usage:
25-
pytest tests/integration/grpo_trainer_correctness_test.py
25+
pytest tests/post_training/integration/grpo_trainer_correctness_test.py
2626
"""
2727

2828
import os
@@ -52,7 +52,7 @@
5252
import transformers
5353

5454
# This test is for serving pathways via offline_engine and maxengine.
55-
pytestmark = [pytest.mark.external_training]
55+
pytestmark = [pytest.mark.external_training, pytest.mark.post_training]
5656

5757

5858
def get_golden_data(config):

tests/integration/sft_trainer_correctness_test.py renamed to tests/post_training/integration/sft_trainer_correctness_test.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
2222
Usage:
2323
24-
pytest tests/integration/sft_trainer_correctness_test.py
24+
pytest tests/post_training/integration/sft_trainer_correctness_test.py
2525
"""
2626

2727
import os.path
@@ -46,6 +46,8 @@
4646
import pytest
4747
from transformers import AutoTokenizer
4848

49+
pytestmark = [pytest.mark.post_training]
50+
4951

5052
def get_golden_data(model_name):
5153
"""Get the golden data for sft_trainer from maxtext/tests/assets/logits_generation/generate_sft_golden_data.py."""

tests/unit/distillation_checkpointing_test.py renamed to tests/post_training/unit/distillation_checkpointing_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
import pytest
1818

1919
pytest.importorskip("tunix")
20-
pytestmark = [pytest.mark.tpu_only]
20+
pytestmark = [pytest.mark.tpu_only, pytest.mark.post_training]
2121

2222
import json
2323
import os

tests/unit/distillation_data_processing_test.py renamed to tests/post_training/unit/distillation_data_processing_test.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@
1414

1515
"""Data processing tests for distillation."""
1616

17+
import pytest
18+
19+
pytestmark = [pytest.mark.post_training, pytest.mark.cpu_only]
20+
1721
import argparse
1822
import os
1923
import subprocess

0 commit comments

Comments
 (0)