@@ -113,47 +113,72 @@ jobs:
113113 with :
114114 device_type : tpu
115115 device_name : v6e-4
116- base_image : maxtext-unit-test-tpu: ${{ matrix.image_type }}
116+ image_type : ${{ matrix.image_type }}
117117 cloud_runner : linux-x86-ct6e-180-4tpu
118118 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
119119 secrets :
120120 HF_TOKEN : ${{ secrets.HF_TOKEN }}
121121
122- tpu-tests :
123- needs : [ build_and_upload_maxtext_package]
122+ maxtext_cpu_unit_tests :
123+ needs : build_and_upload_maxtext_package
124124 if : needs.doc_only_check.outputs.run_tests == 'true'
125+ uses : ./.github/workflows/run_tests_against_package.yml
125126 strategy :
126- fail-fast : false
127- matrix :
128- flavor : [tpu-unit, tpu-integration ]
129- uses : ./.github/workflows/run_tests_coordinator.yml
127+ fail-fast : false # don't cancel all jobs on failure
128+ matrix :
129+ image_type : ["py312" ]
130+ worker_group : [1, 2]
130131 with :
131- flavor : ${{ matrix.flavor }}
132- base_image : maxtext-unit-test-tpu:py312
132+ device_type : cpu
133+ device_name : X64
134+ cloud_runner : linux-x86-n2-16
135+ image_type : ${{ matrix.image_type }}
136+ pytest_marker : ' cpu_only'
137+ xla_python_client_mem_fraction : 0.75
138+ tf_force_gpu_allow_growth : false
139+ container_resource_option : " --privileged"
133140 is_scheduled_run : ${{ github.event_name == 'schedule' }}
141+ worker_group : ${{ matrix.worker_group }}
142+ total_workers : 2
134143 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
135144
136- gpu-tests :
137- needs : [ build_and_upload_maxtext_package]
145+ maxtext_tpu_unit_tests :
146+ needs : build_and_upload_maxtext_package
138147 if : needs.doc_only_check.outputs.run_tests == 'true'
148+ uses : ./.github/workflows/run_tests_against_package.yml
139149 strategy :
140- fail-fast : false
141- matrix :
142- flavor : [gpu-unit, gpu-integration]
143- uses : ./.github/workflows/run_tests_coordinator.yml
150+ fail-fast : false
151+ matrix :
152+ image_type : ["py312"]
144153 with :
145- flavor : ${{ matrix.flavor }}
146- base_image : maxtext-unit-test-cuda12:py312
154+ device_type : tpu
155+ device_name : v6e-4
156+ image_type : ${{ matrix.image_type }}
157+ cloud_runner : linux-x86-ct6e-180-4tpu
158+ pytest_marker : ' not cpu_only and not gpu_only and not integration_test'
159+ xla_python_client_mem_fraction : 0.75
160+ tf_force_gpu_allow_growth : false
161+ container_resource_option : " --privileged"
147162 is_scheduled_run : ${{ github.event_name == 'schedule' }}
148163 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
149164
150- cpu-tests :
151- needs : [ build_and_upload_maxtext_package]
165+ maxtext_tpu_integration_tests :
166+ needs : build_and_upload_maxtext_package
152167 if : needs.doc_only_check.outputs.run_tests == 'true'
153- uses : ./.github/workflows/run_tests_coordinator.yml
168+ uses : ./.github/workflows/run_tests_against_package.yml
169+ strategy :
170+ fail-fast : false
171+ matrix :
172+ image_type : ["py312"]
154173 with :
155- flavor : cpu-unit
156- base_image : maxtext-unit-test-tpu:py312
174+ device_type : tpu
175+ device_name : v6e-4
176+ image_type : ${{ matrix.image_type }}
177+ cloud_runner : linux-x86-ct6e-180-4tpu
178+ pytest_marker : ' not cpu_only and not gpu_only and integration_test'
179+ xla_python_client_mem_fraction : 0.75
180+ tf_force_gpu_allow_growth : false
181+ container_resource_option : " --privileged"
157182 is_scheduled_run : ${{ github.event_name == 'schedule' }}
158183 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
159184
@@ -163,12 +188,14 @@ jobs:
163188 uses : ./.github/workflows/run_pathways_tests.yml
164189 strategy :
165190 fail-fast : false
191+ matrix :
192+ image_type : ["py312"]
166193 with :
167194 device_type : tpu
168195 device_name : v6e-4
169- base_image : maxtext-unit-test-tpu:py312
196+ image_type : ${{ matrix.image_type }}
170197 cloud_runner : linux-x86-ct6e-180-4tpu
171- pytest_marker : ' not cpu_only and not gpu_only and not integration_test and not post_training '
198+ pytest_marker : ' not cpu_only and not gpu_only and not integration_test'
172199 xla_python_client_mem_fraction : 0.75
173200 tf_force_gpu_allow_growth : false
174201 container_resource_option : " --privileged"
@@ -181,38 +208,85 @@ jobs:
181208 uses : ./.github/workflows/run_pathways_tests.yml
182209 strategy :
183210 fail-fast : false
211+ matrix :
212+ image_type : ["py312"]
184213 with :
185214 device_type : tpu
186215 device_name : v6e-4
187- base_image : maxtext-unit-test-tpu:py312
216+ image_type : ${{ matrix.image_type }}
188217 cloud_runner : linux-x86-ct6e-180-4tpu
189- pytest_marker : ' not cpu_only and not gpu_only and integration_test and not post_training '
218+ pytest_marker : ' not cpu_only and not gpu_only and integration_test'
190219 xla_python_client_mem_fraction : 0.75
191220 tf_force_gpu_allow_growth : false
192221 container_resource_option : " --privileged"
193222 is_scheduled_run : ${{ github.event_name == 'schedule' }}
194223 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
195224
225+ maxtext_gpu_unit_tests :
226+ needs : build_and_upload_maxtext_package
227+ if : needs.doc_only_check.outputs.run_tests == 'true'
228+ uses : ./.github/workflows/run_tests_against_package.yml
229+ strategy :
230+ fail-fast : false
231+ matrix :
232+ image_type : ["py312"]
233+ cuda : ["cuda12"]
234+ with :
235+ device_type : ${{ matrix.cuda }}
236+ device_name : a100-40gb-4
237+ image_type : ${{ matrix.image_type }}
238+ cloud_runner : linux-x86-a2-48-a100-4gpu
239+ pytest_marker : ' not cpu_only and not tpu_only and not integration_test'
240+ xla_python_client_mem_fraction : 0.65
241+ tf_force_gpu_allow_growth : true
242+ container_resource_option : " --shm-size 2g --runtime=nvidia --gpus all --privileged"
243+ is_scheduled_run : ${{ github.event_name == 'schedule' }}
244+ maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
245+
246+ maxtext_gpu_integration_tests :
247+ needs : build_and_upload_maxtext_package
248+ if : needs.doc_only_check.outputs.run_tests == 'true'
249+ uses : ./.github/workflows/run_tests_against_package.yml
250+ strategy :
251+ fail-fast : false
252+ matrix :
253+ image_type : ["py312"]
254+ cuda : ["cuda12"]
255+ with :
256+ device_type : ${{ matrix.cuda }}
257+ device_name : a100-40gb-4
258+ image_type : ${{ matrix.image_type }}
259+ cloud_runner : linux-x86-a2-48-a100-4gpu
260+ pytest_marker : ' not cpu_only and not tpu_only and integration_test'
261+ xla_python_client_mem_fraction : 0.65
262+ tf_force_gpu_allow_growth : true
263+ container_resource_option : " --shm-size 2g --runtime=nvidia --gpus all --privileged"
264+ is_scheduled_run : ${{ github.event_name == 'schedule' }}
265+ maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
266+
196267 all_tests_passed :
197268 name : All Required Tests Passed
198- needs : [tpu-tests, gpu-tests, cpu-tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]
269+ needs : [doc_only_check, build_and_upload_maxtext_package, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests ]
199270 if : always()
200271 runs-on : ubuntu-latest
201272 steps :
202273 - name : Check test results
203274 run : |
275+ # If doc-only, all tests should be skipped
204276 if [ "${{ needs.doc_only_check.outputs.run_tests }}" == "false" ]; then
205277 echo "Documentation-only changes detected, tests were skipped"
206278 exit 0
207279 fi
208280
209281 # Otherwise, check that build and all tests passed or were skipped
210282 echo "Build result: ${{ needs.build_and_upload_maxtext_package.result }}"
211- echo "TPU Tests (Matrix) result: ${{ needs.tpu-tests.result }}"
212- echo "GPU Tests (Matrix) result: ${{ needs.gpu-tests.result }}"
213- echo "CPU Tests (Matrix) result: ${{ needs.cpu-tests.result }}"
214- echo "Pathways Unit result: ${{ needs.maxtext_tpu_pathways_unit_tests.result }}"
215- echo "Pathways Integration result: ${{ needs.maxtext_tpu_pathways_integration_tests.result }}"
283+ echo "CPU tests: ${{ needs.maxtext_cpu_unit_tests.result }}"
284+ echo "TPU tests: ${{ needs.maxtext_tpu_unit_tests.result }}"
285+ echo "TPU integration: ${{ needs.maxtext_tpu_integration_tests.result }}"
286+ echo "TPU pathways: ${{ needs.maxtext_tpu_pathways_unit_tests.result }}"
287+ echo "TPU pathways integration: ${{ needs.maxtext_tpu_pathways_integration_tests.result }}"
288+ echo "GPU tests: ${{ needs.maxtext_gpu_unit_tests.result }}"
289+ echo "GPU integration: ${{ needs.maxtext_gpu_integration_tests.result }}"
216290
217291 # Fail only if any job failed or was cancelled (skipped is OK)
218292 if [ "${{ contains(needs.*.result, 'failure') }}" == "true" ] || [ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]; then
@@ -249,14 +323,14 @@ jobs:
249323
250324 notify_failure :
251325 name : Notify failed build # creates an issue or modifies last open existing issue for failed build
252- needs : [tpu-tests, gpu-tests, cpu-tests, maxtext_jupyter_notebooks , maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]
326+ needs : [maxtext_jupyter_notebooks, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests , maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests ]
253327 if : ${{ always() }}
254328 runs-on : ubuntu-latest
255329 permissions :
256330 issues : write
257331 steps :
258- - name : Check whether one of the jobs failed
259- if : ${{ contains(needs.*.result, 'failure') && github.event_name == 'schedule ' }}
260- uses : jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
261- with :
262- github-token : ${{ secrets.GITHUB_TOKEN }}
332+ - name : Check whether one of the jobs failed
333+ if : ${{ contains(needs.*.result, 'failure') && github.event.pull_request == null && github.event_name != 'workflow_dispatch ' }}
334+ uses : jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
335+ with :
336+ github-token : ${{ secrets.GITHUB_TOKEN }}
0 commit comments