@@ -53,73 +53,73 @@ jobs:
5353 build_mode: jax_ai_image
5454 base_image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/tpu:latest
5555
56- gpu_image:
57- needs: prelim
58- uses: ./.github/workflows/build_upload_internal.yml
59- with:
60- device_type: gpu
61- device_name: a100-40gb-4
62- cloud_runner: linux-x86-n2-16-buildkit
63- build_mode: jax_ai_image
64- base_image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/gpu:latest
56+ # gpu_image:
57+ # needs: prelim
58+ # uses: ./.github/workflows/build_upload_internal.yml
59+ # with:
60+ # device_type: gpu
61+ # device_name: a100-40gb-4
62+ # cloud_runner: linux-x86-n2-16-buildkit
63+ # build_mode: jax_ai_image
64+ # base_image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/gpu:latest
6565
66- cpu_unit_tests:
67- needs: tpu_image
68- strategy:
69- fail-fast: false
70- matrix:
71- worker_group: [1, 2, 3, 4]
72- uses: ./.github/workflows/run_tests_internal.yml
73- with:
74- device_type: cpu
75- device_name: X64
76- image_type: tpu
77- pytest_marker: 'cpu_only'
78- xla_python_client_mem_fraction: 0.75
79- tf_force_gpu_allow_growth: false
80- container_resource_option: "--privileged"
81- is_scheduled_run: ${{ github.event_name == 'schedule' }}
82- worker_group: ${{ matrix.worker_group }}
83- total_workers: 4
66+ # cpu_unit_tests:
67+ # needs: tpu_image
68+ # strategy:
69+ # fail-fast: false
70+ # matrix:
71+ # worker_group: [1, 2, 3, 4]
72+ # uses: ./.github/workflows/run_tests_internal.yml
73+ # with:
74+ # device_type: cpu
75+ # device_name: X64
76+ # image_type: tpu
77+ # pytest_marker: 'cpu_only'
78+ # xla_python_client_mem_fraction: 0.75
79+ # tf_force_gpu_allow_growth: false
80+ # container_resource_option: "--privileged"
81+ # is_scheduled_run: ${{ github.event_name == 'schedule' }}
82+ # worker_group: ${{ matrix.worker_group }}
83+ # total_workers: 4
8484
85- tpu_unit_tests:
86- needs: tpu_image
87- uses: ./.github/workflows/run_tests_internal.yml
88- with:
89- device_type: tpu
90- device_name: v4-8
91- cloud_runner: linux-x86-ct4p-240-4tpu
92- pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
93- xla_python_client_mem_fraction: 0.75
94- tf_force_gpu_allow_growth: false
95- container_resource_option: "--privileged"
96- is_scheduled_run: ${{ github.event_name == 'schedule' }}
85+ # tpu_unit_tests:
86+ # needs: tpu_image
87+ # uses: ./.github/workflows/run_tests_internal.yml
88+ # with:
89+ # device_type: tpu
90+ # device_name: v4-8
91+ # cloud_runner: linux-x86-ct4p-240-4tpu
92+ # pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
93+ # xla_python_client_mem_fraction: 0.75
94+ # tf_force_gpu_allow_growth: false
95+ # container_resource_option: "--privileged"
96+ # is_scheduled_run: ${{ github.event_name == 'schedule' }}
9797
98- tpu_pathways_unit_tests:
99- needs: tpu_image
100- uses: ./.github/workflows/run_pathways_tests_internal.yml
101- with:
102- device_type: tpu
103- device_name: v4-8
104- cloud_runner: linux-x86-ct4p-240-4tpu
105- pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
106- xla_python_client_mem_fraction: 0.75
107- tf_force_gpu_allow_growth: false
108- container_resource_option: "--privileged"
109- is_scheduled_run: ${{ github.event_name == 'schedule' }}
98+ # tpu_pathways_unit_tests:
99+ # needs: tpu_image
100+ # uses: ./.github/workflows/run_pathways_tests_internal.yml
101+ # with:
102+ # device_type: tpu
103+ # device_name: v4-8
104+ # cloud_runner: linux-x86-ct4p-240-4tpu
105+ # pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
106+ # xla_python_client_mem_fraction: 0.75
107+ # tf_force_gpu_allow_growth: false
108+ # container_resource_option: "--privileged"
109+ # is_scheduled_run: ${{ github.event_name == 'schedule' }}
110110
111- tpu_integration_tests:
112- needs: tpu_image
113- uses: ./.github/workflows/run_tests_internal.yml
114- with:
115- device_type: tpu
116- device_name: v4-8
117- cloud_runner: linux-x86-ct4p-240-4tpu
118- pytest_marker: 'not cpu_only and not gpu_only and integration_test'
119- xla_python_client_mem_fraction: 0.75
120- tf_force_gpu_allow_growth: false
121- container_resource_option: "--privileged"
122- is_scheduled_run: ${{ github.event_name == 'schedule' }}
111+ # tpu_integration_tests:
112+ # needs: tpu_image
113+ # uses: ./.github/workflows/run_tests_internal.yml
114+ # with:
115+ # device_type: tpu
116+ # device_name: v4-8
117+ # cloud_runner: linux-x86-ct4p-240-4tpu
118+ # pytest_marker: 'not cpu_only and not gpu_only and integration_test'
119+ # xla_python_client_mem_fraction: 0.75
120+ # tf_force_gpu_allow_growth: false
121+ # container_resource_option: "--privileged"
122+ # is_scheduled_run: ${{ github.event_name == 'schedule' }}
123123
124124 tpu_pathways_integration_tests:
125125 needs: tpu_image
@@ -134,37 +134,38 @@ jobs:
134134 container_resource_option: "--privileged"
135135 is_scheduled_run: ${{ github.event_name == 'schedule' }}
136136
137- gpu_unit_tests:
138- needs: gpu_image
139- uses: ./.github/workflows/run_tests_internal.yml
140- with:
141- device_type: gpu
142- device_name: a100-40gb-4
143- cloud_runner: linux-x86-a2-48-a100-4gpu
144- pytest_marker: 'not cpu_only and not tpu_only and not integration_test'
145- pytest_addopts: '--ignore=tests/sft_hooks_test.py'
146- xla_python_client_mem_fraction: 0.65
147- tf_force_gpu_allow_growth: true
148- container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
149- is_scheduled_run: ${{ github.event_name == 'schedule' }}
137+ # gpu_unit_tests:
138+ # needs: gpu_image
139+ # uses: ./.github/workflows/run_tests_internal.yml
140+ # with:
141+ # device_type: gpu
142+ # device_name: a100-40gb-4
143+ # cloud_runner: linux-x86-a2-48-a100-4gpu
144+ # pytest_marker: 'not cpu_only and not tpu_only and not integration_test'
145+ # pytest_addopts: '--ignore=tests/sft_hooks_test.py'
146+ # xla_python_client_mem_fraction: 0.65
147+ # tf_force_gpu_allow_growth: true
148+ # container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
149+ # is_scheduled_run: ${{ github.event_name == 'schedule' }}
150150
151- gpu_integration_tests:
152- needs: gpu_image
153- uses: ./.github/workflows/run_tests_internal.yml
154- with:
155- device_type: gpu
156- device_name: a100-40gb-4
157- cloud_runner: linux-x86-a2-48-a100-4gpu
158- pytest_marker: 'not cpu_only and not tpu_only and integration_test'
159- pytest_addopts: '--ignore=tests/sft_hooks_test.py'
160- xla_python_client_mem_fraction: 0.65
161- tf_force_gpu_allow_growth: true
162- container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
163- is_scheduled_run: ${{ github.event_name == 'schedule' }}
151+ # gpu_integration_tests:
152+ # needs: gpu_image
153+ # uses: ./.github/workflows/run_tests_internal.yml
154+ # with:
155+ # device_type: gpu
156+ # device_name: a100-40gb-4
157+ # cloud_runner: linux-x86-a2-48-a100-4gpu
158+ # pytest_marker: 'not cpu_only and not tpu_only and integration_test'
159+ # pytest_addopts: '--ignore=tests/sft_hooks_test.py'
160+ # xla_python_client_mem_fraction: 0.65
161+ # tf_force_gpu_allow_growth: true
162+ # container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
163+ # is_scheduled_run: ${{ github.event_name == 'schedule' }}
164164
165165 clean_up:
166166 if: ${{ always() }}
167- needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests]
167+ needs: [tpu_pathways_integration_tests]
168+ # needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests, tpu_pathways_integration_tests]
168169 name: "Clean up"
169170 runs-on: ["self-hosted"]
170171 permissions:
@@ -183,7 +184,8 @@ jobs:
183184
184185 notify_failure:
185186 name: Notify failed build # creates an issue or modifies last open existing issue for failed build
186- needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests]
187+ needs: [tpu_pathways_integration_tests]
188+ # needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests, tpu_pathways_integration_tests]
187189 if: ${{ always() }}
188190 runs-on: ubuntu-latest
189191 permissions:
@@ -211,52 +213,52 @@ jobs:
211213 # It will not fail if the labels don't exist.
212214 gh issue remove-label $ISSUE_NUMBER "success-run-1" "success-run-2" --repo $GH_REPO || echo "No success labels to remove."
213215
214- notify_success_and_close:
215- name: Close issue after 3 successful builds
216- # This job runs only if all the preceding test jobs succeeded
217- if: ${{ success() && github.event.pull_request == null && github.event_name != 'workflow_dispatch' }}
218- needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests]
219- runs-on: ubuntu-latest
220- permissions:
221- issues: write
222- steps:
223- - name: Find existing failure issue
224- id: find_issue
225- env:
226- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
227- GH_REPO: ${{ github.repository }}
228- run: |
229- ISSUE_NUMBER=$(gh issue list --label "failed-build" --state open --limit 1 --json number -q '.[0].number')
230- if [[ -z "$ISSUE_NUMBER" ]]; then
231- echo "No open build failure issue found. Nothing to do."
232- echo "issue_number=" >> $GITHUB_OUTPUT
233- else
234- echo "Found open build failure issue: #${ISSUE_NUMBER}"
235- echo "issue_number=${ISSUE_NUMBER}" >> $GITHUB_OUTPUT
236- fi
216+ # notify_success_and_close:
217+ # name: Close issue after 3 successful builds
218+ # # This job runs only if all the preceding test jobs succeeded
219+ # if: ${{ success() && github.event.pull_request == null && github.event_name != 'workflow_dispatch' }}
220+ # needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests]
221+ # runs-on: ubuntu-latest
222+ # permissions:
223+ # issues: write
224+ # steps:
225+ # - name: Find existing failure issue
226+ # id: find_issue
227+ # env:
228+ # GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
229+ # GH_REPO: ${{ github.repository }}
230+ # run: |
231+ # ISSUE_NUMBER=$(gh issue list --label "failed-build" --state open --limit 1 --json number -q '.[0].number')
232+ # if [[ -z "$ISSUE_NUMBER" ]]; then
233+ # echo "No open build failure issue found. Nothing to do."
234+ # echo "issue_number=" >> $GITHUB_OUTPUT
235+ # else
236+ # echo "Found open build failure issue: #${ISSUE_NUMBER}"
237+ # echo "issue_number=${ISSUE_NUMBER}" >> $GITHUB_OUTPUT
238+ # fi
237239
238- - name: Add success label or close issue
239- if: steps.find_issue.outputs.issue_number != ''
240- env:
241- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
242- GH_REPO: ${{ github.repository }}
243- run: |
244- ISSUE_NUMBER=${{ steps.find_issue.outputs.issue_number }}
245- LABELS=$(gh issue view $ISSUE_NUMBER --json labels -q '.labels[].name')
240+ # - name: Add success label or close issue
241+ # if: steps.find_issue.outputs.issue_number != ''
242+ # env:
243+ # GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
244+ # GH_REPO: ${{ github.repository }}
245+ # run: |
246+ # ISSUE_NUMBER=${{ steps.find_issue.outputs.issue_number }}
247+ # LABELS=$(gh issue view $ISSUE_NUMBER --json labels -q '.labels[].name')
246248
247- if echo "$LABELS" | grep -q "success-run-2"; then
248- echo "Third consecutive success. Closing issue #${ISSUE_NUMBER}."
249- gh issue comment $ISSUE_NUMBER --body "Build succeeded for the third consecutive time. Closing this issue automatically."
250- gh issue close $ISSUE_NUMBER
251- # Clean up all tracking labels
252- gh issue remove-label $ISSUE_NUMBER "failed-build" "success-run-2" --repo $GH_REPO
253- elif echo "$LABELS" | grep -q "success-run-1"; then
254- echo "Second consecutive success. Updating label on issue #${ISSUE_NUMBER}."
255- gh issue comment $ISSUE_NUMBER --body "Build succeeded for the second time. One more successful run will close this issue."
256- gh issue remove-label $ISSUE_NUMBER "success-run-1" --repo $GH_REPO
257- gh issue add-label $ISSUE_NUMBER "success-run-2" --repo $GH_REPO
258- else
259- echo "First consecutive success since failure. Adding label to issue #${ISSUE_NUMBER}."
260- gh issue comment $ISSUE_NUMBER --body "Build succeeded. This issue will be auto-closed after two more consecutive successful runs."
261- gh issue add-label $ISSUE_NUMBER "success-run-1" --repo $GH_REPO
262- fi
249+ # if echo "$LABELS" | grep -q "success-run-2"; then
250+ # echo "Third consecutive success. Closing issue #${ISSUE_NUMBER}."
251+ # gh issue comment $ISSUE_NUMBER --body "Build succeeded for the third consecutive time. Closing this issue automatically."
252+ # gh issue close $ISSUE_NUMBER
253+ # # Clean up all tracking labels
254+ # gh issue remove-label $ISSUE_NUMBER "failed-build" "success-run-2" --repo $GH_REPO
255+ # elif echo "$LABELS" | grep -q "success-run-1"; then
256+ # echo "Second consecutive success. Updating label on issue #${ISSUE_NUMBER}."
257+ # gh issue comment $ISSUE_NUMBER --body "Build succeeded for the second time. One more successful run will close this issue."
258+ # gh issue remove-label $ISSUE_NUMBER "success-run-1" --repo $GH_REPO
259+ # gh issue add-label $ISSUE_NUMBER "success-run-2" --repo $GH_REPO
260+ # else
261+ # echo "First consecutive success since failure. Adding label to issue #${ISSUE_NUMBER}."
262+ # gh issue comment $ISSUE_NUMBER --body "Build succeeded. This issue will be auto-closed after two more consecutive successful runs."
263+ # gh issue add-label $ISSUE_NUMBER "success-run-1" --repo $GH_REPO
264+ # fi
0 commit comments