Skip to content

Commit 83a9416

Browse files
committed
only test tpu_pathways_integration_tests
1 parent 2264696 commit 83a9416

File tree

3 files changed

+142
-138
lines changed

3 files changed

+142
-138
lines changed

.github/workflows/RunTests.yml

Lines changed: 140 additions & 138 deletions
Original file line numberDiff line numberDiff line change
@@ -53,73 +53,73 @@ jobs:
5353
build_mode: jax_ai_image
5454
base_image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/tpu:latest
5555

56-
gpu_image:
57-
needs: prelim
58-
uses: ./.github/workflows/build_upload_internal.yml
59-
with:
60-
device_type: gpu
61-
device_name: a100-40gb-4
62-
cloud_runner: linux-x86-n2-16-buildkit
63-
build_mode: jax_ai_image
64-
base_image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/gpu:latest
56+
# gpu_image:
57+
# needs: prelim
58+
# uses: ./.github/workflows/build_upload_internal.yml
59+
# with:
60+
# device_type: gpu
61+
# device_name: a100-40gb-4
62+
# cloud_runner: linux-x86-n2-16-buildkit
63+
# build_mode: jax_ai_image
64+
# base_image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/gpu:latest
6565

66-
cpu_unit_tests:
67-
needs: tpu_image
68-
strategy:
69-
fail-fast: false
70-
matrix:
71-
worker_group: [1, 2, 3, 4]
72-
uses: ./.github/workflows/run_tests_internal.yml
73-
with:
74-
device_type: cpu
75-
device_name: X64
76-
image_type: tpu
77-
pytest_marker: 'cpu_only'
78-
xla_python_client_mem_fraction: 0.75
79-
tf_force_gpu_allow_growth: false
80-
container_resource_option: "--privileged"
81-
is_scheduled_run: ${{ github.event_name == 'schedule' }}
82-
worker_group: ${{ matrix.worker_group }}
83-
total_workers: 4
66+
# cpu_unit_tests:
67+
# needs: tpu_image
68+
# strategy:
69+
# fail-fast: false
70+
# matrix:
71+
# worker_group: [1, 2, 3, 4]
72+
# uses: ./.github/workflows/run_tests_internal.yml
73+
# with:
74+
# device_type: cpu
75+
# device_name: X64
76+
# image_type: tpu
77+
# pytest_marker: 'cpu_only'
78+
# xla_python_client_mem_fraction: 0.75
79+
# tf_force_gpu_allow_growth: false
80+
# container_resource_option: "--privileged"
81+
# is_scheduled_run: ${{ github.event_name == 'schedule' }}
82+
# worker_group: ${{ matrix.worker_group }}
83+
# total_workers: 4
8484

85-
tpu_unit_tests:
86-
needs: tpu_image
87-
uses: ./.github/workflows/run_tests_internal.yml
88-
with:
89-
device_type: tpu
90-
device_name: v4-8
91-
cloud_runner: linux-x86-ct4p-240-4tpu
92-
pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
93-
xla_python_client_mem_fraction: 0.75
94-
tf_force_gpu_allow_growth: false
95-
container_resource_option: "--privileged"
96-
is_scheduled_run: ${{ github.event_name == 'schedule' }}
85+
# tpu_unit_tests:
86+
# needs: tpu_image
87+
# uses: ./.github/workflows/run_tests_internal.yml
88+
# with:
89+
# device_type: tpu
90+
# device_name: v4-8
91+
# cloud_runner: linux-x86-ct4p-240-4tpu
92+
# pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
93+
# xla_python_client_mem_fraction: 0.75
94+
# tf_force_gpu_allow_growth: false
95+
# container_resource_option: "--privileged"
96+
# is_scheduled_run: ${{ github.event_name == 'schedule' }}
9797

98-
tpu_pathways_unit_tests:
99-
needs: tpu_image
100-
uses: ./.github/workflows/run_pathways_tests_internal.yml
101-
with:
102-
device_type: tpu
103-
device_name: v4-8
104-
cloud_runner: linux-x86-ct4p-240-4tpu
105-
pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
106-
xla_python_client_mem_fraction: 0.75
107-
tf_force_gpu_allow_growth: false
108-
container_resource_option: "--privileged"
109-
is_scheduled_run: ${{ github.event_name == 'schedule' }}
98+
# tpu_pathways_unit_tests:
99+
# needs: tpu_image
100+
# uses: ./.github/workflows/run_pathways_tests_internal.yml
101+
# with:
102+
# device_type: tpu
103+
# device_name: v4-8
104+
# cloud_runner: linux-x86-ct4p-240-4tpu
105+
# pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
106+
# xla_python_client_mem_fraction: 0.75
107+
# tf_force_gpu_allow_growth: false
108+
# container_resource_option: "--privileged"
109+
# is_scheduled_run: ${{ github.event_name == 'schedule' }}
110110

111-
tpu_integration_tests:
112-
needs: tpu_image
113-
uses: ./.github/workflows/run_tests_internal.yml
114-
with:
115-
device_type: tpu
116-
device_name: v4-8
117-
cloud_runner: linux-x86-ct4p-240-4tpu
118-
pytest_marker: 'not cpu_only and not gpu_only and integration_test'
119-
xla_python_client_mem_fraction: 0.75
120-
tf_force_gpu_allow_growth: false
121-
container_resource_option: "--privileged"
122-
is_scheduled_run: ${{ github.event_name == 'schedule' }}
111+
# tpu_integration_tests:
112+
# needs: tpu_image
113+
# uses: ./.github/workflows/run_tests_internal.yml
114+
# with:
115+
# device_type: tpu
116+
# device_name: v4-8
117+
# cloud_runner: linux-x86-ct4p-240-4tpu
118+
# pytest_marker: 'not cpu_only and not gpu_only and integration_test'
119+
# xla_python_client_mem_fraction: 0.75
120+
# tf_force_gpu_allow_growth: false
121+
# container_resource_option: "--privileged"
122+
# is_scheduled_run: ${{ github.event_name == 'schedule' }}
123123

124124
tpu_pathways_integration_tests:
125125
needs: tpu_image
@@ -134,37 +134,38 @@ jobs:
134134
container_resource_option: "--privileged"
135135
is_scheduled_run: ${{ github.event_name == 'schedule' }}
136136

137-
gpu_unit_tests:
138-
needs: gpu_image
139-
uses: ./.github/workflows/run_tests_internal.yml
140-
with:
141-
device_type: gpu
142-
device_name: a100-40gb-4
143-
cloud_runner: linux-x86-a2-48-a100-4gpu
144-
pytest_marker: 'not cpu_only and not tpu_only and not integration_test'
145-
pytest_addopts: '--ignore=tests/sft_hooks_test.py'
146-
xla_python_client_mem_fraction: 0.65
147-
tf_force_gpu_allow_growth: true
148-
container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
149-
is_scheduled_run: ${{ github.event_name == 'schedule' }}
137+
# gpu_unit_tests:
138+
# needs: gpu_image
139+
# uses: ./.github/workflows/run_tests_internal.yml
140+
# with:
141+
# device_type: gpu
142+
# device_name: a100-40gb-4
143+
# cloud_runner: linux-x86-a2-48-a100-4gpu
144+
# pytest_marker: 'not cpu_only and not tpu_only and not integration_test'
145+
# pytest_addopts: '--ignore=tests/sft_hooks_test.py'
146+
# xla_python_client_mem_fraction: 0.65
147+
# tf_force_gpu_allow_growth: true
148+
# container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
149+
# is_scheduled_run: ${{ github.event_name == 'schedule' }}
150150

151-
gpu_integration_tests:
152-
needs: gpu_image
153-
uses: ./.github/workflows/run_tests_internal.yml
154-
with:
155-
device_type: gpu
156-
device_name: a100-40gb-4
157-
cloud_runner: linux-x86-a2-48-a100-4gpu
158-
pytest_marker: 'not cpu_only and not tpu_only and integration_test'
159-
pytest_addopts: '--ignore=tests/sft_hooks_test.py'
160-
xla_python_client_mem_fraction: 0.65
161-
tf_force_gpu_allow_growth: true
162-
container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
163-
is_scheduled_run: ${{ github.event_name == 'schedule' }}
151+
# gpu_integration_tests:
152+
# needs: gpu_image
153+
# uses: ./.github/workflows/run_tests_internal.yml
154+
# with:
155+
# device_type: gpu
156+
# device_name: a100-40gb-4
157+
# cloud_runner: linux-x86-a2-48-a100-4gpu
158+
# pytest_marker: 'not cpu_only and not tpu_only and integration_test'
159+
# pytest_addopts: '--ignore=tests/sft_hooks_test.py'
160+
# xla_python_client_mem_fraction: 0.65
161+
# tf_force_gpu_allow_growth: true
162+
# container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
163+
# is_scheduled_run: ${{ github.event_name == 'schedule' }}
164164

165165
clean_up:
166166
if: ${{ always() }}
167-
needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests]
167+
needs: [tpu_pathways_integration_tests]
168+
# needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests, tpu_pathways_integration_tests]
168169
name: "Clean up"
169170
runs-on: ["self-hosted"]
170171
permissions:
@@ -183,7 +184,8 @@ jobs:
183184

184185
notify_failure:
185186
name: Notify failed build # creates an issue, or updates the most recent open issue, for a failed build
186-
needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests]
187+
needs: [tpu_pathways_integration_tests]
188+
# needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests, tpu_pathways_integration_tests]
187189
if: ${{ always() }}
188190
runs-on: ubuntu-latest
189191
permissions:
@@ -211,52 +213,52 @@ jobs:
211213
# It will not fail if the labels don't exist.
212214
gh issue remove-label $ISSUE_NUMBER "success-run-1" "success-run-2" --repo $GH_REPO || echo "No success labels to remove."
213215
214-
notify_success_and_close:
215-
name: Close issue after 3 successful builds
216-
# This job runs only if all the preceding test jobs succeeded
217-
if: ${{ success() && github.event.pull_request == null && github.event_name != 'workflow_dispatch' }}
218-
needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests]
219-
runs-on: ubuntu-latest
220-
permissions:
221-
issues: write
222-
steps:
223-
- name: Find existing failure issue
224-
id: find_issue
225-
env:
226-
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
227-
GH_REPO: ${{ github.repository }}
228-
run: |
229-
ISSUE_NUMBER=$(gh issue list --label "failed-build" --state open --limit 1 --json number -q '.[0].number')
230-
if [[ -z "$ISSUE_NUMBER" ]]; then
231-
echo "No open build failure issue found. Nothing to do."
232-
echo "issue_number=" >> $GITHUB_OUTPUT
233-
else
234-
echo "Found open build failure issue: #${ISSUE_NUMBER}"
235-
echo "issue_number=${ISSUE_NUMBER}" >> $GITHUB_OUTPUT
236-
fi
216+
# notify_success_and_close:
217+
# name: Close issue after 3 successful builds
218+
# # This job runs only if all the preceding test jobs succeeded
219+
# if: ${{ success() && github.event.pull_request == null && github.event_name != 'workflow_dispatch' }}
220+
# needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests]
221+
# runs-on: ubuntu-latest
222+
# permissions:
223+
# issues: write
224+
# steps:
225+
# - name: Find existing failure issue
226+
# id: find_issue
227+
# env:
228+
# GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
229+
# GH_REPO: ${{ github.repository }}
230+
# run: |
231+
# ISSUE_NUMBER=$(gh issue list --label "failed-build" --state open --limit 1 --json number -q '.[0].number')
232+
# if [[ -z "$ISSUE_NUMBER" ]]; then
233+
# echo "No open build failure issue found. Nothing to do."
234+
# echo "issue_number=" >> $GITHUB_OUTPUT
235+
# else
236+
# echo "Found open build failure issue: #${ISSUE_NUMBER}"
237+
# echo "issue_number=${ISSUE_NUMBER}" >> $GITHUB_OUTPUT
238+
# fi
237239

238-
- name: Add success label or close issue
239-
if: steps.find_issue.outputs.issue_number != ''
240-
env:
241-
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
242-
GH_REPO: ${{ github.repository }}
243-
run: |
244-
ISSUE_NUMBER=${{ steps.find_issue.outputs.issue_number }}
245-
LABELS=$(gh issue view $ISSUE_NUMBER --json labels -q '.labels[].name')
240+
# - name: Add success label or close issue
241+
# if: steps.find_issue.outputs.issue_number != ''
242+
# env:
243+
# GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
244+
# GH_REPO: ${{ github.repository }}
245+
# run: |
246+
# ISSUE_NUMBER=${{ steps.find_issue.outputs.issue_number }}
247+
# LABELS=$(gh issue view $ISSUE_NUMBER --json labels -q '.labels[].name')
246248

247-
if echo "$LABELS" | grep -q "success-run-2"; then
248-
echo "Third consecutive success. Closing issue #${ISSUE_NUMBER}."
249-
gh issue comment $ISSUE_NUMBER --body "Build succeeded for the third consecutive time. Closing this issue automatically."
250-
gh issue close $ISSUE_NUMBER
251-
# Clean up all tracking labels
252-
gh issue remove-label $ISSUE_NUMBER "failed-build" "success-run-2" --repo $GH_REPO
253-
elif echo "$LABELS" | grep -q "success-run-1"; then
254-
echo "Second consecutive success. Updating label on issue #${ISSUE_NUMBER}."
255-
gh issue comment $ISSUE_NUMBER --body "Build succeeded for the second time. One more successful run will close this issue."
256-
gh issue remove-label $ISSUE_NUMBER "success-run-1" --repo $GH_REPO
257-
gh issue add-label $ISSUE_NUMBER "success-run-2" --repo $GH_REPO
258-
else
259-
echo "First consecutive success since failure. Adding label to issue #${ISSUE_NUMBER}."
260-
gh issue comment $ISSUE_NUMBER --body "Build succeeded. This issue will be auto-closed after two more consecutive successful runs."
261-
gh issue add-label $ISSUE_NUMBER "success-run-1" --repo $GH_REPO
262-
fi
249+
# if echo "$LABELS" | grep -q "success-run-2"; then
250+
# echo "Third consecutive success. Closing issue #${ISSUE_NUMBER}."
251+
# gh issue comment $ISSUE_NUMBER --body "Build succeeded for the third consecutive time. Closing this issue automatically."
252+
# gh issue close $ISSUE_NUMBER
253+
# # Clean up all tracking labels
254+
# gh issue remove-label $ISSUE_NUMBER "failed-build" "success-run-2" --repo $GH_REPO
255+
# elif echo "$LABELS" | grep -q "success-run-1"; then
256+
# echo "Second consecutive success. Updating label on issue #${ISSUE_NUMBER}."
257+
# gh issue comment $ISSUE_NUMBER --body "Build succeeded for the second time. One more successful run will close this issue."
258+
# gh issue remove-label $ISSUE_NUMBER "success-run-1" --repo $GH_REPO
259+
# gh issue add-label $ISSUE_NUMBER "success-run-2" --repo $GH_REPO
260+
# else
261+
# echo "First consecutive success since failure. Adding label to issue #${ISSUE_NUMBER}."
262+
# gh issue comment $ISSUE_NUMBER --body "Build succeeded. This issue will be auto-closed after two more consecutive successful runs."
263+
# gh issue add-label $ISSUE_NUMBER "success-run-1" --repo $GH_REPO
264+
# fi

.github/workflows/run_pathways_tests_internal.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ jobs:
6464
IFRT_PROXY_USE_INSECURE_GRPC_CREDENTIALS: true
6565
JAX_PLATFORMS: "proxy"
6666
JAX_BACKEND_TARGET: "grpc://localhost:29000"
67+
JAX_COORDINATOR_ADDRESS: "localhost"
6768
options: ${{ inputs.container_resource_option }}
6869
steps:
6970
- uses: actions/checkout@v4

src/MaxText/max_utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ def maybe_initialize_jax_distributed_system(raw_keys):
161161
162162
For CPUs, we call jax.distributed.initialize() explicitly, with the specified arguments.
163163
"""
164+
print(f"LOG: maybe_initialize_jax_distributed_system - {raw_keys = }")
164165
if raw_keys["skip_jax_distributed_system"]:
165166
max_logging.log("Skipping jax distributed system due to skip_jax_distributed_system=True flag.")
166167
return

0 commit comments

Comments
 (0)