Skip to content

Commit 3dfe3c1

Browse files
hbelmiro authored and R3hankhan123 committed
test: Improvements to wait_for_pods function (kubeflow#11162)
Signed-off-by: hbelmiro <helber.belmiro@gmail.com>
1 parent 82a6758 commit 3dfe3c1

12 files changed

+142
-74
lines changed

.github/workflows/backend.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,17 +34,17 @@ jobs:
3434
steps:
3535
- name: Checkout code
3636
uses: actions/checkout@v4
37-
- name: Create KFP cluster
38-
uses: ./.github/actions/kfp-tekton-cluster
39-
- name: Set up Python 3.10
37+
- name: Set up Python 3.9
4038
uses: actions/setup-python@v4
4139
with:
42-
python-version: '3.10'
40+
python-version: '3.9'
4341
- name: Install sdk
4442
run: |
4543
python3 -m venv .venv
4644
. .venv/bin/activate
4745
pip install -e sdk/python
46+
- name: Create KFP cluster
47+
uses: ./.github/actions/kfp-tekton-cluster
4848
- name: "flip coin test"
4949
run: |
5050
. .venv/bin/activate

.github/workflows/e2e-test.yml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@ jobs:
2323
- name: Checkout code
2424
uses: actions/checkout@v4
2525

26+
- name: Set up Python
27+
uses: actions/setup-python@v4
28+
with:
29+
python-version: 3.9
30+
2631
- name: Create KFP cluster
2732
uses: ./.github/actions/kfp-cluster
2833

@@ -46,6 +51,11 @@ jobs:
4651
- name: Checkout code
4752
uses: actions/checkout@v4
4853

54+
- name: Set up Python
55+
uses: actions/setup-python@v4
56+
with:
57+
python-version: 3.9
58+
4959
- name: Create KFP cluster
5060
uses: ./.github/actions/kfp-cluster
5161

@@ -69,6 +79,11 @@ jobs:
6979
- name: Checkout code
7080
uses: actions/checkout@v4
7181

82+
- name: Set up Python
83+
uses: actions/setup-python@v4
84+
with:
85+
python-version: 3.9
86+
7287
- name: Create KFP cluster
7388
uses: ./.github/actions/kfp-cluster
7489

@@ -92,6 +107,11 @@ jobs:
92107
- name: Checkout code
93108
uses: actions/checkout@v4
94109

110+
- name: Set up Python
111+
uses: actions/setup-python@v4
112+
with:
113+
python-version: 3.9
114+
95115
- name: Create KFP cluster
96116
uses: ./.github/actions/kfp-cluster
97117

@@ -115,6 +135,11 @@ jobs:
115135
- name: Checkout code
116136
uses: actions/checkout@v4
117137

138+
- name: Set up Python
139+
uses: actions/setup-python@v4
140+
with:
141+
python-version: 3.9
142+
118143
- name: Create KFP cluster
119144
uses: ./.github/actions/kfp-cluster
120145

@@ -144,6 +169,11 @@ jobs:
144169
- name: Checkout code
145170
uses: actions/checkout@v4
146171

172+
- name: Set up Python
173+
uses: actions/setup-python@v4
174+
with:
175+
python-version: 3.9
176+
147177
- name: Create KFP cluster
148178
uses: ./.github/actions/kfp-cluster
149179

.github/workflows/kfp-kubernetes-execution-tests.yml

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ on:
77
pull_request:
88
paths:
99
- '.github/workflows/kfp-kubernetes-execution-tests.yml'
10+
- 'scripts/deploy/github/**'
1011
- 'sdk/python/**'
1112
- 'api/v2alpha1/**'
1213
- 'kubernetes_platform/**'
@@ -18,17 +19,17 @@ jobs:
1819
- name: Checkout code
1920
uses: actions/checkout@v4
2021

22+
- name: Set up Python
23+
uses: actions/setup-python@v4
24+
with:
25+
python-version: '3.9'
26+
2127
- name: Create KFP cluster
2228
uses: ./.github/actions/kfp-cluster
2329

2430
- name: Forward API port
2531
run: ./scripts/deploy/github/forward-port.sh "kubeflow" "ml-pipeline" 8888 8888
2632

27-
- name: Set up Python
28-
uses: actions/setup-python@v4
29-
with:
30-
python-version: '3.9'
31-
3233
- name: apt-get update
3334
run: sudo apt-get update
3435

.github/workflows/kfp-samples.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ on:
66
- master
77
pull_request:
88
paths:
9+
- 'scripts/deploy/github/**'
910
- 'samples/**'
1011
- 'backend/src/v2/**'
1112
- '.github/workflows/kfp-samples.yml'
@@ -21,7 +22,7 @@ jobs:
2122
- name: Set up Python
2223
uses: actions/setup-python@v2
2324
with:
24-
python-version: 3.8
25+
python-version: 3.9
2526

2627
- name: Create KFP cluster
2728
uses: ./.github/actions/kfp-cluster

.github/workflows/kubeflow-pipelines-integration-v2.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ on:
77
pull_request:
88
paths:
99
- '.github/workflows/kubeflow-pipelines-integration-v2.yml'
10+
- 'scripts/deploy/github/**'
1011
- 'samples'
1112
- 'core'
1213
- 'backend'

.github/workflows/periodic.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ jobs:
1010
steps:
1111
- name: Checkout repository
1212
uses: actions/checkout@v4
13+
- name: Set up Python
14+
uses: actions/setup-python@v4
15+
with:
16+
python-version: 3.9
1317
- name: Create KFP cluster
1418
uses: ./.github/actions/kfp-cluster
1519
- name: Port forward kfp apiserver

.github/workflows/sdk-execution.yml

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ on:
77
pull_request:
88
paths:
99
- '.github/workflows/sdk-execution.yml'
10+
- 'scripts/deploy/github/**'
1011
- 'sdk/python/**'
1112
- 'api/v2alpha1/**'
1213

@@ -17,17 +18,17 @@ jobs:
1718
- name: Checkout code
1819
uses: actions/checkout@v4
1920

21+
- name: Set up Python
22+
uses: actions/setup-python@v4
23+
with:
24+
python-version: 3.9
25+
2026
- name: Create KFP cluster
2127
uses: ./.github/actions/kfp-cluster
2228

2329
- name: Forward API port
2430
run: ./scripts/deploy/github/forward-port.sh "kubeflow" "ml-pipeline" 8888 8888
2531

26-
- name: Set up Python
27-
uses: actions/setup-python@v4
28-
with:
29-
python-version: 3.8
30-
3132
- name: apt-get update
3233
run: sudo apt-get update
3334

.github/workflows/upgrade-test.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ on:
77
pull_request:
88
paths:
99
- '.github/workflows/upgrade-test.yml'
10+
- 'scripts/deploy/github/**'
1011
- 'backend/**'
1112
- 'manifests/kustomize/**'
1213

@@ -17,6 +18,11 @@ jobs:
1718
- name: Checkout code
1819
uses: actions/checkout@v4
1920

21+
- name: Set up Python
22+
uses: actions/setup-python@v4
23+
with:
24+
python-version: 3.9
25+
2026
- name: Create KFP cluster
2127
uses: ./.github/actions/kfp-cluster
2228

scripts/deploy/github/deploy-kfp-tekton.sh

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,16 +40,14 @@ then
4040
exit 1
4141
fi
4242

43-
# Check if all pods are running - allow 20 retries (10 minutes)
44-
wait_for_pods kubeflow 40 30 || EXIT_CODE=$?
43+
# Check if all pods are running (waits up to 10 minutes)
44+
wait_for_pods || EXIT_CODE=$?
4545
if [[ $EXIT_CODE -ne 0 ]]
4646
then
4747
echo "Deploy unsuccessful. Not all pods running."
4848
exit 1
4949
fi
5050

51-
echo "List Kubeflow: "
52-
kubectl get pod -n kubeflow
5351
collect_artifacts kubeflow
5452

5553
echo "List Tekton control plane: "

scripts/deploy/github/deploy-kfp.sh

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,16 +41,14 @@ then
4141
exit 1
4242
fi
4343

44-
# Check if all pods are running - allow 20 retries (10 minutes)
45-
wait_for_pods kubeflow 40 30 || EXIT_CODE=$?
44+
# Check if all pods are running (waits up to 10 minutes)
45+
wait_for_pods || EXIT_CODE=$?
4646
if [[ $EXIT_CODE -ne 0 ]]
4747
then
4848
echo "Deploy unsuccessful. Not all pods running."
4949
exit 1
5050
fi
5151

52-
echo "List Kubeflow: "
53-
kubectl get pod -n kubeflow
5452
collect_artifacts kubeflow
5553

5654
echo "Finished KFP deployment."

scripts/deploy/github/helper-functions.sh

Lines changed: 3 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -56,57 +56,9 @@ wait_for_namespace () {
5656
}
5757

5858
wait_for_pods () {
59-
if [[ $# -ne 3 ]]
60-
then
61-
echo "Usage: wait_for_pods namespace max_retries sleep_time"
62-
return 1
63-
fi
64-
65-
local namespace=$1
66-
local max_retries=$2
67-
local sleep_time=$3
68-
69-
local i=0
70-
71-
while [[ $i -lt $max_retries ]]
72-
do
73-
local pods
74-
local statuses
75-
local num_pods
76-
local num_running
77-
pods=$(kubectl get pod -n "$namespace")
78-
# echo "$pods"
79-
# kubectl get pvc -n "$namespace"
80-
81-
if [[ -z $pods ]]
82-
then
83-
echo "no pod is up yet"
84-
else
85-
# Using quotations around variables to keep column format in echo
86-
# Remove 1st line (header line) -> trim whitespace -> cut statuses column (3rd column)
87-
# Might be overkill to parse down to specific columns :).
88-
statuses=$(echo "$pods" | tail -n +2 | tr -s ' ' | cut -d ' ' -f 3)
89-
num_pods=$(echo "$statuses" | wc -l | xargs)
90-
num_running=$(echo "$statuses" | grep -ow "Running\|Completed" | wc -l | xargs)
91-
92-
local msg="${num_running}/${num_pods} pods running in \"${namespace}\"."
93-
94-
if [[ $num_running -ne $num_pods ]]
95-
then
96-
# for debugging
97-
# kubectl get pod -n "$namespace" | grep '0/1' | awk '{print $1}' | xargs kubectl describe pod -n "$namespace"
98-
echo "$msg Checking again in ${sleep_time}s."
99-
else
100-
echo "$msg"
101-
return 0
102-
fi
103-
fi
104-
105-
sleep "$sleep_time"
106-
i=$((i+1))
107-
done
108-
109-
return 1
59+
C_DIR="${BASH_SOURCE%/*}"
60+
pip install -r "${C_DIR}"/../../../sdk/python/requirements.txt
61+
python "${C_DIR}"/kfp-readiness/wait_for_pods.py
11062
}
11163

11264
deploy_with_retries () {
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import logging
2+
import time
3+
import urllib3
4+
import sys
5+
from kubernetes import client, config
6+
7+
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
8+
9+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
10+
11+
namespace = 'kubeflow'
12+
13+
config.load_kube_config()
14+
v1 = client.CoreV1Api()
15+
16+
17+
def get_pod_statuses():
    """Snapshot the pods of the module-level ``namespace``.

    Returns:
        dict: Pod name mapped to a ``(phase, ready_containers,
        total_containers)`` tuple. A pod that reports no container statuses
        is recorded as ``0/0`` containers.
    """
    snapshot = {}
    for item in v1.list_namespaced_pod(namespace=namespace).items:
        # container_statuses is None until the kubelet reports on the pod.
        containers = item.status.container_statuses or []
        ready_count = len([c for c in containers if c.ready])
        snapshot[item.metadata.name] = (
            item.status.phase,
            ready_count,
            len(containers),
        )
    return snapshot
28+
29+
30+
def all_pods_ready(statuses):
    """Return True when every pod in *statuses* has finished starting up.

    Args:
        statuses: Mapping of pod name to a ``(phase, ready_containers,
            total_containers)`` tuple.

    Returns:
        bool: True only if at least one pod is present and each pod is either
        ``Running`` with all of its containers ready, or ``Succeeded``.
    """
    # all() is vacuously true for an empty mapping, which would report a
    # namespace whose pods have not been created yet as ready.
    if not statuses:
        return False
    # A Succeeded pod (e.g. a completed job) is terminal: it never returns to
    # Running and its containers are no longer "ready", so it must count as
    # done rather than block the wait until the timeout.
    return all(
        pod_status == 'Succeeded' or (pod_status == 'Running' and ready == total)
        for pod_status, ready, total in statuses.values()
    )
33+
34+
35+
def check_pods(calm_time=10, timeout=600, retries_after_ready=5):
    """Block until the pods are ready and their statuses have stopped changing.

    Polls ``get_pod_statuses`` and counts consecutive polls whose snapshot is
    identical to the previous one and passes ``all_pods_ready``. Any change,
    or any pod that is not ready, resets the count.

    Args:
        calm_time: Seconds to sleep between polls.
        timeout: Maximum number of seconds to keep polling.
        retries_after_ready: Number of consecutive unchanged, fully ready
            polls required before the pods are considered stable.

    Raises:
        TimeoutError: If the pods did not stabilize before *timeout* elapsed.
            ``TimeoutError`` is an ``Exception`` subclass, so callers that
            catch ``Exception`` are unaffected.
    """
    # Use the monotonic clock so that wall-clock adjustments on the runner
    # cannot shorten or extend the deadline.
    deadline = time.monotonic() + timeout
    stable_count = 0
    previous_statuses = {}

    while time.monotonic() < deadline:
        current_statuses = get_pod_statuses()

        logging.info("Checking pod statuses...")
        for pod_name, (pod_status, ready, total) in current_statuses.items():
            logging.info("Pod %s - Status: %s, Ready: %s/%s",
                         pod_name, pod_status, ready, total)

        if current_statuses == previous_statuses and all_pods_ready(current_statuses):
            stable_count += 1
            if stable_count >= retries_after_ready:
                logging.info("All pods are calm and fully ready.")
                break
            logging.info(
                "Pods are calm but have only been stable for %s/%s retries.",
                stable_count, retries_after_ready)
        else:
            # Either something changed since the last poll or a pod is not
            # ready: the stability streak starts over.
            stable_count = 0

        previous_statuses = current_statuses
        logging.info("Pods are still stabilizing. Retrying in %s seconds...", calm_time)
        time.sleep(calm_time)
    else:
        # The loop condition failed without a break: the deadline passed.
        raise TimeoutError("Pods did not stabilize within the timeout period.")

    logging.info("Final pod statuses:")
    for pod_name, status in previous_statuses.items():
        pod_status, ready, total = status
        # Reuse the predicate that ended the wait so this report can never
        # disagree with the decision that the pods are ready.
        if all_pods_ready({pod_name: status}):
            logging.info("Pod %s is fully ready (%s/%s)", pod_name, ready, total)
        else:
            logging.info("Pod %s is not ready (Status: %s, Ready: %s/%s)",
                         pod_name, pod_status, ready, total)
73+
74+
75+
if __name__ == "__main__":
76+
check_pods()

0 commit comments

Comments
 (0)