From f835f142a088b390875e26ad07180db79b55178c Mon Sep 17 00:00:00 2001 From: Nitin Garg Date: Tue, 10 Sep 2024 11:57:00 +0000 Subject: [PATCH] Add pod timeout Add an overall pod-timeout to kill the run if the pods fail or get stuck somehow. The default timeout is 604800 seconds (1 week); it must stay greater than the default pod_wait_time_in_seconds (300), because the script exits with an error when pod_timeout_in_seconds <= pod_wait_time_in_seconds. --- .../testing_on_gke/examples/run-gke-tests.sh | 26 ++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh b/perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh index 94afba16d2..a6e6082d7b 100755 --- a/perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh +++ b/perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh @@ -119,6 +119,7 @@ readonly DEFAULT_GCSFUSE_MOUNT_OPTIONS="implicit-dirs" # Test runtime configuration readonly DEFAULT_INSTANCE_ID=${USER}-$(date +%Y%m%d-%H%M%S) readonly DEFAULT_POD_WAIT_TIME_IN_SECONDS=300 +readonly DEFAULT_POD_TIMEOUT_IN_SECONDS=604800 function printHelp() { echo "Usage guide: " @@ -144,6 +145,7 @@ function printHelp() { echo "csi_src_dir=<\"/path/of/gcs-fuse-csi-driver/to/use/if/available\", default=\"${DEFAULT_SRC_DIR}\"/gcs-fuse-csi-driver>" # Test runtime configuration echo "pod_wait_time_in_seconds=" + echo "pod_timeout_in_seconds=" echo "instance_id=" echo "output_dir=" @@ -226,8 +228,14 @@ fi test -z "${gcsfuse_mount_options}" || (echo "gcsfuse_mount_options set by user is a deprecated option. Please set gcsfuseMountOptions in workload objects in workload configuration file in its place." 
&& exit 1) # Test runtime configuration test -n "${pod_wait_time_in_seconds}" || export pod_wait_time_in_seconds="${DEFAULT_POD_WAIT_TIME_IN_SECONDS}" +test -n "${pod_timeout_in_seconds}" || export pod_timeout_in_seconds="${DEFAULT_POD_TIMEOUT_IN_SECONDS}" test -n "${instance_id}" || export instance_id="${DEFAULT_INSTANCE_ID}" +if [[ ${pod_timeout_in_seconds} -le ${pod_wait_time_in_seconds} ]]; then + echo "pod_timeout_in_seconds (${pod_timeout_in_seconds}) <= pod_wait_time_in_seconds (${pod_wait_time_in_seconds})" + exit 1 +fi + if test -n "${workload_config}"; then test -f "${workload_config}" export workload_config="$(realpath "${workload_config}")" @@ -265,6 +273,7 @@ function printRunParameters() { echo "gke_testing_dir=\"${gke_testing_dir}\"" # Test runtime configuration echo "pod_wait_time_in_seconds=\"${pod_wait_time_in_seconds}\"" + echo "pod_timeout_in_seconds=\"${pod_timeout_in_seconds}\"" echo "instance_id=\"${instance_id}\"" echo "workload_config=\"${workload_config}\"" echo "output_dir=\"${output_dir}\"" @@ -628,9 +637,19 @@ function deployAllDlioHelmCharts() { } function waitTillAllPodsComplete() { - printf "\nScanning and waiting till all pods either complete or fail ...\n\n" + start_epoch=$(date +%s) + printf "\nScanning and waiting till all pods either complete/fail, or time out (start-time epoch = ${start_epoch} seconds, timeout duration = ${pod_timeout_in_seconds} seconds) ...\n\n" while true; do - printf "Checking pods status at "$(date +%s)":\n-----------------------------------\n" + cur_epoch=$(date +%s) + time_till_timeout=$((start_epoch+pod_timeout_in_seconds-cur_epoch)) + if [[ ${time_till_timeout} -lt 0 ]]; then + printf "\nPod-run timed out!\n\n" + printf "Clearing all pods created in this run...\n" + deleteAllPods + exit 1 + fi + printf "Checking pods status at ${cur_epoch} seconds:\n" + printf " -----------------------------------------\n" podslist="$(kubectl get pods --namespace=${appnamespace} -o wide)" echo "${podslist}" 
num_completed_pods=$(echo "${podslist}" | tail -n +2 | grep -i 'completed\|succeeded' | wc -l) @@ -646,9 +665,10 @@ function waitTillAllPodsComplete() { printf "\nAll pods completed.\n\n" break else - printf "\n${num_noncompleted_pods} pod(s) is/are still pending or running. Will check again in "${pod_wait_time_in_seconds}" seconds. Sleeping for now.\n\n" + printf "\n${num_noncompleted_pods} pod(s) is/are still pending or running (time till timeout=${time_till_timeout} seconds). Will check again in "${pod_wait_time_in_seconds}" seconds. Sleeping for now.\n\n" printf "\nYou can take a break too if you want. Just kill this run and connect back to it later, for fetching and parsing outputs, using the following command: \n" printf " only_parse=true instance_id=${instance_id} project_id=${project_id} project_number=${project_number} zone=${zone} machine_type=${machine_type} use_custom_csi_driver=${use_custom_csi_driver} gcsfuse_src_dir=\"${gcsfuse_src_dir}\" csi_src_dir=\"${csi_src_dir}\" pod_wait_time_in_seconds=${pod_wait_time_in_seconds} workload_config=\"${workload_config}\" cluster_name=${cluster_name} output_dir=\"${output_dir}\" $0 \n" + printf "\nbut remember that this will reset the start-timer for pod timeout.\n\n" printf "\nTo ssh to any specific pod, use the following command: \n" printf " gcloud container clusters get-credentials ${cluster_name} --location=${zone}\n" printf " kubectl config set-context --current --namespace=${appnamespace}\n"