Skip to content

Commit

Permalink
Add pod timeout
Browse files Browse the repository at this point in the history
Add an overall pod-timeout to
kill the run if the pods fail or
get stuck somehow.
  • Loading branch information
gargnitingoogle committed Sep 10, 2024
1 parent 08cd940 commit f835f14
Showing 1 changed file with 23 additions and 3 deletions.
26 changes: 23 additions & 3 deletions perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ readonly DEFAULT_GCSFUSE_MOUNT_OPTIONS="implicit-dirs"
# Test runtime configuration
# Default id for a test run: "<user>-<YYYYmmdd-HHMMSS>" at script start.
readonly DEFAULT_INSTANCE_ID=${USER}-$(date +%Y%m%d-%H%M%S)
# Default interval, in seconds, between two pod-status checks.
readonly DEFAULT_POD_WAIT_TIME_IN_SECONDS=300
# Default overall time budget, in seconds, for the pod runs. The script
# rejects pod_timeout_in_seconds <= pod_wait_time_in_seconds at startup, so
# this default is derived from the wait-time default to stay strictly greater
# than it (12 * 300 = 3600); two equal defaults would make every run that
# relies on the defaults exit with an error.
readonly DEFAULT_POD_TIMEOUT_IN_SECONDS=$((12 * DEFAULT_POD_WAIT_TIME_IN_SECONDS))

function printHelp() {
echo "Usage guide: "
Expand All @@ -144,6 +145,7 @@ function printHelp() {
echo "csi_src_dir=<\"/path/of/gcs-fuse-csi-driver/to/use/if/available\", default=\"${DEFAULT_SRC_DIR}\"/gcs-fuse-csi-driver>"
# Test runtime configuration
echo "pod_wait_time_in_seconds=<number e.g. 60 for checking pod status every 1 min, default=\"${DEFAULT_POD_WAIT_TIME_IN_SECONDS}\">"
echo "pod_timeout_in_seconds=<number e.g. 3600 for timing out pod runs, should be more than the value of pod_wait_time_in_seconds, default=\"${DEFAULT_POD_TIMEOUT_IN_SECONDS}\">"
echo "instance_id=<string, not containing spaces, representing unique id for particular test-run e.g. \"${DEFAULT_INSTANCE_ID}\""
echo "workload_config=<path/to/workload/configuration/file e.g. /a/b/c.json >"
echo "output_dir=</absolute/path/to/output/dir, output files will be written at output_dir/fio/output.csv and output_dir/dlio/output.csv>"
Expand Down Expand Up @@ -226,8 +228,14 @@ fi
# gcsfuse_mount_options is no longer accepted from the user: abort if it is
# set. The exit is done in the current shell (not in a "( ... )" subshell) so
# that the script really stops here even when errexit is not enabled.
if test -n "${gcsfuse_mount_options}"; then
  echo "gcsfuse_mount_options set by user is a deprecated option. Please set gcsfuseMountOptions in workload objects in workload configuration file in its place." >&2
  exit 1
fi
# Test runtime configuration
# Fall back to the defaults for every runtime option the user left empty.
test -n "${pod_wait_time_in_seconds}" || export pod_wait_time_in_seconds="${DEFAULT_POD_WAIT_TIME_IN_SECONDS}"
test -n "${pod_timeout_in_seconds}" || export pod_timeout_in_seconds="${DEFAULT_POD_TIMEOUT_IN_SECONDS}"
test -n "${instance_id}" || export instance_id="${DEFAULT_INSTANCE_ID}"

# Both durations are used in integer comparisons/arithmetic, so reject
# anything that is not a plain decimal integer. Leading zeros are rejected too
# because bash arithmetic would interpret such a value as octal.
_duration_regex='^(0|[1-9][0-9]*)$'
for _duration_name in pod_wait_time_in_seconds pod_timeout_in_seconds; do
  if ! [[ "${!_duration_name}" =~ ${_duration_regex} ]]; then
    echo "${_duration_name} (${!_duration_name}) must be a non-negative integer without leading zeros" >&2
    exit 1
  fi
done
unset _duration_name _duration_regex

# The overall timeout must be strictly greater than the polling interval.
if [[ ${pod_timeout_in_seconds} -le ${pod_wait_time_in_seconds} ]]; then
  echo "pod_timeout_in_seconds (${pod_timeout_in_seconds}) <= pod_wait_time_in_seconds (${pod_wait_time_in_seconds})" >&2
  exit 1
fi

if test -n "${workload_config}"; then
test -f "${workload_config}"
export workload_config="$(realpath "${workload_config}")"
Expand Down Expand Up @@ -265,6 +273,7 @@ function printRunParameters() {
echo "gke_testing_dir=\"${gke_testing_dir}\""
# Test runtime configuration
echo "pod_wait_time_in_seconds=\"${pod_wait_time_in_seconds}\""
echo "pod_timeout_in_seconds=\"${pod_timeout_in_seconds}\""
echo "instance_id=\"${instance_id}\""
echo "workload_config=\"${workload_config}\""
echo "output_dir=\"${output_dir}\""
Expand Down Expand Up @@ -628,9 +637,19 @@ function deployAllDlioHelmCharts() {
}
function waitTillAllPodsComplete() {
printf "\nScanning and waiting till all pods either complete or fail ...\n\n"
start_epoch=$(date +%s)
printf "\nScanning and waiting till all pods either complete/fail, or time out (start-time epoch = ${start_epoch} seconds, timeout duration = ${pod_timeout_in_seconds} seconds) ...\n\n"
while true; do
printf "Checking pods status at "$(date +%s)":\n-----------------------------------\n"
cur_epoch=$(date +%s)
time_till_timeout=$((start_epoch+pod_timeout_in_seconds-cur_epoch))
if [[ ${time_till_timeout} -lt 0 ]]; then
printf "\nPod-run timed out!\n\n"
printf "Clearing all pods created in this run...\n"
deleteAllPods
exit 1
fi
printf "Checking pods status at ${cur_epoch} seconds:\n"
printf " -----------------------------------------\n"
podslist="$(kubectl get pods --namespace=${appnamespace} -o wide)"
echo "${podslist}"
num_completed_pods=$(echo "${podslist}" | tail -n +2 | grep -i 'completed\|succeeded' | wc -l)
Expand All @@ -646,9 +665,10 @@ function waitTillAllPodsComplete() {
printf "\nAll pods completed.\n\n"
break
else
printf "\n${num_noncompleted_pods} pod(s) is/are still pending or running. Will check again in "${pod_wait_time_in_seconds}" seconds. Sleeping for now.\n\n"
printf "\n${num_noncompleted_pods} pod(s) is/are still pending or running (time till timeout=${time_till_timeout} seconds). Will check again in "${pod_wait_time_in_seconds}" seconds. Sleeping for now.\n\n"
printf "\nYou can take a break too if you want. Just kill this run and connect back to it later, for fetching and parsing outputs, using the following command: \n"
printf " only_parse=true instance_id=${instance_id} project_id=${project_id} project_number=${project_number} zone=${zone} machine_type=${machine_type} use_custom_csi_driver=${use_custom_csi_driver} gcsfuse_src_dir=\"${gcsfuse_src_dir}\" csi_src_dir=\"${csi_src_dir}\" pod_wait_time_in_seconds=${pod_wait_time_in_seconds} workload_config=\"${workload_config}\" cluster_name=${cluster_name} output_dir=\"${output_dir}\" $0 \n"
printf "\nbut remember that this will reset the start-timer for pod timeout.\n\n"
printf "\nTo ssh to any specific pod, use the following command: \n"
printf " gcloud container clusters get-credentials ${cluster_name} --location=${zone}\n"
printf " kubectl config set-context --current --namespace=${appnamespace}\n"
Expand Down

0 comments on commit f835f14

Please sign in to comment.