From 7e90fad061bbaccc73548cbf4d10296f5125b6e3 Mon Sep 17 00:00:00 2001 From: TerrenceMcGuinness-NOAA Date: Mon, 27 Nov 2023 19:26:50 +0000 Subject: [PATCH 01/42] added PID kill on label change --- ci/scripts/driver.sh | 84 ++++++++++++++++++++++++++---------- ci/scripts/utils/ci_utils.sh | 16 +++++++ 2 files changed, 77 insertions(+), 23 deletions(-) create mode 100644 ci/scripts/utils/ci_utils.sh diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 7988ff17a1..e5a3559cdc 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -48,6 +48,7 @@ esac # setup runtime env for correct python install and git ###################################################### set +x +source "${ROOT_DIR}/ci/scipts/ci_utils.sh" source "${ROOT_DIR}/ush/module-setup.sh" module use "${ROOT_DIR}/modulefiles" module load "module_gwsetup.${MACHINE_ID}" @@ -68,24 +69,54 @@ pr_list=$(${GH} pr list --repo "${REPO_URL}" --label "CI-${MACHINE_ID^}-Ready" - for pr in ${pr_list}; do pr_dir="${GFS_CI_ROOT}/PR/${pr}" db_list=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --add_pr "${pr}" --dbfile "${pr_list_dbfile}") - pr_id=0 + output_ci="${pr_dir}/output_build_${id}" + output_ci_single="${GFS_CI_ROOT}/PR/${pr}/output_driver_single.log" ############################################################# # Check if a Ready labeled PR has changed back from once set # and in that case remove all previous jobs in scheduler and # and remove PR from filesystem to start clean ############################################################# if [[ "${db_list}" == *"already is in list"* ]]; then - pr_id=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --display "${pr}" | awk '{print $4}') || true - pr_id=$((pr_id+1)) - "${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Ready "${pr_id}" - for cases in "${pr_dir}/RUNTESTS/"*; do - if [[ -z "${cases+x}" ]]; then - break + driver_ID=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" 
--dbfile "${pr_list_dbfile}" --display "${pr}" | awk '{print $4}') || true + driver_PID=$(echo "${driver_ID}" | cut -d":" -f1) || true + driver_HOST=$(echo "${driver_ID}" | cut -d":" -f2) || true + host_name=$(hostname -s) + + { + echo "PR:${pr} Reset to ${MACHINE_ID^}-Ready by user and is now restarting CI tests on $(date +'%A %b %Y')" || true + } >> "${output_ci_single}" + + if [[ "${driver_PID}" -ne 0 ]]; then + if [[ "${driver_PID}" -ne "$$" ]]; then + echo "Driver PID: ${driver_PID} no longer running this build having it killed" + if [[ "${driver_HOST}" == "${host_name}" ]]; then + kill -9 "${driver_PID}" + else + ssh "${driver_HOST}" kill -9 "${driver_PID}" + fi + { + echo "Driver PID: ${driver_PID} on ${driver_HOST} is no longer running this test" + echo "Driver_PID: has restarted as {$$} on ${driver_HOST}" + } >> "${output_ci_single}" + } fi - pslot=$(basename "${cases}") - sacct --format=jobid,jobname%35,WorkDir%100,stat | grep "${pslot}" | grep "PR\/${pr}\/RUNTESTS" | awk '{print $1}' | xargs scancel || true - done + fi + + experiments=$(find "${pr_dir}/RUNTESTS/EXPDIR" -mindepth 1 -maxdepth 1 -type d) || true + if [[ -z "${experiments}" ]]; then + echo "No current experiments to cancel in PR: ${pr} on ${MACHINE_ID^}" >> "${output_ci_single}" + else + for cases in ${experiments}; do + case_name=$(basename "${cases}") + cancel_slurm_jobs "${case_name}" + { + echo "Canceled all jobs for experiment ${case_name} in PR:${pr} on ${MACHINE_ID^}" + } >> "${output_ci_single}" + done + fi rm -Rf "${pr_dir}" + sed -i "1 i\`\`\`" "${output_ci_single}" + "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}" fi done @@ -110,28 +141,35 @@ for pr in ${pr_list}; do if [[ -z "${pr_building+x}" ]]; then continue fi - "${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Ready" --add-label "CI-${MACHINE_ID^}-Building" - "${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open 
Building - echo "Processing Pull Request #${pr}" pr_dir="${GFS_CI_ROOT}/PR/${pr}" + output_ci="${pr_dir}/output_build_${id}" + output_ci_single="${pr_dir}/output_driver_single.log" + driver_build_PID=$$ + driver_build_HOST=$(hostname -s) + "${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Ready" --add-label "CI-${MACHINE_ID^}-Building" + "${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Building "${driver_build_PID}:${driver_build_HOST}" rm -Rf "${pr_dir}" mkdir -p "${pr_dir}" - # call clone-build_ci to clone and build PR id=$("${GH}" pr view "${pr}" --repo "${REPO_URL}" --json id --jq '.id') + { + echo "Cloning and building global-workflow PR: ${pr}" + echo "CI on ${MACHINE_ID^} started at $(date +'%A %b %Y') for repo ${REPO_URL}" || true + echo "with PID: ${driver_build_PID} on host: ${driver_build_HOST}" + echo "" + } >> "${output_ci}" set +e - output_ci="${pr_dir}/output_build_${id}" - rm -f "${output_ci}" - "${ROOT_DIR}/ci/scripts/clone-build_ci.sh" -p "${pr}" -d "${pr_dir}" -o "${output_ci}" - #echo "SKIPPING: ${ROOT_DIR}/ci/scripts/clone-build_ci.sh" + "${ROOT_DIR}/ci/scripts/clone-build_ci.sh" -p "${pr}" -d "${pr_dir}" -o "${pr_dir}/output_${id}" ci_status=$? ################################################################## # Checking for special case when Ready label was updated - # that cause a running driver exit fail because was currently - # building so we force and exit 0 instead to does not get relabled + # but a race condtion caused the clone-build_ci.sh to start + # and this instance fails before it was killed. 
In th case we + # we need to exit this instance of the driver script ################################################################# if [[ ${ci_status} -ne 0 ]]; then - pr_id_check=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --display "{pr}" --dbfile "${pr_list_dbfile}" | awk '{print $4}') || true - if [[ "${pr_id}" -ne "${pr_id_check}" ]]; then + build_PID_check=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --display "{pr}" --dbfile "${pr_list_dbfile}" | awk '{print $4}' | cut -d":" -f1) || true + if [[ "${build_PID_check}" -ne "$$" ]]; then + echo "Driver build PID: ${build_PID_check} no longer running this build ... exiting" exit 0 fi fi @@ -159,7 +197,7 @@ for pr in ${pr_list}; do set +e export LOGFILE_PATH="${HOMEgfs}/ci/scripts/create_experiment.log" rm -f "${LOGFILE_PATH}" - "${HOMEgfs}/workflow/create_experiment.py" --yaml "${HOMEgfs}/ci/cases/pr/${case}.yaml" 2>&1 "${LOGFILE_PATH}" + "${HOMEgfs}/workflow/create_experiment.py" --yaml "${HOMEgfs}/ci/cases/pr/${case}.yaml" > "${LOGFILE_PATH}" 2>&1 ci_status=$? 
set -e if [[ ${ci_status} -eq 0 ]]; then diff --git a/ci/scripts/utils/ci_utils.sh b/ci/scripts/utils/ci_utils.sh new file mode 100644 index 0000000000..474f305fd8 --- /dev/null +++ b/ci/scripts/utils/ci_utils.sh @@ -0,0 +1,16 @@ +#!/bin/env bash + +function cancel_slurm_jobs() { + + local substring=$1 + local job_ids + job_ids=$(squeue -u "${USER}" -h -o "%i") + + for job_id in ${job_ids}; do + job_name=$(sacct -j "${job_id}" --format=JobName%100 | head -3 | tail -1 | sed -r 's/\s+//g') || true + if [[ "${job_name}" == *"${substring}"* ]]; then + echo"Canceling Slurm Job ${job_name} with: scancel ${job_id}" + scancel "${job_id}" + fi + done +} From 3bad05680be9fb75dbe8d78f4fff88a50c4e094c Mon Sep 17 00:00:00 2001 From: Terry McGuinness Date: Mon, 27 Nov 2023 13:30:53 -0600 Subject: [PATCH 02/42] updated chmod on ci_utils.sh --- ci/scripts/utils/ci_utils.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 ci/scripts/utils/ci_utils.sh diff --git a/ci/scripts/utils/ci_utils.sh b/ci/scripts/utils/ci_utils.sh old mode 100644 new mode 100755 From 739a0c04ba1d28eab5c1a5bc8db68bcabb45b63a Mon Sep 17 00:00:00 2001 From: Terry McGuinness Date: Mon, 27 Nov 2023 14:47:08 -0600 Subject: [PATCH 03/42] remove bracket and move id up --- ci/scripts/driver.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index e5a3559cdc..c55a024240 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -18,7 +18,8 @@ set -eux # TODO using static build for GitHub CLI until fixed in HPC-Stack ################################################################# export GH=${HOME}/bin/gh -export REPO_URL=${REPO_URL:-"https://github.com/NOAA-EMC/global-workflow.git"} +#export REPO_URL=${REPO_URL:-"https://github.com/NOAA-EMC/global-workflow.git"} +export REPO_URL=git@github.com:TerrenceMcGuinness-NOAA/global-workflow.git ################################################################ # Setup 
the reletive paths to scripts and PS4 for better logging @@ -48,7 +49,7 @@ esac # setup runtime env for correct python install and git ###################################################### set +x -source "${ROOT_DIR}/ci/scipts/ci_utils.sh" +source "${ROOT_DIR}/ci/scripts/utils/ci_utils.sh" source "${ROOT_DIR}/ush/module-setup.sh" module use "${ROOT_DIR}/modulefiles" module load "module_gwsetup.${MACHINE_ID}" @@ -98,7 +99,6 @@ for pr in ${pr_list}; do echo "Driver PID: ${driver_PID} on ${driver_HOST} is no longer running this test" echo "Driver_PID: has restarted as {$$} on ${driver_HOST}" } >> "${output_ci_single}" - } fi fi @@ -141,6 +141,7 @@ for pr in ${pr_list}; do if [[ -z "${pr_building+x}" ]]; then continue fi + id=$("${GH}" pr view "${pr}" --repo "${REPO_URL}" --json id --jq '.id') pr_dir="${GFS_CI_ROOT}/PR/${pr}" output_ci="${pr_dir}/output_build_${id}" output_ci_single="${pr_dir}/output_driver_single.log" @@ -150,7 +151,6 @@ for pr in ${pr_list}; do "${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Building "${driver_build_PID}:${driver_build_HOST}" rm -Rf "${pr_dir}" mkdir -p "${pr_dir}" - id=$("${GH}" pr view "${pr}" --repo "${REPO_URL}" --json id --jq '.id') { echo "Cloning and building global-workflow PR: ${pr}" echo "CI on ${MACHINE_ID^} started at $(date +'%A %b %Y') for repo ${REPO_URL}" || true From a6192b181c930ee2b8b374f671c80fda7d667b39 Mon Sep 17 00:00:00 2001 From: Terry McGuinness Date: Mon, 27 Nov 2023 17:39:48 -0600 Subject: [PATCH 04/42] added better kill all to make sure to get all descendants --- ci/scripts/driver.sh | 45 ++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index c55a024240..784d1c2e7b 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -70,8 +70,7 @@ pr_list=$(${GH} pr list --repo "${REPO_URL}" --label "CI-${MACHINE_ID^}-Ready" - for pr in ${pr_list}; do 
pr_dir="${GFS_CI_ROOT}/PR/${pr}" db_list=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --add_pr "${pr}" --dbfile "${pr_list_dbfile}") - output_ci="${pr_dir}/output_build_${id}" - output_ci_single="${GFS_CI_ROOT}/PR/${pr}/output_driver_single.log" + output_ci_single="${GFS_CI_ROOT}/PR/${pr}/output_single.log" ############################################################# # Check if a Ready labeled PR has changed back from once set # and in that case remove all previous jobs in scheduler and @@ -82,24 +81,23 @@ for pr in ${pr_list}; do driver_PID=$(echo "${driver_ID}" | cut -d":" -f1) || true driver_HOST=$(echo "${driver_ID}" | cut -d":" -f2) || true host_name=$(hostname -s) - + rm -f "${output_ci_single}" { echo "PR:${pr} Reset to ${MACHINE_ID^}-Ready by user and is now restarting CI tests on $(date +'%A %b %Y')" || true } >> "${output_ci_single}" - if [[ "${driver_PID}" -ne 0 ]]; then - if [[ "${driver_PID}" -ne "$$" ]]; then - echo "Driver PID: ${driver_PID} no longer running this build having it killed" - if [[ "${driver_HOST}" == "${host_name}" ]]; then - kill -9 "${driver_PID}" - else - ssh "${driver_HOST}" kill -9 "${driver_PID}" - fi - { - echo "Driver PID: ${driver_PID} on ${driver_HOST} is no longer running this test" - echo "Driver_PID: has restarted as {$$} on ${driver_HOST}" - } >> "${output_ci_single}" + echo "Driver PID: ${driver_PID} no longer running this build having it killed" + if [[ "${driver_HOST}" == "${host_name}" ]]; then + kill -- -$(ps -o pgid= "${driver_PID}" | grep -o [0-9]*) + sleep 60 + else + ssh "${driver_HOST}" kill -- -$(ps -o pgid= "${driver_PID}" | grep -o [0-9]*) + sleep 60 fi + { + echo "Driver PID: ${driver_PID} on ${driver_HOST} is no longer running this test" + echo "Driver_PID: has restarted as $$ on ${host_name}" + } >> "${output_ci_single}" fi experiments=$(find "${pr_dir}/RUNTESTS/EXPDIR" -mindepth 1 -maxdepth 1 -type d) || true @@ -114,9 +112,10 @@ for pr in ${pr_list}; do } >> "${output_ci_single}" done fi - rm -Rf 
"${pr_dir}" sed -i "1 i\`\`\`" "${output_ci_single}" "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}" + db_list=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}") + db_list=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --add_pr "${pr}" --dbfile "${pr_list_dbfile}") fi done @@ -143,8 +142,8 @@ for pr in ${pr_list}; do fi id=$("${GH}" pr view "${pr}" --repo "${REPO_URL}" --json id --jq '.id') pr_dir="${GFS_CI_ROOT}/PR/${pr}" - output_ci="${pr_dir}/output_build_${id}" - output_ci_single="${pr_dir}/output_driver_single.log" + output_ci="${pr_dir}/output_ci_${id}" + output_ci_single="${GFS_CI_ROOT}/PR/${pr}/output_single.log" driver_build_PID=$$ driver_build_HOST=$(hostname -s) "${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Ready" --add-label "CI-${MACHINE_ID^}-Building" @@ -152,13 +151,15 @@ for pr in ${pr_list}; do rm -Rf "${pr_dir}" mkdir -p "${pr_dir}" { - echo "Cloning and building global-workflow PR: ${pr}" - echo "CI on ${MACHINE_ID^} started at $(date +'%A %b %Y') for repo ${REPO_URL}" || true + echo "CI stated Cloning and Building global-workflow PR: ${pr}" + echo "on ${MACHINE_ID^} started at $(date +'%A %b %Y')" || true echo "with PID: ${driver_build_PID} on host: ${driver_build_HOST}" echo "" - } >> "${output_ci}" + } >> "${output_ci_single}" + sed -i "1 i\`\`\`" "${output_ci_single}" + "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}" set +e - "${ROOT_DIR}/ci/scripts/clone-build_ci.sh" -p "${pr}" -d "${pr_dir}" -o "${pr_dir}/output_${id}" + "${ROOT_DIR}/ci/scripts/clone-build_ci.sh" -p "${pr}" -d "${pr_dir}" -o "${output_ci}" ci_status=$? 
################################################################## # Checking for special case when Ready label was updated From cdc48e22c7ddee14570db881b6202b96cbfb2cfd Mon Sep 17 00:00:00 2001 From: Terry McGuinness Date: Mon, 27 Nov 2023 18:15:33 -0600 Subject: [PATCH 05/42] few shell norms on kill command --- ci/scripts/driver.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 784d1c2e7b..68a9deef60 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -88,10 +88,10 @@ for pr in ${pr_list}; do if [[ "${driver_PID}" -ne 0 ]]; then echo "Driver PID: ${driver_PID} no longer running this build having it killed" if [[ "${driver_HOST}" == "${host_name}" ]]; then - kill -- -$(ps -o pgid= "${driver_PID}" | grep -o [0-9]*) + kill -- -$(ps -o pgid= "${driver_PID}" | grep "-o [0-9]*") || true sleep 60 else - ssh "${driver_HOST}" kill -- -$(ps -o pgid= "${driver_PID}" | grep -o [0-9]*) + ssh "${driver_HOST}" kill -- -$(ps -o pgid= "${driver_PID}" | grep "-o [0-9]*") || true sleep 60 fi { From 17601e346ff080ff5b263bd1337f2208fd94e185 Mon Sep 17 00:00:00 2001 From: Terry McGuinness Date: Mon, 27 Nov 2023 18:35:21 -0600 Subject: [PATCH 06/42] another shell norm on kill line --- ci/scripts/driver.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 68a9deef60..4f5ab47ef5 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -26,7 +26,7 @@ export REPO_URL=git@github.com:TerrenceMcGuinness-NOAA/global-workflow.git ################################################################ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
>/dev/null 2>&1 && pwd )" scriptname=$(basename "${BASH_SOURCE[0]}") -echo "Begin ${scriptname} at $(date -u)" || true +echo "Begin ${scriptname} at $(date +'%D %r')" || true export PS4='+ $(basename ${BASH_SOURCE})[${LINENO}]' ######################################################################### @@ -83,15 +83,15 @@ for pr in ${pr_list}; do host_name=$(hostname -s) rm -f "${output_ci_single}" { - echo "PR:${pr} Reset to ${MACHINE_ID^}-Ready by user and is now restarting CI tests on $(date +'%A %b %Y')" || true + echo "PR:${pr} Reset to ${MACHINE_ID^}-Ready by user and is now restarting CI tests on $(date +'%D %r')" || true } >> "${output_ci_single}" if [[ "${driver_PID}" -ne 0 ]]; then echo "Driver PID: ${driver_PID} no longer running this build having it killed" if [[ "${driver_HOST}" == "${host_name}" ]]; then - kill -- -$(ps -o pgid= "${driver_PID}" | grep "-o [0-9]*") || true + kill -- -$(ps -o "pgid= ${driver_PID}" | grep "-o [0-9]*") || true sleep 60 else - ssh "${driver_HOST}" kill -- -$(ps -o pgid= "${driver_PID}" | grep "-o [0-9]*") || true + ssh "${driver_HOST}" kill -- -$(ps -o "pgid= ${driver_PID}" | grep "-o [0-9]*") || true sleep 60 fi { @@ -152,7 +152,7 @@ for pr in ${pr_list}; do mkdir -p "${pr_dir}" { echo "CI stated Cloning and Building global-workflow PR: ${pr}" - echo "on ${MACHINE_ID^} started at $(date +'%A %b %Y')" || true + echo "on ${MACHINE_ID^} started at $(date +'%D %r')" || true echo "with PID: ${driver_build_PID} on host: ${driver_build_HOST}" echo "" } >> "${output_ci_single}" From 3d4fdb0d8667b938235f7b21d3c70fb9c569000b Mon Sep 17 00:00:00 2001 From: "Terry.McGuinness" Date: Tue, 28 Nov 2023 17:55:45 +0000 Subject: [PATCH 07/42] added log output on link fail and some touchups on output --- ci/scripts/check_ci.sh | 16 +++++++++------- ci/scripts/clone-build_ci.sh | 20 +++++++++++++------- ci/scripts/driver.sh | 2 +- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/ci/scripts/check_ci.sh b/ci/scripts/check_ci.sh 
index a5d7c77e66..65ac638816 100755 --- a/ci/scripts/check_ci.sh +++ b/ci/scripts/check_ci.sh @@ -14,7 +14,8 @@ echo "Begin ${scriptname} at $(date -u)" || true export PS4='+ $(basename ${BASH_SOURCE})[${LINENO}]' GH=${HOME}/bin/gh -REPO_URL="https://github.com/NOAA-EMC/global-workflow.git" +#REPO_URL="https://github.com/NOAA-EMC/global-workflow.git" +REPO_URL=git@github.com:TerrenceMcGuinness-NOAA/global-workflow.git ######################################################################### # Set up runtime environment varibles for accounts on supproted machines @@ -33,6 +34,7 @@ case ${MACHINE_ID} in esac set +x source "${ROOT_DIR}/ush/module-setup.sh" +source "${ROOT_DIR}/ci/scripts/ci_utils.h" module use "${ROOT_DIR}/modulefiles" module load "module_gwsetup.${MACHINE_ID}" module list @@ -86,7 +88,7 @@ for pr in ${pr_list}; do if [[ -z $(ls -A "${pr_dir}/RUNTESTS/EXPDIR") ]] ; then "${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Running" --add-label "CI-${MACHINE_ID^}-Passed" sed -i "1 i\`\`\`" "${output_ci}" - sed -i "1 i\All CI Test Cases Passed:" "${output_ci}" + sed -i "1 i\All CI Test Cases Passed on ${MACHINE_ID^}:" "${output_ci}" "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci}" "${ROOT_DIR}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}" # Check to see if this PR that was opened by the weekly tests and if so close it if it passed on all platforms @@ -131,8 +133,8 @@ for pr in ${pr_list}; do "${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Running" --add-label "CI-${MACHINE_ID^}-Failed" error_logs=$("${rocotostat}" -d "${db}" -w "${xml}" | grep -E 'FAIL|DEAD' | awk '{print "-c", $1, "-t", $2}' | xargs "${rocotocheck}" -d "${db}" -w "${xml}" | grep join | awk '{print $2}') || true { - echo "Experiment ${pslot} Terminated: *** FAILED ***" - echo "Experiment ${pslot} Terminated with ${num_failed} tasks failed at $(date)" || true + echo 
"Experiment ${pslot} *** FAILED *** on ${MACHINE_ID^}" + echo "Experiment ${pslot} with ${num_failed} tasks failed at $(date +'%D %r')" || true echo "Error logs:" echo "${error_logs}" } >> "${output_ci}" @@ -141,7 +143,7 @@ for pr in ${pr_list}; do "${ROOT_DIR}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}" for kill_cases in "${pr_dir}/RUNTESTS/"*; do pslot=$(basename "${kill_cases}") - sacct --format=jobid,jobname%35,WorkDir%100,stat | grep "${pslot}" | grep "PR\/${pr}\/RUNTESTS" | awk '{print $1}' | xargs scancel || true + cancel_slurm_jobs "${pslot}" done break fi @@ -151,9 +153,9 @@ for pr in ${pr_list}; do rm -Rf "${pr_dir}/RUNTESTS/COMROT/${pslot}" rm -f "${output_ci_single}" # echo "\`\`\`" > "${output_ci_single}" - DATE=$(date) + DATE=$(date +'%D %r') echo "Experiment ${pslot} **SUCCESS** ${DATE}" >> "${output_ci_single}" - echo "Experiment ${pslot} **SUCCESS** at ${DATE}" >> "${output_ci}" + echo "Experiment ${pslot} *** SUCCESS *** at ${DATE}" >> "${output_ci}" "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}" fi diff --git a/ci/scripts/clone-build_ci.sh b/ci/scripts/clone-build_ci.sh index 4b77d38ab8..0a35e33e7b 100755 --- a/ci/scripts/clone-build_ci.sh +++ b/ci/scripts/clone-build_ci.sh @@ -73,15 +73,16 @@ set +e ./checkout.sh -c -g -u >> log.checkout 2>&1 checkout_status=$? if [[ ${checkout_status} != 0 ]]; then + DATE=$(date +'%D %r') { echo "Checkout: *** FAILED ***" - echo "Checkout: Failed at $(date)" || true + echo "Checkout: Failed at ${DATE}" || true echo "Checkout: see output at ${PWD}/log.checkout" } >> "${outfile}" exit "${checkout_status}" else { - echo "Checkout: Completed at $(date)" || true + echo "Checkout: Completed at ${DATE}" || true } >> "${outfile}" fi @@ -92,25 +93,30 @@ rm -rf log.build ./build_all.sh >> log.build 2>&1 build_status=$? 
+DATE=$(date +'%D %r') if [[ ${build_status} != 0 ]]; then { echo "Build: *** FAILED ***" - echo "Build: Failed at $(date)" || true - echo "Build: see output at ${PWD}/log.build" + echo "Build: Failed at ${DATE}" + cat "${PWD}/log.build" } >> "${outfile}" exit "${build_status}" else { - echo "Build: Completed at $(date)" || true + echo "Build: Completed at ${DATE}" } >> "${outfile}" fi -./link_workflow.sh +LINK_LOGFILE_PATH=link_workflow.log +rm -f "${LINK_LOGFILE_PATH}" +./link_workflow.sh >> "${LINK_LOGFILE_PATH}" 2>&1 link_status=$? if [[ ${link_status} != 0 ]]; then + DATE=$(date +'%D %r') { echo "Link: *** FAILED ***" - echo "Link: Failed at $(date)" || true + echo "Link: Failed at ${DATE}" + cat "${LINK_LOGFILE_PATH}" } >> "${outfile}" exit "${link_status}" fi diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 4f5ab47ef5..9428239e38 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -213,7 +213,7 @@ for pr in ${pr_list}; do } >> "${output_ci}" else { - echo "*** Failed *** to create experiment: ${pslot}" + echo "*** Failed *** to create experiment: ${pslot} on ${MACINE_ID^}" echo "" cat "${LOGFILE_PATH}" } >> "${output_ci}" From 24a74afd6c019385a5b0e2fc502fb11fd59b2e63 Mon Sep 17 00:00:00 2001 From: Terry McGuinness Date: Tue, 28 Nov 2023 12:12:47 -0600 Subject: [PATCH 08/42] udpated starting message --- ci/scripts/driver.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 9428239e38..f2571be440 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -151,8 +151,8 @@ for pr in ${pr_list}; do rm -Rf "${pr_dir}" mkdir -p "${pr_dir}" { - echo "CI stated Cloning and Building global-workflow PR: ${pr}" - echo "on ${MACHINE_ID^} started at $(date +'%D %r')" || true + echo "CI Started on ${MACHINE_ID^} at $(date +'%D %r')" || true + echo "Cloning and Building global-workflow PR: ${pr}" echo "with PID: ${driver_build_PID} on host: ${driver_build_HOST}" echo "" } >> 
"${output_ci_single}" From 3c4e7f74940cadfedd918670d275abab40fc8f67 Mon Sep 17 00:00:00 2001 From: Terry McGuinness Date: Tue, 28 Nov 2023 12:25:47 -0600 Subject: [PATCH 09/42] moved DATE assignment outside of if --- ci/scripts/clone-build_ci.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/clone-build_ci.sh b/ci/scripts/clone-build_ci.sh index 0a35e33e7b..fb6d2a0da7 100755 --- a/ci/scripts/clone-build_ci.sh +++ b/ci/scripts/clone-build_ci.sh @@ -72,8 +72,8 @@ cd sorc || exit 1 set +e ./checkout.sh -c -g -u >> log.checkout 2>&1 checkout_status=$? +DATE=$(date +'%D %r') if [[ ${checkout_status} != 0 ]]; then - DATE=$(date +'%D %r') { echo "Checkout: *** FAILED ***" echo "Checkout: Failed at ${DATE}" || true From 2e172aeef323e1a32d0c20df8e3098d9e0dab53c Mon Sep 17 00:00:00 2001 From: "Terry.McGuinness" Date: Tue, 28 Nov 2023 19:12:50 +0000 Subject: [PATCH 10/42] quote around ps for shell norms --- ci/scripts/driver.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index f2571be440..9b743e5b2f 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -88,10 +88,10 @@ for pr in ${pr_list}; do if [[ "${driver_PID}" -ne 0 ]]; then echo "Driver PID: ${driver_PID} no longer running this build having it killed" if [[ "${driver_HOST}" == "${host_name}" ]]; then - kill -- -$(ps -o "pgid= ${driver_PID}" | grep "-o [0-9]*") || true + kill -- "-$(ps -o "pgid= ${driver_PID}" | grep "-o [0-9]*")" || true sleep 60 else - ssh "${driver_HOST}" kill -- -$(ps -o "pgid= ${driver_PID}" | grep "-o [0-9]*") || true + ssh "${driver_HOST}" kill -- "-$(ps -o "pgid= ${driver_PID}" | grep "-o [0-9]*")" || true sleep 60 fi { From fb5c8fb335b551df9d9e3eed578c5459b78b5b2a Mon Sep 17 00:00:00 2001 From: Terry McGuinness Date: Tue, 28 Nov 2023 13:19:29 -0600 Subject: [PATCH 11/42] removed quotes in grep of ps for kill driver 304527 --- ci/scripts/driver.sh | 4 ++-- 1 file changed, 2 insertions(+), 
2 deletions(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 9b743e5b2f..bea508c903 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -88,10 +88,10 @@ for pr in ${pr_list}; do if [[ "${driver_PID}" -ne 0 ]]; then echo "Driver PID: ${driver_PID} no longer running this build having it killed" if [[ "${driver_HOST}" == "${host_name}" ]]; then - kill -- "-$(ps -o "pgid= ${driver_PID}" | grep "-o [0-9]*")" || true + kill -- "-$(ps -o "pgid= ${driver_PID}" | grep -o [0-9]*)" || true sleep 60 else - ssh "${driver_HOST}" kill -- "-$(ps -o "pgid= ${driver_PID}" | grep "-o [0-9]*")" || true + ssh "${driver_HOST}" kill -- "-$(ps -o "pgid= ${driver_PID}" | grep -o [0-9]*)" || true sleep 60 fi { From c2510601414856eae7e645890c9eb6a4faea9762 Mon Sep 17 00:00:00 2001 From: Terry McGuinness Date: Tue, 28 Nov 2023 13:22:46 -0600 Subject: [PATCH 12/42] quoted the gerp patter on ps kill of drivers 304527 --- ci/scripts/driver.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index bea508c903..7d7eae9c13 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -88,10 +88,10 @@ for pr in ${pr_list}; do if [[ "${driver_PID}" -ne 0 ]]; then echo "Driver PID: ${driver_PID} no longer running this build having it killed" if [[ "${driver_HOST}" == "${host_name}" ]]; then - kill -- "-$(ps -o "pgid= ${driver_PID}" | grep -o [0-9]*)" || true + kill -- "-$(ps -o "pgid= ${driver_PID}" | grep -o "[0-9]*")" || true sleep 60 else - ssh "${driver_HOST}" kill -- "-$(ps -o "pgid= ${driver_PID}" | grep -o [0-9]*)" || true + ssh "${driver_HOST}" kill -- "-$(ps -o "pgid= ${driver_PID}" | grep -o "[0-9]*")" || true sleep 60 fi { From ea93c92b3e384020bd2931c8d4cde1415013e023 Mon Sep 17 00:00:00 2001 From: "Terry.McGuinness" Date: Tue, 28 Nov 2023 19:29:35 +0000 Subject: [PATCH 13/42] add ingnore SC009 because there is not a pgrep version of this --- ci/scripts/driver.sh | 2 ++ 1 file changed, 2 
insertions(+) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 7d7eae9c13..dddb8c48f8 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -88,9 +88,11 @@ for pr in ${pr_list}; do if [[ "${driver_PID}" -ne 0 ]]; then echo "Driver PID: ${driver_PID} no longer running this build having it killed" if [[ "${driver_HOST}" == "${host_name}" ]]; then + #shellcheck disable=SC009 kill -- "-$(ps -o "pgid= ${driver_PID}" | grep -o "[0-9]*")" || true sleep 60 else + #shellcheck disable=SC009 ssh "${driver_HOST}" kill -- "-$(ps -o "pgid= ${driver_PID}" | grep -o "[0-9]*")" || true sleep 60 fi From d55539f34e6ae200e738e0b6403195e225db1bec Mon Sep 17 00:00:00 2001 From: TerrenceMcGuinness-NOAA Date: Tue, 28 Nov 2023 19:56:20 +0000 Subject: [PATCH 14/42] added pgrep shellnorms work arounds --- ci/scripts/driver.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index dddb8c48f8..4c1a635dd9 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -88,11 +88,11 @@ for pr in ${pr_list}; do if [[ "${driver_PID}" -ne 0 ]]; then echo "Driver PID: ${driver_PID} no longer running this build having it killed" if [[ "${driver_HOST}" == "${host_name}" ]]; then - #shellcheck disable=SC009 - kill -- "-$(ps -o "pgid= ${driver_PID}" | grep -o "[0-9]*")" || true + pids=$(ps -o "pgid= ${driver_PID}") + kill -- "-$(echo "${pids}" | grep -o "[0-9]*")" || true sleep 60 else - #shellcheck disable=SC009 + # shellcheck disable=SC2009 ssh "${driver_HOST}" kill -- "-$(ps -o "pgid= ${driver_PID}" | grep -o "[0-9]*")" || true sleep 60 fi From 97225ac9b0cd71bcee6cd48914cd7a18dda9f9ac Mon Sep 17 00:00:00 2001 From: "Terry.McGuinness" Date: Tue, 28 Nov 2023 20:01:31 +0000 Subject: [PATCH 15/42] removed pid from data base after building --- ci/scripts/driver.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 4c1a635dd9..435da3456f 100755 
--- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -88,7 +88,7 @@ for pr in ${pr_list}; do if [[ "${driver_PID}" -ne 0 ]]; then echo "Driver PID: ${driver_PID} no longer running this build having it killed" if [[ "${driver_HOST}" == "${host_name}" ]]; then - pids=$(ps -o "pgid= ${driver_PID}") + pids=$(ps -o "pgid= ${driver_PID}") kill -- "-$(echo "${pids}" | grep -o "[0-9]*")" || true sleep 60 else @@ -178,7 +178,7 @@ for pr in ${pr_list}; do fi set -e if [[ ${ci_status} -eq 0 ]]; then - "${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Built + "${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Built "0:0" #setup space to put an experiment # export RUNTESTS for yaml case files to pickup export RUNTESTS="${pr_dir}/RUNTESTS" @@ -227,7 +227,7 @@ for pr in ${pr_list}; do done "${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Building" --add-label "CI-${MACHINE_ID^}-Running" - "${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Running + "${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Running "0:0" "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci}" else From 0997559153607987c78f0e1734ad04195643b18e Mon Sep 17 00:00:00 2001 From: "Terry.McGuinness" Date: Tue, 28 Nov 2023 20:19:55 +0000 Subject: [PATCH 16/42] fixed woron path to ci_utils.sh --- ci/scripts/check_ci.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/check_ci.sh b/ci/scripts/check_ci.sh index 65ac638816..c5bcd9b59c 100755 --- a/ci/scripts/check_ci.sh +++ b/ci/scripts/check_ci.sh @@ -34,7 +34,7 @@ case ${MACHINE_ID} in esac set +x source "${ROOT_DIR}/ush/module-setup.sh" -source "${ROOT_DIR}/ci/scripts/ci_utils.h" +source "${ROOT_DIR}/ci/scripts/utils/ci_utils.h" module use "${ROOT_DIR}/modulefiles" module load 
"module_gwsetup.${MACHINE_ID}" module list From b746d08445581cf818fb63e257841550032c3b13 Mon Sep 17 00:00:00 2001 From: "Terry.McGuinness" Date: Tue, 28 Nov 2023 20:24:04 +0000 Subject: [PATCH 17/42] type syntax error on echo in scancel --- ci/scripts/utils/ci_utils.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/utils/ci_utils.sh b/ci/scripts/utils/ci_utils.sh index 474f305fd8..508d22ffee 100755 --- a/ci/scripts/utils/ci_utils.sh +++ b/ci/scripts/utils/ci_utils.sh @@ -9,7 +9,7 @@ function cancel_slurm_jobs() { for job_id in ${job_ids}; do job_name=$(sacct -j "${job_id}" --format=JobName%100 | head -3 | tail -1 | sed -r 's/\s+//g') || true if [[ "${job_name}" == *"${substring}"* ]]; then - echo"Canceling Slurm Job ${job_name} with: scancel ${job_id}" + echo "Canceling Slurm Job ${job_name} with: scancel ${job_id}" scancel "${job_id}" fi done From 5e70ba1493eb2ee8856d01c7b00c6d29db84b4e7 Mon Sep 17 00:00:00 2001 From: "Terry.McGuinness" Date: Tue, 28 Nov 2023 21:19:36 +0000 Subject: [PATCH 18/42] better kill switch --- ci/scripts/driver.sh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 435da3456f..5468228ca9 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -88,12 +88,10 @@ for pr in ${pr_list}; do if [[ "${driver_PID}" -ne 0 ]]; then echo "Driver PID: ${driver_PID} no longer running this build having it killed" if [[ "${driver_HOST}" == "${host_name}" ]]; then - pids=$(ps -o "pgid= ${driver_PID}") - kill -- "-$(echo "${pids}" | grep -o "[0-9]*")" || true - sleep 60 + pstree -A -p "${driver_PID}" | grep -Eow "[0-9]+" | xargs kill + sleep 30 else - # shellcheck disable=SC2009 - ssh "${driver_HOST}" kill -- "-$(ps -o "pgid= ${driver_PID}" | grep -o "[0-9]*")" || true + ssh "${driver_HOST}" 'pstree -A -p "${driver_PID}" | grep -Eow "[0-9]+" | xargs kill' sleep 60 fi { From 7e65d3fcebc0b724686e5543105d58726e4fed6e Mon Sep 17 00:00:00 2001 From: 
"Terry.McGuinness" Date: Tue, 28 Nov 2023 21:26:40 +0000 Subject: [PATCH 19/42] added cleaner headers on user messages --- ci/scripts/driver.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 5468228ca9..754888945f 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -83,7 +83,9 @@ for pr in ${pr_list}; do host_name=$(hostname -s) rm -f "${output_ci_single}" { - echo "PR:${pr} Reset to ${MACHINE_ID^}-Ready by user and is now restarting CI tests on $(date +'%D %r')" || true + echo "CI Update on ${MACHINE_ID^} at $(date +'%D %r')" || true + echo "=================================================" + echo "PR:${pr} Reset to ${MACHINE_ID^}-Ready by user and is now restarting CI tests" || true } >> "${output_ci_single}" if [[ "${driver_PID}" -ne 0 ]]; then echo "Driver PID: ${driver_PID} no longer running this build having it killed" @@ -92,7 +94,7 @@ for pr in ${pr_list}; do sleep 30 else ssh "${driver_HOST}" 'pstree -A -p "${driver_PID}" | grep -Eow "[0-9]+" | xargs kill' - sleep 60 + sleep 30 fi { echo "Driver PID: ${driver_PID} on ${driver_HOST} is no longer running this test" @@ -151,7 +153,8 @@ for pr in ${pr_list}; do rm -Rf "${pr_dir}" mkdir -p "${pr_dir}" { - echo "CI Started on ${MACHINE_ID^} at $(date +'%D %r')" || true + echo "CI Update on ${MACHINE_ID^} at $(date +'%D %r')" || true + echo "=================================================" echo "Cloning and Building global-workflow PR: ${pr}" echo "with PID: ${driver_build_PID} on host: ${driver_build_HOST}" echo "" From afe37b5910f059a8d366fbad4a1946f138e95be4 Mon Sep 17 00:00:00 2001 From: "Terry.McGuinness" Date: Tue, 28 Nov 2023 21:32:28 +0000 Subject: [PATCH 20/42] added true to kill line for shell norms --- ci/scripts/driver.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 754888945f..fc13afd26e 100755 --- a/ci/scripts/driver.sh +++ 
b/ci/scripts/driver.sh @@ -90,7 +90,7 @@ for pr in ${pr_list}; do if [[ "${driver_PID}" -ne 0 ]]; then echo "Driver PID: ${driver_PID} no longer running this build having it killed" if [[ "${driver_HOST}" == "${host_name}" ]]; then - pstree -A -p "${driver_PID}" | grep -Eow "[0-9]+" | xargs kill + pstree -A -p "${driver_PID}" | grep -Eow "[0-9]+" | xargs kill || true sleep 30 else ssh "${driver_HOST}" 'pstree -A -p "${driver_PID}" | grep -Eow "[0-9]+" | xargs kill' From 2cde8b40888e88c652337a3a17ed49438788ce44 Mon Sep 17 00:00:00 2001 From: "Terry.McGuinness" Date: Tue, 28 Nov 2023 21:37:05 +0000 Subject: [PATCH 21/42] shorter underline --- ci/scripts/driver.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index fc13afd26e..b437e65dd9 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -154,7 +154,7 @@ for pr in ${pr_list}; do mkdir -p "${pr_dir}" { echo "CI Update on ${MACHINE_ID^} at $(date +'%D %r')" || true - echo "=================================================" + echo "============================================" echo "Cloning and Building global-workflow PR: ${pr}" echo "with PID: ${driver_build_PID} on host: ${driver_build_HOST}" echo "" From 85996e32c69947fb9505c4160d3dbbb2ed3e4e8d Mon Sep 17 00:00:00 2001 From: Terry McGuinness Date: Tue, 28 Nov 2023 16:13:20 -0600 Subject: [PATCH 22/42] small _ removed from ouput on restart --- ci/scripts/driver.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 754888945f..32bc0c4d85 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -98,7 +98,7 @@ for pr in ${pr_list}; do fi { echo "Driver PID: ${driver_PID} on ${driver_HOST} is no longer running this test" - echo "Driver_PID: has restarted as $$ on ${host_name}" + echo "Driver PID: has restarted as $$ on ${host_name}" } >> "${output_ci_single}" fi From e2bfb0e9e589e70eec85e9997b5e22a06481f44a Mon Sep 17 
00:00:00 2001 From: Terry McGuinness Date: Tue, 28 Nov 2023 16:34:07 -0600 Subject: [PATCH 23/42] added machine name on sinlge exe completion lines --- ci/scripts/check_ci.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/check_ci.sh b/ci/scripts/check_ci.sh index c5bcd9b59c..12a762777a 100755 --- a/ci/scripts/check_ci.sh +++ b/ci/scripts/check_ci.sh @@ -154,7 +154,7 @@ for pr in ${pr_list}; do rm -f "${output_ci_single}" # echo "\`\`\`" > "${output_ci_single}" DATE=$(date +'%D %r') - echo "Experiment ${pslot} **SUCCESS** ${DATE}" >> "${output_ci_single}" + echo "Experiment ${pslot} **SUCCESS** on ${MACHINE_ID^} at ${DATE}" >> "${output_ci_single}" echo "Experiment ${pslot} *** SUCCESS *** at ${DATE}" >> "${output_ci}" "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}" From 86d0236c3f57667fec741b3c7b7d93137f62e78f Mon Sep 17 00:00:00 2001 From: TerrenceMcGuinness-NOAA Date: Tue, 28 Nov 2023 22:49:51 +0000 Subject: [PATCH 24/42] updated REPO_URL to global just before to submit PR --- ci/scripts/check_ci.sh | 3 +-- ci/scripts/driver.sh | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/ci/scripts/check_ci.sh b/ci/scripts/check_ci.sh index 12a762777a..11f7ae020e 100755 --- a/ci/scripts/check_ci.sh +++ b/ci/scripts/check_ci.sh @@ -14,8 +14,7 @@ echo "Begin ${scriptname} at $(date -u)" || true export PS4='+ $(basename ${BASH_SOURCE})[${LINENO}]' GH=${HOME}/bin/gh -#REPO_URL="https://github.com/NOAA-EMC/global-workflow.git" -REPO_URL=git@github.com:TerrenceMcGuinness-NOAA/global-workflow.git +REPO_URL="https://github.com/NOAA-EMC/global-workflow.git" ######################################################################### # Set up runtime environment varibles for accounts on supproted machines diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 56d05677c9..1e003459d5 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -18,8 +18,7 @@ set -eux # TODO using static build 
for GitHub CLI until fixed in HPC-Stack ################################################################# export GH=${HOME}/bin/gh -#export REPO_URL=${REPO_URL:-"https://github.com/NOAA-EMC/global-workflow.git"} -export REPO_URL=git@github.com:TerrenceMcGuinness-NOAA/global-workflow.git +export REPO_URL=${REPO_URL:-"https://github.com/NOAA-EMC/global-workflow.git"} ################################################################ # Setup the reletive paths to scripts and PS4 for better logging From b0bc70c396f669287910b0975f625a1fc6caf6e7 Mon Sep 17 00:00:00 2001 From: TerrenceMcGuinness-NOAA Date: Wed, 29 Nov 2023 09:03:48 -0500 Subject: [PATCH 25/42] Update ci/scripts/check_ci.sh typo to fix source of slrum kill routine Co-authored-by: Walter Kolczynski - NOAA --- ci/scripts/check_ci.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/check_ci.sh b/ci/scripts/check_ci.sh index 11f7ae020e..164d423c67 100755 --- a/ci/scripts/check_ci.sh +++ b/ci/scripts/check_ci.sh @@ -33,7 +33,7 @@ case ${MACHINE_ID} in esac set +x source "${ROOT_DIR}/ush/module-setup.sh" -source "${ROOT_DIR}/ci/scripts/utils/ci_utils.h" +source "${ROOT_DIR}/ci/scripts/utils/ci_utils.sh" module use "${ROOT_DIR}/modulefiles" module load "module_gwsetup.${MACHINE_ID}" module list From 33a8a70d1f4d94864824b57e02658286a73a228a Mon Sep 17 00:00:00 2001 From: TerrenceMcGuinness-NOAA Date: Wed, 29 Nov 2023 09:23:25 -0500 Subject: [PATCH 26/42] Update ci/scripts/driver.sh better grep filter Co-authored-by: Walter Kolczynski - NOAA --- ci/scripts/driver.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 1e003459d5..f75ea5e4e8 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -89,7 +89,7 @@ for pr in ${pr_list}; do if [[ "${driver_PID}" -ne 0 ]]; then echo "Driver PID: ${driver_PID} no longer running this build having it killed" if [[ "${driver_HOST}" == "${host_name}" ]]; then - pstree -A -p 
"${driver_PID}" | grep -Eow "[0-9]+" | xargs kill || true + pstree -A -p "${driver_PID}" | grep -Pow "(?<=\()[0-9]+(?=\))" | xargs kill sleep 30 else ssh "${driver_HOST}" 'pstree -A -p "${driver_PID}" | grep -Eow "[0-9]+" | xargs kill' From 7311a8b5c89e31fe93a818df6bf23a71773900b8 Mon Sep 17 00:00:00 2001 From: TerrenceMcGuinness-NOAA Date: Wed, 29 Nov 2023 09:24:28 -0500 Subject: [PATCH 27/42] Update ci/scripts/driver.sh ok Co-authored-by: Walter Kolczynski - NOAA --- ci/scripts/driver.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index f75ea5e4e8..30a3cbd050 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -96,7 +96,7 @@ for pr in ${pr_list}; do sleep 30 fi { - echo "Driver PID: ${driver_PID} on ${driver_HOST} is no longer running this test" + echo "Driver PID: Requested termination of ${driver_PID} and children on ${driver_HOST}" echo "Driver PID: has restarted as $$ on ${host_name}" } >> "${output_ci_single}" fi From f55bf8ffe508bc1c20aa2fee19cb0355f2f0a2d5 Mon Sep 17 00:00:00 2001 From: Terry McGuinness Date: Wed, 29 Nov 2023 08:31:46 -0600 Subject: [PATCH 28/42] moved STMP to /work2 on orion because /work is full --- ci/platforms/config.orion | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/platforms/config.orion b/ci/platforms/config.orion index 886a6e63b2..3e87ef97a1 100644 --- a/ci/platforms/config.orion +++ b/ci/platforms/config.orion @@ -2,7 +2,7 @@ export GFS_CI_ROOT=/work2/noaa/stmp/GFS_CI_ROOT export ICSDIR_ROOT=/work/noaa/global/glopara/data/ICSDIR -export STMP="/work/noaa/stmp/${USER}" +export STMP="/work2/noaa/stmp/${USER}" export SLURM_ACCOUNT=nems export max_concurrent_cases=5 export max_concurrent_pr=4 From f4142008a5276283901da853220fbb08ce1c0aeb Mon Sep 17 00:00:00 2001 From: TerrenceMcGuinness-NOAA Date: Wed, 29 Nov 2023 09:43:17 -0500 Subject: [PATCH 29/42] Update driver.sh singular case makes for better grammar and readability --- 
ci/scripts/driver.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 30a3cbd050..f632663f42 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -105,8 +105,8 @@ for pr in ${pr_list}; do if [[ -z "${experiments}" ]]; then echo "No current experiments to cancel in PR: ${pr} on ${MACHINE_ID^}" >> "${output_ci_single}" else - for cases in ${experiments}; do - case_name=$(basename "${cases}") + for case in ${experiments}; do + case_name=$(basename "${case}") cancel_slurm_jobs "${case_name}" { echo "Canceled all jobs for experiment ${case_name} in PR:${pr} on ${MACHINE_ID^}" From 9c7d2c72c8ae079c09e7863148c2719882690f8e Mon Sep 17 00:00:00 2001 From: TerrenceMcGuinness-NOAA Date: Wed, 29 Nov 2023 09:47:38 -0500 Subject: [PATCH 30/42] Update driver.sh added exception for shell norms to remove || true on kill line --- ci/scripts/driver.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index f632663f42..73730139a1 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -89,11 +89,10 @@ for pr in ${pr_list}; do if [[ "${driver_PID}" -ne 0 ]]; then echo "Driver PID: ${driver_PID} no longer running this build having it killed" if [[ "${driver_HOST}" == "${host_name}" ]]; then + # shellcheck disable=SC2312 pstree -A -p "${driver_PID}" | grep -Pow "(?<=\()[0-9]+(?=\))" | xargs kill - sleep 30 else ssh "${driver_HOST}" 'pstree -A -p "${driver_PID}" | grep -Eow "[0-9]+" | xargs kill' - sleep 30 fi { echo "Driver PID: Requested termination of ${driver_PID} and children on ${driver_HOST}" From 94c29fe71dd4b386d970ab9d3f20bb8563459329 Mon Sep 17 00:00:00 2001 From: TerrenceMcGuinness-NOAA Date: Wed, 29 Nov 2023 14:56:15 +0000 Subject: [PATCH 31/42] Update driver.sh removed vestigial side effects on updating database line --- ci/scripts/driver.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 73730139a1..beb8a86196 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -114,8 +114,8 @@ for pr in ${pr_list}; do fi sed -i "1 i\`\`\`" "${output_ci_single}" "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}" - db_list=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}") - db_list=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --add_pr "${pr}" --dbfile "${pr_list_dbfile}") + $("${ROOT_DIR}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}") + $("${ROOT_DIR}/ci/scripts/pr_list_database.py" --add_pr "${pr}" --dbfile "${pr_list_dbfile}") fi done From f3634f2f6c0df41c654f103b7b21682453967768 Mon Sep 17 00:00:00 2001 From: TerrenceMcGuinness-NOAA Date: Wed, 29 Nov 2023 14:57:00 +0000 Subject: [PATCH 32/42] Update ci/scripts/utils/ci_utils.sh tested at works! Co-authored-by: Walter Kolczynski - NOAA --- ci/scripts/utils/ci_utils.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/utils/ci_utils.sh b/ci/scripts/utils/ci_utils.sh index 508d22ffee..377d3c416b 100755 --- a/ci/scripts/utils/ci_utils.sh +++ b/ci/scripts/utils/ci_utils.sh @@ -8,7 +8,7 @@ function cancel_slurm_jobs() { for job_id in ${job_ids}; do job_name=$(sacct -j "${job_id}" --format=JobName%100 | head -3 | tail -1 | sed -r 's/\s+//g') || true - if [[ "${job_name}" == *"${substring}"* ]]; then + if [[ "${job_name}" =~ "${substring}" ]]; then echo "Canceling Slurm Job ${job_name} with: scancel ${job_id}" scancel "${job_id}" fi From 31ee4bb99af311f75b3b7d9859bf407e1bab8437 Mon Sep 17 00:00:00 2001 From: TerrenceMcGuinness-NOAA Date: Wed, 29 Nov 2023 14:57:53 +0000 Subject: [PATCH 33/42] Update driver.sh --- ci/scripts/driver.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index beb8a86196..8821764333 100755 --- a/ci/scripts/driver.sh +++ 
b/ci/scripts/driver.sh @@ -114,8 +114,8 @@ for pr in ${pr_list}; do fi sed -i "1 i\`\`\`" "${output_ci_single}" "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}" - $("${ROOT_DIR}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}") - $("${ROOT_DIR}/ci/scripts/pr_list_database.py" --add_pr "${pr}" --dbfile "${pr_list_dbfile}") + "${ROOT_DIR}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}" + "${ROOT_DIR}/ci/scripts/pr_list_database.py" --add_pr "${pr}" --dbfile "${pr_list_dbfile}" fi done From 1f0f842934a407c555b03c85d7dae2d154d85717 Mon Sep 17 00:00:00 2001 From: TerrenceMcGuinness-NOAA Date: Wed, 29 Nov 2023 15:00:17 +0000 Subject: [PATCH 34/42] Update ci_utils.sh --- ci/scripts/utils/ci_utils.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/utils/ci_utils.sh b/ci/scripts/utils/ci_utils.sh index 377d3c416b..6cdddb470a 100755 --- a/ci/scripts/utils/ci_utils.sh +++ b/ci/scripts/utils/ci_utils.sh @@ -8,7 +8,7 @@ function cancel_slurm_jobs() { for job_id in ${job_ids}; do job_name=$(sacct -j "${job_id}" --format=JobName%100 | head -3 | tail -1 | sed -r 's/\s+//g') || true - if [[ "${job_name}" =~ "${substring}" ]]; then + if [[ "${job_name}" =~ ${substring} ]]; then echo "Canceling Slurm Job ${job_name} with: scancel ${job_id}" scancel "${job_id}" fi From 9fcc897a3039eb25f59b8ff2655ff1fee1c9a67c Mon Sep 17 00:00:00 2001 From: TerrenceMcGuinness-NOAA Date: Wed, 29 Nov 2023 10:10:22 -0500 Subject: [PATCH 35/42] Update driver.sh added shell check exception --- ci/scripts/driver.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 8821764333..0c0052b997 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -92,6 +92,7 @@ for pr in ${pr_list}; do # shellcheck disable=SC2312 pstree -A -p "${driver_PID}" | grep -Pow "(?<=\()[0-9]+(?=\))" | xargs kill else + # shellcheck disable=SC2312 ssh 
"${driver_HOST}" 'pstree -A -p "${driver_PID}" | grep -Eow "[0-9]+" | xargs kill' fi { From 2b29dee0b0474555748f55c95324b55b2605d299 Mon Sep 17 00:00:00 2001 From: TerrenceMcGuinness-NOAA Date: Wed, 29 Nov 2023 13:41:17 -0500 Subject: [PATCH 36/42] Update driver.sh --- ci/scripts/driver.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 0c0052b997..6fb3fd2bbb 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -89,7 +89,7 @@ for pr in ${pr_list}; do if [[ "${driver_PID}" -ne 0 ]]; then echo "Driver PID: ${driver_PID} no longer running this build having it killed" if [[ "${driver_HOST}" == "${host_name}" ]]; then - # shellcheck disable=SC2312 + #shellcheck disable=SC2312 pstree -A -p "${driver_PID}" | grep -Pow "(?<=\()[0-9]+(?=\))" | xargs kill else # shellcheck disable=SC2312 From efa3e080eccc4973b73c14b4bcd18eb88fc95825 Mon Sep 17 00:00:00 2001 From: TerrenceMcGuinness-NOAA Date: Wed, 29 Nov 2023 13:43:40 -0500 Subject: [PATCH 37/42] Update driver.sh --- ci/scripts/driver.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 6fb3fd2bbb..3eb8957925 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -89,7 +89,7 @@ for pr in ${pr_list}; do if [[ "${driver_PID}" -ne 0 ]]; then echo "Driver PID: ${driver_PID} no longer running this build having it killed" if [[ "${driver_HOST}" == "${host_name}" ]]; then - #shellcheck disable=SC2312 + #shellcheck disable=SC2312,SC2312 pstree -A -p "${driver_PID}" | grep -Pow "(?<=\()[0-9]+(?=\))" | xargs kill else # shellcheck disable=SC2312 From 7c602dfe52b5cc4891efaecf0244634f06a34b36 Mon Sep 17 00:00:00 2001 From: TerrenceMcGuinness-NOAA Date: Thu, 30 Nov 2023 11:42:31 -0500 Subject: [PATCH 38/42] Update ci/scripts/driver.sh Co-authored-by: Walter Kolczynski - NOAA --- ci/scripts/driver.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/driver.sh 
b/ci/scripts/driver.sh index 3eb8957925..0c0052b997 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -89,7 +89,7 @@ for pr in ${pr_list}; do if [[ "${driver_PID}" -ne 0 ]]; then echo "Driver PID: ${driver_PID} no longer running this build having it killed" if [[ "${driver_HOST}" == "${host_name}" ]]; then - #shellcheck disable=SC2312,SC2312 + # shellcheck disable=SC2312 pstree -A -p "${driver_PID}" | grep -Pow "(?<=\()[0-9]+(?=\))" | xargs kill else # shellcheck disable=SC2312 From 010a86ff6c10fa47b04f4d140593b75f6dd18870 Mon Sep 17 00:00:00 2001 From: TerrenceMcGuinness-NOAA Date: Fri, 1 Dec 2023 09:48:23 -0500 Subject: [PATCH 39/42] Update ci/scripts/driver.sh Co-authored-by: Rahul Mahajan --- ci/scripts/driver.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 0c0052b997..b4629b8288 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -215,7 +215,7 @@ for pr in ${pr_list}; do } >> "${output_ci}" else { - echo "*** Failed *** to create experiment: ${pslot} on ${MACINE_ID^}" + echo "*** Failed *** to create experiment: ${pslot} on ${MACHINE_ID^}" echo "" cat "${LOGFILE_PATH}" } >> "${output_ci}" From 501753e4d3e478ef7f09fcd8685054331d716304 Mon Sep 17 00:00:00 2001 From: TerrenceMcGuinness-NOAA Date: Fri, 1 Dec 2023 15:25:20 +0000 Subject: [PATCH 40/42] added some edification documentation for clearity --- ci/scripts/driver.sh | 7 +++++-- ci/scripts/utils/ci_utils.sh | 10 +++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index b4629b8288..a0edb4b4c3 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -72,10 +72,13 @@ for pr in ${pr_list}; do output_ci_single="${GFS_CI_ROOT}/PR/${pr}/output_single.log" ############################################################# # Check if a Ready labeled PR has changed back from once set - # and in that case remove all previous jobs in scheduler and - # and 
remove PR from filesystem to start clean + # and in that case completely kill the previous driver.sh cron + # job and all its descendants as well as removing all previous + # jobs in scheduler and associated files in the PR ############################################################# if [[ "${db_list}" == *"already is in list"* ]]; then + # Get the PID and HOST of the driver.sh cron job + # that is stored in the CI database for this PR driver_ID=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --display "${pr}" | awk '{print $4}') || true driver_PID=$(echo "${driver_ID}" | cut -d":" -f1) || true driver_HOST=$(echo "${driver_ID}" | cut -d":" -f2) || true diff --git a/ci/scripts/utils/ci_utils.sh b/ci/scripts/utils/ci_utils.sh index 6cdddb470a..c65074f5ce 100755 --- a/ci/scripts/utils/ci_utils.sh +++ b/ci/scripts/utils/ci_utils.sh @@ -2,15 +2,23 @@ function cancel_slurm_jobs() { + # Usage: cancel_slurm_jobs + # Example: cancel_slurm_jobs "C48_ATM_3c4e7f74" + # + # Cancel all Slurm jobs that have the given substring in their name + # In the example above, all jobs with "C48_ATM_3c4e7f74" + # in their name will be canceled + local substring=$1 local job_ids job_ids=$(squeue -u "${USER}" -h -o "%i") for job_id in ${job_ids}; do job_name=$(sacct -j "${job_id}" --format=JobName%100 | head -3 | tail -1 | sed -r 's/\s+//g') || true - if [[ "${job_name}" =~ ${substring} ]]; then + if [[ "${job_name}" == *"${substring}"* ]]; then echo "Canceling Slurm Job ${job_name} with: scancel ${job_id}" scancel "${job_id}" + continue fi done } From b1e2b136f514b3728c08437af8d2f726fbbdaaf8 Mon Sep 17 00:00:00 2001 From: TerrenceMcGuinness-NOAA Date: Fri, 1 Dec 2023 11:49:19 -0500 Subject: [PATCH 41/42] Update clone-build_ci.sh took out || true because we are using DATE not date --- ci/scripts/clone-build_ci.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/scripts/clone-build_ci.sh b/ci/scripts/clone-build_ci.sh index
fb6d2a0da7..4af44507e9 100755 --- a/ci/scripts/clone-build_ci.sh +++ b/ci/scripts/clone-build_ci.sh @@ -76,13 +76,13 @@ DATE=$(date +'%D %r') if [[ ${checkout_status} != 0 ]]; then { echo "Checkout: *** FAILED ***" - echo "Checkout: Failed at ${DATE}" || true + echo "Checkout: Failed at ${DATE}" echo "Checkout: see output at ${PWD}/log.checkout" } >> "${outfile}" exit "${checkout_status}" else { - echo "Checkout: Completed at ${DATE}" || true + echo "Checkout: Completed at ${DATE}" } >> "${outfile}" fi From 4b27ebef74e0ae2d2fb9826dc912b2f65441d4f0 Mon Sep 17 00:00:00 2001 From: TerrenceMcGuinness-NOAA Date: Mon, 4 Dec 2023 12:43:32 -0500 Subject: [PATCH 42/42] Update ci/scripts/utils/ci_utils.sh better regex for testing for a substring (pervasively missed) Co-authored-by: Walter Kolczynski - NOAA --- ci/scripts/utils/ci_utils.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/utils/ci_utils.sh b/ci/scripts/utils/ci_utils.sh index c65074f5ce..737a3e5a86 100755 --- a/ci/scripts/utils/ci_utils.sh +++ b/ci/scripts/utils/ci_utils.sh @@ -15,7 +15,7 @@ function cancel_slurm_jobs() { for job_id in ${job_ids}; do job_name=$(sacct -j "${job_id}" --format=JobName%100 | head -3 | tail -1 | sed -r 's/\s+//g') || true - if [[ "${job_name}" == *"${substring}"* ]]; then + if [[ "${job_name}" =~ ${substring} ]]; then echo "Canceling Slurm Job ${job_name} with: scancel ${job_id}" scancel "${job_id}" continue