diff --git a/ci/platforms/config.orion b/ci/platforms/config.orion
index 886a6e63b2..3e87ef97a1 100644
--- a/ci/platforms/config.orion
+++ b/ci/platforms/config.orion
@@ -2,7 +2,7 @@
 
 export GFS_CI_ROOT=/work2/noaa/stmp/GFS_CI_ROOT
 export ICSDIR_ROOT=/work/noaa/global/glopara/data/ICSDIR
-export STMP="/work/noaa/stmp/${USER}"
+export STMP="/work2/noaa/stmp/${USER}"
 export SLURM_ACCOUNT=nems
 export max_concurrent_cases=5
 export max_concurrent_pr=4
diff --git a/ci/scripts/check_ci.sh b/ci/scripts/check_ci.sh
index a5d7c77e66..164d423c67 100755
--- a/ci/scripts/check_ci.sh
+++ b/ci/scripts/check_ci.sh
@@ -33,6 +33,7 @@ case ${MACHINE_ID} in
 esac
 set +x
 source "${ROOT_DIR}/ush/module-setup.sh"
+source "${ROOT_DIR}/ci/scripts/utils/ci_utils.sh"
 module use "${ROOT_DIR}/modulefiles"
 module load "module_gwsetup.${MACHINE_ID}"
 module list
@@ -86,7 +87,7 @@ for pr in ${pr_list}; do
   if [[ -z $(ls -A "${pr_dir}/RUNTESTS/EXPDIR") ]] ; then
     "${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Running" --add-label "CI-${MACHINE_ID^}-Passed"
     sed -i "1 i\`\`\`" "${output_ci}"
-    sed -i "1 i\All CI Test Cases Passed:" "${output_ci}"
+    sed -i "1 i\All CI Test Cases Passed on ${MACHINE_ID^}:" "${output_ci}"
     "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci}"
     "${ROOT_DIR}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}"
     # Check to see if this PR that was opened by the weekly tests and if so close it if it passed on all platforms
@@ -131,8 +132,8 @@ for pr in ${pr_list}; do
     "${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Running" --add-label "CI-${MACHINE_ID^}-Failed"
     error_logs=$("${rocotostat}" -d "${db}" -w "${xml}" | grep -E 'FAIL|DEAD' | awk '{print "-c", $1, "-t", $2}' | xargs "${rocotocheck}" -d "${db}" -w "${xml}" | grep join | awk '{print $2}') || true
     {
-      echo "Experiment ${pslot} Terminated: *** FAILED ***"
-      echo "Experiment ${pslot} Terminated with ${num_failed} tasks failed at $(date)" || true
+      echo "Experiment ${pslot} *** FAILED *** on ${MACHINE_ID^}"
+      echo "Experiment ${pslot} with ${num_failed} tasks failed at $(date +'%D %r')" || true
       echo "Error logs:"
       echo "${error_logs}"
     } >> "${output_ci}"
@@ -141,7 +142,7 @@ for pr in ${pr_list}; do
     "${ROOT_DIR}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}"
     for kill_cases in "${pr_dir}/RUNTESTS/"*; do
       pslot=$(basename "${kill_cases}")
-      sacct --format=jobid,jobname%35,WorkDir%100,stat | grep "${pslot}" | grep "PR\/${pr}\/RUNTESTS" | awk '{print $1}' | xargs scancel || true
+      cancel_slurm_jobs "${pslot}"
     done
     break
   fi
@@ -151,9 +152,9 @@ for pr in ${pr_list}; do
     rm -Rf "${pr_dir}/RUNTESTS/COMROT/${pslot}"
     rm -f "${output_ci_single}"
     # echo "\`\`\`" > "${output_ci_single}"
-    DATE=$(date)
-    echo "Experiment ${pslot} **SUCCESS** ${DATE}" >> "${output_ci_single}"
-    echo "Experiment ${pslot} **SUCCESS** at ${DATE}" >> "${output_ci}"
+    DATE=$(date +'%D %r')
+    echo "Experiment ${pslot} **SUCCESS** on ${MACHINE_ID^} at ${DATE}" >> "${output_ci_single}"
+    echo "Experiment ${pslot} *** SUCCESS *** at ${DATE}" >> "${output_ci}"
     "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}"
   fi
 
diff --git a/ci/scripts/clone-build_ci.sh b/ci/scripts/clone-build_ci.sh
index 4b77d38ab8..4af44507e9 100755
--- a/ci/scripts/clone-build_ci.sh
+++ b/ci/scripts/clone-build_ci.sh
@@ -72,16 +72,17 @@ cd sorc || exit 1
 set +e
 ./checkout.sh -c -g -u >> log.checkout 2>&1
 checkout_status=$?
+DATE=$(date +'%D %r')
 if [[ ${checkout_status} != 0 ]]; then
   {
     echo "Checkout: *** FAILED ***"
-    echo "Checkout: Failed at $(date)" || true
+    echo "Checkout: Failed at ${DATE}"
     echo "Checkout: see output at ${PWD}/log.checkout"
   } >> "${outfile}"
   exit "${checkout_status}"
 else
   {
-    echo "Checkout: Completed at $(date)" || true
+    echo "Checkout: Completed at ${DATE}"
   } >> "${outfile}"
 fi
 
@@ -92,25 +93,30 @@
 rm -rf log.build
 ./build_all.sh >> log.build 2>&1
 build_status=$?
+DATE=$(date +'%D %r')
 if [[ ${build_status} != 0 ]]; then
   {
     echo "Build: *** FAILED ***"
-    echo "Build: Failed at $(date)" || true
-    echo "Build: see output at ${PWD}/log.build"
+    echo "Build: Failed at ${DATE}"
+    cat "${PWD}/log.build"
   } >> "${outfile}"
   exit "${build_status}"
 else
   {
-    echo "Build: Completed at $(date)" || true
+    echo "Build: Completed at ${DATE}"
   } >> "${outfile}"
 fi
 
-./link_workflow.sh
+LINK_LOGFILE_PATH=link_workflow.log
+rm -f "${LINK_LOGFILE_PATH}"
+./link_workflow.sh >> "${LINK_LOGFILE_PATH}" 2>&1
 link_status=$?
 if [[ ${link_status} != 0 ]]; then
+  DATE=$(date +'%D %r')
   {
     echo "Link: *** FAILED ***"
-    echo "Link: Failed at $(date)" || true
+    echo "Link: Failed at ${DATE}"
+    cat "${LINK_LOGFILE_PATH}"
   } >> "${outfile}"
   exit "${link_status}"
 fi
diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh
index 7988ff17a1..a0edb4b4c3 100755
--- a/ci/scripts/driver.sh
+++ b/ci/scripts/driver.sh
@@ -25,7 +25,7 @@ export REPO_URL=${REPO_URL:-"https://github.com/NOAA-EMC/global-workflow.git"}
 ################################################################
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." >/dev/null 2>&1 && pwd )"
 scriptname=$(basename "${BASH_SOURCE[0]}")
-echo "Begin ${scriptname} at $(date -u)" || true
+echo "Begin ${scriptname} at $(date +'%D %r')" || true
 export PS4='+ $(basename ${BASH_SOURCE})[${LINENO}]'
 
 #########################################################################
@@ -48,6 +48,7 @@ esac
 # setup runtime env for correct python install and git
 ######################################################
 set +x
+source "${ROOT_DIR}/ci/scripts/utils/ci_utils.sh"
 source "${ROOT_DIR}/ush/module-setup.sh"
 module use "${ROOT_DIR}/modulefiles"
 module load "module_gwsetup.${MACHINE_ID}"
@@ -68,24 +69,57 @@ pr_list=$(${GH} pr list --repo "${REPO_URL}" --label "CI-${MACHINE_ID^}-Ready" -
 
 for pr in ${pr_list}; do
   pr_dir="${GFS_CI_ROOT}/PR/${pr}"
   db_list=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --add_pr "${pr}" --dbfile "${pr_list_dbfile}")
-  pr_id=0
+  output_ci_single="${GFS_CI_ROOT}/PR/${pr}/output_single.log"
   #############################################################
   # Check if a Ready labeled PR has changed back from once set
-  # and in that case remove all previous jobs in scheduler and
-  # and remove PR from filesystem to start clean
+  # and in that case completely kill the previous driver.sh cron
+  # job and all its descendants as well as removing all previous
+  # jobs in scheduler and associated files in the PR
   #############################################################
   if [[ "${db_list}" == *"already is in list"* ]]; then
-    pr_id=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --display "${pr}" | awk '{print $4}') || true
-    pr_id=$((pr_id+1))
-    "${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Ready "${pr_id}"
-    for cases in "${pr_dir}/RUNTESTS/"*; do
-      if [[ -z "${cases+x}" ]]; then
-        break
+    # Get the PID and HOST of the driver.sh cron job
+    # that is stored in the CI database for this PR
+    driver_ID=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --display "${pr}" | awk '{print $4}') || true
+    driver_PID=$(echo "${driver_ID}" | cut -d":" -f1) || true
+    driver_HOST=$(echo "${driver_ID}" | cut -d":" -f2) || true
+    host_name=$(hostname -s)
+    rm -f "${output_ci_single}"
+    {
+      echo "CI Update on ${MACHINE_ID^} at $(date +'%D %r')" || true
+      echo "================================================="
+      echo "PR:${pr} Reset to ${MACHINE_ID^}-Ready by user and is now restarting CI tests" || true
+    } >> "${output_ci_single}"
+    if [[ "${driver_PID}" -ne 0 ]]; then
+      echo "Driver PID: ${driver_PID} is no longer needed for this build; killing it and its children"
+      if [[ "${driver_HOST}" == "${host_name}" ]]; then
+        # shellcheck disable=SC2312
+        pstree -A -p "${driver_PID}" | grep -Pow "(?<=\()[0-9]+(?=\))" | xargs kill
+      else
+        # shellcheck disable=SC2312
+        ssh "${driver_HOST}" "pstree -A -p ${driver_PID} | grep -Eow '[0-9]+' | xargs kill"
       fi
-      pslot=$(basename "${cases}")
-      sacct --format=jobid,jobname%35,WorkDir%100,stat | grep "${pslot}" | grep "PR\/${pr}\/RUNTESTS" | awk '{print $1}' | xargs scancel || true
-    done
-    rm -Rf "${pr_dir}"
+      {
+        echo "Driver PID: Requested termination of ${driver_PID} and children on ${driver_HOST}"
+        echo "Driver PID: has restarted as $$ on ${host_name}"
+      } >> "${output_ci_single}"
+    fi
+
+    experiments=$(find "${pr_dir}/RUNTESTS/EXPDIR" -mindepth 1 -maxdepth 1 -type d) || true
+    if [[ -z "${experiments}" ]]; then
+      echo "No current experiments to cancel in PR: ${pr} on ${MACHINE_ID^}" >> "${output_ci_single}"
+    else
+      for case in ${experiments}; do
+        case_name=$(basename "${case}")
+        cancel_slurm_jobs "${case_name}"
+        {
+          echo "Canceled all jobs for experiment ${case_name} in PR:${pr} on ${MACHINE_ID^}"
+        } >> "${output_ci_single}"
+      done
+    fi
+    sed -i "1 i\`\`\`" "${output_ci_single}"
+    "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}"
+    "${ROOT_DIR}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}"
+    "${ROOT_DIR}/ci/scripts/pr_list_database.py" --add_pr "${pr}" --dbfile "${pr_list_dbfile}"
   fi
 done
@@ -110,34 +144,44 @@ for pr in ${pr_list}; do
   if [[ -z "${pr_building+x}" ]]; then
     continue
   fi
-  "${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Ready" --add-label "CI-${MACHINE_ID^}-Building"
-  "${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Building
-  echo "Processing Pull Request #${pr}"
+  id=$("${GH}" pr view "${pr}" --repo "${REPO_URL}" --json id --jq '.id')
   pr_dir="${GFS_CI_ROOT}/PR/${pr}"
+  output_ci="${pr_dir}/output_ci_${id}"
+  output_ci_single="${GFS_CI_ROOT}/PR/${pr}/output_single.log"
+  driver_build_PID=$$
+  driver_build_HOST=$(hostname -s)
+  "${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Ready" --add-label "CI-${MACHINE_ID^}-Building"
+  "${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Building "${driver_build_PID}:${driver_build_HOST}"
   rm -Rf "${pr_dir}"
   mkdir -p "${pr_dir}"
-  # call clone-build_ci to clone and build PR
-  id=$("${GH}" pr view "${pr}" --repo "${REPO_URL}" --json id --jq '.id')
+  {
+    echo "CI Update on ${MACHINE_ID^} at $(date +'%D %r')" || true
+    echo "============================================"
+    echo "Cloning and Building global-workflow PR: ${pr}"
+    echo "with PID: ${driver_build_PID} on host: ${driver_build_HOST}"
+    echo ""
+  } >> "${output_ci_single}"
+  sed -i "1 i\`\`\`" "${output_ci_single}"
+  "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}"
   set +e
-  output_ci="${pr_dir}/output_build_${id}"
-  rm -f "${output_ci}"
   "${ROOT_DIR}/ci/scripts/clone-build_ci.sh" -p "${pr}" -d "${pr_dir}" -o "${output_ci}"
-  #echo "SKIPPING: ${ROOT_DIR}/ci/scripts/clone-build_ci.sh"
   ci_status=$?
   ##################################################################
   # Checking for special case when Ready label was updated
-  # that cause a running driver exit fail because was currently
-  # building so we force and exit 0 instead to does not get relabled
+  # but a race condition caused the clone-build_ci.sh to start
+  # and this instance failed before it was killed. In that case
+  # we need to exit this instance of the driver script
  #################################################################
   if [[ ${ci_status} -ne 0 ]]; then
-    pr_id_check=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --display "{pr}" --dbfile "${pr_list_dbfile}" | awk '{print $4}') || true
-    if [[ "${pr_id}" -ne "${pr_id_check}" ]]; then
+    build_PID_check=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --display "${pr}" --dbfile "${pr_list_dbfile}" | awk '{print $4}' | cut -d":" -f1) || true
+    if [[ "${build_PID_check}" -ne "$$" ]]; then
+      echo "Driver build PID: ${build_PID_check} no longer matches this process ($$) ... exiting"
       exit 0
     fi
   fi
   set -e
   if [[ ${ci_status} -eq 0 ]]; then
-    "${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Built
+    "${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Built "0:0"
     #setup space to put an experiment
     # export RUNTESTS for yaml case files to pickup
     export RUNTESTS="${pr_dir}/RUNTESTS"
@@ -159,7 +203,7 @@ for pr in ${pr_list}; do
     set +e
     export LOGFILE_PATH="${HOMEgfs}/ci/scripts/create_experiment.log"
     rm -f "${LOGFILE_PATH}"
-    "${HOMEgfs}/workflow/create_experiment.py" --yaml "${HOMEgfs}/ci/cases/pr/${case}.yaml" 2>&1 "${LOGFILE_PATH}"
+    "${HOMEgfs}/workflow/create_experiment.py" --yaml "${HOMEgfs}/ci/cases/pr/${case}.yaml" > "${LOGFILE_PATH}" 2>&1
     ci_status=$?
     set -e
     if [[ ${ci_status} -eq 0 ]]; then
@@ -174,7 +218,7 @@ for pr in ${pr_list}; do
       } >> "${output_ci}"
     else
       {
-        echo "*** Failed *** to create experiment: ${pslot}"
+        echo "*** Failed *** to create experiment: ${pslot} on ${MACHINE_ID^}"
         echo ""
         cat "${LOGFILE_PATH}"
       } >> "${output_ci}"
@@ -186,7 +230,7 @@ for pr in ${pr_list}; do
     done
 
     "${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Building" --add-label "CI-${MACHINE_ID^}-Running"
-    "${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Running
+    "${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Running "0:0"
     "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci}"
 
   else
diff --git a/ci/scripts/utils/ci_utils.sh b/ci/scripts/utils/ci_utils.sh
new file mode 100755
index 0000000000..737a3e5a86
--- /dev/null
+++ b/ci/scripts/utils/ci_utils.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+function cancel_slurm_jobs() {
+
+  # Usage: cancel_slurm_jobs <substring>
+  # Example: cancel_slurm_jobs "C48_ATM_3c4e7f74"
+  #
+  # Cancel all Slurm jobs that have the given substring in their name,
+  # e.g. in the example above all jobs with "C48_ATM_3c4e7f74"
+  # in their name will be canceled
+
+  local substring=$1
+  local job_ids
+  job_ids=$(squeue -u "${USER}" -h -o "%i")
+
+  for job_id in ${job_ids}; do
+    job_name=$(sacct -j "${job_id}" --format=JobName%100 | head -3 | tail -1 | sed -r 's/\s+//g') || true
+    if [[ "${job_name}" =~ ${substring} ]]; then
+      echo "Canceling Slurm Job ${job_name} with: scancel ${job_id}"
+      scancel "${job_id}"
+      continue
+    fi
+  done
+}
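
A minimal usage sketch for the new cancel_slurm_jobs helper, assuming an interactive shell on one of the Slurm platforms; the pslot substring "C48_ATM_3c4e7f74" is taken from the function's own example and is purely illustrative:

    # Source the utility from the checked-out workflow tree, then cancel every
    # job owned by ${USER} whose Slurm job name contains the given substring.
    source ci/scripts/utils/ci_utils.sh
    cancel_slurm_jobs "C48_ATM_3c4e7f74"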