openqa-label-known-issues

#!/bin/bash -e

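# Usage:
#   echo https://openqahost/tests/1234 | openqa-label-known-issues
#
# Reads openQA job URLs from stdin, one per line (anything after the first
# space on a line is ignored), and investigates each job in turn.
# Settings such as host, scheme or dry_run are taken from the environment.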

# pipefail: a pipeline fails as soon as one of its stages fails.
# errtrace: functions, command substitutions and subshells inherit the ERR
# trap installed further down.
set -o pipefail -o errtrace

# Shared helpers live next to this script. Names used below but not defined in
# this file (e.g. label_on_issue, runcurl, runjq, error-handler) presumably
# come from there -- TODO confirm against _common.
# shellcheck source=/dev/null
. "$(dirname "$0")"/_common

# All settings can be overridden from the environment.
# openQA instance that API calls are sent to
host="${host:-"openqa.opensuse.org"}"
scheme="${scheme:-"https"}"
host_url="$scheme://$host"
# "1": main() prefixes every openqa-cli call with "echo" instead of running it
dry_run="${dry_run:-"0"}"
# search terms extracted from ticket subjects that are shorter than this are skipped
min_search_term="${min_search_term:-"16"}"
# URL-encoded marker ("%3A" is ":") that ticket subjects have to contain
issue_marker="${issue_marker:-"auto_review%3A"}"
# tracker query fetched once in main() to get the candidate tickets
issue_query="${issue_query:-"https://progress.opensuse.org/projects/openqav3/issues.json?limit=200&subproject_id=*&subject=~${issue_marker}"}"
# NOTE(review): reason_min_length and grep_timeout are not read anywhere in
# this file; presumably consumed by helpers in _common -- confirm.
reason_min_length="${reason_min_length:-"8"}"
grep_timeout="${grep_timeout:-5}"
# "<job url> <start of reason>" entries appended by handle_unknown and
# printed by print_summary
to_review=()
# arguments passed to every curl call in this file
curl_args=(--user-agent "openqa-label-known-issues")
# arguments passed to every openqa-cli call
client_args=(api --header 'User-Agent: openqa-label-known-issues (https://github.com/os-autoinst/scripts)' --host "$host_url")

# File the job log (or job page) is downloaded into; a temporary file unless
# REPORT_FILE names one.
out="${REPORT_FILE:-$(mktemp)}"
trap 'error-handler "$LINENO"' ERR
# Remove the report file on exit unless KEEP_REPORT_FILE=1. Note that this
# also removes a caller-supplied REPORT_FILE.
trap 'test "$KEEP_REPORT_FILE" == "1" || rm "$out"' EXIT

# Print all arguments to stderr. Arguments are handed to echo unchanged, so
# echo options such as -e or -n given by the caller still take effect.
echoerr() {
    echo "$@" 1>&2
}

# Handle a job whose autoinst-log.txt could not be downloaded.
#
# Globals:   curl_args (read), host_url (read), client_call (read)
# Arguments: $1 - openQA job id
#            $2 - URL of the job
#            $3 - file the job details page is downloaded into (overwritten)
# Outputs:   progress messages on stdout
# Returns:   0 when the job can be skipped or was handled; exits with 2 when
#            the job page is reachable but can not be downloaded
handle_unreachable_or_no_log() {
    local id=$1
    local testurl=$2
    local out=$3

    if ! curl "${curl_args[@]}" -s --head "$testurl" -o /dev/null; then
        # the page might be gone, try the scheme+host we configured (might be different one though)
        # Literal substring match: "$host_url" contains dots which must not
        # act as regex wildcards.
        if [[ "$testurl" != *"$host_url"* ]]; then
            echo "'$testurl' is not reachable and 'host_url' parameter does not match '$testurl', can not check further, continuing with next"
            return
        fi
        # Pure reachability probe, the response headers are of no interest.
        if ! curl "${curl_args[@]}" -s --head "$host_url" -o /dev/null; then
            echo "'$host_url' is not reachable, bailing out"
            # Repeated without -s so that curl reports its error; a failure
            # here ends the script when errexit is active.
            curl "${curl_args[@]}" --head "$host_url"
        fi
        echo "'$testurl' is not reachable, assuming deleted, continuing with next"
        return
    fi
    # resorting to downloading the job details page instead of the
    # log, overwrites $out
    if ! curl "${curl_args[@]}" -s "$testurl" -o "$out"; then
        echo "'$testurl' can be reached but not downloaded, bailing out"
        curl "${curl_args[@]}" "$testurl"
        exit 2
    fi
    # if the page is there but not even an autoinst-log.txt exists
    # then the job might be too old and logs are already deleted.
    # Checking timestamp
    # (string comparison of two ISO 8601 timestamps; the right-hand side is the
    # "title" attribute of the page's .timeago element)
    if [[ $(date -uIs -d '-14days') > $(grep timeago "$out" | hxselect -s '\n' -c '.timeago::attr(title)') ]]; then
        echo "'$testurl' does not have autoinst-log.txt but is rather old, ignoring"
        return
    fi
    # Known issue: the asset download (GRU task) ran into an inactivity
    # timeout. Comment on the job with the ticket reference and restart it.
    if hxnormalize -x "$out" | hxselect -s '\n' -c '.links_a .resborder' | grep -qPzo '(?s)Gru job failed.*connection error.*Inactivity timeout'; then
        "${client_call[@]}" -X POST jobs/"$id"/comments text='poo#62456 test incompletes after failing in GRU download task on "Inactivity timeout" with no logs'
        "${client_call[@]}" -X POST jobs/"$id"/restart
        return
    fi
}

# Try to label a job with the first matching ticket from the issue tracker.
#
# Globals:   issues (read) - alternating lines: ticket id, ticket subject
#            min_search_term (read) - minimum length of a usable search term
# Arguments: $1 - openQA job id
# Returns:   0 if label_on_issue succeeded for one ticket, 1 otherwise
label_on_issues_from_issue_tracker() {
    local id=$1
    # Iterate over all progress issues that have the search term included
    # 1. line: issue id
    # 2. line: subject
    # The loop runs in a subshell, so the loop variables do not leak and the
    # subshell's exit status becomes the function's return status.
    echo "$issues" | (
        while read -r issue; do
            read -r subject
            # the search term is the text between the first and the last
            # double quote of the subject
            after=${subject#*\"}
            search=${after%\"*}
            force_result=''
            label="poo#$issue $subject"
            if [[ ${#search} -ge $min_search_term ]]; then
                if [[ $after =~ :force_result:([a-z_]+) ]]; then
                    force_result=${BASH_REMATCH[1]}
                fi
                # 4th argument becomes "1" when the subject carries ":retry"
                # right after the quoted search term
                if label_on_issue "$id" "$search" "$label" "${after//*\":retry*/1}" "$force_result"; then
                    exit 0
                fi
            fi
        done
        # Explicit failure: otherwise a skipped (too short) last ticket or an
        # empty ticket list would make the loop, and thus the function,
        # report success although nothing was labelled.
        exit 1
    )
}

# Try to label a job with one of the hard-coded known issues that have no
# ticket. The rules are tried in order and the first one that label_on_issue
# accepts wins.
#
# Arguments: $1 - openQA job id
# Returns:   0 if a rule was applied, 1 if none matched
label_on_issues_without_tickets() {
    # Take the job id from the argument instead of relying on a variable that
    # happens to be set in the caller's scope.
    local id=$1
    label_on_issue "$id" '(?s)([dD]ownload.*failed.*404).*Result: setup failure' 'label:non_existing asset, candidate for removal or wrong settings' && return
    label_on_issue "$id" 'File .*\.yaml.* does not exist at .*scheduler.pm' 'label:missing_schedule_file' && return
    label_on_issue "$id" 'Compilation failed in require at .*isotovideo line 28.' 'label:schedule_compilation_error' && return
    label_on_issue "$id" 'qemu-img: Could not open .*: No such file or directory' 'label:missing_asset' && return
    label_on_issue "$id" 'fatal: Remote branch .* not found' 'label:remote_branch_not_found, probably wrong custom git URL specified with branch' && return
    label_on_issue "$id" 'fatal: repository not found' 'label:remote_repo_not_found, probably wrong custom git URL specified' && return
    label_on_issue "$id" '(?s)Cloning git URL.*to use as test distribution.*(No scripts in|needledir not found)' 'label:remote_repo_invalid, probably wrong custom git URL specified' && return
    label_on_issue "$id" '(?s)Cloning git URL.*to use as test distribution.*(SCHEDULE.*not set|loadtest needs a script below)' 'label:remote_repo_schedule_not_found, probably wrong custom git URL + PRODUCTDIR specified' && return
    # the only rule that passes "1" as 4th argument to label_on_issue
    label_on_issue "$id" '\[error\] Failed to download' 'label:download_error potentially out-of-space worker?' 1 && return
    return 1
}

# Record a job that matched no known issue and print a log excerpt that
# likely contains the error.
#
# Globals:   to_review (appended)
# Arguments: $1 - URL of the job
#            $2 - file with the downloaded log
#            $3 - job reason, its first 50 characters end up in the summary
# Outputs:   everything goes to stderr
handle_unknown() {
    local testurl=$1 out=$2 reason=$3
    to_review+=("$testurl ${reason:0:50}")
    echoerr "$testurl : Unknown issue, to be reviewed -> $testurl/file/autoinst-log.txt"
    echoerr -e "Likely the error is within this log excerpt, last lines before shutdown:\n---"
    # Look for different termination points with likely context
    # Only the first pattern that matches is printed, minus its last line.
    # PIPESTATUS[0] tells whether any grep matched, independent of pipefail.
    # The fallback text is printed outside of the pipeline because
    # "head -n -1" would swallow a one-line message.
    if {
        grep -A 12 'Backend process died, backend errors are reported below in the following lines' "$out" ||
            grep -B 10 'sending magic and exit' "$out" ||
            grep -B 5 'killing command server.*because test execution ended through exception' "$out" ||
            grep -B 5 'EXIT 1' "$out" ||
            grep -B 10 '\(Result: died\|isotovideo failed\)' "$out"
    } | head -n -1 >&2; [[ ${PIPESTATUS[0]} -ne 0 ]]; then
        echoerr '(No log excerpt found)'
    fi
    echoerr "---"
}

# Investigate one job: download its log and try to label it with a known
# issue, otherwise record it for manual review.
#
# Globals:   out (written) - receives the log plus the job reason
#            client_args, curl_args (read)
# Arguments: $1 - URL of the job, the last path component is the job id
investigate_issue() {
    # Use the URL passed as argument rather than depending on a global of the
    # same name being set by the caller.
    local testurl=$1
    local id="${testurl##*/}"
    local reason
    local curl_output
    reason=$(openqa-cli "${client_args[@]}" jobs/"$id" | runjq -r '.job.reason')
    # curl_output only holds the HTTP status code, the body goes to $out
    curl_output=$(curl "${curl_args[@]}" -s -w "%{http_code}" "$testurl/file/autoinst-log.txt" -o "$out")
    # combine both the reason and autoinst-log.txt to check known issues
    # against even in case when autoinst-log.txt is missing the details, e.g.
    # see https://progress.opensuse.org/issues/69178
    echo "$reason" >> "$out"
    if [[ "$curl_output" != "200" ]] && [[ "$curl_output" != "301" ]]; then
        # if we can not even access the page it is something more critical
        handle_unreachable_or_no_log "$id" "$testurl" "$out"
    elif label_on_issues_from_issue_tracker "$id"; then return

    ## Issues without tickets, e.g. potential singular, manual debug jobs,
    # wrong user settings, etc.
    # could create an issue automatically with
    # $client_prefix curl -s -H "Content-Type: application/json" -X POST -H "X-Redmine-API-Key: $(sed -n 's/redmine-token = //p' ~/.query_redminerc)" --data '{"issue": {"project_id": 36, "category_id": 152, priority_id: 5, "subject": "test from command line"}}' https://progress.opensuse.org/issues.json
    # but we should check if the issue already exists, e.g. same
    # subject line
    elif label_on_issues_without_tickets "$id"; then return
    else
        handle_unknown "$testurl" "$out" "$reason"
    fi
}

# Print the list of jobs collected in to_review to stderr, preceded by a bold
# red headline with their count. Prints nothing when the list is empty.
print_summary() {
    local pending=${#to_review[@]}
    if [[ $pending -eq 0 ]]; then
        return
    fi
    # One literal "\n - <entry>" per job; the backslash escapes are expanded
    # later by echo -e.
    local entries
    printf -v entries '\\n - %s' "${to_review[@]}"
    echoerr -e "\n\e[1m\e[31m$pending unknown issues to be reviewed:\e[0m$entries"
}

# Entry point: prepare the openqa-cli call, fetch the candidate tickets and
# investigate every job URL read from stdin.
#
# Globals: dry_run, client_args, curl_args, issue_query (read)
#          client_prefix, client_call, issues, testurl (written)
main() {
    # in dry-run mode the client command is only printed, not executed
    if [[ "$dry_run" = "1" ]]; then
        client_prefix="echo"
    fi
    # a client_call preset by the caller is left untouched
    if [[ -z "$client_call" ]]; then
        client_call=(${client_prefix:+"$client_prefix"} openqa-cli "${client_args[@]}")
    fi
    # search for issues with a subject search term
    issues=$(runcurl "${curl_args[@]}" -s "$issue_query" | runjq -r '.issues | .[] | (.id,.subject)')
    # All of stdin is consumed before the first job is handled; per line only
    # the text before the first space is used.
    # shellcheck disable=SC2013
    for testurl in $(sed 's/ .*$//'); do
        investigate_issue "$testurl"
    done
    print_summary
}

# Run the script; job URLs are read from stdin, no arguments are passed on.
main