From 07cd9338d0701cac23d0b13e8e94031177de32b9 Mon Sep 17 00:00:00 2001
From: Jack Luo
Date: Fri, 19 Dec 2025 21:28:13 +0800
Subject: [PATCH 1/2] ci: Add workflow to automatically retry failed jobs

Add a new workflow that monitors other workflows and automatically
retries failed jobs on the first attempt. This handles transient
failures such as:
- Runner acquisition timeouts
- Self-hosted runners being temporarily unavailable due to maintenance
  or failures
---
 .github/workflows/retry-failed-jobs.yaml | 41 ++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 .github/workflows/retry-failed-jobs.yaml

diff --git a/.github/workflows/retry-failed-jobs.yaml b/.github/workflows/retry-failed-jobs.yaml
new file mode 100644
index 0000000000..55b42bf3b1
--- /dev/null
+++ b/.github/workflows/retry-failed-jobs.yaml
@@ -0,0 +1,41 @@
+name: "retry-failed-jobs"
+
+# This workflow automatically retries failed jobs from other workflows to handle transient failures
+# such as:
+# - Runner acquisition timeouts ("The job was not acquired by Runner of type self-hosted even after
+#   multiple attempts")
+# - Self-hosted runners being temporarily unavailable due to maintenance or failures
+#
+# How it works:
+# 1. Triggers when a monitored workflow completes (success or failure)
+# 2. If the workflow failed on its first attempt, re-runs only the failed jobs
+# 3. Uses a GitHub-hosted runner to avoid the same runner acquisition issues
+#
+# To add retry support for another workflow, add its name to the `workflows` list below.
+
+on:
+  workflow_run:
+    workflows:
+      - "clp-artifact-build"
+    types:
+      - "completed"
+
+jobs:
+  retry:
+    name: "retry-failed-jobs"
+    # Only retry if:
+    # - The workflow failed (not cancelled or successful)
+    # - This is the first attempt (prevents infinite retry loops)
+    if: >-
+      github.event.workflow_run.conclusion == 'failure'
+      && github.event.workflow_run.run_attempt == 1
+    runs-on: "ubuntu-24.04"
+    steps:
+      - name: "Re-run failed jobs"
+        env:
+          GH_TOKEN: "${{secrets.GITHUB_TOKEN}}"
+        run: >-
+          gh run rerun ${{github.event.workflow_run.id}}
+          --failed
+          --repo ${{github.repository}}
+        shell: "bash"

From baf95a61e5defdaaa82e2cdb1cdf2f13288c6afc Mon Sep 17 00:00:00 2001
From: Jack Luo
Date: Fri, 19 Dec 2025 21:42:42 +0800
Subject: [PATCH 2/2] fix: Add explicit permissions and logging for retry workflow

---
 .github/workflows/retry-failed-jobs.yaml | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/retry-failed-jobs.yaml b/.github/workflows/retry-failed-jobs.yaml
index 55b42bf3b1..7fc9d9e54c 100644
--- a/.github/workflows/retry-failed-jobs.yaml
+++ b/.github/workflows/retry-failed-jobs.yaml
@@ -20,6 +20,9 @@ on:
     types:
       - "completed"
 
+permissions:
+  actions: "write"
+
 jobs:
   retry:
     name: "retry-failed-jobs"
@@ -31,11 +34,20 @@ jobs:
       && github.event.workflow_run.run_attempt == 1
     runs-on: "ubuntu-24.04"
     steps:
+      - name: "Log retry information"
+        run: |
+          echo "Retrying failed jobs for workflow run: ${{github.event.workflow_run.id}}"
+          echo "Original workflow: ${{github.event.workflow_run.name}}"
+          echo "Triggered by: ${{github.event.workflow_run.event}}"
+          echo "Conclusion: ${{github.event.workflow_run.conclusion}}"
+        shell: "bash"
+
       - name: "Re-run failed jobs"
         env:
           GH_TOKEN: "${{secrets.GITHUB_TOKEN}}"
         run: >-
           gh run rerun ${{github.event.workflow_run.id}}
           --failed
-          --repo ${{github.repository}}
+          --repo ${{github.repository}} ||
+          (echo "Failed to rerun workflow run ${{github.event.workflow_run.id}}" && exit 1)
         shell: "bash"