diff --git a/.github/workflows/retry-failed-jobs.yaml b/.github/workflows/retry-failed-jobs.yaml
new file mode 100644
index 0000000000..7fc9d9e54c
--- /dev/null
+++ b/.github/workflows/retry-failed-jobs.yaml
@@ -0,0 +1,63 @@
+name: "retry-failed-jobs"
+
+# This workflow automatically retries failed jobs from other workflows to handle transient failures
+# such as:
+# - Runner acquisition timeouts ("The job was not acquired by Runner of type self-hosted even after
+#   multiple attempts")
+# - Self-hosted runners being temporarily unavailable due to maintenance or failures
+#
+# How it works:
+# 1. Triggers when a monitored workflow completes (success or failure)
+# 2. If the workflow failed on its first attempt, re-runs only the failed jobs
+# 3. Uses a GitHub-hosted runner to avoid the same runner acquisition issues
+#
+# To add retry support for another workflow, add its name to the `workflows` list below.
+
+on:
+  workflow_run:
+    workflows:
+      - "clp-artifact-build"
+    types:
+      - "completed"
+
+# Re-running jobs of another workflow run requires write access to the Actions API.
+permissions:
+  actions: "write"
+
+jobs:
+  retry:
+    name: "retry-failed-jobs"
+    # Only retry if:
+    # - The workflow failed (not cancelled or successful)
+    # - This is the first attempt (prevents infinite retry loops)
+    if: >-
+      github.event.workflow_run.conclusion == 'failure'
+      && github.event.workflow_run.run_attempt == 1
+    runs-on: "ubuntu-24.04"
+    # Event-payload values are handed to the scripts through environment variables instead of being
+    # expanded into the script text, so the shell can never parse them as code.
+    env:
+      RUN_ID: "${{github.event.workflow_run.id}}"
+    steps:
+      - name: "Log retry information"
+        env:
+          WORKFLOW_NAME: "${{github.event.workflow_run.name}}"
+          TRIGGER_EVENT: "${{github.event.workflow_run.event}}"
+          CONCLUSION: "${{github.event.workflow_run.conclusion}}"
+        run: |
+          echo "Retrying failed jobs for workflow run: ${RUN_ID}"
+          echo "Original workflow: ${WORKFLOW_NAME}"
+          echo "Triggered by: ${TRIGGER_EVENT}"
+          echo "Conclusion: ${CONCLUSION}"
+        shell: "bash"
+
+      - name: "Re-run failed jobs"
+        env:
+          GH_TOKEN: "${{secrets.GITHUB_TOKEN}}"
+        # `GITHUB_REPOSITORY` is a default variable provided by the runner (`owner/repo`).
+        run: >-
+          gh run rerun "${RUN_ID}"
+          --failed
+          --repo "${GITHUB_REPOSITORY}" ||
+          (echo "Failed to rerun workflow run ${RUN_ID}" && exit 1)
+        shell: "bash"