From 4fc7358dd4a62083a4e674049de15fbb19e61ee4 Mon Sep 17 00:00:00 2001 From: "Joshua A. Anderson" Date: Thu, 6 Jun 2024 06:25:47 -0400 Subject: [PATCH] Provide number of active runners as output. --- .github/workflows/start.yaml | 11 +++++----- .github/workflows/test.yaml | 15 +++++++++---- .pre-commit-config.yaml | 5 +++++ auto-shutdown.sh | 2 +- configure-manager.yaml | 3 ++- start/start-action-runners.py | 40 +++++++++++++++++------------------ 6 files changed, 43 insertions(+), 33 deletions(-) diff --git a/.github/workflows/start.yaml b/.github/workflows/start.yaml index f7109b8..005654e 100644 --- a/.github/workflows/start.yaml +++ b/.github/workflows/start.yaml @@ -16,16 +16,16 @@ on: required: true outputs: - exit_code: - description: "The exit code from start.py" - value: ${{ jobs.action_runners.outputs.exit_code }} + active: + description: "The number of active runners." + value: ${{ jobs.action_runners.outputs.active }} jobs: action_runners: name: action runners runs-on: ubuntu-latest outputs: - exit_code: ${{ steps.start.outputs.exit_code }} + active: ${{ steps.start.outputs.active }} steps: - name: Checkout uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 @@ -43,9 +43,8 @@ jobs: lockfile: ".github/workflows/requirements.txt" - id: start name: Start action runners - continue-on-error: true run: | - python3 start/start-action-runners.py ${{ inputs.number }} || echo "exit_code=$?" 
>> $GITHUB_OUTPUT + echo "active=$(python3 start/start-action-runners.py ${{ inputs.number }})" >> $GITHUB_OUTPUT env: OS_APPLICATION_CREDENTIAL_ID: ${{ secrets.OS_APPLICATION_CREDENTIAL_ID }} OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }} diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 23fb40d..4a9728b 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -9,14 +9,21 @@ jobs: start_workflow: name: Start uses: ./.github/workflows/start.yaml + with: + number: 1 secrets: inherit check_output: name: Check output needs: start_workflow - # runs-on: ubuntu-latest - runs-on: ${{ needs.start_workflow.outputs.exit_code == 3 && 'ubuntu-20.04' || 'ubuntu-24.04' }} - container: null + runs-on: ubuntu-latest steps: - - run: "echo Exit code: ${{ needs.start_workflow.outputs.exit_code }}" + - name: "Check active runners" + run: | + echo Number of active runners: ${{ needs.start_workflow.outputs.active }} + if [[ "${{ needs.start_workflow.outputs.active }}" != "1" ]] + then + echo "::error:: The action runner failed to start." + exit 1 + fi diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1d6568b..6bf26b4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,3 +1,8 @@ +ci: + autoupdate_schedule: quarterly + autoupdate_branch: 'trunk-patch' + autofix_prs: false + repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: 'v4.4.0' diff --git a/auto-shutdown.sh b/auto-shutdown.sh index 3c7e6c2..3017ae1 100755 --- a/auto-shutdown.sh +++ b/auto-shutdown.sh @@ -9,7 +9,7 @@ num_users=$(who | wc -l) num_notty_logins=$(pgrep -ai sshd | grep "@notty" | wc -l) # Ensure that the system remains up after boot. -if (( $uptime < 1800 )); then +if (( $uptime < 7200 )); then echo $(date): Skipping auto-shutdown, system up for $uptime seconds. 
exit 0 fi diff --git a/configure-manager.yaml b/configure-manager.yaml index 0125dba..c6a77a0 100644 --- a/configure-manager.yaml +++ b/configure-manager.yaml @@ -49,7 +49,8 @@ - name: Configure shelve action runners job ansible.builtin.cron: name: shelve-action-runners - hour: "20,2" + # shelve action runners at 20:00 and 6:00 EDT (the server runs on UTC) + hour: "12,6" minute: "4" job: '/usr/bin/bash -c "date && source {{ home }}/openrc-credentials.sh && {{ miniforge_path }}/bin/python3 {{ home }}/jetstream2-admin/shelve-action-runners.py && echo" 2>&1 >> {{ home }}/shelve-action-runners.log' user: exouser diff --git a/start/start-action-runners.py b/start/start-action-runners.py index 0b6c5d7..edb0f72 100644 --- a/start/start-action-runners.py +++ b/start/start-action-runners.py @@ -17,12 +17,13 @@ def bring_runners_online(connection, N): """Bring N actions-runner servers online. Returns: - True when N (or all) actions-runner servers are online, False otherwise. + (done, n_active): `done` is `True` when N (or all) actions-runner servers are online, + `False` otherwise. """ try: servers = list(connection.compute.servers()) except Exception as e: - print('::warning:: Failed to enumerate servers:', str(e)) + print('::warning:: Failed to enumerate servers:', str(e), file=sys.stderr) return False total_runners = 0 @@ -31,41 +32,41 @@ def bring_runners_online(connection, N): servers.sort(key=lambda server: server.name) for server in servers: - if server.name.startswith('actions-runner'): + if server.name != 'actions-runner-manager' and server.name.startswith('actions-runner'): if N > 0 and total_runners >= N: break total_runners += 1 print( - f'Server {server.name} is {server.status}({server.task_state}).' + f'Server {server.name} is {server.status}({server.task_state}).', file=sys.stderr ) if (server.status == 'SHELVED_OFFLOADED' and server.task_state is None): - print(f'... 
unshelving {server.name}.', file=sys.stderr) try: connection.compute.unshelve_server(server) except Exception as e: print(f'::warning:: Failed to unshelve {server.name}:', - str(e)) + str(e), file=sys.stderr) elif server.status == 'SHUTOFF' and server.task_state is None: - print(f'... starting {server.name}.') + print(f'... starting {server.name}.', file=sys.stderr) try: connection.compute.start_server(server) except Exception as e: print(f'::warning:: Failed to start server {server.name}:', - str(e)) + str(e), file=sys.stderr) elif server.status == 'ACTIVE': active_runners += 1 if total_runners == active_runners: - print(f"Success: {total_runners} actions-runner servers are active.") + print(f"Success: {total_runners} actions-runner servers are active.", file=sys.stderr) - sys.stdout.flush() + sys.stderr.flush() return (total_runners == active_runners, active_runners) @@ -81,26 +82,23 @@ def bring_runners_online(connection, N): args = parser.parse_args() - # catch errors and return success so that this script doesn't stop the whole - # actions job + # catch errors and return success so that this script doesn't stop the whole actions job try: connection = openstack.connect() except Exception as e: - print('::warning:: Failed to connect to cloud:', str(e)) - sys.exit(1) + print('::warning:: Failed to connect to cloud:', str(e), file=sys.stderr) + print(0) + sys.exit(0) # attempt to bring the servers online several times before returning attempts = 0 done, active_runners = bring_runners_online(connection, args.N) while (not done and attempts < NUM_ATTEMPTS): attempts += 1 - print(f'Waiting {TIME_BETWEEN_ATTEMPTS} seconds...', flush=True) + print(f'Waiting {TIME_BETWEEN_ATTEMPTS} seconds...', flush=True, file=sys.stderr) time.sleep(TIME_BETWEEN_ATTEMPTS) - print('', flush=True) + print('', flush=True, file=sys.stderr) done, active_runners = bring_runners_online(connection, args.N) - if active_runners == 0: - sys.exit(2) - - # testing - sys.exit(3) + # Calling 
applications can redirect stdout to determine the number of active runners. + print(active_runners)