Skip to content

Commit

Permalink
Provide number of active runners as output.
Browse files Browse the repository at this point in the history
  • Loading branch information
joaander committed Jun 6, 2024
1 parent 0fb5ce4 commit eb7ead8
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 32 deletions.
11 changes: 5 additions & 6 deletions .github/workflows/start.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,16 @@ on:
required: true

outputs:
exit_code:
description: "The exit code from start.py"
value: ${{ jobs.action_runners.outputs.exit_code }}
active:
description: "The number of active runners."
value: ${{ jobs.action_runners.outputs.active }}

jobs:
action_runners:
name: action runners
runs-on: ubuntu-latest
outputs:
exit_code: ${{ steps.start.outputs.exit_code }}
active: ${{ steps.start.outputs.active }}
steps:
- name: Checkout
uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
Expand All @@ -43,9 +43,8 @@ jobs:
lockfile: ".github/workflows/requirements.txt"
- id: start
name: Start action runners
continue-on-error: true
run: |
python3 start/start-action-runners.py ${{ inputs.number }} || echo "exit_code=$?" >> $GITHUB_OUTPUT
echo "active=$(python3 start/start-action-runners.py ${{ inputs.number }})" >> $GITHUB_OUTPUT
env:
OS_APPLICATION_CREDENTIAL_ID: ${{ secrets.OS_APPLICATION_CREDENTIAL_ID }}
OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }}
Expand Down
14 changes: 10 additions & 4 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,20 @@ jobs:
start_workflow:
name: Start
uses: ./.github/workflows/start.yaml
with:
number: 1
secrets: inherit

check_output:
name: Check output
needs: start_workflow
# runs-on: ubuntu-latest
runs-on: ${{ needs.start_workflow.outputs.exit_code == 3 && 'ubuntu-20.04' || 'ubuntu-24.04' }}
container: null
runs-on: ubuntu-latest

steps:
- run: "echo Exit code: ${{ needs.start_workflow.outputs.exit_code }}"
- name: "Check active runners"
run: |
echo Number of active runners: ${{ needs.start_workflow.outputs.active }}
if [[ "${{ needs.start_workflow.outputs.active }}" != "10" ]]
then
exit 1
fi
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
ci:
autoupdate_schedule: quarterly
autoupdate_branch: 'trunk-patch'
autofix_prs: false

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: 'v4.4.0'
Expand Down
3 changes: 2 additions & 1 deletion configure-manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@
- name: Configure shelve action runners job
ansible.builtin.cron:
name: shelve-action-runners
hour: "20,2"
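# shelve action runners at 20:00 and 6:00 EDT (the server runs on UTC)
# NOTE(review): hours 12 and 6 UTC correspond to 08:00 and 02:00 EDT (UTC-4) - confirm the intended schedule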
hour: "12,6"
minute: "4"
job: '/usr/bin/bash -c "date && source {{ home }}/openrc-credentials.sh && {{ miniforge_path }}/bin/python3 {{ home }}/jetstream2-admin/shelve-action-runners.py && echo" 2>&1 >> {{ home }}/shelve-action-runners.log'
user: exouser
Expand Down
40 changes: 19 additions & 21 deletions start/start-action-runners.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,13 @@ def bring_runners_online(connection, N):
"""Bring N actions-runner servers online.
Returns:
True when N (or all) actions-runner servers are online, False otherwise.
(done, n_active): `done` is `True` when N (or all) actions-runner servers are online,
`False` otherwise; `n_active` is the number of counted actions-runner servers whose
status is `ACTIVE`.
"""
try:
servers = list(connection.compute.servers())
except Exception as e:
print('::warning:: Failed to enumerate servers:', str(e))
print('::warning:: Failed to enumerate servers:', str(e), file=sys.stderr)
return False

total_runners = 0
Expand All @@ -31,41 +32,41 @@ def bring_runners_online(connection, N):
servers.sort(key=lambda server: server.name)

for server in servers:
if server.name.startswith('actions-runner'):
if server.name != 'actions-runner-manager' and server.name.startswith('actions-runner'):
if N > 0 and total_runners >= N:
break

total_runners += 1

print(
f'Server {server.name} is {server.status}({server.task_state}).'
f'Server {server.name} is {server.status}({server.task_state}).', file=sys.stderr
)
if (server.status == 'SHELVED_OFFLOADED'
and server.task_state is None):
print(f'... unshelving {server.name}.')
print(f'... unshelving {server.name}.', file=sys.stderr)

try:
connection.compute.unshelve_server(server)
except Exception as e:
print(f'::warning:: Failed to unshelve {server.name}:',
str(e))
str(e), file=sys.stderr)

elif server.status == 'SHUTOFF' and server.task_state is None:
print(f'... starting {server.name}.')
print(f'... starting {server.name}.', file=sys.stderr)

try:
connection.compute.start_server(server)
except Exception as e:
print(f'::warning:: Failed to start server {server.name}:',
str(e))
str(e), file=sys.stderr)

elif server.status == 'ACTIVE':
active_runners += 1

if total_runners == active_runners:
print(f"Success: {total_runners} actions-runner servers are active.")
print(f"Success: {total_runners} actions-runner servers are active.", file=sys.stderr)

sys.stdout.flush()
sys.stderr.flush()

return (total_runners == active_runners, active_runners)

Expand All @@ -81,26 +82,23 @@ def bring_runners_online(connection, N):

args = parser.parse_args()

# catch errors and return success so that this script doesn't stop the whole
# actions job
# catch errors and return success so that this script doesn't stop the whole actions job
try:
connection = openstack.connect()
except Exception as e:
print('::warning:: Failed to connect to cloud:', str(e))
sys.exit(1)
print('::warning:: Failed to connect to cloud:', str(e), file=sys.stderr)
print(0)
sys.exit(0)

# attempt to bring the servers online several times before returning
attempts = 0
done, active_runners = bring_runners_online(connection, args.N)
while (not done and attempts < NUM_ATTEMPTS):
attempts += 1
print(f'Waiting {TIME_BETWEEN_ATTEMPTS} seconds...', flush=True)
print(f'Waiting {TIME_BETWEEN_ATTEMPTS} seconds...', flush=True, file=sys.stderr)
time.sleep(TIME_BETWEEN_ATTEMPTS)
print('', flush=True)
print('', flush=True, file=sys.stderr)
done, active_runners = bring_runners_online(connection, args.N)

if active_runners == 0:
sys.exit(2)

# testing
sys.exit(3)
# Calling applications can redirect stdout to determine the number of active runners.
print(active_runners)

0 comments on commit eb7ead8

Please sign in to comment.