From b21fb193eaabfad325a6085792c79c016386127a Mon Sep 17 00:00:00 2001
From: "Joshua A. Anderson" <joaander@umich.edu>
Date: Thu, 6 Jun 2024 06:25:47 -0400
Subject: [PATCH] Provide number of active runners as output.

---
 .github/workflows/start.yaml  | 11 +++++-----
 .github/workflows/test.yaml   |  8 +++----
 .pre-commit-config.yaml       |  5 +++++
 configure-manager.yaml        |  3 ++-
 start/start-action-runners.py | 40 +++++++++++++++++------------------
 5 files changed, 35 insertions(+), 32 deletions(-)

diff --git a/.github/workflows/start.yaml b/.github/workflows/start.yaml
index f7109b8..cfae9d1 100644
--- a/.github/workflows/start.yaml
+++ b/.github/workflows/start.yaml
@@ -16,16 +16,16 @@ on:
         required: true
 
     outputs:
-      exit_code:
-        description: "The exit code from start.py"
-        value: ${{ jobs.action_runners.outputs.exit_code }}
+      active:
+        description: "The number of active runners."
+        value: ${{ jobs.action_runners.outputs.active }}
 
 jobs:
   action_runners:
     name: action runners
     runs-on: ubuntu-latest
     outputs:
-      exit_code: ${{ steps.start.outputs.exit_code }}
+      active: ${{ steps.start.outputs.active }}
     steps:
     - name: Checkout
       uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6  
@@ -43,9 +43,8 @@ jobs:
         lockfile: ".github/workflows/requirements.txt"
     - id: start
       name: Start action runners
-      continue-on-error: true
       run: |
-        python3 start/start-action-runners.py ${{ inputs.number }} || echo "exit_code=$?" >> $GITHUB_OUTPUT
+        echo "active=$(python3 start/start-action-runners.py ${{ inputs.number }})" >> "$GITHUB_OUTPUT"
       env:
         OS_APPLICATION_CREDENTIAL_ID: ${{ secrets.OS_APPLICATION_CREDENTIAL_ID }}
         OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }}
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 23fb40d..1b6c829 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -9,14 +9,14 @@ jobs:
   start_workflow:
     name: Start
     uses: ./.github/workflows/start.yaml
+    with:
+      number: 1
     secrets: inherit
 
   check_output:
     name: Check output
     needs: start_workflow
-    # runs-on: ubuntu-latest
-    runs-on: ${{ needs.start_workflow.outputs.exit_code == 3 && 'ubuntu-20.04' || 'ubuntu-24.04' }}
-    container: null
+    runs-on: ubuntu-latest
 
     steps:
-    - run: "echo Exit code: ${{ needs.start_workflow.outputs.exit_code }}"
+    - run: "echo Number of active runners: ${{ needs.start_workflow.outputs.active }}"
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 1d6568b..6bf26b4 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,3 +1,8 @@
+ci:
+  autoupdate_schedule: quarterly
+  autoupdate_branch: 'trunk-patch'
+  autofix_prs: false
+
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
   rev: 'v4.4.0'
diff --git a/configure-manager.yaml b/configure-manager.yaml
index 0125dba..c6a77a0 100644
--- a/configure-manager.yaml
+++ b/configure-manager.yaml
@@ -49,7 +49,8 @@
     - name: Configure shelve action runners job
       ansible.builtin.cron:
         name: shelve-action-runners
-        hour: "20,2"
+        # shelve action runners at 12:04 and 06:04 UTC = 08:04 and 02:04 EDT (server runs on UTC). NOTE(review): 20:00/06:00 EDT would be "0,10" - confirm intent
+        hour: "12,6"
         minute: "4"
         job: '/usr/bin/bash -c "date && source {{ home }}/openrc-credentials.sh && {{ miniforge_path }}/bin/python3 {{ home }}/jetstream2-admin/shelve-action-runners.py && echo" 2>&1 >> {{ home }}/shelve-action-runners.log'
         user: exouser
diff --git a/start/start-action-runners.py b/start/start-action-runners.py
index 0b6c5d7..edb0f72 100644
--- a/start/start-action-runners.py
+++ b/start/start-action-runners.py
@@ -17,12 +17,13 @@ def bring_runners_online(connection, N):
     """Bring N actions-runner servers online.
 
     Returns:
-        True when N (or all) actions-runner servers are online, False otherwise.
+        (done, n_active): `done` is `True` when N (or all) actions-runner servers are online,
+        `False` otherwise; `n_active` is the number of those servers with status ACTIVE.
     """
     try:
         servers = list(connection.compute.servers())
     except Exception as e:
-        print('::warning:: Failed to enumerate servers:', str(e))
+        print('::warning:: Failed to enumerate servers:', str(e), file=sys.stderr)
         return False
 
     total_runners = 0
@@ -31,41 +32,41 @@ def bring_runners_online(connection, N):
     servers.sort(key=lambda server: server.name)
 
     for server in servers:
-        if server.name.startswith('actions-runner'):
+        if server.name != 'actions-runner-manager' and server.name.startswith('actions-runner'):
             if N > 0 and total_runners >= N:
                 break
 
             total_runners += 1
 
             print(
-                f'Server {server.name} is {server.status}({server.task_state}).'
+                f'Server {server.name} is {server.status}({server.task_state}).', file=sys.stderr
             )
             if (server.status == 'SHELVED_OFFLOADED'
                     and server.task_state is None):
-                print(f'... unshelving {server.name}.')
+                print(f'... unshelving {server.name}.', file=sys.stderr)
 
                 try:
                     connection.compute.unshelve_server(server)
                 except Exception as e:
                     print(f'::warning:: Failed to unshelve {server.name}:',
-                          str(e))
+                          str(e), file=sys.stderr)
 
             elif server.status == 'SHUTOFF' and server.task_state is None:
-                print(f'... starting {server.name}.')
+                print(f'... starting {server.name}.', file=sys.stderr)
 
                 try:
                     connection.compute.start_server(server)
                 except Exception as e:
                     print(f'::warning:: Failed to start server {server.name}:',
-                          str(e))
+                          str(e), file=sys.stderr)
 
             elif server.status == 'ACTIVE':
                 active_runners += 1
 
     if total_runners == active_runners:
-        print(f"Success: {total_runners} actions-runner servers are active.")
+        print(f"Success: {total_runners} actions-runner servers are active.", file=sys.stderr)
 
-    sys.stdout.flush()
+    sys.stderr.flush()
 
     return (total_runners == active_runners, active_runners)
 
@@ -81,26 +82,23 @@ def bring_runners_online(connection, N):
 
     args = parser.parse_args()
 
-    # catch errors and return success so that this script doesn't stop the whole
-    # actions job
+    # catch errors and return success so that this script doesn't stop the whole actions job
     try:
         connection = openstack.connect()
     except Exception as e:
-        print('::warning:: Failed to connect to cloud:', str(e))
-        sys.exit(1)
+        print('::warning:: Failed to connect to cloud:', str(e), file=sys.stderr)
+        print(0)
+        sys.exit(0)
 
     # attempt to bring the servers online several times before returning
     attempts = 0
     done, active_runners = bring_runners_online(connection, args.N)
     while (not done and attempts < NUM_ATTEMPTS):
         attempts += 1
-        print(f'Waiting {TIME_BETWEEN_ATTEMPTS} seconds...', flush=True)
+        print(f'Waiting {TIME_BETWEEN_ATTEMPTS} seconds...', flush=True, file=sys.stderr)
         time.sleep(TIME_BETWEEN_ATTEMPTS)
-        print('', flush=True)
+        print('', flush=True, file=sys.stderr)
         done, active_runners = bring_runners_online(connection, args.N)
 
-    if active_runners == 0:
-        sys.exit(2)
-
-    # testing
-    sys.exit(3)
+    # Print the number of active runners on stdout so that callers can capture it.
+    print(active_runners)