From e3990e0ec17ae8c338073fe2a72448f8d5b4ef93 Mon Sep 17 00:00:00 2001
From: Henry Skiba <henry.skiba@frgrisk.com>
Date: Sun, 11 Jan 2026 08:41:08 +0800
Subject: [PATCH 1/9] Use pre-extracted runner from AMI

Skip extraction step since runner is now pre-extracted
during AMI build at /opt/actions-runner/. This reduces
startup time and disk I/O.
---
 user-data.sh | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/user-data.sh b/user-data.sh
index 7225372..d1b0475 100644
--- a/user-data.sh
+++ b/user-data.sh
@@ -57,19 +57,16 @@ sed -i 's/ap-southeast-3/us-east-2/g' /etc/apt/sources.list
 # Add ubuntu user to docker group
 usermod -aG docker ubuntu
 
-# Setup runner directory
-cd /opt
-mkdir -p actions-runner
-chown -R ubuntu:ubuntu actions-runner
-cd actions-runner
-
-# Extract runner
-log_to_cloudwatch "INFO" "Extracting GitHub runner"
-if ! sudo -u ubuntu tar xzf ../runner-cache/actions-runner-linux-* -C .; then
-    log_to_cloudwatch "ERROR" "Failed to extract runner archive"
+# Use pre-extracted runner directory
+cd /opt/actions-runner
+
+# Verify runner exists
+if [ ! -f "./run.sh" ]; then
+    log_to_cloudwatch "ERROR" "Runner not found at /opt/actions-runner"
     shutdown now
     exit 1
 fi
+log_to_cloudwatch "INFO" "Using pre-extracted GitHub runner"
 
 # Get instance type (we already have instance ID from earlier)
 INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-type)

From 20b892dfc6bb5636d9f4dcd066497e47c14482da Mon Sep 17 00:00:00 2001
From: Henry Skiba <henry.skiba@frgrisk.com>
Date: Sun, 11 Jan 2026 09:14:56 +0800
Subject: [PATCH 2/9] Add warm pool support for faster runner startup

Implement a warm pool of pre-stopped EC2 instances to reduce GitHub
runner startup time. Key features:

- New WarmPoolConfig parameter (JSON map of instance type to pool size)
- Warm pool instances stop after first boot, ready for quick activation
- When activated, shutdown behavior changes to TERMINATE (ephemeral)
- Pool is replenished by one instance per handled job while it is below
  its target size
- A pool size of 0 or an empty config disables the feature, preserving
  the existing launch-on-demand behavior

Example config: {"c8a.4xlarge":2,"c8a.2xlarge":3}

New Lambda permissions: ec2:DescribeInstances, ec2:StartInstances,
ec2:StopInstances, ec2:ModifyInstanceAttribute
---
 main.go       | 474 +++++++++++++++++++++++++++++++++++++++++++-------
 template.yaml |  17 +-
 2 files changed, 427 insertions(+), 64 deletions(-)

diff --git a/main.go b/main.go
index d7b91ef..551de6b 100644
--- a/main.go
+++ b/main.go
@@ -5,6 +5,7 @@ import (
 	"context"
 	_ "embed"
 	"encoding/base64"
+	"encoding/json"
 	"errors"
 	"fmt"
 	"log/slog"
@@ -14,6 +15,7 @@ import (
 	"strconv"
 	"strings"
 	"text/template"
+	"time"
 
 	"github.com/aws/aws-lambda-go/events"
 	"github.com/aws/aws-lambda-go/lambda"
@@ -28,6 +30,323 @@ import (
 //go:embed user-data.sh
 var userData string
 
+// LaunchConfig holds common EC2 launch configuration.
+type LaunchConfig struct {
+	ImageID            string
+	SubnetID           string
+	SecurityGroups     []string
+	KeyName            string
+	InstanceProfileArn string
+}
+
+// parseWarmPoolConfig parses the WARM_POOL_CONFIG environment variable.
+// Returns a map of instance type to target pool size.
+func parseWarmPoolConfig() map[string]int {
+	configStr := os.Getenv("WARM_POOL_CONFIG")
+	if configStr == "" || configStr == "{}" {
+		return nil
+	}
+
+	var poolConfig map[string]int
+	if err := json.Unmarshal([]byte(configStr), &poolConfig); err != nil {
+		slog.Error("failed to parse WARM_POOL_CONFIG", "error", err.Error(), "config", configStr)
+		return nil
+	}
+
+	return poolConfig
+}
+
+// findAvailableWarmInstance searches for a stopped warm pool instance of the requested type.
+func findAvailableWarmInstance(ctx context.Context, svc *ec2.Client, instanceType types.InstanceType) (*string, error) {
+	output, err := svc.DescribeInstances(ctx, &ec2.DescribeInstancesInput{
+		Filters: []types.Filter{
+			{
+				Name:   aws.String("tag:WarmPool"),
+				Values: []string{"true"},
+			},
+			{
+				Name:   aws.String("tag:WarmPoolStatus"),
+				Values: []string{"available"},
+			},
+			{
+				Name:   aws.String("tag:WarmPoolInstanceType"),
+				Values: []string{string(instanceType)},
+			},
+			{
+				Name:   aws.String("instance-state-name"),
+				Values: []string{"stopped"},
+			},
+		},
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to describe instances: %w", err)
+	}
+
+	for _, reservation := range output.Reservations {
+		for _, instance := range reservation.Instances {
+			return instance.InstanceId, nil
+		}
+	}
+
+	return nil, nil // No available instance found
+}
+
+// startWarmInstance activates a stopped warm pool instance for a job.
+// It sets the activation tag, updates user-data, changes shutdown behavior to TERMINATE, and starts the instance.
+func startWarmInstance(ctx context.Context, svc *ec2.Client, instanceID string, jobEventID int64, finalUserData string) error {
+	// Update tags to mark as activated and in-use
+	_, err := svc.CreateTags(ctx, &ec2.CreateTagsInput{
+		Resources: []string{instanceID},
+		Tags: []types.Tag{
+			{Key: aws.String("WarmPoolStatus"), Value: aws.String("in-use")},
+			{Key: aws.String("WarmPoolActivated"), Value: aws.String("true")},
+			{Key: aws.String("GitHub Workflow Job Event ID"), Value: aws.String(strconv.FormatInt(jobEventID, 10))},
+			{Key: aws.String("Name"), Value: aws.String("GitHub Workflow Ephemeral Runner")},
+		},
+	})
+	if err != nil {
+		return fmt.Errorf("failed to update tags: %w", err)
+	}
+
+	// Change shutdown behavior to TERMINATE so instance terminates after job
+	_, err = svc.ModifyInstanceAttribute(ctx, &ec2.ModifyInstanceAttributeInput{
+		InstanceId: aws.String(instanceID),
+		InstanceInitiatedShutdownBehavior: &types.AttributeValue{
+			Value: aws.String("terminate"),
+		},
+	})
+	if err != nil {
+		return fmt.Errorf("failed to update shutdown behavior: %w", err)
+	}
+
+	// Update user data with the full setup script wrapped in multipart format
+	// so it runs on every boot (including this activation)
+	wrappedUserData := wrapInMultipart(finalUserData)
+	encodedUserData := base64.StdEncoding.EncodeToString([]byte(wrappedUserData))
+	_, err = svc.ModifyInstanceAttribute(ctx, &ec2.ModifyInstanceAttributeInput{
+		InstanceId: aws.String(instanceID),
+		UserData:   &types.BlobAttributeValue{Value: []byte(encodedUserData)},
+	})
+	if err != nil {
+		return fmt.Errorf("failed to update user data: %w", err)
+	}
+
+	// Start the instance
+	startOutput, err := svc.StartInstances(ctx, &ec2.StartInstancesInput{
+		InstanceIds: []string{instanceID},
+	})
+	if err != nil {
+		return fmt.Errorf("failed to start instance: %w", err)
+	}
+
+	// Verify the instance was actually stopped before we started it
+	for _, change := range startOutput.StartingInstances {
+		if *change.InstanceId == instanceID {
+			if change.PreviousState.Name != types.InstanceStateNameStopped {
+				return fmt.Errorf("instance %s was not stopped (was %s)", instanceID, change.PreviousState.Name)
+			}
+		}
+	}
+
+	return nil
+}
+
+// countWarmPoolInstances counts available instances in the warm pool for a given type.
+func countWarmPoolInstances(ctx context.Context, svc *ec2.Client, instanceType types.InstanceType) (int, error) {
+	output, err := svc.DescribeInstances(ctx, &ec2.DescribeInstancesInput{
+		Filters: []types.Filter{
+			{
+				Name:   aws.String("tag:WarmPool"),
+				Values: []string{"true"},
+			},
+			{
+				Name:   aws.String("tag:WarmPoolStatus"),
+				Values: []string{"available"},
+			},
+			{
+				Name:   aws.String("tag:WarmPoolInstanceType"),
+				Values: []string{string(instanceType)},
+			},
+			{
+				Name:   aws.String("instance-state-name"),
+				Values: []string{"stopped", "stopping"},
+			},
+		},
+	})
+	if err != nil {
+		return 0, fmt.Errorf("failed to describe instances: %w", err)
+	}
+
+	count := 0
+	for _, reservation := range output.Reservations {
+		count += len(reservation.Instances)
+	}
+
+	return count, nil
+}
+
+// warmPoolInitUserData is a minimal script that stops the instance on first boot.
+// Uses multipart MIME format with cloud-config to run scripts on every boot.
+const warmPoolInitUserData = `Content-Type: multipart/mixed; boundary="//"
+MIME-Version: 1.0
+
+--//
+Content-Type: text/cloud-config; charset="us-ascii"
+MIME-Version: 1.0
+Content-Transfer-Encoding: 7bit
+Content-Disposition: attachment; filename="cloud-config.txt"
+
+#cloud-config
+cloud_final_modules:
+- [scripts-user, always]
+
+--//
+Content-Type: text/x-shellscript; charset="us-ascii"
+MIME-Version: 1.0
+Content-Transfer-Encoding: 7bit
+Content-Disposition: attachment; filename="userdata.txt"
+
+#!/bin/bash
+# Warm pool init: just stop the instance after first boot
+# When activated, user-data will be replaced with the real setup script
+echo "Warm pool instance initializing, stopping to enter pool..."
+shutdown -h now
+--//--
+`
+
+// wrapInMultipart wraps a shell script in multipart MIME format that runs on every boot.
+func wrapInMultipart(script string) string {
+	return fmt.Sprintf(`Content-Type: multipart/mixed; boundary="//"
+MIME-Version: 1.0
+
+--//
+Content-Type: text/cloud-config; charset="us-ascii"
+MIME-Version: 1.0
+Content-Transfer-Encoding: 7bit
+Content-Disposition: attachment; filename="cloud-config.txt"
+
+#cloud-config
+cloud_final_modules:
+- [scripts-user, always]
+
+--//
+Content-Type: text/x-shellscript; charset="us-ascii"
+MIME-Version: 1.0
+Content-Transfer-Encoding: 7bit
+Content-Disposition: attachment; filename="userdata.txt"
+
+%s
+--//--
+`, script)
+}
+
+// launchWarmPoolInstance launches a new instance destined for the warm pool.
+// Instances will stop after first boot and terminate after being used for a job.
+func launchWarmPoolInstance(ctx context.Context, svc *ec2.Client, instanceType types.InstanceType, launchConfig LaunchConfig) (*string, error) {
+	tags := []types.Tag{
+		{Key: aws.String("WarmPool"), Value: aws.String("true")},
+		{Key: aws.String("WarmPoolStatus"), Value: aws.String("available")},
+		{Key: aws.String("WarmPoolActivated"), Value: aws.String("false")},
+		{Key: aws.String("WarmPoolInstanceType"), Value: aws.String(string(instanceType))},
+		{Key: aws.String("WarmPoolCreatedAt"), Value: aws.String(time.Now().UTC().Format(time.RFC3339))},
+		{Key: aws.String("Name"), Value: aws.String(fmt.Sprintf("GitHub Runner Warm Pool - %s", instanceType))},
+	}
+
+	output, err := svc.RunInstances(ctx, &ec2.RunInstancesInput{
+		MinCount:                          aws.Int32(1),
+		MaxCount:                          aws.Int32(1),
+		EbsOptimized:                      aws.Bool(true),
+		ImageId:                           aws.String(launchConfig.ImageID),
+		InstanceInitiatedShutdownBehavior: types.ShutdownBehaviorStop, // STOP on first boot to enter warm pool
+		InstanceType:                      instanceType,
+		IamInstanceProfile: &types.IamInstanceProfileSpecification{
+			Arn: aws.String(launchConfig.InstanceProfileArn),
+		},
+		NetworkInterfaces: []types.InstanceNetworkInterfaceSpecification{
+			{
+				AssociatePublicIpAddress: aws.Bool(true),
+				SubnetId:                 aws.String(launchConfig.SubnetID),
+				DeleteOnTermination:      aws.Bool(true),
+				DeviceIndex:              aws.Int32(0),
+				Groups:                   launchConfig.SecurityGroups,
+			},
+		},
+		KeyName:    aws.String(launchConfig.KeyName),
+		Monitoring: &types.RunInstancesMonitoringEnabled{Enabled: aws.Bool(true)},
+		TagSpecifications: []types.TagSpecification{
+			{ResourceType: types.ResourceTypeInstance, Tags: tags},
+			{ResourceType: types.ResourceTypeVolume, Tags: tags},
+		},
+		UserData: aws.String(base64.StdEncoding.EncodeToString([]byte(warmPoolInitUserData))),
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to launch warm pool instance: %w", err)
+	}
+
+	if len(output.Instances) == 0 {
+		return nil, errors.New("no instance created")
+	}
+
+	return output.Instances[0].InstanceId, nil
+}
+
+// launchFreshInstance launches a new instance that terminates after use (current behavior).
+func launchFreshInstance(ctx context.Context, svc *ec2.Client, instanceType types.InstanceType, launchConfig LaunchConfig, finalUserData string, jobEventID int64) (*string, error) {
+	tags := []types.Tag{
+		{
+			Key:   aws.String("GitHub Workflow Job Event ID"),
+			Value: aws.String(strconv.FormatInt(jobEventID, 10)),
+		},
+		{
+			Key:   aws.String("Name"),
+			Value: aws.String("GitHub Workflow Ephemeral Runner"),
+		},
+	}
+
+	output, err := svc.RunInstances(ctx, &ec2.RunInstancesInput{
+		MinCount:                          aws.Int32(1),
+		MaxCount:                          aws.Int32(1),
+		EbsOptimized:                      aws.Bool(true),
+		ImageId:                           aws.String(launchConfig.ImageID),
+		InstanceInitiatedShutdownBehavior: types.ShutdownBehaviorTerminate,
+		InstanceType:                      instanceType,
+		IamInstanceProfile: &types.IamInstanceProfileSpecification{
+			Arn: aws.String(launchConfig.InstanceProfileArn),
+		},
+		NetworkInterfaces: []types.InstanceNetworkInterfaceSpecification{
+			{
+				AssociatePublicIpAddress: aws.Bool(true),
+				SubnetId:                 aws.String(launchConfig.SubnetID),
+				DeleteOnTermination:      aws.Bool(true),
+				DeviceIndex:              aws.Int32(0),
+				Groups:                   launchConfig.SecurityGroups,
+			},
+		},
+		KeyName:    aws.String(launchConfig.KeyName),
+		Monitoring: &types.RunInstancesMonitoringEnabled{Enabled: aws.Bool(true)},
+		TagSpecifications: []types.TagSpecification{
+			{
+				ResourceType: types.ResourceTypeInstance,
+				Tags:         tags,
+			},
+			{
+				ResourceType: types.ResourceTypeVolume,
+				Tags:         tags,
+			},
+		},
+		UserData: aws.String(base64.StdEncoding.EncodeToString([]byte(finalUserData))),
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to launch instance: %w", err)
+	}
+
+	if len(output.Instances) == 0 {
+		return nil, errors.New("no instance created")
+	}
+
+	return output.Instances[0].InstanceId, nil
+}
+
 func handler(request events.APIGatewayProxyRequest) (events.APIGatewayProxyResponse, error) {
 	var githubEventHeader string
 
@@ -62,7 +381,9 @@ func handler(request events.APIGatewayProxyRequest) (events.APIGatewayProxyRespo
 			return events.APIGatewayProxyResponse{StatusCode: http.StatusOK}, nil
 		}
 
-		cfg, err := config.LoadDefaultConfig(context.TODO(), config.WithRegion("us-east-2"))
+		ctx := context.TODO()
+
+		cfg, err := config.LoadDefaultConfig(ctx, config.WithRegion("us-east-2"))
 		if err != nil {
 			return events.APIGatewayProxyResponse{StatusCode: http.StatusInternalServerError}, err
 		}
@@ -77,7 +398,7 @@ func handler(request events.APIGatewayProxyRequest) (events.APIGatewayProxyRespo
 			return events.APIGatewayProxyResponse{StatusCode: http.StatusInternalServerError}, errors.New("secret name missing")
 		}
 
-		secretOut, err := sm.GetSecretValue(context.TODO(), &secretsmanager.GetSecretValueInput{SecretId: aws.String(secretName)})
+		secretOut, err := sm.GetSecretValue(ctx, &secretsmanager.GetSecretValueInput{SecretId: aws.String(secretName)})
 		if err != nil {
 			slog.Error("failed to get secret", "secret", secretName, "error", err.Error())
 
@@ -128,15 +449,12 @@ func handler(request events.APIGatewayProxyRequest) (events.APIGatewayProxyRespo
 			return events.APIGatewayProxyResponse{StatusCode: http.StatusInternalServerError}, errors.New("image id missing")
 		}
 
-		tags := []types.Tag{
-			{
-				Key:   aws.String("GitHub Workflow Job Event ID"),
-				Value: aws.String(strconv.Itoa(int(event.GetWorkflowJob().GetID()))),
-			},
-			{
-				Key:   aws.String("Name"),
-				Value: aws.String("GitHub Workflow Ephemeral Runner"),
-			},
+		launchConfig := LaunchConfig{
+			ImageID:            imageID,
+			SubnetID:           subnetID,
+			SecurityGroups:     securityGroups,
+			KeyName:            keyName,
+			InstanceProfileArn: instanceProfileArn,
 		}
 
 		ephemeral := slices.Contains(event.GetWorkflowJob().Labels, "ephemeral")
@@ -159,7 +477,7 @@ func handler(request events.APIGatewayProxyRequest) (events.APIGatewayProxyRespo
 			}
 		}
 
-		slog.Info("creating instance", "instanceType", instanceType)
+		slog.Info("processing job", "instanceType", instanceType, "jobID", event.GetWorkflowJob().GetID())
 
 		tpl, err := template.New("userdata").Parse(userData)
 		if err != nil {
@@ -172,66 +490,96 @@ func handler(request events.APIGatewayProxyRequest) (events.APIGatewayProxyRespo
 		}
 
 		finalUserData := buf.String()
+		jobEventID := event.GetWorkflowJob().GetID()
 
-		output, err := svc.RunInstances(
-			context.TODO(),
-			&ec2.RunInstancesInput{
-				MinCount:                          aws.Int32(1),
-				MaxCount:                          aws.Int32(1),
-				EbsOptimized:                      aws.Bool(true),
-				ImageId:                           aws.String(imageID),
-				InstanceInitiatedShutdownBehavior: types.ShutdownBehaviorTerminate,
-				InstanceType:                      instanceType,
-				IamInstanceProfile: &types.IamInstanceProfileSpecification{
-					Arn: aws.String(instanceProfileArn),
-				},
-				NetworkInterfaces: []types.InstanceNetworkInterfaceSpecification{
-					{
-						AssociatePublicIpAddress: aws.Bool(true),
-						SubnetId:                 aws.String(subnetID),
-						DeleteOnTermination:      aws.Bool(true),
-						DeviceIndex:              aws.Int32(0),
-						Groups:                   securityGroups,
-					},
-				},
-				KeyName:    aws.String(keyName),
-				Monitoring: &types.RunInstancesMonitoringEnabled{Enabled: aws.Bool(true)},
-				TagSpecifications: []types.TagSpecification{
-					{
-						ResourceType: types.ResourceTypeInstance,
-						Tags:         tags,
-					},
-					{
-						ResourceType: types.ResourceTypeVolume,
-						Tags:         tags,
-					},
-				},
-				// base64 encode user data
-				UserData: aws.String(base64.StdEncoding.EncodeToString([]byte(finalUserData))),
-			},
-		)
-		if err != nil {
-			slog.Error(err.Error())
+		// Parse warm pool configuration
+		poolConfig := parseWarmPoolConfig()
+		targetSize := 0
+		if poolConfig != nil {
+			targetSize = poolConfig[string(instanceType)]
+		}
 
-			return events.APIGatewayProxyResponse{
-				Body:       err.Error(),
-				StatusCode: http.StatusInternalServerError,
-			}, err
+		var instanceID *string
+
+		// Try warm pool if configured for this instance type
+		if targetSize > 0 {
+			slog.Info("checking warm pool", "instanceType", instanceType, "targetSize", targetSize)
+
+			warmInstanceID, err := findAvailableWarmInstance(ctx, svc, instanceType)
+			if err != nil {
+				slog.Warn("failed to query warm pool", "error", err.Error())
+				// Fall through to fresh launch
+			} else if warmInstanceID != nil {
+				slog.Info("found warm pool instance", "instanceID", *warmInstanceID)
+
+				err = startWarmInstance(ctx, svc, *warmInstanceID, jobEventID, finalUserData)
+				if err != nil {
+					slog.Error("failed to start warm instance", "instanceID", *warmInstanceID, "error", err.Error())
+					// Mark instance for cleanup and fall through to fresh launch
+					_, _ = svc.CreateTags(ctx, &ec2.CreateTagsInput{
+						Resources: []string{*warmInstanceID},
+						Tags: []types.Tag{
+							{Key: aws.String("WarmPoolStatus"), Value: aws.String("failed")},
+						},
+					})
+				} else {
+					instanceID = warmInstanceID
+					slog.Info("started warm pool instance", "instanceID", *instanceID)
+				}
+			} else {
+				slog.Info("no warm pool instance available", "instanceType", instanceType)
+			}
 		}
 
-		if len(output.Instances) == 0 {
-			slog.Error("no instance created")
+		// Fallback: launch fresh instance if no warm pool instance was used
+		if instanceID == nil {
+			slog.Info("launching fresh instance", "instanceType", instanceType)
+
+			if targetSize > 0 {
+				// Warm pool is configured but no instance available - launch fresh instance that terminates
+				// (Don't create a warm pool instance that will stop - we need it now)
+				instanceID, err = launchFreshInstance(ctx, svc, instanceType, launchConfig, finalUserData, jobEventID)
+				if err != nil {
+					slog.Error("failed to launch fresh instance", "error", err.Error())
+					return events.APIGatewayProxyResponse{
+						Body:       err.Error(),
+						StatusCode: http.StatusInternalServerError,
+					}, err
+				}
+			} else {
+				// No warm pool configured - launch regular instance that terminates
+				instanceID, err = launchFreshInstance(ctx, svc, instanceType, launchConfig, finalUserData, jobEventID)
+				if err != nil {
+					slog.Error("failed to launch fresh instance", "error", err.Error())
+					return events.APIGatewayProxyResponse{
+						Body:       err.Error(),
+						StatusCode: http.StatusInternalServerError,
+					}, err
+				}
+			}
 
-			return events.APIGatewayProxyResponse{
-				Body:       "no instance created",
-				StatusCode: http.StatusInternalServerError,
-			}, nil
+			slog.Info("instance launched", "instanceID", *instanceID)
 		}
 
-		slog.Info("instance created", "instanceID", output.Instances[0].InstanceId)
+		// Replenish warm pool if we used a warm instance (or if pool is under target)
+		if targetSize > 0 {
+			currentCount, err := countWarmPoolInstances(ctx, svc, instanceType)
+			if err != nil {
+				slog.Warn("failed to count warm pool", "error", err.Error())
+			} else if currentCount < targetSize {
+				slog.Info("replenishing warm pool", "instanceType", instanceType, "current", currentCount, "target", targetSize)
+
+				newID, err := launchWarmPoolInstance(ctx, svc, instanceType, launchConfig)
+				if err != nil {
+					slog.Warn("failed to launch warm pool replacement", "error", err.Error())
+				} else {
+					slog.Info("launched warm pool replacement", "instanceID", *newID)
+				}
+			}
+		}
 
 		return events.APIGatewayProxyResponse{
-			Body:       *output.Instances[0].InstanceId,
+			Body:       *instanceID,
 			StatusCode: http.StatusOK,
 		}, nil
 
diff --git a/template.yaml b/template.yaml
index ee8f81b..e90e6e6 100644
--- a/template.yaml
+++ b/template.yaml
@@ -21,11 +21,17 @@ Parameters:
   KeyName:
     Type: String
     Description: EC2 key pair name for the runner
+  WarmPoolConfig:
+    Type: String
+    Default: "{}"
+    Description: >
+      JSON map of instance type to pool size. Empty {} disables warm pool.
+      Example: {"c8a.2xlarge":2,"c8a.4xlarge":1}
 
 # More info about Globals: https://github.com/awslabs/serverless-application-model/blob/master/docs/globals.rst
 Globals:
   Function:
-    Timeout: 5
+    Timeout: 30
     MemorySize: 128
 
 Resources:
@@ -86,6 +92,7 @@ Resources:
           SECURITY_GROUP_IDS: !Ref SecurityGroupIds
           KEY_NAME: !Ref KeyName
           INSTANCE_PROFILE_ARN: !GetAtt RunnerInstanceProfile.Arn
+          WARM_POOL_CONFIG: !Ref WarmPoolConfig
       Policies:
         - Statement:
             - Sid: RunInstances
@@ -96,6 +103,14 @@ Resources:
                 - ec2:CreateTags
                 - ec2:RunInstances
               Resource: "*"
+            - Sid: WarmPoolManagement
+              Effect: Allow
+              Action:
+                - ec2:DescribeInstances
+                - ec2:StartInstances
+                - ec2:StopInstances
+                - ec2:ModifyInstanceAttribute
+              Resource: "*"
             - Sid: GetGitHubPAT
               Effect: Allow
               Action:

From b7f8fbe4832981a2478f08ec925a56062ec6ab26 Mon Sep 17 00:00:00 2001
From: Henry Skiba <henry.skiba@frgrisk.com>
Date: Sun, 11 Jan 2026 09:21:51 +0800
Subject: [PATCH 3/9] Simplify warm pool implementation

- Extract warmPoolFilters() helper to reduce duplication
- Consolidate EC2 launch logic into buildRunInstancesInput() and launchInstance()
- Extract tryAcquireWarmInstance() and replenishWarmPool() helpers
- Use single multipartTemplate constant
- Simplify nil map access (indexing a nil map in Go yields the zero
  value)

Net reduction of ~80 lines while improving readability.
---
 main.go | 322 +++++++++++++++++++++-----------------------------------
 1 file changed, 122 insertions(+), 200 deletions(-)

diff --git a/main.go b/main.go
index 551de6b..5579fb9 100644
--- a/main.go
+++ b/main.go
@@ -56,27 +56,20 @@ func parseWarmPoolConfig() map[string]int {
 	return poolConfig
 }
 
+// warmPoolFilters returns the common filters for querying warm pool instances.
+func warmPoolFilters(instanceType types.InstanceType, states []string) []types.Filter {
+	return []types.Filter{
+		{Name: aws.String("tag:WarmPool"), Values: []string{"true"}},
+		{Name: aws.String("tag:WarmPoolStatus"), Values: []string{"available"}},
+		{Name: aws.String("tag:WarmPoolInstanceType"), Values: []string{string(instanceType)}},
+		{Name: aws.String("instance-state-name"), Values: states},
+	}
+}
+
 // findAvailableWarmInstance searches for a stopped warm pool instance of the requested type.
 func findAvailableWarmInstance(ctx context.Context, svc *ec2.Client, instanceType types.InstanceType) (*string, error) {
 	output, err := svc.DescribeInstances(ctx, &ec2.DescribeInstancesInput{
-		Filters: []types.Filter{
-			{
-				Name:   aws.String("tag:WarmPool"),
-				Values: []string{"true"},
-			},
-			{
-				Name:   aws.String("tag:WarmPoolStatus"),
-				Values: []string{"available"},
-			},
-			{
-				Name:   aws.String("tag:WarmPoolInstanceType"),
-				Values: []string{string(instanceType)},
-			},
-			{
-				Name:   aws.String("instance-state-name"),
-				Values: []string{"stopped"},
-			},
-		},
+		Filters: warmPoolFilters(instanceType, []string{"stopped"}),
 	})
 	if err != nil {
 		return nil, fmt.Errorf("failed to describe instances: %w", err)
@@ -88,7 +81,7 @@ func findAvailableWarmInstance(ctx context.Context, svc *ec2.Client, instanceTyp
 		}
 	}
 
-	return nil, nil // No available instance found
+	return nil, nil
 }
 
 // startWarmInstance activates a stopped warm pool instance for a job.
@@ -154,24 +147,7 @@ func startWarmInstance(ctx context.Context, svc *ec2.Client, instanceID string,
 // countWarmPoolInstances counts available instances in the warm pool for a given type.
 func countWarmPoolInstances(ctx context.Context, svc *ec2.Client, instanceType types.InstanceType) (int, error) {
 	output, err := svc.DescribeInstances(ctx, &ec2.DescribeInstancesInput{
-		Filters: []types.Filter{
-			{
-				Name:   aws.String("tag:WarmPool"),
-				Values: []string{"true"},
-			},
-			{
-				Name:   aws.String("tag:WarmPoolStatus"),
-				Values: []string{"available"},
-			},
-			{
-				Name:   aws.String("tag:WarmPoolInstanceType"),
-				Values: []string{string(instanceType)},
-			},
-			{
-				Name:   aws.String("instance-state-name"),
-				Values: []string{"stopped", "stopping"},
-			},
-		},
+		Filters: warmPoolFilters(instanceType, []string{"stopped", "stopping"}),
 	})
 	if err != nil {
 		return 0, fmt.Errorf("failed to describe instances: %w", err)
@@ -185,9 +161,8 @@ func countWarmPoolInstances(ctx context.Context, svc *ec2.Client, instanceType t
 	return count, nil
 }
 
-// warmPoolInitUserData is a minimal script that stops the instance on first boot.
-// Uses multipart MIME format with cloud-config to run scripts on every boot.
-const warmPoolInitUserData = `Content-Type: multipart/mixed; boundary="//"
+// multipartTemplate is the MIME multipart format for user-data that runs on every boot.
+const multipartTemplate = `Content-Type: multipart/mixed; boundary="//"
 MIME-Version: 1.0
 
 --//
@@ -206,58 +181,30 @@ MIME-Version: 1.0
 Content-Transfer-Encoding: 7bit
 Content-Disposition: attachment; filename="userdata.txt"
 
-#!/bin/bash
-# Warm pool init: just stop the instance after first boot
-# When activated, user-data will be replaced with the real setup script
-echo "Warm pool instance initializing, stopping to enter pool..."
-shutdown -h now
+%s
 --//--
 `
 
 // wrapInMultipart wraps a shell script in multipart MIME format that runs on every boot.
 func wrapInMultipart(script string) string {
-	return fmt.Sprintf(`Content-Type: multipart/mixed; boundary="//"
-MIME-Version: 1.0
-
---//
-Content-Type: text/cloud-config; charset="us-ascii"
-MIME-Version: 1.0
-Content-Transfer-Encoding: 7bit
-Content-Disposition: attachment; filename="cloud-config.txt"
-
-#cloud-config
-cloud_final_modules:
-- [scripts-user, always]
-
---//
-Content-Type: text/x-shellscript; charset="us-ascii"
-MIME-Version: 1.0
-Content-Transfer-Encoding: 7bit
-Content-Disposition: attachment; filename="userdata.txt"
-
-%s
---//--
-`, script)
+	return fmt.Sprintf(multipartTemplate, script)
 }
 
-// launchWarmPoolInstance launches a new instance destined for the warm pool.
-// Instances will stop after first boot and terminate after being used for a job.
-func launchWarmPoolInstance(ctx context.Context, svc *ec2.Client, instanceType types.InstanceType, launchConfig LaunchConfig) (*string, error) {
-	tags := []types.Tag{
-		{Key: aws.String("WarmPool"), Value: aws.String("true")},
-		{Key: aws.String("WarmPoolStatus"), Value: aws.String("available")},
-		{Key: aws.String("WarmPoolActivated"), Value: aws.String("false")},
-		{Key: aws.String("WarmPoolInstanceType"), Value: aws.String(string(instanceType))},
-		{Key: aws.String("WarmPoolCreatedAt"), Value: aws.String(time.Now().UTC().Format(time.RFC3339))},
-		{Key: aws.String("Name"), Value: aws.String(fmt.Sprintf("GitHub Runner Warm Pool - %s", instanceType))},
-	}
+// warmPoolInitUserData is the script that stops the instance on first boot to enter the warm pool.
+var warmPoolInitUserData = wrapInMultipart(`#!/bin/bash
+# Warm pool init: just stop the instance after first boot
+# When activated, user-data will be replaced with the real setup script
+echo "Warm pool instance initializing, stopping to enter pool..."
+shutdown -h now`)
 
-	output, err := svc.RunInstances(ctx, &ec2.RunInstancesInput{
+// buildRunInstancesInput creates a base RunInstancesInput with common configuration.
+func buildRunInstancesInput(instanceType types.InstanceType, launchConfig LaunchConfig, shutdownBehavior types.ShutdownBehavior, tags []types.Tag, userData string) *ec2.RunInstancesInput {
+	return &ec2.RunInstancesInput{
 		MinCount:                          aws.Int32(1),
 		MaxCount:                          aws.Int32(1),
 		EbsOptimized:                      aws.Bool(true),
 		ImageId:                           aws.String(launchConfig.ImageID),
-		InstanceInitiatedShutdownBehavior: types.ShutdownBehaviorStop, // STOP on first boot to enter warm pool
+		InstanceInitiatedShutdownBehavior: shutdownBehavior,
 		InstanceType:                      instanceType,
 		IamInstanceProfile: &types.IamInstanceProfileSpecification{
 			Arn: aws.String(launchConfig.InstanceProfileArn),
@@ -277,74 +224,105 @@ func launchWarmPoolInstance(ctx context.Context, svc *ec2.Client, instanceType t
 			{ResourceType: types.ResourceTypeInstance, Tags: tags},
 			{ResourceType: types.ResourceTypeVolume, Tags: tags},
 		},
-		UserData: aws.String(base64.StdEncoding.EncodeToString([]byte(warmPoolInitUserData))),
-	})
-	if err != nil {
-		return nil, fmt.Errorf("failed to launch warm pool instance: %w", err)
+		UserData: aws.String(base64.StdEncoding.EncodeToString([]byte(userData))),
 	}
+}
 
+// launchInstance runs an EC2 instance and returns its ID.
+func launchInstance(ctx context.Context, svc *ec2.Client, input *ec2.RunInstancesInput) (*string, error) {
+	output, err := svc.RunInstances(ctx, input)
+	if err != nil {
+		return nil, err
+	}
 	if len(output.Instances) == 0 {
 		return nil, errors.New("no instance created")
 	}
-
 	return output.Instances[0].InstanceId, nil
 }
 
-// launchFreshInstance launches a new instance that terminates after use (current behavior).
+// launchWarmPoolInstance launches a new instance destined for the warm pool.
+// Instances will stop after first boot and terminate after being used for a job.
+func launchWarmPoolInstance(ctx context.Context, svc *ec2.Client, instanceType types.InstanceType, launchConfig LaunchConfig) (*string, error) {
+	tags := []types.Tag{
+		{Key: aws.String("WarmPool"), Value: aws.String("true")},
+		{Key: aws.String("WarmPoolStatus"), Value: aws.String("available")},
+		{Key: aws.String("WarmPoolActivated"), Value: aws.String("false")},
+		{Key: aws.String("WarmPoolInstanceType"), Value: aws.String(string(instanceType))},
+		{Key: aws.String("WarmPoolCreatedAt"), Value: aws.String(time.Now().UTC().Format(time.RFC3339))},
+		{Key: aws.String("Name"), Value: aws.String(fmt.Sprintf("GitHub Runner Warm Pool - %s", instanceType))},
+	}
+
+	input := buildRunInstancesInput(instanceType, launchConfig, types.ShutdownBehaviorStop, tags, warmPoolInitUserData)
+	instanceID, err := launchInstance(ctx, svc, input)
+	if err != nil {
+		return nil, fmt.Errorf("failed to launch warm pool instance: %w", err)
+	}
+	return instanceID, nil
+}
+
+// launchFreshInstance launches a new instance that terminates after use.
 func launchFreshInstance(ctx context.Context, svc *ec2.Client, instanceType types.InstanceType, launchConfig LaunchConfig, finalUserData string, jobEventID int64) (*string, error) {
 	tags := []types.Tag{
-		{
-			Key:   aws.String("GitHub Workflow Job Event ID"),
-			Value: aws.String(strconv.FormatInt(jobEventID, 10)),
-		},
-		{
-			Key:   aws.String("Name"),
-			Value: aws.String("GitHub Workflow Ephemeral Runner"),
-		},
+		{Key: aws.String("GitHub Workflow Job Event ID"), Value: aws.String(strconv.FormatInt(jobEventID, 10))},
+		{Key: aws.String("Name"), Value: aws.String("GitHub Workflow Ephemeral Runner")},
 	}
 
-	output, err := svc.RunInstances(ctx, &ec2.RunInstancesInput{
-		MinCount:                          aws.Int32(1),
-		MaxCount:                          aws.Int32(1),
-		EbsOptimized:                      aws.Bool(true),
-		ImageId:                           aws.String(launchConfig.ImageID),
-		InstanceInitiatedShutdownBehavior: types.ShutdownBehaviorTerminate,
-		InstanceType:                      instanceType,
-		IamInstanceProfile: &types.IamInstanceProfileSpecification{
-			Arn: aws.String(launchConfig.InstanceProfileArn),
-		},
-		NetworkInterfaces: []types.InstanceNetworkInterfaceSpecification{
-			{
-				AssociatePublicIpAddress: aws.Bool(true),
-				SubnetId:                 aws.String(launchConfig.SubnetID),
-				DeleteOnTermination:      aws.Bool(true),
-				DeviceIndex:              aws.Int32(0),
-				Groups:                   launchConfig.SecurityGroups,
-			},
-		},
-		KeyName:    aws.String(launchConfig.KeyName),
-		Monitoring: &types.RunInstancesMonitoringEnabled{Enabled: aws.Bool(true)},
-		TagSpecifications: []types.TagSpecification{
-			{
-				ResourceType: types.ResourceTypeInstance,
-				Tags:         tags,
-			},
-			{
-				ResourceType: types.ResourceTypeVolume,
-				Tags:         tags,
-			},
-		},
-		UserData: aws.String(base64.StdEncoding.EncodeToString([]byte(finalUserData))),
-	})
+	input := buildRunInstancesInput(instanceType, launchConfig, types.ShutdownBehaviorTerminate, tags, finalUserData)
+	instanceID, err := launchInstance(ctx, svc, input)
 	if err != nil {
 		return nil, fmt.Errorf("failed to launch instance: %w", err)
 	}
+	return instanceID, nil
+}
 
-	if len(output.Instances) == 0 {
-		return nil, errors.New("no instance created")
+// tryAcquireWarmInstance attempts to acquire and start a warm pool instance.
+// Returns the instance ID if successful, nil if no instance available or on failure.
+func tryAcquireWarmInstance(ctx context.Context, svc *ec2.Client, instanceType types.InstanceType, jobEventID int64, finalUserData string) *string {
+	warmInstanceID, err := findAvailableWarmInstance(ctx, svc, instanceType)
+	if err != nil {
+		slog.Warn("failed to query warm pool", "error", err.Error())
+		return nil
+	}
+	if warmInstanceID == nil {
+		slog.Info("no warm pool instance available", "instanceType", instanceType)
+		return nil
 	}
 
-	return output.Instances[0].InstanceId, nil
+	slog.Info("found warm pool instance", "instanceID", *warmInstanceID)
+
+	if err := startWarmInstance(ctx, svc, *warmInstanceID, jobEventID, finalUserData); err != nil {
+		slog.Error("failed to start warm instance", "instanceID", *warmInstanceID, "error", err.Error())
+		// Mark instance for cleanup
+		_, _ = svc.CreateTags(ctx, &ec2.CreateTagsInput{
+			Resources: []string{*warmInstanceID},
+			Tags:      []types.Tag{{Key: aws.String("WarmPoolStatus"), Value: aws.String("failed")}},
+		})
+		return nil
+	}
+
+	slog.Info("started warm pool instance", "instanceID", *warmInstanceID)
+	return warmInstanceID
+}
+
+// replenishWarmPool launches replacement instances if the pool is below target size.
+func replenishWarmPool(ctx context.Context, svc *ec2.Client, instanceType types.InstanceType, launchConfig LaunchConfig, targetSize int) {
+	currentCount, err := countWarmPoolInstances(ctx, svc, instanceType)
+	if err != nil {
+		slog.Warn("failed to count warm pool", "error", err.Error())
+		return
+	}
+	if currentCount >= targetSize {
+		return
+	}
+
+	slog.Info("replenishing warm pool", "instanceType", instanceType, "current", currentCount, "target", targetSize)
+
+	newID, err := launchWarmPoolInstance(ctx, svc, instanceType, launchConfig)
+	if err != nil {
+		slog.Warn("failed to launch warm pool replacement", "error", err.Error())
+		return
+	}
+	slog.Info("launched warm pool replacement", "instanceID", *newID)
 }
 
 func handler(request events.APIGatewayProxyRequest) (events.APIGatewayProxyResponse, error) {
@@ -492,90 +470,34 @@ func handler(request events.APIGatewayProxyRequest) (events.APIGatewayProxyRespo
 		finalUserData := buf.String()
 		jobEventID := event.GetWorkflowJob().GetID()
 
-		// Parse warm pool configuration
+		// Get warm pool target size for this instance type
 		poolConfig := parseWarmPoolConfig()
-		targetSize := 0
-		if poolConfig != nil {
-			targetSize = poolConfig[string(instanceType)]
-		}
+		targetSize := poolConfig[string(instanceType)]
 
+		// Try warm pool first if configured
 		var instanceID *string
-
-		// Try warm pool if configured for this instance type
 		if targetSize > 0 {
 			slog.Info("checking warm pool", "instanceType", instanceType, "targetSize", targetSize)
-
-			warmInstanceID, err := findAvailableWarmInstance(ctx, svc, instanceType)
-			if err != nil {
-				slog.Warn("failed to query warm pool", "error", err.Error())
-				// Fall through to fresh launch
-			} else if warmInstanceID != nil {
-				slog.Info("found warm pool instance", "instanceID", *warmInstanceID)
-
-				err = startWarmInstance(ctx, svc, *warmInstanceID, jobEventID, finalUserData)
-				if err != nil {
-					slog.Error("failed to start warm instance", "instanceID", *warmInstanceID, "error", err.Error())
-					// Mark instance for cleanup and fall through to fresh launch
-					_, _ = svc.CreateTags(ctx, &ec2.CreateTagsInput{
-						Resources: []string{*warmInstanceID},
-						Tags: []types.Tag{
-							{Key: aws.String("WarmPoolStatus"), Value: aws.String("failed")},
-						},
-					})
-				} else {
-					instanceID = warmInstanceID
-					slog.Info("started warm pool instance", "instanceID", *instanceID)
-				}
-			} else {
-				slog.Info("no warm pool instance available", "instanceType", instanceType)
-			}
+			instanceID = tryAcquireWarmInstance(ctx, svc, instanceType, jobEventID, finalUserData)
 		}
 
-		// Fallback: launch fresh instance if no warm pool instance was used
+		// Launch fresh instance if warm pool not available or not configured
 		if instanceID == nil {
 			slog.Info("launching fresh instance", "instanceType", instanceType)
-
-			if targetSize > 0 {
-				// Warm pool is configured but no instance available - launch fresh instance that terminates
-				// (Don't create a warm pool instance that will stop - we need it now)
-				instanceID, err = launchFreshInstance(ctx, svc, instanceType, launchConfig, finalUserData, jobEventID)
-				if err != nil {
-					slog.Error("failed to launch fresh instance", "error", err.Error())
-					return events.APIGatewayProxyResponse{
-						Body:       err.Error(),
-						StatusCode: http.StatusInternalServerError,
-					}, err
-				}
-			} else {
-				// No warm pool configured - launch regular instance that terminates
-				instanceID, err = launchFreshInstance(ctx, svc, instanceType, launchConfig, finalUserData, jobEventID)
-				if err != nil {
-					slog.Error("failed to launch fresh instance", "error", err.Error())
-					return events.APIGatewayProxyResponse{
-						Body:       err.Error(),
-						StatusCode: http.StatusInternalServerError,
-					}, err
-				}
+			instanceID, err = launchFreshInstance(ctx, svc, instanceType, launchConfig, finalUserData, jobEventID)
+			if err != nil {
+				slog.Error("failed to launch fresh instance", "error", err.Error())
+				return events.APIGatewayProxyResponse{
+					Body:       err.Error(),
+					StatusCode: http.StatusInternalServerError,
+				}, err
 			}
-
 			slog.Info("instance launched", "instanceID", *instanceID)
 		}
 
-		// Replenish warm pool if we used a warm instance (or if pool is under target)
+		// Replenish warm pool if needed
 		if targetSize > 0 {
-			currentCount, err := countWarmPoolInstances(ctx, svc, instanceType)
-			if err != nil {
-				slog.Warn("failed to count warm pool", "error", err.Error())
-			} else if currentCount < targetSize {
-				slog.Info("replenishing warm pool", "instanceType", instanceType, "current", currentCount, "target", targetSize)
-
-				newID, err := launchWarmPoolInstance(ctx, svc, instanceType, launchConfig)
-				if err != nil {
-					slog.Warn("failed to launch warm pool replacement", "error", err.Error())
-				} else {
-					slog.Info("launched warm pool replacement", "instanceID", *newID)
-				}
-			}
+			replenishWarmPool(ctx, svc, instanceType, launchConfig, targetSize)
 		}
 
 		return events.APIGatewayProxyResponse{

From c870e7c3dcad6cb70b4cd7180e6a16f6265eaf43 Mon Sep 17 00:00:00 2001
From: Henry Skiba <henry.skiba@frgrisk.com>
Date: Sun, 11 Jan 2026 09:28:58 +0800
Subject: [PATCH 4/9] Add scheduled warm pool maintenance

- Add CloudWatch Events rule that triggers every 5 minutes
- Add handleMaintenance() to check and populate all configured instance types
- Refactor handler to dispatch between API Gateway and scheduled events
- Extract getLaunchConfig() helper to reduce duplication

The maintenance function iterates through all configured instance types
and launches instances to reach target pool sizes.
---
 main.go       | 120 +++++++++++++++++++++++++++++++++++++++++++++++++-
 template.yaml |   7 +++
 2 files changed, 126 insertions(+), 1 deletion(-)

diff --git a/main.go b/main.go
index 5579fb9..1e44baf 100644
--- a/main.go
+++ b/main.go
@@ -325,7 +325,125 @@ func replenishWarmPool(ctx context.Context, svc *ec2.Client, instanceType types.
 	slog.Info("launched warm pool replacement", "instanceID", *newID)
 }
 
-func handler(request events.APIGatewayProxyRequest) (events.APIGatewayProxyResponse, error) {
+// getLaunchConfig builds a LaunchConfig from SUBNET_ID, SECURITY_GROUP_IDS (comma-separated), KEY_NAME, INSTANCE_PROFILE_ARN and IMAGE_ID; it returns an error naming the first variable that is unset or empty.
+func getLaunchConfig() (LaunchConfig, error) {
+	subnetID := os.Getenv("SUBNET_ID")
+	if subnetID == "" {
+		return LaunchConfig{}, errors.New("SUBNET_ID env var not set")
+	}
+
+	sgIDs := os.Getenv("SECURITY_GROUP_IDS")
+	if sgIDs == "" {
+		return LaunchConfig{}, errors.New("SECURITY_GROUP_IDS env var not set")
+	}
+
+	keyName := os.Getenv("KEY_NAME")
+	if keyName == "" {
+		return LaunchConfig{}, errors.New("KEY_NAME env var not set")
+	}
+
+	instanceProfileArn := os.Getenv("INSTANCE_PROFILE_ARN")
+	if instanceProfileArn == "" {
+		return LaunchConfig{}, errors.New("INSTANCE_PROFILE_ARN env var not set")
+	}
+
+	imageID := os.Getenv("IMAGE_ID")
+	if imageID == "" {
+		return LaunchConfig{}, errors.New("IMAGE_ID env var not set")
+	}
+
+	return LaunchConfig{
+		ImageID:            imageID,
+		SubnetID:           subnetID,
+		SecurityGroups:     strings.Split(sgIDs, ","),
+		KeyName:            keyName,
+		InstanceProfileArn: instanceProfileArn,
+	}, nil
+}
+
+// handleMaintenance processes scheduled warm pool maintenance events.
+func handleMaintenance() error {
+	slog.Info("warm pool maintenance triggered")
+
+	poolConfig := parseWarmPoolConfig()
+	if poolConfig == nil || len(poolConfig) == 0 {
+		slog.Info("warm pool not configured, skipping maintenance")
+		return nil
+	}
+
+	ctx := context.TODO()
+	cfg, err := config.LoadDefaultConfig(ctx, config.WithRegion("us-east-2"))
+	if err != nil {
+		return fmt.Errorf("failed to load AWS config: %w", err)
+	}
+
+	svc := ec2.NewFromConfig(cfg)
+
+	launchConfig, err := getLaunchConfig()
+	if err != nil {
+		return fmt.Errorf("failed to get launch config: %w", err)
+	}
+
+	// Check and replenish each configured instance type
+	for instanceTypeStr, targetSize := range poolConfig {
+		if targetSize <= 0 {
+			continue
+		}
+
+		instanceType := types.InstanceType(instanceTypeStr)
+		currentCount, err := countWarmPoolInstances(ctx, svc, instanceType)
+		if err != nil {
+			slog.Warn("failed to count warm pool", "instanceType", instanceType, "error", err.Error())
+			continue
+		}
+
+		slog.Info("checking warm pool", "instanceType", instanceType, "current", currentCount, "target", targetSize)
+
+		// Launch instances to reach target size
+		for currentCount < targetSize {
+			newID, err := launchWarmPoolInstance(ctx, svc, instanceType, launchConfig)
+			if err != nil {
+				slog.Error("failed to launch warm pool instance", "instanceType", instanceType, "error", err.Error())
+				break
+			}
+			slog.Info("launched warm pool instance", "instanceType", instanceType, "instanceID", *newID)
+			currentCount++
+		}
+	}
+
+	slog.Info("warm pool maintenance complete")
+	return nil
+}
+
+// MaintenanceEvent is the minimal shape handler decodes to spot scheduled invocations; a Source of "warmPoolMaintenance" routes the event to handleMaintenance.
+type MaintenanceEvent struct {
+	Source string `json:"source"`
+}
+
+func handler(ctx context.Context, rawEvent json.RawMessage) (interface{}, error) {
+	// Try to detect if this is a maintenance event
+	var maintenanceEvent MaintenanceEvent
+	if err := json.Unmarshal(rawEvent, &maintenanceEvent); err == nil {
+		if maintenanceEvent.Source == "warmPoolMaintenance" {
+			if err := handleMaintenance(); err != nil {
+				slog.Error("maintenance failed", "error", err.Error())
+				return nil, err
+			}
+			return map[string]string{"status": "ok"}, nil
+		}
+	}
+
+	// Otherwise, treat as API Gateway event
+	var request events.APIGatewayProxyRequest
+	if err := json.Unmarshal(rawEvent, &request); err != nil {
+		slog.Error("failed to parse API Gateway event", "error", err.Error())
+		return events.APIGatewayProxyResponse{StatusCode: http.StatusBadRequest}, err
+	}
+
+	return handleWebhook(request)
+}
+
+func handleWebhook(request events.APIGatewayProxyRequest) (events.APIGatewayProxyResponse, error) {
 	var githubEventHeader string
 
 	for k, v := range request.MultiValueHeaders {
diff --git a/template.yaml b/template.yaml
index e90e6e6..787f298 100644
--- a/template.yaml
+++ b/template.yaml
@@ -83,6 +83,13 @@ Resources:
               - method.request.header.X-GitHub-Event:
                   Required: true
                   Caching: false
+        WarmPoolMaintenance:
+          Type: Schedule
+          Properties:
+            Schedule: rate(5 minutes)
+            Description: Maintain warm pool of EC2 instances
+            Enabled: true
+            Input: '{"source": "warmPoolMaintenance"}'
       Environment: # More info about Env Vars: https://github.com/awslabs/serverless-application-model/blob/master/versions/2016-10-31.md#environment-object
         Variables:
           GITHUB_PAT_SECRET_NAME: !Ref GitHubPATSecretName

From 0950342c616abfb505077229e3d62cb57a9a079f Mon Sep 17 00:00:00 2001
From: Henry Skiba <henry.skiba@frgrisk.com>
Date: Sun, 11 Jan 2026 09:43:08 +0800
Subject: [PATCH 5/9] Fix user-data to handle AMIs without pre-extracted runner

Fall back to extracting from /opt/runner-cache/ if /opt/actions-runner/run.sh
is not present. This supports both old AMIs (with runner cache) and new
AMIs (with pre-extracted runner).
---
 user-data.sh | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/user-data.sh b/user-data.sh
index d1b0475..7d02f75 100644
--- a/user-data.sh
+++ b/user-data.sh
@@ -57,16 +57,29 @@ sed -i 's/ap-southeast-3/us-east-2/g' /etc/apt/sources.list
 # Add ubuntu user to docker group
 usermod -aG docker ubuntu
 
-# Use pre-extracted runner directory
-cd /opt/actions-runner
+# Use pre-extracted runner if available, otherwise extract from cache
+if [ -d "/opt/actions-runner" ] && [ -f "/opt/actions-runner/run.sh" ]; then
+    log_to_cloudwatch "INFO" "Using pre-extracted GitHub runner"
+    cd /opt/actions-runner
+else
+    log_to_cloudwatch "INFO" "Pre-extracted runner not found, extracting from cache"
 
-# Verify runner exists
-if [ ! -f "./run.sh" ]; then
-    log_to_cloudwatch "ERROR" "Runner not found at /opt/actions-runner"
-    shutdown now
-    exit 1
+    # Find runner archive in cache
+    RUNNER_ARCHIVE=$(ls /opt/runner-cache/actions-runner-linux-*.tar.gz 2>/dev/null | head -1)
+
+    if [ -z "$RUNNER_ARCHIVE" ]; then
+        log_to_cloudwatch "ERROR" "No runner archive found in /opt/runner-cache"
+        shutdown now
+        exit 1
+    fi
+
+    # Create directory and extract; a failed extraction is fatal, like the other error paths in this script
+    mkdir -p /opt/actions-runner
+    cd /opt/actions-runner
+    tar xzf "$RUNNER_ARCHIVE" || { log_to_cloudwatch "ERROR" "Failed to extract runner archive"; shutdown now; exit 1; }
+    chown -R ubuntu:ubuntu /opt/actions-runner
+    log_to_cloudwatch "INFO" "Extracted runner from $RUNNER_ARCHIVE"
 fi
-log_to_cloudwatch "INFO" "Using pre-extracted GitHub runner"
 
 # Get instance type (we already have instance ID from earlier)
 INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-type)

From f6dd72540083212336d9ea47a9343ce6371e27f8 Mon Sep 17 00:00:00 2001
From: Henry Skiba <henry.skiba@frgrisk.com>
Date: Sun, 11 Jan 2026 09:58:06 +0800
Subject: [PATCH 6/9] Simplify main.go: consolidate getLaunchConfig usage

- Return empty map instead of nil from parseWarmPoolConfig()
- Remove redundant nil check in handleMaintenance()
- Consolidate duplicate launch config building in handleWebhook()
  by reusing getLaunchConfig() (~40 lines removed)
---
 main.go | 53 +++++++----------------------------------------------
 1 file changed, 7 insertions(+), 46 deletions(-)

diff --git a/main.go b/main.go
index 1e44baf..00018ae 100644
--- a/main.go
+++ b/main.go
@@ -44,13 +44,13 @@ type LaunchConfig struct {
 func parseWarmPoolConfig() map[string]int {
 	configStr := os.Getenv("WARM_POOL_CONFIG")
 	if configStr == "" || configStr == "{}" {
-		return nil
+		return map[string]int{}
 	}
 
 	var poolConfig map[string]int
 	if err := json.Unmarshal([]byte(configStr), &poolConfig); err != nil {
 		slog.Error("failed to parse WARM_POOL_CONFIG", "error", err.Error(), "config", configStr)
-		return nil
+		return map[string]int{}
 	}
 
 	return poolConfig
@@ -366,7 +366,7 @@ func handleMaintenance() error {
 	slog.Info("warm pool maintenance triggered")
 
 	poolConfig := parseWarmPoolConfig()
-	if poolConfig == nil || len(poolConfig) == 0 {
+	if len(poolConfig) == 0 {
 		slog.Info("warm pool not configured, skipping maintenance")
 		return nil
 	}
@@ -508,49 +508,10 @@ func handleWebhook(request events.APIGatewayProxyRequest) (events.APIGatewayProx
 			extraLabels = "," + extraLabels
 		}
 
-		subnetID := os.Getenv("SUBNET_ID")
-		if subnetID == "" {
-			slog.Error("SUBNET_ID env var not set")
-
-			return events.APIGatewayProxyResponse{StatusCode: http.StatusInternalServerError}, errors.New("subnet id missing")
-		}
-
-		sgIDs := os.Getenv("SECURITY_GROUP_IDS")
-		if sgIDs == "" {
-			slog.Error("SECURITY_GROUP_IDS env var not set")
-
-			return events.APIGatewayProxyResponse{StatusCode: http.StatusInternalServerError}, errors.New("security groups missing")
-		}
-
-		securityGroups := strings.Split(sgIDs, ",")
-
-		keyName := os.Getenv("KEY_NAME")
-		if keyName == "" {
-			slog.Error("KEY_NAME env var not set")
-
-			return events.APIGatewayProxyResponse{StatusCode: http.StatusInternalServerError}, errors.New("key name missing")
-		}
-
-		instanceProfileArn := os.Getenv("INSTANCE_PROFILE_ARN")
-		if instanceProfileArn == "" {
-			slog.Error("INSTANCE_PROFILE_ARN env var not set")
-
-			return events.APIGatewayProxyResponse{StatusCode: http.StatusInternalServerError}, errors.New("instance profile arn missing")
-		}
-
-		imageID := os.Getenv("IMAGE_ID")
-		if imageID == "" {
-			slog.Error("IMAGE_ID env var not set")
-
-			return events.APIGatewayProxyResponse{StatusCode: http.StatusInternalServerError}, errors.New("image id missing")
-		}
-
-		launchConfig := LaunchConfig{
-			ImageID:            imageID,
-			SubnetID:           subnetID,
-			SecurityGroups:     securityGroups,
-			KeyName:            keyName,
-			InstanceProfileArn: instanceProfileArn,
+		launchConfig, err := getLaunchConfig()
+		if err != nil {
+			slog.Error("failed to get launch config", "error", err.Error())
+			return events.APIGatewayProxyResponse{StatusCode: http.StatusInternalServerError}, err
 		}
 
 		ephemeral := slices.Contains(event.GetWorkflowJob().Labels, "ephemeral")

From 4e9fc7054407694ca8aa3480930ac690a419fd75 Mon Sep 17 00:00:00 2001
From: Henry Skiba <henry.skiba@frgrisk.com>
Date: Sun, 11 Jan 2026 10:35:37 +0800
Subject: [PATCH 7/9] Use JIT runner config to skip config.sh registration

Generate JIT config from Lambda via GitHub API instead of passing PAT
to the instance. This eliminates the 15-30 second config.sh registration
step on the runner.

Changes:
- main.go: Add generateJITConfig() to call GitHub's JIT config API
- main.go: Build labels list and pass JIT config to user-data template
- user-data.sh: Remove get_github_token() and config.sh steps
- user-data.sh: Use ./run.sh --jitconfig instead
---
 main.go      | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 user-data.sh | 76 +++++------------------------------------
 2 files changed, 101 insertions(+), 71 deletions(-)

diff --git a/main.go b/main.go
index 00018ae..9de4bff 100644
--- a/main.go
+++ b/main.go
@@ -8,6 +8,7 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
+	"io"
 	"log/slog"
 	"net/http"
 	"os"
@@ -361,6 +362,73 @@ func getLaunchConfig() (LaunchConfig, error) {
 	}, nil
 }
 
+// JITConfigRequest represents the request body for generating a JIT runner config.
+type JITConfigRequest struct {
+	Name          string   `json:"name"`
+	RunnerGroupID int      `json:"runner_group_id"`
+	Labels        []string `json:"labels"`
+	WorkFolder    string   `json:"work_folder"`
+}
+
+// JITConfigResponse represents the response from the JIT config API.
+type JITConfigResponse struct {
+	Runner struct {
+		ID   int    `json:"id"`
+		Name string `json:"name"`
+	} `json:"runner"`
+	EncodedJITConfig string `json:"encoded_jit_config"`
+}
+
+// generateJITConfig calls the GitHub API to generate a JIT runner configuration for org, so the runner can skip config.sh (saving 15-30 seconds).
+// It returns an error on transport failure, an undecodable body, or any status other than 201 Created (the error includes the response body).
+func generateJITConfig(pat, org, runnerName string, labels []string) (*JITConfigResponse, error) {
+	reqBody := JITConfigRequest{
+		Name:          runnerName,
+		RunnerGroupID: 1, // Default runner group
+		Labels:        labels,
+		WorkFolder:    "_work",
+	}
+
+	jsonBody, err := json.Marshal(reqBody)
+	if err != nil {
+		return nil, fmt.Errorf("failed to marshal JIT config request: %w", err)
+	}
+
+	url := fmt.Sprintf("https://api.github.com/orgs/%s/actions/runners/generate-jitconfig", org)
+	req, err := http.NewRequest("POST", url, bytes.NewBuffer(jsonBody))
+	if err != nil {
+		return nil, fmt.Errorf("failed to create request: %w", err)
+	}
+
+	req.Header.Set("Accept", "application/vnd.github+json")
+	req.Header.Set("Authorization", "Bearer "+pat)
+	req.Header.Set("X-GitHub-Api-Version", "2022-11-28")
+	req.Header.Set("Content-Type", "application/json")
+
+	client := &http.Client{Timeout: 30 * time.Second}
+	resp, err := client.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("failed to call JIT config API: %w", err)
+	}
+	defer resp.Body.Close()
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read response body: %w", err)
+	}
+
+	if resp.StatusCode != http.StatusCreated {
+		return nil, fmt.Errorf("JIT config API returned status %d: %s", resp.StatusCode, string(body))
+	}
+
+	var jitResp JITConfigResponse
+	if err := json.Unmarshal(body, &jitResp); err != nil {
+		return nil, fmt.Errorf("failed to parse JIT config response: %w", err)
+	}
+
+	return &jitResp, nil
+}
+
 // handleMaintenance processes scheduled warm pool maintenance events.
 func handleMaintenance() error {
 	slog.Info("warm pool maintenance triggered")
@@ -534,7 +602,30 @@ func handleWebhook(request events.APIGatewayProxyRequest) (events.APIGatewayProx
 			}
 		}
 
-		slog.Info("processing job", "instanceType", instanceType, "jobID", event.GetWorkflowJob().GetID())
+		jobEventID := event.GetWorkflowJob().GetID()
+		runnerName := fmt.Sprintf("ephemeral-i-%d", jobEventID)
+
+		slog.Info("processing job", "instanceType", instanceType, "jobID", jobEventID, "runnerName", runnerName)
+
+		// Build labels for the runner
+		labels := []string{string(instanceType), "ephemeral", "X64"}
+		if extraLabels != "" {
+			// extraLabels already has leading comma, split and add non-empty labels
+			for _, label := range strings.Split(extraLabels, ",") {
+				if label = strings.TrimSpace(label); label != "" {
+					labels = append(labels, label)
+				}
+			}
+		}
+
+		// Generate JIT config from GitHub API (eliminates need for config.sh on instance)
+		jitConfig, err := generateJITConfig(pat, "frgrisk", runnerName, labels)
+		if err != nil {
+			slog.Error("failed to generate JIT config", "error", err.Error())
+			return events.APIGatewayProxyResponse{StatusCode: http.StatusInternalServerError}, err
+		}
+
+		slog.Info("generated JIT config", "runnerID", jitConfig.Runner.ID, "runnerName", jitConfig.Runner.Name)
 
 		tpl, err := template.New("userdata").Parse(userData)
 		if err != nil {
@@ -542,12 +633,11 @@ func handleWebhook(request events.APIGatewayProxyRequest) (events.APIGatewayProx
 		}
 
 		var buf bytes.Buffer
-		if err := tpl.Execute(&buf, map[string]string{"GitHubPAT": pat, "ExtraLabels": extraLabels}); err != nil {
+		if err := tpl.Execute(&buf, map[string]string{"JITConfig": jitConfig.EncodedJITConfig}); err != nil {
 			return events.APIGatewayProxyResponse{StatusCode: http.StatusInternalServerError}, err
 		}
 
 		finalUserData := buf.String()
-		jobEventID := event.GetWorkflowJob().GetID()
 
 		// Get warm pool target size for this instance type
 		poolConfig := parseWarmPoolConfig()
diff --git a/user-data.sh b/user-data.sh
index 7d02f75..72ab94a 100644
--- a/user-data.sh
+++ b/user-data.sh
@@ -86,88 +86,28 @@ INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169
 
 log_to_cloudwatch "INFO" "Instance: ${INSTANCE_ID}, Type: ${INSTANCE_TYPE}"
 
-# Function to get GitHub registration token with retry
-get_github_token() {
-    local max_attempts=5
-    local attempt=1
-    local delay=5
-    
-    while [ $attempt -le $max_attempts ]; do
-        log_to_cloudwatch "INFO" "Attempting to get GitHub registration token (attempt ${attempt}/${max_attempts})"
-        
-        GITHUB_TOKEN=$(curl -s -L \
-            -X POST \
-            -H "Accept: application/vnd.github+json" \
-            -H "Authorization: Bearer {{.GitHubPAT}}" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            https://api.github.com/orgs/frgrisk/actions/runners/registration-token | jq -r .token)
-        
-        if [ -n "$GITHUB_TOKEN" ] && [ "$GITHUB_TOKEN" != "null" ]; then
-            log_to_cloudwatch "INFO" "Successfully obtained GitHub registration token"
-            return 0
-        fi
-        
-        log_to_cloudwatch "WARN" "Failed to get GitHub token, retrying in ${delay} seconds..."
-        sleep $delay
-        delay=$((delay * 2))
-        attempt=$((attempt + 1))
-    done
-    
-    log_to_cloudwatch "ERROR" "Failed to get GitHub registration token after ${max_attempts} attempts"
-    return 1
-}
+# JIT config is rendered into this script by Lambda - no GitHub API call or config.sh needed. Only test for empty below: a literal placeholder in the comparison would be substituted by the template too and always match.
+JIT_CONFIG="{{.JITConfig}}"
 
-# Get GitHub registration token
-if ! get_github_token; then
-    log_to_cloudwatch "ERROR" "Unable to proceed without registration token"
+if [ -z "$JIT_CONFIG" ]; then
+    log_to_cloudwatch "ERROR" "JIT config not provided"
     shutdown now
     exit 1
 fi
 
-# Configure runner with retry
-log_to_cloudwatch "INFO" "Configuring GitHub runner"
-max_config_attempts=3
-config_attempt=1
-
-while [ $config_attempt -le $max_config_attempts ]; do
-    if sudo -u ubuntu ./config.sh \
-        --url https://github.com/frgrisk \
-        --token "$GITHUB_TOKEN" \
-        --disableupdate \
-        --ephemeral \
-        --labels "${INSTANCE_TYPE},ephemeral,X64{{.ExtraLabels}}" \
-        --unattended \
-        --name "ephemeral-${INSTANCE_ID}" \
-        --work _work; then
-        
-        log_to_cloudwatch "INFO" "Runner configured successfully"
-        break
-    else
-        log_to_cloudwatch "WARN" "Runner configuration failed (attempt ${config_attempt}/${max_config_attempts})"
-        config_attempt=$((config_attempt + 1))
-        if [ $config_attempt -le $max_config_attempts ]; then
-            sleep 10
-        fi
-    fi
-done
-
-if [ $config_attempt -gt $max_config_attempts ]; then
-    log_to_cloudwatch "ERROR" "Failed to configure runner after ${max_config_attempts} attempts"
-    shutdown now
-    exit 1
-fi
+log_to_cloudwatch "INFO" "JIT config received, skipping config.sh"
 
 END_TIME=$(date +%s)
 EXECUTION_TIME=$((END_TIME - START_TIME))
 log_to_cloudwatch "INFO" "Setup completed in ${EXECUTION_TIME} seconds"
 
-# Start the runner and wait for it to complete
-log_to_cloudwatch "INFO" "Starting GitHub runner"
+# Start the runner with JIT config (skips registration entirely)
+log_to_cloudwatch "INFO" "Starting GitHub runner with JIT config"
 
 # Create a temporary file to capture runner output
 RUNNER_LOG=$(mktemp /tmp/runner-output.XXXXXX)
 
-if sudo -u ubuntu ./run.sh 2>&1 | tee "${RUNNER_LOG}"; then
+if sudo -u ubuntu ./run.sh --jitconfig "$JIT_CONFIG" 2>&1 | tee "${RUNNER_LOG}"; then
     log_to_cloudwatch "INFO" "Runner completed successfully"
 else
     EXIT_CODE=$?

From 020b273954b67134806f38b556e064d39bbaf141 Mon Sep 17 00:00:00 2001
From: Henry Skiba <henry.skiba@frgrisk.com>
Date: Sun, 11 Jan 2026 11:00:52 +0800
Subject: [PATCH 8/9] Fix double base64 encoding of user-data for warm pool
 instances

ModifyInstanceAttribute with BlobAttributeValue automatically handles
base64 encoding, so we shouldn't pre-encode. This was causing user-data
to exceed the 16KB limit.
---
 main.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/main.go b/main.go
index 9de4bff..28cea62 100644
--- a/main.go
+++ b/main.go
@@ -115,11 +115,11 @@ func startWarmInstance(ctx context.Context, svc *ec2.Client, instanceID string,
 
 	// Update user data with the full setup script wrapped in multipart format
 	// so it runs on every boot (including this activation)
+	// Note: BlobAttributeValue handles base64 encoding automatically, don't pre-encode
 	wrappedUserData := wrapInMultipart(finalUserData)
-	encodedUserData := base64.StdEncoding.EncodeToString([]byte(wrappedUserData))
 	_, err = svc.ModifyInstanceAttribute(ctx, &ec2.ModifyInstanceAttributeInput{
 		InstanceId: aws.String(instanceID),
-		UserData:   &types.BlobAttributeValue{Value: []byte(encodedUserData)},
+		UserData:   &types.BlobAttributeValue{Value: []byte(wrappedUserData)},
 	})
 	if err != nil {
 		return fmt.Errorf("failed to update user data: %w", err)

From f60ca4107d16ee141330d7d5a818e8978dc5afc1 Mon Sep 17 00:00:00 2001
From: Henry Skiba <henry.skiba@frgrisk.com>
Date: Sun, 11 Jan 2026 11:19:44 +0800
Subject: [PATCH 9/9] Use SSM Parameter Store for JIT config to avoid
 cloud-init caching

Cloud-init caches user-data from first boot, so when we update user-data
on warm pool activation, the cached script runs instead of the new one.

Fix by storing JIT config in SSM Parameter Store (/github-runner/jit-config/{instance-id})
and having the user-data script fetch it from there. This works because:
1. The script itself doesn't change (no templating needed)
2. The SSM parameter is created fresh for each job

Changes:
- main.go: Add storeJITConfigInSSM(), remove template usage
- user-data.sh: Fetch JIT config from SSM using instance ID
- template.yaml: Add SSM permissions for Lambda and EC2 instance
---
 go.mod        |  7 ++++---
 go.sum        |  8 ++++++++
 main.go       | 44 +++++++++++++++++++++++++++++---------------
 template.yaml | 15 +++++++++++++++
 user-data.sh  | 18 +++++++++++++-----
 5 files changed, 69 insertions(+), 23 deletions(-)

diff --git a/go.mod b/go.mod
index 3c57f5d..88a0100 100644
--- a/go.mod
+++ b/go.mod
@@ -4,7 +4,7 @@ go 1.23
 
 require (
 	github.com/aws/aws-lambda-go v1.51.1
-	github.com/aws/aws-sdk-go-v2 v1.41.0
+	github.com/aws/aws-sdk-go-v2 v1.41.1
 	github.com/aws/aws-sdk-go-v2/config v1.32.6
 	github.com/aws/aws-sdk-go-v2/service/ec2 v1.279.0
 	github.com/aws/aws-sdk-go-v2/service/secretsmanager v1.41.0
@@ -14,12 +14,13 @@ require (
 require (
 	github.com/aws/aws-sdk-go-v2/credentials v1.19.6 // indirect
 	github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.16 // indirect
-	github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.16 // indirect
-	github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.16 // indirect
+	github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.17 // indirect
+	github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.17 // indirect
 	github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 // indirect
 	github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4 // indirect
 	github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.16 // indirect
 	github.com/aws/aws-sdk-go-v2/service/signin v1.0.4 // indirect
+	github.com/aws/aws-sdk-go-v2/service/ssm v1.67.8 // indirect
 	github.com/aws/aws-sdk-go-v2/service/sso v1.30.8 // indirect
 	github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.12 // indirect
 	github.com/aws/aws-sdk-go-v2/service/sts v1.41.5 // indirect
diff --git a/go.sum b/go.sum
index 44e2647..68253a7 100644
--- a/go.sum
+++ b/go.sum
@@ -2,6 +2,8 @@ github.com/aws/aws-lambda-go v1.51.1 h1:FpqpCK2WOSoq6hJvO9PhN44GzZHWCN3e9DUQgK0B
 github.com/aws/aws-lambda-go v1.51.1/go.mod h1:dpMpZgvWx5vuQJfBt0zqBha60q7Dd7RfgJv23DymV8A=
 github.com/aws/aws-sdk-go-v2 v1.41.0 h1:tNvqh1s+v0vFYdA1xq0aOJH+Y5cRyZ5upu6roPgPKd4=
 github.com/aws/aws-sdk-go-v2 v1.41.0/go.mod h1:MayyLB8y+buD9hZqkCW3kX1AKq07Y5pXxtgB+rRFhz0=
+github.com/aws/aws-sdk-go-v2 v1.41.1 h1:ABlyEARCDLN034NhxlRUSZr4l71mh+T5KAeGh6cerhU=
+github.com/aws/aws-sdk-go-v2 v1.41.1/go.mod h1:MayyLB8y+buD9hZqkCW3kX1AKq07Y5pXxtgB+rRFhz0=
 github.com/aws/aws-sdk-go-v2/config v1.32.6 h1:hFLBGUKjmLAekvi1evLi5hVvFQtSo3GYwi+Bx4lpJf8=
 github.com/aws/aws-sdk-go-v2/config v1.32.6/go.mod h1:lcUL/gcd8WyjCrMnxez5OXkO3/rwcNmvfno62tnXNcI=
 github.com/aws/aws-sdk-go-v2/credentials v1.19.6 h1:F9vWao2TwjV2MyiyVS+duza0NIRtAslgLUM0vTA1ZaE=
@@ -10,8 +12,12 @@ github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.16 h1:80+uETIWS1BqjnN9uJ0dBU
 github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.16/go.mod h1:wOOsYuxYuB/7FlnVtzeBYRcjSRtQpAW0hCP7tIULMwo=
 github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.16 h1:rgGwPzb82iBYSvHMHXc8h9mRoOUBZIGFgKb9qniaZZc=
 github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.16/go.mod h1:L/UxsGeKpGoIj6DxfhOWHWQ/kGKcd4I1VncE4++IyKA=
+github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.17 h1:xOLELNKGp2vsiteLsvLPwxC+mYmO6OZ8PYgiuPJzF8U=
+github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.17/go.mod h1:5M5CI3D12dNOtH3/mk6minaRwI2/37ifCURZISxA/IQ=
 github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.16 h1:1jtGzuV7c82xnqOVfx2F0xmJcOw5374L7N6juGW6x6U=
 github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.16/go.mod h1:M2E5OQf+XLe+SZGmmpaI2yy+J326aFf6/+54PoxSANc=
+github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.17 h1:WWLqlh79iO48yLkj1v3ISRNiv+3KdQoZ6JWyfcsyQik=
+github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.17/go.mod h1:EhG22vHRrvF8oXSTYStZhJc1aUgKtnJe+aOiFEV90cM=
 github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 h1:WKuaxf++XKWlHWu9ECbMlha8WOEGm0OUEZqm4K/Gcfk=
 github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4/go.mod h1:ZWy7j6v1vWGmPReu0iSGvRiise4YI5SkR3OHKTZ6Wuc=
 github.com/aws/aws-sdk-go-v2/service/ec2 v1.279.0 h1:o7eJKe6VYAnqERPlLAvDW5VKXV6eTKv1oxTpMoDP378=
@@ -24,6 +30,8 @@ github.com/aws/aws-sdk-go-v2/service/secretsmanager v1.41.0 h1:vL6rQXcGtFv9q/9eR
 github.com/aws/aws-sdk-go-v2/service/secretsmanager v1.41.0/go.mod h1:QwEDLD+7EukuEUnbWtiNE8LhgvvmhjZoi4XAppYPtyc=
 github.com/aws/aws-sdk-go-v2/service/signin v1.0.4 h1:HpI7aMmJ+mm1wkSHIA2t5EaFFv5EFYXePW30p1EIrbQ=
 github.com/aws/aws-sdk-go-v2/service/signin v1.0.4/go.mod h1:C5RdGMYGlfM0gYq/tifqgn4EbyX99V15P2V3R+VHbQU=
+github.com/aws/aws-sdk-go-v2/service/ssm v1.67.8 h1:31Llf5VfrZ78YvYs7sWcS7L2m3waikzRc6q1nYenVS4=
+github.com/aws/aws-sdk-go-v2/service/ssm v1.67.8/go.mod h1:/jgaDlU1UImoxTxhRNxXHvBAPqPZQ8oCjcPbbkR6kac=
 github.com/aws/aws-sdk-go-v2/service/sso v1.30.8 h1:aM/Q24rIlS3bRAhTyFurowU8A0SMyGDtEOY/l/s/1Uw=
 github.com/aws/aws-sdk-go-v2/service/sso v1.30.8/go.mod h1:+fWt2UHSb4kS7Pu8y+BMBvJF0EWx+4H0hzNwtDNRTrg=
 github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.12 h1:AHDr0DaHIAo8c9t1emrzAlVDFp+iMMKnPdYy6XO4MCE=
diff --git a/main.go b/main.go
index 28cea62..27b696f 100644
--- a/main.go
+++ b/main.go
@@ -15,7 +15,6 @@ import (
 	"slices"
 	"strconv"
 	"strings"
-	"text/template"
 	"time"
 
 	"github.com/aws/aws-lambda-go/events"
@@ -25,6 +24,8 @@ import (
 	"github.com/aws/aws-sdk-go-v2/service/ec2"
 	"github.com/aws/aws-sdk-go-v2/service/ec2/types"
 	"github.com/aws/aws-sdk-go-v2/service/secretsmanager"
+	"github.com/aws/aws-sdk-go-v2/service/ssm"
+	ssmtypes "github.com/aws/aws-sdk-go-v2/service/ssm/types"
 	"github.com/google/go-github/v60/github"
 )
 
@@ -429,6 +430,23 @@ func generateJITConfig(pat, org, runnerName string, labels []string) (*JITConfig
 	return &jitResp, nil
 }
 
+// storeJITConfigInSSM saves jitConfig as a SecureString SSM parameter at /github-runner/jit-config/<instanceID>, overwriting any existing value, and returns the wrapped PutParameter error on failure.
+func storeJITConfigInSSM(ctx context.Context, ssmClient *ssm.Client, instanceID, jitConfig string) error {
+	paramName := fmt.Sprintf("/github-runner/jit-config/%s", instanceID)
+
+	_, err := ssmClient.PutParameter(ctx, &ssm.PutParameterInput{
+		Name:      aws.String(paramName),
+		Value:     aws.String(jitConfig),
+		Type:      ssmtypes.ParameterTypeSecureString,
+		Overwrite: aws.Bool(true),
+	})
+	if err != nil {
+		return fmt.Errorf("failed to store JIT config in SSM: %w", err)
+	}
+
+	return nil
+}
+
 // handleMaintenance processes scheduled warm pool maintenance events.
 func handleMaintenance() error {
 	slog.Info("warm pool maintenance triggered")
@@ -554,6 +572,7 @@ func handleWebhook(request events.APIGatewayProxyRequest) (events.APIGatewayProx
 
 		svc := ec2.NewFromConfig(cfg)
 		sm := secretsmanager.NewFromConfig(cfg)
+		ssmClient := ssm.NewFromConfig(cfg)
 
 		secretName := os.Getenv("GITHUB_PAT_SECRET_NAME")
 		if secretName == "" {
@@ -627,18 +646,6 @@ func handleWebhook(request events.APIGatewayProxyRequest) (events.APIGatewayProx
 
 		slog.Info("generated JIT config", "runnerID", jitConfig.Runner.ID, "runnerName", jitConfig.Runner.Name)
 
-		tpl, err := template.New("userdata").Parse(userData)
-		if err != nil {
-			return events.APIGatewayProxyResponse{StatusCode: http.StatusInternalServerError}, err
-		}
-
-		var buf bytes.Buffer
-		if err := tpl.Execute(&buf, map[string]string{"JITConfig": jitConfig.EncodedJITConfig}); err != nil {
-			return events.APIGatewayProxyResponse{StatusCode: http.StatusInternalServerError}, err
-		}
-
-		finalUserData := buf.String()
-
 		// Get warm pool target size for this instance type
 		poolConfig := parseWarmPoolConfig()
 		targetSize := poolConfig[string(instanceType)]
@@ -647,13 +654,13 @@ func handleWebhook(request events.APIGatewayProxyRequest) (events.APIGatewayProx
 		var instanceID *string
 		if targetSize > 0 {
 			slog.Info("checking warm pool", "instanceType", instanceType, "targetSize", targetSize)
-			instanceID = tryAcquireWarmInstance(ctx, svc, instanceType, jobEventID, finalUserData)
+			instanceID = tryAcquireWarmInstance(ctx, svc, instanceType, jobEventID, userData)
 		}
 
 		// Launch fresh instance if warm pool not available or not configured
 		if instanceID == nil {
 			slog.Info("launching fresh instance", "instanceType", instanceType)
-			instanceID, err = launchFreshInstance(ctx, svc, instanceType, launchConfig, finalUserData, jobEventID)
+			instanceID, err = launchFreshInstance(ctx, svc, instanceType, launchConfig, userData, jobEventID)
 			if err != nil {
 				slog.Error("failed to launch fresh instance", "error", err.Error())
 				return events.APIGatewayProxyResponse{
@@ -664,6 +671,13 @@ func handleWebhook(request events.APIGatewayProxyRequest) (events.APIGatewayProx
 			slog.Info("instance launched", "instanceID", *instanceID)
 		}
 
+		// Store JIT config in SSM for the instance to retrieve
+		if err := storeJITConfigInSSM(ctx, ssmClient, *instanceID, jitConfig.EncodedJITConfig); err != nil {
+			slog.Error("failed to store JIT config in SSM", "error", err.Error())
+			return events.APIGatewayProxyResponse{StatusCode: http.StatusInternalServerError}, err
+		}
+		slog.Info("stored JIT config in SSM", "instanceID", *instanceID)
+
 		// Replenish warm pool if needed
 		if targetSize > 0 {
 			replenishWarmPool(ctx, svc, instanceType, launchConfig, targetSize)
diff --git a/template.yaml b/template.yaml
index 787f298..2638591 100644
--- a/template.yaml
+++ b/template.yaml
@@ -60,6 +60,15 @@ Resources:
                   - logs:PutLogEvents
                   - logs:DescribeLogStreams
                 Resource: !Sub 'arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/ec2/github-runner:*'
+        - PolicyName: SSMJITConfigPolicy
+          PolicyDocument:
+            Version: '2012-10-17'
+            Statement:
+              - Effect: Allow
+                Action:
+                  - ssm:GetParameter
+                  - ssm:DeleteParameter
+                Resource: !Sub 'arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:parameter/github-runner/jit-config/*'
 
   RunnerInstanceProfile:
     Type: AWS::IAM::InstanceProfile
@@ -123,6 +132,12 @@ Resources:
               Action:
                 - secretsmanager:GetSecretValue
               Resource: !Sub arn:${AWS::Partition}:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:${GitHubPATSecretName}*
+            - Sid: SSMJITConfig
+              Effect: Allow
+              Action:
+                - ssm:PutParameter
+                - ssm:DeleteParameter
+              Resource: !Sub arn:${AWS::Partition}:ssm:${AWS::Region}:${AWS::AccountId}:parameter/github-runner/jit-config/*
     Metadata:
       BuildMethod: makefile
 
diff --git a/user-data.sh b/user-data.sh
index 72ab94a..aec56d5 100644
--- a/user-data.sh
+++ b/user-data.sh
@@ -86,16 +86,24 @@ INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169
 
 log_to_cloudwatch "INFO" "Instance: ${INSTANCE_ID}, Type: ${INSTANCE_TYPE}"
 
-# JIT config is passed from Lambda - no need to call GitHub API or run config.sh
-JIT_CONFIG="{{.JITConfig}}"
+# JIT config is stored in SSM Parameter Store by Lambda (avoids cloud-init caching issues)
+# Parameter name is /github-runner/jit-config/{instance-id}
+SSM_PARAM_NAME="/github-runner/jit-config/${INSTANCE_ID}"
 
-if [ -z "$JIT_CONFIG" ] || [ "$JIT_CONFIG" = "{{.JITConfig}}" ]; then
-    log_to_cloudwatch "ERROR" "JIT config not provided"
+log_to_cloudwatch "INFO" "Fetching JIT config from SSM: ${SSM_PARAM_NAME}"
+
+JIT_CONFIG=$(aws ssm get-parameter --name "${SSM_PARAM_NAME}" --with-decryption --query 'Parameter.Value' --output text 2>/dev/null || echo "")
+
+if [ -z "$JIT_CONFIG" ]; then
+    log_to_cloudwatch "ERROR" "JIT config not found in SSM"
     shutdown now
     exit 1
 fi
 
-log_to_cloudwatch "INFO" "JIT config received, skipping config.sh"
+# Delete the parameter after reading (one-time use)
+aws ssm delete-parameter --name "${SSM_PARAM_NAME}" 2>/dev/null || true
+
+log_to_cloudwatch "INFO" "JIT config retrieved from SSM, skipping config.sh"
 
 END_TIME=$(date +%s)
 EXECUTION_TIME=$((END_TIME - START_TIME))