From a80307a7b0ea2ed95c4aeb5c102c547e2bc82d2d Mon Sep 17 00:00:00 2001
From: Joel McCoy <joel@defenseunicorns.com>
Date: Wed, 31 Jul 2024 18:55:58 -0500
Subject: [PATCH 1/2] feat: add server readiness configuration options

---
 main.tf                       | 10 +++++++---
 modules/nodepool/main.tf      |  2 ++
 modules/nodepool/variables.tf | 10 ++++++++++
 variables.tf                  | 24 ++++++++++++++++++++++++
 4 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/main.tf b/main.tf
index 3c19b1b..650c1c7 100644
--- a/main.tf
+++ b/main.tf
@@ -207,8 +207,10 @@ module "servers" {
   wait_for_capacity_timeout   = var.wait_for_capacity_timeout
   metadata_options            = var.metadata_options
   associate_public_ip_address = var.associate_public_ip_address
+  health_check_type           = var.server_node_health_check_type
+  health_check_grace_period   = var.server_node_health_check_grace_period
 
-  # Overrideable variables
+  # Overridable variables
   userdata             = data.cloudinit_config.this.rendered
   iam_instance_profile = var.iam_instance_profile == "" ? module.iam[0].iam_instance_profile : var.iam_instance_profile
 
@@ -221,8 +223,10 @@ module "servers" {
     termination_policies = var.termination_policies
   }
 
-  # TODO: Ideally set this to `length(var.servers)`, but currently blocked by: https://github.com/rancher/rke2/issues/349
-  min_elb_capacity = 1
+  # By default, only wait for 1 server to be healthy before moving on,
+  # but if enforce_min_elb_capacity is set, wait for all servers to be healthy.
+  min_elb_capacity                 = var.enforce_min_elb_capacity ? var.servers : 1
+  wait_for_elb_capacity_on_updates = var.wait_for_elb_capacity_on_updates
 
   tags = merge({
     "Role" = "server",
diff --git a/modules/nodepool/main.tf b/modules/nodepool/main.tf
index f47b47f..f0bda54 100644
--- a/modules/nodepool/main.tf
+++ b/modules/nodepool/main.tf
@@ -78,11 +78,13 @@ resource "aws_autoscaling_group" "this" {
 
   # Health check and target groups dependent on whether we're a server or not (identified via rke2_url)
   health_check_type         = var.health_check_type
+  health_check_grace_period = var.health_check_grace_period
   wait_for_capacity_timeout = var.wait_for_capacity_timeout
   target_group_arns         = var.target_group_arns
   load_balancers            = var.load_balancers
 
   min_elb_capacity = var.min_elb_capacity
+  wait_for_elb_capacity = var.wait_for_elb_capacity_on_updates ? var.min_elb_capacity : null
 
   dynamic "launch_template" {
     for_each = var.spot ? [] : ["spot"]
diff --git a/modules/nodepool/variables.tf b/modules/nodepool/variables.tf
index b666207..dd4c91b 100644
--- a/modules/nodepool/variables.tf
+++ b/modules/nodepool/variables.tf
@@ -39,6 +39,16 @@ variable "health_check_type" {
   default = "EC2"
 }
 
+variable "health_check_grace_period" {
+  type    = number
+  default = 300 # seconds; set as health_check_grace_period on the autoscaling group
+}
+
+variable "wait_for_elb_capacity_on_updates" {
+  type    = bool
+  default = false # true sets the ASG's wait_for_elb_capacity to var.min_elb_capacity
+}
+
 variable "wait_for_capacity_timeout" {
   description = "How long Terraform should wait for ASG instances to be healthy before timing out."
   type        = string
diff --git a/variables.tf b/variables.tf
index 135c9c7..ad64ee5 100644
--- a/variables.tf
+++ b/variables.tf
@@ -85,6 +85,30 @@ variable "servers" {
   default     = 3
 }
 
+variable "server_node_health_check_type" {
+  description = "Type of health check to use for server nodes (EC2 or ELB). Setting to ELB will wait until the target groups health checks pass before declaring an EC2 instance healthy.  (i.e. It will check to make sure the kubeapi server is up.)"
+  type        = string
+  default     = "EC2" # passed to the servers module as health_check_type
+}
+
+variable "server_node_health_check_grace_period" {
+  description = "Time (in seconds) after instance comes into service before checking health of the server node."
+  type        = number
+  default     = 300
+}
+
+variable "enforce_min_elb_capacity" {
+  description = "If this is set to true, it requires all server nodes to be healthy, before the ASG will be considered healthy. This only applies for creation of the ASG.  If you want this check for updates, set wait_for_elb_capacity_on_updates to true."
+  type        = bool
+  default     = false # false => min_elb_capacity is 1; true => min_elb_capacity is var.servers
+}
+
+variable "wait_for_elb_capacity_on_updates" {
+  description = "Setting this will cause Terraform to wait for min_elb_capacity healthy instances from this Auto Scaling Group in all attached load balancers on update operations as well."
+  type        = bool
+  default     = false # passed unchanged to the servers module input of the same name
+}
+
 variable "spot" {
   description = "Toggle spot requests for server pool"
   type        = bool

From e2a06332267194220473f5ca29ed05a306996c36 Mon Sep 17 00:00:00 2001
From: Joel McCoy <joel@defenseunicorns.com>
Date: Thu, 1 Aug 2024 09:27:06 -0500
Subject: [PATCH 2/2] docs: update terraform docs

---
 README.md                  | 4 ++++
 modules/nodepool/README.md | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/README.md b/README.md
index 6faf799..87ba91b 100644
--- a/README.md
+++ b/README.md
@@ -216,6 +216,7 @@ Optional policies have the option of being created by default, but are specified
 | <a name="input_download"></a> [download](#input\_download) | Toggle best effort download of rke2 dependencies (rke2 and aws cli), if disabled, dependencies are assumed to exist in $PATH | `bool` | `true` | no |
 | <a name="input_enable_autoscaler"></a> [enable\_autoscaler](#input\_enable\_autoscaler) | Toggle enabling policies required for cluster autoscaler to work | `bool` | `false` | no |
 | <a name="input_enable_ccm"></a> [enable\_ccm](#input\_enable\_ccm) | Toggle enabling the cluster as aws aware, this will ensure the appropriate IAM policies are present | `bool` | `false` | no |
+| <a name="input_enforce_min_elb_capacity"></a> [enforce\_min\_elb\_capacity](#input\_enforce\_min\_elb\_capacity) | If this is set to true, it requires all server nodes to be healthy, before the ASG will be considered healthy. This only applies for creation of the ASG.  If you want this check for updates, set wait\_for\_elb\_capacity\_on\_updates to true. | `bool` | `false` | no |
 | <a name="input_extra_block_device_mappings"></a> [extra\_block\_device\_mappings](#input\_extra\_block\_device\_mappings) | Used to specify additional block device mapping configurations | `list(map(string))` | `[]` | no |
 | <a name="input_extra_cloud_config_config"></a> [extra\_cloud\_config\_config](#input\_extra\_cloud\_config\_config) | extra config to append to cloud-config | `string` | `""` | no |
 | <a name="input_extra_security_group_ids"></a> [extra\_security\_group\_ids](#input\_extra\_security\_group\_ids) | List of additional security group IDs | `list(string)` | `[]` | no |
@@ -231,6 +232,8 @@ Optional policies have the option of being created by default, but are specified
 | <a name="input_rke2_install_script_url"></a> [rke2\_install\_script\_url](#input\_rke2\_install\_script\_url) | URL for RKE2 install script | `string` | `"https://get.rke2.io"` | no |
 | <a name="input_rke2_start"></a> [rke2\_start](#input\_rke2\_start) | Start/Stop value for the rke2-server/agent service.  This will prevent the service from starting until the next reboot. True=start, False= don't start. | `bool` | `true` | no |
 | <a name="input_rke2_version"></a> [rke2\_version](#input\_rke2\_version) | Version to use for RKE2 server nodepool | `string` | `null` | no |
+| <a name="input_server_node_health_check_grace_period"></a> [server\_node\_health\_check\_grace\_period](#input\_server\_node\_health\_check\_grace\_period) | Time (in seconds) after instance comes into service before checking health of the server node. | `number` | `300` | no |
+| <a name="input_server_node_health_check_type"></a> [server\_node\_health\_check\_type](#input\_server\_node\_health\_check\_type) | Type of health check to use for server nodes (EC2 or ELB). Setting to ELB will wait until the target groups health checks pass before declaring an EC2 instance healthy.  (i.e. It will check to make sure the kubeapi server is up.) | `string` | `"EC2"` | no |
 | <a name="input_servers"></a> [servers](#input\_servers) | Number of servers to create | `number` | `3` | no |
 | <a name="input_spot"></a> [spot](#input\_spot) | Toggle spot requests for server pool | `bool` | `false` | no |
 | <a name="input_ssh_authorized_keys"></a> [ssh\_authorized\_keys](#input\_ssh\_authorized\_keys) | Server pool list of public keys to add as authorized ssh keys | `list(string)` | `[]` | no |
@@ -243,6 +246,7 @@ Optional policies have the option of being created by default, but are specified
 | <a name="input_unzip_rpm_url"></a> [unzip\_rpm\_url](#input\_unzip\_rpm\_url) | URL path to unzip rpm | `string` | `""` | no |
 | <a name="input_vpc_id"></a> [vpc\_id](#input\_vpc\_id) | VPC ID to create resources in | `string` | n/a | yes |
 | <a name="input_wait_for_capacity_timeout"></a> [wait\_for\_capacity\_timeout](#input\_wait\_for\_capacity\_timeout) | How long Terraform should wait for ASG instances to be healthy before timing out. | `string` | `"10m"` | no |
+| <a name="input_wait_for_elb_capacity_on_updates"></a> [wait\_for\_elb\_capacity\_on\_updates](#input\_wait\_for\_elb\_capacity\_on\_updates) | Setting this will cause Terraform to wait for min\_elb\_capacity healthy instances from this Auto Scaling Group in all attached load balancers on update operations as well. | `bool` | `false` | no |
 
 ## Outputs
 
diff --git a/modules/nodepool/README.md b/modules/nodepool/README.md
index 111254d..a9e94da 100644
--- a/modules/nodepool/README.md
+++ b/modules/nodepool/README.md
@@ -31,6 +31,7 @@ No modules.
 | <a name="input_block_device_mappings"></a> [block\_device\_mappings](#input\_block\_device\_mappings) | n/a | `map(string)` | <pre>{<br>  "size": 30,<br>  "type": "gp2"<br>}</pre> | no |
 | <a name="input_extra_block_device_mappings"></a> [extra\_block\_device\_mappings](#input\_extra\_block\_device\_mappings) | n/a | `list(map(string))` | `[]` | no |
 | <a name="input_extra_cloud_config_config"></a> [extra\_cloud\_config\_config](#input\_extra\_cloud\_config\_config) | extra config to append to cloud-config | `string` | `""` | no |
+| <a name="input_health_check_grace_period"></a> [health\_check\_grace\_period](#input\_health\_check\_grace\_period) | n/a | `number` | `300` | no |
 | <a name="input_health_check_type"></a> [health\_check\_type](#input\_health\_check\_type) | n/a | `string` | `"EC2"` | no |
 | <a name="input_iam_instance_profile"></a> [iam\_instance\_profile](#input\_iam\_instance\_profile) | n/a | `string` | `""` | no |
 | <a name="input_instance_type"></a> [instance\_type](#input\_instance\_type) | n/a | `string` | `"t3.medium"` | no |
@@ -46,6 +47,7 @@ No modules.
 | <a name="input_vpc_id"></a> [vpc\_id](#input\_vpc\_id) | n/a | `string` | n/a | yes |
 | <a name="input_vpc_security_group_ids"></a> [vpc\_security\_group\_ids](#input\_vpc\_security\_group\_ids) | n/a | `list(string)` | `[]` | no |
 | <a name="input_wait_for_capacity_timeout"></a> [wait\_for\_capacity\_timeout](#input\_wait\_for\_capacity\_timeout) | How long Terraform should wait for ASG instances to be healthy before timing out. | `string` | `"10m"` | no |
+| <a name="input_wait_for_elb_capacity_on_updates"></a> [wait\_for\_elb\_capacity\_on\_updates](#input\_wait\_for\_elb\_capacity\_on\_updates) | n/a | `bool` | `false` | no |
 
 ## Outputs