Use instance pools for infra and worker node groups #98

Merged (6 commits) on Feb 10, 2025
README.md (2 additions, 0 deletions)

@@ -51,6 +51,8 @@ The module provides variables to
* provide an API token for control.vshn.net (see next sections for details).
* choose a dedicated deployment target
This allows for using dedicated hypervisors.
* choose to provision Exoscale instance pools for the infra and worker nodes, as shown in the example after this list.
NOTE: we currently don't support provisioning Exoscale instance pools for the control plane and storage nodes.
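
A minimal sketch of how this can be enabled from a configuration that consumes this module; the module source shown is a placeholder and the other required arguments are elided, only `use_instancepools` is the option described here:

```hcl
module "cluster" {
  # Placeholder source; in practice this points at this module.
  source = "./terraform-openshift4-exoscale"

  # ... API token, base domain, cluster id and the other required
  # variables described above ...

  # Provision Exoscale instance pools for the infra and worker node groups.
  # Control plane and storage nodes keep individual compute instances.
  use_instancepools = true
}
```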

The cluster's domain is constructed from the provided base domain, cluster id and cluster name.
If a cluster name is provided the cluster domain is set to `<cluster name>.<base domain>`.
control_plane.tf (5 additions, 0 deletions)

@@ -32,6 +32,11 @@ module "master" {
deploy_target_id = var.deploy_target_id

bootstrap_bucket = var.bootstrap_bucket

# Don't use instance pool for control plane nodes, since scaling them is
# much easier without an instance pool by just stopping/scaling/starting the
# same VM.
use_instancepool = false
}

resource "exoscale_domain_record" "etcd" {
infra.tf (2 additions, 0 deletions)

@@ -33,4 +33,6 @@ module "infra" {
deploy_target_id = var.deploy_target_id

bootstrap_bucket = var.bootstrap_bucket

use_instancepool = var.use_instancepools
}
modules/node-group/main.tf (32 additions, 4 deletions)

@@ -17,7 +17,11 @@ locals {
// having to work around merge() being a shallow merge in the compute
// instance resource.
user_data = [
for hostname in random_id.node_id[*].hex :
// NOTE(sg): we only need to patch each node's user-data to have a custom
// /etc/hosts for non-instancepool setups. For instancepool setups, we
// only need a single user-data and we don't actually use the value of
// `hostname`.
for hostname in(var.use_instancepool ? ["pool_member"] : random_id.node_id[*].hex) :
{
"ignition" : {
"version" : "3.1.0",
@@ -42,7 +46,7 @@ locals {
"storage" : {
// concatenate the private network config (if requested) with the
// `/etc/hostname` override.
"files" : concat(
"files" : var.use_instancepool ? [] : concat(
var.use_privnet ? local.privnet_config_files : [],
// override /etc/hostname with short hostname, this works around the
// fact that we can't set a separate `name` and `display_name` for
@@ -158,7 +162,7 @@
}

resource "random_id" "node_id" {
count = var.node_count
count = var.use_instancepool ? 0 : var.node_count
prefix = "${var.role}-"
byte_length = 2
}
@@ -170,7 +174,7 @@ resource "exoscale_anti_affinity_group" "anti_affinity_group" {
}

resource "exoscale_compute_instance" "nodes" {
count = var.node_count
count = var.use_instancepool ? 0 : var.node_count
name = "${random_id.node_id[count.index].hex}.${var.cluster_domain}"
ssh_key = var.ssh_key_pair
zone = var.region
@@ -207,3 +211,27 @@ resource "exoscale_compute_instance" "nodes" {
]
}
}

// When instance pools are enabled, one pool is created per anti-affinity
// group instead of individual compute instances.
resource "exoscale_instance_pool" "nodes" {
count = var.use_instancepool ? local.anti_affinity_group_count : 0
name = "${var.cluster_id}_${var.role}-${count.index}"
size = var.node_count
zone = var.region
key_pair = var.ssh_key_pair
template_id = var.template_id

instance_prefix = var.role
instance_type = var.instance_type

disk_size = local.disk_size
user_data = jsonencode(local.user_data[0])

deploy_target_id = var.deploy_target_id

security_group_ids = var.security_group_ids

anti_affinity_group_ids = concat(
[exoscale_anti_affinity_group.anti_affinity_group[count.index].id],
var.additional_affinity_group_ids
)
}
modules/node-group/output.tf (5 additions, 1 deletion)

@@ -1,3 +1,7 @@
locals {
// Public IP addresses of all instance pool members, flattened across pools.
instance_pool_ips = var.use_instancepool ? flatten(exoscale_instance_pool.nodes[*].instances[*].public_ip_address) : []
// Addresses of the individually managed instances (private network IP when `use_privnet` is set).
instance_ips = var.use_privnet ? exoscale_compute_instance.nodes[*].network_interface[0].ip_address : exoscale_compute_instance.nodes[*].public_ip_address
}
output "ip_address" {
value = var.use_privnet ? exoscale_compute_instance.nodes[*].network_interface[0].ip_address : exoscale_compute_instance.nodes[*].public_ip_address
value = var.use_instancepool ? local.instance_pool_ips : local.instance_ips
}
modules/node-group/variables.tf (6 additions, 0 deletions)

@@ -126,3 +126,9 @@ variable "affinity_group_capacity" {
default = 0
description = "Capacity of the affinity group, e.g. when using dedicated hypervisors, default: 0 (unlimited)"
}

variable "use_instancepool" {
type = bool
description = "Use instancepool for this node group"
default = false
}
security_groups.tf (19 additions, 0 deletions)

@@ -160,3 +160,22 @@ resource "exoscale_security_group_rule" "storage" {

user_security_group_id = exoscale_security_group.all_machines.id
}

resource "exoscale_security_group" "worker" {
name = "${var.cluster_id}_worker"
description = "${var.cluster_id} worker nodes"
}

resource "exoscale_security_group_rule" "worker_nodeports" {
for_each = toset(["TCP", "UDP"])

security_group_id = exoscale_security_group.worker.id

type = "INGRESS"
protocol = each.value
description = "Access to worker node ports from anywhere"
start_port = 30000
end_port = 32767

cidr = "0.0.0.0/0"
}
storage.tf (4 additions, 0 deletions)

@@ -37,4 +37,8 @@ module "storage" {
deploy_target_id = var.deploy_target_id

bootstrap_bucket = var.bootstrap_bucket

# Don't use instancepool for storage nodes so that we can keep the existing
# day 2 operations how-tos (scaling disk etc.).
use_instancepool = false
}
worker.tf (8 additions, 2 deletions)

@@ -26,7 +26,8 @@ module "worker" {

security_group_ids = concat(
var.additional_security_group_ids,
[exoscale_security_group.all_machines.id]
[exoscale_security_group.all_machines.id],
var.use_instancepools ? [exoscale_security_group.worker.id] : []
)

affinity_group_capacity = var.affinity_group_capacity
@@ -35,6 +36,8 @@
deploy_target_id = var.deploy_target_id

bootstrap_bucket = var.bootstrap_bucket

use_instancepool = var.use_instancepools
}

// Additional worker groups.
@@ -71,7 +74,8 @@ module "additional_worker" {

security_group_ids = concat(
var.additional_security_group_ids,
[exoscale_security_group.all_machines.id]
[exoscale_security_group.all_machines.id],
var.use_instancepools ? [exoscale_security_group.worker.id] : []
)

affinity_group_capacity = var.affinity_group_capacity
@@ -83,4 +87,6 @@
deploy_target_id = var.deploy_target_id

bootstrap_bucket = var.bootstrap_bucket

use_instancepool = true
}