From 8bf7c4fe6060586862c540f9810a75c9be47d89c Mon Sep 17 00:00:00 2001 From: Min RK Date: Tue, 18 Jan 2022 12:07:37 +0100 Subject: [PATCH 1/4] update terraform to 1.1 --- terraform/modules/mybinder/versions.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/modules/mybinder/versions.tf b/terraform/modules/mybinder/versions.tf index 5c361da2b..dce0235cd 100644 --- a/terraform/modules/mybinder/versions.tf +++ b/terraform/modules/mybinder/versions.tf @@ -9,5 +9,5 @@ terraform { version = "~> 3.0.0" } } - required_version = "~> 0.13" + required_version = "~> 1.1" } From 20ef6c7d749b5adbde0bb37ace755202cfbc49af Mon Sep 17 00:00:00 2001 From: Min RK Date: Tue, 18 Jan 2022 12:08:32 +0100 Subject: [PATCH 2/4] terraform: bump kubernetes to 1.19.14-gke.1900 matches current auto-upgrade version, doesn't actually upgrade nodes --- terraform/prod/main.tf | 18 +++++++++++++++++- terraform/staging/main.tf | 10 ++++++++-- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/terraform/prod/main.tf b/terraform/prod/main.tf index b429f28af..b647c6341 100644 --- a/terraform/prod/main.tf +++ b/terraform/prod/main.tf @@ -12,7 +12,7 @@ provider "google" { } locals { - gke_version = "1.17.14-gke.400" + gke_version = "1.19.14-gke.1900" location = "us-central1" # for regional clusters federation_members = ["gke-old", "gesis", "turing", "ovh"] } @@ -61,6 +61,14 @@ resource "google_container_node_pool" "core" { disable-legacy-endpoints = "true" } } + + # do not recreate pools that have been auto-upgraded + + lifecycle { + ignore_changes = [ + version + ] + } } resource "google_container_node_pool" "user" { @@ -97,6 +105,14 @@ resource "google_container_node_pool" "user" { disable-legacy-endpoints = "true" } } + + # do not recreate pools that have been auto-upgraded + + lifecycle { + ignore_changes = [ + version + ] + } } # other prod-only resources, not required for both prod and staging, diff --git a/terraform/staging/main.tf b/terraform/staging/main.tf 
index 6e2c39a59..93098ae0e 100644 --- a/terraform/staging/main.tf +++ b/terraform/staging/main.tf @@ -12,7 +12,7 @@ provider "google" { } locals { - gke_version = "1.17.14-gke.400" + gke_version = "1.19.14-gke.1900" } module "mybinder" { @@ -49,6 +49,13 @@ resource "google_container_node_pool" "pool" { disable-legacy-endpoints = "true" } } + # do not recreate pools that have been auto-upgraded + + lifecycle { + ignore_changes = [ + version + ] + } } # output "public_ip" { @@ -66,4 +73,3 @@ output "matomo_password" { value = module.mybinder.matomo_password sensitive = true } - From 3f9a0d995f0ec3f897899c1599e64afa4e8f6dc5 Mon Sep 17 00:00:00 2001 From: Min RK Date: Tue, 18 Jan 2022 12:17:43 +0100 Subject: [PATCH 3/4] add user pool with reduced pd-ssd size should reduce operational costs by ~1k/month sets autoscale-max on existing user pool to 1 to avoid allocation of new nodes. Can delete old user pool once it's drained. --- terraform/prod/main.tf | 50 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/terraform/prod/main.tf b/terraform/prod/main.tf index b647c6341..8db18a285 100644 --- a/terraform/prod/main.tf +++ b/terraform/prod/main.tf @@ -66,7 +66,7 @@ resource "google_container_node_pool" "core" { lifecycle { ignore_changes = [ - version + version ] } } @@ -79,6 +79,50 @@ resource "google_container_node_pool" "user" { node_locations = ["${local.location}-a"] version = local.gke_version + autoscaling { + min_node_count = 0 + max_node_count = 1 + } + + + node_config { + machine_type = "n1-highmem-8" + disk_size_gb = 1000 + disk_type = "pd-ssd" + local_ssd_count = 1 + + labels = { + "mybinder.org/pool-type" = "users" + } + # https://www.terraform.io/docs/providers/google/r/container_cluster.html#oauth_scopes-1 + oauth_scopes = [ + "storage-ro", + "logging-write", + "monitoring", + ] + + metadata = { + disable-legacy-endpoints = "true" + } + } + + # do not recreate pools that have been auto-upgraded + + 
lifecycle { + ignore_changes = [ + version + ] + } +} + +resource "google_container_node_pool" "user1" { + name = "user-202201" + cluster = module.mybinder.cluster_name + location = local.location # location of *cluster* + # node_locations lets us specify a single-zone regional cluster: + node_locations = ["${local.location}-a"] + version = local.gke_version + autoscaling { min_node_count = 2 max_node_count = 12 @@ -87,7 +131,7 @@ resource "google_container_node_pool" "user" { node_config { machine_type = "n1-highmem-8" - disk_size_gb = 1000 + disk_size_gb = 500 disk_type = "pd-ssd" local_ssd_count = 1 @@ -110,7 +154,7 @@ resource "google_container_node_pool" "user" { lifecycle { ignore_changes = [ - version + version ] } } From c0377e98cbfbf51281794b864100444c3a50ea96 Mon Sep 17 00:00:00 2001 From: Min RK Date: Tue, 25 Jan 2022 11:15:01 +0100 Subject: [PATCH 4/4] switch storage to pd-balanced saves cost without losing as much space/performance --- terraform/prod/main.tf | 57 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/terraform/prod/main.tf b/terraform/prod/main.tf index 8db18a285..db7a7a43f 100644 --- a/terraform/prod/main.tf +++ b/terraform/prod/main.tf @@ -28,6 +28,14 @@ module "mybinder" { } # define node pools here, too hard to encode with variables +# note: when upgrading a node pool: +# 1. copy the pool to be upgraded and change the name +# 2. make the planned changes +# 3. deploy them with terraform +# 4. drain old pools (takes a while for user pools) +# 5. once drained, remove old pool(s) here +# 6. 
deploy again to remove old pool + resource "google_container_node_pool" "core" { name = "core-202009" cluster = module.mybinder.cluster_name @@ -35,6 +43,49 @@ resource "google_container_node_pool" "core" { # node_locations lets us specify a single-zone regional cluster: node_locations = ["${local.location}-a"] + autoscaling { + min_node_count = 0 + max_node_count = 1 + } + + version = local.gke_version + + node_config { + machine_type = "n1-highmem-4" + disk_size_gb = 250 + disk_type = "pd-ssd" + + labels = { + "mybinder.org/pool-type" = "core" + } + # https://www.terraform.io/docs/providers/google/r/container_cluster.html#oauth_scopes-1 + oauth_scopes = [ + "storage-ro", + "logging-write", + "monitoring", + ] + + metadata = { + disable-legacy-endpoints = "true" + } + } + + # do not recreate pools that have been auto-upgraded + + lifecycle { + ignore_changes = [ + version + ] + } +} + +resource "google_container_node_pool" "core1" { + name = "core-202201" + cluster = module.mybinder.cluster_name + location = local.location # location of *cluster* + # node_locations lets us specify a single-zone regional cluster: + node_locations = ["${local.location}-a"] + autoscaling { min_node_count = 1 max_node_count = 4 @@ -45,7 +96,7 @@ resource "google_container_node_pool" "core" { node_config { machine_type = "n1-highmem-4" disk_size_gb = 250 - disk_type = "pd-ssd" + disk_type = "pd-balanced" labels = { "mybinder.org/pool-type" = "core" @@ -131,8 +182,8 @@ resource "google_container_node_pool" "user1" { node_config { machine_type = "n1-highmem-8" - disk_size_gb = 500 - disk_type = "pd-ssd" + disk_size_gb = 800 + disk_type = "pd-balanced" local_ssd_count = 1 labels = {