From 2fc3debd0264aa3eb03d70e1602fa41e7c360b09 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Tue, 9 Jan 2024 09:01:41 +0100 Subject: [PATCH 1/8] terraform, azure: increase misc config resolution to help during upgrades --- terraform/azure/main.tf | 33 ++++++++++++++---------- terraform/azure/projects/utoronto.tfvars | 21 ++++++++++++--- terraform/azure/variables.tf | 29 +++++++++++---------- 3 files changed, 53 insertions(+), 30 deletions(-) diff --git a/terraform/azure/main.tf b/terraform/azure/main.tf index ce605763ac..8f8f94d0cf 100644 --- a/terraform/azure/main.tf +++ b/terraform/azure/main.tf @@ -93,22 +93,31 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" { } } - # Core node-pool + # default_node_pool must be set, and it must be a node pool of system type + # that can't scale to zero. Due to that we are forced to use it, and have + # decided to use it as our core node pool. + # + # Most changes to this node pool forces a replace operation on the entire + # cluster. This can be avoided with v3.47.0+ of this provider by declaring + # temporary_name_for_rotation = "core-b". + # + # ref: https://github.com/hashicorp/terraform-provider-azurerm/pull/20628 + # ref: https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/kubernetes_cluster#temporary_name_for_rotation. 
+ # default_node_pool { - # Unfortunately, changing anything about VM type / size recreates *whole cluster - name = "core" - vm_size = var.core_node_vm_size - os_disk_size_gb = 40 + name = var.core_node_pool.name + vm_size = var.core_node_pool.vm_size + os_disk_size_gb = var.core_node_pool.os_disk_size_gb enable_auto_scaling = true - min_count = 1 - max_count = 10 + min_count = var.core_node_pool.min + max_count = var.core_node_pool.max vnet_subnet_id = azurerm_subnet.node_subnet.id - node_labels = { + node_labels = merge({ "hub.jupyter.org/node-purpose" = "core", "k8s.dask.org/node-purpose" = "core" - } + }, var.core_node_pool.labels) - orchestrator_version = var.kubernetes_version + orchestrator_version = coalesce(var.core_node_pool.kubernetes_version, var.kubernetes_version) } auto_scaler_profile { @@ -140,7 +149,7 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" { resource "azurerm_kubernetes_cluster_node_pool" "user_pool" { for_each = var.notebook_nodes - name = "nb${each.key}" + name = coalesce(each.value.name, each.key) kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id enable_auto_scaling = true os_disk_size_gb = 200 @@ -152,7 +161,6 @@ resource "azurerm_kubernetes_cluster_node_pool" "user_pool" { node_labels = merge({ "hub.jupyter.org/node-purpose" = "user", "k8s.dask.org/node-purpose" = "scheduler" - "hub.jupyter.org/node-size" = each.value.vm_size }, each.value.labels) node_taints = concat([ @@ -180,7 +188,6 @@ resource "azurerm_kubernetes_cluster_node_pool" "dask_pool" { vm_size = each.value.vm_size node_labels = merge({ "k8s.dask.org/node-purpose" = "worker", - "hub.jupyter.org/node-size" = each.value.vm_size }, each.value.labels) node_taints = concat([ diff --git a/terraform/azure/projects/utoronto.tfvars b/terraform/azure/projects/utoronto.tfvars index eaf3d01c87..8c3eeaddaf 100644 --- a/terraform/azure/projects/utoronto.tfvars +++ b/terraform/azure/projects/utoronto.tfvars @@ -13,16 +13,31 @@ ssh_pub_key = "ssh-rsa 
AAAAB3NzaC1yc2EAAAADAQABAAABAQDQJ4h39UYNi1wybxAH+jCFkNK2 # kubernetes_version = "1.26.3" -# FIXME: upgrade core_node_vm_size to Standard_E4s_v5 -core_node_vm_size = "Standard_E4s_v3" +core_node_pool = { + name : "core", + vm_size : "Standard_E4s_v3", + kubernetes_version : "1.26.3", +} notebook_nodes = { "default" : { + name : "nbdefault", # NOTE: min-max below was set to 0-86 retroactively to align with # observed state without understanding on why 0-86 was picked. min : 0, max : 86, # FIXME: upgrade user nodes vm_size to Standard_E8s_v5 vm_size : "Standard_E8s_v3", - } + # FIXME: remove this label + labels : { + "hub.jupyter.org/node-size" = "Standard_E8s_v3", + }, + kubernetes_version : "1.26.3", + }, + #"usere8sv5" : { + # min : 0, + # max : 100, + # vm_size : "Standard_E8s_v5", + # kubernetes_version : "1.28.3", + #} } diff --git a/terraform/azure/variables.tf b/terraform/azure/variables.tf index b9cd943680..a7982e1c55 100644 --- a/terraform/azure/variables.tf +++ b/terraform/azure/variables.tf @@ -48,20 +48,6 @@ variable "kubernetes_version" { } -variable "core_node_vm_size" { - type = string - description = <<-EOT - VM Size to use for core nodes - - Core nodes will always be on, and count as 'base cost' - for a cluster. We should try to run with as few of them - as possible. - - WARNING: CHANGING THIS WILL DESTROY AND RECREATE THE CLUSTER! 
- EOT -} - - variable "global_container_registry_name" { type = string description = <<-EOT @@ -92,8 +78,23 @@ variable "ssh_pub_key" { EOT } +variable "core_node_pool" { + type = object({ + name : optional(string, ""), + min : optional(number, 1), + max : optional(number, 10), + vm_size : string, + labels : optional(map(string), {}), + taints : optional(list(string), []), + os_disk_size_gb : optional(number, 40), + kubernetes_version : optional(string, "") + }) + description = "Core node pool" +} + variable "notebook_nodes" { type = map(object({ + name : optional(string, ""), min : number, max : number, vm_size : string, From 7eb118d15445f38adeac13b992e0eae77e3b4034 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Tue, 9 Jan 2024 09:02:26 +0100 Subject: [PATCH 2/8] terraform, utoronto: upgrade from 1.26.3 to 1.27.7 --- terraform/azure/projects/utoronto.tfvars | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/azure/projects/utoronto.tfvars b/terraform/azure/projects/utoronto.tfvars index 8c3eeaddaf..d06c6b8007 100644 --- a/terraform/azure/projects/utoronto.tfvars +++ b/terraform/azure/projects/utoronto.tfvars @@ -11,7 +11,7 @@ ssh_pub_key = "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDQJ4h39UYNi1wybxAH+jCFkNK2 # FIXME: upgrade to 1.27.7, and then 1.28.3, based on the latest versions # available via: az aks get-versions --location westus2 -o table # -kubernetes_version = "1.26.3" +kubernetes_version = "1.27.7" core_node_pool = { name : "core", From 04f93699993d344deeea7e20a45fe640f486901d Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Tue, 9 Jan 2024 17:41:00 +0100 Subject: [PATCH 3/8] terraform, azure and utoronto: k8s upgrade with misc variables to support it --- terraform/azure/main.tf | 23 +++++++----- terraform/azure/projects/utoronto.tfvars | 47 ++++++++++++++++++------ terraform/azure/variables.tf | 19 ++++++---- 3 files changed, 60 insertions(+), 29 deletions(-) diff --git a/terraform/azure/main.tf b/terraform/azure/main.tf index 
8f8f94d0cf..5777b68340 100644 --- a/terraform/azure/main.tf +++ b/terraform/azure/main.tf @@ -99,7 +99,7 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" { # # Most changes to this node pool forces a replace operation on the entire # cluster. This can be avoided with v3.47.0+ of this provider by declaring - # temporary_name_for_rotation = "core-b". + # temporary_name_for_rotation = "coreb". # # ref: https://github.com/hashicorp/terraform-provider-azurerm/pull/20628 # ref: https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/kubernetes_cluster#temporary_name_for_rotation. @@ -108,9 +108,11 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" { name = var.core_node_pool.name vm_size = var.core_node_pool.vm_size os_disk_size_gb = var.core_node_pool.os_disk_size_gb - enable_auto_scaling = true - min_count = var.core_node_pool.min - max_count = var.core_node_pool.max + enable_auto_scaling = var.core_node_pool.enable_auto_scaling + min_count = var.core_node_pool.enable_auto_scaling ? var.core_node_pool.min : null + max_count = var.core_node_pool.enable_auto_scaling ? var.core_node_pool.max : null + node_count = var.core_node_pool.node_count + kubelet_disk_type = var.core_node_pool.kubelet_disk_type vnet_subnet_id = azurerm_subnet.node_subnet.id node_labels = merge({ "hub.jupyter.org/node-purpose" = "core", @@ -147,13 +149,14 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" { resource "azurerm_kubernetes_cluster_node_pool" "user_pool" { - for_each = var.notebook_nodes + for_each = var.user_node_pools name = coalesce(each.value.name, each.key) kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id enable_auto_scaling = true - os_disk_size_gb = 200 + os_disk_size_gb = each.value.os_disk_size_gb vnet_subnet_id = azurerm_subnet.node_subnet.id + kubelet_disk_type = each.value.kubelet_disk_type orchestrator_version = each.value.kubernetes_version == "" ? 
var.kubernetes_version : each.value.kubernetes_version @@ -173,14 +176,14 @@ resource "azurerm_kubernetes_cluster_node_pool" "user_pool" { } resource "azurerm_kubernetes_cluster_node_pool" "dask_pool" { - # If dask_nodes is set, we use that. If it isn't, we use notebook_nodes. - # This lets us set dask_nodes to an empty array to get no dask nodes - for_each = var.dask_nodes + # If dask_node_pools is set, we use that. If it isn't, we use user_node_pools. + # This lets us set dask_node_pools to an empty array to get no dask nodes + for_each = var.dask_node_pools name = "dask${each.key}" kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id enable_auto_scaling = true - os_disk_size_gb = 200 + os_disk_size_gb = each.value.os_disk_size_gb vnet_subnet_id = azurerm_subnet.node_subnet.id orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version diff --git a/terraform/azure/projects/utoronto.tfvars b/terraform/azure/projects/utoronto.tfvars index d06c6b8007..c4246f9bf6 100644 --- a/terraform/azure/projects/utoronto.tfvars +++ b/terraform/azure/projects/utoronto.tfvars @@ -8,18 +8,40 @@ location = "canadacentral" storage_size = 8192 ssh_pub_key = "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDQJ4h39UYNi1wybxAH+jCFkNK2aqRcuhDkQSMx0Hak5xkbt3KnT3cOwAgUP1Vt/SjhltSTuxpOHxiAKCRnjwRk60SxKhUNzPHih2nkfYTmBBjmLfdepDPSke/E0VWvTDIEXz/L8vW8aI0QGPXnXyqzEDO9+U1buheBlxB0diFAD3vEp2SqBOw+z7UgrGxXPdP+2b3AV+X6sOtd6uSzpV8Qvdh+QAkd4r7h9JrkFvkrUzNFAGMjlTb0Lz7qAlo4ynjEwzVN2I1i7cVDKgsGz9ZG/8yZfXXx+INr9jYtYogNZ63ajKR/dfjNPovydhuz5zQvQyxpokJNsTqt1CiWEUNj georgiana@georgiana" -# FIXME: upgrade to 1.27.7, and then 1.28.3, based on the latest versions -# available via: az aks get-versions --location westus2 -o table -# -kubernetes_version = "1.27.7" +# List available versions via: az aks get-versions --location westus2 -o table +kubernetes_version = "1.28.3" core_node_pool = { name : "core", + kubernetes_version : "1.28.3", + + # FIXME: 
transition to "Standard_E2s_v5" nodes as they are large enough and + # can more cheaply handle being forced to have 2-3 replicas for silly + # reasons like three calico-typha pods. See + # https://github.com/2i2c-org/infrastructure/issues/3592#issuecomment-1883269632. + # + # Transitioning to E2s_v5 would require reducing the requested memory + # by prometheus-server though, but that should be okay since + # prometheus has reduced its memory profile significant enough recently. + # vm_size : "Standard_E4s_v3", - kubernetes_version : "1.26.3", + + # FIXME: stop using persistent disks for the nodes, use the variable default + # "Temporary" instead + kubelet_disk_type : "OS", + + # FIXME: use a larger os_disk_size_gb than 40, like the default of 100, to + # avoid running low when few replicas are used + os_disk_size_gb : 40, + + # FIXME: its nice to use autoscaling, but we end up with three replicas due to + # https://github.com/2i2c-org/infrastructure/issues/3592#issuecomment-1883269632 + # and its a waste at least using Standard_E4s_v3 machines. 
+ enable_auto_scaling : false, + node_count : 2, } -notebook_nodes = { +user_node_pools = { "default" : { name : "nbdefault", # NOTE: min-max below was set to 0-86 retroactively to align with @@ -33,11 +55,12 @@ notebook_nodes = { "hub.jupyter.org/node-size" = "Standard_E8s_v3", }, kubernetes_version : "1.26.3", + # FIXME: stop using persistent disks for the nodes, use Temporary instead + kubelet_disk_type : "OS", }, - #"usere8sv5" : { - # min : 0, - # max : 100, - # vm_size : "Standard_E8s_v5", - # kubernetes_version : "1.28.3", - #} + "usere8sv5" : { + min : 0, + max : 100, + vm_size : "Standard_E8s_v5", + } } diff --git a/terraform/azure/variables.tf b/terraform/azure/variables.tf index a7982e1c55..dda5c1fab6 100644 --- a/terraform/azure/variables.tf +++ b/terraform/azure/variables.tf @@ -81,18 +81,21 @@ variable "ssh_pub_key" { variable "core_node_pool" { type = object({ name : optional(string, ""), + enable_auto_scaling = optional(bool, true), min : optional(number, 1), max : optional(number, 10), + node_count : optional(number), vm_size : string, labels : optional(map(string), {}), taints : optional(list(string), []), - os_disk_size_gb : optional(number, 40), - kubernetes_version : optional(string, "") + os_disk_size_gb : optional(number, 100), + kubernetes_version : optional(string, ""), + kubelet_disk_type : optional(string, "Temporary"), }) description = "Core node pool" } -variable "notebook_nodes" { +variable "user_node_pools" { type = map(object({ name : optional(string, ""), min : number, @@ -100,20 +103,22 @@ variable "notebook_nodes" { vm_size : string, labels : optional(map(string), {}), taints : optional(list(string), []), - kubernetes_version : optional(string, "") + os_disk_size_gb : optional(number, 200), + kubernetes_version : optional(string, ""), + kubelet_disk_type : optional(string, "Temporary"), })) - description = "Notebook node pools to create" + description = "User node pools to create" default = {} } -variable "dask_nodes" { +variable 
"dask_node_pools" { type = map(object({ min : number, max : number, vm_size : string, labels : optional(map(string), {}), taints : optional(list(string), []), - kubernetes_version : optional(string, "") + kubernetes_version : optional(string, ""), })) description = "Dask node pools to create" default = {} From ed82584f65c608251bf0c86723da688b072bfa23 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Tue, 9 Jan 2024 17:42:07 +0100 Subject: [PATCH 4/8] terraform, utoronto: cleanup old user node pool --- terraform/azure/projects/utoronto.tfvars | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/terraform/azure/projects/utoronto.tfvars b/terraform/azure/projects/utoronto.tfvars index c4246f9bf6..361f726e2e 100644 --- a/terraform/azure/projects/utoronto.tfvars +++ b/terraform/azure/projects/utoronto.tfvars @@ -42,25 +42,9 @@ core_node_pool = { } user_node_pools = { - "default" : { - name : "nbdefault", - # NOTE: min-max below was set to 0-86 retroactively to align with - # observed state without understanding on why 0-86 was picked. - min : 0, - max : 86, - # FIXME: upgrade user nodes vm_size to Standard_E8s_v5 - vm_size : "Standard_E8s_v3", - # FIXME: remove this label - labels : { - "hub.jupyter.org/node-size" = "Standard_E8s_v3", - }, - kubernetes_version : "1.26.3", - # FIXME: stop using persistent disks for the nodes, use Temporary instead - kubelet_disk_type : "OS", - }, "usere8sv5" : { min : 0, max : 100, vm_size : "Standard_E8s_v5", - } + }, } From 9485b3ddff566541a9d9d9c5219ae23f5cc4befc Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Tue, 9 Jan 2024 18:06:48 +0100 Subject: [PATCH 5/8] terraform, utoronto: remove resolved comment This comment was relevant before https://github.com/2i2c-org/infrastructure/pull/3595, but not after. 
--- terraform/azure/projects/utoronto.tfvars | 4 ---- 1 file changed, 4 deletions(-) diff --git a/terraform/azure/projects/utoronto.tfvars b/terraform/azure/projects/utoronto.tfvars index 361f726e2e..fc51571045 100644 --- a/terraform/azure/projects/utoronto.tfvars +++ b/terraform/azure/projects/utoronto.tfvars @@ -20,10 +20,6 @@ core_node_pool = { # reasons like three calico-typha pods. See # https://github.com/2i2c-org/infrastructure/issues/3592#issuecomment-1883269632. # - # Transitioning to E2s_v5 would require reducing the requested memory - # by prometheus-server though, but that should be okay since - # prometheus has reduced its memory profile significant enough recently. - # vm_size : "Standard_E4s_v3", # FIXME: stop using persistent disks for the nodes, use the variable default From d02bc52b4653c5216821b0508ff77aef624fd612 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Wed, 10 Jan 2024 09:18:39 +0100 Subject: [PATCH 6/8] terraform, azure: autoscaling core node pool and refactor to reduce duplication --- terraform/azure/main.tf | 118 +++++++++++------------ terraform/azure/projects/utoronto.tfvars | 66 +++++++------ terraform/azure/variables.tf | 45 ++------- 3 files changed, 102 insertions(+), 127 deletions(-) diff --git a/terraform/azure/main.tf b/terraform/azure/main.tf index 5777b68340..36e0b567d7 100644 --- a/terraform/azure/main.tf +++ b/terraform/azure/main.tf @@ -37,6 +37,7 @@ terraform { provider "azuread" { tenant_id = var.tenant_id } + provider "azurerm" { subscription_id = var.subscription_id features {} @@ -93,35 +94,6 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" { } } - # default_node_pool must be set, and it must be a node pool of system type - # that can't scale to zero. Due to that we are forced to use it, and have - # decided to use it as our core node pool. - # - # Most changes to this node pool forces a replace operation on the entire - # cluster. 
This can be avoided with v3.47.0+ of this provider by declaring - # temporary_name_for_rotation = "coreb". - # - # ref: https://github.com/hashicorp/terraform-provider-azurerm/pull/20628 - # ref: https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/kubernetes_cluster#temporary_name_for_rotation. - # - default_node_pool { - name = var.core_node_pool.name - vm_size = var.core_node_pool.vm_size - os_disk_size_gb = var.core_node_pool.os_disk_size_gb - enable_auto_scaling = var.core_node_pool.enable_auto_scaling - min_count = var.core_node_pool.enable_auto_scaling ? var.core_node_pool.min : null - max_count = var.core_node_pool.enable_auto_scaling ? var.core_node_pool.max : null - node_count = var.core_node_pool.node_count - kubelet_disk_type = var.core_node_pool.kubelet_disk_type - vnet_subnet_id = azurerm_subnet.node_subnet.id - node_labels = merge({ - "hub.jupyter.org/node-purpose" = "core", - "k8s.dask.org/node-purpose" = "core" - }, var.core_node_pool.labels) - - orchestrator_version = coalesce(var.core_node_pool.kubernetes_version, var.kubernetes_version) - } - auto_scaler_profile { skip_nodes_with_local_storage = true } @@ -131,7 +103,8 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" { } network_profile { - # I don't trust Azure CNI + # Azure CNI is the default, but we don't trust it to be reliable, so we've + # opted to use kubenet instead network_plugin = "kubenet" network_policy = "calico" } @@ -144,68 +117,92 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" { client_secret = azuread_service_principal_password.service_principal_password[0].value } } -} + # default_node_pool must be set, and it must be a node pool of system type + # that can't scale to zero. Due to that we are forced to use it, and have + # decided to use it as our core node pool. + # + # Most changes to this node pool forces a replace operation on the entire + # cluster. 
This can be avoided with v3.47.0+ of this provider by declaring + # temporary_name_for_rotation = "coreb". + # + # ref: https://github.com/hashicorp/terraform-provider-azurerm/pull/20628 + # ref: https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/kubernetes_cluster#temporary_name_for_rotation. + # + default_node_pool { + name = var.node_pools["core"][0].name + vm_size = var.node_pools["core"][0].vm_size + os_disk_size_gb = var.node_pools["core"][0].os_disk_size_gb + kubelet_disk_type = var.node_pools["core"][0].kubelet_disk_type + enable_auto_scaling = true + min_count = var.node_pools["core"][0].min + max_count = var.node_pools["core"][0].max + + node_labels = merge({ + "hub.jupyter.org/node-purpose" = "core", + "k8s.dask.org/node-purpose" = "core" + }, var.node_pools["core"][0].labels) + node_taints = concat([], var.node_pools["core"][0].taints) + orchestrator_version = coalesce(var.node_pools["core"][0].kubernetes_version, var.kubernetes_version) -resource "azurerm_kubernetes_cluster_node_pool" "user_pool" { - for_each = var.user_node_pools + vnet_subnet_id = azurerm_subnet.node_subnet.id + } +} - name = coalesce(each.value.name, each.key) - kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id - enable_auto_scaling = true - os_disk_size_gb = each.value.os_disk_size_gb - vnet_subnet_id = azurerm_subnet.node_subnet.id - kubelet_disk_type = each.value.kubelet_disk_type - orchestrator_version = each.value.kubernetes_version == "" ? 
var.kubernetes_version : each.value.kubernetes_version +resource "azurerm_kubernetes_cluster_node_pool" "user_pool" { + for_each = { for i, v in var.node_pools["user"] : v.name => v } + + name = each.value.name + vm_size = each.value.vm_size + os_disk_size_gb = each.value.os_disk_size_gb + kubelet_disk_type = each.value.kubelet_disk_type + enable_auto_scaling = true + min_count = each.value.min + max_count = each.value.max - vm_size = each.value.vm_size node_labels = merge({ "hub.jupyter.org/node-purpose" = "user", "k8s.dask.org/node-purpose" = "scheduler" }, each.value.labels) - node_taints = concat([ "hub.jupyter.org_dedicated=user:NoSchedule" ], each.value.taints) + orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version - min_count = each.value.min - max_count = each.value.max + kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id + vnet_subnet_id = azurerm_subnet.node_subnet.id } -resource "azurerm_kubernetes_cluster_node_pool" "dask_pool" { - # If dask_node_pools is set, we use that. If it isn't, we use user_node_pools. - # This lets us set dask_node_pools to an empty array to get no dask nodes - for_each = var.dask_node_pools - name = "dask${each.key}" - kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id - enable_auto_scaling = true - os_disk_size_gb = each.value.os_disk_size_gb - vnet_subnet_id = azurerm_subnet.node_subnet.id +resource "azurerm_kubernetes_cluster_node_pool" "dask_pool" { + for_each = { for i, v in var.node_pools["dask"] : v.name => v } - orchestrator_version = each.value.kubernetes_version == "" ? 
var.kubernetes_version : each.value.kubernetes_version + name = each.value.name + vm_size = each.value.vm_size + os_disk_size_gb = each.value.os_disk_size_gb + kubelet_disk_type = each.value.kubelet_disk_type + enable_auto_scaling = true + min_count = each.value.min + max_count = each.value.max - vm_size = each.value.vm_size node_labels = merge({ "k8s.dask.org/node-purpose" = "worker", }, each.value.labels) - node_taints = concat([ "k8s.dask.org_dedicated=worker:NoSchedule" ], each.value.taints) + orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version - min_count = each.value.min - max_count = each.value.max + kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id + vnet_subnet_id = azurerm_subnet.node_subnet.id } -# AZure container registry resource "azurerm_container_registry" "container_registry" { - # meh, only alphanumberic chars. No separators. BE CONSISTENT, AZURE name = var.global_container_registry_name resource_group_name = azurerm_resource_group.jupyterhub.name location = azurerm_resource_group.jupyterhub.location @@ -213,6 +210,7 @@ resource "azurerm_container_registry" "container_registry" { admin_enabled = true } + locals { registry_creds = { "imagePullSecret" = { diff --git a/terraform/azure/projects/utoronto.tfvars b/terraform/azure/projects/utoronto.tfvars index fc51571045..e3c3bb8daa 100644 --- a/terraform/azure/projects/utoronto.tfvars +++ b/terraform/azure/projects/utoronto.tfvars @@ -11,36 +11,38 @@ ssh_pub_key = "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDQJ4h39UYNi1wybxAH+jCFkNK2 # List available versions via: az aks get-versions --location westus2 -o table kubernetes_version = "1.28.3" -core_node_pool = { - name : "core", - kubernetes_version : "1.28.3", - - # FIXME: transition to "Standard_E2s_v5" nodes as they are large enough and - # can more cheaply handle being forced to have 2-3 replicas for silly - # reasons like three calico-typha pods. 
See - # https://github.com/2i2c-org/infrastructure/issues/3592#issuecomment-1883269632. - # - vm_size : "Standard_E4s_v3", - - # FIXME: stop using persistent disks for the nodes, use the variable default - # "Temporary" instead - kubelet_disk_type : "OS", - - # FIXME: use a larger os_disk_size_gb than 40, like the default of 100, to - # avoid running low when few replicas are used - os_disk_size_gb : 40, - - # FIXME: its nice to use autoscaling, but we end up with three replicas due to - # https://github.com/2i2c-org/infrastructure/issues/3592#issuecomment-1883269632 - # and its a waste at least using Standard_E4s_v3 machines. - enable_auto_scaling : false, - node_count : 2, -} - -user_node_pools = { - "usere8sv5" : { - min : 0, - max : 100, - vm_size : "Standard_E8s_v5", - }, +node_pools = { + core : [ + { + name : "core", + + # FIXME: transition to "Standard_E2s_v5" nodes as they are large enough and + # can more cheaply handle being forced to have 2-3 replicas for silly + # reasons like three calico-typha pods. See + # https://github.com/2i2c-org/infrastructure/issues/3592#issuecomment-1883269632. 
+ # + vm_size : "Standard_E4s_v3", + + os_disk_size_gb : 40, + + # FIXME: stop using persistent disks for the nodes, use the variable default + # "Temporary" instead + kubelet_disk_type : "OS", + + min : 1, + max : 10, + }, + ], + + user : [ + { + name : "usere8sv5", + vm_size : "Standard_E8s_v5", + os_disk_size_gb : 200, + min : 0, + max : 100, + }, + ], + + dask : [] } diff --git a/terraform/azure/variables.tf b/terraform/azure/variables.tf index dda5c1fab6..afeee2db6e 100644 --- a/terraform/azure/variables.tf +++ b/terraform/azure/variables.tf @@ -78,50 +78,25 @@ variable "ssh_pub_key" { EOT } -variable "core_node_pool" { - type = object({ - name : optional(string, ""), - enable_auto_scaling = optional(bool, true), - min : optional(number, 1), - max : optional(number, 10), - node_count : optional(number), +variable "node_pools" { + type = map(list(object({ + name : string, vm_size : string, - labels : optional(map(string), {}), - taints : optional(list(string), []), os_disk_size_gb : optional(number, 100), - kubernetes_version : optional(string, ""), kubelet_disk_type : optional(string, "Temporary"), - }) - description = "Core node pool" -} - -variable "user_node_pools" { - type = map(object({ - name : optional(string, ""), min : number, max : number, - vm_size : string, labels : optional(map(string), {}), taints : optional(list(string), []), - os_disk_size_gb : optional(number, 200), kubernetes_version : optional(string, ""), - kubelet_disk_type : optional(string, "Temporary"), - })) - description = "User node pools to create" - default = {} -} + }))) + description = <<-EOT + Node pools to create to be listed under the keys 'core', 'user', and 'dask'. 
-variable "dask_node_pools" { - type = map(object({ - min : number, - max : number, - vm_size : string, - labels : optional(map(string), {}), - taints : optional(list(string), []), - kubernetes_version : optional(string, ""), - })) - description = "Dask node pools to create" - default = {} + There should be exactly one core node pool. The core node pool is given a + special treatment by being listed directly in the cluster resource's + 'default_node_pool' field. + EOT } variable "create_service_principal" { From d6d89f53721960baf2c257051e360d17ccc25b7e Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Wed, 10 Jan 2024 09:51:21 +0100 Subject: [PATCH 7/8] utoronto, terraform: refine comments slightly --- terraform/azure/projects/utoronto.tfvars | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/terraform/azure/projects/utoronto.tfvars b/terraform/azure/projects/utoronto.tfvars index e3c3bb8daa..f7888250e5 100644 --- a/terraform/azure/projects/utoronto.tfvars +++ b/terraform/azure/projects/utoronto.tfvars @@ -1,3 +1,11 @@ +# IMPORTANT: Due to a restrictive network rule from storage.tf, we can't perform +# "terraform plan" or "terraform apply" without a workaround. +# +# One known workaround is to allow your public IP temporarily as +# discussed in https://github.com/2i2c-org/infrastructure/issues/890#issuecomment-1879072422. +# This workaround is problematic as that may temporarily allow access +# to storage by other actors with the same IP. +# tenant_id = "78aac226-2f03-4b4d-9037-b46d56c55210" subscription_id = "ead3521a-d994-4a44-a68d-b16e35642d5b" resourcegroup_name = "2i2c-utoronto-cluster" @@ -16,17 +24,20 @@ node_pools = { { name : "core", - # FIXME: transition to "Standard_E2s_v5" nodes as they are large enough and - # can more cheaply handle being forced to have 2-3 replicas for silly - # reasons like three calico-typha pods. See - # https://github.com/2i2c-org/infrastructure/issues/3592#issuecomment-1883269632. 
+ # FIXME: Transition to "Standard_E2s_v5" nodes as they are large enough + # for the biggest workload (prometheus-server) and can handle high + # availability requirements better. + # + # We are currently forced to handle three calico-typha pods that + # can't schedule on the same node, see https://github.com/2i2c-org/infrastructure/issues/3592#issuecomment-1883269632. # vm_size : "Standard_E4s_v3", + # core nodes don't need much disk space os_disk_size_gb : 40, - # FIXME: stop using persistent disks for the nodes, use the variable default - # "Temporary" instead + # FIXME: Stop using persistent disks for the nodes, use the variable default + # "Temporary" instead by removing this line. kubelet_disk_type : "OS", min : 1, From d0ad3a226d6ba25bdaecfef40ec13bdf1ea3185c Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Thu, 11 Jan 2024 08:54:10 +0100 Subject: [PATCH 8/8] terraform, azure: add node_pools variable validation --- terraform/azure/projects/utoronto.tfvars | 2 +- terraform/azure/variables.tf | 41 +++++++++++++++++------- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/terraform/azure/projects/utoronto.tfvars b/terraform/azure/projects/utoronto.tfvars index f7888250e5..6a552efa24 100644 --- a/terraform/azure/projects/utoronto.tfvars +++ b/terraform/azure/projects/utoronto.tfvars @@ -55,5 +55,5 @@ node_pools = { }, ], - dask : [] + dask : [], } diff --git a/terraform/azure/variables.tf b/terraform/azure/variables.tf index afeee2db6e..85856ea2e3 100644 --- a/terraform/azure/variables.tf +++ b/terraform/azure/variables.tf @@ -79,17 +79,21 @@ variable "ssh_pub_key" { } variable "node_pools" { - type = map(list(object({ - name : string, - vm_size : string, - os_disk_size_gb : optional(number, 100), - kubelet_disk_type : optional(string, "Temporary"), - min : number, - max : number, - labels : optional(map(string), {}), - taints : optional(list(string), []), - kubernetes_version : optional(string, ""), - }))) + type = map( + list( + object({ + 
name : string, + vm_size : string, + os_disk_size_gb : optional(number, 100), + kubelet_disk_type : optional(string, "Temporary"), + min : number, + max : number, + labels : optional(map(string), {}), + taints : optional(list(string), []), + kubernetes_version : optional(string, ""), + }) + ) + ) description = <<-EOT Node pools to create to be listed under the keys 'core', 'user', and 'dask'. @@ -97,6 +101,21 @@ variable "node_pools" { special treatment by being listed directly in the cluster resource's 'default_node_pool' field. EOT + + validation { + condition = length(var.node_pools["core"]) == 1 + error_message = "The core node pool is mapped to the cluster resource's `default_node_pool`, due to this we require exactly one core node pool to be specified." + } + + validation { + condition = length(setsubtract(keys(var.node_pools), ["core", "user", "dask"])) == 0 + error_message = "Only three kinds of node pools supported: 'core', 'user', and 'dask'." + } + + validation { + condition = length(setintersection(keys(var.node_pools), ["core", "user", "dask"])) == 3 + error_message = "All three kinds of node pools ('core', 'user', and 'dask') must be declared, even if they are empty lists of node pools." + } } variable "create_service_principal" {