Skip to content

Commit

Permalink
Merge pull request #3406 from consideRatio/pr/opt-out-node-purpose-in…
Browse files Browse the repository at this point in the history
…fra-labels

terraform, gcp: add opt-out var to collaboratively and opportunistically apply node-purpose infra labels
  • Loading branch information
consideRatio authored Nov 13, 2023
2 parents 889f343 + f152e57 commit 77ef03a
Show file tree
Hide file tree
Showing 7 changed files with 74 additions and 17 deletions.
6 changes: 3 additions & 3 deletions terraform/gcp/cluster.tf
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ resource "google_container_node_pool" "core" {
# Faster disks provide faster image pulls!
disk_type = "pd-balanced"

resource_labels = {
resource_labels = var.temp_opt_out_node_purpose_label_core_nodes ? {} : {
"node-purpose" : "core"
}

Expand Down Expand Up @@ -340,7 +340,7 @@ resource "google_container_node_pool" "notebook" {
"https://www.googleapis.com/auth/cloud-platform"
]

resource_labels = merge({
resource_labels = each.value.temp_opt_out_node_purpose_label ? each.value.resource_labels : merge({
"node-purpose" : "notebook"
}, each.value.resource_labels)

Expand Down Expand Up @@ -422,7 +422,7 @@ resource "google_container_node_pool" "dask_worker" {
"https://www.googleapis.com/auth/cloud-platform"
]

resource_labels = merge({
resource_labels = each.value.temp_opt_out_node_purpose_label ? each.value.resource_labels : merge({
"node-purpose" : "dask-worker"
}, each.value.resource_labels)

Expand Down
9 changes: 3 additions & 6 deletions terraform/gcp/projects/catalystproject-latam.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,17 @@ filestore_capacity_gb = 1024
core_node_machine_type = "n2-highmem-2"

notebook_nodes = {
# FIXME: Rename this to "n2-highmem-4" when given the chance and no such nodes are running
"small" : {
"n2-highmem-4" : {
min : 0,
max : 100,
machine_type : "n2-highmem-4",
},
# FIXME: Rename this to "n2-highmem-16" when given the chance and no such nodes are running
"medium" : {
"n2-highmem-16" : {
min : 0,
max : 100,
machine_type : "n2-highmem-16",
},
# FIXME: Rename this to "n2-highmem-64" when given the chance and no such nodes are running
"large" : {
"n2-highmem-64" : {
min : 0,
max : 100,
machine_type : "n2-highmem-64",
Expand Down
17 changes: 11 additions & 6 deletions terraform/gcp/projects/cloudbank.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,27 @@ k8s_versions = {
notebook_nodes_version : "1.26.4-gke.1400",
}

# FIXME: We have a temporary core node pool setup with n2-highmem-4 and
# pd-balanced. This node pool still has standard though, but has been
# cordoned.
#
# FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be
# done. See https://github.com/2i2c-org/infrastructure/issues/3405.
temp_opt_out_node_purpose_label_core_nodes = true

# FIXME: Transition to n2-highmem-4 when possible
core_node_machine_type = "n1-highmem-4"
enable_network_policy = true

enable_filestore = true
filestore_capacity_gb = 1024

notebook_nodes = {
# FIXME: Rename this to "n2-highmem-4" when given the chance and no such nodes are running
# FIXME: Update the machine type to "n2-highmem-4" and rename this node pool
# when given the chance and no such nodes are running.
# FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be
# done. See https://github.com/2i2c-org/infrastructure/issues/3405.
"user" : {
min : 0,
max : 100,
machine_type : "n1-highmem-4",
temp_opt_out_node_purpose_label = true
},
"n2-highmem-16" : {
min : 0,
Expand All @@ -46,7 +51,7 @@ notebook_nodes = {
# node pool, see https://github.com/2i2c-org/infrastructure/issues/2687.
#
dask_nodes = {
"worker" : {
"n2-highmem-16" : {
min : 0,
max : 100,
machine_type : "n2-highmem-16"
Expand Down
19 changes: 19 additions & 0 deletions terraform/gcp/projects/leap.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ k8s_versions = {
dask_nodes_version : "1.27.4-gke.900",
}

# FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be
# done. See https://github.com/2i2c-org/infrastructure/issues/3405.
temp_opt_out_node_purpose_label_core_nodes = true

# GPUs not available in us-central1-b
zone = "us-central1-c"
region = "us-central1"
Expand Down Expand Up @@ -69,32 +73,44 @@ hub_cloud_permissions = {

# Setup notebook node pools
notebook_nodes = {
# FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be
# done. See https://github.com/2i2c-org/infrastructure/issues/3405.
"n2-highmem-4" : {
min : 0,
max : 100,
machine_type : "n2-highmem-4",
temp_opt_out_node_purpose_label : true,
},
# FIXME: Rename this to "n2-highmem-16" when given the chance and no such nodes are running
# FIXME: Remove node pool specific node_version pin when given the chance and no such nodes are running
# FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be
# done. See https://github.com/2i2c-org/infrastructure/issues/3405.
"medium" : {
# A minimum of one is configured for LEAP to ensure quick startups at all
# times. Optimizing startup times matters more here than reducing cost.
min : 1,
max : 100,
machine_type : "n2-highmem-16",
node_version : "1.25.6-gke.1000",
temp_opt_out_node_purpose_label : true
},
# FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be
# done. See https://github.com/2i2c-org/infrastructure/issues/3405.
"n2-highmem-64" : {
min : 0,
max : 100,
machine_type : "n2-highmem-64"
temp_opt_out_node_purpose_label : true
}
# FIXME: Remove node pool specific node_version pin when given the chance and no such nodes are running
# FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be
# done. See https://github.com/2i2c-org/infrastructure/issues/3405.
"gpu-t4" : {
min : 0,
max : 100,
machine_type : "n1-standard-8",
node_version : "1.25.6-gke.1000",
temp_opt_out_node_purpose_label : true
gpu : {
enabled : true,
type : "nvidia-tesla-t4",
Expand All @@ -117,6 +133,8 @@ notebook_nodes = {
# node pool, see https://github.com/2i2c-org/infrastructure/issues/2687.
#
dask_nodes = {
# FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be
# done. See https://github.com/2i2c-org/infrastructure/issues/3405.
"n2-highmem-16" : {
min : 0,
max : 200,
Expand All @@ -125,5 +143,6 @@ dask_nodes = {
# See https://github.com/2i2c-org/infrastructure/issues/2396
preemptible : false,
machine_type : "n2-highmem-16"
temp_opt_out_node_purpose_label : true
},
}
20 changes: 18 additions & 2 deletions terraform/gcp/projects/pangeo-hubs.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,19 @@
#
# terraform apply --var-file projects/pangeo-hubs.tfvars
#
# FIXME: core_node_machine_type should be set to n2-highmem-4 as it's enough
prefix = "pangeo-hubs"
project_id = "pangeo-integration-te-3eea"
billing_project_id = "pangeo-integration-te-3eea"
zone = "us-central1-b"
region = "us-central1"
core_node_machine_type = "n2-highmem-8"
enable_private_cluster = true

# FIXME: core_node_machine_type should be set to n2-highmem-4 as it's enough
# FIXME: Remove temp_opt_out_node_purpose_label_core_nodes when a node upgrade can be
# done. See https://github.com/2i2c-org/infrastructure/issues/3405.
core_node_machine_type = "n2-highmem-8"
temp_opt_out_node_purpose_label_core_nodes = true

k8s_versions = {
min_master_version : "1.26.5-gke.2100",
core_nodes_version : "1.26.4-gke.1400",
Expand Down Expand Up @@ -58,40 +62,49 @@ user_buckets = {

# Setup notebook node pools
notebook_nodes = {
# FIXME: Remove temp_opt_out_node_purpose_label from every node pool in this
# map when a node upgrade can be done. See
# https://github.com/2i2c-org/infrastructure/issues/3405.
"n2-highmem-4" : {
min : 0,
max : 100,
machine_type : "n2-highmem-4",
temp_opt_out_node_purpose_label : true,
},
"n2-highmem-16" : {
min : 0,
max : 100,
machine_type : "n2-highmem-16",
temp_opt_out_node_purpose_label : true,
},
"n2-highmem-64" : {
min : 0,
max : 100,
machine_type : "n2-highmem-64",
temp_opt_out_node_purpose_label : true,
},
"small" : {
min : 0,
max : 100,
machine_type : "n1-standard-2",
temp_opt_out_node_purpose_label : true,
},
"medium" : {
min : 0,
max : 100,
machine_type : "n1-standard-4",
temp_opt_out_node_purpose_label : true,
},
"large" : {
min : 0,
max : 100,
machine_type : "n1-standard-8",
temp_opt_out_node_purpose_label : true,
},
"huge" : {
min : 0,
max : 100,
machine_type : "n1-standard-16",
temp_opt_out_node_purpose_label : true,
},
}

Expand All @@ -102,10 +115,13 @@ notebook_nodes = {
#
dask_nodes = {
# FIXME: Rename this to "n2-highmem-16" when given the chance and no such nodes are running
# FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be
# done. See https://github.com/2i2c-org/infrastructure/issues/3405.
"worker" : {
min : 0,
max : 100,
machine_type : "n2-highmem-16",
temp_opt_out_node_purpose_label : true,
},
}

Expand Down
7 changes: 7 additions & 0 deletions terraform/gcp/projects/qcl.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ k8s_versions = {
notebook_nodes_version : "1.27.4-gke.900",
}

# FIXME: Remove temp_opt_out_node_purpose_label_core_nodes when a node upgrade can be
# done. See https://github.com/2i2c-org/infrastructure/issues/3405.
temp_opt_out_node_purpose_label_core_nodes = true

core_node_machine_type = "n2-highmem-2"
enable_network_policy = true

Expand All @@ -27,10 +31,13 @@ user_buckets = {
}

notebook_nodes = {
# FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be
# done. See https://github.com/2i2c-org/infrastructure/issues/3405.
"n2-highmem-4" : {
min : 0,
max : 100,
machine_type : "n2-highmem-4",
temp_opt_out_node_purpose_label : true,
},
"n2-highmem-16" : {
min : 0,
Expand Down
13 changes: 13 additions & 0 deletions terraform/gcp/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ variable "notebook_nodes" {
}),
{}
),
# FIXME: Remove temp_opt_out_node_purpose_label when it's no longer referenced.
# See https://github.com/2i2c-org/infrastructure/issues/3405.
temp_opt_out_node_purpose_label : optional(bool, false),
resource_labels : optional(map(string), {}),
zones : optional(list(string), []),
node_version : optional(string, ""),
Expand Down Expand Up @@ -125,6 +128,9 @@ variable "dask_nodes" {
}),
{}
),
# FIXME: Remove temp_opt_out_node_purpose_label when it's no longer referenced.
# See https://github.com/2i2c-org/infrastructure/issues/3405.
temp_opt_out_node_purpose_label : optional(bool, false),
resource_labels : optional(map(string), {}),
zones : optional(list(string), [])
}))
Expand Down Expand Up @@ -223,6 +229,13 @@ variable "core_node_max_count" {
EOT
}

# FIXME: Remove temp_opt_out_node_purpose_label_core_nodes when it's no longer referenced.
# See https://github.com/2i2c-org/infrastructure/issues/3405.
variable "temp_opt_out_node_purpose_label_core_nodes" {
  type        = bool
  default     = false
  description = <<-EOT
  Temporary opt-out flag for the core node pool's "node-purpose" resource label.

  When false (the default), the core node pool's resource_labels is set to
  { "node-purpose" : "core" }. When true, resource_labels is set to an empty
  map instead, so the label is not applied.

  This is meant to be set only for clusters that cannot yet take the label,
  and removed again once they can; see
  https://github.com/2i2c-org/infrastructure/issues/3405.
  EOT
}

variable "enable_network_policy" {
type = bool
default = false
Expand Down

0 comments on commit 77ef03a

Please sign in to comment.