From 5000f06168e98b152f8ee0a1b298981ecfc00af6 Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Wed, 26 Jun 2024 09:59:45 -0500 Subject: [PATCH 01/25] save progress --- src/_nebari/stages/infrastructure/__init__.py | 71 +++++++++++++------ .../template/gcp/modules/kubernetes/main.tf | 17 +++++ 2 files changed, 67 insertions(+), 21 deletions(-) diff --git a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index 8b188a720b..e7b1b16056 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ b/src/_nebari/stages/infrastructure/__init__.py @@ -1,4 +1,5 @@ import contextlib +import enum import inspect import os import pathlib @@ -41,10 +42,36 @@ class ExistingInputVars(schema.Base): kube_context: str -class DigitalOceanNodeGroup(schema.Base): +# TODO: Make sure the taint is actually applied to the nodes for each provider +class taintEffectEnum(str, enum.Enum): + NoSchedule: str = "NoSchedule" + PreferNoSchedule: str = "PreferNoSchedule" + NoExecute: str = "NoExecute" + + +class Taint(schema.Base): + key: str + value: str + effect: taintEffectEnum + + +class NodeGroup(schema.Base): instance: str - min_nodes: int - max_nodes: int + min_nodes: Annotated[int, Field(ge=0)] = 0 + max_nodes: Annotated[int, Field(ge=1)] = 1 + taints: Optional[List[Taint]] = [] + + @field_validator("taints", mode="before") + def validate_taint_strings(cls, value: List[str]): + TAINT_STR_REGEX = re.compile(r"(\w+)=(\w+):(\w+)") + parsed_taints = [] + for taint in value: + match = TAINT_STR_REGEX.match(taint) + if not match: + raise ValueError(f"Invalid taint string: {taint}") + key, value, effect = match.groups() + parsed_taints.append(Taint(key=key, value=value, effect=effect)) + return parsed_taints class DigitalOceanInputVars(schema.Base): @@ -53,7 +80,7 @@ class DigitalOceanInputVars(schema.Base): region: str tags: List[str] kubernetes_version: str - node_groups: Dict[str, DigitalOceanNodeGroup] + node_groups: Dict[str, "DigitalOceanNodeGroup"] kubeconfig_filename: str = get_kubeconfig_filename() @@ -62,6 +89,7 @@ class GCPNodeGroupInputVars(schema.Base): instance_type: str min_size: int max_size: int + node_taints: None | List[Taint] labels: Dict[str, str] preemptible: bool guest_accelerators: List["GCPGuestAccelerator"] @@ -211,16 +239,14 @@ class KeyValueDict(schema.Base): value: str -class DigitalOceanNodeGroup(schema.Base): +class DigitalOceanNodeGroup(NodeGroup): """Representation of a node group with Digital Ocean - Kubernetes limits: https://docs.digitalocean.com/products/kubernetes/details/limits/ - Available instance types: https://slugs.do-api.dev/ """ - instance: str min_nodes: Annotated[int, Field(ge=1)] = 1 - max_nodes: Annotated[int, Field(ge=1)] = 1 DEFAULT_DO_NODE_GROUPS = { @@ -305,10 +331,7 @@ class GCPGuestAccelerator(schema.Base): count: Annotated[int, Field(ge=1)] = 1 -class GCPNodeGroup(schema.Base): - instance: str - min_nodes: Annotated[int, Field(ge=0)] = 0 - max_nodes: Annotated[int, Field(ge=1)] = 1 +class GCPNodeGroup(NodeGroup): preemptible: bool = False labels: Dict[str, str] = {} guest_accelerators: List[GCPGuestAccelerator] = [] @@ -316,8 +339,18 @@ class GCPNodeGroup(schema.Base): DEFAULT_GCP_NODE_GROUPS = { "general": GCPNodeGroup(instance="e2-highmem-4", min_nodes=1, max_nodes=1), - "user": GCPNodeGroup(instance="e2-standard-4", min_nodes=0, max_nodes=5), - "worker": GCPNodeGroup(instance="e2-standard-4", min_nodes=0, max_nodes=5), + "user": GCPNodeGroup( + instance="e2-standard-4", + 
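+        # dedicated=user:NoSchedule (below) keeps non-user pods off this pool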
min_nodes=0, + max_nodes=5, + taints=[Taint(key="dedicated", value="user", effect="NoSchedule")], + ), + "worker": GCPNodeGroup( + instance="e2-standard-4", + min_nodes=0, + max_nodes=5, + taints=[Taint(key="dedicated", value="worker", effect="NoSchedule")], + ), } @@ -355,10 +388,8 @@ def _check_input(cls, data: Any) -> Any: return data -class AzureNodeGroup(schema.Base): - instance: str - min_nodes: int - max_nodes: int +class AzureNodeGroup(NodeGroup): + pass DEFAULT_AZURE_NODE_GROUPS = { @@ -426,10 +457,7 @@ def _validate_tags(cls, value: Optional[Dict[str, str]]) -> Dict[str, str]: return value if value is None else azure_cloud.validate_tags(value) -class AWSNodeGroup(schema.Base): - instance: str - min_nodes: int = 0 - max_nodes: int +class AWSNodeGroup(NodeGroup): gpu: bool = False single_subnet: bool = False permissions_boundary: Optional[str] = None @@ -738,6 +766,7 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]): instance_type=node_group.instance, min_size=node_group.min_nodes, max_size=node_group.max_nodes, + node_taints=node_group.taints, preemptible=node_group.preemptible, guest_accelerators=node_group.guest_accelerators, ) diff --git a/src/_nebari/stages/infrastructure/template/gcp/modules/kubernetes/main.tf b/src/_nebari/stages/infrastructure/template/gcp/modules/kubernetes/main.tf index c4b18f32ad..751269562c 100644 --- a/src/_nebari/stages/infrastructure/template/gcp/modules/kubernetes/main.tf +++ b/src/_nebari/stages/infrastructure/template/gcp/modules/kubernetes/main.tf @@ -92,6 +92,23 @@ resource "google_container_node_pool" "main" { oauth_scopes = local.node_group_oauth_scopes + dynamic "taint" { + for_each = local.merged_node_groups[count.index].taints + content { + key = taint.value["key"] + value = taint.value["value"] + effect = taint.value["effect"] + } + } + # taint = [ + # # Do this for every taint in taints + # { + # "key": local.merged_node_groups[count.index].taints[0]['key'], + # "value": local.merged_node_groups[count.index].taints[0]['value'], + # "effect": local.merged_node_groups[count.index].taints[0]['effect']} + # }, + # ] + metadata = { disable-legacy-endpoints = "true" } From a661514ed6c8c250ba3e98fe6364e6c92ff52c12 Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Fri, 16 Aug 2024 17:35:27 -0500 Subject: [PATCH 02/25] fix node taint check --- src/_nebari/stages/infrastructure/__init__.py | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index 04c01605cb..5e2e64292e 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ b/src/_nebari/stages/infrastructure/__init__.py @@ -62,15 +62,25 @@ class NodeGroup(schema.Base): taints: Optional[List[Taint]] = [] @field_validator("taints", mode="before") - def validate_taint_strings(cls, value: List[str]): + def validate_taint_strings(cls, value: List[str | Taint]): TAINT_STR_REGEX = re.compile(r"(\w+)=(\w+):(\w+)") parsed_taints = [] for taint in value: - match = TAINT_STR_REGEX.match(taint) - if not match: - raise ValueError(f"Invalid taint string: {taint}") - key, value, effect = match.groups() - parsed_taints.append(Taint(key=key, value=value, effect=effect)) + if not isinstance(taint, (str, Taint)): + raise ValueError( + f"Unable to parse type: {type(taint)} as taint. Must be a string or Taint object." 
+ ) + + if isinstance(taint, Taint): + parsed_taint = taint + elif isinstance(taint, str): + match = TAINT_STR_REGEX.match(taint) + if not match: + raise ValueError(f"Invalid taint string: {taint}") + key, value, effect = match.groups() + parsed_taint = Taint(key=key, value=value, effect=effect) + parsed_taints.append(parsed_taint) + return parsed_taints From 7f1800d3118de9fc21020ee02870cba501389945 Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Mon, 19 Aug 2024 12:49:55 -0500 Subject: [PATCH 03/25] fix node taints on gcp --- .../template/gcp/modules/kubernetes/main.tf | 8 ++++---- .../template/gcp/modules/kubernetes/variables.tf | 3 +++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/_nebari/stages/infrastructure/template/gcp/modules/kubernetes/main.tf b/src/_nebari/stages/infrastructure/template/gcp/modules/kubernetes/main.tf index 751269562c..caba36af0a 100644 --- a/src/_nebari/stages/infrastructure/template/gcp/modules/kubernetes/main.tf +++ b/src/_nebari/stages/infrastructure/template/gcp/modules/kubernetes/main.tf @@ -93,11 +93,11 @@ resource "google_container_node_pool" "main" { oauth_scopes = local.node_group_oauth_scopes dynamic "taint" { - for_each = local.merged_node_groups[count.index].taints + for_each = local.merged_node_groups[count.index].node_taints content { - key = taint.value["key"] - value = taint.value["value"] - effect = taint.value["effect"] + key = each.value["key"] + value = each.value["value"] + effect = each.value["effect"] } } # taint = [ diff --git a/src/_nebari/stages/infrastructure/template/gcp/modules/kubernetes/variables.tf b/src/_nebari/stages/infrastructure/template/gcp/modules/kubernetes/variables.tf index cef5363030..296b5c3188 100644 --- a/src/_nebari/stages/infrastructure/template/gcp/modules/kubernetes/variables.tf +++ b/src/_nebari/stages/infrastructure/template/gcp/modules/kubernetes/variables.tf @@ -50,6 +50,7 @@ variable "node_groups" { min_size = 1 max_size = 1 labels = {} + node_taints = [] }, { name = "user" @@ -57,6 +58,7 @@ variable "node_groups" { min_size = 0 max_size = 2 labels = {} + node_taints = [] # TODO: Do this for other cloud providers }, { name = "worker" @@ -64,6 +66,7 @@ variable "node_groups" { min_size = 0 max_size = 5 labels = {} + node_taints = [] } ] } From 40940f6406fa193196cb9097a02a6c63d803cbe5 Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Mon, 19 Aug 2024 17:14:12 -0500 Subject: [PATCH 04/25] add latest changes --- src/_nebari/stages/infrastructure/__init__.py | 45 ++++++++++--------- .../template/gcp/modules/kubernetes/main.tf | 14 ++---- .../stages/kubernetes_services/__init__.py | 22 +++++++++ .../template/jupyterhub.tf | 11 +++++ .../files/jupyterhub/03-profiles.py | 20 +++++++++ .../kubernetes/services/jupyterhub/main.tf | 1 + .../services/jupyterhub/variables.tf | 10 +++++ src/nebari/schema.py | 13 ++++++ 8 files changed, 103 insertions(+), 33 deletions(-) diff --git a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index 5e2e64292e..0f34632d37 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ b/src/_nebari/stages/infrastructure/__init__.py @@ -1,5 +1,4 @@ import contextlib -import enum import inspect import os import pathlib @@ -42,43 +41,30 @@ class ExistingInputVars(schema.Base): kube_context: str -# TODO: Make sure the taint is actually applied to the nodes for each provider -class taintEffectEnum(str, enum.Enum): - NoSchedule: str = 
"NoSchedule" - PreferNoSchedule: str = "PreferNoSchedule" - NoExecute: str = "NoExecute" - - -class Taint(schema.Base): - key: str - value: str - effect: taintEffectEnum - - class NodeGroup(schema.Base): instance: str min_nodes: Annotated[int, Field(ge=0)] = 0 max_nodes: Annotated[int, Field(ge=1)] = 1 - taints: Optional[List[Taint]] = [] + taints: Optional[List[schema.Taint]] = [] @field_validator("taints", mode="before") - def validate_taint_strings(cls, value: List[str | Taint]): + def validate_taint_strings(cls, value: List[str | schema.Taint]): TAINT_STR_REGEX = re.compile(r"(\w+)=(\w+):(\w+)") parsed_taints = [] for taint in value: - if not isinstance(taint, (str, Taint)): + if not isinstance(taint, (str, schema.Taint)): raise ValueError( f"Unable to parse type: {type(taint)} as taint. Must be a string or Taint object." ) - if isinstance(taint, Taint): + if isinstance(taint, schema.Taint): parsed_taint = taint elif isinstance(taint, str): match = TAINT_STR_REGEX.match(taint) if not match: raise ValueError(f"Invalid taint string: {taint}") key, value, effect = match.groups() - parsed_taint = Taint(key=key, value=value, effect=effect) + parsed_taint = schema.Taint(key=key, value=value, effect=effect) parsed_taints.append(parsed_taint) return parsed_taints @@ -99,11 +85,26 @@ class GCPNodeGroupInputVars(schema.Base): instance_type: str min_size: int max_size: int - node_taints: None | List[Taint] + node_taints: List[dict] labels: Dict[str, str] preemptible: bool guest_accelerators: List["GCPGuestAccelerator"] + @field_validator("node_taints", mode="before") + def convert_taints(cls, value: Optional[List[schema.Taint]]): + return [ + dict( + key=taint.key, + value=taint.value, + effect={ + schema.TaintEffectEnum.NoSchedule: "NO_SCHEDULE", + schema.TaintEffectEnum.PreferNoSchedule: "PREFER_NO_SCHEDULE", + schema.TaintEffectEnum.NoExecute: "NO_EXECUTE", + }[taint.effect], + ) + for taint in value + ] + class GCPPrivateClusterConfig(schema.Base): enable_private_nodes: bool @@ -353,13 +354,13 @@ class GCPNodeGroup(NodeGroup): instance="e2-standard-4", min_nodes=0, max_nodes=5, - taints=[Taint(key="dedicated", value="user", effect="NoSchedule")], + taints=[schema.Taint(key="dedicated", value="user", effect="NoSchedule")], ), "worker": GCPNodeGroup( instance="e2-standard-4", min_nodes=0, max_nodes=5, - taints=[Taint(key="dedicated", value="worker", effect="NoSchedule")], + taints=[schema.Taint(key="dedicated", value="worker", effect="NoSchedule")], ), } diff --git a/src/_nebari/stages/infrastructure/template/gcp/modules/kubernetes/main.tf b/src/_nebari/stages/infrastructure/template/gcp/modules/kubernetes/main.tf index caba36af0a..989f8e813f 100644 --- a/src/_nebari/stages/infrastructure/template/gcp/modules/kubernetes/main.tf +++ b/src/_nebari/stages/infrastructure/template/gcp/modules/kubernetes/main.tf @@ -95,19 +95,11 @@ resource "google_container_node_pool" "main" { dynamic "taint" { for_each = local.merged_node_groups[count.index].node_taints content { - key = each.value["key"] - value = each.value["value"] - effect = each.value["effect"] + key = taint.value.key + value = taint.value.value + effect = taint.value.effect } } - # taint = [ - # # Do this for every taint in taints - # { - # "key": local.merged_node_groups[count.index].taints[0]['key'], - # "value": local.merged_node_groups[count.index].taints[0]['value'], - # "effect": local.merged_node_groups[count.index].taints[0]['effect']} - # }, - # ] metadata = { disable-legacy-endpoints = "true" diff --git 
a/src/_nebari/stages/kubernetes_services/__init__.py b/src/_nebari/stages/kubernetes_services/__init__.py index 206a483251..de8066c8dd 100644 --- a/src/_nebari/stages/kubernetes_services/__init__.py +++ b/src/_nebari/stages/kubernetes_services/__init__.py @@ -382,6 +382,19 @@ class CondaStoreInputVars(schema.Base): ) +class TolerationOperatorEnum(str, enum.Enum): + Equal = "Equal" + Exists = "Exists" + + @classmethod + def to_yaml(cls, representer, node): + return representer.represent_str(node.value) + + +class Toleration(schema.Taint): + operator: TolerationOperatorEnum = TolerationOperatorEnum.Equal + + class JupyterhubInputVars(schema.Base): jupyterhub_theme: Dict[str, Any] = Field(alias="jupyterhub-theme") jupyterlab_image: ImageNameTag = Field(alias="jupyterlab-image") @@ -405,6 +418,9 @@ class JupyterhubInputVars(schema.Base): jhub_apps_enabled: bool = Field(alias="jhub-apps-enabled") cloud_provider: str = Field(alias="cloud-provider") jupyterlab_preferred_dir: Optional[str] = Field(alias="jupyterlab-preferred-dir") + node_taint_tolerations: Optional[List[Toleration]] = Field( + alias="node-taint-tolerations" + ) class DaskGatewayInputVars(schema.Base): @@ -565,6 +581,12 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]): jupyterlab_default_settings=self.config.jupyterlab.default_settings, jupyterlab_gallery_settings=self.config.jupyterlab.gallery_settings, jupyterlab_preferred_dir=self.config.jupyterlab.preferred_dir, + node_taint_tolerations=[ + Toleration(**taint.model_dump()) + for taint in self.config.google_cloud_platform.node_groups[ + "user" + ].taints + ], # TODO: support other cloud providers ) dask_gateway_vars = DaskGatewayInputVars( diff --git a/src/_nebari/stages/kubernetes_services/template/jupyterhub.tf b/src/_nebari/stages/kubernetes_services/template/jupyterhub.tf index 29450ddfad..285aa07e62 100644 --- a/src/_nebari/stages/kubernetes_services/template/jupyterhub.tf +++ b/src/_nebari/stages/kubernetes_services/template/jupyterhub.tf @@ -85,6 +85,16 @@ variable "idle-culler-settings" { type = any } +variable "node-taint-tolerations" { + description = "Node taint toleration" + type = list(object({ + key = string + operator = string + value = string + effect = string + })) +} + module "kubernetes-nfs-server" { count = var.jupyterhub-shared-endpoint == null ? 
1 : 0 @@ -137,6 +147,7 @@ module "jupyterhub" { conda-store-service-name = module.kubernetes-conda-store-server.service_name conda-store-jhub-apps-token = module.kubernetes-conda-store-server.service-tokens.jhub-apps jhub-apps-enabled = var.jhub-apps-enabled + node-taint-tolerations = var.node-taint-tolerations extra-mounts = { "/etc/dask" = { diff --git a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/jupyterhub/files/jupyterhub/03-profiles.py b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/jupyterhub/files/jupyterhub/03-profiles.py index 22193e79dc..1f7405c1a2 100644 --- a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/jupyterhub/files/jupyterhub/03-profiles.py +++ b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/jupyterhub/files/jupyterhub/03-profiles.py @@ -241,6 +241,25 @@ def base_profile_extra_mounts(): } +def node_taint_tolerations(): + tolerations = z2jh.get_config("custom.node-taint-tolerations") + + if not tolerations: + return {} + + return { + "tolerations": [ + { + "key": taint["key"], + "operator": taint["operator"], + "value": taint["value"], + "effect": taint["effect"], + } + for taint in tolerations + ] + } + + def configure_user_provisioned_repositories(username): # Define paths and configurations pvc_home_mount_path = f"home/{username}" @@ -519,6 +538,7 @@ def render_profile(profile, username, groups, keycloak_profilenames): configure_user(username, groups), configure_user_provisioned_repositories(username), profile_kubespawner_override, + node_taint_tolerations(), ], {}, ) diff --git a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/jupyterhub/main.tf b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/jupyterhub/main.tf index 8c310c5edb..5e02cfa4de 100644 --- a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/jupyterhub/main.tf +++ b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/jupyterhub/main.tf @@ -78,6 +78,7 @@ resource "helm_release" "jupyterhub" { conda-store-jhub-apps-token = var.conda-store-jhub-apps-token jhub-apps-enabled = var.jhub-apps-enabled initial-repositories = var.initial-repositories + node-taint-tolerations = var.node-taint-tolerations skel-mount = { name = kubernetes_config_map.etc-skel.metadata.0.name namespace = kubernetes_config_map.etc-skel.metadata.0.namespace diff --git a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/jupyterhub/variables.tf b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/jupyterhub/variables.tf index 21767723c9..7e859a6008 100644 --- a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/jupyterhub/variables.tf +++ b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/jupyterhub/variables.tf @@ -208,3 +208,13 @@ variable "initial-repositories" { type = string default = "[]" } + +variable "node-taint-tolerations" { + description = "Node taint toleration" + type = list(object({ + key = string + operator = string + value = string + effect = string + })) +} diff --git a/src/nebari/schema.py b/src/nebari/schema.py index 2cc1c1ea3f..6f453eaf5b 100644 --- a/src/nebari/schema.py +++ b/src/nebari/schema.py @@ -91,3 +91,16 @@ def is_version_accepted(v): for deployment with the current Nebari package. 
""" return Main.is_version_accepted(v) + + +# TODO: Make sure the taint is actually applied to the nodes for each provider +class TaintEffectEnum(str, enum.Enum): + NoSchedule: str = "NoSchedule" + PreferNoSchedule: str = "PreferNoSchedule" + NoExecute: str = "NoExecute" + + +class Taint(Base): + key: str + value: str + effect: TaintEffectEnum From 6382c7bfeb65d80bc7ad4eed6383b8d9fc666b79 Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Wed, 21 Aug 2024 17:09:19 -0500 Subject: [PATCH 05/25] allow daemonsets to run on user node group --- .../stages/kubernetes_services/__init__.py | 6 ++++++ .../kubernetes/services/monitoring/loki/main.tf | 16 ++++++++++++++++ .../kubernetes_services/template/rook-ceph.tf | 7 +++++++ 3 files changed, 29 insertions(+) diff --git a/src/_nebari/stages/kubernetes_services/__init__.py b/src/_nebari/stages/kubernetes_services/__init__.py index e072dfe44e..2a6f0f80cf 100644 --- a/src/_nebari/stages/kubernetes_services/__init__.py +++ b/src/_nebari/stages/kubernetes_services/__init__.py @@ -479,10 +479,16 @@ class JupyterhubInputVars(schema.Base): jhub_apps_enabled: bool = Field(alias="jhub-apps-enabled") cloud_provider: str = Field(alias="cloud-provider") jupyterlab_preferred_dir: Optional[str] = Field(alias="jupyterlab-preferred-dir") + shared_fs_type: SharedFsEnum node_taint_tolerations: Optional[List[Toleration]] = Field( alias="node-taint-tolerations" ) + @field_validator("jupyterhub_shared_storage", mode="before") + @classmethod + def handle_units(cls, value: Optional[str]) -> float: + return byte_unit_conversion(value, "GiB") + class DaskGatewayInputVars(schema.Base): dask_worker_image: ImageNameTag = Field(alias="dask-worker-image") diff --git a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/monitoring/loki/main.tf b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/monitoring/loki/main.tf index 8180d46fb8..3868de9cbf 100644 --- a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/monitoring/loki/main.tf +++ b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/monitoring/loki/main.tf @@ -96,6 +96,22 @@ resource "helm_release" "grafana-promtail" { values = concat([ file("${path.module}/values_promtail.yaml"), jsonencode({ + tolerations = [ + { + key = "node-role.kubernetes.io/master" + operator = "Exists" + effect = "NoSchedule" + }, + { + key = "node-role.kubernetes.io/control-plane" + operator = "Exists" + effect = "NoSchedule" + }, + { + operator = "Exists" + effect = "NoSchedule" + }, + ] }) ], var.grafana-promtail-overrides) diff --git a/src/_nebari/stages/kubernetes_services/template/rook-ceph.tf b/src/_nebari/stages/kubernetes_services/template/rook-ceph.tf index 1895d50d41..8f81d9de65 100644 --- a/src/_nebari/stages/kubernetes_services/template/rook-ceph.tf +++ b/src/_nebari/stages/kubernetes_services/template/rook-ceph.tf @@ -45,6 +45,13 @@ resource "helm_release" "rook-ceph" { }, csi = { enableRbdDriver = false, # necessary to provision block storage, but saves some cpu and memory if not needed + provisionerReplicas : 1, # default is 2 on different nodes + pluginTolerations = [ + { + operator = "Exists" + effect = "NoSchedule" + } + ], }, }) ], From e9d9dd98c8828c396867b2e5b3b96cabca348a95 Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Wed, 21 Aug 2024 18:23:35 -0500 Subject: [PATCH 06/25] recreate node groups when taints change --- 
.../infrastructure/template/gcp/modules/kubernetes/main.tf | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/_nebari/stages/infrastructure/template/gcp/modules/kubernetes/main.tf b/src/_nebari/stages/infrastructure/template/gcp/modules/kubernetes/main.tf
index 9b06a48882..182168fada 100644
--- a/src/_nebari/stages/infrastructure/template/gcp/modules/kubernetes/main.tf
+++ b/src/_nebari/stages/infrastructure/template/gcp/modules/kubernetes/main.tf
@@ -117,9 +117,4 @@ resource "google_container_node_pool" "main" {
     tags = var.tags
   }
-  lifecycle {
-    ignore_changes = [
-      node_config[0].taint
-    ]
-  }
 }

From c55cd5f7ecc8911ba93d6acea6cec29114ce5bd4 Mon Sep 17 00:00:00 2001
From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com>
Date: Wed, 21 Aug 2024 18:26:01 -0500
Subject: [PATCH 07/25] quick attempt to get scheduler running on tainted
 worker node group

---
 .../template/dask_gateway.tf           | 13 ++++++++
 .../services/dask-gateway/variables.tf  | 30 +++++++++----------
 2 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/src/_nebari/stages/kubernetes_services/template/dask_gateway.tf b/src/_nebari/stages/kubernetes_services/template/dask_gateway.tf
index a47acee8fa..b624bf9699 100644
--- a/src/_nebari/stages/kubernetes_services/template/dask_gateway.tf
+++ b/src/_nebari/stages/kubernetes_services/template/dask_gateway.tf
@@ -43,6 +43,19 @@ module "dask-gateway" {

   forwardauth_middleware_name = var.forwardauth_middleware_name

+  cluster = {
+    scheduler_extra_pod_config = {
+      tolerations = [
+        {
+          key = "dedicated"
+          operator = "Equal"
+          value = "adamworker"
+          effect = "NoSchedule"
+        }
+      ]
+    }
+  }
+
   depends_on = [
     module.kubernetes-nfs-server,
     module.rook-ceph
diff --git a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/variables.tf b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/variables.tf
index 121405a322..0b3fbcab35 100644
--- a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/variables.tf
+++ b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/variables.tf
@@ -130,23 +130,23 @@ variable "cluster" {
   description = "dask gateway cluster defaults"
   type = object({
     # scheduler configuration
-    scheduler_cores = number
-    scheduler_cores_limit = number
-    scheduler_memory = string
-    scheduler_memory_limit = string
-    scheduler_extra_container_config = any
-    scheduler_extra_pod_config = any
+    scheduler_cores = optional(number, 1)
+    scheduler_cores_limit = optional(number, 1)
+    scheduler_memory = optional(string, "2 G")
+    scheduler_memory_limit = optional(string, "2 G")
+    scheduler_extra_container_config = optional(any, {})
+    scheduler_extra_pod_config = optional(any, {})
     # worker configuration
-    worker_cores = number
-    worker_cores_limit = number
-    worker_memory = string
-    worker_memory_limit = string
-    worker_extra_container_config = any
-    worker_extra_pod_config = any
+    worker_cores = optional(number, 1)
+    worker_cores_limit = optional(number, 1)
+    worker_memory = optional(string, "2 G")
+    worker_memory_limit = optional(string, "2 G")
+    worker_extra_container_config = optional(any, {})
+    worker_extra_pod_config = optional(any, {})
     # additional fields
-    idle_timeout = number
-    image_pull_policy = string
-    environment = map(string)
+    idle_timeout = optional(number, 1800) # 30 minutes
+    image_pull_policy = optional(string, "IfNotPresent")
+    environment = optional(map(string), {})
   })
   default = {
     # scheduler configuration
From
a1370c939823899763b9d9b21fe9a492436d36b9 Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Fri, 25 Oct 2024 16:40:21 -0500 Subject: [PATCH 08/25] add default options to options_handler --- .../services/dask-gateway/files/gateway_config.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/files/gateway_config.py b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/files/gateway_config.py index c58e3aa90d..3b964a05af 100644 --- a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/files/gateway_config.py +++ b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/files/gateway_config.py @@ -15,7 +15,6 @@ def dask_gateway_config(path="/var/lib/dask-gateway/config.json"): config = dask_gateway_config() - c.DaskGateway.log_level = config["gateway"]["loglevel"] # Configure addresses @@ -227,18 +226,19 @@ def base_username_mount(username, uid=1000, gid=100): } -def worker_profile(options, user): +def options_handler(options, user): namespace, name = options.conda_environment.split("/") return functools.reduce( deep_merge, [ + {}, base_node_group(options), base_conda_store_mounts(namespace, name), base_username_mount(user.name), config["profiles"][options.profile], {"environment": {**options.environment_vars}}, + config["cluster"], ], - {}, ) @@ -279,7 +279,7 @@ def user_options(user): return Options( *args, - handler=worker_profile, + handler=options_handler, ) @@ -288,7 +288,7 @@ def user_options(user): # ============== utils ============ def deep_merge(d1, d2): - """Deep merge two dictionaries. + """Deep merge two dictionaries. Left argument takes precedence. 
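+
+    For example, a key present in both mappings keeps the left value:
+
+    >>> deep_merge({"a": 1}, {"a": 2, "b": 3}) == {"a": 1, "b": 3}
+    True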
>>> value_1 = { 'a': [1, 2], 'b': {'c': 1, 'z': [5, 6]}, From 0e7e11cc62670684a0cf5c5d4da1e86ed14ea200 Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Mon, 28 Oct 2024 09:33:23 -0500 Subject: [PATCH 09/25] add comments --- .../kubernetes/services/dask-gateway/files/gateway_config.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/files/gateway_config.py b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/files/gateway_config.py index 3b964a05af..b3f856d25f 100644 --- a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/files/gateway_config.py +++ b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/files/gateway_config.py @@ -231,13 +231,16 @@ def options_handler(options, user): return functools.reduce( deep_merge, [ + # ordering is higher to lower precedence {}, base_node_group(options), base_conda_store_mounts(namespace, name), base_username_mount(user.name), config["profiles"][options.profile], {"environment": {**options.environment_vars}}, - config["cluster"], + config[ + "cluster" + ], # TODO: potentially too broad, maybe just add scheduler/worker pod overrides ], ) From adb9d745fa8998bc5032a323c836c7b89309e2b8 Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Thu, 31 Oct 2024 10:33:40 -0500 Subject: [PATCH 10/25] rename variable --- .../services/dask-gateway/files/gateway_config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/files/gateway_config.py b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/files/gateway_config.py index b3f856d25f..9b1d90b83e 100644 --- a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/files/gateway_config.py +++ b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/files/gateway_config.py @@ -227,19 +227,19 @@ def base_username_mount(username, uid=1000, gid=100): def options_handler(options, user): - namespace, name = options.conda_environment.split("/") + namespace, environment_name = options.conda_environment.split("/") return functools.reduce( deep_merge, [ # ordering is higher to lower precedence {}, base_node_group(options), - base_conda_store_mounts(namespace, name), + base_conda_store_mounts(namespace, environment_name), base_username_mount(user.name), config["profiles"][options.profile], {"environment": {**options.environment_vars}}, config[ - "cluster" + "cluster", ], # TODO: potentially too broad, maybe just add scheduler/worker pod overrides ], ) From 794407156b67091e612e71162b7f489698318108 Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Thu, 31 Oct 2024 11:25:27 -0500 Subject: [PATCH 11/25] add comment --- .../kubernetes/services/dask-gateway/files/gateway_config.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/files/gateway_config.py b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/files/gateway_config.py index 9b1d90b83e..fcd54d3fab 100644 --- 
a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/files/gateway_config.py +++ b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/files/gateway_config.py @@ -39,6 +39,8 @@ def dask_gateway_config(path="/var/lib/dask-gateway/config.json"): c.KubeClusterConfig.scheduler_extra_container_config = config["cluster"][ "scheduler_extra_container_config" ] + +# clobbered by c.Backend.cluster_options['scheduler_extra_pod_config'] if present c.KubeClusterConfig.scheduler_extra_pod_config = config["cluster"][ "scheduler_extra_pod_config" ] From fa81fb99e683d9f94f900241f7e2e0e7d9363c90 Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Thu, 31 Oct 2024 17:11:26 -0500 Subject: [PATCH 12/25] make work for all providers --- src/_nebari/stages/infrastructure/__init__.py | 18 +------------ .../stages/kubernetes_services/__init__.py | 21 ++++++++++++--- .../template/dask_gateway.tf | 22 ++++++++++------ .../dask-gateway/files/gateway_config.py | 10 ++++--- src/_nebari/upgrade.py | 8 ++---- src/nebari/schema.py | 26 ++++++++++++++++++- 6 files changed, 65 insertions(+), 40 deletions(-) diff --git a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index f1f4f3b287..aee031b1ef 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ b/src/_nebari/stages/infrastructure/__init__.py @@ -623,24 +623,8 @@ class ExistingProvider(schema.Base): schema.ProviderEnum.do: DigitalOceanProvider, } -provider_enum_name_map: Dict[schema.ProviderEnum, str] = { - schema.ProviderEnum.local: "local", - schema.ProviderEnum.existing: "existing", - schema.ProviderEnum.gcp: "google_cloud_platform", - schema.ProviderEnum.aws: "amazon_web_services", - schema.ProviderEnum.azure: "azure", - schema.ProviderEnum.do: "digital_ocean", -} - provider_name_abbreviation_map: Dict[str, str] = { - value: key.value for key, value in provider_enum_name_map.items() -} - -provider_enum_default_node_groups_map: Dict[schema.ProviderEnum, Any] = { - schema.ProviderEnum.gcp: node_groups_to_dict(DEFAULT_GCP_NODE_GROUPS), - schema.ProviderEnum.aws: node_groups_to_dict(DEFAULT_AWS_NODE_GROUPS), - schema.ProviderEnum.azure: node_groups_to_dict(DEFAULT_AZURE_NODE_GROUPS), - schema.ProviderEnum.do: node_groups_to_dict(DEFAULT_DO_NODE_GROUPS), + value: key.value for key, value in schema.provider_enum_name_map.items() } diff --git a/src/_nebari/stages/kubernetes_services/__init__.py b/src/_nebari/stages/kubernetes_services/__init__.py index 43f8e06390..75c529663a 100644 --- a/src/_nebari/stages/kubernetes_services/__init__.py +++ b/src/_nebari/stages/kubernetes_services/__init__.py @@ -506,6 +506,9 @@ class DaskGatewayInputVars(schema.Base): dask_gateway_profiles: Dict[str, Any] = Field(alias="dask-gateway-profiles") cloud_provider: str = Field(alias="cloud-provider") forwardauth_middleware_name: str = _forwardauth_middleware_name + worker_taint_tolerations: Optional[list[Toleration]] = Field( + alias="worker-taint-tolerations" + ) class MonitoringInputVars(schema.Base): @@ -664,10 +667,12 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]): jupyterlab_preferred_dir=self.config.jupyterlab.preferred_dir, node_taint_tolerations=[ Toleration(**taint.model_dump()) - for taint in self.config.google_cloud_platform.node_groups[ - "user" - ].taints - ], # TODO: support other cloud providers + for taint in getattr( + self.config, schema.provider_enum_name_map[self.config.provider] + ) + 
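+                # assumes the active provider section defines a "user" node group with taints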
.node_groups["user"] + .taints + ], shared_fs_type=( # efs is equivalent to nfs in these modules SharedFsEnum.nfs @@ -682,6 +687,14 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]): ), dask_gateway_profiles=self.config.profiles.model_dump()["dask_worker"], cloud_provider=cloud_provider, + worker_taint_tolerations=[ + Toleration(**taint.model_dump()) + for taint in getattr( + self.config, schema.provider_enum_name_map[self.config.provider] + ) + .node_groups["worker"] + .taints + ], ) monitoring_vars = MonitoringInputVars( diff --git a/src/_nebari/stages/kubernetes_services/template/dask_gateway.tf b/src/_nebari/stages/kubernetes_services/template/dask_gateway.tf index b624bf9699..997a4ab294 100644 --- a/src/_nebari/stages/kubernetes_services/template/dask_gateway.tf +++ b/src/_nebari/stages/kubernetes_services/template/dask_gateway.tf @@ -11,6 +11,16 @@ variable "dask-gateway-profiles" { description = "Dask Gateway profiles to expose to user" } +variable "worker-taint-tolerations" { + description = "Tolerations for the worker node taints needed by Dask Scheduler/Worker pods" + type = list(object({ + key = string + operator = string + value = string + effect = string + })) +} + # =================== RESOURCES ===================== module "dask-gateway" { source = "./modules/kubernetes/services/dask-gateway" @@ -45,14 +55,10 @@ module "dask-gateway" { cluster = { scheduler_extra_pod_config = { - tolerations = [ - { - key = "dedicated" - operator = "Equal" - value = "adamworker" - effect = "NoSchedule" - } - ] + tolerations = var.worker-taint-tolerations + } + worker_extra_pod_config = { + tolerations = var.worker-taint-tolerations } } diff --git a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/files/gateway_config.py b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/files/gateway_config.py index fcd54d3fab..975a48460e 100644 --- a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/files/gateway_config.py +++ b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/files/gateway_config.py @@ -25,6 +25,8 @@ def dask_gateway_config(path="/var/lib/dask-gateway/config.json"): c.KubeBackend.gateway_instance = config["gateway_service_name"] # ========= Dask Cluster Default Configuration ========= +# These settings are overridden by c.Backend.cluster_option if key e.g. image, scheduler_extra_pod_config, etc. 
is present + c.KubeClusterConfig.image = ( f"{config['cluster-image']['name']}:{config['cluster-image']['tag']}" ) @@ -40,7 +42,6 @@ def dask_gateway_config(path="/var/lib/dask-gateway/config.json"): "scheduler_extra_container_config" ] -# clobbered by c.Backend.cluster_options['scheduler_extra_pod_config'] if present c.KubeClusterConfig.scheduler_extra_pod_config = config["cluster"][ "scheduler_extra_pod_config" ] @@ -240,9 +241,10 @@ def options_handler(options, user): base_username_mount(user.name), config["profiles"][options.profile], {"environment": {**options.environment_vars}}, - config[ - "cluster", - ], # TODO: potentially too broad, maybe just add scheduler/worker pod overrides + { + k: config["cluster"][k] + for k in ("worker_extra_pod_config", "scheduler_extra_pod_config") + }, ], ) diff --git a/src/_nebari/upgrade.py b/src/_nebari/upgrade.py index 6536612f2d..98614dbfc3 100644 --- a/src/_nebari/upgrade.py +++ b/src/_nebari/upgrade.py @@ -25,10 +25,6 @@ from _nebari.config import backup_configuration from _nebari.keycloak import get_keycloak_admin -from _nebari.stages.infrastructure import ( - provider_enum_default_node_groups_map, - provider_enum_name_map, -) from _nebari.utils import ( get_k8s_version_prefix, get_provider_config_block_name, @@ -36,7 +32,7 @@ yaml, ) from _nebari.version import __version__, rounded_ver_parse -from nebari.schema import ProviderEnum, is_version_accepted +from nebari.schema import ProviderEnum, is_version_accepted, provider_enum_name_map logger = logging.getLogger(__name__) @@ -954,7 +950,7 @@ def _version_specific_upgrade( provider_full_name, {} ): try: - default_node_groups = provider_enum_default_node_groups_map[ + default_node_groups = schema.provider_enum_default_node_groups_map[ provider ] continue_ = Prompt.ask( diff --git a/src/nebari/schema.py b/src/nebari/schema.py index 819da2b070..984b77822a 100644 --- a/src/nebari/schema.py +++ b/src/nebari/schema.py @@ -1,10 +1,17 @@ import enum -from typing import Annotated +from typing import Annotated, Any, Dict import pydantic from pydantic import ConfigDict, Field, StringConstraints, field_validator from ruamel.yaml import yaml_object +from _nebari.stages.infrastructure import ( + DEFAULT_AWS_NODE_GROUPS, + DEFAULT_AZURE_NODE_GROUPS, + DEFAULT_DO_NODE_GROUPS, + DEFAULT_GCP_NODE_GROUPS, + node_groups_to_dict, +) from _nebari.utils import escape_string, yaml from _nebari.version import __version__, rounded_ver_parse @@ -118,3 +125,20 @@ class Taint(Base): key: str value: str effect: TaintEffectEnum + + +provider_enum_name_map: dict[ProviderEnum, str] = { + ProviderEnum.local: "local", + ProviderEnum.existing: "existing", + ProviderEnum.gcp: "google_cloud_platform", + ProviderEnum.aws: "amazon_web_services", + ProviderEnum.azure: "azure", + ProviderEnum.do: "digital_ocean", +} + +provider_enum_default_node_groups_map: Dict[ProviderEnum, Any] = { + ProviderEnum.gcp: node_groups_to_dict(DEFAULT_GCP_NODE_GROUPS), + ProviderEnum.aws: node_groups_to_dict(DEFAULT_AWS_NODE_GROUPS), + ProviderEnum.azure: node_groups_to_dict(DEFAULT_AZURE_NODE_GROUPS), + ProviderEnum.do: node_groups_to_dict(DEFAULT_DO_NODE_GROUPS), +} From da9fd8218c39cfed66009c0e2202162ad89c48bd Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Thu, 31 Oct 2024 17:17:38 -0500 Subject: [PATCH 13/25] move var back --- src/_nebari/stages/infrastructure/__init__.py | 9 +++++++++ src/nebari/schema.py | 16 +--------------- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git 
a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index aee031b1ef..f27c2a8480 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ b/src/_nebari/stages/infrastructure/__init__.py @@ -28,6 +28,7 @@ ) from nebari import schema from nebari.hookspecs import NebariStage, hookimpl +from nebari.schema import ProviderEnum def get_kubeconfig_filename(): @@ -963,3 +964,11 @@ def destroy( @hookimpl def nebari_stage() -> List[Type[NebariStage]]: return [KubernetesInfrastructureStage] + + +provider_enum_default_node_groups_map: Dict[ProviderEnum, Any] = { + ProviderEnum.gcp: node_groups_to_dict(DEFAULT_GCP_NODE_GROUPS), + ProviderEnum.aws: node_groups_to_dict(DEFAULT_AWS_NODE_GROUPS), + ProviderEnum.azure: node_groups_to_dict(DEFAULT_AZURE_NODE_GROUPS), + ProviderEnum.do: node_groups_to_dict(DEFAULT_DO_NODE_GROUPS), +} diff --git a/src/nebari/schema.py b/src/nebari/schema.py index 984b77822a..3fac698b8a 100644 --- a/src/nebari/schema.py +++ b/src/nebari/schema.py @@ -1,17 +1,10 @@ import enum -from typing import Annotated, Any, Dict +from typing import Annotated import pydantic from pydantic import ConfigDict, Field, StringConstraints, field_validator from ruamel.yaml import yaml_object -from _nebari.stages.infrastructure import ( - DEFAULT_AWS_NODE_GROUPS, - DEFAULT_AZURE_NODE_GROUPS, - DEFAULT_DO_NODE_GROUPS, - DEFAULT_GCP_NODE_GROUPS, - node_groups_to_dict, -) from _nebari.utils import escape_string, yaml from _nebari.version import __version__, rounded_ver_parse @@ -135,10 +128,3 @@ class Taint(Base): ProviderEnum.azure: "azure", ProviderEnum.do: "digital_ocean", } - -provider_enum_default_node_groups_map: Dict[ProviderEnum, Any] = { - ProviderEnum.gcp: node_groups_to_dict(DEFAULT_GCP_NODE_GROUPS), - ProviderEnum.aws: node_groups_to_dict(DEFAULT_AWS_NODE_GROUPS), - ProviderEnum.azure: node_groups_to_dict(DEFAULT_AZURE_NODE_GROUPS), - ProviderEnum.do: node_groups_to_dict(DEFAULT_DO_NODE_GROUPS), -} From 6a1f81d44011b6fc39ccf8529afa348fc0bf9919 Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Thu, 31 Oct 2024 17:18:55 -0500 Subject: [PATCH 14/25] move var back --- src/_nebari/stages/infrastructure/__init__.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index f27c2a8480..8984f8437e 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ b/src/_nebari/stages/infrastructure/__init__.py @@ -624,6 +624,13 @@ class ExistingProvider(schema.Base): schema.ProviderEnum.do: DigitalOceanProvider, } +provider_enum_default_node_groups_map: Dict[ProviderEnum, Any] = { + ProviderEnum.gcp: node_groups_to_dict(DEFAULT_GCP_NODE_GROUPS), + ProviderEnum.aws: node_groups_to_dict(DEFAULT_AWS_NODE_GROUPS), + ProviderEnum.azure: node_groups_to_dict(DEFAULT_AZURE_NODE_GROUPS), + ProviderEnum.do: node_groups_to_dict(DEFAULT_DO_NODE_GROUPS), +} + provider_name_abbreviation_map: Dict[str, str] = { value: key.value for key, value in schema.provider_enum_name_map.items() } @@ -964,11 +971,3 @@ def destroy( @hookimpl def nebari_stage() -> List[Type[NebariStage]]: return [KubernetesInfrastructureStage] - - -provider_enum_default_node_groups_map: Dict[ProviderEnum, Any] = { - ProviderEnum.gcp: node_groups_to_dict(DEFAULT_GCP_NODE_GROUPS), - ProviderEnum.aws: node_groups_to_dict(DEFAULT_AWS_NODE_GROUPS), - ProviderEnum.azure: node_groups_to_dict(DEFAULT_AZURE_NODE_GROUPS), - ProviderEnum.do: 
node_groups_to_dict(DEFAULT_DO_NODE_GROUPS), -} From b4c08f3b885ac4d40289e253b7e0566eb3c4a0c0 Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Thu, 31 Oct 2024 17:19:37 -0500 Subject: [PATCH 15/25] move var back --- src/_nebari/stages/infrastructure/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index 8984f8437e..fff2cff67f 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ b/src/_nebari/stages/infrastructure/__init__.py @@ -624,6 +624,10 @@ class ExistingProvider(schema.Base): schema.ProviderEnum.do: DigitalOceanProvider, } +provider_name_abbreviation_map: Dict[str, str] = { + value: key.value for key, value in schema.provider_enum_name_map.items() +} + provider_enum_default_node_groups_map: Dict[ProviderEnum, Any] = { ProviderEnum.gcp: node_groups_to_dict(DEFAULT_GCP_NODE_GROUPS), ProviderEnum.aws: node_groups_to_dict(DEFAULT_AWS_NODE_GROUPS), @@ -631,10 +635,6 @@ class ExistingProvider(schema.Base): ProviderEnum.do: node_groups_to_dict(DEFAULT_DO_NODE_GROUPS), } -provider_name_abbreviation_map: Dict[str, str] = { - value: key.value for key, value in schema.provider_enum_name_map.items() -} - class InputSchema(schema.Base): local: Optional[LocalProvider] = None From 9bae2a1945c48d76c85def4b75dba19d5f6566ff Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Thu, 31 Oct 2024 17:20:34 -0500 Subject: [PATCH 16/25] move var back --- src/_nebari/stages/infrastructure/__init__.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index fff2cff67f..9f05446f4d 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ b/src/_nebari/stages/infrastructure/__init__.py @@ -28,7 +28,6 @@ ) from nebari import schema from nebari.hookspecs import NebariStage, hookimpl -from nebari.schema import ProviderEnum def get_kubeconfig_filename(): @@ -628,11 +627,11 @@ class ExistingProvider(schema.Base): value: key.value for key, value in schema.provider_enum_name_map.items() } -provider_enum_default_node_groups_map: Dict[ProviderEnum, Any] = { - ProviderEnum.gcp: node_groups_to_dict(DEFAULT_GCP_NODE_GROUPS), - ProviderEnum.aws: node_groups_to_dict(DEFAULT_AWS_NODE_GROUPS), - ProviderEnum.azure: node_groups_to_dict(DEFAULT_AZURE_NODE_GROUPS), - ProviderEnum.do: node_groups_to_dict(DEFAULT_DO_NODE_GROUPS), +provider_enum_default_node_groups_map: Dict[schema.ProviderEnum, Any] = { + schema.ProviderEnum.gcp: node_groups_to_dict(DEFAULT_GCP_NODE_GROUPS), + schema.ProviderEnum.aws: node_groups_to_dict(DEFAULT_AWS_NODE_GROUPS), + schema.ProviderEnum.azure: node_groups_to_dict(DEFAULT_AZURE_NODE_GROUPS), + schema.ProviderEnum.do: node_groups_to_dict(DEFAULT_DO_NODE_GROUPS), } From b3dbedae39e7d7a9e7a3087a272ffb3765dda29d Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Thu, 31 Oct 2024 17:41:48 -0500 Subject: [PATCH 17/25] add reference --- .../template/aws/modules/kubernetes/main.tf | 11 +++++++++++ .../stages/infrastructure/template/aws/variables.tf | 1 + .../stages/infrastructure/template/azure/main.tf | 1 + .../template/azure/modules/kubernetes/main.tf | 3 +++ .../stages/infrastructure/template/azure/variables.tf | 7 ++++--- 5 files changed, 20 insertions(+), 3 deletions(-) diff --git 
a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf index 5b66201f83..af0e605ec1 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf @@ -86,6 +86,17 @@ resource "aws_eks_node_group" "main" { max_size = var.node_groups[count.index].max_size } + # TODO: add node_taints (var.node_groups.node_taints) to the node group, check the node taints below are working + # https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/eks_node_group#node_taints + # dynamic "taint" { + # for_each = var.node_groups[count.index].node_taints + # content { + # key = taint.value.key + # value = taint.value.value + # effect = taint.value.effect + # } + # } + # Only set launch_template if its node_group counterpart parameter is not null dynamic "launch_template" { for_each = var.node_groups[count.index].launch_template != null ? [0] : [] diff --git a/src/_nebari/stages/infrastructure/template/aws/variables.tf b/src/_nebari/stages/infrastructure/template/aws/variables.tf index a3f37b9eb9..3c0469da8c 100644 --- a/src/_nebari/stages/infrastructure/template/aws/variables.tf +++ b/src/_nebari/stages/infrastructure/template/aws/variables.tf @@ -40,6 +40,7 @@ variable "node_groups" { single_subnet = bool launch_template = map(any) ami_type = string + node_taints = list(any) })) } diff --git a/src/_nebari/stages/infrastructure/template/azure/main.tf b/src/_nebari/stages/infrastructure/template/azure/main.tf index 2d6e2e2afa..0ddff5f583 100644 --- a/src/_nebari/stages/infrastructure/template/azure/main.tf +++ b/src/_nebari/stages/infrastructure/template/azure/main.tf @@ -38,6 +38,7 @@ module "kubernetes" { instance_type = config.instance min_size = config.min_nodes max_size = config.max_nodes + node_taints = config.node_taints } ] vnet_subnet_id = var.vnet_subnet_id diff --git a/src/_nebari/stages/infrastructure/template/azure/modules/kubernetes/main.tf b/src/_nebari/stages/infrastructure/template/azure/modules/kubernetes/main.tf index f093f048c6..70b347c038 100644 --- a/src/_nebari/stages/infrastructure/template/azure/modules/kubernetes/main.tf +++ b/src/_nebari/stages/infrastructure/template/azure/modules/kubernetes/main.tf @@ -36,6 +36,8 @@ resource "azurerm_kubernetes_cluster" "main" { min_count = var.node_groups[0].min_size max_count = var.node_groups[0].max_size max_pods = var.max_pods + # TODO: I don't think it's possible to add node_taints to the default node pool so we should throw an error somewhere if people try to do this + # see https://github.com/hashicorp/terraform-provider-azurerm/issues/9183 for more info orchestrator_version = var.kubernetes_version node_labels = { @@ -81,4 +83,5 @@ resource "azurerm_kubernetes_cluster_node_pool" "node_group" { orchestrator_version = var.kubernetes_version tags = var.tags vnet_subnet_id = var.vnet_subnet_id + node_taints = each.value.node_taints # TODO: check this is working } diff --git a/src/_nebari/stages/infrastructure/template/azure/variables.tf b/src/_nebari/stages/infrastructure/template/azure/variables.tf index dcef2c97cb..b4566343f1 100644 --- a/src/_nebari/stages/infrastructure/template/azure/variables.tf +++ b/src/_nebari/stages/infrastructure/template/azure/variables.tf @@ -21,9 +21,10 @@ variable "kubernetes_version" { variable "node_groups" { description = "Azure node groups" type = map(object({ - instance = string - min_nodes = 
number - max_nodes = number + instance = string + min_nodes = number + max_nodes = number + node_taints = list(any) })) } From 97858d06e5574e21f7d8ce19bc658c184f50e9b3 Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Fri, 1 Nov 2024 12:47:45 -0500 Subject: [PATCH 18/25] refactor --- .../stages/kubernetes_services/__init__.py | 50 ++++++++++++------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/src/_nebari/stages/kubernetes_services/__init__.py b/src/_nebari/stages/kubernetes_services/__init__.py index 75c529663a..bcb6a986fc 100644 --- a/src/_nebari/stages/kubernetes_services/__init__.py +++ b/src/_nebari/stages/kubernetes_services/__init__.py @@ -465,6 +465,15 @@ def to_yaml(cls, representer, node): class Toleration(schema.Taint): operator: TolerationOperatorEnum = TolerationOperatorEnum.Equal + @classmethod + def from_taint( + cls, taint: schema.Taint, operator: None | TolerationOperatorEnum = None + ): + kwargs = {} + if operator: + kwargs["operator"] = operator + cls(**taint.model_dump(), **kwargs) + class JupyterhubInputVars(schema.Base): jupyterhub_theme: Dict[str, Any] = Field(alias="jupyterhub-theme") @@ -491,7 +500,7 @@ class JupyterhubInputVars(schema.Base): cloud_provider: str = Field(alias="cloud-provider") jupyterlab_preferred_dir: Optional[str] = Field(alias="jupyterlab-preferred-dir") shared_fs_type: SharedFsEnum - node_taint_tolerations: Optional[List[Toleration]] = Field( + user_taint_tolerations: Optional[List[Toleration]] = Field( alias="node-taint-tolerations" ) @@ -611,6 +620,27 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]): ): jupyterhub_theme.update({"version": f"v{self.config.nebari_version}"}) + def _node_taint_tolerations(node_group_name: str) -> List[Toleration]: + tolerations = [] + provider = getattr( + self.config, schema.provider_enum_name_map[self.config.provider] + ) + if not ( + hasattr(provider, "node_groups") + and provider.node_groups.get(node_group_name, {}) + and hasattr(provider.node_groups[node_group_name], "taints") + ): + return tolerations + tolerations = [ + Toleration.from_taint(taint) + for taint in getattr( + self.config, schema.provider_enum_name_map[self.config.provider] + ) + .node_groups[node_group_name] + .taints + ] + return tolerations + kubernetes_services_vars = KubernetesServicesInputVars( name=self.config.project_name, environment=self.config.namespace, @@ -665,14 +695,7 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]): jupyterlab_default_settings=self.config.jupyterlab.default_settings, jupyterlab_gallery_settings=self.config.jupyterlab.gallery_settings, jupyterlab_preferred_dir=self.config.jupyterlab.preferred_dir, - node_taint_tolerations=[ - Toleration(**taint.model_dump()) - for taint in getattr( - self.config, schema.provider_enum_name_map[self.config.provider] - ) - .node_groups["user"] - .taints - ], + user_taint_tolerations=_node_taint_tolerations(node_group_name="user"), shared_fs_type=( # efs is equivalent to nfs in these modules SharedFsEnum.nfs @@ -687,14 +710,7 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]): ), dask_gateway_profiles=self.config.profiles.model_dump()["dask_worker"], cloud_provider=cloud_provider, - worker_taint_tolerations=[ - Toleration(**taint.model_dump()) - for taint in getattr( - self.config, schema.provider_enum_name_map[self.config.provider] - ) - .node_groups["worker"] - .taints - ], + worker_taint_tolerations=_node_taint_tolerations(node_group_name="worker"), ) 
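A small sketch of the toleration the helper above yields for the default
worker taint, assuming Toleration.from_taint copies the taint fields and
defaults operator to "Equal" (the return statement from_taint needs is
added later in this series):

    # illustrative only: dict form of Toleration.from_taint's result
    taint = {"key": "dedicated", "value": "worker", "effect": "NoSchedule"}
    toleration = {**taint, "operator": "Equal"}  # TolerationOperatorEnum.Equal

    assert toleration == {
        "key": "dedicated",
        "value": "worker",
        "effect": "NoSchedule",
        "operator": "Equal",
    }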
monitoring_vars = MonitoringInputVars( From 4ac7b9c79aceba23dfb9eae1cefd573ab9f07cd8 Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Fri, 1 Nov 2024 16:44:23 -0500 Subject: [PATCH 19/25] various fixes for aws and azure providers --- src/_nebari/stages/infrastructure/__init__.py | 23 +++++++++++++++++++ .../template/aws/modules/kubernetes/main.tf | 16 ++++++------- .../aws/modules/kubernetes/variables.tf | 5 ++++ .../infrastructure/template/aws/variables.tf | 6 ++++- .../template/azure/modules/kubernetes/main.tf | 4 ++-- .../azure/modules/kubernetes/variables.tf | 14 ++++++++++- .../template/azure/variables.tf | 2 +- .../stages/kubernetes_services/__init__.py | 2 +- 8 files changed, 58 insertions(+), 14 deletions(-) diff --git a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index 9f05446f4d..ee2f3f2deb 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ b/src/_nebari/stages/infrastructure/__init__.py @@ -148,6 +148,11 @@ class AzureNodeGroupInputVars(schema.Base): instance: str min_nodes: int max_nodes: int + node_taints: list[str] + + @field_validator("node_taints", mode="before") + def convert_taints(cls, value: Optional[List[schema.Taint]]): + return [f"{taint.key}={taint.value}:{taint.effect.value}" for taint in value] class AzureInputVars(schema.Base): @@ -189,6 +194,7 @@ class AWSNodeGroupInputVars(schema.Base): permissions_boundary: Optional[str] = None ami_type: Optional[AWSAmiTypes] = None launch_template: Optional[AWSNodeLaunchTemplate] = None + node_taints: list[dict] @field_validator("ami_type", mode="before") @classmethod @@ -210,6 +216,21 @@ def _infer_and_validate_ami_type(cls, value, values) -> str: ) return value + @field_validator("node_taints", mode="before") + def convert_taints(cls, value: Optional[List[schema.Taint]]): + return [ + dict( + key=taint.key, + value=taint.value, + effect={ + schema.TaintEffectEnum.NoSchedule: "NO_SCHEDULE", + schema.TaintEffectEnum.PreferNoSchedule: "PREFER_NO_SCHEDULE", + schema.TaintEffectEnum.NoExecute: "NO_EXECUTE", + }[taint.effect], + ) + for taint in value + ] + class AWSInputVars(schema.Base): name: str @@ -848,6 +869,7 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]): instance=node_group.instance, min_nodes=node_group.min_nodes, max_nodes=node_group.max_nodes, + node_taints=node_group.taints, ) for name, node_group in self.config.azure.node_groups.items() }, @@ -889,6 +911,7 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]): single_subnet=node_group.single_subnet, permissions_boundary=node_group.permissions_boundary, launch_template=node_group.launch_template, + node_taints=node_group.taints, ) for name, node_group in self.config.amazon_web_services.node_groups.items() ], diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf index af0e605ec1..92c4ed284a 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf @@ -88,14 +88,14 @@ resource "aws_eks_node_group" "main" { # TODO: add node_taints (var.node_groups.node_taints) to the node group, check the node taints below are working # https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/eks_node_group#node_taints - # dynamic "taint" { - # for_each = var.node_groups[count.index].node_taints - # content { - # 
key = taint.value.key - # value = taint.value.value - # effect = taint.value.effect - # } - # } + dynamic "taint" { + for_each = var.node_groups[count.index].node_taints + content { + key = taint.value.key + value = taint.value.value + effect = taint.value.effect + } + } # Only set launch_template if its node_group counterpart parameter is not null dynamic "launch_template" { diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf index 4d38d10a19..703aaba52c 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf @@ -53,6 +53,11 @@ variable "node_groups" { single_subnet = bool launch_template = map(any) ami_type = string + node_taints = list(object({ + key = string + value = string + effect = string + })) })) } diff --git a/src/_nebari/stages/infrastructure/template/aws/variables.tf b/src/_nebari/stages/infrastructure/template/aws/variables.tf index 3c0469da8c..2621686d4b 100644 --- a/src/_nebari/stages/infrastructure/template/aws/variables.tf +++ b/src/_nebari/stages/infrastructure/template/aws/variables.tf @@ -40,7 +40,11 @@ variable "node_groups" { single_subnet = bool launch_template = map(any) ami_type = string - node_taints = list(any) + node_taints = list(object({ + key = string + value = string + effect = string + })) })) } diff --git a/src/_nebari/stages/infrastructure/template/azure/modules/kubernetes/main.tf b/src/_nebari/stages/infrastructure/template/azure/modules/kubernetes/main.tf index 70b347c038..08baa980a8 100644 --- a/src/_nebari/stages/infrastructure/template/azure/modules/kubernetes/main.tf +++ b/src/_nebari/stages/infrastructure/template/azure/modules/kubernetes/main.tf @@ -32,7 +32,7 @@ resource "azurerm_kubernetes_cluster" "main" { vnet_subnet_id = var.vnet_subnet_id name = var.node_groups[0].name vm_size = var.node_groups[0].instance_type - enable_auto_scaling = "true" + enable_auto_scaling = "true" # TODO: Check if this is still supported in the provider version we are using min_count = var.node_groups[0].min_size max_count = var.node_groups[0].max_size max_pods = var.max_pods @@ -72,7 +72,7 @@ resource "azurerm_kubernetes_cluster_node_pool" "node_group" { name = each.value.name kubernetes_cluster_id = azurerm_kubernetes_cluster.main.id vm_size = each.value.instance_type - enable_auto_scaling = "true" + enable_auto_scaling = "true" # TODO: Check if this is still supported in the provider version we are using mode = "User" # "System" or "User", only "User" nodes can scale down to 0 min_count = each.value.min_size max_count = each.value.max_size diff --git a/src/_nebari/stages/infrastructure/template/azure/modules/kubernetes/variables.tf b/src/_nebari/stages/infrastructure/template/azure/modules/kubernetes/variables.tf index b93a9fae2d..2351be9dc2 100644 --- a/src/_nebari/stages/infrastructure/template/azure/modules/kubernetes/variables.tf +++ b/src/_nebari/stages/infrastructure/template/azure/modules/kubernetes/variables.tf @@ -30,9 +30,21 @@ variable "environment" { } +# variable "node_groups" { +# description = "Node pools to add to Azure Kubernetes Cluster" +# type = list(map(any)) +# } + variable "node_groups" { description = "Node pools to add to Azure Kubernetes Cluster" - type = list(map(any)) + type = list(object({ + name = string + auto_scale = bool + instance_type = string + min_size = number + max_size = number + 
node_taints = list(string) + })) } variable "vnet_subnet_id" { diff --git a/src/_nebari/stages/infrastructure/template/azure/variables.tf b/src/_nebari/stages/infrastructure/template/azure/variables.tf index b4566343f1..5eeb32c02a 100644 --- a/src/_nebari/stages/infrastructure/template/azure/variables.tf +++ b/src/_nebari/stages/infrastructure/template/azure/variables.tf @@ -24,7 +24,7 @@ variable "node_groups" { instance = string min_nodes = number max_nodes = number - node_taints = list(any) + node_taints = list(string) })) } diff --git a/src/_nebari/stages/kubernetes_services/__init__.py b/src/_nebari/stages/kubernetes_services/__init__.py index bcb6a986fc..41025b7737 100644 --- a/src/_nebari/stages/kubernetes_services/__init__.py +++ b/src/_nebari/stages/kubernetes_services/__init__.py @@ -472,7 +472,7 @@ def from_taint( kwargs = {} if operator: kwargs["operator"] = operator - cls(**taint.model_dump(), **kwargs) + return cls(**taint.model_dump(), **kwargs) class JupyterhubInputVars(schema.Base): From f6b9a4f4259898099259ab0097a1a80570623126 Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Mon, 4 Nov 2024 11:17:55 -0600 Subject: [PATCH 20/25] add taint conversion for AWS --- src/_nebari/stages/infrastructure/__init__.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index 64b4d2ce4e..3c80e60f87 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ b/src/_nebari/stages/infrastructure/__init__.py @@ -196,6 +196,21 @@ class AWSNodeGroupInputVars(schema.Base): launch_template: Optional[AWSNodeLaunchTemplate] = None node_taints: list[dict] + @field_validator("node_taints", mode="before") + def convert_taints(cls, value: Optional[List[schema.Taint]]): + return [ + dict( + key=taint.key, + value=taint.value, + effect={ + schema.TaintEffectEnum.NoSchedule: "NO_SCHEDULE", + schema.TaintEffectEnum.PreferNoSchedule: "PREFER_NO_SCHEDULE", + schema.TaintEffectEnum.NoExecute: "NO_EXECUTE", + }[taint.effect], + ) + for taint in value + ] + def construct_aws_ami_type(gpu_enabled: bool, launch_template: AWSNodeLaunchTemplate): """Construct the AWS AMI type based on the provided parameters.""" From e752a3a8df4b79c4e723bb056762a8e252d5b03f Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Mon, 4 Nov 2024 11:51:38 -0600 Subject: [PATCH 21/25] add DEFAULT_.*_TAINT vars --- src/_nebari/stages/infrastructure/__init__.py | 52 +++++++++++++++---- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index 3c80e60f87..0db445c10d 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ b/src/_nebari/stages/infrastructure/__init__.py @@ -72,6 +72,13 @@ def validate_taint_strings(cls, value: List[str | schema.Taint]): return parsed_taints +DEFAULT_GENERAL_TAINTS = [] +DEFAULT_USER_TAINTS = [schema.Taint(key="dedicated", value="user", effect="NoSchedule")] +DEFAULT_WORKER_TAINTS = [ + schema.Taint(key="dedicated", value="worker", effect="NoSchedule") +] + + class DigitalOceanInputVars(schema.Base): name: str environment: str @@ -421,18 +428,23 @@ class GCPNodeGroup(NodeGroup): DEFAULT_GCP_NODE_GROUPS = { - "general": GCPNodeGroup(instance="e2-standard-8", min_nodes=1, max_nodes=1), + "general": GCPNodeGroup( + instance="e2-standard-8", + min_nodes=1, + max_nodes=1, + 
taints=DEFAULT_GENERAL_TAINTS, + ), "user": GCPNodeGroup( instance="e2-standard-4", min_nodes=0, max_nodes=5, - taints=[schema.Taint(key="dedicated", value="user", effect="NoSchedule")], + taints=DEFAULT_USER_TAINTS, ), "worker": GCPNodeGroup( instance="e2-standard-4", min_nodes=0, max_nodes=5, - taints=[schema.Taint(key="dedicated", value="worker", effect="NoSchedule")], + taints=DEFAULT_WORKER_TAINTS, ), } @@ -475,9 +487,21 @@ class AzureNodeGroup(NodeGroup): DEFAULT_AZURE_NODE_GROUPS = { - "general": AzureNodeGroup(instance="Standard_D8_v3", min_nodes=1, max_nodes=1), - "user": AzureNodeGroup(instance="Standard_D4_v3", min_nodes=0, max_nodes=5), - "worker": AzureNodeGroup(instance="Standard_D4_v3", min_nodes=0, max_nodes=5), + "general": AzureNodeGroup( + instance="Standard_D8_v3", + min_nodes=1, + max_nodes=1, + taints=DEFAULT_GENERAL_TAINTS, + ), + "user": AzureNodeGroup( + instance="Standard_D4_v3", min_nodes=0, max_nodes=5, taints=DEFAULT_USER_TAINTS + ), + "worker": AzureNodeGroup( + instance="Standard_D4_v3", + min_nodes=0, + max_nodes=5, + taints=DEFAULT_WORKER_TAINTS, + ), } @@ -547,12 +571,22 @@ class AWSNodeGroup(NodeGroup): DEFAULT_AWS_NODE_GROUPS = { - "general": AWSNodeGroup(instance="m5.2xlarge", min_nodes=1, max_nodes=1), + "general": AWSNodeGroup( + instance="m5.2xlarge", min_nodes=1, max_nodes=1, taints=DEFAULT_GENERAL_TAINTS + ), "user": AWSNodeGroup( - instance="m5.xlarge", min_nodes=0, max_nodes=5, single_subnet=False + instance="m5.xlarge", + min_nodes=0, + max_nodes=5, + single_subnet=False, + taints=DEFAULT_USER_TAINTS, ), "worker": AWSNodeGroup( - instance="m5.xlarge", min_nodes=0, max_nodes=5, single_subnet=False + instance="m5.xlarge", + min_nodes=0, + max_nodes=5, + single_subnet=False, + taints=DEFAULT_WORKER_TAINTS, ), } From 59daa0c787bd908acacb1902aaf61792c35ef43b Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Mon, 4 Nov 2024 12:08:14 -0600 Subject: [PATCH 22/25] clean up fixed TODOs --- .../infrastructure/template/aws/modules/kubernetes/main.tf | 2 -- .../template/azure/modules/kubernetes/main.tf | 6 +++--- .../services/dask-gateway/files/gateway_config.py | 1 + src/nebari/schema.py | 1 - 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf index 92c4ed284a..b217cfecdb 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf @@ -86,8 +86,6 @@ resource "aws_eks_node_group" "main" { max_size = var.node_groups[count.index].max_size } - # TODO: add node_taints (var.node_groups.node_taints) to the node group, check the node taints below are working - # https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/eks_node_group#node_taints dynamic "taint" { for_each = var.node_groups[count.index].node_taints content { diff --git a/src/_nebari/stages/infrastructure/template/azure/modules/kubernetes/main.tf b/src/_nebari/stages/infrastructure/template/azure/modules/kubernetes/main.tf index 08baa980a8..b62257b51b 100644 --- a/src/_nebari/stages/infrastructure/template/azure/modules/kubernetes/main.tf +++ b/src/_nebari/stages/infrastructure/template/azure/modules/kubernetes/main.tf @@ -32,7 +32,7 @@ resource "azurerm_kubernetes_cluster" "main" { vnet_subnet_id = var.vnet_subnet_id name = var.node_groups[0].name vm_size = 
var.node_groups[0].instance_type - enable_auto_scaling = "true" # TODO: Check if this is still supported in the provider version we are using + enable_auto_scaling = "true" min_count = var.node_groups[0].min_size max_count = var.node_groups[0].max_size max_pods = var.max_pods @@ -72,7 +72,7 @@ resource "azurerm_kubernetes_cluster_node_pool" "node_group" { name = each.value.name kubernetes_cluster_id = azurerm_kubernetes_cluster.main.id vm_size = each.value.instance_type - enable_auto_scaling = "true" # TODO: Check if this is still supported in the provider version we are using + enable_auto_scaling = "true" mode = "User" # "System" or "User", only "User" nodes can scale down to 0 min_count = each.value.min_size max_count = each.value.max_size @@ -83,5 +83,5 @@ resource "azurerm_kubernetes_cluster_node_pool" "node_group" { orchestrator_version = var.kubernetes_version tags = var.tags vnet_subnet_id = var.vnet_subnet_id - node_taints = each.value.node_taints # TODO: check this is working + node_taints = each.value.node_taints } diff --git a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/files/gateway_config.py b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/files/gateway_config.py index 975a48460e..427b8734a7 100644 --- a/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/files/gateway_config.py +++ b/src/_nebari/stages/kubernetes_services/template/modules/kubernetes/services/dask-gateway/files/gateway_config.py @@ -241,6 +241,7 @@ def options_handler(options, user): base_username_mount(user.name), config["profiles"][options.profile], {"environment": {**options.environment_vars}}, + # merge with default values { k: config["cluster"][k] for k in ("worker_extra_pod_config", "scheduler_extra_pod_config") diff --git a/src/nebari/schema.py b/src/nebari/schema.py index 3fac698b8a..873adb0f31 100644 --- a/src/nebari/schema.py +++ b/src/nebari/schema.py @@ -107,7 +107,6 @@ def is_version_accepted(v): return Main.is_version_accepted(v) -# TODO: Make sure the taint is actually applied to the nodes for each provider class TaintEffectEnum(str, enum.Enum): NoSchedule: str = "NoSchedule" PreferNoSchedule: str = "PreferNoSchedule" From e05f143952282bcc010b40b284772a6f788da4a6 Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Mon, 4 Nov 2024 12:35:01 -0600 Subject: [PATCH 23/25] more clean up --- .../template/azure/modules/kubernetes/main.tf | 3 +-- .../template/azure/modules/kubernetes/variables.tf | 6 ------ .../template/gcp/modules/kubernetes/variables.tf | 2 +- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/src/_nebari/stages/infrastructure/template/azure/modules/kubernetes/main.tf b/src/_nebari/stages/infrastructure/template/azure/modules/kubernetes/main.tf index b62257b51b..a054147759 100644 --- a/src/_nebari/stages/infrastructure/template/azure/modules/kubernetes/main.tf +++ b/src/_nebari/stages/infrastructure/template/azure/modules/kubernetes/main.tf @@ -36,8 +36,7 @@ resource "azurerm_kubernetes_cluster" "main" { min_count = var.node_groups[0].min_size max_count = var.node_groups[0].max_size max_pods = var.max_pods - # TODO: I don't think it's possible to add node_taints to the default node pool so we should throw an error somewhere if people try to do this - # see https://github.com/hashicorp/terraform-provider-azurerm/issues/9183 for more info + # TODO: It's not possible to add node_taints to the default node pool. 
See https://github.com/hashicorp/terraform-provider-azurerm/issues/9183 for more info orchestrator_version = var.kubernetes_version node_labels = { diff --git a/src/_nebari/stages/infrastructure/template/azure/modules/kubernetes/variables.tf b/src/_nebari/stages/infrastructure/template/azure/modules/kubernetes/variables.tf index 2351be9dc2..01851b842c 100644 --- a/src/_nebari/stages/infrastructure/template/azure/modules/kubernetes/variables.tf +++ b/src/_nebari/stages/infrastructure/template/azure/modules/kubernetes/variables.tf @@ -29,12 +29,6 @@ variable "environment" { type = string } - -# variable "node_groups" { -# description = "Node pools to add to Azure Kubernetes Cluster" -# type = list(map(any)) -# } - variable "node_groups" { description = "Node pools to add to Azure Kubernetes Cluster" type = list(object({ diff --git a/src/_nebari/stages/infrastructure/template/gcp/modules/kubernetes/variables.tf b/src/_nebari/stages/infrastructure/template/gcp/modules/kubernetes/variables.tf index 17e9b481a4..236a0b9017 100644 --- a/src/_nebari/stages/infrastructure/template/gcp/modules/kubernetes/variables.tf +++ b/src/_nebari/stages/infrastructure/template/gcp/modules/kubernetes/variables.tf @@ -58,7 +58,7 @@ variable "node_groups" { min_size = 0 max_size = 2 labels = {} - node_taints = [] # TODO: Do this for other cloud providers + node_taints = [] }, { name = "worker" From f3cb2e9c0ec8d9d2888ffc9033dfb6f27682029a Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Mon, 4 Nov 2024 13:21:33 -0600 Subject: [PATCH 24/25] fix test --- src/_nebari/stages/infrastructure/__init__.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index 0db445c10d..5361b0db5b 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ b/src/_nebari/stages/infrastructure/__init__.py @@ -50,26 +50,21 @@ class NodeGroup(schema.Base): taints: Optional[List[schema.Taint]] = [] @field_validator("taints", mode="before") - def validate_taint_strings(cls, value: List[str | schema.Taint]): + def validate_taint_strings(cls, value: list[Any]): TAINT_STR_REGEX = re.compile(r"(\w+)=(\w+):(\w+)") - parsed_taints = [] + return_value = [] for taint in value: - if not isinstance(taint, (str, schema.Taint)): - raise ValueError( - f"Unable to parse type: {type(taint)} as taint. Must be a string or Taint object." 
- ) - - if isinstance(taint, schema.Taint): - parsed_taint = taint - elif isinstance(taint, str): + if not isinstance(taint, str): + return_value.append(taint) + else: match = TAINT_STR_REGEX.match(taint) if not match: raise ValueError(f"Invalid taint string: {taint}") key, value, effect = match.groups() parsed_taint = schema.Taint(key=key, value=value, effect=effect) - parsed_taints.append(parsed_taint) + return_value.append(parsed_taint) - return parsed_taints + return return_value DEFAULT_GENERAL_TAINTS = [] From b125e8c22d603437b5a5a8614f59ee98bcadfcaa Mon Sep 17 00:00:00 2001 From: Adam Lewis <23342526+Adam-D-Lewis@users.noreply.github.com> Date: Mon, 4 Nov 2024 13:38:17 -0600 Subject: [PATCH 25/25] fix test error --- src/nebari/schema.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/nebari/schema.py b/src/nebari/schema.py index 873adb0f31..2f844ba6f9 100644 --- a/src/nebari/schema.py +++ b/src/nebari/schema.py @@ -107,11 +107,16 @@ def is_version_accepted(v): return Main.is_version_accepted(v) +@yaml_object(yaml) class TaintEffectEnum(str, enum.Enum): NoSchedule: str = "NoSchedule" PreferNoSchedule: str = "PreferNoSchedule" NoExecute: str = "NoExecute" + @classmethod + def to_yaml(cls, representer, node): + return representer.represent_str(node.value) + class Taint(Base): key: str
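End to end, the series takes a taint declared in nebari-config.yaml (as a mapping or as a `key=value:Effect` string), validates it into a `schema.Taint`, and renders it in each provider's native form. The following is a minimal illustrative sketch of that round trip, not part of the patches themselves; it assumes `src/nebari/schema.py` imports as `nebari.schema` and that pydantic coerces the effect string into `TaintEffectEnum`. The regex and both conversions mirror `NodeGroup.validate_taint_strings`, `AzureNodeGroupInputVars.convert_taints`, and `AWSNodeGroupInputVars.convert_taints` above:

    import re

    from nebari import schema

    # String form accepted by NodeGroup.validate_taint_strings.
    match = re.compile(r"(\w+)=(\w+):(\w+)").match("dedicated=worker:NoSchedule")
    key, value, effect = match.groups()
    taint = schema.Taint(key=key, value=value, effect=effect)

    # Azure: rendered back to a "key=value:Effect" string.
    azure_taint = f"{taint.key}={taint.value}:{taint.effect.value}"

    # AWS: rendered to a dict using the EKS effect spelling.
    aws_effect = {
        schema.TaintEffectEnum.NoSchedule: "NO_SCHEDULE",
        schema.TaintEffectEnum.PreferNoSchedule: "PREFER_NO_SCHEDULE",
        schema.TaintEffectEnum.NoExecute: "NO_EXECUTE",
    }[taint.effect]
    aws_taint = {"key": taint.key, "value": taint.value, "effect": aws_effect}

    # GCP: the Taint objects pass through unchanged
    # (GCPNodeGroupInputVars.node_taints).

Keeping `schema.Taint` as the canonical representation and converting only at the input-variable boundary keeps each provider's spelling quirks out of the shared schema.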