From a6d61dda7787272903281d6acec210403c8f590a Mon Sep 17 00:00:00 2001 From: mclacore Date: Wed, 22 May 2024 13:38:11 -0700 Subject: [PATCH] Intermittently working :( --- README.md | 9 +++++---- core-services/nvidia_gpu.tf | 18 ++++++++++++------ massdriver.yaml | 16 +++++++++------- src/iam.tf | 5 +++++ src/main.tf | 27 +++++++++++++++++++++++++-- 5 files changed, 56 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 7e43d2f..6c07aa4 100644 --- a/README.md +++ b/README.md @@ -94,10 +94,11 @@ Form input parameters for configuring a bundle for deployment. - T3 Medium (2 vCPUs for a 4h 48m burst, 4.0 GiB) - T3 Large (2 vCPUs for a 7h 12m burst, 8.0 GiB) - T3 Extra Large (4 vCPUs for a 9h 36m burst, 16.0 GiB) - - T3 Double Extra Large (8 vCPUs for a 9h 36m burst, 32.0 GiB) - - P2 General Purpose GPU Extra Large (4 vCPUs, 61.0 GiB) - - P2 General Purpose GPU Eight Extra Large (32 vCPUs, 488.0 GiB) - - P2 General Purpose GPU 16xlarge (64 vCPUs, 732.0 GiB) + - T3 2XL (8 vCPUs for a 9h 36m burst, 32.0 GiB) + - P3 2XL (1 GPU, 16 GiB GPU Mem, 8 vCPUs, 61.0 GiB Mem) + - P3 8XL (4 GPUs, 64 GiB GPU Mem, 32 vCPUs, 244.0 GiB Mem) + - P3 16XL (8 GPUs, 128 GiB GPU Mem, 64 vCPUs, 488.0 GiB) + - P3dn 24XL (8 GPUs, 256 GiB GPU Mem, 96 vCPUs, 768.0 GiB, 2 x 900 NVMe SSD) - **`max_size`** *(integer)*: Maximum number of instances in the node group. Minimum: `0`. Default: `10`. - **`min_size`** *(integer)*: Minimum number of instances in the node group. Minimum: `0`. Default: `1`. - **`name_suffix`** *(string)*: The name of the node group. Default: ``. diff --git a/core-services/nvidia_gpu.tf b/core-services/nvidia_gpu.tf index 34fa82b..30ff1fe 100644 --- a/core-services/nvidia_gpu.tf +++ b/core-services/nvidia_gpu.tf @@ -1,5 +1,7 @@ locals { - has_gpu = length(coalesce([for ng in var.node_groups : length(regexall("^p[0-9]\\..*", ng.instance_type)) > 1 ? 
"gpu" : ""])) > 0 + gpu_regex = "^(p[0-9][a-z]*|g[0-9+][a-z]*|trn[0-9][a-z]*|inf[0-9]|dl[0-9][a-z]*|f[0-9]|vt[0-9])\\..*" + has_gpu = length(coalesce([for ng in var.node_groups : length(regexall(local.gpu_regex, ng.instance_type)) > 1 ? "gpu" : ""])) > 0 + is_gpu = [for ng in var.node_groups : ng.instance_type if length(regexall(local.gpu_regex, ng.instance_type)) > 0] } resource "kubernetes_daemonset" "nvidia" { @@ -30,14 +32,15 @@ resource "kubernetes_daemonset" "nvidia" { } } spec { + priority_class_name = "system-node-critical" affinity { node_affinity { required_during_scheduling_ignored_during_execution { node_selector_term { match_expressions { - key = "accelerator" + key = "node.kubernetes.io/instance-type" operator = "In" - values = ["nvidia"] + values = local.is_gpu } } } @@ -60,10 +63,13 @@ resource "kubernetes_daemonset" "nvidia" { } container { name = "nvidia-device-plugin-ctr" - image = "nvcr.io/nvidia/k8s-device-plugin:v0.9.0" - args = ["--fail-on-init-error=false"] + image = "nvcr.io/nvidia/k8s-device-plugin:v0.15.0" + env { + name = "FAIL_ON_INIT_ERROR" + value = "false" + } security_context { - privileged = false + privileged = true capabilities { drop = ["all"] } diff --git a/massdriver.yaml b/massdriver.yaml index 302031a..db1857a 100644 --- a/massdriver.yaml +++ b/massdriver.yaml @@ -196,14 +196,16 @@ params: const: t3.large - title: T3 Extra Large (4 vCPUs for a 9h 36m burst, 16.0 GiB) const: t3.xlarge - - title: T3 Double Extra Large (8 vCPUs for a 9h 36m burst, 32.0 GiB) + - title: T3 2XL (8 vCPUs for a 9h 36m burst, 32.0 GiB) const: t3.2xlarge - - title: P2 General Purpose GPU Extra Large (4 vCPUs, 61.0 GiB) - const: p2.xlarge - - title: P2 General Purpose GPU Eight Extra Large (32 vCPUs, 488.0 GiB) - const: p2.8xlarge - - title: P2 General Purpose GPU 16xlarge (64 vCPUs, 732.0 GiB) - const: p2.16xlarge + - title: P3 2XL (1 GPU, 16 GiB GPU Mem, 8 vCPUs, 61.0 GiB Mem) + const: p3.2xlarge + - title: P3 8XL (4 GPUs, 64 GiB GPU Mem, 32 vCPUs, 
244.0 GiB Mem) + const: p3.8xlarge + - title: P3 16XL (8 GPUs, 128 GiB GPU Mem, 64 vCPUs, 488.0 GiB) + const: p3.16xlarge + - title: P3dn 24XL (8 GPUs, 256 GiB GPU Mem, 96 vCPUs, 768.0 GiB, 2 x 900 NVMe SSD) + const: p3dn.24xlarge advanced_configuration_enabled: type: boolean title: Advanced Configuration Enabled diff --git a/src/iam.tf b/src/iam.tf index 735e6a2..ffb6466 100644 --- a/src/iam.tf +++ b/src/iam.tf @@ -87,3 +87,8 @@ resource "aws_iam_role_policy_attachment" "node-ecr" { policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" role = aws_iam_role.node.name } + +resource "aws_iam_role_policy_attachment" "node-ssm" { + policy_arn = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + role = aws_iam_role.node.name +} diff --git a/src/main.tf b/src/main.tf index c76314a..449e42b 100644 --- a/src/main.tf +++ b/src/main.tf @@ -3,9 +3,22 @@ locals { private_subnet_ids = [for subnet in var.vpc.data.infrastructure.private_subnets : element(split("/", subnet["arn"]), 1)] subnet_ids = concat(local.public_subnet_ids, local.private_subnet_ids) + gpu_regex = "^(p[0-9][a-z]*|g[0-9][a-z]*|trn[0-9][a-z]*|inf[0-9]|dl[0-9][a-z]*|f[0-9]|vt[0-9])\\..*" + is_gpu_instance = { for ng in var.node_groups : ng.name_suffix => length(regexall(local.gpu_regex, ng.instance_type)) > 0 } + has_gpu_instance = contains(values(local.is_gpu_instance), true) + cluster_name = var.md_metadata.name_prefix } data "aws_ssm_parameter" "eks_ami" { name = "/aws/service/eks/optimized-ami/${var.k8s_version}/amazon-linux-2/recommended/image_id" } data "aws_ssm_parameter" "eks_gpu_ami" { count = local.has_gpu_instance ? 
1 : 0 + name = "/aws/service/eks/optimized-ami/${var.k8s_version}/amazon-linux-2-gpu/recommended/image_id" +} + resource "aws_eks_cluster" "cluster" { name = local.cluster_name role_arn = aws_iam_role.cluster.arn @@ -42,10 +55,10 @@ resource "aws_eks_node_group" "node_group" { for_each = { for ng in var.node_groups : ng.name_suffix => ng } node_group_name = "${local.cluster_name}-${each.value.name_suffix}" cluster_name = local.cluster_name - version = var.k8s_version subnet_ids = local.private_subnet_ids node_role_arn = aws_iam_role.node.arn instance_types = [each.value.instance_type] + ami_type = "CUSTOM" launch_template { id = aws_launch_template.nodes[each.key].id @@ -59,7 +72,7 @@ resource "aws_eks_node_group" "node_group" { } dynamic "taint" { - for_each = length(regexall("^p[0-9]\\..*", each.value.instance_type)) > 0 ? toset(["gpu"]) : toset([]) + for_each = length(regexall(local.gpu_regex, each.value.instance_type)) > 0 ? toset(["gpu"]) : toset([]) content { key = "sku" value = "gpu" @@ -95,6 +108,16 @@ resource "aws_launch_template" "nodes" { update_default_version = true + image_id = local.is_gpu_instance[each.key] ? data.aws_ssm_parameter.eks_gpu_ami[0].value : data.aws_ssm_parameter.eks_ami.value + + user_data = base64encode( + <