Skip to content

Commit

Permalink
Intermittently working :(
Browse files Browse the repository at this point in the history
  • Loading branch information
mclacore committed May 22, 2024
1 parent 2f08291 commit a6d61dd
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 19 deletions.
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,11 @@ Form input parameters for configuring a bundle for deployment.
- T3 Medium (2 vCPUs for a 4h 48m burst, 4.0 GiB)
- T3 Large (2 vCPUs for a 7h 12m burst, 8.0 GiB)
- T3 Extra Large (4 vCPUs for a 9h 36m burst, 16.0 GiB)
- T3 Double Extra Large (8 vCPUs for a 9h 36m burst, 32.0 GiB)
- P2 General Purpose GPU Extra Large (4 vCPUs, 61.0 GiB)
- P2 General Purpose GPU Eight Extra Large (32 vCPUs, 488.0 GiB)
- P2 General Purpose GPU 16xlarge (64 vCPUs, 732.0 GiB)
- T3 2XL (8 vCPUs for a 9h 36m burst, 32.0 GiB)
- P3 2XL (1 GPU, 16 GiB GPU Mem, 8 vCPUs, 61.0 GiB Mem)
- P3 8XL (4 GPUs, 64 GiB GPU Mem, 32 vCPUs, 244.0 GiB Mem)
- P3 16XL (8 GPUs, 128 GiB GPU Mem, 64 vCPUs, 488.0 GiB Mem)
- P3dn 24XL (8 GPUs, 256 GiB GPU Mem, 96 vCPUs, 768.0 GiB, 2 x 900 NVMe SSD)
- **`max_size`** *(integer)*: Maximum number of instances in the node group. Minimum: `0`. Default: `10`.
- **`min_size`** *(integer)*: Minimum number of instances in the node group. Minimum: `0`. Default: `1`.
- **`name_suffix`** *(string)*: The name of the node group. Default: ``.
Expand Down
18 changes: 12 additions & 6 deletions core-services/nvidia_gpu.tf
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
locals {
  # Matches AWS accelerated instance families at the start of an instance
  # type string (p*, g*, trn*, inf*, dl*, f*, vt*), e.g. "p3.2xlarge",
  # "g4dn.xlarge". Kept byte-identical to the copy in src/main.tf so the
  # two files classify node groups the same way.
  gpu_regex = "^(p[0-9][a-z]*|g[0-9+][a-z]*|trn[0-9][a-z]*|inf[0-9]|dl[0-9][a-z]*|f[0-9]|vt[0-9])\\..*"

  # Instance types of every node group that requests GPU hardware; used as
  # the node-affinity value list for the NVIDIA device-plugin daemonset.
  is_gpu = [for ng in var.node_groups : ng.instance_type if length(regexall(local.gpu_regex, ng.instance_type)) > 0]

  # True when at least one node group uses a GPU instance type.
  # Bug fix: the previous expression wrapped the comprehension in
  # coalesce() (which returns its single list argument unchanged) and
  # compared the per-string match count with "> 1" (regexall yields at
  # most one match here), so it never detected GPUs reliably — the
  # likely cause of the "intermittently working" behavior. Deriving it
  # from local.is_gpu makes the two locals agree by construction.
  has_gpu = length(local.is_gpu) > 0
}

resource "kubernetes_daemonset" "nvidia" {
Expand Down Expand Up @@ -30,14 +32,15 @@ resource "kubernetes_daemonset" "nvidia" {
}
}
spec {
priority_class_name = "system-node-critical"
affinity {
node_affinity {
required_during_scheduling_ignored_during_execution {
node_selector_term {
match_expressions {
key = "accelerator"
key = "node.kubernetes.io/instance-type"
operator = "In"
values = ["nvidia"]
values = local.is_gpu
}
}
}
Expand All @@ -60,10 +63,13 @@ resource "kubernetes_daemonset" "nvidia" {
}
container {
name = "nvidia-device-plugin-ctr"
image = "nvcr.io/nvidia/k8s-device-plugin:v0.9.0"
args = ["--fail-on-init-error=false"]
image = "nvcr.io/nvidia/k8s-device-plugin:v0.15.0"
env {
name = "FAIL_ON_INIT_ERROR"
value = "false"
}
security_context {
privileged = false
privileged = true
capabilities {
drop = ["all"]
}
Expand Down
16 changes: 9 additions & 7 deletions massdriver.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -196,14 +196,16 @@ params:
const: t3.large
- title: T3 Extra Large (4 vCPUs for a 9h 36m burst, 16.0 GiB)
const: t3.xlarge
- title: T3 Double Extra Large (8 vCPUs for a 9h 36m burst, 32.0 GiB)
- title: T3 2XL (8 vCPUs for a 9h 36m burst, 32.0 GiB)
const: t3.2xlarge
- title: P2 General Purpose GPU Extra Large (4 vCPUs, 61.0 GiB)
const: p2.xlarge
- title: P2 General Purpose GPU Eight Extra Large (32 vCPUs, 488.0 GiB)
const: p2.8xlarge
- title: P2 General Purpose GPU 16xlarge (64 vCPUs, 732.0 GiB)
const: p2.16xlarge
- title: P3 2XL (1 GPU, 16 GiB GPU Mem, 8 vCPUs, 61.0 GiB Mem)
const: p3.2xlarge
- title: P3 8XL (4 GPUs, 64 GiB GPU Mem, 32 vCPUs, 244.0 GiB Mem)
const: p3.8xlarge
- title: P3 16XL (8 GPUs, 128 GiB GPU Mem, 64 vCPUs, 488.0 GiB Mem)
const: p3.16xlarge
- title: P3dn 24XL (8 GPUs, 256 GiB GPU Mem, 96 vCPUs, 768.0 GiB, 2 x 900 NVMe SSD)
const: p3dn.24xlarge
advanced_configuration_enabled:
type: boolean
title: Advanced Configuration Enabled
Expand Down
5 changes: 5 additions & 0 deletions src/iam.tf
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,8 @@ resource "aws_iam_role_policy_attachment" "node-ecr" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
role = aws_iam_role.node.name
}

# Grants worker nodes the AWS-managed policy required to register with
# Systems Manager (Session Manager shell access, inventory, patching),
# avoiding the need for SSH key distribution to nodes.
resource "aws_iam_role_policy_attachment" "node-ssm" {
policy_arn = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
role = aws_iam_role.node.name
}
27 changes: 25 additions & 2 deletions src/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,22 @@ locals {
private_subnet_ids = [for subnet in var.vpc.data.infrastructure.private_subnets : element(split("/", subnet["arn"]), 1)]
subnet_ids = concat(local.public_subnet_ids, local.private_subnet_ids)

gpu_regex = "^(p[0-9][a-z]*|g[0-9+][a-z]*|trn[0-9][a-z]*|inf[0-9]|dl[0-9][a-z]*|f[0-9]|vt[0-9])\\..*"
is_gpu_instance = { for ng in var.node_groups : ng.name_suffix => length(regexall(local.gpu_regex, ng.instance_type)) > 0 }
has_gpu_instance = contains(values(local.is_gpu_instance), true)

cluster_name = var.md_metadata.name_prefix
}

# Resolves the latest EKS-optimized Amazon Linux 2 AMI for the cluster's
# Kubernetes version via the public SSM parameter that AWS maintains.
# Used as the launch-template image for non-GPU node groups.
data "aws_ssm_parameter" "eks_ami" {
name = "/aws/service/eks/optimized-ami/${var.k8s_version}/amazon-linux-2/recommended/image_id"
}

# GPU variant of the EKS-optimized AL2 AMI (the "-gpu" flavor, which
# ships with NVIDIA drivers preinstalled). Looked up only when at least
# one node group uses a GPU instance type, so non-GPU clusters make no
# extra SSM call.
data "aws_ssm_parameter" "eks_gpu_ami" {
count = local.has_gpu_instance ? 1 : 0
name = "/aws/service/eks/optimized-ami/${var.k8s_version}/amazon-linux-2-gpu/recommended/image_id"
}

resource "aws_eks_cluster" "cluster" {
name = local.cluster_name
role_arn = aws_iam_role.cluster.arn
Expand Down Expand Up @@ -42,10 +55,10 @@ resource "aws_eks_node_group" "node_group" {
for_each = { for ng in var.node_groups : ng.name_suffix => ng }
node_group_name = "${local.cluster_name}-${each.value.name_suffix}"
cluster_name = local.cluster_name
version = var.k8s_version
subnet_ids = local.private_subnet_ids
node_role_arn = aws_iam_role.node.arn
instance_types = [each.value.instance_type]
ami_type = "CUSTOM"

launch_template {
id = aws_launch_template.nodes[each.key].id
Expand All @@ -59,7 +72,7 @@ resource "aws_eks_node_group" "node_group" {
}

dynamic "taint" {
for_each = length(regexall("^p[0-9]\\..*", each.value.instance_type)) > 0 ? toset(["gpu"]) : toset([])
for_each = length(regexall(local.gpu_regex, each.value.instance_type)) > 0 ? toset(["gpu"]) : toset([])
content {
key = "sku"
value = "gpu"
Expand Down Expand Up @@ -95,6 +108,16 @@ resource "aws_launch_template" "nodes" {

update_default_version = true

image_id = local.is_gpu_instance[each.key] ? data.aws_ssm_parameter.eks_gpu_ami[0].value : data.aws_ssm_parameter.eks_ami.value

user_data = base64encode(
<<EOF
#!/bin/bash
set -o xtrace
/etc/eks/bootstrap.sh ${local.cluster_name} --kubelet-extra-args '--node-labels=node.kubernetes.io/instancegroup=${each.key}'
EOF
)

metadata_options {
http_endpoint = "enabled"
http_tokens = "required"
Expand Down

0 comments on commit a6d61dd

Please sign in to comment.