diff --git a/framework/API.md b/framework/API.md
index 5ebfbcffc..51bcb606f 100644
--- a/framework/API.md
+++ b/framework/API.md
@@ -19703,11 +19703,11 @@ The list of supported Karpenter versions as defined [here](https://github.com/aw
| **Name** | **Description** |
| --- | --- |
-| V0_37_0
| *No description.* |
+| V1_0_1
| *No description.* |
---
-##### `V0_37_0`
+##### `V1_0_1`
---
diff --git a/framework/src/processing/lib/karpenter-releases.ts b/framework/src/processing/lib/karpenter-releases.ts
index fcd8ed4d2..d16fef04f 100644
--- a/framework/src/processing/lib/karpenter-releases.ts
+++ b/framework/src/processing/lib/karpenter-releases.ts
@@ -6,7 +6,7 @@
- * At this time only v0.37.0 is supported.
+ * At this time only v1.0.1 is supported.
*/
export enum KarpenterVersion {
- V0_37_0 = '0.37.0',
+ V1_0_1 = '1.0.1',
}
-export const DEFAULT_KARPENTER_VERSION: KarpenterVersion = KarpenterVersion.V0_37_0;
+export const DEFAULT_KARPENTER_VERSION: KarpenterVersion = KarpenterVersion.V1_0_1;
diff --git a/framework/src/processing/lib/spark-runtime/emr-containers/eks-karpenter-helpers.ts b/framework/src/processing/lib/spark-runtime/emr-containers/eks-karpenter-helpers.ts
index f5ecabcf1..f0496d2b1 100644
--- a/framework/src/processing/lib/spark-runtime/emr-containers/eks-karpenter-helpers.ts
+++ b/framework/src/processing/lib/spark-runtime/emr-containers/eks-karpenter-helpers.ts
@@ -183,43 +183,45 @@ export function karpenterSetup(cluster: ICluster,
actions: ['ec2:RunInstances', 'ec2:CreateFleet'],
});
- const allowScopedEC2InstanceActionsWithTags: PolicyStatement = new PolicyStatement({
+ const allowScopedEC2LaunchTemplateAccessActions = new PolicyStatement({
+ sid: 'AllowScopedEC2LaunchTemplateAccessActions',
effect: Effect.ALLOW,
- resources: [
- `arn:aws:ec2:${Stack.of(scope).region}:*:fleet/*`,
- `arn:aws:ec2:${Stack.of(scope).region}:*:instance/*`,
- `arn:aws:ec2:${Stack.of(scope).region}:*:volume/*`,
- `arn:aws:ec2:${Stack.of(scope).region}:*:network-interface/*`,
- `arn:aws:ec2:${Stack.of(scope).region}:*:launch-template/*`,
- `arn:aws:ec2:${Stack.of(scope).region}:*:spot-instances-request`,
+ resources: [`arn:${Stack.of(scope).partition}:ec2:${Stack.of(scope).region}:*:launch-template/*`],
+ actions: [
+ 'ec2:RunInstances',
+ 'ec2:CreateFleet',
],
- actions: ['ec2:RunInstances', 'ec2:CreateFleet', 'ec2:CreateLaunchTemplate'],
conditions: {
StringEquals: {
- [`aws:RequestTag/kubernetes.io/cluster/${clusterName}`]: 'owned',
+ [`aws:ResourceTag/kubernetes.io/cluster/${clusterName}`]: 'owned',
},
StringLike: {
- 'aws:RequestTag/karpenter.sh/nodepool': '*',
+ 'aws:ResourceTag/karpenter.sh/nodepool': '*',
},
},
});
- const allowScopedResourceCreationTagging: PolicyStatement = new PolicyStatement({
+ const allowScopedResourceCreationTagging = new PolicyStatement({
sid: 'AllowScopedResourceCreationTagging',
effect: Effect.ALLOW,
resources: [
- `arn:aws:ec2:${Stack.of(scope).region}:*:fleet/*`,
- `arn:aws:ec2:${Stack.of(scope).region}:*:instance/*`,
- `arn:aws:ec2:${Stack.of(scope).region}:*:volume/*`,
- `arn:aws:ec2:${Stack.of(scope).region}:*:network-interface/*`,
- `arn:aws:ec2:${Stack.of(scope).region}:*:launch-template/*`,
- `arn:aws:ec2:${Stack.of(scope).region}:*:spot-instances-request`,
+ `arn:${Stack.of(scope).partition}:ec2:${Stack.of(scope).region}:*:fleet/*`,
+ `arn:${Stack.of(scope).partition}:ec2:${Stack.of(scope).region}:*:instance/*`,
+ `arn:${Stack.of(scope).partition}:ec2:${Stack.of(scope).region}:*:volume/*`,
+ `arn:${Stack.of(scope).partition}:ec2:${Stack.of(scope).region}:*:network-interface/*`,
+ `arn:${Stack.of(scope).partition}:ec2:${Stack.of(scope).region}:*:launch-template/*`,
+ `arn:${Stack.of(scope).partition}:ec2:${Stack.of(scope).region}:*:spot-instances-request/*`,
],
actions: ['ec2:CreateTags'],
conditions: {
StringEquals: {
[`aws:RequestTag/kubernetes.io/cluster/${clusterName}`]: 'owned',
- 'ec2:CreateAction': ['RunInstances', 'CreateFleet', 'CreateLaunchTemplate'],
+ 'aws:RequestTag/eks:eks-cluster-name': clusterName,
+ 'ec2:CreateAction': [
+ 'RunInstances',
+ 'CreateFleet',
+ 'CreateLaunchTemplate',
+ ],
},
StringLike: {
'aws:RequestTag/karpenter.sh/nodepool': '*',
@@ -227,12 +229,11 @@ export function karpenterSetup(cluster: ICluster,
},
});
- const allowScopedResourceTagging: PolicyStatement = new PolicyStatement({
- sid: 'allowScopedResourceTagging',
+
+ const allowScopedResourceTagging = new PolicyStatement({
+ sid: 'AllowScopedResourceTagging',
effect: Effect.ALLOW,
- resources: [
- `arn:aws:ec2:${Stack.of(scope).region}:*:instance/*`,
- ],
+ resources: [`arn:${Stack.of(scope).partition}:ec2:${Stack.of(scope).region}:*:instance/*`],
actions: ['ec2:CreateTags'],
conditions: {
'StringEquals': {
@@ -241,8 +242,12 @@ export function karpenterSetup(cluster: ICluster,
'StringLike': {
'aws:ResourceTag/karpenter.sh/nodepool': '*',
},
+ 'StringEqualsIfExists': {
+ 'aws:RequestTag/eks:eks-cluster-name': clusterName,
+ },
'ForAllValues:StringEquals': {
'aws:TagKeys': [
+ 'eks:eks-cluster-name',
'karpenter.sh/nodeclaim',
'Name',
],
@@ -250,14 +255,18 @@ export function karpenterSetup(cluster: ICluster,
},
});
- const allowScopedDeletion: PolicyStatement = new PolicyStatement({
+
+ const allowScopedDeletion = new PolicyStatement({
sid: 'AllowScopedDeletion',
effect: Effect.ALLOW,
resources: [
- `arn:aws:ec2:${Stack.of(scope).region}:*:instance/*`,
- `arn:aws:ec2:${Stack.of(scope).region}:*:launch-template/*`,
+ `arn:${Stack.of(scope).partition}:ec2:${Stack.of(scope).region}:*:instance/*`,
+ `arn:${Stack.of(scope).partition}:ec2:${Stack.of(scope).region}:*:launch-template/*`,
+ ],
+ actions: [
+ 'ec2:TerminateInstances',
+ 'ec2:DeleteLaunchTemplate',
],
- actions: ['ec2:TerminateInstances', 'ec2:DeleteLaunchTemplate'],
conditions: {
StringEquals: {
[`aws:ResourceTag/kubernetes.io/cluster/${clusterName}`]: 'owned',
@@ -268,6 +277,29 @@ export function karpenterSetup(cluster: ICluster,
},
});
+
+ const allowScopedEC2InstanceActionsWithTags: PolicyStatement = new PolicyStatement({
+ effect: Effect.ALLOW,
+ resources: [
+      `arn:${Stack.of(scope).partition}:ec2:${Stack.of(scope).region}:*:fleet/*`,
+      `arn:${Stack.of(scope).partition}:ec2:${Stack.of(scope).region}:*:instance/*`,
+      `arn:${Stack.of(scope).partition}:ec2:${Stack.of(scope).region}:*:volume/*`,
+      `arn:${Stack.of(scope).partition}:ec2:${Stack.of(scope).region}:*:network-interface/*`,
+      `arn:${Stack.of(scope).partition}:ec2:${Stack.of(scope).region}:*:launch-template/*`,
+      `arn:${Stack.of(scope).partition}:ec2:${Stack.of(scope).region}:*:spot-instances-request/*`,
+ ],
+ actions: ['ec2:RunInstances', 'ec2:CreateFleet', 'ec2:CreateLaunchTemplate'],
+ conditions: {
+ StringEquals: {
+ [`aws:RequestTag/kubernetes.io/cluster/${clusterName}`]: 'owned',
+ 'aws:RequestTag/eks:eks-cluster-name': clusterName,
+ },
+ StringLike: {
+ 'aws:RequestTag/karpenter.sh/nodepool': '*',
+ },
+ },
+ });
+
const allowPassingInstanceRole: PolicyStatement = new PolicyStatement({
effect: Effect.ALLOW,
actions: ['iam:PassRole'],
@@ -293,22 +325,16 @@ export function karpenterSetup(cluster: ICluster,
actions: ['eks:DescribeCluster'],
});
- const allowInstanceProfileReadActions: PolicyStatement = new PolicyStatement({
- sid: 'AllowInstanceProfileReadActions',
- effect: Effect.ALLOW,
- resources: ['*'],
- actions: ['iam:GetInstanceProfile'],
- });
-
- const allowScopedInstanceProfileCreationActions: PolicyStatement = new PolicyStatement({
+ const allowScopedInstanceProfileCreationActions = new PolicyStatement({
sid: 'AllowScopedInstanceProfileCreationActions',
effect: Effect.ALLOW,
- resources: ['*'],
+ resources: [`arn:${Stack.of(scope).partition}:iam::${Stack.of(scope).account}:instance-profile/*`],
actions: ['iam:CreateInstanceProfile'],
conditions: {
StringEquals: {
[`aws:RequestTag/kubernetes.io/cluster/${clusterName}`]: 'owned',
- 'aws:RequestTag/topology.kubernetes.io/region': `${Stack.of(scope).region}`,
+ 'aws:RequestTag/eks:eks-cluster-name': clusterName,
+ 'aws:RequestTag/topology.kubernetes.io/region': Stack.of(scope).region,
},
StringLike: {
'aws:RequestTag/karpenter.k8s.aws/ec2nodeclass': '*',
@@ -316,17 +342,18 @@ export function karpenterSetup(cluster: ICluster,
},
});
- const allowScopedInstanceProfileTagActions: PolicyStatement = new PolicyStatement({
+ const allowScopedInstanceProfileTagActions = new PolicyStatement({
sid: 'AllowScopedInstanceProfileTagActions',
effect: Effect.ALLOW,
- resources: ['*'],
+ resources: [`arn:${Stack.of(scope).partition}:iam::${Stack.of(scope).account}:instance-profile/*`],
actions: ['iam:TagInstanceProfile'],
conditions: {
StringEquals: {
[`aws:ResourceTag/kubernetes.io/cluster/${clusterName}`]: 'owned',
- 'aws:ResourceTag/topology.kubernetes.io/region': `${Stack.of(scope).region}`,
+ 'aws:ResourceTag/topology.kubernetes.io/region': Stack.of(scope).region,
[`aws:RequestTag/kubernetes.io/cluster/${clusterName}`]: 'owned',
- 'aws:RequestTag/topology.kubernetes.io/region': `${Stack.of(scope).region}`,
+ 'aws:RequestTag/eks:eks-cluster-name': clusterName,
+ 'aws:RequestTag/topology.kubernetes.io/region': Stack.of(scope).region,
},
StringLike: {
'aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass': '*',
@@ -335,15 +362,19 @@ export function karpenterSetup(cluster: ICluster,
},
});
- const allowScopedInstanceProfileActions: PolicyStatement = new PolicyStatement({
+ const allowScopedInstanceProfileActions = new PolicyStatement({
sid: 'AllowScopedInstanceProfileActions',
effect: Effect.ALLOW,
- resources: ['*'],
- actions: ['iam:AddRoleToInstanceProfile', 'iam:RemoveRoleFromInstanceProfile', 'iam:DeleteInstanceProfile'],
+ resources: [`arn:${Stack.of(scope).partition}:iam::${Stack.of(scope).account}:instance-profile/*`],
+ actions: [
+ 'iam:AddRoleToInstanceProfile',
+ 'iam:RemoveRoleFromInstanceProfile',
+ 'iam:DeleteInstanceProfile',
+ ],
conditions: {
StringEquals: {
[`aws:ResourceTag/kubernetes.io/cluster/${clusterName}`]: 'owned',
- 'aws:ResourceTag/topology.kubernetes.io/region': `${Stack.of(scope).region}`,
+ 'aws:ResourceTag/topology.kubernetes.io/region': Stack.of(scope).region,
},
StringLike: {
'aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass': '*',
@@ -351,6 +382,13 @@ export function karpenterSetup(cluster: ICluster,
},
});
+ const allowInstanceProfileReadActions = new PolicyStatement({
+ sid: 'AllowInstanceProfileReadActions',
+ effect: Effect.ALLOW,
+ resources: [`arn:${Stack.of(scope).partition}:iam::${Stack.of(scope).account}:instance-profile/*`],
+ actions: ['iam:GetInstanceProfile'],
+ });
+
const karpenterNS = cluster.addManifest('karpenterNS', {
apiVersion: 'v1',
@@ -380,6 +418,7 @@ export function karpenterSetup(cluster: ICluster,
karpenterAccount.addToPrincipalPolicy(allowAPIServerEndpointDiscovery);
karpenterAccount.addToPrincipalPolicy(allowInstanceProfileReadActions);
karpenterAccount.addToPrincipalPolicy(allowRegionalReadActions);
+ karpenterAccount.addToPrincipalPolicy(allowScopedEC2LaunchTemplateAccessActions);
//Deploy Karpenter Chart
const karpenterChart = cluster.addHelmChart('KarpenterHelmChart', {
diff --git a/framework/src/processing/lib/spark-runtime/emr-containers/resources/k8s/karpenter-provisioner-config/1.0.1/Dockerfile-nvme-raid0-mount b/framework/src/processing/lib/spark-runtime/emr-containers/resources/k8s/karpenter-provisioner-config/1.0.1/Dockerfile-nvme-raid0-mount
new file mode 100644
index 000000000..8c20f2f1f
--- /dev/null
+++ b/framework/src/processing/lib/spark-runtime/emr-containers/resources/k8s/karpenter-provisioner-config/1.0.1/Dockerfile-nvme-raid0-mount
@@ -0,0 +1,8 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+FROM public.ecr.aws/amazonlinux/amazonlinux:2023
+RUN dnf -y install e2fsprogs bash mdadm util-linux
+COPY setup-runtime-storage ./
+RUN chmod +x ./setup-runtime-storage
+ENTRYPOINT ["bash", "setup-runtime-storage"]
\ No newline at end of file
diff --git a/framework/src/processing/lib/spark-runtime/emr-containers/resources/k8s/karpenter-provisioner-config/1.0.1/critical-provisioner.yml b/framework/src/processing/lib/spark-runtime/emr-containers/resources/k8s/karpenter-provisioner-config/1.0.1/critical-provisioner.yml
new file mode 100644
index 000000000..a4cb651a2
--- /dev/null
+++ b/framework/src/processing/lib/spark-runtime/emr-containers/resources/k8s/karpenter-provisioner-config/1.0.1/critical-provisioner.yml
@@ -0,0 +1,102 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: karpenter.sh/v1
+kind: NodePool
+metadata:
+ name: critical-{{az}}
+spec:
+ # References cloud provider-specific custom resource, see your cloud provider specific documentation
+ template:
+ metadata:
+ # Labels are arbitrary key-values that are applied to all nodes
+ labels:
+ role: critical
+ node-lifecycle: on-demand
+
+ spec:
+ nodeClassRef:
+ group: karpenter.k8s.aws
+ kind: EC2NodeClass
+ name: critical-nodes-{{az}}
+
+ expireAfter: 720h
+
+ taints:
+ - key: role
+ value: critical
+ effect: NoSchedule
+
+ # Requirements that constrain the parameters of provisioned nodes.
+ # These requirements are combined with pod.spec.affinity.nodeAffinity rules.
+ # Operators { In, NotIn } are supported to enable including or excluding values
+ requirements:
+ # Include general purpose instance families
+ - key: "karpenter.k8s.aws/instance-family"
+ operator: In
+ values: ["m6gd"]
+ - key: "kubernetes.io/arch"
+ operator: In
+ values: ["arm64"]
+ # Exclude smaller instance sizes
+ - key: "karpenter.k8s.aws/instance-size"
+ operator: NotIn
+ values: [nano, micro, small, medium, large]
+ - key: "karpenter.sh/capacity-type" # If not included, the webhook for the AWS cloud provider will default to on-demand
+ operator: In
+ values: ["on-demand"]
+ - key: "topology.kubernetes.io/zone"
+ operator: In
+ values: ["{{az}}"]
+ - key: "karpenter.k8s.aws/instance-hypervisor"
+ operator: In
+ values: ["nitro"]
+
+ # Resource limits constrain the total size of the cluster.
+ # Limits prevent Karpenter from creating new instances once the limit is exceeded.
+
+ limits:
+ cpu: "3200"
+ memory: 12800Gi
+
+ disruption:
+
+ consolidationPolicy: WhenEmptyOrUnderutilized
+
+ # If omitted, the feature is disabled, nodes will never scale down due to low utilization
+ consolidateAfter: 300s
+
+ # Priority given to the provisioner when the scheduler considers which provisioner
+ # to select. Higher weights indicate higher priority when comparing provisioners.
+ # Specifying no weight is equivalent to specifying a weight of 0.
+ weight: 10
+
+---
+apiVersion: karpenter.k8s.aws/v1
+kind: EC2NodeClass
+metadata:
+ name: critical-nodes-{{az}}
+spec:
+ subnetSelectorTerms:
+ - id: {{subnet-id}}
+ securityGroupSelectorTerms:
+ - tags:
+ kubernetes.io/cluster/{{cluster-name}}: owned
+ tags:
+ KarpenerProvisionerName: "critical"
+
+ role: {{ROLENAME}}
+
+ metadataOptions:
+ httpEndpoint: enabled
+ httpProtocolIPv6: disabled
+ httpPutResponseHopLimit: 2
+ httpTokens: required
+ amiFamily: Bottlerocket
+ amiSelectorTerms:
+ - alias: bottlerocket@latest
+ userData: |
+ [settings.bootstrap-containers.bootstrap-nvme-raid0]
+ source = "{{REPLACE-WITH-IMAGE-ECR}}"
+ mode = "once"
+ essential = true
\ No newline at end of file
diff --git a/framework/src/processing/lib/spark-runtime/emr-containers/resources/k8s/karpenter-provisioner-config/1.0.1/notebook-driver-provisioner.yml b/framework/src/processing/lib/spark-runtime/emr-containers/resources/k8s/karpenter-provisioner-config/1.0.1/notebook-driver-provisioner.yml
new file mode 100644
index 000000000..1317c26e9
--- /dev/null
+++ b/framework/src/processing/lib/spark-runtime/emr-containers/resources/k8s/karpenter-provisioner-config/1.0.1/notebook-driver-provisioner.yml
@@ -0,0 +1,98 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: karpenter.sh/v1
+kind: NodePool
+metadata:
+ name: notebook-driver-{{az}}
+spec:
+ # References cloud provider-specific custom resource, see your cloud provider specific documentation
+ template:
+ metadata:
+ # Labels are arbitrary key-values that are applied to all nodes
+ labels:
+ role: notebook
+ node-lifecycle: on-demand
+ spark-role: driver
+
+ spec:
+ nodeClassRef:
+ group: karpenter.k8s.aws
+ kind: EC2NodeClass
+ name: notebook-driver-nodes-{{az}}
+
+ expireAfter: 720h
+
+ taints:
+ - key: role
+ value: notebook
+ effect: NoSchedule
+
+ # Requirements that constrain the parameters of provisioned nodes.
+ # These requirements are combined with pod.spec.affinity.nodeAffinity rules.
+ # Operators { In, NotIn } are supported to enable including or excluding values
+ requirements:
+ # Include general purpose instance families
+ - key: "karpenter.k8s.aws/instance-family"
+ operator: In
+ values: ["t3", "t3a"]
+ - key: "kubernetes.io/arch"
+ operator: In
+ values: ["amd64"]
+ # Exclude smaller instance sizes
+ - key: "karpenter.k8s.aws/instance-size"
+ operator: NotIn
+ values: [nano, micro, small]
+ - key: "karpenter.sh/capacity-type" # If not included, the webhook for the AWS cloud provider will default to on-demand
+ operator: In
+ values: ["on-demand"]
+ - key: "topology.kubernetes.io/zone"
+ operator: In
+ values: ["{{az}}"]
+ - key: "karpenter.k8s.aws/instance-hypervisor"
+ operator: In
+ values: ["nitro"]
+
+ # Resource limits constrain the total size of the cluster.
+ # Limits prevent Karpenter from creating new instances once the limit is exceeded.
+ limits:
+ cpu: "40"
+ memory: 160Gi
+
+ disruption:
+
+ consolidationPolicy: WhenEmptyOrUnderutilized
+
+ # If omitted, the feature is disabled, nodes will never scale down due to low utilization
+ consolidateAfter: 300s
+
+ # Priority given to the provisioner when the scheduler considers which provisioner
+ # to select. Higher weights indicate higher priority when comparing provisioners.
+ # Specifying no weight is equivalent to specifying a weight of 0.
+ weight: 20
+
+---
+apiVersion: karpenter.k8s.aws/v1
+kind: EC2NodeClass
+metadata:
+ name: notebook-driver-nodes-{{az}}
+spec:
+ amiFamily: Bottlerocket
+ amiSelectorTerms:
+ - alias: bottlerocket@latest
+ subnetSelectorTerms:
+ - id: {{subnet-id}}
+ securityGroupSelectorTerms:
+ - tags:
+ kubernetes.io/cluster/{{cluster-name}}: owned
+ metadataOptions:
+ httpEndpoint: enabled
+ httpProtocolIPv6: disabled
+ httpPutResponseHopLimit: 2
+ httpTokens: required
+
+ tags:
+ KarpenerProvisionerName: "notebook-driver"
+
+ role: {{ROLENAME}}
+
diff --git a/framework/src/processing/lib/spark-runtime/emr-containers/resources/k8s/karpenter-provisioner-config/1.0.1/notebook-executor-provisioner.yml b/framework/src/processing/lib/spark-runtime/emr-containers/resources/k8s/karpenter-provisioner-config/1.0.1/notebook-executor-provisioner.yml
new file mode 100644
index 000000000..b404e60c1
--- /dev/null
+++ b/framework/src/processing/lib/spark-runtime/emr-containers/resources/k8s/karpenter-provisioner-config/1.0.1/notebook-executor-provisioner.yml
@@ -0,0 +1,103 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: karpenter.sh/v1
+kind: NodePool
+metadata:
+ name: notebook-executor-{{az}}
+spec:
+ # References cloud provider-specific custom resource, see your cloud provider specific documentation
+ template:
+ metadata:
+ # Labels are arbitrary key-values that are applied to all nodes
+ labels:
+ role: notebook
+ node-lifecycle: spot
+ spark-role: executor
+
+ spec:
+ nodeClassRef:
+ group: karpenter.k8s.aws
+ kind: EC2NodeClass
+ name: notebook-executor-nodes-{{az}}
+
+ expireAfter: 720h
+
+ taints:
+ - key: role
+ value: notebook
+ effect: NoSchedule
+ - key: node-lifecycle
+ value: spot
+ effect: NoSchedule
+
+ # Requirements that constrain the parameters of provisioned nodes.
+ # These requirements are combined with pod.spec.affinity.nodeAffinity rules.
+ # Operators { In, NotIn } are supported to enable including or excluding values
+ requirements:
+ # Include general purpose instance families
+ - key: "karpenter.k8s.aws/instance-family"
+ operator: In
+ values: ["t3","t3a"]
+ - key: "kubernetes.io/arch"
+ operator: In
+ values: ["amd64"]
+ # Exclude smaller instance sizes
+ - key: "karpenter.k8s.aws/instance-size"
+ operator: NotIn
+ values: [nano, micro, small, medium, large]
+ - key: "karpenter.sh/capacity-type" # If not included, the webhook for the AWS cloud provider will default to on-demand
+ operator: In
+ values: ["spot"]
+ - key: "topology.kubernetes.io/zone"
+ operator: In
+ values: ["{{az}}"]
+ - key: "karpenter.k8s.aws/instance-hypervisor"
+ operator: In
+ values: ["nitro"]
+
+ # Resource limits constrain the total size of the cluster.
+ # Limits prevent Karpenter from creating new instances once the limit is exceeded.
+ limits:
+ cpu: "800"
+ memory: 3200Gi
+
+
+ disruption:
+
+ consolidationPolicy: WhenEmptyOrUnderutilized
+
+ # If omitted, the feature is disabled, nodes will never scale down due to low utilization
+ consolidateAfter: 300s
+
+ # Priority given to the provisioner when the scheduler considers which provisioner
+ # to select. Higher weights indicate higher priority when comparing provisioners.
+ # Specifying no weight is equivalent to specifying a weight of 0.
+ weight: 20
+
+---
+apiVersion: karpenter.k8s.aws/v1
+kind: EC2NodeClass
+metadata:
+ name: notebook-executor-nodes-{{az}}
+spec:
+ amiFamily: Bottlerocket
+ amiSelectorTerms:
+ - alias: bottlerocket@latest
+ subnetSelectorTerms:
+ - id: {{subnet-id}}
+ securityGroupSelectorTerms:
+ - tags:
+ kubernetes.io/cluster/{{cluster-name}}: owned
+
+ metadataOptions:
+ httpEndpoint: enabled
+ httpProtocolIPv6: disabled
+ httpPutResponseHopLimit: 2
+ httpTokens: required
+
+ tags:
+ KarpenerProvisionerName: "notebook-executor"
+
+ role: {{ROLENAME}}
+
diff --git a/framework/src/processing/lib/spark-runtime/emr-containers/resources/k8s/karpenter-provisioner-config/1.0.1/setup-runtime-storage b/framework/src/processing/lib/spark-runtime/emr-containers/resources/k8s/karpenter-provisioner-config/1.0.1/setup-runtime-storage
new file mode 100644
index 000000000..a9cc0111f
--- /dev/null
+++ b/framework/src/processing/lib/spark-runtime/emr-containers/resources/k8s/karpenter-provisioner-config/1.0.1/setup-runtime-storage
@@ -0,0 +1,98 @@
+#!/usr/bin/env bash
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+set -ex
+
+ROOT_PATH="/.bottlerocket/rootfs"
+
+# Symlinks to ephemeral disks are created here by udev
+declare -a EPHEMERAL_DISKS
+EPHEMERAL_DISKS=("${ROOT_PATH}"/dev/disk/ephemeral/*)
+
+# Exit early if there aren't ephemeral disks
+if [ "${#EPHEMERAL_DISKS[@]}" -eq 0 ]; then
+ echo "no ephemeral disks found"
+ exit 1
+fi
+
+MD_NAME="scratch"
+MD_DEVICE="/dev/md/${MD_NAME}"
+MD_CONFIG="/.bottlerocket/bootstrap-containers/current/mdadm.conf"
+
+# Create or assemble the array.
+if [ ! -s "${MD_CONFIG}" ] ; then
+ mdadm --create --force --verbose \
+ "${MD_DEVICE}" \
+ --level=0 \
+ --name="${MD_NAME}" \
+ --raid-devices="${#EPHEMERAL_DISKS[@]}" \
+ "${EPHEMERAL_DISKS[@]}"
+ mdadm --detail --scan > "${MD_CONFIG}"
+else
+ mdadm --assemble --config="${MD_CONFIG}" "${MD_DEVICE}"
+fi
+
+# Format the array if not already formatted.
+if ! blkid --match-token TYPE=ext4 "${MD_DEVICE}" ; then
+ mkfs.ext4 "${MD_DEVICE}"
+fi
+
+MOUNT_POINT="${ROOT_PATH}/mnt/${MD_NAME}"
+
+# Mount the array in the host's /mnt.
+mkdir -p "${MOUNT_POINT}"
+mount "${MD_DEVICE}" "${MOUNT_POINT}"
+
+# Keep track of whether we can unmount the array later. This depends on the
+# version of Bottlerocket.
+should_umount="no"
+
+# Bind state directories to the array, if they exist.
+for state_dir in containerd docker kubelet ; do
+ # The correct next step depends on the version of Bottlerocket, which can be
+ # inferred by inspecting the mounts available to the bootstrap container.
+ if findmnt "${ROOT_PATH}/var/lib/${state_dir}" ; then
+ # For Bottlerocket >= 1.9.0, the state directory can be bind-mounted over
+ # the host directory and the mount will propagate back to the host.
+ mkdir -p "${MOUNT_POINT}/${state_dir}"
+ mount --rbind "${MOUNT_POINT}/${state_dir}" "${ROOT_PATH}/var/lib/${state_dir}"
+ mount --make-rshared "${ROOT_PATH}/var/lib/${state_dir}"
+ should_umount="yes"
+ elif [ ! -L "${ROOT_PATH}/var/lib/${state_dir}" ] ; then
+ # For Bottlerocket < 1.9.0, the host directory needs to be replaced with a
+ # symlink to the state directory on the array. This works but can lead to
+ # unexpected behavior or incompatibilities, for example with CSI drivers.
+ if [ -d "${ROOT_PATH}/var/lib/${state_dir}" ] ; then
+ # The host directory exists but is not a symlink, and might need to be
+ # relocated to the storage array. This depends on whether the host has
+ # been downgraded from a newer version of Bottlerocket, or whether it's
+ # the first boot of an older version.
+ if [ -d "${MOUNT_POINT}/${state_dir}" ] ; then
+ # If downgrading from a version of Bottlerocket that supported bind
+ # mounts, the directory will exist but should be empty, except for
+ # subdirectories that may have been created by tmpfiles.d before an
+ # upgrade to that version. Keep a copy of the directory just in case.
+ rm -rf "${ROOT_PATH}/var/lib/${state_dir}.bak"
+ mv "${ROOT_PATH}/var/lib/${state_dir}"{,.bak}
+ else
+ # Otherwise, treat it as the first boot of an older version, and move
+ # the directory to the array.
+ mv "${ROOT_PATH}/var/lib/${state_dir}" "${MOUNT_POINT}/${state_dir}"
+ fi
+ else
+ # The host directory does not exist, so the target directory likely needs
+ # to be created.
+ mkdir -p "${MOUNT_POINT}/${state_dir}"
+ fi
+ # Any host directory has been dealt with and the symlink can be created.
+ ln -snfT "/mnt/${MD_NAME}/${state_dir}" "${ROOT_PATH}/var/lib/${state_dir}"
+ fi
+done
+
+# When using bind mounts, the parent directory where the array is mounted can
+# be unmounted. This avoids a second, redundant mount entry under `/mnt` for
+# every new mount in one of the state directories.
+if [ "${should_umount}" == "yes" ] ; then
+ umount "${MOUNT_POINT}"
+fi
\ No newline at end of file
diff --git a/framework/src/processing/lib/spark-runtime/emr-containers/resources/k8s/karpenter-provisioner-config/1.0.1/shared-driver-provisioner.yml b/framework/src/processing/lib/spark-runtime/emr-containers/resources/k8s/karpenter-provisioner-config/1.0.1/shared-driver-provisioner.yml
new file mode 100644
index 000000000..a51c7f551
--- /dev/null
+++ b/framework/src/processing/lib/spark-runtime/emr-containers/resources/k8s/karpenter-provisioner-config/1.0.1/shared-driver-provisioner.yml
@@ -0,0 +1,94 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: karpenter.sh/v1
+kind: NodePool
+metadata:
+ name: shared-driver-{{az}}
+spec:
+ # References cloud provider-specific custom resource, see your cloud provider specific documentation
+ template:
+ metadata:
+ # Labels are arbitrary key-values that are applied to all nodes
+ labels:
+ role: shared
+ node-lifecycle: on-demand
+ spark-role: driver
+
+ spec:
+ nodeClassRef:
+ group: karpenter.k8s.aws
+ kind: EC2NodeClass
+ name: shared-driver-nodes-{{az}}
+
+ expireAfter: 720h
+
+ # Requirements that constrain the parameters of provisioned nodes.
+ # These requirements are combined with pod.spec.affinity.nodeAffinity rules.
+ # Operators { In, NotIn } are supported to enable including or excluding values
+ requirements:
+ # Include general purpose instance families
+ - key: "karpenter.k8s.aws/instance-family"
+ operator: In
+ values: ["m6g"]
+ - key: "kubernetes.io/arch"
+ operator: In
+ values: ["arm64"]
+ # Exclude smaller instance sizes
+ - key: "karpenter.k8s.aws/instance-size"
+ operator: NotIn
+ values: [nano, micro, small, medium]
+ - key: "karpenter.sh/capacity-type" # If not included, the webhook for the AWS cloud provider will default to on-demand
+ operator: In
+ values: ["on-demand"]
+ - key: "topology.kubernetes.io/zone"
+ operator: In
+ values: ["{{az}}"]
+ - key: "karpenter.k8s.aws/instance-hypervisor"
+ operator: In
+ values: ["nitro"]
+
+ # Resource limits constrain the total size of the cluster.
+ # Limits prevent Karpenter from creating new instances once the limit is exceeded.
+ limits:
+ cpu: "40"
+ memory: 160Gi
+
+ disruption:
+
+ consolidationPolicy: WhenEmptyOrUnderutilized
+
+ # If omitted, the feature is disabled, nodes will never scale down due to low utilization
+ consolidateAfter: 300s
+
+ # Priority given to the provisioner when the scheduler considers which provisioner
+ # to select. Higher weights indicate higher priority when comparing provisioners.
+ # Specifying no weight is equivalent to specifying a weight of 0.
+ weight: 10
+
+---
+apiVersion: karpenter.k8s.aws/v1
+kind: EC2NodeClass
+metadata:
+ name: shared-driver-nodes-{{az}}
+spec:
+ amiFamily: Bottlerocket
+ amiSelectorTerms:
+ - alias: bottlerocket@latest
+ subnetSelectorTerms:
+ - id: {{subnet-id}}
+ securityGroupSelectorTerms:
+ - tags:
+ kubernetes.io/cluster/{{cluster-name}}: owned
+
+ metadataOptions:
+ httpEndpoint: enabled
+ httpProtocolIPv6: disabled
+ httpPutResponseHopLimit: 2
+ httpTokens: required
+
+ tags:
+ KarpenerProvisionerName: "shared-driver"
+
+ role: {{ROLENAME}}
+
diff --git a/framework/src/processing/lib/spark-runtime/emr-containers/resources/k8s/karpenter-provisioner-config/1.0.1/shared-executor-provisioner.yml b/framework/src/processing/lib/spark-runtime/emr-containers/resources/k8s/karpenter-provisioner-config/1.0.1/shared-executor-provisioner.yml
new file mode 100644
index 000000000..42918c420
--- /dev/null
+++ b/framework/src/processing/lib/spark-runtime/emr-containers/resources/k8s/karpenter-provisioner-config/1.0.1/shared-executor-provisioner.yml
@@ -0,0 +1,98 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: karpenter.sh/v1
+kind: NodePool
+metadata:
+ name: shared-executor-{{az}}
+spec:
+ # References cloud provider-specific custom resource, see your cloud provider specific documentation
+ template:
+ metadata:
+ # Labels are arbitrary key-values that are applied to all nodes
+ labels:
+ role: shared
+ node-lifecycle: spot
+ spark-role: executor
+
+ spec:
+ nodeClassRef:
+ group: karpenter.k8s.aws
+ kind: EC2NodeClass
+ name: shared-executor-nodes-{{az}}
+
+ expireAfter: 720h
+
+ taints:
+ - key: node-lifecycle
+ value: spot
+ effect: NoSchedule
+
+ # Requirements that constrain the parameters of provisioned nodes.
+ # These requirements are combined with pod.spec.affinity.nodeAffinity rules.
+ # Operators { In, NotIn } are supported to enable including or excluding values
+ requirements:
+ # Include general purpose instance families
+ - key: "karpenter.k8s.aws/instance-family"
+ operator: In
+ values: ["m6g", "m6gd"]
+ - key: "kubernetes.io/arch"
+ operator: In
+ values: ["arm64"]
+ # Exclude smaller instance sizes
+ - key: "karpenter.k8s.aws/instance-size"
+ operator: NotIn
+ values: [nano, micro, small, medium, large]
+ - key: "karpenter.sh/capacity-type" # If not included, the webhook for the AWS cloud provider will default to on-demand
+ operator: In
+ values: ["spot"]
+ - key: "topology.kubernetes.io/zone"
+ operator: In
+ values: ["{{az}}"]
+ - key: "karpenter.k8s.aws/instance-hypervisor"
+ operator: In
+ values: ["nitro"]
+
+ # Resource limits constrain the total size of the cluster.
+ # Limits prevent Karpenter from creating new instances once the limit is exceeded.
+ limits:
+ cpu: "3200"
+ memory: 12800Gi
+
+ disruption:
+
+ consolidationPolicy: WhenEmptyOrUnderutilized
+
+ # If omitted, the feature is disabled, nodes will never scale down due to low utilization
+ consolidateAfter: 300s
+
+ # Priority given to the provisioner when the scheduler considers which provisioner
+ # to select. Higher weights indicate higher priority when comparing provisioners.
+ # Specifying no weight is equivalent to specifying a weight of 0.
+ weight: 10
+
+---
+apiVersion: karpenter.k8s.aws/v1
+kind: EC2NodeClass
+metadata:
+ name: shared-executor-nodes-{{az}}
+spec:
+ amiFamily: Bottlerocket
+ amiSelectorTerms:
+ - alias: bottlerocket@latest
+ subnetSelectorTerms:
+ - id: {{subnet-id}}
+ securityGroupSelectorTerms:
+ - tags:
+ kubernetes.io/cluster/{{cluster-name}}: owned
+ metadataOptions:
+ httpEndpoint: enabled
+ httpProtocolIPv6: disabled
+ httpPutResponseHopLimit: 2
+ httpTokens: required
+
+ tags:
+ KarpenerProvisionerName: "shared-executor"
+
+ role: {{ROLENAME}}
+
diff --git a/framework/src/processing/lib/spark-runtime/emr-containers/resources/k8s/karpenter-provisioner-config/1.0.1/tooling-provisioner.yml b/framework/src/processing/lib/spark-runtime/emr-containers/resources/k8s/karpenter-provisioner-config/1.0.1/tooling-provisioner.yml
new file mode 100644
index 000000000..5d967ae01
--- /dev/null
+++ b/framework/src/processing/lib/spark-runtime/emr-containers/resources/k8s/karpenter-provisioner-config/1.0.1/tooling-provisioner.yml
@@ -0,0 +1,87 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: karpenter.sh/v1
+kind: NodePool
+metadata:
+ name: tooling-nodes
+spec:
+ # References cloud provider-specific custom resource, see your cloud provider specific documentation
+ template:
+ metadata:
+ # Labels are arbitrary key-values that are applied to all nodes
+ labels:
+ role: tooling
+
+ spec:
+ nodeClassRef:
+ group: karpenter.k8s.aws
+ kind: EC2NodeClass
+ name: tooling-nodes
+
+ expireAfter: 720h
+ # Requirements that constrain the parameters of provisioned nodes.
+ # These requirements are combined with pod.spec.affinity.nodeAffinity rules.
+ # Operators { In, NotIn } are supported to enable including or excluding values
+ requirements:
+ # Include general purpose instance families
+ - key: "karpenter.k8s.aws/instance-family"
+ operator: In
+ values: ["t3", "t3a"]
+ - key: "kubernetes.io/arch"
+ operator: In
+ values: ["amd64"]
+ # Exclude smaller instance sizes
+ - key: "karpenter.k8s.aws/instance-size"
+ operator: In
+ values: [medium, large, xlarge]
+ - key: "karpenter.sh/capacity-type" # If not included, the webhook for the AWS cloud provider will default to on-demand
+ operator: In
+ values: ["on-demand"]
+ - key: "karpenter.k8s.aws/instance-hypervisor"
+ operator: In
+ values: ["nitro"]
+
+ # Resource limits constrain the total size of the cluster.
+ # Limits prevent Karpenter from creating new instances once the limit is exceeded.
+ limits:
+ cpu: "100"
+ memory: 100Gi
+
+ disruption:
+
+ consolidationPolicy: WhenEmptyOrUnderutilized
+
+ # If omitted, the feature is disabled, nodes will never scale down due to low utilization
+ consolidateAfter: 300s
+
+ # Priority given to the provisioner when the scheduler considers which provisioner
+ # to select. Higher weights indicate higher priority when comparing provisioners.
+ # Specifying no weight is equivalent to specifying a weight of 0.
+ weight: 50
+
+---
+apiVersion: karpenter.k8s.aws/v1
+kind: EC2NodeClass
+metadata:
+ name: tooling-nodes
+spec:
+ amiFamily: Bottlerocket
+ amiSelectorTerms:
+ - alias: bottlerocket@latest
+ subnetSelectorTerms:
+ - id: {{subnet-1}}
+ - id: {{subnet-2}}
+ metadataOptions:
+ httpEndpoint: enabled
+ httpProtocolIPv6: disabled
+ httpPutResponseHopLimit: 2
+ httpTokens: required
+ securityGroupSelectorTerms:
+ - tags:
+ kubernetes.io/cluster/{{cluster-name}}: owned
+ tags:
+ KarpenerProvisionerName: "tooling"
+
+ role: {{ROLENAME}}
+