Skip to content

Commit

Permalink
Update all demo scripts for use on GKE with a k8s 1.31 alpha cluster
Browse files (browse the repository at this point in the history)
Signed-off-by: Kevin Klues <kklues@nvidia.com>
  • Loading branch information
klueska committed Nov 6, 2024
1 parent 32805fe commit 53420fa
Show file tree
Hide file tree
Showing 9 changed files with 48 additions and 7 deletions.
16 changes: 10 additions & 6 deletions demo/clusters/gke/create-cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ DRIVER_NAME=$(from_versions_mk "DRIVER_NAME")

NETWORK_NAME="${DRIVER_NAME}-net"
CLUSTER_NAME="${DRIVER_NAME}-cluster"
NODE_VERSION="1.31.1-gke.2105000"

## Create the Network for the cluster
gcloud compute networks create "${NETWORK_NAME}" \
Expand All @@ -52,16 +53,18 @@ gcloud container clusters create "${CLUSTER_NAME}" \
--no-enable-autorepair \
--no-enable-autoupgrade \
--region us-west1 \
--num-nodes "1" \
--network "${NETWORK_NAME}" \
--node-labels=nvidia.com/dra.controller=true
--cluster-version "${NODE_VERSION}" \
--node-version "${NODE_VERSION}"

# Create t4 node pool
gcloud beta container node-pools create "pool-1" \
--quiet \
--project "${PROJECT_NAME}" \
--cluster "${CLUSTER_NAME}" \
--region "us-west1" \
--node-version "1.27.3-gke.100" \
--node-version "${NODE_VERSION}" \
--machine-type "n1-standard-8" \
--accelerator "type=nvidia-tesla-t4,count=1" \
--image-type "UBUNTU_CONTAINERD" \
Expand All @@ -79,15 +82,15 @@ gcloud beta container node-pools create "pool-1" \
--max-surge-upgrade 1 \
--max-unavailable-upgrade 0 \
--node-locations "us-west1-a" \
--node-labels=gke-no-default-nvidia-gpu-device-plugin=true,nvidia.com/gpu=present,nvidia.com/dra.kubelet-plugin=true
--node-labels=gke-no-default-nvidia-gpu-device-plugin=true,nvidia.com/gpu.present=true

# Create v100 node pool
gcloud beta container node-pools create "pool-2" \
--quiet \
--project "${PROJECT_NAME}" \
--cluster "${CLUSTER_NAME}" \
--region "us-west1" \
--node-version "1.27.3-gke.100" \
--node-version "${NODE_VERSION}" \
--machine-type "n1-standard-8" \
--accelerator "type=nvidia-tesla-v100,count=1" \
--image-type "UBUNTU_CONTAINERD" \
Expand All @@ -105,7 +108,7 @@ gcloud beta container node-pools create "pool-2" \
--max-surge-upgrade 1 \
--max-unavailable-upgrade 0 \
--node-locations "us-west1-a" \
--node-labels=gke-no-default-nvidia-gpu-device-plugin=true,nvidia.com/gpu=present,nvidia.com/dra.kubelet-plugin=true
--node-labels=gke-no-default-nvidia-gpu-device-plugin=true,nvidia.com/gpu.present=true

## Allow the GPU nodes access to the internet
gcloud compute routers create ${NETWORK_NAME}-nat-router \
Expand All @@ -126,10 +129,11 @@ gcloud compute routers nats create "${NETWORK_NAME}-nat-config" \
gcloud container clusters get-credentials "${CLUSTER_NAME}" --location="us-west1"

## Launch the nvidia-driver-installer daemonset to install the GPU drivers on any GPU nodes that come online:
kubectl label node --overwrite -l nvidia.com/gpu.present=true cloud.google.com/gke-gpu-driver-version-
kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/ubuntu/daemonset-preloaded.yaml

## Create the nvidia namespace
kubectl create namespace nvidia

## Deploy a custom daemonset that prepares a node for use with DRA
kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-dra-driver/456d097feb452cca1351817bab2ccd0782e96c9f/demo/prepare-gke-nodes-for-dra.yaml
kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-dra-driver/3498c9a91cb594af94c9e8d65177b131e380e116/demo/prepare-gke-nodes-for-dra.yaml
3 changes: 2 additions & 1 deletion demo/clusters/gke/install-dra-driver.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,15 @@ DRIVER_NAME=$(from_versions_mk "DRIVER_NAME")

: ${IMAGE_REGISTRY:=ghcr.io/nvidia}
: ${IMAGE_NAME:=${DRIVER_NAME}}
: ${IMAGE_TAG:=9323da2d-ubuntu20.04}
: ${IMAGE_TAG:=32805fec-ubi8}

helm upgrade -i --create-namespace --namespace nvidia nvidia-dra-driver ${PROJECT_DIR}/deployments/helm/k8s-dra-driver \
--set image.repository=${IMAGE_REGISTRY}/${IMAGE_NAME} \
--set image.tag=${IMAGE_TAG} \
--set image.pullPolicy=Always \
--set controller.priorityClassName="" \
--set kubeletPlugin.priorityClassName="" \
--set deviceClasses="{gpu,mig}" \
--set nvidiaDriverRoot="/opt/nvidia" \
--set kubeletPlugin.tolerations[0].key=nvidia.com/gpu \
--set kubeletPlugin.tolerations[0].operator=Exists \
Expand Down
4 changes: 4 additions & 0 deletions demo/specs/quickstart/gpu-test-mps.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,7 @@ spec:
resourceClaims:
- name: shared-gpu
resourceClaimTemplateName: shared-gpu
tolerations:
- key: "nvidia.com/gpu"
operator: "Exists"
effect: "NoSchedule"
8 changes: 8 additions & 0 deletions demo/specs/quickstart/gpu-test1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ spec:
resourceClaims:
- name: gpu
resourceClaimTemplateName: single-gpu
tolerations:
- key: "nvidia.com/gpu"
operator: "Exists"
effect: "NoSchedule"

---
apiVersion: v1
Expand All @@ -61,3 +65,7 @@ spec:
resourceClaims:
- name: gpu
resourceClaimTemplateName: single-gpu
tolerations:
- key: "nvidia.com/gpu"
operator: "Exists"
effect: "NoSchedule"
4 changes: 4 additions & 0 deletions demo/specs/quickstart/gpu-test2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,7 @@ spec:
resourceClaims:
- name: shared-gpu
resourceClaimTemplateName: single-gpu
tolerations:
- key: "nvidia.com/gpu"
operator: "Exists"
effect: "NoSchedule"
8 changes: 8 additions & 0 deletions demo/specs/quickstart/gpu-test3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ spec:
resourceClaims:
- name: shared-gpu
resourceClaimName: single-gpu
tolerations:
- key: "nvidia.com/gpu"
operator: "Exists"
effect: "NoSchedule"

---
apiVersion: v1
Expand All @@ -60,3 +64,7 @@ spec:
resourceClaims:
- name: shared-gpu
resourceClaimName: single-gpu
tolerations:
- key: "nvidia.com/gpu"
operator: "Exists"
effect: "NoSchedule"
4 changes: 4 additions & 0 deletions demo/specs/quickstart/gpu-test4.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -97,3 +97,7 @@ spec:
claims:
- name: mig-devices
request: mig-3g-20gb
tolerations:
- key: "nvidia.com/gpu"
operator: "Exists"
effect: "NoSchedule"
4 changes: 4 additions & 0 deletions demo/specs/quickstart/gpu-test5.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,7 @@ spec:
resourceClaims:
- name: shared-gpus
resourceClaimTemplateName: multiple-gpus
tolerations:
- key: "nvidia.com/gpu"
operator: "Exists"
effect: "NoSchedule"
4 changes: 4 additions & 0 deletions demo/specs/quickstart/gpu-test6.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,7 @@ spec:
resourceClaims:
- name: a100
resourceClaimTemplateName: a100
tolerations:
- key: "nvidia.com/gpu"
operator: "Exists"
effect: "NoSchedule"

0 comments on commit 53420fa

Please sign in to comment.