Merge branch 'rearrange-demos' into 'main'

Rearrange files in the demo folder for better organization. See merge request nvidia/cloud-native/k8s-dra-driver!27
NVIDIA · Nov 3, 2023 · 749fcd1 · 749fcd1
2 parents fe94758 + 98467d8
commit 749fcd1
Show file tree

Hide file tree

Showing 30 changed files with 259 additions and 91 deletions.
diff --git a/README.md b/README.md
@@ -32,33 +32,34 @@ First since we'll launch kind with GPU support, ensure that the following prereq
    Container Runtime to use volume mounts to select devices to inject into a
    container.
 
-We start by first cloning this repository and `cd`ing into its `demo`
-subdirectory. All of the scripts and example Pod specs used in this demo are
-contained here, so take a moment to browse through the various files and see
+We start by first cloning this repository and `cd`ing into it.
+All of the scripts and example Pod specs used in this demo are in the `demo`
+subdirectory, so take a moment to browse through the various files and see
 what's available:
 
 ```
 git clone https://github.com/NVIDIA/k8s-dra-driver.git
-cd k8s-dra-driver/demo
+```
+```
+cd k8s-dra-driver
 ```
 
 ### Setting up the infrastructure
 First, create a `kind` cluster to run the demo:
 ```bash
-./create-cluster.sh
+./demo/clusters/kind/create-cluster.sh
 ```
 
-
 From here we will build the image for the example resource driver:
 ```bash
-./build-dra-driver.sh
+./demo/clusters/kind/build-dra-driver.sh
 ```
 
 This also makes the built images available to the `kind` cluster.
 
 We now install the NVIDIA GPU DRA driver:
 ```
-./install-dra-driver.sh
+./demo/clusters/kind/install-dra-driver.sh
 ```
 
 This should show two pods running in the `nvidia-dra-driver` namespace:
@@ -70,16 +71,16 @@ nvidia-dra-driver   nvidia-dra-plugin-lt7qh                    1/1     Running
 ```
 
 ### Run the examples by following the steps in the demo script
-Finally, you can run the various examples contained in the `demo` folder.
-The files `demo/DEMO.sh` shows the full script of the demo you can walk through.
+Finally, you can run the various examples contained in the `demo/specs/quickstart` folder.
+The `README` in that directory shows the full script of the demo you can walk through.
 ```console
-cat demo/DEMO.sh
+cat demo/specs/quickstart/README.md
 ...
 ```
 
 Running the first three examples should produce output similar to the following:
 ```console
-$ kubectl apply --filename=gpu-test{1,2,3}.yaml
+$ kubectl apply --filename=demo/specs/quickstart/gpu-test{1,2,3}.yaml
 ...
 
 ```
@@ -112,7 +113,7 @@ GPU 0: A100-SXM4-40GB (UUID: GPU-4404041a-04cf-1ccf-9e70-f139a9b1e23c)
 
 Running
 ```
-$ ./delete-cluster.sh
+$ ./demo/clusters/kind/delete-cluster.sh
 ```
 will remove the cluster created in the preceding steps.
 
@@ -155,4 +156,4 @@ half-half:
       mig-devices: {}
 EOF
 ```
--->
+-->
diff --git a/demo/DEMO.sh b/demo/DEMO.sh
diff --git a/demo/build-dra-driver.sh → demo/clusters/kind/build-dra-driver.sh b/demo/build-dra-driver.sh → demo/clusters/kind/build-dra-driver.sh
diff --git a/demo/create-cluster.sh → demo/clusters/kind/create-cluster.sh b/demo/create-cluster.sh → demo/clusters/kind/create-cluster.sh
diff --git a/demo/delete-cluster.sh → demo/clusters/kind/delete-cluster.sh b/demo/delete-cluster.sh → demo/clusters/kind/delete-cluster.sh
diff --git a/demo/install-dra-driver.sh → demo/clusters/kind/install-dra-driver.sh b/demo/install-dra-driver.sh → demo/clusters/kind/install-dra-driver.sh
@@ -25,7 +25,7 @@ source "${CURRENT_DIR}/scripts/common.sh"
 kubectl label node "${KIND_CLUSTER_NAME}-worker" --overwrite nvidia.com/dra.kubelet-plugin=true
 kubectl label node "${KIND_CLUSTER_NAME}-control-plane" --overwrite nvidia.com/dra.controller=true
 
-helm upgrade -i --create-namespace --namespace nvidia-dra-driver nvidia ../deployments/helm/k8s-dra-driver \
+helm upgrade -i --create-namespace --namespace nvidia-dra-driver nvidia ${PROJECT_DIR}/deployments/helm/k8s-dra-driver \
     ${NVIDIA_DRIVER_ROOT:+--set nvidiaDriverRoot=${NVIDIA_DRIVER_ROOT}} \
     --wait
 

diff --git a/demo/scripts/build-driver-image.sh → ...usters/kind/scripts/build-driver-image.sh b/demo/scripts/build-driver-image.sh → ...usters/kind/scripts/build-driver-image.sh
@@ -31,7 +31,7 @@ cleanup() {
 trap cleanup EXIT
 
 # Go back to the root directory of this repo
-cd ${CURRENT_DIR}/../..
+cd ${PROJECT_DIR}
 
 # Regenerate the CRDs and build the container image
 # TODO: This should be part of the image name

diff --git a/demo/scripts/build-kind-image.sh → ...clusters/kind/scripts/build-kind-image.sh b/demo/scripts/build-kind-image.sh → ...clusters/kind/scripts/build-kind-image.sh
diff --git a/demo/scripts/common.sh → demo/clusters/kind/scripts/common.sh b/demo/scripts/common.sh → demo/clusters/kind/scripts/common.sh
@@ -17,7 +17,7 @@
 
 # A reference to the current directory where this script is located
 SCRIPTS_DIR="$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)"
-PROJECT_DIR="$(cd -- "$( dirname -- "${SCRIPTS_DIR}/../../.." )" &> /dev/null && pwd)"
+PROJECT_DIR="$(cd -- "$( dirname -- "${SCRIPTS_DIR}/../../../../.." )" &> /dev/null && pwd)"
 
 # We extract information from versions.mk
 function from_versions_mk() {

diff --git a/demo/scripts/create-kind-cluster.sh → ...sters/kind/scripts/create-kind-cluster.sh b/demo/scripts/create-kind-cluster.sh → ...sters/kind/scripts/create-kind-cluster.sh
diff --git a/demo/scripts/delete-kind-cluster.sh → ...sters/kind/scripts/delete-kind-cluster.sh b/demo/scripts/delete-kind-cluster.sh → ...sters/kind/scripts/delete-kind-cluster.sh
diff --git a/demo/scripts/kind-cluster-config.yaml → ...ers/kind/scripts/kind-cluster-config.yaml b/demo/scripts/kind-cluster-config.yaml → ...ers/kind/scripts/kind-cluster-config.yaml
diff --git a/demo/scripts/load-driver-image-into-kind.sh → ...nd/scripts/load-driver-image-into-kind.sh b/demo/scripts/load-driver-image-into-kind.sh → ...nd/scripts/load-driver-image-into-kind.sh
diff --git a/demo/specs/mig+mps/README.md b/demo/specs/mig+mps/README.md
@@ -0,0 +1,100 @@
+#### Show the job and its claims
+```console
+vim -O \
+	sharing-demo-job.yaml \
+	sharing-demo-claims.yaml \
+	sharing-demo-parameters.yaml
+```
+
+#### Show current state of the cluster
+```console
+kubectl get pod -A
+```
+
+#### Show the current MIG configuration of the machine
+```console
+nvidia-smi -L
+```
+
+#### Create the demo namespace and deploy the job and its claims
+```console
+kubectl create namespace sharing-demo
+kubectl apply \
+	-f sharing-demo-parameters.yaml \
+	-f sharing-demo-claims.yaml \
+	-f sharing-demo-job.yaml
+```
+
+#### Show processes starting to come up
+```console
+kubectl get pod -A
+```
+
+#### Show MIG devices and processes from all job pods executing
+```console
+nvidia-smi
+```
+
+#### Set some environment variables to help us narrow down our view of the running processes
+```console
+eval "$(./sharing-demo-envs.sh)"
+```
+
+#### View the processes for the time-sliced GPU
+```console
+docker run \
+	--rm \
+	--pid=host \
+	-e NVIDIA_VISIBLE_DEVICES=${GPU_TS_SHARING_DEVICE} \
+	ubuntu:22.04 nvidia-smi
+```
+
+#### View the processes for the MPS-shared GPU
+```console
+docker run \
+	--rm \
+	--pid=host \
+	-e NVIDIA_VISIBLE_DEVICES=${GPU_MPS_SHARING_DEVICE} \
+	ubuntu:22.04 nvidia-smi
+```
+
+#### View the processes for the time-sliced MIG Device
+```console
+docker run \
+	--rm \
+	--pid=host \
+	-e NVIDIA_VISIBLE_DEVICES=${MIG_TS_SHARING_DEVICE} \
+	ubuntu:22.04 nvidia-smi
+```
+
+#### View the processes for the MPS-shared MIG Device
+```console
+docker run \
+	--rm \
+	--pid=host \
+	-e NVIDIA_VISIBLE_DEVICES=${MIG_MPS_SHARING_DEVICE} \
+	ubuntu:22.04 nvidia-smi
+```
+
+#### Show the running job again
+```console
+kubectl get pod -A
+```
+
+#### Delete the job and its claims
+```console
+kubectl delete \
+	-f sharing-demo-parameters.yaml \
+	-f sharing-demo-claims.yaml \
+	-f sharing-demo-job.yaml
+```
+
+#### Show processes starting to come down
+```console
+kubectl get pod -A
+```
+
+#### Show MIG devices have been deleted
+```console
+nvidia-smi -L
+```
diff --git a/demo/specs/mig+mps/sharing-demo-claims-1.yaml b/demo/specs/mig+mps/sharing-demo-claims-1.yaml
@@ -0,0 +1,12 @@
+---
+apiVersion: resource.k8s.io/v1alpha2
+kind: ResourceClaim
+metadata:
+  namespace: sharing-demo
+  name: mig-mps-sharing
+spec:
+  resourceClassName: gpu.nvidia.com
+  parametersRef:
+    apiGroup: gpu.resource.nvidia.com
+    kind: MigDeviceClaimParameters
+    name: mig-mps-sharing
diff --git a/demo/sharing-demo-claims.yaml → demo/specs/mig+mps/sharing-demo-claims.yaml b/demo/sharing-demo-claims.yaml → demo/specs/mig+mps/sharing-demo-claims.yaml
diff --git a/demo/specs/mig+mps/sharing-demo-envs.sh b/demo/specs/mig+mps/sharing-demo-envs.sh
@@ -0,0 +1,15 @@
+GPU_TS_SHARING=$(nvidia-smi | awk '$3 == "N/A" && $4 == "N/A" && $6 == "C" && $7 ~ /sample/ {print $2}' | sort -u)
+GPU_MPS_SHARING=$(nvidia-smi | awk '$3 == "N/A" && $4 == "N/A" && $6 == "M+C" && $7 ~ /sample/ {print $2}' | sort -u)
+GI_TS_SHARING=$(nvidia-smi | awk '$3 != "N/A" && $4 != "N/A" && $6 == "C" && $7 ~ /sample/ {print $3}' | sort -u)
+GI_MPS_SHARING=$(nvidia-smi | awk '$3 != "N/A" && $4 != "N/A" && $6 == "M+C" && $7 ~ /sample/ {print $3}' | sort -u)
+MIG_TS_SHARING=$(nvidia-smi | awk -v gi="${GI_TS_SHARING}" '/MIG devices:/{p=1} /Processes:/{p=0} p && $3==gi {print $2 ":" $5}')
+MIG_MPS_SHARING=$(nvidia-smi | awk -v gi="${GI_MPS_SHARING}" '/MIG devices:/{p=1} /Processes:/{p=0} p && $3==gi {print $2 ":" $5}')
+
+echo export GPU_TS_SHARING_DEVICE=${GPU_TS_SHARING}
+>&2 echo export GPU_TS_SHARING_DEVICE=${GPU_TS_SHARING}
+echo export GPU_MPS_SHARING_DEVICE=${GPU_MPS_SHARING}
+>&2 echo export GPU_MPS_SHARING_DEVICE=${GPU_MPS_SHARING}
+echo export MIG_TS_SHARING_DEVICE=${MIG_TS_SHARING}
+>&2 echo export MIG_TS_SHARING_DEVICE=${MIG_TS_SHARING}
+echo export MIG_MPS_SHARING_DEVICE=${MIG_MPS_SHARING}
+>&2 echo export MIG_MPS_SHARING_DEVICE=${MIG_MPS_SHARING}
diff --git a/demo/specs/mig+mps/sharing-demo-job-1.yaml b/demo/specs/mig+mps/sharing-demo-job-1.yaml
@@ -0,0 +1,22 @@
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  namespace: sharing-demo
+  name: sharing-demo-job
+spec:
+  parallelism: 4
+  template:
+    spec:
+      restartPolicy: OnFailure
+      resourceClaims:
+      - name: mig-mps-sharing
+        source:
+          resourceClaimName: mig-mps-sharing
+      containers:
+      - name: mig-mps-sharing-ctr
+        image: nvcr.io/nvidia/k8s/cuda-sample:nbody-cuda11.7.1-ubuntu18.04
+        args: ["--benchmark", "--numbodies=4226000"]
+        resources:
+          claims:
+          - name: mig-mps-sharing
diff --git a/demo/sharing-demo-job.yaml → demo/specs/mig+mps/sharing-demo-job.yaml b/demo/sharing-demo-job.yaml → demo/specs/mig+mps/sharing-demo-job.yaml
@@ -26,29 +26,25 @@ spec:
       containers:
       - name: gpu-ts-sharing-ctr
         image: nvcr.io/nvidia/k8s/cuda-sample:nbody-cuda11.7.1-ubuntu18.04
-        command: ["sleep", "99999"]
-        #args: ["--benchmark", "--numbodies=4226000"]
+        args: ["--benchmark", "--numbodies=4226000"]
         resources:
           claims:
           - name: gpu-ts-sharing
       - name: gpu-mps-sharing-ctr
         image: nvcr.io/nvidia/k8s/cuda-sample:nbody-cuda11.7.1-ubuntu18.04
-        command: ["sleep", "99999"]
-        #args: ["--benchmark", "--numbodies=4226000"]
+        args: ["--benchmark", "--numbodies=4226000"]
         resources:
           claims:
           - name: gpu-mps-sharing
       - name: mig-ts-sharing-ctr
         image: nvcr.io/nvidia/k8s/cuda-sample:nbody-cuda11.7.1-ubuntu18.04
-        command: ["sleep", "99999"]
-        #args: ["--benchmark", "--numbodies=4226000"]
+        args: ["--benchmark", "--numbodies=4226000"]
         resources:
           claims:
           - name: mig-ts-sharing
       - name: mig-mps-sharing-ctr
         image: nvcr.io/nvidia/k8s/cuda-sample:nbody-cuda11.7.1-ubuntu18.04
-        command: ["sleep", "99999"]
-        #args: ["--benchmark", "--numbodies=4226000"]
+        args: ["--benchmark", "--numbodies=4226000"]
         resources:
           claims:
           - name: mig-mps-sharing
diff --git a/demo/sharing-demo-mig-parted-config.yaml → ...g+mps/sharing-demo-mig-parted-config.yaml b/demo/sharing-demo-mig-parted-config.yaml → ...g+mps/sharing-demo-mig-parted-config.yaml
diff --git a/demo/specs/mig+mps/sharing-demo-parameters-1.yaml b/demo/specs/mig+mps/sharing-demo-parameters-1.yaml
@@ -0,0 +1,10 @@
+---
+apiVersion: gpu.resource.nvidia.com/v1alpha1
+kind: MigDeviceClaimParameters
+metadata:
+  namespace: sharing-demo
+  name: mig-mps-sharing
+spec:
+  profile: "1g.5gb"
+  sharing:
+    strategy: MPS
diff --git a/demo/sharing-demo-parameters.yaml → ...pecs/mig+mps/sharing-demo-parameters.yaml b/demo/sharing-demo-parameters.yaml → ...pecs/mig+mps/sharing-demo-parameters.yaml