diff --git a/demo/clusters/kind/install-dra-driver.sh b/demo/clusters/kind/install-dra-driver.sh
index e442293e..ece8cdf1 100755
--- a/demo/clusters/kind/install-dra-driver.sh
+++ b/demo/clusters/kind/install-dra-driver.sh
@@ -29,6 +29,7 @@ helm upgrade -i --create-namespace --namespace nvidia nvidia-dra-driver ${PROJEC
   --set deviceClasses="{${deviceClasses}}" \
   ${NVIDIA_CTK_PATH:+--set nvidiaCtkPath=${NVIDIA_CTK_PATH}} \
   ${NVIDIA_DRIVER_ROOT:+--set nvidiaDriverRoot=${NVIDIA_DRIVER_ROOT}} \
+  ${MASK_NVIDIA_DRIVER_PARAMS:+--set maskNvidiaDriverParams=${MASK_NVIDIA_DRIVER_PARAMS}} \
   --wait
 set +x

diff --git a/demo/clusters/nvkind/install-dra-driver.sh b/demo/clusters/nvkind/install-dra-driver.sh
deleted file mode 120000
index 4ae0529d..00000000
--- a/demo/clusters/nvkind/install-dra-driver.sh
+++ /dev/null
@@ -1 +0,0 @@
-../kind/install-dra-driver.sh
\ No newline at end of file
diff --git a/demo/clusters/nvkind/install-dra-driver.sh b/demo/clusters/nvkind/install-dra-driver.sh
new file mode 100755
index 00000000..b409fd29
--- /dev/null
+++ b/demo/clusters/nvkind/install-dra-driver.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+# Copyright 2024 NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# A reference to the current directory where this script is located
+CURRENT_DIR="$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)"
+
+: ${MASK_NVIDIA_DRIVER_PARAMS:="true"}
+export MASK_NVIDIA_DRIVER_PARAMS
+exec ${CURRENT_DIR}/../kind/install-dra-driver.sh
diff --git a/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml b/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml
index cf9ddecb..0b9b09b0 100644
--- a/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml
+++ b/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml
@@ -56,14 +56,11 @@ spec:
         command: ["bash", "-c"]
         args:
         - |-
-          # TODO: Masking of the params file is done below to allow nvkind to
-          # selectively exclude certain GPUs from being visible to the driver.
-          # At present, this is only feasible with a host-mounted driver where
-          # /dev in this container already has GPU devices present (as brought
-          # in via the --privileged flag from docker/podman when using nvkind).
-          # In the future we should revisit this to find a more robust method
-          # of supporting this.
-          if [ "${NVIDIA_DRIVER_ROOT}" = "/" ]; then
+          # Conditionally mask the params file to prevent this container from
+          # recreating any missing GPU device nodes. This is necessary, for
+          # example, when running under nvkind to limit the set of GPUs governed
+          # by the plugin even though it has cgroup access to all of them.
+          if [ "${MASK_NVIDIA_DRIVER_PARAMS}" = "true" ]; then
             cp /proc/driver/nvidia/params root/gpu-params
             sed -i 's/^ModifyDeviceFiles: 1$/ModifyDeviceFiles: 0/' root/gpu-params
             mount --bind root/gpu-params /proc/driver/nvidia/params
@@ -72,6 +69,8 @@ spec:
         resources:
           {{- toYaml .Values.kubeletPlugin.containers.plugin.resources | nindent 10 }}
         env:
+        - name: MASK_NVIDIA_DRIVER_PARAMS
+          value: "{{ .Values.maskNvidiaDriverParams }}"
         - name: NVIDIA_CTK_PATH
           value: "{{ .Values.nvidiaCtkPath }}"
         - name: NVIDIA_DRIVER_ROOT
diff --git a/deployments/helm/k8s-dra-driver/values.yaml b/deployments/helm/k8s-dra-driver/values.yaml
index 09ce8d3b..76ff38ca 100644
--- a/deployments/helm/k8s-dra-driver/values.yaml
+++ b/deployments/helm/k8s-dra-driver/values.yaml
@@ -36,6 +36,17 @@ allowDefaultNamespace: false

 deviceClasses: ["gpu", "mig", "imex"]

+# Masking of the params file is typically done to allow nvkind to
+# selectively exclude certain GPUs from being visible to the
+# underlying GPU driver. Unfortunately, kind doesn't let you choose
+# which device nodes to inject into each worker node (they all come in
+# via the --privileged flag passed to docker/podman). Because of
+# this, all workers see all GPUs by default. By masking the params
+# file we can prevent a container from recreating any missing GPU
+# device nodes and limit its view to only those device nodes that
+# nvkind decided to allow in.
+maskNvidiaDriverParams: false
+
 imagePullSecrets: []
 image:
   repository: nvcr.io/nvidia/cloud-native/k8s-dra-driver