Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add explicit envvar to control if we mask /proc/driver/nvidia/params
Browse files Browse the repository at this point in the history
Signed-off-by: Kevin Klues <kklues@nvidia.com>
klueska committed Oct 29, 2024
1 parent 737b4c5 commit 0ebb992
Showing 4 changed files with 47 additions and 9 deletions.
1 change: 1 addition & 0 deletions demo/clusters/kind/install-dra-driver.sh
Original file line number Diff line number Diff line change
@@ -29,6 +29,7 @@ helm upgrade -i --create-namespace --namespace nvidia nvidia-dra-driver ${PROJEC
--set deviceClasses="{${deviceClasses}}" \
${NVIDIA_CTK_PATH:+--set nvidiaCtkPath=${NVIDIA_CTK_PATH}} \
${NVIDIA_DRIVER_ROOT:+--set nvidiaDriverRoot=${NVIDIA_DRIVER_ROOT}} \
${MASK_NVIDIA_DRIVER_PARAMS:+--set maskNvidiaDriverParams=${MASK_NVIDIA_DRIVER_PARAMS}} \
--wait

set +x
1 change: 0 additions & 1 deletion demo/clusters/nvkind/install-dra-driver.sh

This file was deleted.

23 changes: 23 additions & 0 deletions demo/clusters/nvkind/install-dra-driver.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/usr/bin/env bash

# Copyright 2023 The Kubernetes Authors.
# Copyright 2023 NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Resolve the absolute directory containing this script so the kind installer
# can be located regardless of the caller's working directory.
CURRENT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)"
if [[ -z "${CURRENT_DIR}" ]]; then
  # Without this guard a failed lookup would exec "/../kind/install-dra-driver.sh".
  echo "error: unable to determine the directory of ${BASH_SOURCE[0]}" >&2
  exit 1
fi

# Default MASK_NVIDIA_DRIVER_PARAMS to "true" for this wrapper, while still
# honoring any non-empty value the caller has already set. The variable is
# exported so the exec'd installer can read it from its environment.
: "${MASK_NVIDIA_DRIVER_PARAMS:=true}"
export MASK_NVIDIA_DRIVER_PARAMS

# Replace this process with the kind installer. The path is quoted so that a
# checkout directory containing spaces or glob characters is not word-split,
# and any arguments given to this wrapper are forwarded unchanged.
exec "${CURRENT_DIR}/../kind/install-dra-driver.sh" "$@"
20 changes: 12 additions & 8 deletions deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml
Original file line number Diff line number Diff line change
@@ -57,14 +57,16 @@ spec:
args:
- |-
trap 'exit 0' TERM
# TODO: Masking of the params file is done below to allow nvkind to
# selectively exclude certain GPUs from being visible to the driver.
# At present, this is only feasible with a host-mounted driver where
# /dev in this container already has GPU devices present (as brought
# in via the --privileged flag from docker/podman when using nvkind).
# In the future we should revisit this to find a more robust method
# of supporting this.
if [ "${NVIDIA_DRIVER_ROOT}" = "/" ]; then
# Masking of the params file is typically done to allow nvkind to
# selectively exclude certain GPUs from being visible to the
# underlying GPU driver. Unfortunately, kind doesn't let you choose
# which device nodes to inject into each worker node (they all come in
# via the --privileged flag passed to docker/podman). Because of
# this, all workers see all GPUs by default. By masking the params
# file we can prevent a container from recreating any missing GPU
# device nodes and limit its view to only those device nodes that
# nvkind decided to allow in.
if [ "${MASK_NVIDIA_DRIVER_PARAMS}" = "true" ]; then
cp /proc/driver/nvidia/params root/gpu-params
sed -i 's/^ModifyDeviceFiles: 1$/ModifyDeviceFiles: 0/' root/gpu-params
mount --bind root/gpu-params /proc/driver/nvidia/params
@@ -74,6 +76,8 @@ spec:
resources:
{{- toYaml .Values.kubeletPlugin.containers.plugin.resources | nindent 10 }}
env:
- name: MASK_NVIDIA_DRIVER_PARAMS
value: "{{ .Values.maskNvidiaDriverParams }}"
- name: NVIDIA_CTK_PATH
value: "{{ .Values.nvidiaCtkPath }}"
- name: NVIDIA_DRIVER_ROOT
11 changes: 11 additions & 0 deletions deployments/helm/k8s-dra-driver/values.yaml
Original file line number Diff line number Diff line change
@@ -36,6 +36,17 @@ allowDefaultNamespace: false

deviceClasses: ["gpu", "mig", "imex"]

# Masking of the params file is typically done to allow nvkind to
# selectively exclude certain GPUs from being visible to the
# underlying GPU driver. Unfortunately, kind doesn't let you choose
# which device nodes to inject into each worker node (they all come in
# via the --privileged flag passed to docker/podman). Because of
# this, all workers see all GPUs by default. By masking the params
# file we can prevent a container from recreating any missing GPU
# device nodes and limit its view to only those device nodes that
# nvkind decided to allow in.
maskNvidiaDriverParams: false

imagePullSecrets: []
image:
repository: nvcr.io/nvidia/cloud-native/k8s-dra-driver

0 comments on commit 0ebb992

Please sign in to comment.