From 3c7c5f50f84f18838afbea551f11cdf3a3b68b7e Mon Sep 17 00:00:00 2001 From: Tuomas Katila Date: Tue, 18 Nov 2025 11:57:52 +0200 Subject: [PATCH 1/6] gpu: remove unused variables Signed-off-by: Tuomas Katila --- cmd/gpu_plugin/gpu_plugin.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/cmd/gpu_plugin/gpu_plugin.go b/cmd/gpu_plugin/gpu_plugin.go index 619fde38b..42caec066 100644 --- a/cmd/gpu_plugin/gpu_plugin.go +++ b/cmd/gpu_plugin/gpu_plugin.go @@ -43,8 +43,6 @@ const ( devfsDriDirectory = "/dev/dri" wslDxgPath = "/dev/dxg" wslLibPath = "/usr/lib/wsl" - nfdFeatureDir = "/etc/kubernetes/node-feature-discovery/features.d" - resourceFilename = "intel-gpu-resources.txt" gpuDeviceRE = `^card[0-9]+$` controlDeviceRE = `^controlD[0-9]+$` pciAddressRE = "^[0-9a-f]{4}:[0-9a-f]{2}:[0-9a-f]{2}\\.[0-9a-f]{1}$" From cdb4bf384e061802f932260fee0d3e7d6e56fa8b Mon Sep 17 00:00:00 2001 From: Tuomas Katila Date: Tue, 18 Nov 2025 11:59:32 +0200 Subject: [PATCH 2/6] gpu: add support for additional temperature limits Use the existing "temp-limit" as the global limit, and introduce GPU and memory thresholds. 
Signed-off-by: Tuomas Katila --- cmd/gpu_plugin/gpu_plugin.go | 16 ++++++---- cmd/gpu_plugin/gpu_plugin_test.go | 29 +++++++++++++++---- .../levelzeroservice/levelzero_service.go | 12 ++++---- 3 files changed, 40 insertions(+), 17 deletions(-) diff --git a/cmd/gpu_plugin/gpu_plugin.go b/cmd/gpu_plugin/gpu_plugin.go index 42caec066..0c572975e 100644 --- a/cmd/gpu_plugin/gpu_plugin.go +++ b/cmd/gpu_plugin/gpu_plugin.go @@ -70,7 +70,9 @@ type cliOptions struct { allowIDs string denyIDs string sharedDevNum int - temperatureLimit int + globalTempLimit int + memoryTempLimit int + gpuTempLimit int enableMonitoring bool wslScan bool healthManagement bool @@ -402,13 +404,13 @@ func (dp *devicePlugin) healthStatusForCard(cardPath string) string { return health } - limit := float64(dp.options.temperatureLimit) - // Temperatures for different areas - klog.V(4).Infof("Temperatures: Memory=%.1fC, GPU=%.1fC, Global=%.1fC", + klog.V(4).Infof("Temperatures: Memory=%dC, GPU=%dC, Global=%dC", deviceTemps.Memory, deviceTemps.GPU, deviceTemps.Global) - if deviceTemps.GPU > limit || deviceTemps.Global > limit || deviceTemps.Memory > limit { + if deviceTemps.GPU > dp.options.gpuTempLimit || + deviceTemps.Global > dp.options.globalTempLimit || + deviceTemps.Memory > dp.options.memoryTempLimit { health = pluginapi.Unhealthy } @@ -784,7 +786,9 @@ func main() { flag.BoolVar(&opts.healthManagement, "health-management", false, "enable GPU health management") flag.BoolVar(&opts.wslScan, "wsl", false, "scan for / use WSL devices") flag.IntVar(&opts.sharedDevNum, "shared-dev-num", 1, "number of containers sharing the same GPU device") - flag.IntVar(&opts.temperatureLimit, "temp-limit", 100, "temperature limit at which device is marked unhealthy") + flag.IntVar(&opts.globalTempLimit, "temp-limit", 100, "Global temperature limit at which device is marked unhealthy") + flag.IntVar(&opts.gpuTempLimit, "gpu-temp-limit", 100, "GPU temperature limit at which device is marked unhealthy") + 
flag.IntVar(&opts.memoryTempLimit, "memory-temp-limit", 100, "Memory temperature limit at which device is marked unhealthy") flag.StringVar(&opts.preferredAllocationPolicy, "allocation-policy", "none", "modes of allocating GPU devices: balanced, packed and none") flag.StringVar(&opts.allowIDs, "allow-ids", "", "comma-separated list of device IDs to allow (e.g. 0x49c5,0x49c6)") flag.StringVar(&opts.denyIDs, "deny-ids", "", "comma-separated list of device IDs to deny (e.g. 0x49c5,0x49c6)") diff --git a/cmd/gpu_plugin/gpu_plugin_test.go b/cmd/gpu_plugin/gpu_plugin_test.go index beb9f7262..86fd48e65 100644 --- a/cmd/gpu_plugin/gpu_plugin_test.go +++ b/cmd/gpu_plugin/gpu_plugin_test.go @@ -58,10 +58,11 @@ func (n *mockNotifier) Notify(newDeviceTree dpapi.DeviceTree) { } type mockL0Service struct { - indices []uint32 - memSize uint64 - healthy bool - fail bool + indices []uint32 + memSize uint64 + healthy bool + failTemp bool + fail bool } func (m *mockL0Service) Run(keep bool) { @@ -83,7 +84,7 @@ func (m *mockL0Service) GetDeviceHealth(bdfAddress string) (levelzeroservice.Dev return levelzeroservice.DeviceHealth{Memory: m.healthy, Bus: m.healthy, SoC: m.healthy}, nil } func (m *mockL0Service) GetDeviceTemperature(bdfAddress string) (levelzeroservice.DeviceTemperature, error) { - if m.fail { + if m.fail || m.failTemp { return levelzeroservice.DeviceTemperature{}, errors.Errorf("error, error") } @@ -608,6 +609,24 @@ func TestScanWithHealth(t *testing.T) { healthy: true, }, }, + { + name: "one device with failure on temp reading", + pciAddresses: map[string]string{"0000:00:00.0": "card0"}, + sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64"}, + sysfsfiles: map[string][]byte{ + "card0/device/vendor": []byte("0x8086"), + }, + devfsdirs: []string{ + "card0", + "by-path/pci-0000:00:00.0-card", + "by-path/pci-0000:00:00.0-render", + }, + expectedI915Devs: 1, + l0mock: &mockL0Service{ + healthy: true, + failTemp: true, + }, + }, { name: "one unhealthy 
device with proper symlink", pciAddresses: map[string]string{"0000:00:00.0": "card0"}, diff --git a/cmd/gpu_plugin/levelzeroservice/levelzero_service.go b/cmd/gpu_plugin/levelzeroservice/levelzero_service.go index 79ca4a3f9..774face02 100644 --- a/cmd/gpu_plugin/levelzeroservice/levelzero_service.go +++ b/cmd/gpu_plugin/levelzeroservice/levelzero_service.go @@ -39,9 +39,9 @@ type DeviceHealth struct { } type DeviceTemperature struct { - Global float64 - GPU float64 - Memory float64 + Global int + GPU int + Memory int } type clientNotReadyErr struct{} @@ -175,9 +175,9 @@ func (l *levelzero) GetDeviceTemperature(bdfAddress string) (DeviceTemperature, } return DeviceTemperature{ - Global: temps.Global, - GPU: temps.Gpu, - Memory: temps.Memory, + Global: int(temps.Global), + GPU: int(temps.Gpu), + Memory: int(temps.Memory), }, nil } From c0e8e4feada37718d37c4d85657c64ef09eab0c4 Mon Sep 17 00:00:00 2001 From: Tuomas Katila Date: Wed, 19 Nov 2025 12:14:09 +0200 Subject: [PATCH 3/6] gpu: levelzero: update compute-runtime components Fix uninitialized variable that caused random behaviour. 
Signed-off-by: Tuomas Katila --- build/docker/intel-gpu-levelzero.Dockerfile | 14 +++++++------- .../templates/intel-gpu-levelzero.Dockerfile.in | 14 +++++++------- cmd/gpu_levelzero/zes.c | 15 +++++++++++---- 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/build/docker/intel-gpu-levelzero.Dockerfile b/build/docker/intel-gpu-levelzero.Dockerfile index cf04f45f9..3951ef0a3 100644 --- a/build/docker/intel-gpu-levelzero.Dockerfile +++ b/build/docker/intel-gpu-levelzero.Dockerfile @@ -44,13 +44,13 @@ RUN if [ $ROCKYLINUX -eq 0 ]; then \ LATEST_GO=$(curl --no-progress-meter https://go.dev/dl/?mode=json | jq ".[] | select(.version | startswith(\"go${CGO_VERSION}\")).version" | tr -d "\"") && \ wget -q https://go.dev/dl/$LATEST_GO.linux-amd64.tar.gz -O - | tar -xz -C /usr/local && \ cd /runtime && \ - wget -q https://github.com/intel/compute-runtime/releases/download/25.09.32961.7/intel-level-zero-gpu_1.6.32961.7_amd64.deb && \ - wget -q https://github.com/intel/compute-runtime/releases/download/25.09.32961.7/intel-opencl-icd_25.09.32961.7_amd64.deb && \ - wget -q https://github.com/intel/compute-runtime/releases/download/25.09.32961.7/libigdgmm12_22.6.0_amd64.deb && \ - wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.20.2/level-zero-devel_1.20.2+u22.04_amd64.deb && \ - wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.20.2/level-zero_1.20.2+u22.04_amd64.deb && \ - wget -q https://github.com/intel/intel-graphics-compiler/releases/download/v2.8.3/intel-igc-core-2_2.8.3+18762_amd64.deb && \ - wget -q https://github.com/intel/intel-graphics-compiler/releases/download/v2.8.3/intel-igc-opencl-2_2.8.3+18762_amd64.deb && \ + wget -q https://github.com/intel/intel-graphics-compiler/releases/download/v2.20.3/intel-igc-core-2_2.20.3+19972_amd64.deb && \ + wget -q https://github.com/intel/intel-graphics-compiler/releases/download/v2.20.3/intel-igc-opencl-2_2.20.3+19972_amd64.deb && \ + wget -q 
https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/intel-opencl-icd_25.40.35563.4-0_amd64.deb && \ + wget -q https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/libigdgmm12_22.8.2_amd64.deb && \ + wget -q https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/libze-intel-gpu1_25.40.35563.4-0_amd64.deb && \ + wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero_1.24.3+u22.04_amd64.deb && \ + wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero-devel_1.24.3+u22.04_amd64.deb && \ dpkg -i *.deb && \ rm -rf /var/lib/apt/lists/\*; \ else \ diff --git a/build/docker/templates/intel-gpu-levelzero.Dockerfile.in b/build/docker/templates/intel-gpu-levelzero.Dockerfile.in index d109c619c..fd77fa8fa 100644 --- a/build/docker/templates/intel-gpu-levelzero.Dockerfile.in +++ b/build/docker/templates/intel-gpu-levelzero.Dockerfile.in @@ -37,13 +37,13 @@ RUN if [ $ROCKYLINUX -eq 0 ]; then \N LATEST_GO=$(curl --no-progress-meter https://go.dev/dl/?mode=json | jq ".[] | select(.version | startswith(\"go${CGO_VERSION}\")).version" | tr -d "\"") && \N wget -q https://go.dev/dl/$LATEST_GO.linux-amd64.tar.gz -O - | tar -xz -C /usr/local && \N cd /runtime && \N - wget -q https://github.com/intel/compute-runtime/releases/download/25.09.32961.7/intel-level-zero-gpu_1.6.32961.7_amd64.deb && \N - wget -q https://github.com/intel/compute-runtime/releases/download/25.09.32961.7/intel-opencl-icd_25.09.32961.7_amd64.deb && \N - wget -q https://github.com/intel/compute-runtime/releases/download/25.09.32961.7/libigdgmm12_22.6.0_amd64.deb && \N - wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.20.2/level-zero-devel_1.20.2+u22.04_amd64.deb && \N - wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.20.2/level-zero_1.20.2+u22.04_amd64.deb && \N - wget -q 
https://github.com/intel/intel-graphics-compiler/releases/download/v2.8.3/intel-igc-core-2_2.8.3+18762_amd64.deb && \N - wget -q https://github.com/intel/intel-graphics-compiler/releases/download/v2.8.3/intel-igc-opencl-2_2.8.3+18762_amd64.deb && \N + wget -q https://github.com/intel/intel-graphics-compiler/releases/download/v2.20.3/intel-igc-core-2_2.20.3+19972_amd64.deb && \N + wget -q https://github.com/intel/intel-graphics-compiler/releases/download/v2.20.3/intel-igc-opencl-2_2.20.3+19972_amd64.deb && \N + wget -q https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/intel-opencl-icd_25.40.35563.4-0_amd64.deb && \N + wget -q https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/libigdgmm12_22.8.2_amd64.deb && \N + wget -q https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/libze-intel-gpu1_25.40.35563.4-0_amd64.deb && \N + wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero_1.24.3+u22.04_amd64.deb && \N + wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero-devel_1.24.3+u22.04_amd64.deb && \N dpkg -i *.deb && \N rm -rf /var/lib/apt/lists/\*; \N else \N diff --git a/cmd/gpu_levelzero/zes.c b/cmd/gpu_levelzero/zes.c index a4593deb8..5243e0321 100644 --- a/cmd/gpu_levelzero/zes.c +++ b/cmd/gpu_levelzero/zes.c @@ -137,8 +137,12 @@ static ze_result_t enumerate_zes_devices(void) for (uint32_t i = 0; i < count; ++i) { zes_device_handle_t dev_h = zes_handles[i]; - zes_pci_properties_t pci_props; + zes_pci_properties_t pci_props = { + .pNext = NULL, + }; + if (zesDevicePciGetProperties(dev_h, &pci_props) != ZE_RESULT_SUCCESS) { + print_log(LOG_WARNING, "Failed to get PCI properties for device %d: %X\n", i, res); continue; } @@ -332,8 +336,9 @@ bool zes_device_bus_is_healthy(char* bdf_address, uint32_t* error) return true; } - zes_pci_state_t pci_state; - memset(&pci_state, 0, sizeof(pci_state)); + zes_pci_state_t pci_state = { + .pNext = NULL, + 
}; ze_result_t res = zesDevicePciGetState(handle, &pci_state); if (res == ZE_RESULT_SUCCESS) { @@ -409,7 +414,9 @@ double zes_device_temp_max(char* bdf_address, char* sensor, uint32_t* error) } for (uint32_t i = 0; i < count; ++i) { - zes_temp_properties_t props; + zes_temp_properties_t props = { + .pNext = NULL, + }; res = zesTemperatureGetProperties(tempHandles[i], &props); if (res != ZE_RESULT_SUCCESS) { From d2e3c4b43246e837c59545168954a3d773fa1aa0 Mon Sep 17 00:00:00 2001 From: Tuomas Katila Date: Wed, 19 Nov 2025 12:36:15 +0200 Subject: [PATCH 4/6] build: levelzero: re-download compute-runtime and others By re-downloading the components, we save on the overall container size. While the build time increases slightly, the container size drops by around 100M (520->420). Signed-off-by: Tuomas Katila --- build/docker/intel-gpu-levelzero.Dockerfile | 17 ++++++++++++++--- .../templates/intel-gpu-levelzero.Dockerfile.in | 17 ++++++++++++++--- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/build/docker/intel-gpu-levelzero.Dockerfile b/build/docker/intel-gpu-levelzero.Dockerfile index 3951ef0a3..3737d7787 100644 --- a/build/docker/intel-gpu-levelzero.Dockerfile +++ b/build/docker/intel-gpu-levelzero.Dockerfile @@ -52,6 +52,7 @@ RUN if [ $ROCKYLINUX -eq 0 ]; then \ wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero_1.24.3+u22.04_amd64.deb && \ wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero-devel_1.24.3+u22.04_amd64.deb && \ dpkg -i *.deb && \ + rm -f *.deb && \ rm -rf /var/lib/apt/lists/\*; \ else \ source /etc/os-release && dnf install -y gcc jq wget 'dnf-command(config-manager)' && \ @@ -83,9 +84,19 @@ ARG CMD ARG ROCKYLINUX COPY --from=builder /runtime /runtime RUN if [ $ROCKYLINUX -eq 0 ]; then \ - apt-get update && apt-get install --no-install-recommends -y ocl-icd-libopencl1 && \ - rm /runtime/level-zero-devel_*.deb && \ - cd /runtime && dpkg -i *.deb && rm -rf /runtime && \ 
+ apt-get update && apt-get install --no-install-recommends -y ocl-icd-libopencl1 wget ca-certificates && \ + cd /runtime && \ + wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.20.3/intel-igc-core-2_2.20.3+19972_amd64.deb && \ + wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.20.3/intel-igc-opencl-2_2.20.3+19972_amd64.deb && \ + wget https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/intel-opencl-icd_25.40.35563.4-0_amd64.deb && \ + wget https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/libigdgmm12_22.8.2_amd64.deb && \ + wget https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/libze-intel-gpu1_25.40.35563.4-0_amd64.deb && \ + wget https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero_1.24.3+u22.04_amd64.deb && \ + dpkg -i *.deb && \ + apt-get -y remove wget ca-certificates && \ + apt-get -y autoremove && \ + rm -f *.deb && \ + rm -rf /var/lib/apt/lists/\* && \ rm "/lib/x86_64-linux-gnu/libze_validation"* && rm "/lib/x86_64-linux-gnu/libze_tracing_layer"*; \ else \ cp -a /runtime//*.so* /usr/lib64/ && cp -a /runtime/OpenCL /etc/ && cp -a /runtime/licenses/* /usr/share/licenses/; \ diff --git a/build/docker/templates/intel-gpu-levelzero.Dockerfile.in b/build/docker/templates/intel-gpu-levelzero.Dockerfile.in index fd77fa8fa..c72ee067b 100644 --- a/build/docker/templates/intel-gpu-levelzero.Dockerfile.in +++ b/build/docker/templates/intel-gpu-levelzero.Dockerfile.in @@ -45,6 +45,7 @@ RUN if [ $ROCKYLINUX -eq 0 ]; then \N wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero_1.24.3+u22.04_amd64.deb && \N wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero-devel_1.24.3+u22.04_amd64.deb && \N dpkg -i *.deb && \N + rm -f *.deb && \N rm -rf /var/lib/apt/lists/\*; \N else \N source /etc/os-release && dnf install -y gcc jq wget 'dnf-command(config-manager)' && \N @@ 
-80,9 +81,19 @@ ARG ROCKYLINUX COPY --from=builder /runtime /runtime RUN if [ $ROCKYLINUX -eq 0 ]; then \N - apt-get update && apt-get install --no-install-recommends -y ocl-icd-libopencl1 && \N - rm /runtime/level-zero-devel_*.deb && \N - cd /runtime && dpkg -i *.deb && rm -rf /runtime && \N + apt-get update && apt-get install --no-install-recommends -y ocl-icd-libopencl1 wget ca-certificates && \N + cd /runtime && \N + wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.20.3/intel-igc-core-2_2.20.3+19972_amd64.deb && \N + wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.20.3/intel-igc-opencl-2_2.20.3+19972_amd64.deb && \N + wget https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/intel-opencl-icd_25.40.35563.4-0_amd64.deb && \N + wget https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/libigdgmm12_22.8.2_amd64.deb && \N + wget https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/libze-intel-gpu1_25.40.35563.4-0_amd64.deb && \N + wget https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero_1.24.3+u22.04_amd64.deb && \N + dpkg -i *.deb && \N + apt-get -y remove wget ca-certificates && \N + apt-get -y autoremove && \N + rm -f *.deb && \N + rm -rf /var/lib/apt/lists/\* && \N rm "/lib/x86_64-linux-gnu/libze_validation"* && rm "/lib/x86_64-linux-gnu/libze_tracing_layer"*; \N else \N cp -a /runtime//*.so* /usr/lib64/ && cp -a /runtime/OpenCL /etc/ && cp -a /runtime/licenses/* /usr/share/licenses/; \N From 2ffbe4bb1942c59a7915f60127d54f72c56780ee Mon Sep 17 00:00:00 2001 From: Tuomas Katila Date: Fri, 28 Nov 2025 08:41:18 +0200 Subject: [PATCH 5/6] gpu plugin: add by-path mount options default behaviour stays the same. 
Signed-off-by: Tuomas Katila --- cmd/gpu_plugin/README.md | 21 +++++++ cmd/gpu_plugin/gpu_plugin.go | 32 ++++++++++- cmd/gpu_plugin/gpu_plugin_test.go | 92 +++++++++++++++++++++++++++++++ 3 files changed, 143 insertions(+), 2 deletions(-) diff --git a/cmd/gpu_plugin/README.md b/cmd/gpu_plugin/README.md index d68308019..ebb3bc74c 100644 --- a/cmd/gpu_plugin/README.md +++ b/cmd/gpu_plugin/README.md @@ -19,6 +19,7 @@ Table of Contents * [CDI support](#cdi-support) * [KMD and UMD](#kmd-and-umd) * [Health management](#health-management) + * [by-path mounting](#by-path-mounting) * [Issues with media workloads on multi-GPU setups](#issues-with-media-workloads-on-multi-gpu-setups) * [Workaround for QSV and VA-API](#workaround-for-qsv-and-va-api) @@ -60,6 +61,7 @@ For workloads on different KMDs, see [KMD and UMD](#kmd-and-umd). | -allow-ids | string | "" | A list of PCI Device IDs that are allowed to be registered as resources. Default is empty (=all registered). Cannot be used together with `deny-ids`. | | -deny-ids | string | "" | A list of PCI Device IDs that are denied to be registered as resources. Default is empty (=all registered). Cannot be used together with `allow-ids`. | | -allocation-policy | string | none | 3 possible values: balanced, packed, none. For shared-dev-num > 1: _balanced_ mode spreads workloads among GPU devices, _packed_ mode fills one GPU fully before moving to next, and _none_ selects first available device from kubelet. Default is _none_. | +| -bypath | string | single | 3 possible values: single, none, all. Default is single. Changes how the by-path symlinks are handled by the plugin. More [info](#by-path-mounting). | The plugin also accepts a number of other arguments (common to all plugins) related to logging. Please use the -h option to see the complete list of logging related options. @@ -258,6 +260,25 @@ Kubernetes Device Plugin API allows passing device's healthiness to Kubelet. 
By Temperature limit can be provided via the command line argument, default is 100C. +### By-path mounting + +The DRM devices for the Intel GPUs register `by-path` symlinks under `/dev/dri/by-path`. For each GPU character device, there is a corresponding symlink in the by-path directory: +``` +$ ls -l /dev/dri/by-path/ +lrwxrwxrwx 1 root root 8 oct x 13:09 pci-0000:00:02.0-card -> ../card1 +lrwxrwxrwx 1 root root 13 oct x 13:09 pci-0000:00:02.0-render -> ../renderD128 +``` + +The Intel GPU UMD uses these symlinks to detect hardware properties in some cases. Mounting the by-path symlinks as __symlinks__ with the Device plugin API (DP API) is not possible. When the symlinks are mounted via the DP API, they are mounted as the actual devices, and the symlink information is lost (PCI address). + +To support all possible use cases, GPU plugin allows changing the by-path mounting method. The options are: +* `single` - Symlinks are individually mounted per device. Default. + * Mostly Works, but is known to have issues with some pytorch workloads. See [issue](https://github.com/intel/intel-device-plugins-for-kubernetes/issues/2158). +* `none` - No symlinks are mounted. + * Aligned with docker use where devices are included with privileged mode. +* `all` - All symlinks are mounted even if only one is allocated by the container. + * Optimal for scale-up workloads where all the GPUs are used by the workload. + ### Issues with media workloads on multi-GPU setups OneVPL media API, 3D and compute APIs provide device discovery diff --git a/cmd/gpu_plugin/gpu_plugin.go b/cmd/gpu_plugin/gpu_plugin.go index 0c572975e..8c494fb08 100644 --- a/cmd/gpu_plugin/gpu_plugin.go +++ b/cmd/gpu_plugin/gpu_plugin.go @@ -59,6 +59,10 @@ const ( monitorSuffix = "_monitoring" monitorID = "all" + bypathOptionNone = "none" + bypathOptionAll = "all" + bypathOptionSingle = "single" + levelzeroAffinityMaskEnvVar = "ZE_AFFINITY_MASK" // Period of device scans.
@@ -69,6 +73,7 @@ type cliOptions struct { preferredAllocationPolicy string allowIDs string denyIDs string + bypathMount string sharedDevNum int globalTempLimit int memoryTempLimit int @@ -289,6 +294,16 @@ func (dp *devicePlugin) bypathMountsForPci(pciAddress, bypathDir string) []plugi return mounts } +func (dp *devicePlugin) bypathMountForAll() []pluginapi.Mount { + return []pluginapi.Mount{ + { + ContainerPath: dp.bypathDir, + HostPath: dp.bypathDir, + ReadOnly: true, + }, + } +} + type devicePlugin struct { gpuDeviceReg *regexp.Regexp controlDeviceReg *regexp.Regexp @@ -660,8 +675,20 @@ func (dp *devicePlugin) createMountsAndCDIDevices(cardPath, name string, devSpec mounts := []pluginapi.Mount{} if dp.bypathFound { - if pciAddr, pciErr := dp.pciAddressForCard(cardPath, name); pciErr == nil { - mounts = dp.bypathMountsForPci(pciAddr, dp.bypathDir) + switch dp.options.bypathMount { + case bypathOptionAll: + klog.V(4).Info("Using by-path mount option: all") + mounts = dp.bypathMountForAll() + case bypathOptionNone: + klog.V(4).Info("Using by-path mount option: none") + // no mounts + case bypathOptionSingle: + fallthrough + default: + klog.V(4).Info("Using by-path mount option: single/default") + if pciAddr, pciErr := dp.pciAddressForCard(cardPath, name); pciErr == nil { + mounts = dp.bypathMountsForPci(pciAddr, dp.bypathDir) + } } } @@ -784,6 +811,7 @@ func main() { flag.StringVar(&prefix, "prefix", "", "Prefix for devfs & sysfs paths") flag.BoolVar(&opts.enableMonitoring, "enable-monitoring", false, "whether to enable '*_monitoring' (= all GPUs) resource") flag.BoolVar(&opts.healthManagement, "health-management", false, "enable GPU health management") + flag.StringVar(&opts.bypathMount, "bypath", bypathOptionSingle, "bypath mounting options: single, none, all. 
Default: single") flag.BoolVar(&opts.wslScan, "wsl", false, "scan for / use WSL devices") flag.IntVar(&opts.sharedDevNum, "shared-dev-num", 1, "number of containers sharing the same GPU device") flag.IntVar(&opts.globalTempLimit, "temp-limit", 100, "Global temperature limit at which device is marked unhealthy") diff --git a/cmd/gpu_plugin/gpu_plugin_test.go b/cmd/gpu_plugin/gpu_plugin_test.go index 86fd48e65..3a3837f31 100644 --- a/cmd/gpu_plugin/gpu_plugin_test.go +++ b/cmd/gpu_plugin/gpu_plugin_test.go @@ -21,6 +21,7 @@ import ( "path/filepath" "reflect" "sort" + "strings" "testing" "github.com/pkg/errors" @@ -1155,6 +1156,97 @@ func TestCDIDeviceInclusion(t *testing.T) { } } +func TestByPathOptions(t *testing.T) { + root, err := os.MkdirTemp("", "test_bypathoptions") + if err != nil { + t.Fatalf("Can't create temporary directory: %+v", err) + } + // dirs/files need to be removed for the next test + defer os.RemoveAll(root) + + sysfs := path.Join(root, "sys") + devfs := path.Join(root, "dev") + + sysfslinks := []symlinkItem{ + {"/0042:01:02.0", "/class/drm/card0"}, + {"/0042:01:05.0", "/class/drm/card1"}, + {"driver/i915", "/class/drm/card0/device/driver"}, + {"driver/xe", "/class/drm/card1/device/driver"}, + } + + devfslinks := []symlinkItem{ + {"/dri/card0", "/dri/by-path/pci-0042:01:02.0-card"}, + {"/dri/renderD128", "/dri/by-path/pci-0042:01:02.0-render"}, + {"/dri/card1", "/dri/by-path/pci-0042:01:05.0-card"}, + {"/dri/renderD129", "/dri/by-path/pci-0042:01:05.0-render"}, + } + + sysfsDirs := []string{ + "class/drm/card0/device/drm/card0", + "class/drm/card0/device/drm/renderD128", + "class/drm/card1/device/drm/card1", + "class/drm/card1/device/drm/renderD129", + } + + sysfsFiles := map[string][]byte{ + "class/drm/card0/device/device": []byte("0x9a49"), + "class/drm/card0/device/vendor": []byte("0x8086"), + "class/drm/card1/device/device": []byte("0x9a48"), + "class/drm/card1/device/vendor": []byte("0x8086"), + } + + devfsfiles := map[string][]byte{ + 
"/dri/card0": []byte("1"), + "/dri/renderD128": []byte("1"), + "/dri/card1": []byte("1"), + "/dri/renderD129": []byte("1"), + } + + createSymlinks(t, sysfs, sysfslinks) + createFiles(t, devfs, devfsfiles) + createFiles(t, sysfs, sysfsFiles) + createDirs(t, sysfs, sysfsDirs) + createSymlinks(t, devfs, devfslinks) + + plugin := newDevicePlugin(sysfs+"/class/drm", devfs+"/dri", cliOptions{sharedDevNum: 1, bypathMount: bypathOptionAll}) + plugin.bypathFound = true + + devSpecs := []v1beta1.DeviceSpec{} + + sysfsPath := filepath.Join(sysfs, "class", "drm", "card0") + + mounts, _ := plugin.createMountsAndCDIDevices(sysfsPath, "card0", devSpecs) + + if len(mounts) != 1 { + t.Error("Invalid count for mounts for by-path option 'all'") + } + if !strings.HasSuffix(mounts[0].ContainerPath, "/by-path") { + t.Error("Invalid container path mount for by-path option 'all'") + } + + plugin.options.bypathMount = bypathOptionNone + + mounts, _ = plugin.createMountsAndCDIDevices(sysfsPath, "card0", devSpecs) + + if len(mounts) != 0 { + t.Error("Invalid count for mounts for by-path option 'none'") + } + + plugin.options.bypathMount = bypathOptionSingle + + mounts, _ = plugin.createMountsAndCDIDevices(sysfsPath, "card0", devSpecs) + + if len(mounts) != 2 { + t.Error("Invalid count for mounts for by-path option 'single'") + } + if !strings.HasSuffix(mounts[0].ContainerPath, "by-path/pci-0042:01:02.0-card") { + t.Error("Invalid container path mount for by-path option 'single'") + } + if !strings.HasSuffix(mounts[1].ContainerPath, "by-path/pci-0042:01:02.0-render") { + t.Error("Invalid container path mount for by-path option 'single'") + } +} + func TestParsePCIDeviceIDs(t *testing.T) { tests := []struct { name string From 8bfa57c65c05b5bce5497c94da92d4f81f3939ce Mon Sep 17 00:00:00 2001 From: Tuomas Katila Date: Fri, 28 Nov 2025 12:03:33 +0200 Subject: [PATCH 6/6] operator: add gpu plugin's by-path option Signed-off-by: Tuomas Katila Co-authored-by: Eero Tamminen --- 
cmd/gpu_plugin/README.md | 4 ++-- cmd/gpu_plugin/gpu_plugin.go | 2 +- .../bases/deviceplugin.intel.com_gpudeviceplugins.yaml | 10 ++++++++++ pkg/apis/deviceplugin/v1/gpudeviceplugin_types.go | 6 ++++++ pkg/controllers/gpu/controller.go | 4 ++++ 5 files changed, 23 insertions(+), 3 deletions(-) diff --git a/cmd/gpu_plugin/README.md b/cmd/gpu_plugin/README.md index ebb3bc74c..34cccedc4 100644 --- a/cmd/gpu_plugin/README.md +++ b/cmd/gpu_plugin/README.md @@ -275,8 +275,8 @@ To support possible all use cases, GPU plugin allows changing the by-path mounti * `single` - Symlinks are individually mounted per device. Default. * Mostly Works, but is known to have issues with some pytorch workloads. See [issue](https://github.com/intel/intel-device-plugins-for-kubernetes/issues/2158). * `none` - No symlinks are mounted. - * Aligned with docker use where devices are included with privileged mode. -* `all` - All symlinks are mounted even if only one is allocated by the container. + * Aligned with Docker `privileged` mode devices usage. +* `all` - Mounts whole DRM `by-path` directory. Pro: symlink file types are preserved. Con: symlinks are present for all devices. * Optimal for scale-up workloads where all the GPUs are used by the workload. ### Issues with media workloads on multi-GPU setups diff --git a/cmd/gpu_plugin/gpu_plugin.go b/cmd/gpu_plugin/gpu_plugin.go index 8c494fb08..c3bc65061 100644 --- a/cmd/gpu_plugin/gpu_plugin.go +++ b/cmd/gpu_plugin/gpu_plugin.go @@ -811,7 +811,7 @@ func main() { flag.StringVar(&prefix, "prefix", "", "Prefix for devfs & sysfs paths") flag.BoolVar(&opts.enableMonitoring, "enable-monitoring", false, "whether to enable '*_monitoring' (= all GPUs) resource") flag.BoolVar(&opts.healthManagement, "health-management", false, "enable GPU health management") - flag.StringVar(&opts.bypathMount, "bypath", bypathOptionSingle, "bypath mounting options: single, none, all. 
Default: single") + flag.StringVar(&opts.bypathMount, "bypath", bypathOptionSingle, "DRI device 'by-path/' directory mounting options: single, none, all. Default: single") flag.BoolVar(&opts.wslScan, "wsl", false, "scan for / use WSL devices") flag.IntVar(&opts.sharedDevNum, "shared-dev-num", 1, "number of containers sharing the same GPU device") flag.IntVar(&opts.globalTempLimit, "temp-limit", 100, "Global temperature limit at which device is marked unhealthy") diff --git a/deployments/operator/crd/bases/deviceplugin.intel.com_gpudeviceplugins.yaml b/deployments/operator/crd/bases/deviceplugin.intel.com_gpudeviceplugins.yaml index 0446bd40a..09aaf631d 100644 --- a/deployments/operator/crd/bases/deviceplugin.intel.com_gpudeviceplugins.yaml +++ b/deployments/operator/crd/bases/deviceplugin.intel.com_gpudeviceplugins.yaml @@ -62,6 +62,16 @@ spec: The list can contain IDs in the form of '0x1234,0x49a4,0x50b4'. Cannot be used together with DenyIDs. type: string + bypathMode: + description: |- + ByPathMode changes how plugin handles the DRM by-path/ directory mounting for GPU devices. + See GPU plugin documentation for detailed description of the modes. + If left empty, it defaults to 'single'. + enum: + - none + - single + - all + type: string denyIDs: description: |- DenyIDs is a comma-separated list of PCI IDs of GPU devices that should only be denied by the plugin. diff --git a/pkg/apis/deviceplugin/v1/gpudeviceplugin_types.go b/pkg/apis/deviceplugin/v1/gpudeviceplugin_types.go index c97943c98..959f29cab 100644 --- a/pkg/apis/deviceplugin/v1/gpudeviceplugin_types.go +++ b/pkg/apis/deviceplugin/v1/gpudeviceplugin_types.go @@ -51,6 +51,12 @@ type GpuDevicePluginSpec struct { // +kubebuilder:validation:Enum=balanced;packed;none PreferredAllocationPolicy string `json:"preferredAllocationPolicy,omitempty"` + // ByPathMode changes how plugin handles the DRM by-path/ directory mounting for GPU devices. + // See GPU plugin documentation for detailed description of the modes.
+ // If left empty, it defaults to 'single'. + // +kubebuilder:validation:Enum=none;single;all + ByPathMode string `json:"bypathMode,omitempty"` + // Specialized nodes (e.g., with accelerators) can be Tainted to make sure unwanted pods are not scheduled on them. Tolerations can be set for the plugin pod to neutralize the Taint. Tolerations []v1.Toleration `json:"tolerations,omitempty"` diff --git a/pkg/controllers/gpu/controller.go b/pkg/controllers/gpu/controller.go index bc7b68b54..8c72f00ed 100644 --- a/pkg/controllers/gpu/controller.go +++ b/pkg/controllers/gpu/controller.go @@ -285,5 +285,9 @@ func getPodArgs(gdp *devicepluginv1.GpuDevicePlugin) []string { args = append(args, "-deny-ids", gdp.Spec.DenyIDs) } + if gdp.Spec.ByPathMode != "" { + args = append(args, "-bypath", gdp.Spec.ByPathMode) + } + return args }