Skip to content

Commit

Permalink
Do not interpret GPU index as GPU device minor.
Browse files Browse the repository at this point in the history
There are 3 ways to identify a GPU device that are relevant:
1. GPU Index: This is the index of the GPU in PCIe ordering.
2. GPU Device Minor: This is the minor device number. The GPU device number
   on the system is 195:minor.
3. /dev/nvidia#: The GPU can be accessed via the device mounted at this path.

nvproxy incorrectly assumed that all 3 identifiers are the same. In reality,
only (2) and (3) are always the same. (1) can differ, as demonstrated in #9389.

So passthrough the value of `NVIDIA_VISIBLE_DEVICES` to `nvidia-container-cli`
while invoking the `configure` command via the `--devices` flag. The CLI uses
NVML to figure out the GPU Index -> GPU Device mapping and mounts the right
devices into the containers root filesystem at /container/rootfs/dev/. We
subsequently scan this directory to *infer* the GPU devices exposed to the
container. This information in plumbed through various places and appropriate
virtualized gVisor devices are created.

An unintended benefit of this is that identifying GPUs via GPU UUIDs now works!
Earlier we only accepted GPU index. Now we can just pass through GPU UUID to
the CLI and it will deal with it. So `docker run --gpus="device=GPU-4e716e7d"
works now.

Alternatives considered:
1. Parsing `nvidia-container-cli info` output to figure out index -> minor
   mapping. However, this is a costly operation (as reported in #9215) which
   can take 2-3 seconds.
2. Using NVML in runsc via nvidia/go-nvml library. Apart from the downsides of
   adding another third party dependency, this entails that we duplicate logic
   from nvidia-container-cli into runsc (mainly logic around
   src/cli/common.c:select_devices()). This is technical debt and won't age
   well (will require us to mimic updates to CLI into runsc).

Fixes #9389

PiperOrigin-RevId: 567443164
  • Loading branch information
ayushr2 authored and gvisor-bot committed Sep 21, 2023
1 parent ae1294b commit 05b7c55
Show file tree
Hide file tree
Showing 9 changed files with 126 additions and 63 deletions.
4 changes: 2 additions & 2 deletions pkg/sentry/devices/nvproxy/nvproxy.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,8 @@ func CreateDriverDevtmpfsFiles(ctx context.Context, dev *devtmpfs.Accessor, uvmD

// CreateIndexDevtmpfsFile creates the device special file in dev for the
// device with the given index.
func CreateIndexDevtmpfsFile(ctx context.Context, dev *devtmpfs.Accessor, index uint32) error {
return dev.CreateDeviceFile(ctx, fmt.Sprintf("nvidia%d", index), vfs.CharDevice, nvgpu.NV_MAJOR_DEVICE_NUMBER, index, 0666)
func CreateIndexDevtmpfsFile(ctx context.Context, dev *devtmpfs.Accessor, minor uint32) error {
return dev.CreateDeviceFile(ctx, fmt.Sprintf("nvidia%d", minor), vfs.CharDevice, nvgpu.NV_MAJOR_DEVICE_NUMBER, minor, 0666)
}

// +stateify savable
Expand Down
1 change: 1 addition & 0 deletions runsc/boot/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ go_library(
"loader.go",
"mount_hints.go",
"network.go",
"nvidia.go",
"overlay.go",
"seccheck.go",
"strace.go",
Expand Down
52 changes: 52 additions & 0 deletions runsc/boot/nvidia.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// Copyright 2023 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package boot

import (
"fmt"
"strconv"
"strings"
)

// NvidiaDevMinors can be used to pass nvidia device minors via flags.
type NvidiaDevMinors []uint32

// String implements flag.Value.
func (n *NvidiaDevMinors) String() string {
minors := make([]string, 0, len(*n))
for _, minor := range *n {
minors = append(minors, strconv.Itoa(int(minor)))
}
return strings.Join(minors, ",")
}

// Get implements flag.Value.
func (n *NvidiaDevMinors) Get() any {
return n
}

// Set implements flag.Value and appends a device minor from the command
// line to the device minors array. Set(String()) should be idempotent.
func (n *NvidiaDevMinors) Set(s string) error {
minors := strings.Split(s, ",")
for _, minor := range minors {
minorVal, err := strconv.Atoi(minor)
if err != nil {
return fmt.Errorf("invalid device minor value (%d): %v", minorVal, err)
}
*n = append(*n, uint32(minorVal))
}
return nil
}
8 changes: 4 additions & 4 deletions runsc/boot/vfs.go
Original file line number Diff line number Diff line change
Expand Up @@ -1195,16 +1195,16 @@ func nvproxyRegisterDevicesAndCreateFiles(ctx context.Context, info *containerIn
// In Docker mode, create all the device files now.
// In non-Docker mode, these are instead created as part of
// `createDeviceFiles`, using the spec's Device list.
nvd, err := specutils.NvidiaDeviceNumbers(info.spec, info.conf)
minors, err := specutils.FindAllGPUDevices("/")
if err != nil {
return fmt.Errorf("getting nvidia devices: %w", err)
}
if err := nvproxy.CreateDriverDevtmpfsFiles(ctx, a, uvmDevMajor); err != nil {
return fmt.Errorf("creating nvproxy devtmpfs files: %w", err)
}
for _, d := range nvd {
if err := nvproxy.CreateIndexDevtmpfsFile(ctx, a, d); err != nil {
return fmt.Errorf("creating nvproxy devtmpfs file for device %d: %w", d, err)
for _, minor := range minors {
if err := nvproxy.CreateIndexDevtmpfsFile(ctx, a, minor); err != nil {
return fmt.Errorf("creating nvproxy devtmpfs file for device minor %d: %w", minor, err)
}
}
}
Expand Down
7 changes: 6 additions & 1 deletion runsc/cmd/boot.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,10 @@ type Boot struct {
// boot process should invoke setuid/setgid for root user. This is mainly
// used to synchronize rootless user namespace initialization.
syncUsernsFD int

// nvidiaDevMinors is a list of device minors for Nvidia GPU devices exposed
// to the sandbox.
nvidiaDevMinors boot.NvidiaDevMinors
}

// Name implements subcommands.Command.Name.
Expand Down Expand Up @@ -201,6 +205,7 @@ func (b *Boot) SetFlags(f *flag.FlagSet) {
f.IntVar(&b.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to read list of mounts after they have been resolved (direct paths, no symlinks).")
f.IntVar(&b.podInitConfigFD, "pod-init-config-fd", -1, "file descriptor to the pod init configuration file.")
f.Var(&b.sinkFDs, "sink-fds", "ordered list of file descriptors to be used by the sinks defined in --pod-init-config.")
f.Var(&b.nvidiaDevMinors, "nvidia-dev-minors", "list of device minors for Nvidia GPU devices exposed to the sandbox.")

// Profiling flags.
b.profileFDs.SetFromFlags(f)
Expand Down Expand Up @@ -261,7 +266,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcomma
}

if b.setUpRoot {
if err := setUpChroot(b.pidns, spec, conf); err != nil {
if err := setUpChroot(b.pidns, spec, conf, b.nvidiaDevMinors); err != nil {
util.Fatalf("error setting up chroot: %v", err)
}
argOverride["setup-root"] = "false"
Expand Down
14 changes: 5 additions & 9 deletions runsc/cmd/chroot.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ func copyFile(dst, src string) error {

// setUpChroot creates an empty directory with runsc mounted at /runsc and proc
// mounted at /proc.
func setUpChroot(pidns bool, spec *specs.Spec, conf *config.Config) error {
func setUpChroot(pidns bool, spec *specs.Spec, conf *config.Config, nvidiaDevMinors []uint32) error {
// We are a new mount namespace, so we can use /tmp as a directory to
// construct a new root.
chroot := os.TempDir()
Expand Down Expand Up @@ -120,7 +120,7 @@ func setUpChroot(pidns bool, spec *specs.Spec, conf *config.Config) error {
}
}

if err := nvproxyUpdateChroot(chroot, spec, conf); err != nil {
if err := nvproxyUpdateChroot(chroot, spec, conf, nvidiaDevMinors); err != nil {
return fmt.Errorf("error configuring chroot for Nvidia GPUs: %w", err)
}
if err := tpuProxyUpdateChroot(chroot, conf); err != nil {
Expand Down Expand Up @@ -183,7 +183,7 @@ func tpuProxyUpdateChroot(chroot string, conf *config.Config) error {
return nil
}

func nvproxyUpdateChroot(chroot string, spec *specs.Spec, conf *config.Config) error {
func nvproxyUpdateChroot(chroot string, spec *specs.Spec, conf *config.Config, devMinors []uint32) error {
if !specutils.GPUFunctionalityRequested(spec, conf) {
return nil
}
Expand All @@ -196,12 +196,8 @@ func nvproxyUpdateChroot(chroot string, spec *specs.Spec, conf *config.Config) e
if err := mountInChroot(chroot, "/dev/nvidia-uvm", "/dev/nvidia-uvm", "bind", unix.MS_BIND); err != nil {
return fmt.Errorf("error mounting /dev/nvidia-uvm in chroot: %w", err)
}
deviceIDs, err := specutils.NvidiaDeviceNumbers(spec, conf)
if err != nil {
return fmt.Errorf("enumerating nvidia device IDs: %w", err)
}
for _, deviceID := range deviceIDs {
path := fmt.Sprintf("/dev/nvidia%d", deviceID)
for _, devMinor := range devMinors {
path := fmt.Sprintf("/dev/nvidia%d", devMinor)
if err := mountInChroot(chroot, path, path, "bind", unix.MS_BIND); err != nil {
return fmt.Errorf("error mounting %q in chroot: %v", path, err)
}
Expand Down
28 changes: 19 additions & 9 deletions runsc/container/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,23 @@ func New(conf *config.Config, args Args) (*Container, error) {
PassFiles: args.PassFiles,
ExecFile: args.ExecFile,
}
if specutils.GPUFunctionalityRequested(args.Spec, conf) {
// Expose all Nvidia devices in /dev/, because we don't know what
// devices future subcontainers will want.
searchDir := "/"
if conf.NVProxyDocker {
// For single-container use cases like Docker, the container rootfs
// is populated with the devices that need to be exposed. Scan that.
// This scan needs to happen outside the sandbox process because
// /rootfs/dev/nvidia* mounts made in gofer may not be propagated to
// sandbox's mount namespace.
searchDir = args.Spec.Root.Path
}
sandArgs.NvidiaDevMinors, err = specutils.FindAllGPUDevices(searchDir)
if err != nil {
return fmt.Errorf("FindAllGPUDevices: %w", err)
}
}
sand, err := sandbox.New(conf, sandArgs)
if err != nil {
return fmt.Errorf("cannot create sandbox: %w", err)
Expand Down Expand Up @@ -1802,17 +1819,10 @@ func nvproxySetupAfterGoferUserns(spec *specs.Spec, conf *config.Config, goferCm
ldconfigPath = "/sbin/ldconfig"
}

var nvidiaDevices strings.Builder
deviceIDs, err := specutils.NvidiaDeviceNumbers(spec, conf)
devices, err := specutils.NvidiaDeviceList(spec, conf)
if err != nil {
return nil, fmt.Errorf("failed to get nvidia device numbers: %w", err)
}
for i, deviceID := range deviceIDs {
if i > 0 {
nvidiaDevices.WriteRune(',')
}
nvidiaDevices.WriteString(fmt.Sprintf("%d", uint32(deviceID)))
}

// Create synchronization FD for nvproxy.
fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
Expand All @@ -1834,7 +1844,7 @@ func nvproxySetupAfterGoferUserns(spec *specs.Spec, conf *config.Config, goferCm
"--utility",
"--compute",
fmt.Sprintf("--pid=%d", goferCmd.Process.Pid),
fmt.Sprintf("--device=%s", nvidiaDevices.String()),
fmt.Sprintf("--device=%s", devices),
spec.Root.Path,
}
log.Debugf("Executing %q", argv)
Expand Down
9 changes: 9 additions & 0 deletions runsc/sandbox/sandbox.go
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,10 @@ type Args struct {

// ExecFile is the file from the host used for program execution.
ExecFile *os.File

// NvidiaDevMinors is the list of device minors for Nvidia GPU devices
// exposed to the sandbox.
NvidiaDevMinors boot.NvidiaDevMinors
}

// New creates the sandbox process. The caller must call Destroy() on the
Expand Down Expand Up @@ -752,6 +756,11 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn
return err
}

// Pass nvidia device minors.
if len(args.NvidiaDevMinors) > 0 {
cmd.Args = append(cmd.Args, "--nvidia-dev-minors="+args.NvidiaDevMinors.String())
}

// Pass overlay mediums.
cmd.Args = append(cmd.Args, "--overlay-mediums="+args.OverlayMediums.String())

Expand Down
66 changes: 28 additions & 38 deletions runsc/specutils/nvidia.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@ package specutils

import (
"fmt"
"path"
"path/filepath"
"regexp"
"strconv"
"strings"

specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/runsc/config"
)

Expand Down Expand Up @@ -52,26 +52,12 @@ func GPUFunctionalityRequested(spec *specs.Spec, conf *config.Config) bool {
return nvd != "" && nvd != "void"
}

// CanAccessAtLeastOneGPU returns true if the sandbox and container should
// be able to access at least one Nvidia GPU. This is a function of the
// sandbox configuration and the container spec's NVIDIA_VISIBLE_DEVICES
// environment variable.
func CanAccessAtLeastOneGPU(spec *specs.Spec, conf *config.Config) bool {
gpus, err := NvidiaDeviceNumbers(spec, conf)
if err != nil {
log.Warningf("Cannot determine if the container should have access to GPUs: %v", err)
return false
}
return len(gpus) > 0
}

// nvidiaDeviceRegex matches Nvidia GPU device paths.
var nvidiaDeviceRegex = regexp.MustCompile(`^/dev/nvidia(\d+)$`)

// findAllGPUDevices returns the Nvidia GPU device minor numbers of all GPUs
// on the machine.
func findAllGPUDevices() ([]uint32, error) {
paths, err := filepath.Glob("/dev/nvidia*")
// FindAllGPUDevices returns the Nvidia GPU device minor numbers of all GPUs
// mounted in the provided rootfs.
func FindAllGPUDevices(rootfs string) ([]uint32, error) {
devPathPrefix := path.Join(rootfs, "dev/nvidia")
nvidiaDeviceRegex := regexp.MustCompile(fmt.Sprintf(`^%s(\d+)$`, devPathPrefix))
paths, err := filepath.Glob(devPathPrefix + "*")
if err != nil {
return nil, fmt.Errorf("enumerating Nvidia device files: %w", err)
}
Expand All @@ -88,38 +74,42 @@ func findAllGPUDevices() ([]uint32, error) {
return devMinors, nil
}

// NvidiaDeviceNumbers returns the Nvidia GPU device minor numbers that
// should be visible to the specified container.
// In Docker mode, this is the set of devices specified in
// NVIDIA_VISIBLE_DEVICES.
// In non-Docker mode, this is all Nvidia devices, as we cannot know the set
// of usable GPUs until subcontainer creation.
func NvidiaDeviceNumbers(spec *specs.Spec, conf *config.Config) ([]uint32, error) {
// NvidiaDeviceList returns the list of devices that should be visible to the
// sandbox. In Docker mode, this is the set of devices specified in
// NVIDIA_VISIBLE_DEVICES. In non-Docker mode, this is all Nvidia devices, as
// we cannot know the set of usable GPUs until subcontainer creation.
func NvidiaDeviceList(spec *specs.Spec, conf *config.Config) (string, error) {
if !GPUFunctionalityRequested(spec, conf) {
return nil, nil
return "", nil
}
if !conf.NVProxyDocker {
// nvproxy enabled in non-Docker mode.
// Return all GPUs on the machine.
return findAllGPUDevices()
return "all", nil
}
// nvproxy is enabled in Docker mode.
nvd, _ := EnvVar(spec.Process.Env, nvdEnvVar)
if nvd == "none" {
return nil, nil
return "", nil
}
if nvd == "all" {
return findAllGPUDevices()
return "all", nil
}
var devMinors []uint32
// Expect nvd to be a list of indices; UUIDs aren't supported
// yet.
for _, indexStr := range strings.Split(nvd, ",") {
index, err := strconv.ParseUint(indexStr, 10, 32)
for _, gpuDev := range strings.Split(nvd, ",") {
// Validate gpuDev. We only support the following formats for now:
// * GPU indices (e.g. 0,1,2)
// * GPU UUIDs (e.g. GPU-fef8089b)
//
// We do not support MIG devices yet.
if strings.HasPrefix(gpuDev, "GPU-") {
continue
}
_, err := strconv.ParseUint(gpuDev, 10, 32)
if err != nil {
return nil, fmt.Errorf("invalid %q in NVIDIA_VISIBLE_DEVICES %q: %w", indexStr, nvd, err)
return "", fmt.Errorf("invalid %q in NVIDIA_VISIBLE_DEVICES %q: %w", gpuDev, nvd, err)
}
devMinors = append(devMinors, uint32(index))
}
return devMinors, nil
return nvd, nil
}

0 comments on commit 05b7c55

Please sign in to comment.