@@ -91,6 +91,91 @@ namespace mesos {
9191namespace internal {
9292namespace slave {
9393
94+ namespace {
95+
96+ Try<Nothing> allowDevice (
97+ const std::string& hierarchy,
98+ const std::string& cgroup,
99+ unsigned int major,
100+ unsigned int minor)
101+ {
102+ cgroups::devices::Entry entry;
103+ entry.selector .type = Entry::Selector::Type::CHARACTER;
104+ entry.selector .major = major;
105+ entry.selector .minor = minor;
106+ entry.access .read = true ;
107+ entry.access .write = true ;
108+ entry.access .mknod = true ;
109+
110+ Try<Nothing> allow = cgroups::devices::allow (
111+ hierarchy, cgroup, entry);
112+
113+ if (allow.isError ()) {
114+ return Error (" Failed to allow device '" + stringify (entry)
115+ + " ': " + allow.error ());
116+ }
117+
118+ return Nothing ();
119+ }
120+
121+
122+ Try<Nothing> denyDevice (
123+ const std::string& hierarchy,
124+ const std::string& cgroup,
125+ unsigned int major,
126+ unsigned int minor)
127+ {
128+ cgroups::devices::Entry entry;
129+ entry.selector .type = Entry::Selector::Type::CHARACTER;
130+ entry.selector .major = major;
131+ entry.selector .minor = minor;
132+ entry.access .read = true ;
133+ entry.access .write = true ;
134+ entry.access .mknod = true ;
135+
136+ Try<Nothing> deny = cgroups::devices::deny (
137+ hierarchy, cgroup, entry);
138+
139+ if (deny.isError ()) {
140+ return Error (" Failed to deny device '" + stringify (entry)
141+ + " ': " + deny.error ());
142+ }
143+
144+ return Nothing ();
145+ }
146+
147+
148+ Try<Nothing> addDeviceToContainer (
149+ const string& device,
150+ const string& devicesDir,
151+ const string& rootfsDir,
152+ ContainerLaunchInfo& launchInfo)
153+ {
154+ const string devicePath = path::join (
155+ devicesDir, strings::remove (device, " /dev/" , strings::PREFIX), device);
156+
157+ Try<Nothing> mknod =
158+ fs::chroot::copyDeviceNode (device, devicePath);
159+ if (mknod.isError ()) {
160+ return Error (" Failed to copy device: " + mknod.error ());
161+ }
162+
163+ // Since we are adding the GPU devices to the container, make
164+ // them read/write to guarantee that they are accessible inside
165+ // the container.
166+ Try<Nothing> chmod = os::chmod (devicePath, 0666 );
167+ if (chmod.isError ()) {
168+ return Error (" Failed to set permissions: " + chmod.error ());
169+ }
170+
171+ *launchInfo.add_mounts () = protobuf::slave::createContainerMount (
172+ devicePath, path::join (rootfsDir, device), MS_BIND);
173+
174+ return Nothing ();
175+ }
176+
177+ } // namespace {
178+
94179NvidiaGpuIsolatorProcess::NvidiaGpuIsolatorProcess (
95180 const Flags& _flags,
96181 const string& _hierarchy,
@@ -297,9 +382,24 @@ Future<Nothing> NvidiaGpuIsolatorProcess::recover(
297382 foreach (const Gpu& gpu, available) {
298383 if (entry.selector .major == gpu.major &&
299384 entry.selector .minor == gpu.minor ) {
300- containerGpus.insert (gpu);
301- break ;
302- }
385+ if (gpu.ismig ) {
386+ // The GPU device itself; only a match with a GPU that
387+ // isn't a MIG instance, as MIG instances need access to
388+ // the GPU device and the MIG devices.
389+ continue ;
390+ }
391+
392+ containerGpus.insert (gpu);
393+ break ;
394+ }
395+
396+ // Match up MIG devices
397+ if ((entry.selector .major == gpu.caps_major )
398+ && ((entry.selector .minor == gpu.gi_minor )
399+ || (entry.selector .minor == gpu.ci_minor ))) {
400+ containerGpus.insert (gpu);
401+ break ;
402+ }
303403 }
304404 }
305405
@@ -443,39 +543,23 @@ Future<Option<ContainerLaunchInfo>> NvidiaGpuIsolatorProcess::_prepare(
443543 }
444544
445545 foreach (const string& device, nvidia.get ()) {
446- // The directory `/dev/nvidia-caps` was introduced in CUDA 11.0, just
447- // ignore it since we only care about the Nvidia GPU device files.
448- //
449- // TODO(qianzhang): Figure out how to handle the directory
450- // `/dev/nvidia-caps` more properly.
546+ // Ignore /dev/nvidia-caps, we'll handle that directory later on
451547 if (device == " /dev/nvidia-caps" ) {
452548 continue ;
453549 }
454550
455- const string devicePath = path::join (
456- devicesDir, strings::remove (device, " /dev/" , strings::PREFIX), device);
457-
458- Try<Nothing> mknod =
459- fs::chroot::copyDeviceNode (device, devicePath);
460- if (mknod.isError ()) {
461- return Failure (
462- " Failed to copy device '" + device + " ': " + mknod.error ());
551+ Try<Nothing> added = addDeviceToContainer (device, devicesDir, containerConfig.rootfs (), launchInfo);
552+ if (added.isError ()) {
553+ return Failure (" Could not add device '" + device + " ' to container: " + added.error ());
463554 }
555+ }
464556
465- // Since we are adding the GPU devices to the container, make
466- // them read/write to guarantee that they are accessible inside
467- // the container.
468- Try<Nothing> chmod = os::chmod (devicePath, 0666 );
469- if (chmod.isError ()) {
470- return Failure (
471- " Failed to set permissions on device '" + device + " ': " +
472- chmod.error ());
557+ Try<list<string>> caps = os::glob (" /dev/nvidia-caps/*" );
558+ foreach (const string& device, caps.get ()) {
559+ Try<Nothing> added = addDeviceToContainer (device, devicesDir, containerConfig.rootfs (), launchInfo);
560+ if (added.isError ()) {
561+ return Failure (" Could not add device '" + device + " ' to container: " + added.error ());
473562 }
474-
475- *launchInfo.add_mounts () = protobuf::slave::createContainerMount (
476- devicePath,
477- path::join (containerConfig.rootfs (), device),
478- MS_BIND);
479563 }
480564
481565 return launchInfo;
@@ -520,31 +604,55 @@ Future<Nothing> NvidiaGpuIsolatorProcess::update(
520604 } else if (requested < info->allocated .size ()) {
521605 size_t fewer = info->allocated .size () - requested;
522606
607+ set<std::pair<unsigned int , unsigned int >> deallocated_devs;
523608 set<Gpu> deallocated;
524609
525610 for (size_t i = 0 ; i < fewer; i++) {
526611 const auto gpu = info->allocated .begin ();
527612
528- cgroups::devices::Entry entry;
529- entry.selector .type = Entry::Selector::Type::CHARACTER;
530- entry.selector .major = gpu->major ;
531- entry.selector .minor = gpu->minor ;
532- entry.access .read = true ;
533- entry.access .write = true ;
534- entry.access .mknod = true ;
535-
536- Try<Nothing> deny = cgroups::devices::deny (
537- hierarchy, info->cgroup , entry);
538-
539- if (deny.isError ()) {
540- return Failure (" Failed to deny cgroups access to GPU device"
541- " '" + stringify (entry) + " ': " + deny.error ());
613+ // We can't blindly deny the main GPU device, as it is needed
614+ // by other MIG devices on that same GPU.
615+ deallocated_devs.insert (std::make_pair (gpu->major , gpu->minor ));
616+
617+ if (gpu->ismig ) {
618+ // MIG GPU instance
619+ Try<Nothing> deny = denyDevice (hierarchy, info->cgroup , gpu->caps_major , gpu->gi_minor );
620+ if (deny.isError ()) {
621+ return Failure (" Failed to deny cgroups access to MIG GI device: " + deny.error ());
622+ }
623+
624+ // MIG Compute instance
625+ deny = denyDevice (hierarchy, info->cgroup , gpu->caps_major , gpu->ci_minor );
626+ if (deny.isError ()) {
627+ return Failure (" Failed to deny cgroups access to MIG CI device: " + deny.error ());
628+ }
542629 }
543630
544631 deallocated.insert (*gpu);
545632 info->allocated .erase (gpu);
546633 }
547634
635+ set<std::pair<unsigned int , unsigned int >> allocated_devs;
636+ foreach (Gpu gpu, info->allocated ) {
637+ allocated_devs.insert (std::make_pair (gpu.major , gpu.minor ));
638+ }
639+
640+ // Any GPU device present in the difference of the two sets can now
641+ // be denied, as it is not needed by any of the remaining allocated
642+ // GPUs.
643+ set<std::pair<unsigned int , unsigned int >> safe_deny;
644+ std::set_difference (deallocated_devs.begin (), deallocated_devs.end (),
645+ allocated_devs.begin (), allocated_devs.end (),
646+ std::inserter (safe_deny, safe_deny.begin ()));
647+
648+ foreach (auto dev, safe_deny) {
649+ // Main GPU device node
650+ Try<Nothing> deny = denyDevice (hierarchy, info->cgroup , dev.first , dev.second );
651+ if (deny.isError ()) {
652+ return Failure (" Failed to deny cgroups access to GPU device: " + deny.error ());
653+ }
654+ }
655+
548656 return allocator.deallocate (deallocated);
549657 }
550658
@@ -563,20 +671,21 @@ Future<Nothing> NvidiaGpuIsolatorProcess::_update(
563671 Info* info = CHECK_NOTNULL (infos.at (containerId));
564672
565673 foreach (const Gpu& gpu, allocation) {
566- cgroups::devices::Entry entry;
567- entry.selector .type = Entry::Selector::Type::CHARACTER;
568- entry.selector .major = gpu.major ;
569- entry.selector .minor = gpu.minor ;
570- entry.access .read = true ;
571- entry.access .write = true ;
572- entry.access .mknod = true ;
674+ Try<Nothing> allow = allowDevice (hierarchy, info->cgroup , gpu.major , gpu.minor );
675+ if (allow.isError ()) {
676+ return Failure (" Failed to grant cgroups access to GPU device: " + allow.error ());
677+ }
573678
574- Try<Nothing> allow = cgroups::devices::allow (
575- hierarchy, info->cgroup , entry);
679+ if (gpu.ismig ) {
680+ allow = allowDevice (hierarchy, info->cgroup , gpu.caps_major , gpu.gi_minor );
681+ if (allow.isError ()) {
682+ return Failure (" Failed to grant cgroups access to MIG GI device: " + allow.error ());
683+ }
576684
577- if (allow.isError ()) {
578- return Failure (" Failed to grant cgroups access to GPU device"
579- " '" + stringify (entry) + " ': " + allow.error ());
685+ allow = allowDevice (hierarchy, info->cgroup , gpu.caps_major , gpu.ci_minor );
686+ if (allow.isError ()) {
687+ return Failure (" Failed to grant cgroups access to MIG CI device: " + allow.error ());
688+ }
580689 }
581690 }
582691
0 commit comments