Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions internal/linux/linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package linux

import (
"os"
"unsafe"

"golang.org/x/sys/unix"
)
Expand Down Expand Up @@ -65,6 +66,22 @@ func Recvfrom(fd int, p []byte, flags int) (n int, from unix.Sockaddr, err error
return n, from, err
}

// SchedSetaffinity wraps sched_setaffinity syscall without unix.CPUSet size limitation.
func SchedSetaffinity(pid int, buf []byte) (err error) {
err = retryOnEINTR(func() error {
_, _, errno := unix.Syscall(
unix.SYS_SCHED_SETAFFINITY,
uintptr(pid),
uintptr(len(buf)),
uintptr((unsafe.Pointer)(&buf[0])))
if errno != 0 {
return os.NewSyscallError("sched_setaffinity", errno)
}
return nil
})
return err
}

// Sendmsg wraps [unix.Sendmsg].
func Sendmsg(fd int, p, oob []byte, to unix.Sockaddr, flags int) error {
err := retryOnEINTR(func() error {
Expand Down
41 changes: 16 additions & 25 deletions libcontainer/process_linux.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package libcontainer

import (
"bytes"
"context"
"encoding/json"
"errors"
Expand All @@ -24,6 +25,7 @@ import (

"github.com/opencontainers/cgroups"
"github.com/opencontainers/cgroups/fs2"
"github.com/opencontainers/runc/internal/linux"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/intelrdt"
"github.com/opencontainers/runc/libcontainer/internal/userns"
Expand Down Expand Up @@ -163,33 +165,22 @@ type setnsProcess struct {

// tryResetCPUAffinity tries to reset the CPU affinity of the process
// identified by pid to include all possible CPUs (notwithstanding cgroup
// cpuset restrictions and isolated CPUs).
// cpuset restrictions, isolated CPUs and CPU online status).
func tryResetCPUAffinity(pid int) {
// When resetting the CPU affinity, we want to match the configured cgroup
// cpuset (or the default set of all CPUs, if no cpuset is configured)
// rather than some more restrictive affinity we were spawned in (such as
// one that may have been inherited from systemd). The cpuset cgroup used
// to reconfigure the cpumask automatically for joining processes, but
// kcommit da019032819a ("sched: Enforce user requested affinity") changed
// this behaviour in Linux 6.2.
// When resetting the CPU affinity, we want to allow all
// possible CPUs in the system, including those not in
// cpuset.cpus, online or even present (hot-plugged) at call
// time. Using a cpumask any tighter than that may disallow
// using those CPUs if they are added to cpuset.cpus later.
//
// Parsing cpuset.cpus.effective is quite inefficient (and looking at
// things like /proc/stat would be wrong for most nested containers), but
// luckily sched_setaffinity(2) will implicitly:
//
// * Clamp the cpumask so that it matches the current number of CPUs on
// the system.
// * Mask out any CPUs that are not a member of the target task's
// configured cgroup cpuset.
//
// So we can just pass a very large array of set cpumask bits and the
// kernel will silently convert that to the correct value very cheaply.
var cpuset unix.CPUSet
cpuset.Fill() // set all bits
if err := unix.SchedSetaffinity(pid, &cpuset); err != nil {
logrus.WithError(
os.NewSyscallError("sched_setaffinity", err),
).Warnf("resetting the CPU affinity of pid %d failed -- the container process may inherit runc's CPU affinity", pid)
// Use similar huge buffer as Go 1.25 runtime in getCPUCount()
// does for mask. This avoids reading and parsing
// /sys/devices/system/cpu/possible.
const maxCPUs = 64 * 1024
buf := bytes.Repeat([]byte{0xff}, maxCPUs/8)
if err := linux.SchedSetaffinity(pid, buf); err != nil {
logrus.WithError(err).Warnf("resetting the CPU affinity of pid %d failed -- the container process may inherit runc's CPU affinity", pid)
return
}
}

Expand Down