From 016fac805816b155b9b0896204206791490a353a Mon Sep 17 00:00:00 2001 From: Antti Kervinen Date: Tue, 18 Nov 2025 14:13:20 +0200 Subject: [PATCH] libct: fix resetting CPU affinity unix.CPUSet is limited to 1024 CPUs. Calling unix.SchedSetaffinity(pid, cpuset) removes all CPUs starting from 1024 from allowed CPUs of pid, even if cpuset is all ones. The consequence of runc trying to reset CPU affinity by default is that it prevents all containers from using those CPUs. This change uses a huge CPU mask to play it safe and get all possible CPUs enabled with a single sched_setaffinity call. Fixes: #5023 Signed-off-by: Antti Kervinen --- internal/linux/linux.go | 17 +++++++++++++++ libcontainer/process_linux.go | 41 ++++++++++++++--------------------- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/internal/linux/linux.go b/internal/linux/linux.go index 722009166f8..b86b2006074 100644 --- a/internal/linux/linux.go +++ b/internal/linux/linux.go @@ -2,6 +2,7 @@ package linux import ( "os" + "unsafe" "golang.org/x/sys/unix" ) @@ -65,6 +66,22 @@ func Recvfrom(fd int, p []byte, flags int) (n int, from unix.Sockaddr, err error return n, from, err } +// SchedSetaffinity wraps sched_setaffinity syscall without unix.CPUSet size limitation. +func SchedSetaffinity(pid int, buf []byte) (err error) { + err = retryOnEINTR(func() error { + _, _, errno := unix.Syscall( + unix.SYS_SCHED_SETAFFINITY, + uintptr(pid), + uintptr(len(buf)), + uintptr((unsafe.Pointer)(&buf[0]))) + if errno != 0 { + return os.NewSyscallError("sched_setaffinity", errno) + } + return nil + }) + return err +} + // Sendmsg wraps [unix.Sendmsg]. 
func Sendmsg(fd int, p, oob []byte, to unix.Sockaddr, flags int) error { err := retryOnEINTR(func() error { diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index c1c27494fd6..c8e7146cdd2 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -1,6 +1,7 @@ package libcontainer import ( + "bytes" "context" "encoding/json" "errors" @@ -24,6 +25,7 @@ import ( "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fs2" + "github.com/opencontainers/runc/internal/linux" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/internal/userns" @@ -163,33 +165,22 @@ type setnsProcess struct { // tryResetCPUAffinity tries to reset the CPU affinity of the process // identified by pid to include all possible CPUs (notwithstanding cgroup -// cpuset restrictions and isolated CPUs). +// cpuset restrictions, isolated CPUs and CPU online status). func tryResetCPUAffinity(pid int) { - // When resetting the CPU affinity, we want to match the configured cgroup - // cpuset (or the default set of all CPUs, if no cpuset is configured) - // rather than some more restrictive affinity we were spawned in (such as - // one that may have been inherited from systemd). The cpuset cgroup used - // to reconfigure the cpumask automatically for joining processes, but - // kcommit da019032819a ("sched: Enforce user requested affinity") changed - // this behaviour in Linux 6.2. + // When resetting the CPU affinity, we want to allow all + // possible CPUs in the system, including those not in + // cpuset.cpus, online or even present (hot-plugged) at call + // time. Using a cpumask any tighter than that may disallow + // using those CPUs if they are added to cpuset.cpus later. 
// - // Parsing cpuset.cpus.effective is quite inefficient (and looking at - // things like /proc/stat would be wrong for most nested containers), but - // luckily sched_setaffinity(2) will implicitly: - // - // * Clamp the cpumask so that it matches the current number of CPUs on - // the system. - // * Mask out any CPUs that are not a member of the target task's - // configured cgroup cpuset. - // - // So we can just pass a very large array of set cpumask bits and the - // kernel will silently convert that to the correct value very cheaply. - var cpuset unix.CPUSet - cpuset.Fill() // set all bits - if err := unix.SchedSetaffinity(pid, &cpuset); err != nil { - logrus.WithError( - os.NewSyscallError("sched_setaffinity", err), - ).Warnf("resetting the CPU affinity of pid %d failed -- the container process may inherit runc's CPU affinity", pid) + // Use the same huge buffer size that the Go 1.25 runtime's getCPUCount() + // uses for its mask. This avoids reading and parsing + // /sys/devices/system/cpu/possible. + const maxCPUs = 64 * 1024 + buf := bytes.Repeat([]byte{0xff}, maxCPUs/8) + if err := linux.SchedSetaffinity(pid, buf); err != nil { + logrus.WithError(err).Warnf("resetting the CPU affinity of pid %d failed -- the container process may inherit runc's CPU affinity", pid) + return } }