From ef74d05f150bde7f7b70dda2587392d87a34386c Mon Sep 17 00:00:00 2001 From: Tom Wieczorek Date: Tue, 19 Dec 2023 11:18:10 +0100 Subject: [PATCH] Use private cgroup namespaces for cgroup v2 Using the host's cgroup namespace along with a writable mount of the entire cgroup fs messes with container isolation quite a bit. The main purpose of this is to get a writable mount of the cgroup fs inside containers, so that init systems are able to set up their own cgroups accordingly. Use a different approach to achieve the same effect: Use a private cgroup namespace. Privileged containers will automatically have write access. A read-write mount is only performed when running non-privileged containers. Signed-off-by: Tom Wieczorek --- pkg/cluster/cluster.go | 56 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go index ad6f5e7..94a39ab 100644 --- a/pkg/cluster/cluster.go +++ b/pkg/cluster/cluster.go @@ -9,6 +9,7 @@ import ( "fmt" "io" "os" + "path" "regexp" "strconv" "strings" @@ -284,10 +285,57 @@ func (c *Cluster) createMachineRunArgs(machine *Machine, name string, i int) []s "--tmpfs", "/tmp:exec,mode=777", } if docker.CgroupVersion() == "2" { - runArgs = append(runArgs, "--cgroupns", "host", - "--cgroup-parent", "bootloose.slice", - "-v", "/sys/fs/cgroup:/sys/fs/cgroup:rw") - + runArgs = append(runArgs, "--cgroupns", "private") + + if !machine.spec.Privileged { + // Non-privileged containers will have their /sys/fs/cgroup folder + // mounted read-only, even when running in private cgroup + // namespaces. This is a bummer for init systems. Containers could + // probably remount the cgroup fs in read-write mode, but that would + // require CAP_SYS_ADMIN _and_ a custom logic in the container's + // entry point. Podman has `--security-opt unmask=/sys/fs/cgroup`, + // but that's not a thing for Docker. 
The only other way to get a + // writable cgroup fs inside the container is to explicitly mount + // it. Some references: + // - https://github.com/moby/moby/issues/42275 + // - https://serverfault.com/a/1054414 + + // Docker will use cgroups like + // /docker-{{ContainerID}}.scope. + // + // Ideally, we could mount those to /sys/fs/cgroup inside the + // containers. But there's a chicken-and-egg problem, as we only + // know the container ID _after_ the container creation. As a + // duct-tape solution, we mount our own cgroup as the root, which is + // unrelated to the Docker-managed one: + // /cluster-{{ClusterID}}.scope/machine-{{MachineID}}.scope + + // FIXME: How to clean this up? Especially when Docker is being run + // on a different machine? + + // Just assume that the cgroup fs is mounted at its default + // location. We could try to figure this out via + // /proc/self/mountinfo, but it's really not worth the hassle. + const cgroupMountpoint = "/sys/fs/cgroup" + + // Use this as the parent cgroup for everything. Note that if Docker + // uses the systemd cgroup driver, the cgroup name has to end with + // .slice. This is not a requirement for the cgroupfs driver; it + // won't care. Hence, just always use the .slice suffix, no matter + // if it's required or not. + const cgroupParent = "bootloose.slice" + + cg := path.Join( + cgroupMountpoint, cgroupParent, + fmt.Sprintf("cluster-%s.scope", c.spec.Cluster.Name), + fmt.Sprintf("machine-%s.scope", name), + ) + + runArgs = append(runArgs, + "--cgroup-parent", cgroupParent, + "-v", fmt.Sprintf("%s:%s:rw", cg, cgroupMountpoint), + ) + } } else { runArgs = append(runArgs, "-v", "/sys/fs/cgroup:/sys/fs/cgroup:ro") }