diff options
| author | Kir Kolyshkin <kolyshkin@gmail.com> | 2022-07-14 21:18:15 -0700 |
|---|---|---|
| committer | Gopher Robot <gobot@golang.org> | 2022-09-09 15:34:16 +0000 |
| commit | bca17d16ca0dabbe1b533bb78f367d64e076fe73 (patch) | |
| tree | 3cf5053ab51cc2944e6eb187d263a3ba6f0203f8 /src/syscall/exec_linux.go | |
| parent | f53b2111e489e61461837737cf69371a043d4fd9 (diff) | |
| download | go-bca17d16ca0dabbe1b533bb78f367d64e076fe73.tar.xz | |
syscall: add CgroupFD support for ForkExec on Linux
Implement the CLONE_INTO_CGROUP feature, which allows a child to be put
in a specified cgroup in a clean and simple way. Note that the feature
only works for cgroup v2 and requires Linux kernel 5.7 or newer.
Using the feature requires a new syscall, clone3. Currently this is the
only reason to use clone3, but the code is structured in a way so that
other cases may be easily added in the future.
Add a test case.
While at it, try to simplify the syscall calling code in
forkAndExecInChild1, which became complicated over time because:
1. It was using either rawVforkSyscall or RawSyscall6 depending on
whether CLONE_NEWUSER was set.
2. On Linux/s390, the first two arguments to the clone(2) system call are
swapped (which deserves a mention in the Linux ABI hall of shame). This
was worked around in rawVforkSyscall on s390, but had to be
implemented via a switch/case when using RawSyscall6, making the code
less clear.
Let's
- modify rawVforkSyscall to have two arguments (which is also required
for clone3);
- remove the arguments workaround from s390 asm, instead implementing
arguments swap in the caller (which still looks ugly but at least
it's done once and is clearly documented now);
- use rawVforkSyscall for all cases (since it is essentially similar to
RawSyscall6, except for having fewer parameters, not returning r2, and
saving/restoring the return address before/after the syscall on 386 and
amd64).
Updates #51246.
Change-Id: Ifcd418ebead9257177338ffbcccd0bdecb94474e
Reviewed-on: https://go-review.googlesource.com/c/go/+/417695
Auto-Submit: Ian Lance Taylor <iant@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Ian Lance Taylor <iant@google.com>
Run-TryBot: Ian Lance Taylor <iant@google.com>
Run-TryBot: Kirill Kolyshkin <kolyshkin@gmail.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Diffstat (limited to 'src/syscall/exec_linux.go')
| -rw-r--r-- | src/syscall/exec_linux.go | 50 |
1 files changed, 42 insertions, 8 deletions
diff --git a/src/syscall/exec_linux.go b/src/syscall/exec_linux.go
index d9e9e6df44..72b56f484a 100644
--- a/src/syscall/exec_linux.go
+++ b/src/syscall/exec_linux.go
@@ -99,6 +99,8 @@ type SysProcAttr struct {
 	// users this should be set to false for mappings work.
 	GidMappingsEnableSetgroups bool
 	AmbientCaps                []uintptr // Ambient capabilities (Linux only)
+	UseCgroupFD                bool      // Whether to make use of the CgroupFD field.
+	CgroupFD                   int       // File descriptor of a cgroup to put the new process into.
 }
 
 var (
@@ -176,6 +178,21 @@ func capToIndex(cap uintptr) uintptr { return cap >> 5 }
 // See CAP_TO_MASK in linux/capability.h:
 func capToMask(cap uintptr) uint32 { return 1 << uint(cap&31) }
 
+// cloneArgs holds arguments for clone3 Linux syscall.
+type cloneArgs struct {
+	flags      uint64 // Flags bit mask
+	pidFD      uint64 // Where to store PID file descriptor (int *)
+	childTID   uint64 // Where to store child TID, in child's memory (pid_t *)
+	parentTID  uint64 // Where to store child TID, in parent's memory (pid_t *)
+	exitSignal uint64 // Signal to deliver to parent on child termination
+	stack      uint64 // Pointer to lowest byte of stack
+	stackSize  uint64 // Size of stack
+	tls        uint64 // Location of new TLS
+	setTID     uint64 // Pointer to a pid_t array (since Linux 5.5)
+	setTIDSize uint64 // Number of elements in set_tid (since Linux 5.5)
+	cgroup     uint64 // File descriptor for target cgroup of child (since Linux 5.7)
+}
+
 // forkAndExecInChild1 implements the body of forkAndExecInChild up to
 // the parent's post-fork path. This is a separate function so we can
 // separate the child's and parent's stack frames if we're using
@@ -205,9 +222,10 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
 		nextfd                    int
 		i                         int
 		caps                      caps
-		fd1                       uintptr
+		fd1, flags                uintptr
 		puid, psetgroups, pgid    []byte
 		uidmap, setgroups, gidmap []byte
+		clone3                    *cloneArgs
 	)
 
 	if sys.UidMappings != nil {
@@ -252,17 +270,33 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
 		}
 	}
 
+	flags = sys.Cloneflags
+	if sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0 {
+		flags |= CLONE_VFORK | CLONE_VM
+	}
+	// Whether to use clone3.
+	if sys.UseCgroupFD {
+		clone3 = &cloneArgs{
+			flags:      uint64(flags) | CLONE_INTO_CGROUP,
+			exitSignal: uint64(SIGCHLD),
+			cgroup:     uint64(sys.CgroupFD),
+		}
+	}
+
 	// About to call fork.
 	// No more allocation or calls of non-assembly functions.
 	runtime_BeforeFork()
 	locked = true
-	switch {
-	case sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0:
-		r1, err1 = rawVforkSyscall(SYS_CLONE, uintptr(SIGCHLD|CLONE_VFORK|CLONE_VM)|sys.Cloneflags)
-	case runtime.GOARCH == "s390x":
-		r1, _, err1 = RawSyscall6(SYS_CLONE, 0, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0)
-	default:
-		r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0)
+	if clone3 != nil {
+		r1, err1 = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer(clone3)), unsafe.Sizeof(*clone3))
+	} else {
+		flags |= uintptr(SIGCHLD)
+		if runtime.GOARCH == "s390x" {
+			// On Linux/s390, the first two arguments of clone(2) are swapped.
+			r1, err1 = rawVforkSyscall(SYS_CLONE, 0, flags)
+		} else {
+			r1, err1 = rawVforkSyscall(SYS_CLONE, flags, 0)
+		}
 	}
 	if err1 != 0 || r1 != 0 {
 		// If we're in the parent, we must return immediately
