aboutsummaryrefslogtreecommitdiff
path: root/src/syscall/exec_linux.go
diff options
context:
space:
mode:
authorKir Kolyshkin <kolyshkin@gmail.com>2022-07-14 21:18:15 -0700
committerGopher Robot <gobot@golang.org>2022-09-09 15:34:16 +0000
commitbca17d16ca0dabbe1b533bb78f367d64e076fe73 (patch)
tree3cf5053ab51cc2944e6eb187d263a3ba6f0203f8 /src/syscall/exec_linux.go
parentf53b2111e489e61461837737cf69371a043d4fd9 (diff)
downloadgo-bca17d16ca0dabbe1b533bb78f367d64e076fe73.tar.xz
syscall: add CgroupFD support for ForkExec on Linux
Implement the CLONE_INTO_CGROUP feature, which allows putting a child in a specified cgroup in a clean and simple way. Note that the feature only works for cgroup v2, and requires Linux kernel 5.7 or newer. Using the feature requires a new syscall, clone3. Currently this is the only reason to use clone3, but the code is structured so that other cases may be easily added in the future. Add a test case. While at it, try to simplify the syscall calling code in forkAndExecInChild1, which became complicated over time because: 1. It was using either rawVforkSyscall or RawSyscall6 depending on whether CLONE_NEWUSER was set. 2. On Linux/s390, the first two arguments to the clone(2) system call are swapped (which deserved a mention in the Linux ABI hall of shame). It was worked around in rawVforkSyscall on s390, but had to be implemented via a switch/case when using RawSyscall6, making the code less clear. Let's: - modify rawVforkSyscall to have two arguments (which is also required for clone3); - remove the arguments workaround from s390 asm, instead implementing the argument swap in the caller (which still looks ugly but at least it's done once and is clearly documented now); - use rawVforkSyscall for all cases (since it is essentially similar to RawSyscall6, except for having fewer parameters, not returning r2, and saving/restoring the return address before/after syscall on 386 and amd64). Updates #51246. Change-Id: Ifcd418ebead9257177338ffbcccd0bdecb94474e Reviewed-on: https://go-review.googlesource.com/c/go/+/417695 Auto-Submit: Ian Lance Taylor <iant@google.com> Reviewed-by: Michael Knyszek <mknyszek@google.com> Reviewed-by: Ian Lance Taylor <iant@google.com> Run-TryBot: Ian Lance Taylor <iant@google.com> Run-TryBot: Kirill Kolyshkin <kolyshkin@gmail.com> TryBot-Result: Gopher Robot <gobot@golang.org>
Diffstat (limited to 'src/syscall/exec_linux.go')
-rw-r--r--src/syscall/exec_linux.go50
1 files changed, 42 insertions, 8 deletions
diff --git a/src/syscall/exec_linux.go b/src/syscall/exec_linux.go
index d9e9e6df44..72b56f484a 100644
--- a/src/syscall/exec_linux.go
+++ b/src/syscall/exec_linux.go
@@ -99,6 +99,8 @@ type SysProcAttr struct {
// users this should be set to false for mappings work.
GidMappingsEnableSetgroups bool
AmbientCaps []uintptr // Ambient capabilities (Linux only)
+ UseCgroupFD bool // Whether to make use of the CgroupFD field.
+ CgroupFD int // File descriptor of a cgroup to put the new process into.
}
var (
@@ -176,6 +178,21 @@ func capToIndex(cap uintptr) uintptr { return cap >> 5 }
// See CAP_TO_MASK in linux/capability.h:
func capToMask(cap uintptr) uint32 { return 1 << uint(cap&31) }
+// cloneArgs holds arguments for clone3 Linux syscall.
+type cloneArgs struct {
+ flags uint64 // Flags bit mask
+ pidFD uint64 // Where to store PID file descriptor (int *)
+ childTID uint64 // Where to store child TID, in child's memory (pid_t *)
+ parentTID uint64 // Where to store child TID, in parent's memory (pid_t *)
+ exitSignal uint64 // Signal to deliver to parent on child termination
+ stack uint64 // Pointer to lowest byte of stack
+ stackSize uint64 // Size of stack
+ tls uint64 // Location of new TLS
+ setTID uint64 // Pointer to a pid_t array (since Linux 5.5)
+ setTIDSize uint64 // Number of elements in set_tid (since Linux 5.5)
+ cgroup uint64 // File descriptor for target cgroup of child (since Linux 5.7)
+}
+
// forkAndExecInChild1 implements the body of forkAndExecInChild up to
// the parent's post-fork path. This is a separate function so we can
// separate the child's and parent's stack frames if we're using
@@ -205,9 +222,10 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
nextfd int
i int
caps caps
- fd1 uintptr
+ fd1, flags uintptr
puid, psetgroups, pgid []byte
uidmap, setgroups, gidmap []byte
+ clone3 *cloneArgs
)
if sys.UidMappings != nil {
@@ -252,17 +270,33 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
}
}
+ flags = sys.Cloneflags
+ if sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0 {
+ flags |= CLONE_VFORK | CLONE_VM
+ }
+ // Whether to use clone3.
+ if sys.UseCgroupFD {
+ clone3 = &cloneArgs{
+ flags: uint64(flags) | CLONE_INTO_CGROUP,
+ exitSignal: uint64(SIGCHLD),
+ cgroup: uint64(sys.CgroupFD),
+ }
+ }
+
// About to call fork.
// No more allocation or calls of non-assembly functions.
runtime_BeforeFork()
locked = true
- switch {
- case sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0:
- r1, err1 = rawVforkSyscall(SYS_CLONE, uintptr(SIGCHLD|CLONE_VFORK|CLONE_VM)|sys.Cloneflags)
- case runtime.GOARCH == "s390x":
- r1, _, err1 = RawSyscall6(SYS_CLONE, 0, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0)
- default:
- r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0)
+ if clone3 != nil {
+ r1, err1 = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer(clone3)), unsafe.Sizeof(*clone3))
+ } else {
+ flags |= uintptr(SIGCHLD)
+ if runtime.GOARCH == "s390x" {
+ // On Linux/s390, the first two arguments of clone(2) are swapped.
+ r1, err1 = rawVforkSyscall(SYS_CLONE, 0, flags)
+ } else {
+ r1, err1 = rawVforkSyscall(SYS_CLONE, flags, 0)
+ }
}
if err1 != 0 || r1 != 0 {
// If we're in the parent, we must return immediately