aboutsummaryrefslogtreecommitdiff
path: root/src/syscall/exec_linux.go
diff options
context:
space:
mode:
Diffstat (limited to 'src/syscall/exec_linux.go')
-rw-r--r--src/syscall/exec_linux.go50
1 files changed, 42 insertions, 8 deletions
diff --git a/src/syscall/exec_linux.go b/src/syscall/exec_linux.go
index d9e9e6df44..72b56f484a 100644
--- a/src/syscall/exec_linux.go
+++ b/src/syscall/exec_linux.go
@@ -99,6 +99,8 @@ type SysProcAttr struct {
// users this should be set to false for mappings work.
GidMappingsEnableSetgroups bool
AmbientCaps []uintptr // Ambient capabilities (Linux only)
+ UseCgroupFD bool // Whether to make use of the CgroupFD field.
+ CgroupFD int // File descriptor of a cgroup to put the new process into.
}
var (
@@ -176,6 +178,21 @@ func capToIndex(cap uintptr) uintptr { return cap >> 5 }
// See CAP_TO_MASK in linux/capability.h:
func capToMask(cap uintptr) uint32 { return 1 << uint(cap&31) }
+// cloneArgs holds arguments for clone3 Linux syscall.
+type cloneArgs struct {
+ flags uint64 // Flags bit mask
+ pidFD uint64 // Where to store PID file descriptor (int *)
+ childTID uint64 // Where to store child TID, in child's memory (pid_t *)
+ parentTID uint64 // Where to store child TID, in parent's memory (pid_t *)
+ exitSignal uint64 // Signal to deliver to parent on child termination
+ stack uint64 // Pointer to lowest byte of stack
+ stackSize uint64 // Size of stack
+ tls uint64 // Location of new TLS
+ setTID uint64 // Pointer to a pid_t array (since Linux 5.5)
+ setTIDSize uint64 // Number of elements in set_tid (since Linux 5.5)
+ cgroup uint64 // File descriptor for target cgroup of child (since Linux 5.7)
+}
+
// forkAndExecInChild1 implements the body of forkAndExecInChild up to
// the parent's post-fork path. This is a separate function so we can
// separate the child's and parent's stack frames if we're using
@@ -205,9 +222,10 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
nextfd int
i int
caps caps
- fd1 uintptr
+ fd1, flags uintptr
puid, psetgroups, pgid []byte
uidmap, setgroups, gidmap []byte
+ clone3 *cloneArgs
)
if sys.UidMappings != nil {
@@ -252,17 +270,33 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
}
}
+ flags = sys.Cloneflags
+ if sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0 {
+ flags |= CLONE_VFORK | CLONE_VM
+ }
+ // Whether to use clone3.
+ if sys.UseCgroupFD {
+ clone3 = &cloneArgs{
+ flags: uint64(flags) | CLONE_INTO_CGROUP,
+ exitSignal: uint64(SIGCHLD),
+ cgroup: uint64(sys.CgroupFD),
+ }
+ }
+
// About to call fork.
// No more allocation or calls of non-assembly functions.
runtime_BeforeFork()
locked = true
- switch {
- case sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0:
- r1, err1 = rawVforkSyscall(SYS_CLONE, uintptr(SIGCHLD|CLONE_VFORK|CLONE_VM)|sys.Cloneflags)
- case runtime.GOARCH == "s390x":
- r1, _, err1 = RawSyscall6(SYS_CLONE, 0, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0)
- default:
- r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0)
+ if clone3 != nil {
+ r1, err1 = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer(clone3)), unsafe.Sizeof(*clone3))
+ } else {
+ flags |= uintptr(SIGCHLD)
+ if runtime.GOARCH == "s390x" {
+ // On Linux/s390, the first two arguments of clone(2) are swapped.
+ r1, err1 = rawVforkSyscall(SYS_CLONE, 0, flags)
+ } else {
+ r1, err1 = rawVforkSyscall(SYS_CLONE, flags, 0)
+ }
}
if err1 != 0 || r1 != 0 {
// If we're in the parent, we must return immediately