diff options
| author | Michael Pratt <mpratt@google.com> | 2024-06-11 16:34:38 -0400 |
|---|---|---|
| committer | Michael Pratt <mpratt@google.com> | 2024-09-12 15:45:38 +0000 |
| commit | 7a5fc9b34deb8d9fe22c9d060a5839827344fcc2 (patch) | |
| tree | d10fcc5d7d2f7073bb59916d28bbe6cccf3bfcf4 /src/syscall/exec_linux.go | |
| parent | 0ee5d20b1fe617e425d1798a4f7439cf8c337459 (diff) | |
| download | go-7a5fc9b34deb8d9fe22c9d060a5839827344fcc2.tar.xz | |
os: add clone(CLONE_PIDFD) check to pidfd feature check
clone(CLONE_PIDFD) was added in Linux 5.2 and pidfd_open was added in
Linux 5.3. Thus our feature check for pidfd_open should be sufficient to
ensure that clone(CLONE_PIDFD) works.
Unfortuantely, some alternative Linux implementations may not follow
this strict ordering. For example, QEMU 7.2 (Dec 2022) added pidfd_open,
but clone(CLONE_PIDFD) was only added in QEMU 8.0 (Apr 2023).
Debian bookworm provides QEMU 7.2 by default.
Fixes #69259.
Change-Id: Ie3f3dc51f0cd76944871bf98690abf59f68fd7bf
Reviewed-on: https://go-review.googlesource.com/c/go/+/592078
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Diffstat (limited to 'src/syscall/exec_linux.go')
| -rw-r--r-- | src/syscall/exec_linux.go | 81 |
1 files changed, 81 insertions, 0 deletions
diff --git a/src/syscall/exec_linux.go b/src/syscall/exec_linux.go index 1859a58294..429a84635a 100644 --- a/src/syscall/exec_linux.go +++ b/src/syscall/exec_linux.go @@ -7,6 +7,7 @@ package syscall import ( + errpkg "errors" "internal/itoa" "runtime" "unsafe" @@ -330,6 +331,7 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att if clone3 != nil { pid, err1 = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer(clone3)), unsafe.Sizeof(*clone3), 0) } else { + // N.B. Keep in sync with doCheckClonePidfd. flags |= uintptr(SIGCHLD) if runtime.GOARCH == "s390x" { // On Linux/s390, the first two arguments of clone(2) are swapped. @@ -758,3 +760,82 @@ func forkAndExecFailureCleanup(attr *ProcAttr, sys *SysProcAttr) { *sys.PidFD = -1 } } + +// checkClonePidfd verifies that clone(CLONE_PIDFD) works by actually doing a +// clone. +// +//go:linkname os_checkClonePidfd os.checkClonePidfd +func os_checkClonePidfd() error { + pidfd := int32(-1) + pid, errno := doCheckClonePidfd(&pidfd) + if errno != 0 { + return errno + } + + if pidfd == -1 { + // Bad: CLONE_PIDFD failed to provide a pidfd. Reap the process + // before returning. + + var err error + for { + var status WaitStatus + _, err = Wait4(int(pid), &status, 0, nil) + if err != EINTR { + break + } + } + if err != nil { + return err + } + + return errpkg.New("clone(CLONE_PIDFD) failed to return pidfd") + } + + // Good: CLONE_PIDFD provided a pidfd. Reap the process and close the + // pidfd. + defer Close(int(pidfd)) + + for { + const _P_PIDFD = 3 + _, _, errno = Syscall6(SYS_WAITID, _P_PIDFD, uintptr(pidfd), 0, WEXITED, 0, 0) + if errno != EINTR { + break + } + } + if errno != 0 { + return errno + } + + return nil +} + +// doCheckClonePidfd implements the actual clone call of os_checkClonePidfd and +// child execution. This is a separate function so we can separate the child's +// and parent's stack frames if we're using vfork. +// +// This is go:noinline because the point is to keep the stack frames of this +// and os_checkClonePidfd separate. +// +//go:noinline +func doCheckClonePidfd(pidfd *int32) (pid uintptr, errno Errno) { + flags := uintptr(CLONE_VFORK|CLONE_VM|CLONE_PIDFD|SIGCHLD) + if runtime.GOARCH == "s390x" { + // On Linux/s390, the first two arguments of clone(2) are swapped. + pid, errno = rawVforkSyscall(SYS_CLONE, 0, flags, uintptr(unsafe.Pointer(pidfd))) + } else { + pid, errno = rawVforkSyscall(SYS_CLONE, flags, 0, uintptr(unsafe.Pointer(pidfd))) + } + if errno != 0 || pid != 0 { + // If we're in the parent, we must return immediately + // so we're not in the same stack frame as the child. + // This can at most use the return PC, which the child + // will not modify, and the results of + // rawVforkSyscall, which must have been written after + // the child was replaced. + return + } + + for { + RawSyscall(SYS_EXIT, 0, 0, 0) + } +} |
