From 7fc83126731de12449f7b38c32e2e318c439a6d4 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 11 Jun 2024 16:34:38 -0400 Subject: [release-branch.go1.23] os: add clone(CLONE_PIDFD) check to pidfd feature check clone(CLONE_PIDFD) was added in Linux 5.2 and pidfd_open was added in Linux 5.3. Thus our feature check for pidfd_open should be sufficient to ensure that clone(CLONE_PIDFD) works. Unfortuantely, some alternative Linux implementations may not follow this strict ordering. For example, QEMU 7.2 (Dec 2022) added pidfd_open, but clone(CLONE_PIDFD) was only added in QEMU 8.0 (Apr 2023). Debian bookworm provides QEMU 7.2 by default. For #68976. Fixes #69259. Change-Id: Ie3f3dc51f0cd76944871bf98690abf59f68fd7bf Reviewed-on: https://go-review.googlesource.com/c/go/+/592078 LUCI-TryBot-Result: Go LUCI Reviewed-by: Cherry Mui (cherry picked from commit 7a5fc9b34deb8d9fe22c9d060a5839827344fcc2) Reviewed-on: https://go-review.googlesource.com/c/go/+/612218 --- src/syscall/exec_linux.go | 81 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) (limited to 'src/syscall/exec_linux.go') diff --git a/src/syscall/exec_linux.go b/src/syscall/exec_linux.go index 2684412191..3e15676fcb 100644 --- a/src/syscall/exec_linux.go +++ b/src/syscall/exec_linux.go @@ -7,6 +7,7 @@ package syscall import ( + errpkg "errors" "internal/itoa" "runtime" "unsafe" @@ -328,6 +329,7 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att if clone3 != nil { pid, err1 = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer(clone3)), unsafe.Sizeof(*clone3), 0) } else { + // N.B. Keep in sync with doCheckClonePidfd. flags |= uintptr(SIGCHLD) if runtime.GOARCH == "s390x" { // On Linux/s390, the first two arguments of clone(2) are swapped. @@ -743,3 +745,82 @@ func forkAndExecFailureCleanup(attr *ProcAttr, sys *SysProcAttr) { *sys.PidFD = -1 } } + +// checkClonePidfd verifies that clone(CLONE_PIDFD) works by actually doing a +// clone. +// +//go:linkname os_checkClonePidfd os.checkClonePidfd +func os_checkClonePidfd() error { + pidfd := int32(-1) + pid, errno := doCheckClonePidfd(&pidfd) + if errno != 0 { + return errno + } + + if pidfd == -1 { + // Bad: CLONE_PIDFD failed to provide a pidfd. Reap the process + // before returning. + + var err error + for { + var status WaitStatus + _, err = Wait4(int(pid), &status, 0, nil) + if err != EINTR { + break + } + } + if err != nil { + return err + } + + return errpkg.New("clone(CLONE_PIDFD) failed to return pidfd") + } + + // Good: CLONE_PIDFD provided a pidfd. Reap the process and close the + // pidfd. + defer Close(int(pidfd)) + + for { + const _P_PIDFD = 3 + _, _, errno = Syscall6(SYS_WAITID, _P_PIDFD, uintptr(pidfd), 0, WEXITED, 0, 0) + if errno != EINTR { + break + } + } + if errno != 0 { + return errno + } + + return nil +} + +// doCheckClonePidfd implements the actual clone call of os_checkClonePidfd and +// child execution. This is a separate function so we can separate the child's +// and parent's stack frames if we're using vfork. +// +// This is go:noinline because the point is to keep the stack frames of this +// and os_checkClonePidfd separate. +// +//go:noinline +func doCheckClonePidfd(pidfd *int32) (pid uintptr, errno Errno) { + flags := uintptr(CLONE_VFORK|CLONE_VM|CLONE_PIDFD|SIGCHLD) + if runtime.GOARCH == "s390x" { + // On Linux/s390, the first two arguments of clone(2) are swapped. + pid, errno = rawVforkSyscall(SYS_CLONE, 0, flags, uintptr(unsafe.Pointer(pidfd))) + } else { + pid, errno = rawVforkSyscall(SYS_CLONE, flags, 0, uintptr(unsafe.Pointer(pidfd))) + } + if errno != 0 || pid != 0 { + // If we're in the parent, we must return immediately + // so we're not in the same stack frame as the child. + // This can at most use the return PC, which the child + // will not modify, and the results of + // rawVforkSyscall, which must have been written after + // the child was replaced. + return + } + + for { + RawSyscall(SYS_EXIT, 0, 0, 0) + } +} -- cgit v1.3