runtime, syscall: reimplement AllThreadsSyscall using only signals.

In issue 50113, we see that a thread blocked in a system call can result in a hang of AllThreadsSyscall. To resolve this, we must send a signal to these threads to knock them out of the system call long enough to run the per-thread syscall. Stepping back, if we need to send signals anyway, it should be possible to implement this entire mechanism on top of signals. This CL does so, vastly simplifying the mechanism, both as a direct result of newly-unnecessary code as well as some ancillary simplifications to make things simpler to follow. Major changes: * The rest of the mechanism is moved to os_linux.go, with fields in mOS instead of m itself. * 'Fixup' fields and functions are renamed to 'perThreadSyscall' so they are more precise about their purpose. * Rather than getting passed a closure, doAllThreadsSyscall takes the syscall number and arguments. This avoids a lot of hairy behavior: * The closure may potentially only be live in fields in the M, hidden from the GC. Not necessary with no closure. * The need to loan out the race context. A direct RawSyscall6 call does not require any race context. * The closure previously conditionally panicked in strange locations, like a signal handler. Now we simply throw. * All manual fixup synchronization with mPark, sysmon, templateThread, sigqueue, etc is gone. The core approach is much simpler: doAllThreadsSyscall sends a signal to every thread in allm, which executes the system call from the signal handler. We use (SIGRTMIN + 1), aka SIGSETXID, the same signal used by glibc for this purpose. As such, we are careful to only handle this signal on non-cgo binaries. Synchronization with thread creation is a key part of this CL. The comment near the top of doAllThreadsSyscall describes the required synchronization semantics and how they are achieved. Note that current use of allocmLock protects the state mutations of allm that are also protected by sched.lock. allocmLock is used instead of sched.lock simply to avoid holding sched.lock for so long. Fixes #50113 Change-Id: Ic7ea856dc66cf711731540a54996e08fc986ce84 Reviewed-on: https://go-review.googlesource.com/c/go/+/383434 Reviewed-by: Austin Clements <austin@google.com> Trust: Michael Pratt <mpratt@google.com> Run-TryBot: Michael Pratt <mpratt@google.com> TryBot-Result: Gopher Robot <gobot@golang.org>
author: Michael Pratt <mpratt@google.com> 2022-02-04 17:15:28 -0500
committer: Michael Pratt <mpratt@google.com> 2022-02-15 15:40:35 +0000
commit: 0a5fae2a0e965024f692b95f7e857904a274fcb6 (patch)
tree: 393819d9f85f5be1f54bc480f7c6763859bc8997 /src/runtime/os_linux.go
parent: 0b321c9a7c0055dfd3f875dea930a28690659211 (diff)
download: go-0a5fae2a0e965024f692b95f7e857904a274fcb6.tar.xz
1 files changed, 190 insertions, 117 deletions
diff --git a/src/runtime/os_linux.go b/src/runtime/os_linux.go
index 2a826963dd..eb8aa076e9 100644
--- a/src/runtime/os_linux.go
+++ b/src/runtime/os_linux.go
@@ -8,9 +8,15 @@ import (
 	"internal/abi"
 	"internal/goarch"
 	"runtime/internal/atomic"
+	"runtime/internal/syscall"
 	"unsafe"
 )
 
+// sigPerThreadSyscall is the same signal (SIGSETXID) used by glibc for
+// per-thread syscalls on Linux. We use it for the same purpose in non-cgo
+// binaries.
+const sigPerThreadSyscall = _SIGRTMIN + 1
+
 type mOS struct {
 	// profileTimer holds the ID of the POSIX interval timer for profiling CPU
 	// usage on this thread.
@@ -21,6 +27,10 @@ type mOS struct {
 	// are in signal handling code, access to that field uses atomic operations.
 	profileTimer      int32
 	profileTimerValid uint32
+
+	// needPerThreadSyscall indicates that a per-thread syscall is required
+	// for doAllThreadsSyscall.
+	needPerThreadSyscall atomic.Uint8
 }
 
 //go:noescape
@@ -665,141 +675,204 @@ func setThreadCPUProfiler(hz int32) {
 	atomic.Store(&mp.profileTimerValid, 1)
 }
 
-// syscall_runtime_doAllThreadsSyscall serializes Go execution and
-// executes a specified fn() call on all m's.
+// perThreadSyscallArgs contains the system call number, arguments, and
+// expected return values for a system call to be executed on all threads.
+type perThreadSyscallArgs struct {
+	trap uintptr
+	a1   uintptr
+	a2   uintptr
+	a3   uintptr
+	a4   uintptr
+	a5   uintptr
+	a6   uintptr
+	r1   uintptr
+	r2   uintptr
+}
+
+// perThreadSyscall is the system call to execute for the ongoing
+// doAllThreadsSyscall.
 //
-// The boolean argument to fn() indicates whether the function's
-// return value will be consulted or not. That is, fn(true) should
-// return true if fn() succeeds, and fn(true) should return false if
-// it failed. When fn(false) is called, its return status will be
-// ignored.
+// perThreadSyscall may only be written while mp.needPerThreadSyscall == 0 on
+// all Ms.
+var perThreadSyscall perThreadSyscallArgs
+
+// syscall_runtime_doAllThreadsSyscall and executes a specified system call on
+// all Ms.
 //
-// syscall_runtime_doAllThreadsSyscall first invokes fn(true) on a
-// single, coordinating, m, and only if it returns true does it go on
-// to invoke fn(false) on all of the other m's known to the process.
+// The system call is expected to succeed and return the same value on every
+// thread. If any threads do not match, the runtime throws.
 //
 //go:linkname syscall_runtime_doAllThreadsSyscall syscall.runtime_doAllThreadsSyscall
-func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
+//go:uintptrescapes
+func syscall_runtime_doAllThreadsSyscall(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr) {
 	if iscgo {
+		// In cgo, we are not aware of threads created in C, so this approach will not work.
 		panic("doAllThreadsSyscall not supported with cgo enabled")
 	}
-	if fn == nil {
-		return
+
+	// STW to guarantee that user goroutines see an atomic change to thread
+	// state. Without STW, goroutines could migrate Ms while change is in
+	// progress and e.g., see state old -> new -> old -> new.
+	//
+	// N.B. Internally, this function does not depend on STW to
+	// successfully change every thread. It is only needed for user
+	// expectations, per above.
+	stopTheWorld("doAllThreadsSyscall")
+
+	// This function depends on several properties:
+	//
+	// 1. All OS threads that already exist are associated with an M in
+	//    allm. i.e., we won't miss any pre-existing threads.
+	// 2. All Ms listed in allm will eventually have an OS thread exist.
+	//    i.e., they will set procid and be able to receive signals.
+	// 3. OS threads created after we read allm will clone from a thread
+	//    that has executed the system call. i.e., they inherit the
+	//    modified state.
+	//
+	// We achieve these through different mechanisms:
+	//
+	// 1. Addition of new Ms to allm in allocm happens before clone of its
+	//    OS thread later in newm.
+	// 2. newm does acquirem to avoid being preempted, ensuring that new Ms
+	//    created in allocm will eventually reach OS thread clone later in
+	//    newm.
+	// 3. We take allocmLock for write here to prevent allocation of new Ms
+	//    while this function runs. Per (1), this prevents clone of OS
+	//    threads that are not yet in allm.
+	allocmLock.lock()
+
+	// Disable preemption, preventing us from changing Ms, as we handle
+	// this M specially.
+	//
+	// N.B. STW and lock() above do this as well, this is added for extra
+	// clarity.
+	acquirem()
+
+	// N.B. allocmLock also prevents concurrent execution of this function,
+	// serializing use of perThreadSyscall, mp.needPerThreadSyscall, and
+	// ensuring all threads execute system calls from multiple calls in the
+	// same order.
+
+	r1, r2, errno := syscall.Syscall6(trap, a1, a2, a3, a4, a5, a6)
+	if GOARCH == "ppc64" || GOARCH == "ppc64le" {
+		// TODO(https://go.dev/issue/51192 ): ppc64 doesn't use r2.
+		r2 = 0
 	}
-	for atomic.Load(&sched.sysmonStarting) != 0 {
-		osyield()
+	if errno != 0 {
+		releasem(getg().m)
+		allocmLock.unlock()
+		startTheWorld()
+		return r1, r2, errno
 	}
 
-	// We don't want this thread to handle signals for the
-	// duration of this critical section. The underlying issue
-	// being that this locked coordinating m is the one monitoring
-	// for fn() execution by all the other m's of the runtime,
-	// while no regular go code execution is permitted (the world
-	// is stopped). If this present m were to get distracted to
-	// run signal handling code, and find itself waiting for a
-	// second thread to execute go code before being able to
-	// return from that signal handling, a deadlock will result.
-	// (See golang.org/issue/44193.)
-	lockOSThread()
-	var sigmask sigset
-	sigsave(&sigmask)
-	sigblock(false)
+	perThreadSyscall = perThreadSyscallArgs{
+		trap: trap,
+		a1:   a1,
+		a2:   a2,
+		a3:   a3,
+		a4:   a4,
+		a5:   a5,
+		a6:   a6,
+		r1:   r1,
+		r2:   r2,
+	}
 
-	stopTheWorldGC("doAllThreadsSyscall")
-	if atomic.Load(&newmHandoff.haveTemplateThread) != 0 {
-		// Ensure that there are no in-flight thread
-		// creations: don't want to race with allm.
-		lock(&newmHandoff.lock)
-		for !newmHandoff.waiting {
-			unlock(&newmHandoff.lock)
+	// Wait for all threads to start.
+	//
+	// As described above, some Ms have been added to allm prior to
+	// allocmLock, but not yet completed OS clone and set procid.
+	//
+	// At minimum we must wait for a thread to set procid before we can
+	// send it a signal.
+	//
+	// We take this one step further and wait for all threads to start
+	// before sending any signals. This prevents system calls from getting
+	// applied twice: once in the parent and once in the child, like so:
+	//
+	//          A                     B                  C
+	//                         add C to allm
+	// doAllThreadsSyscall
+	//   allocmLock.lock()
+	//   signal B
+	//                         <receive signal>
+	//                         execute syscall
+	//                         <signal return>
+	//                         clone C
+	//                                             <thread start>
+	//                                             set procid
+	//   signal C
+	//                                             <receive signal>
+	//                                             execute syscall
+	//                                             <signal return>
+	//
+	// In this case, thread C inherited the syscall-modified state from
+	// thread B and did not need to execute the syscall, but did anyway
+	// because doAllThreadsSyscall could not be sure whether it was
+	// required.
+	//
+	// Some system calls may not be idempotent, so we ensure each thread
+	// executes the system call exactly once.
+	for mp := allm; mp != nil; mp = mp.alllink {
+		for atomic.Load64(&mp.procid) == 0 {
+			// Thread is starting.
 			osyield()
-			lock(&newmHandoff.lock)
 		}
-		unlock(&newmHandoff.lock)
-	}
-	if netpollinited() {
-		netpollBreak()
 	}
-	sigRecvPrepareForFixup()
-	_g_ := getg()
-	if raceenabled {
-		// For m's running without racectx, we loan out the
-		// racectx of this call.
-		lock(&mFixupRace.lock)
-		mFixupRace.ctx = _g_.racectx
-		unlock(&mFixupRace.lock)
+
+	// Signal every other thread, where they will execute perThreadSyscall
+	// from the signal handler.
+	gp := getg()
+	tid := gp.m.procid
+	for mp := allm; mp != nil; mp = mp.alllink {
+		if atomic.Load64(&mp.procid) == tid {
+			// Our thread already performed the syscall.
+			continue
+		}
+		mp.needPerThreadSyscall.Store(1)
+		signalM(mp, sigPerThreadSyscall)
 	}
-	if ok := fn(true); ok {
-		tid := _g_.m.procid
-		for mp := allm; mp != nil; mp = mp.alllink {
-			if mp.procid == tid {
-				// This m has already completed fn()
-				// call.
-				continue
-			}
-			// Be wary of mp's without procid values if
-			// they are known not to park. If they are
-			// marked as parking with a zero procid, then
-			// they will be racing with this code to be
-			// allocated a procid and we will annotate
-			// them with the need to execute the fn when
-			// they acquire a procid to run it.
-			if mp.procid == 0 && !mp.doesPark {
-				// Reaching here, we are either
-				// running Windows, or cgo linked
-				// code. Neither of which are
-				// currently supported by this API.
-				throw("unsupported runtime environment")
-			}
-			// stopTheWorldGC() doesn't guarantee stopping
-			// all the threads, so we lock here to avoid
-			// the possibility of racing with mp.
-			lock(&mp.mFixup.lock)
-			mp.mFixup.fn = fn
-			atomic.Store(&mp.mFixup.used, 1)
-			if mp.doesPark {
-				// For non-service threads this will
-				// cause the wakeup to be short lived
-				// (once the mutex is unlocked). The
-				// next real wakeup will occur after
-				// startTheWorldGC() is called.
-				notewakeup(&mp.park)
-			}
-			unlock(&mp.mFixup.lock)
+
+	// Wait for all threads to complete.
+	for mp := allm; mp != nil; mp = mp.alllink {
+		if mp.procid == tid {
+			continue
 		}
-		for {
-			done := true
-			for mp := allm; done && mp != nil; mp = mp.alllink {
-				if mp.procid == tid {
-					continue
-				}
-				done = atomic.Load(&mp.mFixup.used) == 0
-			}
-			if done {
-				break
-			}
-			// if needed force sysmon and/or newmHandoff to wakeup.
-			lock(&sched.lock)
-			if atomic.Load(&sched.sysmonwait) != 0 {
-				atomic.Store(&sched.sysmonwait, 0)
-				notewakeup(&sched.sysmonnote)
-			}
-			unlock(&sched.lock)
-			lock(&newmHandoff.lock)
-			if newmHandoff.waiting {
-				newmHandoff.waiting = false
-				notewakeup(&newmHandoff.wake)
-			}
-			unlock(&newmHandoff.lock)
+		for mp.needPerThreadSyscall.Load() != 0 {
 			osyield()
 		}
 	}
-	if raceenabled {
-		lock(&mFixupRace.lock)
-		mFixupRace.ctx = 0
-		unlock(&mFixupRace.lock)
+
+	perThreadSyscall = perThreadSyscallArgs{}
+
+	releasem(getg().m)
+	allocmLock.unlock()
+	startTheWorld()
+
+	return r1, r2, errno
+}
+
+// runPerThreadSyscall runs perThreadSyscall for this M if required.
+//
+// This function throws if the system call returns with anything other than the
+// expected values.
+//go:nosplit
+func runPerThreadSyscall() {
+	gp := getg()
+	if gp.m.needPerThreadSyscall.Load() == 0 {
+		return
+	}
+
+	args := perThreadSyscall
+	r1, r2, errno := syscall.Syscall6(args.trap, args.a1, args.a2, args.a3, args.a4, args.a5, args.a6)
+	if GOARCH == "ppc64" || GOARCH == "ppc64le" {
+		// TODO(https://go.dev/issue/51192 ): ppc64 doesn't use r2.
+		r2 = 0
 	}
-	startTheWorldGC()
-	msigrestore(sigmask)
-	unlockOSThread()
+	if errno != 0 || r1 != args.r1 || r2 != args.r2 {
+		print("trap:", args.trap, ", a123456=[", args.a1, ",", args.a2, ",", args.a3, ",", args.a4, ",", args.a5, ",", args.a6, "]\n")
+		print("results: got {r1=", r1, ",r2=", r2, ",errno=", errno, "}, want {r1=", args.r1, ",r2=", args.r2, ",errno=0\n")
+		throw("AllThreadsSyscall6 results differ between threads; runtime corrupted")
+	}
+
+	gp.m.needPerThreadSyscall.Store(0)
 }
author	Michael Pratt <mpratt@google.com>	2022-02-04 17:15:28 -0500
committer	Michael Pratt <mpratt@google.com>	2022-02-15 15:40:35 +0000
commit	0a5fae2a0e965024f692b95f7e857904a274fcb6 (patch)
tree	393819d9f85f5be1f54bc480f7c6763859bc8997 /src/runtime/os_linux.go
parent	0b321c9a7c0055dfd3f875dea930a28690659211 (diff)
download	go-0a5fae2a0e965024f692b95f7e857904a274fcb6.tar.xz