Diffstat (limited to 'src/runtime/proc.go')
| -rw-r--r-- | src/runtime/proc.go | 265 |
1 file changed, 142 insertions, 123 deletions
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 64e102fb0a..dbb430fd25 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -490,8 +490,29 @@ func lockedOSThread() bool {
 }
 
 var (
-	allgs    []*g
+	// allgs contains all Gs ever created (including dead Gs), and thus
+	// never shrinks.
+	//
+	// Access via the slice is protected by allglock or stop-the-world.
+	// Readers that cannot take the lock may (carefully!) use the atomic
+	// variables below.
 	allglock mutex
+	allgs    []*g
+
+	// allglen and allgptr are atomic variables that contain len(allgs) and
+	// &allgs[0] respectively. Proper ordering depends on totally-ordered
+	// loads and stores. Writes are protected by allglock.
+	//
+	// allgptr is updated before allglen. Readers should read allglen
+	// before allgptr to ensure that allglen is always <= len(allgptr). New
+	// Gs appended during the race can be missed. For a consistent view of
+	// all Gs, allglock must be held.
+	//
+	// allgptr copies should always be stored as a concrete type or
+	// unsafe.Pointer, not uintptr, to ensure that GC can still reach it
+	// even if it points to a stale array.
+	allglen uintptr
+	allgptr **g
 )
 
 func allgadd(gp *g) {
@@ -501,10 +522,25 @@ func allgadd(gp *g) {
 	lock(&allglock)
 	allgs = append(allgs, gp)
-	allglen = uintptr(len(allgs))
+	if &allgs[0] != allgptr {
+		atomicstorep(unsafe.Pointer(&allgptr), unsafe.Pointer(&allgs[0]))
+	}
+	atomic.Storeuintptr(&allglen, uintptr(len(allgs)))
 	unlock(&allglock)
 }
 
+// atomicAllG returns &allgs[0] and len(allgs) for use with atomicAllGIndex.
+func atomicAllG() (**g, uintptr) {
+	length := atomic.Loaduintptr(&allglen)
+	ptr := (**g)(atomic.Loadp(unsafe.Pointer(&allgptr)))
+	return ptr, length
+}
+
+// atomicAllGIndex returns ptr[i] with the allgptr returned from atomicAllG.
+func atomicAllGIndex(ptr **g, i uintptr) *g {
+	return *(**g)(add(unsafe.Pointer(ptr), i*sys.PtrSize))
+}
+
 const (
 	// Number of goroutine ids to grab from sched.goidgen to local per-P cache at once.
 	// 16 seems to provide enough amortization, but other than that it's mostly arbitrary number.
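The var-block comment above specifies a publication order: writers store the base pointer before the length, and readers load the length before the pointer, so a racing reader can never see a length larger than the array its pointer refers to. A minimal sketch of the same protocol outside the runtime, using sync/atomic in place of runtime/internal/atomic (the growOnly type and every name in it are illustrative, not part of this patch):

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"unsafe"
)

// growOnly mirrors allgs/allglen/allgptr in miniature: an append-only
// slice guarded by a mutex, whose base pointer and length are also
// published atomically for lock-free readers.
type growOnly struct {
	mu    sync.Mutex
	elems []int

	ptr unsafe.Pointer // published &elems[0] (kept as a pointer type, so GC sees it)
	len uintptr        // published len(elems)
}

// add plays the role of allgadd: append under the lock, then publish.
func (s *growOnly) add(v int) {
	s.mu.Lock()
	s.elems = append(s.elems, v)
	if unsafe.Pointer(&s.elems[0]) != atomic.LoadPointer(&s.ptr) {
		// The backing array moved: publish the new base pointer
		// before the new length...
		atomic.StorePointer(&s.ptr, unsafe.Pointer(&s.elems[0]))
	}
	// ...so a stale pointer is never paired with a too-large length.
	atomic.StoreUintptr(&s.len, uintptr(len(s.elems)))
	s.mu.Unlock()
}

// snapshot plays the role of atomicAllG: length first, pointer second.
// Elements appended during the race may be missed; callers needing a
// complete view must take mu, as runtime code takes allglock.
func (s *growOnly) snapshot() (*int, uintptr) {
	n := atomic.LoadUintptr(&s.len)
	ptr := (*int)(atomic.LoadPointer(&s.ptr))
	return ptr, n
}

// at plays the role of atomicAllGIndex: index off the published base.
func at(ptr *int, i uintptr) int {
	return *(*int)(unsafe.Add(unsafe.Pointer(ptr), i*unsafe.Sizeof(int(0))))
}

func main() {
	var s growOnly
	for i := 0; i < 5; i++ {
		s.add(i * i)
	}
	ptr, n := s.snapshot()
	for i := uintptr(0); i < n; i++ {
		fmt.Println(at(ptr, i)) // 0 1 4 9 16
	}
}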
@@ -1170,8 +1206,38 @@ func startTheWorldWithSema(emitTraceEvent bool) int64 {
 	return startTime
 }
 
+// usesLibcall indicates whether this runtime performs system calls
+// via libcall.
+func usesLibcall() bool {
+	switch GOOS {
+	case "aix", "darwin", "illumos", "ios", "solaris", "windows":
+		return true
+	case "openbsd":
+		return GOARCH == "amd64" || GOARCH == "arm64"
+	}
+	return false
+}
+
+// mStackIsSystemAllocated indicates whether this runtime starts on a
+// system-allocated stack.
+func mStackIsSystemAllocated() bool {
+	switch GOOS {
+	case "aix", "darwin", "plan9", "illumos", "ios", "solaris", "windows":
+		return true
+	case "openbsd":
+		switch GOARCH {
+		case "amd64", "arm64":
+			return true
+		}
+	}
+	return false
+}
+
 // mstart is the entry-point for new Ms.
-//
+// It is written in assembly, marked TOPFRAME, and calls mstart0.
+func mstart()
+
+// mstart0 is the Go entry-point for new Ms.
 // This must not split the stack because we may not even have stack
 // bounds set up yet.
 //
@@ -1180,7 +1246,7 @@ func startTheWorldWithSema(emitTraceEvent bool) int64 {
 //
 //go:nosplit
 //go:nowritebarrierrec
-func mstart() {
+func mstart0() {
 	_g_ := getg()
 
 	osStack := _g_.stack.lo == 0
@@ -1188,6 +1254,11 @@ func mstart() {
 		// Initialize stack bounds from system stack.
 		// Cgo may have left stack size in stack.hi.
 		// minit may update the stack bounds.
+		//
+		// Note: these bounds may not be very accurate.
+		// We set hi to &size, but there are things above
+		// it. The 1024 is supposed to compensate this,
+		// but is somewhat arbitrary.
 		size := _g_.stack.hi
 		if size == 0 {
 			size = 8192 * sys.StackGuardMultiplier
@@ -1204,8 +1275,7 @@ func mstart() {
 	mstart1()
 
 	// Exit this thread.
-	switch GOOS {
-	case "windows", "solaris", "illumos", "plan9", "darwin", "ios", "aix":
+	if mStackIsSystemAllocated() {
 		// Windows, Solaris, illumos, Darwin, AIX and Plan 9 always system-allocate
 		// the stack, but put it in _g_.stack before mstart,
 		// so the logic above hasn't set osStack yet.
@@ -1214,6 +1284,9 @@ func mstart() {
 	mexit(osStack)
 }
 
+// The go:noinline is to guarantee the getcallerpc/getcallersp below are safe,
+// so that we can set up g0.sched to return to the call of mstart1 above.
+//go:noinline
 func mstart1() {
 	_g_ := getg()
 
@@ -1221,11 +1294,16 @@ func mstart1() {
 		throw("bad runtime·mstart")
 	}
 
-	// Record the caller for use as the top of stack in mcall and
-	// for terminating the thread.
+	// Set up m.g0.sched as a label returning to just
+	// after the mstart1 call in mstart0 above, for use by goexit0 and mcall.
 	// We're never coming back to mstart1 after we call schedule,
 	// so other calls can reuse the current frame.
-	save(getcallerpc(), getcallersp())
+	// And goexit0 does a gogo that needs to return from mstart1
+	// and let mstart0 exit the thread.
+	_g_.sched.g = guintptr(unsafe.Pointer(_g_))
+	_g_.sched.pc = getcallerpc()
+	_g_.sched.sp = getcallersp()
+
 	asminit()
 	minit()
@@ -1313,7 +1391,7 @@ func mexit(osStack bool) {
 		throw("locked m0 woke up")
 	}
 
-	sigblock()
+	sigblock(true)
 	unminit()
 
 	// Free the gsignal stack.
@@ -1371,6 +1449,10 @@ found:
 		}
 	}
 
+	// Destroy all allocated resources. After this is called, we may no
+	// longer take any locks.
+	mdestroy(m)
+
 	if osStack {
 		// Return from mstart and let the system thread
 		// library free the g0 stack and terminate the thread.
@@ -1515,6 +1597,7 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
 	if netpollinited() {
 		netpollBreak()
 	}
+	sigRecvPrepareForFixup()
 	_g_ := getg()
 	if raceenabled {
 		// For m's running without racectx, we loan out the
@@ -1683,7 +1766,7 @@ func allocm(_p_ *p, fn func(), id int64) *m {
 
 	// In case of cgo or Solaris or illumos or Darwin, pthread_create will make us a stack.
 	// Windows and Plan 9 will layout sched stack on OS stack.
-	if iscgo || GOOS == "solaris" || GOOS == "illumos" || GOOS == "windows" || GOOS == "plan9" || GOOS == "darwin" || GOOS == "ios" {
+	if iscgo || mStackIsSystemAllocated() {
 		mp.g0 = malg(-1)
 	} else {
 		mp.g0 = malg(8192 * sys.StackGuardMultiplier)
@@ -1754,7 +1837,7 @@ func needm() {
 	// starting a new m to run Go code via newosproc.
 	var sigmask sigset
 	sigsave(&sigmask)
-	sigblock()
+	sigblock(false)
 
 	// Lock extra list, take head, unlock popped list.
 	// nilokay=false is safe here because of the invariant above,
@@ -1828,7 +1911,7 @@ func oneNewExtraM() {
 	gp := malg(4096)
 	gp.sched.pc = funcPC(goexit) + sys.PCQuantum
 	gp.sched.sp = gp.stack.hi
-	gp.sched.sp -= 4 * sys.RegSize // extra space in case of reads slightly beyond frame
+	gp.sched.sp -= 4 * sys.PtrSize // extra space in case of reads slightly beyond frame
 	gp.sched.lr = 0
 	gp.sched.g = guintptr(unsafe.Pointer(gp))
 	gp.syscallpc = gp.sched.pc
@@ -1903,7 +1986,7 @@ func dropm() {
 	// Setg(nil) clears g, which is the signal handler's cue not to run Go handlers.
 	// It's important not to try to handle a signal between those two steps.
 	sigmask := mp.sigmask
-	sigblock()
+	sigblock(false)
 	unminit()
 
 	mnext := lockextra(true)
@@ -1940,7 +2023,7 @@ func lockextra(nilokay bool) *m {
 	for {
 		old := atomic.Loaduintptr(&extram)
 		if old == locked {
-			osyield()
+			osyield_no_g()
 			continue
 		}
 		if old == 0 && !nilokay {
@@ -1951,13 +2034,13 @@ func lockextra(nilokay bool) *m {
 				atomic.Xadd(&extraMWaiters, 1)
 				incr = true
 			}
-			usleep(1)
+			usleep_no_g(1)
 			continue
 		}
 		if atomic.Casuintptr(&extram, old, locked) {
 			return (*m)(unsafe.Pointer(old))
 		}
-		osyield()
+		osyield_no_g()
 		continue
 	}
 }
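A note on the lockextra/unlockextra hunks above: extram is a single word holding either nil, a locked sentinel, or the head of the extra-M list, and the loop now spins with osyield_no_g/usleep_no_g because it can run on a thread that has no g. Roughly the same protocol in ordinary Go, with runtime.Gosched and time.Sleep standing in for those primitives (all names here are illustrative; note also that hiding a pointer in a uintptr is only safe inside the runtime, which keeps extra Ms reachable by other means):

package main

import (
	"fmt"
	"runtime"
	"sync/atomic"
	"time"
	"unsafe"
)

type node struct {
	val  int
	next *node
}

const lockedSentinel uintptr = 1

// head is 0 (empty), lockedSentinel, or a *node smuggled as a uintptr.
var head uintptr

// lockList is the shape of lockextra: spin until the word holds a real
// pointer, then CAS it to the sentinel to take ownership.
func lockList() *node {
	for {
		old := atomic.LoadUintptr(&head)
		if old == lockedSentinel {
			runtime.Gosched() // the runtime uses osyield_no_g()
			continue
		}
		if old == 0 {
			time.Sleep(time.Microsecond) // cf. usleep_no_g(1)
			continue
		}
		if atomic.CompareAndSwapUintptr(&head, old, lockedSentinel) {
			return (*node)(unsafe.Pointer(old))
		}
		runtime.Gosched()
	}
}

// unlockList is the shape of unlockextra: store the (possibly updated)
// head, releasing ownership in the same write.
func unlockList(n *node) {
	atomic.StoreUintptr(&head, uintptr(unsafe.Pointer(n)))
}

func main() {
	first := &node{val: 42}
	unlockList(first) // publish an initial head
	n := lockList()
	fmt.Println(n.val) // 42
	unlockList(n)
	// Keep the node visibly alive across the uintptr-only window; the
	// real runtime does not need this because extra Ms are reachable
	// through other runtime structures.
	runtime.KeepAlive(first)
}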
@@ -2882,7 +2965,9 @@ func wakeNetPoller(when int64) {
 	} else {
 		// There are no threads in the network poller, try to get
 		// one there so it can handle new timers.
-		wakep()
+		if GOOS != "plan9" { // Temporary workaround - see issue #42303.
+			wakep()
+		}
 	}
 }
 
@@ -3371,11 +3456,19 @@ func goexit0(gp *g) {
 func save(pc, sp uintptr) {
 	_g_ := getg()
 
+	if _g_ == _g_.m.g0 || _g_ == _g_.m.gsignal {
+		// m.g0.sched is special and must describe the context
+		// for exiting the thread. mstart1 writes to it directly.
+		// m.gsignal.sched should not be used at all.
+		// This check makes sure save calls do not accidentally
+		// run in contexts where they'd write to system g's.
+		throw("save on system g not allowed")
+	}
+
 	_g_.sched.pc = pc
 	_g_.sched.sp = sp
 	_g_.sched.lr = 0
 	_g_.sched.ret = 0
-	_g_.sched.g = guintptr(unsafe.Pointer(_g_))
 	// We need to ensure ctxt is zero, but can't have a write
 	// barrier here. However, it should always already be zero.
 	// Assert that.
@@ -3389,7 +3482,7 @@ func save(pc, sp uintptr) {
 // This is called only from the go syscall library and cgocall,
 // not from the low-level system calls used by the runtime.
 //
-// Entersyscall cannot split the stack: the gosave must
+// Entersyscall cannot split the stack: the save must
 // make g->sched refer to the caller's stack segment, because
 // entersyscall is going to return immediately after.
 //
@@ -3774,7 +3867,7 @@ func beforefork() {
 	// group. See issue #18600.
 	gp.m.locks++
 	sigsave(&gp.m.sigmask)
-	sigblock()
+	sigblock(false)
 
 	// This function is called before fork in syscall package.
 	// Code between fork and exec must not allocate memory nor even try to grow stack.
@@ -3935,9 +4028,9 @@ func newproc1(fn *funcval, argp unsafe.Pointer, narg int32, callergp *g, callerp
 
 	// We could allocate a larger initial stack if necessary.
 	// Not worth it: this is almost always an error.
-	// 4*sizeof(uintreg): extra space added below
-	// sizeof(uintreg): caller's LR (arm) or return address (x86, in gostartcall).
-	if siz >= _StackMin-4*sys.RegSize-sys.RegSize {
+	// 4*PtrSize: extra space added below
+	// PtrSize: caller's LR (arm) or return address (x86, in gostartcall).
+	if siz >= _StackMin-4*sys.PtrSize-sys.PtrSize {
 		throw("newproc: function arguments too large for new goroutine")
 	}
 
@@ -3956,8 +4049,8 @@ func newproc1(fn *funcval, argp unsafe.Pointer, narg int32, callergp *g, callerp
 		throw("newproc1: new g is not Gdead")
 	}
 
-	totalSize := 4*sys.RegSize + uintptr(siz) + sys.MinFrameSize // extra space in case of reads slightly beyond frame
-	totalSize += -totalSize & (sys.SpAlign - 1)                  // align to spAlign
+	totalSize := 4*sys.PtrSize + uintptr(siz) + sys.MinFrameSize // extra space in case of reads slightly beyond frame
+	totalSize += -totalSize & (sys.StackAlign - 1)               // align to StackAlign
 	sp := newg.stack.hi - totalSize
 	spArg := sp
 	if usesLR {
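The alignment line in the hunk above, totalSize += -totalSize & (sys.StackAlign - 1), rounds the frame size up to the next multiple of a power of two: -x & (a-1) equals (a - x%a) % a, the distance from x up to the next multiple of a. A standalone demonstration (roundUp is a hypothetical helper, not runtime code):

package main

import "fmt"

// roundUp rounds x up to the next multiple of align, which must be a
// power of two. In two's complement, -x & (align-1) is exactly the
// distance from x up to that multiple (zero if x is already aligned).
func roundUp(x, align uintptr) uintptr {
	return x + (-x & (align - 1))
}

func main() {
	for _, x := range []uintptr{0, 1, 7, 8, 9, 30} {
		fmt.Printf("roundUp(%2d, 8) = %d\n", x, roundUp(x, 8))
	}
	// roundUp( 0, 8) = 0
	// roundUp( 1, 8) = 8
	// roundUp( 7, 8) = 8
	// roundUp( 8, 8) = 8
	// roundUp( 9, 8) = 16
	// roundUp(30, 8) = 32
}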
@@ -4263,7 +4356,7 @@ func badunlockosthread() {
 }
 
 func gcount() int32 {
-	n := int32(allglen) - sched.gFree.n - int32(atomic.Load(&sched.ngsys))
+	n := int32(atomic.Loaduintptr(&allglen)) - sched.gFree.n - int32(atomic.Load(&sched.ngsys))
 	for _, _p_ := range allp {
 		n -= _p_.gFree.n
 	}
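gcount above now loads allglen atomically instead of reading it under allglock; the per-P and sched counters it subtracts are still sampled racily, so the result is a best-effort snapshot rather than an exact figure. The same trade-off in miniature (illustrative names):

package main

import (
	"fmt"
	"sync/atomic"
)

// created only grows, like allglen; finished can lag behind. Each
// counter is read atomically, but the pair is not read as a unit.
var created, finished int64

// inFlight may be slightly stale in either direction while writers are
// active: fine for a statistic or a debug endpoint, not for a decision
// that needs an exact count.
func inFlight() int64 {
	return atomic.LoadInt64(&created) - atomic.LoadInt64(&finished)
}

func main() {
	atomic.AddInt64(&created, 3)
	atomic.AddInt64(&finished, 1)
	fmt.Println(inFlight()) // 2
}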
@@ -4330,75 +4423,6 @@ func sigprof(pc, sp, lr uintptr, gp *g, mp *m) {
 	// See golang.org/issue/17165.
 	getg().m.mallocing++
 
-	// Define that a "user g" is a user-created goroutine, and a "system g"
-	// is one that is m->g0 or m->gsignal.
-	//
-	// We might be interrupted for profiling halfway through a
-	// goroutine switch. The switch involves updating three (or four) values:
-	// g, PC, SP, and (on arm) LR. The PC must be the last to be updated,
-	// because once it gets updated the new g is running.
-	//
-	// When switching from a user g to a system g, LR is not considered live,
-	// so the update only affects g, SP, and PC. Since PC must be last, there
-	// the possible partial transitions in ordinary execution are (1) g alone is updated,
-	// (2) both g and SP are updated, and (3) SP alone is updated.
-	// If SP or g alone is updated, we can detect the partial transition by checking
-	// whether the SP is within g's stack bounds. (We could also require that SP
-	// be changed only after g, but the stack bounds check is needed by other
-	// cases, so there is no need to impose an additional requirement.)
-	//
-	// There is one exceptional transition to a system g, not in ordinary execution.
-	// When a signal arrives, the operating system starts the signal handler running
-	// with an updated PC and SP. The g is updated last, at the beginning of the
-	// handler. There are two reasons this is okay. First, until g is updated the
-	// g and SP do not match, so the stack bounds check detects the partial transition.
-	// Second, signal handlers currently run with signals disabled, so a profiling
-	// signal cannot arrive during the handler.
-	//
-	// When switching from a system g to a user g, there are three possibilities.
-	//
-	// First, it may be that the g switch has no PC update, because the SP
-	// either corresponds to a user g throughout (as in asmcgocall)
-	// or because it has been arranged to look like a user g frame
-	// (as in cgocallback). In this case, since the entire
-	// transition is a g+SP update, a partial transition updating just one of
-	// those will be detected by the stack bounds check.
-	//
-	// Second, when returning from a signal handler, the PC and SP updates
-	// are performed by the operating system in an atomic update, so the g
-	// update must be done before them. The stack bounds check detects
-	// the partial transition here, and (again) signal handlers run with signals
-	// disabled, so a profiling signal cannot arrive then anyway.
-	//
-	// Third, the common case: it may be that the switch updates g, SP, and PC
-	// separately. If the PC is within any of the functions that does this,
-	// we don't ask for a traceback. C.F. the function setsSP for more about this.
-	//
-	// There is another apparently viable approach, recorded here in case
-	// the "PC within setsSP function" check turns out not to be usable.
-	// It would be possible to delay the update of either g or SP until immediately
-	// before the PC update instruction. Then, because of the stack bounds check,
-	// the only problematic interrupt point is just before that PC update instruction,
-	// and the sigprof handler can detect that instruction and simulate stepping past
-	// it in order to reach a consistent state. On ARM, the update of g must be made
-	// in two places (in R10 and also in a TLS slot), so the delayed update would
-	// need to be the SP update. The sigprof handler must read the instruction at
-	// the current PC and if it was the known instruction (for example, JMP BX or
-	// MOV R2, PC), use that other register in place of the PC value.
-	// The biggest drawback to this solution is that it requires that we can tell
-	// whether it's safe to read from the memory pointed at by PC.
-	// In a correct program, we can test PC == nil and otherwise read,
-	// but if a profiling signal happens at the instant that a program executes
-	// a bad jump (before the program manages to handle the resulting fault)
-	// the profiling handler could fault trying to read nonexistent memory.
-	//
-	// To recap, there are no constraints on the assembly being used for the
-	// transition. We simply require that g and SP match and that the PC is not
-	// in gogo.
-	traceback := true
-	if gp == nil || sp < gp.stack.lo || gp.stack.hi < sp || setsSP(pc) || (mp != nil && mp.vdsoSP != 0) {
-		traceback = false
-	}
 	var stk [maxCPUProfStack]uintptr
 	n := 0
 	if mp.ncgo > 0 && mp.curg != nil && mp.curg.syscallpc != 0 && mp.curg.syscallsp != 0 {
@@ -4421,7 +4445,7 @@ func sigprof(pc, sp, lr uintptr, gp *g, mp *m) {
 		if n > 0 {
 			n += cgoOff
 		}
-	} else if traceback {
+	} else {
 		n = gentraceback(pc, sp, lr, gp, 0, &stk[0], len(stk), nil, nil, _TraceTrap|_TraceJumpStack)
 	}
 
@@ -4429,7 +4453,7 @@ func sigprof(pc, sp, lr uintptr, gp *g, mp *m) {
 		// Normal traceback is impossible or has failed.
 		// See if it falls into several common cases.
 		n = 0
-		if (GOOS == "windows" || GOOS == "solaris" || GOOS == "illumos" || GOOS == "darwin" || GOOS == "ios" || GOOS == "aix") && mp.libcallg != 0 && mp.libcallpc != 0 && mp.libcallsp != 0 {
+		if usesLibcall() && mp.libcallg != 0 && mp.libcallpc != 0 && mp.libcallsp != 0 {
 			// Libcall, i.e. runtime syscall on windows.
 			// Collect Go stack that leads to the call.
 			n = gentraceback(mp.libcallpc, mp.libcallsp, 0, mp.libcallg.ptr(), 0, &stk[0], len(stk), nil, nil, 0)
@@ -4500,30 +4524,6 @@ func sigprofNonGoPC(pc uintptr) {
 	}
 }
 
-// Reports whether a function will set the SP
-// to an absolute value. Important that
-// we don't traceback when these are at the bottom
-// of the stack since we can't be sure that we will
-// find the caller.
-//
-// If the function is not on the bottom of the stack
-// we assume that it will have set it up so that traceback will be consistent,
-// either by being a traceback terminating function
-// or putting one on the stack at the right offset.
-func setsSP(pc uintptr) bool {
-	f := findfunc(pc)
-	if !f.valid() {
-		// couldn't find the function for this PC,
-		// so assume the worst and stop traceback
-		return true
-	}
-	switch f.funcID {
-	case funcID_gogo, funcID_systemstack, funcID_mcall, funcID_morestack:
-		return true
-	}
-	return false
-}
-
 // setcpuprofilerate sets the CPU profiling rate to hz times per second.
 // If hz <= 0, setcpuprofilerate turns off CPU profiling.
 func setcpuprofilerate(hz int32) {
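Orientation for the deletions above: the entire removed commentary leans on one cheap test, namely that a sampled SP is trusted only when it lies inside the sampled g's stack bounds, which is how a half-finished g/SP switch was detected. That test in isolation, as a sketch (gLike is a stand-in type, not the runtime's g):

package main

import "fmt"

// gLike stands in for the runtime's g; only the stack bounds matter.
type gLike struct {
	stackLo, stackHi uintptr
}

// spMatchesG is the bounds check the removed heuristic was built on:
// if sp falls outside gp's stack, the sample may have caught a
// partially-completed goroutine switch, so no traceback was attempted.
func spMatchesG(gp *gLike, sp uintptr) bool {
	return gp != nil && gp.stackLo <= sp && sp <= gp.stackHi
}

func main() {
	g := &gLike{stackLo: 0xc000100000, stackHi: 0xc000108000}
	fmt.Println(spMatchesG(g, 0xc000104000)) // true: inside the bounds
	fmt.Println(spMatchesG(g, 0xdeadbeef))   // false: suspicious sample
}

In the new code, the else branch above always attempts gentraceback, so this guard and setsSP have no remaining callers.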
@@ -4967,7 +4967,6 @@ func checkdead() {
 		case _Grunnable,
 			_Grunning,
 			_Gsyscall:
-			unlock(&allglock)
 			print("runtime: checkdead: find g ", gp.goid, " in status ", s, "\n")
 			throw("checkdead: runnable g")
 		}
@@ -5128,6 +5127,26 @@ func sysmon() {
 			}
 		}
 		mDoFixup()
+		if GOOS == "netbsd" {
+			// netpoll is responsible for waiting for timer
+			// expiration, so we typically don't have to worry
+			// about starting an M to service timers. (Note that
+			// sleep for timeSleepUntil above simply ensures sysmon
+			// starts running again when that timer expiration may
+			// cause Go code to run again).
+			//
+			// However, netbsd has a kernel bug that sometimes
+			// misses netpollBreak wake-ups, which can lead to
+			// unbounded delays servicing timers. If we detect this
+			// overrun, then startm to get something to handle the
+			// timer.
+			//
+			// See issue 42515 and
+			// https://gnats.netbsd.org/cgi-bin/query-pr-single.pl?number=50094.
+			if next, _ := timeSleepUntil(); next < now {
+				startm(nil, false)
+			}
+		}
 		if atomic.Load(&scavenge.sysmonWake) != 0 {
 			// Kick the scavenger awake if someone requested it.
 			wakeScavenger()
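The netbsd branch above is a watchdog: if the earliest timer deadline is already in the past by the time sysmon runs, the netpollBreak wake-up may have been lost to the kernel bug, so an M is started directly. The shape of that check outside the runtime (all names illustrative):

package main

import (
	"fmt"
	"time"
)

// kickIfOverdue mirrors the sysmon workaround: nextDeadline reports the
// earliest pending timer (cf. timeSleepUntil) and kick starts a worker
// (cf. startm). If the deadline already passed, the normal wake-up path
// is assumed lost and we kick explicitly instead of waiting longer.
func kickIfOverdue(nextDeadline func() time.Time, kick func()) {
	if nextDeadline().Before(time.Now()) {
		kick()
	}
}

func main() {
	overdue := time.Now().Add(-50 * time.Millisecond)
	kickIfOverdue(
		func() time.Time { return overdue },
		func() { fmt.Println("kick: waking a worker for an overdue timer") },
	)
}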
