aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/proc.go
diff options
context:
space:
mode:
authorMichael Anthony Knyszek <mknyszek@google.com>2025-02-02 19:50:39 +0000
committerGopher Robot <gobot@golang.org>2025-10-30 10:08:55 -0700
commit7244e9221ff25b0c93a13ad8f1aa8917ca50f697 (patch)
treee6fb8794933dca05c72b7571008982815027b0ac /src/runtime/proc.go
parent5ef19c0d0c870cc014ba28aa13b926c13ec1063a (diff)
downloadgo-7244e9221ff25b0c93a13ad8f1aa8917ca50f697.tar.xz
runtime: eliminate _Psyscall
This change eliminates the _Psyscall state by using synchronization on the G status _Gsyscall to make syscalls work instead. This removes an atomic Store and an atomic CAS on the syscall path, which reduces syscall and cgo overheads. It also simplifies the syscall paths quite a bit. The one danger with this change is that we have a new combination of states that was previously impossible. There are brief windows where it's possible to observe a goroutine in _Grunning but without a P. This change is careful to hide this detail from the execution tracer, but it may have unexpected effects in the rest of the runtime, making this change somewhat risky. goos: linux goarch: amd64 pkg: internal/runtime/cgobench cpu: AMD EPYC 7B13 │ before.out │ after.out │ │ sec/op │ sec/op vs base │ CgoCall-64 43.69n ± 1% 35.83n ± 1% -17.99% (p=0.002 n=6) CgoCallParallel-64 5.306n ± 1% 5.338n ± 1% ~ (p=0.132 n=6) Change-Id: I4551afc1eea0c1b67a0b2dd26b0d49aa47bf1fb8 Reviewed-on: https://go-review.googlesource.com/c/go/+/646198 Auto-Submit: Michael Knyszek <mknyszek@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Michael Pratt <mpratt@google.com>
Diffstat (limited to 'src/runtime/proc.go')
-rw-r--r--src/runtime/proc.go671
1 files changed, 412 insertions, 259 deletions
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 6c16effc95..081b9a2825 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -1654,29 +1654,21 @@ func stopTheWorldWithSema(reason stwReason) worldStop {
sched.stopwait = gomaxprocs
sched.gcwaiting.Store(true)
preemptall()
- // stop current P
+
+ // Stop current P.
gp.m.p.ptr().status = _Pgcstop // Pgcstop is only diagnostic.
gp.m.p.ptr().gcStopTime = start
sched.stopwait--
- // try to retake all P's in Psyscall status
- trace = traceAcquire()
+
+ // Try to retake all P's in syscalls.
for _, pp := range allp {
- s := pp.status
- if s == _Psyscall && atomic.Cas(&pp.status, s, _Pgcstop) {
- if trace.ok() {
- trace.ProcSteal(pp, false)
- }
- sched.nGsyscallNoP.Add(1)
- pp.syscalltick++
- pp.gcStopTime = nanotime()
- sched.stopwait--
+ if thread, ok := setBlockOnExitSyscall(pp); ok {
+ thread.gcstopP()
+ thread.resume()
}
}
- if trace.ok() {
- traceRelease(trace)
- }
- // stop idle P's
+ // Stop idle Ps.
now := nanotime()
for {
pp, _ := pidleget(now)
@@ -1690,7 +1682,7 @@ func stopTheWorldWithSema(reason stwReason) worldStop {
wait := sched.stopwait > 0
unlock(&sched.lock)
- // wait for remaining P's to stop voluntarily
+ // Wait for remaining Ps to stop voluntarily.
if wait {
for {
// wait for 100us, then try to re-preempt in case of any races
@@ -2156,9 +2148,9 @@ func forEachPInternal(fn func(*p)) {
}
preemptall()
- // Any P entering _Pidle or _Psyscall from now on will observe
+ // Any P entering _Pidle or a system call from now on will observe
// p.runSafePointFn == 1 and will call runSafePointFn when
- // changing its status to _Pidle/_Psyscall.
+ // changing its status to _Pidle.
// Run safe point function for all idle Ps. sched.pidle will
// not change because we hold sched.lock.
@@ -2175,25 +2167,19 @@ func forEachPInternal(fn func(*p)) {
// Run fn for the current P.
fn(pp)
- // Force Ps currently in _Psyscall into _Pidle and hand them
+ // Force Ps currently in a system call into _Pidle and hand them
// off to induce safe point function execution.
for _, p2 := range allp {
- s := p2.status
-
// We need to be fine-grained about tracing here, since handoffp
// might call into the tracer, and the tracer is non-reentrant.
- trace := traceAcquire()
- if s == _Psyscall && p2.runSafePointFn == 1 && atomic.Cas(&p2.status, s, _Pidle) {
- if trace.ok() {
- // It's important that we traceRelease before we call handoffp, which may also traceAcquire.
- trace.ProcSteal(p2, false)
- traceRelease(trace)
- }
- sched.nGsyscallNoP.Add(1)
- p2.syscalltick++
+ if atomic.Load(&p2.runSafePointFn) != 1 {
+ // Already ran it.
+ continue
+ }
+ if thread, ok := setBlockOnExitSyscall(p2); ok {
+ thread.takeP()
+ thread.resume()
handoffp(p2)
- } else if trace.ok() {
- traceRelease(trace)
}
}
@@ -2234,9 +2220,9 @@ func forEachPInternal(fn func(*p)) {
// }
//
// runSafePointFn must be checked on any transition in to _Pidle or
-// _Psyscall to avoid a race where forEachP sees that the P is running
-// just before the P goes into _Pidle/_Psyscall and neither forEachP
-// nor the P run the safe-point function.
+// when entering a system call to avoid a race where forEachP sees
+// that the P is running just before the P goes into _Pidle/system call
+// and neither forEachP nor the P run the safe-point function.
func runSafePointFn() {
p := getg().m.p.ptr()
// Resolve the race between forEachP running the safe-point
@@ -2571,7 +2557,7 @@ func oneNewExtraM() {
// So that the destructor would invoke dropm while the non-Go thread is exiting.
// This is much faster since it avoids expensive signal-related syscalls.
//
-// This always runs without a P, so //go:nowritebarrierrec is required.
+// This may run without a P, so //go:nowritebarrierrec is required.
//
// This may run with a different stack than was recorded in g0 (there is no
// call to callbackUpdateSystemStack prior to dropm), so this must be
@@ -4566,7 +4552,6 @@ func save(pc, sp, bp uintptr) {
//
//go:nosplit
func reentersyscall(pc, sp, bp uintptr) {
- trace := traceAcquire()
gp := getg()
// Disable preemption because during this function g is in Gsyscall status,
@@ -4580,17 +4565,23 @@ func reentersyscall(pc, sp, bp uintptr) {
gp.stackguard0 = stackPreempt
gp.throwsplit = true
+ // Copy the syscalltick over so we can identify if the P got stolen later.
+ gp.m.syscalltick = gp.m.p.ptr().syscalltick
+
+ pp := gp.m.p.ptr()
+ if pp.runSafePointFn != 0 {
+ // runSafePointFn may stack split if run on this stack
+ systemstack(runSafePointFn)
+ }
+ gp.m.oldp.set(pp)
+
// Leave SP around for GC and traceback.
save(pc, sp, bp)
gp.syscallsp = sp
gp.syscallpc = pc
gp.syscallbp = bp
- casgstatus(gp, _Grunning, _Gsyscall)
- if staticLockRanking {
- // When doing static lock ranking casgstatus can call
- // systemstack which clobbers g.sched.
- save(pc, sp, bp)
- }
+
+ // Double-check sp and bp.
if gp.syscallsp < gp.stack.lo || gp.stack.hi < gp.syscallsp {
systemstack(func() {
print("entersyscall inconsistent sp ", hex(gp.syscallsp), " [", hex(gp.stack.lo), ",", hex(gp.stack.hi), "]\n")
@@ -4603,40 +4594,45 @@ func reentersyscall(pc, sp, bp uintptr) {
throw("entersyscall")
})
}
-
+ trace := traceAcquire()
if trace.ok() {
+ // Emit a trace event. Notably, actually emitting the event must happen before
+ // the casgstatus because it mutates the P, but the traceLocker must be held
+ // across the casgstatus so the transition is atomic with respect to the event.
systemstack(func() {
trace.GoSysCall()
- traceRelease(trace)
})
- // systemstack itself clobbers g.sched.{pc,sp} and we might
- // need them later when the G is genuinely blocked in a
- // syscall
+ // systemstack clobbered gp.sched, so restore it.
save(pc, sp, bp)
}
-
- if sched.sysmonwait.Load() {
- systemstack(entersyscall_sysmon)
+ if sched.gcwaiting.Load() {
+ // Optimization: If there's a pending STW, do the equivalent of
+ // entersyscallblock here at the last minute and immediately give
+ // away our P.
+ systemstack(func() {
+ entersyscallHandleGCWait(trace)
+ })
+ // systemstack clobbered gp.sched, so restore it.
save(pc, sp, bp)
}
-
- if gp.m.p.ptr().runSafePointFn != 0 {
- // runSafePointFn may stack split if run on this stack
- systemstack(runSafePointFn)
+ // As soon as we switch to _Gsyscall, we are in danger of losing our P.
+ // We must not touch it after this point.
+ casgstatus(gp, _Grunning, _Gsyscall)
+ if staticLockRanking {
+ // casgstatus clobbers gp.sched via systemstack under staticLockRanking. Restore it.
save(pc, sp, bp)
}
-
- gp.m.syscalltick = gp.m.p.ptr().syscalltick
- pp := gp.m.p.ptr()
- pp.m = 0
- gp.m.oldp.set(pp)
- gp.m.p = 0
- atomic.Store(&pp.status, _Psyscall)
- if sched.gcwaiting.Load() {
- systemstack(entersyscall_gcwait)
+ if trace.ok() {
+ // N.B. We don't need to go on the systemstack because traceRelease is very
+ // carefully recursively nosplit. This also means we don't need to worry
+ // about clobbering gp.sched.
+ traceRelease(trace)
+ }
+ if sched.sysmonwait.Load() {
+ systemstack(entersyscallWakeSysmon)
+ // systemstack clobbered gp.sched, so restore it.
save(pc, sp, bp)
}
-
gp.m.locks--
}
@@ -4663,7 +4659,7 @@ func entersyscall() {
reentersyscall(sys.GetCallerPC(), sys.GetCallerSP(), fp)
}
-func entersyscall_sysmon() {
+func entersyscallWakeSysmon() {
lock(&sched.lock)
if sched.sysmonwait.Load() {
sched.sysmonwait.Store(false)
@@ -4672,25 +4668,19 @@ func entersyscall_sysmon() {
unlock(&sched.lock)
}
-func entersyscall_gcwait() {
+func entersyscallHandleGCWait(trace traceLocker) {
gp := getg()
- pp := gp.m.oldp.ptr()
lock(&sched.lock)
- trace := traceAcquire()
- if sched.stopwait > 0 && atomic.Cas(&pp.status, _Psyscall, _Pgcstop) {
+ if sched.stopwait > 0 {
+ // Set our P to _Pgcstop so the STW can take it.
+ pp := gp.m.p.ptr()
+ pp.m = 0
+ gp.m.p = 0
+ atomic.Store(&pp.status, _Pgcstop)
+
if trace.ok() {
- // This is a steal in the new tracer. While it's very likely
- // that we were the ones to put this P into _Psyscall, between
- // then and now it's totally possible it had been stolen and
- // then put back into _Psyscall for us to acquire here. In such
- // case ProcStop would be incorrect.
- //
- // TODO(mknyszek): Consider emitting a ProcStop instead when
- // gp.m.syscalltick == pp.syscalltick, since then we know we never
- // lost the P.
- trace.ProcSteal(pp, true)
- traceRelease(trace)
+ trace.ProcStop(pp)
}
sched.nGsyscallNoP.Add(1)
pp.gcStopTime = nanotime()
@@ -4698,8 +4688,6 @@ func entersyscall_gcwait() {
if sched.stopwait--; sched.stopwait == 0 {
notewakeup(&sched.stopnote)
}
- } else if trace.ok() {
- traceRelease(trace)
}
unlock(&sched.lock)
}
@@ -4744,6 +4732,22 @@ func entersyscallblock() {
throw("entersyscallblock")
})
}
+
+ // Once we switch to _Gsyscall, we can't safely touch
+ // our P anymore, so we need to hand it off beforehand.
+ // The tracer also needs to see the syscall before the P
+ // handoff, so the order here must be (1) trace,
+ // (2) handoff, (3) _Gsyscall switch.
+ trace := traceAcquire()
+ systemstack(func() {
+ if trace.ok() {
+ trace.GoSysCall()
+ }
+ handoffp(releasep())
+ })
+ // <--
+ // Caution: we're in a small window where we are in _Grunning without a P.
+ // -->
casgstatus(gp, _Grunning, _Gsyscall)
if gp.syscallsp < gp.stack.lo || gp.stack.hi < gp.syscallsp {
systemstack(func() {
@@ -4757,8 +4761,11 @@ func entersyscallblock() {
throw("entersyscallblock")
})
}
-
- systemstack(entersyscallblock_handoff)
+ if trace.ok() {
+ systemstack(func() {
+ traceRelease(trace)
+ })
+ }
// Resave for traceback during blocked call.
save(sys.GetCallerPC(), sys.GetCallerSP(), getcallerfp())
@@ -4766,15 +4773,6 @@ func entersyscallblock() {
gp.m.locks--
}
-func entersyscallblock_handoff() {
- trace := traceAcquire()
- if trace.ok() {
- trace.GoSysCall()
- traceRelease(trace)
- }
- handoffp(releasep())
-}
-
// The goroutine g exited its system call.
// Arrange for it to run on a cpu again.
// This is called only from the go syscall library, not
@@ -4802,13 +4800,87 @@ func exitsyscall() {
if sys.GetCallerSP() > gp.syscallsp {
throw("exitsyscall: syscall frame is no longer valid")
}
-
gp.waitsince = 0
+
+ if sched.stopwait == freezeStopWait {
+ // Wedge ourselves if there's an outstanding freezetheworld.
+ // If we transition to running, we might end up with our traceback
+ // being taken twice.
+ systemstack(func() {
+ lock(&deadlock)
+ lock(&deadlock)
+ })
+ }
+
+ // Optimistically assume we're going to keep running, and switch to running.
+ // Before this point, our P wiring is not ours. Once we get past this point,
+ // we can access our P if we have it, otherwise we lost it.
+ //
+ // N.B. Because we're transitioning to _Grunning here, traceAcquire doesn't
+ // need to be held ahead of time. We're effectively atomic with respect to
+ // the tracer because we're non-preemptible and in the runtime. It can't stop
+ // us to read a bad status.
+ casgstatus(gp, _Gsyscall, _Grunning)
+
+ // Caution: we're in a window where we may be in _Grunning without a P.
+ // Either we will grab a P or call exitsyscall0, where we'll switch to
+ // _Grunnable.
+
+ // Grab and clear our old P.
oldp := gp.m.oldp.ptr()
- gp.m.oldp = 0
- if exitsyscallfast(oldp) {
- // When exitsyscallfast returns success, we have a P so can now use
- // write barriers
+ gp.m.oldp.set(nil)
+
+ // Check if we still have a P, and if not, try to acquire an idle P.
+ pp := gp.m.p.ptr()
+ if pp != nil {
+ // Fast path: we still have our P. Just emit a syscall exit event.
+ if trace := traceAcquire(); trace.ok() {
+ systemstack(func() {
+ // The truth is we truly never lost the P, but syscalltick
+ // is used to indicate whether the P should be treated as
+ // lost anyway. For example, when syscalltick is trashed by
+ // dropm.
+ //
+ // TODO(mknyszek): Consider a more explicit mechanism for this.
+ // Then syscalltick doesn't need to be trashed, and can be used
+ // exclusively by sysmon for deciding when it's time to retake.
+ if pp.syscalltick == gp.m.syscalltick {
+ trace.GoSysExit(false)
+ } else {
+ // Since we need to pretend we lost the P, but nobody ever
+ // took it, we need a ProcSteal event to model the loss.
+ // Then, continue with everything else we'd do if we lost
+ // the P.
+ trace.ProcSteal(pp)
+ trace.ProcStart()
+ trace.GoSysExit(true)
+ trace.GoStart()
+ }
+ traceRelease(trace)
+ })
+ }
+ } else {
+ // Slow path: we lost our P. Try to get another one.
+ systemstack(func() {
+ // Try to get some other P.
+ if pp := exitsyscallTryGetP(oldp); pp != nil {
+ // Install the P.
+ acquirepNoTrace(pp)
+
+ // We're going to start running again, so emit all the relevant events.
+ if trace := traceAcquire(); trace.ok() {
+ trace.ProcStart()
+ trace.GoSysExit(true)
+ trace.GoStart()
+ traceRelease(trace)
+ }
+ }
+ })
+ pp = gp.m.p.ptr()
+ }
+
+ // If we have a P, clean up and exit.
+ if pp != nil {
if goroutineProfile.active {
// Make sure that gp has had its stack written out to the goroutine
// profile, exactly as it was when the goroutine profiler first
@@ -4817,41 +4889,19 @@ func exitsyscall() {
tryRecordGoroutineProfileWB(gp)
})
}
- trace := traceAcquire()
- if trace.ok() {
- lostP := oldp != gp.m.p.ptr() || gp.m.syscalltick != gp.m.p.ptr().syscalltick
- systemstack(func() {
- // Write out syscall exit eagerly.
- //
- // It's important that we write this *after* we know whether we
- // lost our P or not (determined by exitsyscallfast).
- trace.GoSysExit(lostP)
- if lostP {
- // We lost the P at some point, even though we got it back here.
- // Trace that we're starting again, because there was a tracev2.GoSysBlock
- // call somewhere in exitsyscallfast (indicating that this goroutine
- // had blocked) and we're about to start running again.
- trace.GoStart()
- }
- })
- }
- // There's a cpu for us, so we can run.
- gp.m.p.ptr().syscalltick++
- // We need to cas the status and scan before resuming...
- casgstatus(gp, _Gsyscall, _Grunning)
- if trace.ok() {
- traceRelease(trace)
- }
+
+ // Increment the syscalltick for P, since we're exiting a syscall.
+ pp.syscalltick++
// Garbage collector isn't running (since we are),
// so okay to clear syscallsp.
gp.syscallsp = 0
gp.m.locks--
if gp.preempt {
- // restore the preemption request in case we've cleared it in newstack
+ // Restore the preemption request in case we cleared it in newstack.
gp.stackguard0 = stackPreempt
} else {
- // otherwise restore the real stackGuard, we've spoiled it in entersyscall/entersyscallblock
+ // Otherwise restore the real stackGuard, we clobbered it in entersyscall/entersyscallblock.
gp.stackguard0 = gp.stack.lo + stackGuard
}
gp.throwsplit = false
@@ -4860,14 +4910,13 @@ func exitsyscall() {
// Scheduling of this goroutine is disabled.
Gosched()
}
-
return
}
-
+ // Slowest path: We couldn't get a P, so call into the scheduler.
gp.m.locks--
// Call the scheduler.
- mcall(exitsyscall0)
+ mcall(exitsyscallNoP)
// Scheduler returned, so we're allowed to run now.
// Delete the syscallsp information that we left for
@@ -4880,78 +4929,38 @@ func exitsyscall() {
gp.throwsplit = false
}
-//go:nosplit
-func exitsyscallfast(oldp *p) bool {
- // Freezetheworld sets stopwait but does not retake P's.
- if sched.stopwait == freezeStopWait {
- return false
- }
-
- // Try to re-acquire the last P.
- trace := traceAcquire()
- if oldp != nil && oldp.status == _Psyscall && atomic.Cas(&oldp.status, _Psyscall, _Pidle) {
- // There's a cpu for us, so we can run.
- wirep(oldp)
- exitsyscallfast_reacquired(trace)
- if trace.ok() {
- traceRelease(trace)
+// exitsyscall's attempt to try to get any P, if it's missing one.
+// Returns the acquired P on success, or nil on failure.
+//
+// Must execute on the systemstack because exitsyscall is nosplit.
+//
+//go:systemstack
+func exitsyscallTryGetP(oldp *p) *p {
+ // Try to steal our old P back.
+ if oldp != nil {
+ if thread, ok := setBlockOnExitSyscall(oldp); ok {
+ thread.takeP()
+ thread.resume()
+ sched.nGsyscallNoP.Add(-1) // takeP adds 1.
+ return oldp
}
- return true
- }
- if trace.ok() {
- traceRelease(trace)
}
- // Try to get any other idle P.
+ // Try to get an idle P.
if sched.pidle != 0 {
- var ok bool
- systemstack(func() {
- ok = exitsyscallfast_pidle()
- })
- if ok {
- return true
+ lock(&sched.lock)
+ pp, _ := pidleget(0)
+ if pp != nil && sched.sysmonwait.Load() {
+ sched.sysmonwait.Store(false)
+ notewakeup(&sched.sysmonnote)
}
- }
- return false
-}
-
-// exitsyscallfast_reacquired is the exitsyscall path on which this G
-// has successfully reacquired the P it was running on before the
-// syscall.
-//
-//go:nosplit
-func exitsyscallfast_reacquired(trace traceLocker) {
- gp := getg()
- if gp.m.syscalltick != gp.m.p.ptr().syscalltick {
- if trace.ok() {
- // The p was retaken and then enter into syscall again (since gp.m.syscalltick has changed).
- // tracev2.GoSysBlock for this syscall was already emitted,
- // but here we effectively retake the p from the new syscall running on the same p.
- systemstack(func() {
- // We're stealing the P. It's treated
- // as if it temporarily stopped running. Then, start running.
- trace.ProcSteal(gp.m.p.ptr(), true)
- trace.ProcStart()
- })
+ unlock(&sched.lock)
+ if pp != nil {
+ sched.nGsyscallNoP.Add(-1)
+ return pp
}
- gp.m.p.ptr().syscalltick++
- }
-}
-
-func exitsyscallfast_pidle() bool {
- lock(&sched.lock)
- pp, _ := pidleget(0)
- if pp != nil && sched.sysmonwait.Load() {
- sched.sysmonwait.Store(false)
- notewakeup(&sched.sysmonnote)
}
- unlock(&sched.lock)
- if pp != nil {
- sched.nGsyscallNoP.Add(-1)
- acquirep(pp)
- return true
- }
- return false
+ return nil
}
// exitsyscall slow path on g0.
@@ -4960,11 +4969,10 @@ func exitsyscallfast_pidle() bool {
// Called via mcall, so gp is the calling g from this M.
//
//go:nowritebarrierrec
-func exitsyscall0(gp *g) {
- var trace traceLocker
+func exitsyscallNoP(gp *g) {
traceExitingSyscall()
- trace = traceAcquire()
- casgstatus(gp, _Gsyscall, _Grunnable)
+ trace := traceAcquire()
+ casgstatus(gp, _Grunning, _Grunnable)
traceExitedSyscall()
if trace.ok() {
// Write out syscall exit eagerly.
@@ -6021,6 +6029,21 @@ func procresize(nprocs int32) *p {
//
//go:yeswritebarrierrec
func acquirep(pp *p) {
+ // Do the work.
+ acquirepNoTrace(pp)
+
+ // Emit the event.
+ trace := traceAcquire()
+ if trace.ok() {
+ trace.ProcStart()
+ traceRelease(trace)
+ }
+}
+
+// Internals of acquirep, just skipping the trace events.
+//
+//go:yeswritebarrierrec
+func acquirepNoTrace(pp *p) {
// Do the part that isn't allowed to have write barriers.
wirep(pp)
@@ -6029,12 +6052,6 @@ func acquirep(pp *p) {
// Perform deferred mcache flush before this P can allocate
// from a potentially stale mcache.
pp.mcache.prepareForSweep()
-
- trace := traceAcquire()
- if trace.ok() {
- trace.ProcStart()
- traceRelease(trace)
- }
}
// wirep is the first step of acquirep, which actually associates the
@@ -6382,73 +6399,205 @@ func retake(now int64) uint32 {
// temporarily drop the allpLock. Hence, we need to re-fetch
// allp each time around the loop.
for i := 0; i < len(allp); i++ {
+ // Quickly filter out non-running Ps. Running Ps are either
+ // in a syscall or are actually executing. Idle Ps don't
+ // need to be retaken.
+ //
+ // This is best-effort, so it's OK that it's racy. Our target
+ // is to retake Ps that have been running or in a syscall for
+ // a long time (milliseconds), so the state has plenty of time
+ // to stabilize.
pp := allp[i]
- if pp == nil {
- // This can happen if procresize has grown
+ if pp == nil || atomic.Load(&pp.status) != _Prunning {
+ // pp can be nil if procresize has grown
// allp but not yet created new Ps.
continue
}
pd := &pp.sysmontick
- s := pp.status
sysretake := false
- if s == _Prunning || s == _Psyscall {
- // Preempt G if it's running on the same schedtick for
- // too long. This could be from a single long-running
- // goroutine or a sequence of goroutines run via
- // runnext, which share a single schedtick time slice.
- t := int64(pp.schedtick)
- if int64(pd.schedtick) != t {
- pd.schedtick = uint32(t)
- pd.schedwhen = now
- } else if pd.schedwhen+forcePreemptNS <= now {
- preemptone(pp)
- // In case of syscall, preemptone() doesn't
- // work, because there is no M wired to P.
- sysretake = true
- }
+
+ // Preempt G if it's running on the same schedtick for
+ // too long. This could be from a single long-running
+ // goroutine or a sequence of goroutines run via
+ // runnext, which share a single schedtick time slice.
+ schedt := int64(pp.schedtick)
+ if int64(pd.schedtick) != schedt {
+ pd.schedtick = uint32(schedt)
+ pd.schedwhen = now
+ } else if pd.schedwhen+forcePreemptNS <= now {
+ preemptone(pp)
+ // If pp is in a syscall, preemptone doesn't work.
+ // Neither the goroutine nor the thread can respond to
+ // a preemption request because they're not in Go code,
+ // so we need to take the P ourselves.
+ sysretake = true
}
- if s == _Psyscall {
- // Retake P from syscall if it's there for more than 1 sysmon tick (at least 20us).
- t := int64(pp.syscalltick)
- if !sysretake && int64(pd.syscalltick) != t {
- pd.syscalltick = uint32(t)
- pd.syscallwhen = now
- continue
- }
- // On the one hand we don't want to retake Ps if there is no other work to do,
- // but on the other hand we want to retake them eventually
- // because they can prevent the sysmon thread from deep sleep.
- if runqempty(pp) && sched.nmspinning.Load()+sched.npidle.Load() > 0 && pd.syscallwhen+10*1000*1000 > now {
- continue
- }
- // Drop allpLock so we can take sched.lock.
- unlock(&allpLock)
- // Need to decrement number of idle locked M's
- // (pretending that one more is running) before the CAS.
- // Otherwise the M from which we retake can exit the syscall,
- // increment nmidle and report deadlock.
- incidlelocked(-1)
- trace := traceAcquire()
- if atomic.Cas(&pp.status, s, _Pidle) {
- if trace.ok() {
- trace.ProcSteal(pp, false)
- traceRelease(trace)
- }
- sched.nGsyscallNoP.Add(1)
- n++
- pp.syscalltick++
- handoffp(pp)
- } else if trace.ok() {
- traceRelease(trace)
- }
- incidlelocked(1)
- lock(&allpLock)
+
+ // Drop allpLock so we can take sched.lock.
+ unlock(&allpLock)
+
+ // Need to decrement number of idle locked M's (pretending that
+ // one more is running) before we take the P and resume.
+ // Otherwise the M from which we retake can exit the syscall,
+ // increment nmidle and report deadlock.
+ //
+ // Can't call incidlelocked once we setBlockOnExitSyscall, due
+ // to a lock ordering violation between sched.lock and _Gscan.
+ incidlelocked(-1)
+
+ // Try to prevent the P from continuing in the syscall, if it's in one at all.
+ thread, ok := setBlockOnExitSyscall(pp)
+ if !ok {
+ // Not in a syscall, or something changed out from under us.
+ goto done
}
+
+ // Retake the P if it's there for more than 1 sysmon tick (at least 20us).
+ if syst := int64(pp.syscalltick); !sysretake && int64(pd.syscalltick) != syst {
+ pd.syscalltick = uint32(syst)
+ pd.syscallwhen = now
+ thread.resume()
+ goto done
+ }
+
+ // On the one hand we don't want to retake Ps if there is no other work to do,
+ // but on the other hand we want to retake them eventually
+ // because they can prevent the sysmon thread from deep sleep.
+ if runqempty(pp) && sched.nmspinning.Load()+sched.npidle.Load() > 0 && pd.syscallwhen+10*1000*1000 > now {
+ thread.resume()
+ goto done
+ }
+
+ // Take the P. Note: because we have the scan bit, the goroutine
+ // is at worst stuck spinning in exitsyscall.
+ thread.takeP()
+ thread.resume()
+ n++
+
+ // Handoff the P for some other thread to run it.
+ handoffp(pp)
+
+ // The P has been handed off to another thread, so risk of a false
+ // deadlock report while we hold onto it is gone.
+ done:
+ incidlelocked(1)
+ lock(&allpLock)
}
unlock(&allpLock)
return uint32(n)
}
+// syscallingThread represents a thread in a system call that temporarily
+// cannot advance out of the system call.
+type syscallingThread struct {
+ gp *g
+ mp *m
+ pp *p
+ status uint32
+}
+
+// setBlockOnExitSyscall prevents pp's thread from advancing out of
+// exitsyscall. On success, returns the g/m/p state of the thread
+// and true. At that point, the caller owns the g/m/p links referenced,
+// the goroutine is in _Gsyscall, and prevented from transitioning out
+// of it. On failure, it returns false, and none of these guarantees are
+// made.
+//
+// Callers must call resume on the resulting thread state once
+// they're done with the thread, otherwise it will remain blocked forever.
+//
+// This function races with state changes on pp, and thus may fail
+// if pp is not in a system call, or exits a system call concurrently
+// with this function. However, this function is safe to call without
+// any additional synchronization.
+func setBlockOnExitSyscall(pp *p) (syscallingThread, bool) {
+ if pp.status != _Prunning {
+ return syscallingThread{}, false
+ }
+ // Be very careful here, these reads are intentionally racy.
+ // Once we notice the G is in _Gsyscall, acquire its scan bit,
+ // and validate that it's still connected to the *same* M and P,
+ // we can actually get to work. Holding the scan bit will prevent
+ // the G from exiting the syscall.
+ //
+ // Our goal here is to interrupt long syscalls. If it turns out
+ // that we're wrong and the G switched to another syscall while
+ // we were trying to do this, that's completely fine. It's
+ // probably making more frequent syscalls and the typical
+ // preemption paths should be effective.
+ mp := pp.m.ptr()
+ if mp == nil {
+ // Nothing to do.
+ return syscallingThread{}, false
+ }
+ gp := mp.curg
+ if gp == nil {
+ // Nothing to do.
+ return syscallingThread{}, false
+ }
+ status := readgstatus(gp) &^ _Gscan
+
+ // A goroutine is considered in a syscall, and may have a corresponding
+ // P, if it's in _Gsyscall *or* _Gdeadextra. In the latter case, it's an
+ // extra M goroutine.
+ if status != _Gsyscall && status != _Gdeadextra {
+ // Not in a syscall, nothing to do.
+ return syscallingThread{}, false
+ }
+ if !castogscanstatus(gp, status, status|_Gscan) {
+ // Not in _Gsyscall or _Gdeadextra anymore. Nothing to do.
+ return syscallingThread{}, false
+ }
+ if gp.m != mp || gp.m.p.ptr() != pp {
+ // This is not what we originally observed. Nothing to do.
+ casfrom_Gscanstatus(gp, status|_Gscan, status)
+ return syscallingThread{}, false
+ }
+ return syscallingThread{gp, mp, pp, status}, true
+}
+
+// gcstopP unwires the P attached to the syscalling thread
+// and moves it into the _Pgcstop state.
+//
+// The caller must be stopping the world.
+func (s syscallingThread) gcstopP() {
+ assertLockHeld(&sched.lock)
+
+ s.releaseP(_Pgcstop)
+ s.pp.gcStopTime = nanotime()
+ sched.stopwait--
+}
+
+// takeP unwires the P attached to the syscalling thread
+// and moves it into the _Pidle state.
+func (s syscallingThread) takeP() {
+ s.releaseP(_Pidle)
+}
+
+// releaseP unwires the P from the syscalling thread, moving
+// it to the provided state. Callers should prefer to use
+// takeP and gcstopP.
+func (s syscallingThread) releaseP(state uint32) {
+ if state != _Pidle && state != _Pgcstop {
+ throw("attempted to release P into a bad state")
+ }
+ trace := traceAcquire()
+ s.pp.m = 0
+ s.mp.p = 0
+ atomic.Store(&s.pp.status, state)
+ if trace.ok() {
+ trace.ProcSteal(s.pp)
+ traceRelease(trace)
+ }
+ sched.nGsyscallNoP.Add(1)
+ s.pp.syscalltick++
+}
+
+// resume allows a syscalling thread to advance beyond exitsyscall.
+func (s syscallingThread) resume() {
+ casfrom_Gscanstatus(s.gp, s.status|_Gscan, s.status)
+}
+
// Tell all goroutines that they have been preempted and they should stop.
// This function is purely best-effort. It can fail to inform a goroutine if a
// processor just started running it.
@@ -6486,6 +6635,10 @@ func preemptone(pp *p) bool {
if gp == nil || gp == mp.g0 {
return false
}
+ if readgstatus(gp)&^_Gscan == _Gsyscall {
+ // Don't bother trying to preempt a goroutine in a syscall.
+ return false
+ }
gp.preempt = true