diff options
| author | Michael Pratt <mpratt@google.com> | 2025-10-24 15:14:59 -0400 |
|---|---|---|
| committer | Gopher Robot <gobot@golang.org> | 2025-11-13 07:44:41 -0800 |
| commit | 4ebf295b0b1740caac6302cc824ebd0f6175c1d5 (patch) | |
| tree | ccc6d602bcf37d7a40d144323ee340d14e830ebd /src/runtime/proc.go | |
| parent | 625d8e9b9cd7ede188a8856c5ac88791333baa63 (diff) | |
| download | go-4ebf295b0b1740caac6302cc824ebd0f6175c1d5.tar.xz | |
runtime: prefer to restart Ps on the same M after STW
Today, Ps jump around arbitrarily across STW. Instead, try to keep the P
on the previous M it ran on. In the future, we'll likely want to try to
expand this beyond STW to create a more general affinity for specific
Ms.
For this to be useful, the Ps need to have runnable Gs. Today, STW
preemption goes through goschedImpl, which places the G on the global
run queue. If that was the only G, then the P won't have runnable
goroutines anymore.
It makes more sense to keep the G with its P across STW anyway, so add a
special case to goschedImpl for that.
On my machine, this CL reduces the error rate in TestTraceSTW from 99.8%
to 1.9%.
As a nearly 2% error rate shows, there are still cases where this best
effort scheduling doesn't work. The most obvious is that while
procresize assigns Ps back to their original M, startTheWorldWithSema
calls wakep to start a spinning M. The spinning M may steal a goroutine
from another P if that P is too slow to start.
For #65694.
Change-Id: I6a6a636c0969c587d039b68bc68ea16c74ff1fc9
Reviewed-on: https://go-review.googlesource.com/c/go/+/714801
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Auto-Submit: Michael Pratt <mpratt@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Diffstat (limited to 'src/runtime/proc.go')
| -rw-r--r-- | src/runtime/proc.go | 83 |
1 file changed, 76 insertions, 7 deletions
diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 30d2a68626..44b64913a5 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -1009,6 +1009,8 @@ func mcommoninit(mp *m, id int64) { mp.id = mReserveID() } + mp.self = newMWeakPointer(mp) + mrandinit(mp) mpreinit(mp) @@ -2018,6 +2020,10 @@ func mexit(osStack bool) { // Free vgetrandom state. vgetrandomDestroy(mp) + // Clear the self pointer so Ps don't access this M after it is freed, + // or keep it alive. + mp.self.clear() + // Remove m from allm. lock(&sched.lock) for pprev := &allm; *pprev != nil; pprev = &(*pprev).alllink { @@ -4259,6 +4265,7 @@ func park_m(gp *g) { } func goschedImpl(gp *g, preempted bool) { + pp := gp.m.p.ptr() trace := traceAcquire() status := readgstatus(gp) if status&^_Gscan != _Grunning { @@ -4281,9 +4288,15 @@ func goschedImpl(gp *g, preempted bool) { } dropg() - lock(&sched.lock) - globrunqput(gp) - unlock(&sched.lock) + if preempted && sched.gcwaiting.Load() { + // If preempted for STW, keep the G on the local P in runnext + // so it can keep running immediately after the STW. + runqput(pp, gp, true) + } else { + lock(&sched.lock) + globrunqput(gp) + unlock(&sched.lock) + } if mainStarted { wakep() @@ -6013,6 +6026,7 @@ func procresize(nprocs int32) *p { } var runnablePs *p + var runnablePsNeedM *p for i := nprocs - 1; i >= 0; i-- { pp := allp[i] if gp.m.p.ptr() == pp { @@ -6021,12 +6035,41 @@ func procresize(nprocs int32) *p { pp.status = _Pidle if runqempty(pp) { pidleput(pp, now) - } else { - pp.m.set(mget()) - pp.link.set(runnablePs) - runnablePs = pp + continue + } + + // Prefer to run on the most recent M if it is + // available. + // + // Ps with no oldm (or for which oldm is already taken + // by an earlier P), we delay until all oldm Ps are + // handled. Otherwise, mget may return an M that a + // later P has in oldm. + var mp *m + if oldm := pp.oldm.get(); oldm != nil { + // Returns nil if oldm is not idle. 
+ mp = mgetSpecific(oldm) + } + if mp == nil { + // Call mget later. + pp.link.set(runnablePsNeedM) + runnablePsNeedM = pp + continue } + pp.m.set(mp) + pp.link.set(runnablePs) + runnablePs = pp } + for runnablePsNeedM != nil { + pp := runnablePsNeedM + runnablePsNeedM = pp.link.ptr() + + mp := mget() + pp.m.set(mp) + pp.link.set(runnablePs) + runnablePs = pp + } + stealOrder.reset(uint32(nprocs)) var int32p *int32 = &gomaxprocs // make compiler check that gomaxprocs is an int32 atomic.Store((*uint32)(unsafe.Pointer(int32p)), uint32(nprocs)) @@ -6064,6 +6107,11 @@ func acquirepNoTrace(pp *p) { // Have p; write barriers now allowed. + // The M we're associating with will be the old M after the next + // releasep. We must set this here because write barriers are not + // allowed in releasep. + pp.oldm = pp.m.ptr().self + // Perform deferred mcache flush before this P can allocate // from a potentially stale mcache. pp.mcache.prepareForSweep() @@ -6998,6 +7046,27 @@ func mget() *m { return mp } +// Try to get a specific m from midle list. Returns nil if it isn't on the +// midle list. +// +// sched.lock must be held. +// May run during STW, so write barriers are not allowed. +// +//go:nowritebarrierrec +func mgetSpecific(mp *m) *m { + assertLockHeld(&sched.lock) + + if mp.idleNode.prev == 0 && mp.idleNode.next == 0 { + // Not on the list. + return nil + } + + sched.midle.remove(unsafe.Pointer(mp)) + sched.nmidle-- + + return mp +} + // Put gp on the global runnable queue. // sched.lock must be held. // May run during STW, so write barriers are not allowed. |
