From 4ebf295b0b1740caac6302cc824ebd0f6175c1d5 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Fri, 24 Oct 2025 15:14:59 -0400 Subject: runtime: prefer to restart Ps on the same M after STW Today, Ps jump around arbitrarily across STW. Instead, try to keep the P on the previous M it ran on. In the future, we'll likely want to try to expand this beyond STW to create a more general affinity for specific Ms. For this to be useful, the Ps need to have runnable Gs. Today, STW preemption goes through goschedImpl, which places the G on the global run queue. If that was the only G then the P won't have runnable goroutines anymore. It makes more sense to keep the G with its P across STW anyway, so add a special case to goschedImpl for that. On my machine, this CL reduces the error rate in TestTraceSTW from 99.8% to 1.9%. As a nearly 2% error rate shows, there are still cases where this best effort scheduling doesn't work. The most obvious is that while procresize assigns Ps back to their original M, startTheWorldWithSema calls wakep to start a spinning M. The spinning M may steal a goroutine from another P if that P is too slow to start. For #65694. Change-Id: I6a6a636c0969c587d039b68bc68ea16c74ff1fc9 Reviewed-on: https://go-review.googlesource.com/c/go/+/714801 Reviewed-by: Michael Knyszek Auto-Submit: Michael Pratt LUCI-TryBot-Result: Go LUCI --- src/runtime/runtime2.go | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'src/runtime/runtime2.go') diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 85a9693ace..6c955460d4 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -716,6 +716,9 @@ type m struct { // Up to 10 locks held by this m, maintained by the lock ranking code. locksHeldLen int locksHeld [10]heldLockInfo + + // self points to this M until mexit clears it to return nil.
+ self mWeakPointer } const mRedZoneSize = (16 << 3) * asanenabledBit // redZoneSize(2048) @@ -730,6 +733,37 @@ type mPadded struct { _ [(1 - goarch.IsWasm) * (2048 - mallocHeaderSize - mRedZoneSize - unsafe.Sizeof(m{}))]byte } +// mWeakPointer is a "weak" pointer to an M. A weak pointer for each M is +// available as m.self. Users may copy mWeakPointer arbitrarily, and get will +// return the M if it is still live, or nil after mexit. +// +// The zero value is treated as a nil pointer. +// +// Note that get may race with M exit. A successful get will keep the m object +// alive, but the M itself may be exited and thus not actually usable. +type mWeakPointer struct { + m *atomic.Pointer[m] +} + +func newMWeakPointer(mp *m) mWeakPointer { + w := mWeakPointer{m: new(atomic.Pointer[m])} + w.m.Store(mp) + return w +} + +func (w mWeakPointer) get() *m { + if w.m == nil { + return nil + } + return w.m.Load() +} + +// clear sets the weak pointer to nil. It cannot be used on zero value +// mWeakPointers. +func (w mWeakPointer) clear() { + w.m.Store(nil) +} + type p struct { id int32 status uint32 // one of pidle/prunning/... @@ -742,6 +776,17 @@ type p struct { pcache pageCache raceprocctx uintptr + // oldm is the previous m this p ran on. + // + // We are not associated with this m, so we have no control over its + // lifecycle. This value is an m.self object which points to the m + // until the m exits. + // + // Note that this m may be idle, running, or exiting. It should only be + // used with mgetSpecific, which will take ownership of the m only if + // it is idle. + oldm mWeakPointer + deferpool []*_defer // pool of available defer structs (see panic.go) deferpoolbuf [32]*_defer -- cgit v1.3