4 files changed, 236 insertions, 13 deletions
diff --git a/src/runtime/mgcpacer.go b/src/runtime/mgcpacer.go
index f78fb6f636..388cce83cd 100644
--- a/src/runtime/mgcpacer.go
+++ b/src/runtime/mgcpacer.go
@@ -870,9 +870,12 @@ func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) {
 		gcCPULimiter.update(now)
 	}
 
-	ok, now := c.assignWaitingGCWorker(pp, now)
-	if !ok {
-		return nil, now
+	// If a worker wasn't already assigned by procresize, assign one now.
+	if pp.nextGCMarkWorker == nil {
+		ok, now := c.assignWaitingGCWorker(pp, now)
+		if !ok {
+			return nil, now
+		}
 	}
 
 	node := pp.nextGCMarkWorker
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index a378b8c39d..62e79e74e2 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -4135,11 +4135,23 @@ top:
 
 	gp, inheritTime, tryWakeP := findRunnable() // blocks until work is available
 
+	// May be on a new P.
+	pp = mp.p.ptr()
+
 	// findRunnable may have collected an allp snapshot. The snapshot is
 	// only required within findRunnable. Clear it to all GC to collect the
 	// slice.
 	mp.clearAllpSnapshot()
 
+	// If the P was assigned a next GC mark worker but findRunnable
+	// selected anything else, release the worker so another P may run it.
+	//
+	// N.B. If this occurs because a higher-priority goroutine was selected
+	// (trace reader), then tryWakeP is set, which will wake another P to
+	// run the worker. If this occurs because the GC is no longer active,
+	// there is no need to wakep.
+	gcController.releaseNextGCMarkWorker(pp)
+
 	if debug.dontfreezetheworld > 0 && freezing.Load() {
 		// See comment in freezetheworld. We don't want to perturb
 		// scheduler state, so we didn't gcstopm in findRunnable, but
@@ -6036,8 +6048,10 @@ func procresize(nprocs int32) *p {
 		unlock(&allpLock)
 	}
 
+	// Assign Ms to Ps with runnable goroutines.
 	var runnablePs *p
 	var runnablePsNeedM *p
+	var idlePs *p
 	for i := nprocs - 1; i >= 0; i-- {
 		pp := allp[i]
 		if gp.m.p.ptr() == pp {
@@ -6045,7 +6059,8 @@ func procresize(nprocs int32) *p {
 		}
 		pp.status = _Pidle
 		if runqempty(pp) {
-			pidleput(pp, now)
+			pp.link.set(idlePs)
+			idlePs = pp
 			continue
 		}
 
@@ -6071,6 +6086,8 @@ func procresize(nprocs int32) *p {
 		pp.link.set(runnablePs)
 		runnablePs = pp
 	}
+	// Assign Ms to remaining runnable Ps without usable oldm. See comment
+	// above.
 	for runnablePsNeedM != nil {
 		pp := runnablePsNeedM
 		runnablePsNeedM = pp.link.ptr()
@@ -6081,6 +6098,62 @@ func procresize(nprocs int32) *p {
 		runnablePs = pp
 	}
 
+	// Now that we've assigned Ms to Ps with runnable goroutines, assign GC
+	// mark workers to remaining idle Ps, if needed.
+	//
+	// By assigning GC workers to Ps here, we slightly speed up starting
+	// the world, as we will start enough Ps to run all of the user
+	// goroutines and GC mark workers all at once, rather than using a
+	// sequence of wakep calls as each P's findRunnable realizes it needs
+	// to run a mark worker instead of a user goroutine.
+	//
+	// By assigning GC workers to Ps only _after_ previously-running Ps are
+	// assigned Ms, we ensure that goroutines previously running on a P
+	// continue to run on the same P, with GC mark workers preferring
+	// previously-idle Ps. This helps prevent goroutines from shuffling
+	// around too much across STW.
+	//
+	// N.B., if there aren't enough Ps left in idlePs for all of the GC
+	// mark workers, then findRunnable will still choose to run mark
+	// workers on Ps assigned above.
+	//
+	// N.B., we do this during any STW in the mark phase, not just the
+	// sweep termination STW that starts the mark phase. gcBgMarkWorker
+	// always preempts by removing itself from the P, so even unrelated
+	// STWs during the mark require that Ps reselect mark workers upon
+	// restart.
+	if gcBlackenEnabled != 0 {
+		for idlePs != nil {
+			pp := idlePs
+
+			ok, _ := gcController.assignWaitingGCWorker(pp, now)
+			if !ok {
+				// No more mark workers needed.
+				break
+			}
+
+			// Got a worker, P is now runnable.
+			//
+			// mget may return nil if there aren't enough Ms, in
+			// which case startTheWorldWithSema will start one.
+			//
+			// N.B. findRunnableGCWorker will make the worker G
+			// itself runnable.
+			idlePs = pp.link.ptr()
+			mp := mget()
+			pp.m.set(mp)
+			pp.link.set(runnablePs)
+			runnablePs = pp
+		}
+	}
+
+	// Finally, any remaining Ps are truly idle.
+	for idlePs != nil {
+		pp := idlePs
+		idlePs = pp.link.ptr()
+		pidleput(pp, now)
+	}
+
 	stealOrder.reset(uint32(nprocs))
 	var int32p *int32 = &gomaxprocs // make compiler check that gomaxprocs is an int32
 	atomic.Store((*uint32)(unsafe.Pointer(int32p)), uint32(nprocs))
@@ -6183,6 +6256,10 @@ func releasepNoTrace() *p {
 		print("releasep: m=", gp.m, " m->p=", gp.m.p.ptr(), " p->m=", hex(pp.m), " p->status=", pp.status, "\n")
 		throw("releasep: invalid p state")
 	}
+
+	// P must clear if nextGCMarkWorker if it stops.
+	gcController.releaseNextGCMarkWorker(pp)
+
 	gp.m.p = 0
 	pp.m = 0
 	pp.status = _Pidle
diff --git a/src/runtime/proc_test.go b/src/runtime/proc_test.go
index b3084f4895..35a1aeab1f 100644
--- a/src/runtime/proc_test.go
+++ b/src/runtime/proc_test.go
@@ -1221,7 +1221,7 @@ func TestTraceSTW(t *testing.T) {
 
 	var errors int
 	for i := range runs {
-		err := runTestTracesSTW(t, i)
+		err := runTestTracesSTW(t, i, "TraceSTW", "stop-the-world (read mem stats)")
 		if err != nil {
 			t.Logf("Run %d failed: %v", i, err)
 			errors++
@@ -1235,7 +1235,43 @@ func TestTraceSTW(t *testing.T) {
 	}
 }
 
-func runTestTracesSTW(t *testing.T, run int) (err error) {
+// TestTraceGCSTW verifies that goroutines continue running on the same M and P
+// after a GC STW.
+func TestTraceGCSTW(t *testing.T) {
+	// Very similar to TestTraceSTW, but using a STW that starts the GC.
+	// When the GC starts, the background GC mark workers start running,
+	// which provide an additional source of disturbance to the scheduler.
+	//
+	// procresize assigns GC workers to previously-idle Ps to avoid
+	// changing what the previously-running Ps are doing.
+
+	if testing.Short() {
+		t.Skip("skipping in -short mode")
+	}
+
+	if runtime.NumCPU() < 8 {
+		t.Skip("This test sets GOMAXPROCS=8 and wants to avoid thread descheduling as much as possible. Skip on machines with less than 8 CPUs")
+	}
+
+	const runs = 50
+
+	var errors int
+	for i := range runs {
+		err := runTestTracesSTW(t, i, "TraceGCSTW", "stop-the-world (GC sweep termination)")
+		if err != nil {
+			t.Logf("Run %d failed: %v", i, err)
+			errors++
+		}
+	}
+
+	pct := float64(errors)/float64(runs)
+	t.Logf("Errors: %d/%d = %f%%", errors, runs, 100*pct)
+	if pct > 0.25 {
+		t.Errorf("Error rate too high")
+	}
+}
+
+func runTestTracesSTW(t *testing.T, run int, name, stwType string) (err error) {
 	t.Logf("Run %d", run)
 
 	// By default, TSAN sleeps for 1s at exit to allow background
@@ -1243,7 +1279,7 @@ func runTestTracesSTW(t *testing.T, run int) (err error) {
 	// much, since we are running 50 iterations, so disable the sleep.
 	//
 	// Outside of race mode, GORACE does nothing.
-	buf := []byte(runTestProg(t, "testprog", "TraceSTW", "GORACE=atexit_sleep_ms=0"))
+	buf := []byte(runTestProg(t, "testprog", name, "GORACE=atexit_sleep_ms=0"))
 
 	// We locally "fail" the run (return an error) if the trace exhibits
 	// unwanted scheduling. i.e., the target goroutines did not remain on
@@ -1253,7 +1289,7 @@ func runTestTracesSTW(t *testing.T, run int) (err error) {
 	// occur, such as a trace parse error.
 	defer func() {
 		if err != nil || t.Failed() {
-			testtrace.Dump(t, fmt.Sprintf("TestTraceSTW-run%d", run), []byte(buf), false)
+			testtrace.Dump(t, fmt.Sprintf("Test%s-run%d", name, run), []byte(buf), false)
 		}
 	}()
 
@@ -1509,12 +1545,10 @@ findEnd:
 			break findEnd
 		case trace.EventRangeBegin:
 			r := ev.Range()
-			if r.Name == "stop-the-world (read mem stats)" {
+			if r.Name == stwType {
 				// Note when we see the STW begin. This is not
 				// load bearing; it's purpose is simply to fail
-				// the test if we manage to remove the STW from
-				// ReadMemStat, so we remember to change this
-				// test to add some new source of STW.
+				// the test if we accidentally remove the STW.
 				stwSeen = true
 			}
 		}
diff --git a/src/runtime/testdata/testprog/stw_trace.go b/src/runtime/testdata/testprog/stw_trace.go
index 0fed55b875..0fa15da09e 100644
--- a/src/runtime/testdata/testprog/stw_trace.go
+++ b/src/runtime/testdata/testprog/stw_trace.go
@@ -7,15 +7,18 @@ package main
 import (
 	"context"
 	"log"
+	"math/rand/v2"
 	"os"
 	"runtime"
 	"runtime/debug"
+	"runtime/metrics"
 	"runtime/trace"
 	"sync/atomic"
 )
 
 func init() {
 	register("TraceSTW", TraceSTW)
+	register("TraceGCSTW", TraceGCSTW)
 }
 
 // The parent writes to ping and waits for the children to write back
@@ -53,7 +56,7 @@ func TraceSTW() {
 	// https://go.dev/issue/65694). Alternatively, we could just ignore the
 	// trace if the GC runs.
 	runtime.GOMAXPROCS(4)
-	debug.SetGCPercent(0)
+	debug.SetGCPercent(-1)
 
 	if err := trace.Start(os.Stdout); err != nil {
 		log.Fatalf("failed to start tracing: %v", err)
@@ -86,6 +89,112 @@ func TraceSTW() {
 	stop.Store(true)
 }
 
+// Variant of TraceSTW for GC STWs. We want the GC mark workers to start on
+// previously-idle Ps, rather than bumping the current P.
+func TraceGCSTW() {
+	ctx := context.Background()
+
+	// The idea here is to have 2 target goroutines that are constantly
+	// running. When the world restarts after STW, we expect these
+	// goroutines to continue execution on the same M and P.
+	//
+	// Set GOMAXPROCS=8 to make room for the 2 target goroutines, 1 parent,
+	// 2 dedicated workers, and a bit of slack.
+	//
+	// Disable the GC initially so we can be sure it only triggers once we
+	// are ready.
+	runtime.GOMAXPROCS(8)
+	debug.SetGCPercent(-1)
+
+	if err := trace.Start(os.Stdout); err != nil {
+		log.Fatalf("failed to start tracing: %v", err)
+	}
+	defer trace.Stop()
+
+	for i := range 2 {
+		go traceSTWTarget(i)
+	}
+
+	// Wait for children to start running.
+	ping.Store(1)
+	for pong[0].Load() != 1 {}
+	for pong[1].Load() != 1 {}
+
+	trace.Log(ctx, "TraceSTW", "start")
+
+	// STW
+	triggerGC()
+
+	// Make sure to run long enough for the children to schedule again
+	// after STW. This is included for good measure, but the goroutines
+	// really ought to have already scheduled since the entire GC
+	// completed.
+	ping.Store(2)
+	for pong[0].Load() != 2 {}
+	for pong[1].Load() != 2 {}
+
+	trace.Log(ctx, "TraceSTW", "end")
+
+	stop.Store(true)
+}
+
+func triggerGC() {
+	// Allocate a bunch to trigger the GC rather than using runtime.GC. The
+	// latter blocks until the GC is complete, which is convenient, but
+	// messes with scheduling as it gives this P a chance to steal the
+	// other goroutines before their Ps get up and running again.
+
+	// Bring heap size up prior to enabling the GC to ensure that there is
+	// a decent amount of work in case the GC triggers immediately upon
+	// re-enabling.
+	for range 1000 {
+		alloc()
+	}
+
+	sample := make([]metrics.Sample, 1)
+	sample[0].Name = "/gc/cycles/total:gc-cycles"
+	metrics.Read(sample)
+
+	start := sample[0].Value.Uint64()
+
+	debug.SetGCPercent(100)
+
+	// Keep allocating until the GC is complete. We really only need to
+	// continue until the mark workers are scheduled, but there isn't a
+	// good way to measure that.
+	for {
+		metrics.Read(sample)
+		if sample[0].Value.Uint64() != start {
+			return
+		}
+
+		alloc()
+	}
+}
+
+// Allocate a tree data structure to generate plenty of scan work for the GC.
+
+type node struct {
+	children []*node
+}
+
+var gcSink node
+
+func alloc() {
+	// 10% chance of adding a node a each layer.
+
+	curr := &gcSink
+	for {
+		if len(curr.children) == 0 || rand.Float32() < 0.1 {
+			curr.children = append(curr.children, new(node))
+			return
+		}
+
+		i := rand.IntN(len(curr.children))
+		curr = curr.children[i]
+	}
+}
+
 // Manually insert a morestack call. Leaf functions can omit morestack, but
 // non-leaf functions should include them.