From 7a8d0b5d53dc7be331016b6903707aa256e22c2a Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Thu, 30 Oct 2025 17:24:47 +0000 Subject: runtime: add debug mode to extend _Grunning-without-P windows This was suggested in CL 646198, and I tested with it, I forgot to push it so it wasn't merged. I think it might be worth keeping. Change-Id: Ibf97d969fe7d0eeb365f5f7b1fbeadea3a1076ab Reviewed-on: https://go-review.googlesource.com/c/go/+/716580 Reviewed-by: Cherry Mui Auto-Submit: Michael Knyszek LUCI-TryBot-Result: Go LUCI --- src/runtime/proc.go | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'src/runtime/proc.go') diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 44b64913a5..44dbb2fd44 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -4658,6 +4658,11 @@ func reentersyscall(pc, sp, bp uintptr) { gp.m.locks-- } +// debugExtendGrunningNoP is a debug mode that extends the windows in which +// we're _Grunning without a P in order to try to shake out bugs with code +// assuming this state is impossible. +const debugExtendGrunningNoP = false + // Standard syscall entry used by the go syscall library and normal cgo calls. // // This is exported via linkname to assembly in the syscall package and x/sys. @@ -4770,6 +4775,9 @@ func entersyscallblock() { // <-- // Caution: we're in a small window where we are in _Grunning without a P. // --> + if debugExtendGrunningNoP { + usleep(10) + } casgstatus(gp, _Grunning, _Gsyscall) if gp.syscallsp < gp.stack.lo || gp.stack.hi < gp.syscallsp { systemstack(func() { @@ -4852,6 +4860,9 @@ func exitsyscall() { // Caution: we're in a window where we may be in _Grunning without a P. // Either we will grab a P or call exitsyscall0, where we'll switch to // _Grunnable. + if debugExtendGrunningNoP { + usleep(10) + } // Grab and clear our old P. oldp := gp.m.oldp.ptr() -- cgit v1.3-5-g9baa From d55ecea9e5a5a4cfba30c6f35d4841ae66e05ccd Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Fri, 14 Nov 2025 17:41:58 +0000 Subject: runtime: usleep before stealing runnext only if not in syscall In the scheduler's steal path, we usleep(3) before stealing a _Prunning P's runnext slot. Before CL 646198, we would not call usleep(3) if the P was in _Psyscall. After CL 646198, Ps with Gs in syscalls stay in _Prunning until stolen, meaning we might unnecessarily usleep(3) where we didn't before. This probably isn't a huge deal in most cases, but can cause some apparent slowdowns in microbenchmarks that frequently take the steal path while there are syscalling goroutines. Change-Id: I5bf3df10fe61cf8d7f0e9fe9522102de66faf344 Reviewed-on: https://go-review.googlesource.com/c/go/+/720441 LUCI-TryBot-Result: Go LUCI Reviewed-by: Michael Pratt --- src/runtime/proc.go | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) (limited to 'src/runtime/proc.go') diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 44dbb2fd44..61364d91ff 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -7507,23 +7507,36 @@ func runqgrab(pp *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool) // Try to steal from pp.runnext. if next := pp.runnext; next != 0 { if pp.status == _Prunning { - // Sleep to ensure that pp isn't about to run the g - // we are about to steal. - // The important use case here is when the g running - // on pp ready()s another g and then almost - // immediately blocks. Instead of stealing runnext - // in this window, back off to give pp a chance to - // schedule runnext. This will avoid thrashing gs - // between different Ps. - // A sync chan send/recv takes ~50ns as of time of - // writing, so 3us gives ~50x overshoot. - if !osHasLowResTimer { - usleep(3) - } else { - // On some platforms system timer granularity is - // 1-15ms, which is way too much for this - // optimization. So just yield. - osyield() + if mp := pp.m.ptr(); mp != nil { + if gp := mp.curg; gp == nil || readgstatus(gp)&^_Gscan != _Gsyscall { + // Sleep to ensure that pp isn't about to run the g + // we are about to steal. + // The important use case here is when the g running + // on pp ready()s another g and then almost + // immediately blocks. Instead of stealing runnext + // in this window, back off to give pp a chance to + // schedule runnext. This will avoid thrashing gs + // between different Ps. + // A sync chan send/recv takes ~50ns as of time of + // writing, so 3us gives ~50x overshoot. + // If curg is nil, we assume that the P is likely + // to be in the scheduler. If curg isn't nil and isn't + // in a syscall, then it's either running, waiting, or + // runnable. In this case we want to sleep because the + // P might either call into the scheduler soon (running), + // or already is (since we found a waiting or runnable + // goroutine hanging off of a running P, suggesting it + // either recently transitioned out of running, or will + // transition to running shortly). + if !osHasLowResTimer { + usleep(3) + } else { + // On some platforms system timer granularity is + // 1-15ms, which is way too much for this + // optimization. So just yield. + osyield() + } + } } } if !pp.runnext.cas(next, 0) { -- cgit v1.3-5-g9baa From 6919858338ae3c4f244f65ca87e9e71662a83413 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 17 Nov 2025 13:34:51 -0500 Subject: runtime: rename findrunnable references to findRunnable These cases were missed by CL 393880. Change-Id: I6a6a636cf0d97a4efcf4b9df766002ecef48b4de Reviewed-on: https://go-review.googlesource.com/c/go/+/721120 LUCI-TryBot-Result: Go LUCI Auto-Submit: Michael Pratt Reviewed-by: Michael Knyszek --- src/runtime/proc.go | 18 +++++++++--------- src/runtime/runtime2.go | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'src/runtime/proc.go') diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 61364d91ff..a378b8c39d 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -3119,7 +3119,7 @@ func startm(pp *p, spinning, lockheld bool) { //go:nowritebarrierrec func handoffp(pp *p) { // handoffp must start an M in any situation where - // findrunnable would return a G to run on pp. + // findRunnable would return a G to run on pp. // if it has local work, start it straight away if !runqempty(pp) || !sched.runq.empty() { @@ -3362,7 +3362,7 @@ func findRunnable() (gp *g, inheritTime, tryWakeP bool) { mp := getg().m // The conditions here and in handoffp must agree: if - // findrunnable would return a G to run, handoffp must start + // findRunnable would return a G to run, handoffp must start // an M. top: @@ -3586,7 +3586,7 @@ top: goto top } if releasep() != pp { - throw("findrunnable: wrong p") + throw("findRunnable: wrong p") } now = pidleput(pp, now) unlock(&sched.lock) @@ -3631,7 +3631,7 @@ top: if mp.spinning { mp.spinning = false if sched.nmspinning.Add(-1) < 0 { - throw("findrunnable: negative nmspinning") + throw("findRunnable: negative nmspinning") } // Note the for correctness, only the last M transitioning from @@ -3704,10 +3704,10 @@ top: if netpollinited() && (netpollAnyWaiters() || pollUntil != 0) && sched.lastpoll.Swap(0) != 0 { sched.pollUntil.Store(pollUntil) if mp.p != 0 { - throw("findrunnable: netpoll with p") + throw("findRunnable: netpoll with p") } if mp.spinning { - throw("findrunnable: netpoll with spinning") + throw("findRunnable: netpoll with spinning") } delay := int64(-1) if pollUntil != 0 { @@ -3973,7 +3973,7 @@ func checkIdleGCNoP() (*p, *g) { // timers and the network poller if there isn't one already. func wakeNetPoller(when int64) { if sched.lastpoll.Load() == 0 { - // In findrunnable we ensure that when polling the pollUntil + // In findRunnable we ensure that when polling the pollUntil // field is either zero or the time to which the current // poll is expected to run. This can have a spurious wakeup // but should never miss a wakeup. @@ -3998,7 +3998,7 @@ func resetspinning() { gp.m.spinning = false nmspinning := sched.nmspinning.Add(-1) if nmspinning < 0 { - throw("findrunnable: negative nmspinning") + throw("findRunnable: negative nmspinning") } // M wakeup policy is deliberately somewhat conservative, so check if we // need to wakeup another P here. See "Worker thread parking/unparking" @@ -7269,7 +7269,7 @@ func pidlegetSpinning(now int64) (*p, int64) { pp, now := pidleget(now) if pp == nil { - // See "Delicate dance" comment in findrunnable. We found work + // See "Delicate dance" comment in findRunnable. We found work // that we cannot take, we must synchronize with non-spinning // Ms that may be preparing to drop their P. sched.needspinning.Store(1) diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 6c955460d4..e415df6ec1 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -1425,9 +1425,9 @@ var ( // must be set. An idle P (passed to pidleput) cannot add new timers while // idle, so if it has no timers at that time, its mask may be cleared. // - // Thus, we get the following effects on timer-stealing in findrunnable: + // Thus, we get the following effects on timer-stealing in findRunnable: // - // - Idle Ps with no timers when they go idle are never checked in findrunnable + // - Idle Ps with no timers when they go idle are never checked in findRunnable // (for work- or timer-stealing; this is the ideal case). // - Running Ps must always be checked. // - Idle Ps whose timers are stolen must continue to be checked until they run -- cgit v1.3-5-g9baa From a18aff805706bfdaeb9aca042111fae32f9f8b61 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 17 Nov 2025 16:08:21 -0500 Subject: runtime: select GC mark workers during start-the-world When the GC starts today, procresize and startTheWorldWithSema don't consider the additional Ps required to run the mark workers. procresize and startTheWorldWithSema resume only the Ps necessary to run the normal user goroutines. Once those Ps start, findRunnable and findRunnableGCWorker determine that a GC worker is necessary and run the worker instead, calling wakep to wake another P to run the original user goroutine. This is unfortunate because it disrupts the intentional placement of Ps on Ms that procresize does. It also has the unfortunate side effect of slightly delaying start-the-world time, as it takes several sequential wakeps to get all Ps started. To address this, procresize explicitly assigns GC mark workers to Ps before starting the world. The assignment occurs _after_ selecting runnable Ps, so that we prefer to select Ps that were previously idle. Note that if fewer than 25% of Ps are idle then we won't be able to assign all dedicated workers, and some of the Ps intended for user goroutines will convert to dedicated workers once they reach findRunnableGCWorker. Also note that stack scanning temporarily suspends the goroutine. Resume occurs through ready, which will move the goroutine to the local runq of the P that did the scan. Thus there is still a source of migration at some point during the GC. For #65694. Cq-Include-Trybots: luci.golang.try:gotip-linux-amd64-longtest Change-Id: I6a6a636c51f39f4f4bc716aa87de68f6ebe163a5 Reviewed-on: https://go-review.googlesource.com/c/go/+/721002 LUCI-TryBot-Result: Go LUCI Auto-Submit: Michael Pratt Reviewed-by: Michael Knyszek --- src/runtime/mgcpacer.go | 9 ++- src/runtime/proc.go | 79 +++++++++++++++++++- src/runtime/proc_test.go | 50 ++++++++++--- src/runtime/testdata/testprog/stw_trace.go | 111 ++++++++++++++++++++++++++++- 4 files changed, 236 insertions(+), 13 deletions(-) (limited to 'src/runtime/proc.go') diff --git a/src/runtime/mgcpacer.go b/src/runtime/mgcpacer.go index f78fb6f636..388cce83cd 100644 --- a/src/runtime/mgcpacer.go +++ b/src/runtime/mgcpacer.go @@ -870,9 +870,12 @@ func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) { gcCPULimiter.update(now) } - ok, now := c.assignWaitingGCWorker(pp, now) - if !ok { - return nil, now + // If a worker wasn't already assigned by procresize, assign one now. + if pp.nextGCMarkWorker == nil { + ok, now := c.assignWaitingGCWorker(pp, now) + if !ok { + return nil, now + } } node := pp.nextGCMarkWorker diff --git a/src/runtime/proc.go b/src/runtime/proc.go index a378b8c39d..62e79e74e2 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -4135,11 +4135,23 @@ top: gp, inheritTime, tryWakeP := findRunnable() // blocks until work is available + // May be on a new P. + pp = mp.p.ptr() + // findRunnable may have collected an allp snapshot. The snapshot is // only required within findRunnable. Clear it to all GC to collect the // slice. mp.clearAllpSnapshot() + // If the P was assigned a next GC mark worker but findRunnable + // selected anything else, release the worker so another P may run it. + // + // N.B. If this occurs because a higher-priority goroutine was selected + // (trace reader), then tryWakeP is set, which will wake another P to + // run the worker. If this occurs because the GC is no longer active, + // there is no need to wakep. + gcController.releaseNextGCMarkWorker(pp) + if debug.dontfreezetheworld > 0 && freezing.Load() { // See comment in freezetheworld. We don't want to perturb // scheduler state, so we didn't gcstopm in findRunnable, but @@ -6036,8 +6048,10 @@ func procresize(nprocs int32) *p { unlock(&allpLock) } + // Assign Ms to Ps with runnable goroutines. var runnablePs *p var runnablePsNeedM *p + var idlePs *p for i := nprocs - 1; i >= 0; i-- { pp := allp[i] if gp.m.p.ptr() == pp { @@ -6045,7 +6059,8 @@ func procresize(nprocs int32) *p { } pp.status = _Pidle if runqempty(pp) { - pidleput(pp, now) + pp.link.set(idlePs) + idlePs = pp continue } @@ -6071,6 +6086,8 @@ func procresize(nprocs int32) *p { pp.link.set(runnablePs) runnablePs = pp } + // Assign Ms to remaining runnable Ps without usable oldm. See comment + // above. for runnablePsNeedM != nil { pp := runnablePsNeedM runnablePsNeedM = pp.link.ptr() @@ -6081,6 +6098,62 @@ func procresize(nprocs int32) *p { runnablePs = pp } + // Now that we've assigned Ms to Ps with runnable goroutines, assign GC + // mark workers to remaining idle Ps, if needed. + // + // By assigning GC workers to Ps here, we slightly speed up starting + // the world, as we will start enough Ps to run all of the user + // goroutines and GC mark workers all at once, rather than using a + // sequence of wakep calls as each P's findRunnable realizes it needs + // to run a mark worker instead of a user goroutine. + // + // By assigning GC workers to Ps only _after_ previously-running Ps are + // assigned Ms, we ensure that goroutines previously running on a P + // continue to run on the same P, with GC mark workers preferring + // previously-idle Ps. This helps prevent goroutines from shuffling + // around too much across STW. + // + // N.B., if there aren't enough Ps left in idlePs for all of the GC + // mark workers, then findRunnable will still choose to run mark + // workers on Ps assigned above. + // + // N.B., we do this during any STW in the mark phase, not just the + // sweep termination STW that starts the mark phase. gcBgMarkWorker + // always preempts by removing itself from the P, so even unrelated + // STWs during the mark require that Ps reselect mark workers upon + // restart. + if gcBlackenEnabled != 0 { + for idlePs != nil { + pp := idlePs + + ok, _ := gcController.assignWaitingGCWorker(pp, now) + if !ok { + // No more mark workers needed. + break + } + + // Got a worker, P is now runnable. + // + // mget may return nil if there aren't enough Ms, in + // which case startTheWorldWithSema will start one. + // + // N.B. findRunnableGCWorker will make the worker G + // itself runnable. + idlePs = pp.link.ptr() + mp := mget() + pp.m.set(mp) + pp.link.set(runnablePs) + runnablePs = pp + } + } + + // Finally, any remaining Ps are truly idle. + for idlePs != nil { + pp := idlePs + idlePs = pp.link.ptr() + pidleput(pp, now) + } + stealOrder.reset(uint32(nprocs)) var int32p *int32 = &gomaxprocs // make compiler check that gomaxprocs is an int32 atomic.Store((*uint32)(unsafe.Pointer(int32p)), uint32(nprocs)) @@ -6183,6 +6256,10 @@ func releasepNoTrace() *p { print("releasep: m=", gp.m, " m->p=", gp.m.p.ptr(), " p->m=", hex(pp.m), " p->status=", pp.status, "\n") throw("releasep: invalid p state") } + + // P must clear if nextGCMarkWorker if it stops. + gcController.releaseNextGCMarkWorker(pp) + gp.m.p = 0 pp.m = 0 pp.status = _Pidle diff --git a/src/runtime/proc_test.go b/src/runtime/proc_test.go index b3084f4895..35a1aeab1f 100644 --- a/src/runtime/proc_test.go +++ b/src/runtime/proc_test.go @@ -1221,7 +1221,7 @@ func TestTraceSTW(t *testing.T) { var errors int for i := range runs { - err := runTestTracesSTW(t, i) + err := runTestTracesSTW(t, i, "TraceSTW", "stop-the-world (read mem stats)") if err != nil { t.Logf("Run %d failed: %v", i, err) errors++ @@ -1235,7 +1235,43 @@ func TestTraceSTW(t *testing.T) { } } -func runTestTracesSTW(t *testing.T, run int) (err error) { +// TestTraceGCSTW verifies that goroutines continue running on the same M and P +// after a GC STW. +func TestTraceGCSTW(t *testing.T) { + // Very similar to TestTraceSTW, but using a STW that starts the GC. + // When the GC starts, the background GC mark workers start running, + // which provide an additional source of disturbance to the scheduler. + // + // procresize assigns GC workers to previously-idle Ps to avoid + // changing what the previously-running Ps are doing. + + if testing.Short() { + t.Skip("skipping in -short mode") + } + + if runtime.NumCPU() < 8 { + t.Skip("This test sets GOMAXPROCS=8 and wants to avoid thread descheduling as much as possible. Skip on machines with less than 8 CPUs") + } + + const runs = 50 + + var errors int + for i := range runs { + err := runTestTracesSTW(t, i, "TraceGCSTW", "stop-the-world (GC sweep termination)") + if err != nil { + t.Logf("Run %d failed: %v", i, err) + errors++ + } + } + + pct := float64(errors)/float64(runs) + t.Logf("Errors: %d/%d = %f%%", errors, runs, 100*pct) + if pct > 0.25 { + t.Errorf("Error rate too high") + } +} + +func runTestTracesSTW(t *testing.T, run int, name, stwType string) (err error) { t.Logf("Run %d", run) // By default, TSAN sleeps for 1s at exit to allow background @@ -1243,7 +1279,7 @@ func runTestTracesSTW(t *testing.T, run int) (err error) { // much, since we are running 50 iterations, so disable the sleep. // // Outside of race mode, GORACE does nothing. - buf := []byte(runTestProg(t, "testprog", "TraceSTW", "GORACE=atexit_sleep_ms=0")) + buf := []byte(runTestProg(t, "testprog", name, "GORACE=atexit_sleep_ms=0")) // We locally "fail" the run (return an error) if the trace exhibits // unwanted scheduling. i.e., the target goroutines did not remain on @@ -1253,7 +1289,7 @@ func runTestTracesSTW(t *testing.T, run int) (err error) { // occur, such as a trace parse error. defer func() { if err != nil || t.Failed() { - testtrace.Dump(t, fmt.Sprintf("TestTraceSTW-run%d", run), []byte(buf), false) + testtrace.Dump(t, fmt.Sprintf("Test%s-run%d", name, run), []byte(buf), false) } }() @@ -1509,12 +1545,10 @@ findEnd: break findEnd case trace.EventRangeBegin: r := ev.Range() - if r.Name == "stop-the-world (read mem stats)" { + if r.Name == stwType { // Note when we see the STW begin. This is not // load bearing; it's purpose is simply to fail - // the test if we manage to remove the STW from - // ReadMemStat, so we remember to change this - // test to add some new source of STW. + // the test if we accidentally remove the STW. stwSeen = true } } diff --git a/src/runtime/testdata/testprog/stw_trace.go b/src/runtime/testdata/testprog/stw_trace.go index 0fed55b875..0fa15da09e 100644 --- a/src/runtime/testdata/testprog/stw_trace.go +++ b/src/runtime/testdata/testprog/stw_trace.go @@ -7,15 +7,18 @@ package main import ( "context" "log" + "math/rand/v2" "os" "runtime" "runtime/debug" + "runtime/metrics" "runtime/trace" "sync/atomic" ) func init() { register("TraceSTW", TraceSTW) + register("TraceGCSTW", TraceGCSTW) } // The parent writes to ping and waits for the children to write back @@ -53,7 +56,7 @@ func TraceSTW() { // https://go.dev/issue/65694). Alternatively, we could just ignore the // trace if the GC runs. runtime.GOMAXPROCS(4) - debug.SetGCPercent(0) + debug.SetGCPercent(-1) if err := trace.Start(os.Stdout); err != nil { log.Fatalf("failed to start tracing: %v", err) @@ -86,6 +89,112 @@ func TraceSTW() { stop.Store(true) } +// Variant of TraceSTW for GC STWs. We want the GC mark workers to start on +// previously-idle Ps, rather than bumping the current P. +func TraceGCSTW() { + ctx := context.Background() + + // The idea here is to have 2 target goroutines that are constantly + // running. When the world restarts after STW, we expect these + // goroutines to continue execution on the same M and P. + // + // Set GOMAXPROCS=8 to make room for the 2 target goroutines, 1 parent, + // 2 dedicated workers, and a bit of slack. + // + // Disable the GC initially so we can be sure it only triggers once we + // are ready. + runtime.GOMAXPROCS(8) + debug.SetGCPercent(-1) + + if err := trace.Start(os.Stdout); err != nil { + log.Fatalf("failed to start tracing: %v", err) + } + defer trace.Stop() + + for i := range 2 { + go traceSTWTarget(i) + } + + // Wait for children to start running. + ping.Store(1) + for pong[0].Load() != 1 {} + for pong[1].Load() != 1 {} + + trace.Log(ctx, "TraceSTW", "start") + + // STW + triggerGC() + + // Make sure to run long enough for the children to schedule again + // after STW. This is included for good measure, but the goroutines + // really ought to have already scheduled since the entire GC + // completed. + ping.Store(2) + for pong[0].Load() != 2 {} + for pong[1].Load() != 2 {} + + trace.Log(ctx, "TraceSTW", "end") + + stop.Store(true) +} + +func triggerGC() { + // Allocate a bunch to trigger the GC rather than using runtime.GC. The + // latter blocks until the GC is complete, which is convenient, but + // messes with scheduling as it gives this P a chance to steal the + // other goroutines before their Ps get up and running again. + + // Bring heap size up prior to enabling the GC to ensure that there is + // a decent amount of work in case the GC triggers immediately upon + // re-enabling. + for range 1000 { + alloc() + } + + sample := make([]metrics.Sample, 1) + sample[0].Name = "/gc/cycles/total:gc-cycles" + metrics.Read(sample) + + start := sample[0].Value.Uint64() + + debug.SetGCPercent(100) + + // Keep allocating until the GC is complete. We really only need to + // continue until the mark workers are scheduled, but there isn't a + // good way to measure that. + for { + metrics.Read(sample) + if sample[0].Value.Uint64() != start { + return + } + + alloc() + } +} + +// Allocate a tree data structure to generate plenty of scan work for the GC. + +type node struct { + children []*node +} + +var gcSink node + +func alloc() { + // 10% chance of adding a node a each layer. + + curr := &gcSink + for { + if len(curr.children) == 0 || rand.Float32() < 0.1 { + curr.children = append(curr.children, new(node)) + return + } + + i := rand.IntN(len(curr.children)) + curr = curr.children[i] + } +} + // Manually insert a morestack call. Leaf functions can omit morestack, but // non-leaf functions should include them. -- cgit v1.3-5-g9baa