From a18aff805706bfdaeb9aca042111fae32f9f8b61 Mon Sep 17 00:00:00 2001
From: Michael Pratt
Date: Mon, 17 Nov 2025 16:08:21 -0500
Subject: runtime: select GC mark workers during start-the-world

When the GC starts today, procresize and startTheWorldWithSema don't
consider the additional Ps required to run the mark workers; they
resume only the Ps necessary to run the normal user goroutines. Once
those Ps start, findRunnable and findRunnableGCWorker determine that a
GC worker is necessary and run the worker instead, calling wakep to
wake another P to run the original user goroutine.

This is unfortunate because it disrupts the intentional placement of Ps
on Ms that procresize does. It also slightly delays the end of
start-the-world, as it takes several sequential wakeps to get all Ps
started.

To address this, procresize explicitly assigns GC mark workers to Ps
before starting the world. The assignment occurs _after_ selecting
runnable Ps, so that we prefer to select Ps that were previously idle.

Note that if fewer than 25% of Ps are idle, then we won't be able to
assign all dedicated workers, and some of the Ps intended for user
goroutines will convert to dedicated workers once they reach
findRunnableGCWorker.

Also note that stack scanning temporarily suspends the goroutine.
Resume occurs through ready, which moves the goroutine to the local
runq of the P that did the scan, so there is still a source of
migration at some point during the GC.

For #65694.

Cq-Include-Trybots: luci.golang.try:gotip-linux-amd64-longtest
Change-Id: I6a6a636c51f39f4f4bc716aa87de68f6ebe163a5
Reviewed-on: https://go-review.googlesource.com/c/go/+/721002
LUCI-TryBot-Result: Go LUCI
Auto-Submit: Michael Pratt
Reviewed-by: Michael Knyszek
---
 src/runtime/testdata/testprog/stw_trace.go | 111 ++++++++++++++++++++++++++++-
 1 file changed, 110 insertions(+), 1 deletion(-)

(limited to 'src/runtime/testdata')
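As a rough illustration of the 25% figure above: the pacer aims to spend
25% of GOMAXPROCS on background marking, with whole Ps becoming dedicated
mark workers and any remainder covered by a fractional worker. The sketch
below models that split; markWorkerSplit is illustrative only, not the
runtime's actual pacer computation (which additionally rounds toward the
utilization goal).

    package main

    import "fmt"

    // markWorkerSplit models the 25% background mark utilization split:
    // whole Ps become dedicated mark workers and the remainder is
    // covered by a fractional worker. A simplified sketch, not the
    // runtime's actual pacer code.
    func markWorkerSplit(gomaxprocs int) (dedicated int, fractional float64) {
    	goal := 0.25 * float64(gomaxprocs)
    	dedicated = int(goal)                  // Ps running dedicated workers
    	fractional = goal - float64(dedicated) // leftover, run part-time
    	return dedicated, fractional
    }

    func main() {
    	for _, procs := range []int{6, 8} {
    		d, f := markWorkerSplit(procs)
    		fmt.Printf("GOMAXPROCS=%d: dedicated=%d fractional=%.2f\n", procs, d, f)
    	}
    	// GOMAXPROCS=6: dedicated=1 fractional=0.50
    	// GOMAXPROCS=8: dedicated=2 fractional=0.00
    }

With GOMAXPROCS=8, as TraceGCSTW uses below, this yields the 2 dedicated
workers the test's comment accounts for; with fewer than 2 idle Ps at
start-the-world, not all of them can be pre-assigned.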
diff --git a/src/runtime/testdata/testprog/stw_trace.go b/src/runtime/testdata/testprog/stw_trace.go
index 0fed55b875..0fa15da09e 100644
--- a/src/runtime/testdata/testprog/stw_trace.go
+++ b/src/runtime/testdata/testprog/stw_trace.go
@@ -7,15 +7,18 @@ package main
 import (
 	"context"
 	"log"
+	"math/rand/v2"
 	"os"
 	"runtime"
 	"runtime/debug"
+	"runtime/metrics"
 	"runtime/trace"
 	"sync/atomic"
 )
 
 func init() {
 	register("TraceSTW", TraceSTW)
+	register("TraceGCSTW", TraceGCSTW)
 }
 
 // The parent writes to ping and waits for the children to write back
@@ -53,7 +56,7 @@
 	// https://go.dev/issue/65694). Alternatively, we could just ignore the
 	// trace if the GC runs.
 	runtime.GOMAXPROCS(4)
-	debug.SetGCPercent(0)
+	debug.SetGCPercent(-1)
 	if err := trace.Start(os.Stdout); err != nil {
 		log.Fatalf("failed to start tracing: %v", err)
 	}
@@ -86,6 +89,112 @@
 	stop.Store(true)
 }
 
+// Variant of TraceSTW for GC STWs. We want the GC mark workers to start on
+// previously-idle Ps, rather than bumping the current P.
+func TraceGCSTW() {
+	ctx := context.Background()
+
+	// The idea here is to have 2 target goroutines that are constantly
+	// running. When the world restarts after STW, we expect these
+	// goroutines to continue execution on the same M and P.
+	//
+	// Set GOMAXPROCS=8 to make room for the 2 target goroutines, 1 parent,
+	// 2 dedicated workers, and a bit of slack.
+	//
+	// Disable the GC initially so we can be sure it only triggers once we
+	// are ready.
+	runtime.GOMAXPROCS(8)
+	debug.SetGCPercent(-1)
+
+	if err := trace.Start(os.Stdout); err != nil {
+		log.Fatalf("failed to start tracing: %v", err)
+	}
+	defer trace.Stop()
+
+	for i := range 2 {
+		go traceSTWTarget(i)
+	}
+
+	// Wait for children to start running.
+	ping.Store(1)
+	for pong[0].Load() != 1 {}
+	for pong[1].Load() != 1 {}
+
+	trace.Log(ctx, "TraceSTW", "start")
+
+	// STW
+	triggerGC()
+
+	// Make sure to run long enough for the children to schedule again
+	// after STW. This is included for good measure, but the goroutines
+	// really ought to have already scheduled since the entire GC
+	// completed.
+	ping.Store(2)
+	for pong[0].Load() != 2 {}
+	for pong[1].Load() != 2 {}
+
+	trace.Log(ctx, "TraceSTW", "end")
+
+	stop.Store(true)
+}
+
+func triggerGC() {
+	// Allocate a bunch to trigger the GC rather than using runtime.GC. The
+	// latter blocks until the GC is complete, which is convenient, but
+	// messes with scheduling as it gives this P a chance to steal the
+	// other goroutines before their Ps get up and running again.

+	// Bring heap size up prior to enabling the GC to ensure that there is
+	// a decent amount of work in case the GC triggers immediately upon
+	// re-enabling.
+	for range 1000 {
+		alloc()
+	}
+
+	sample := make([]metrics.Sample, 1)
+	sample[0].Name = "/gc/cycles/total:gc-cycles"
+	metrics.Read(sample)
+
+	start := sample[0].Value.Uint64()
+
+	debug.SetGCPercent(100)
+
+	// Keep allocating until the GC is complete. We really only need to
+	// continue until the mark workers are scheduled, but there isn't a
+	// good way to measure that.
+	for {
+		metrics.Read(sample)
+		if sample[0].Value.Uint64() != start {
+			return
+		}
+
+		alloc()
+	}
+}
+
+// Allocate a tree data structure to generate plenty of scan work for the GC.
+
+type node struct {
+	children []*node
+}
+
+var gcSink node
+
+func alloc() {
+	// 10% chance of adding a node at each layer.
+
+	curr := &gcSink
+	for {
+		if len(curr.children) == 0 || rand.Float32() < 0.1 {
+			curr.children = append(curr.children, new(node))
+			return
+		}
+
+		i := rand.IntN(len(curr.children))
+		curr = curr.children[i]
+	}
+}
+
 // Manually insert a morestack call. Leaf functions can omit morestack, but
 // non-leaf functions should include them.
-- 
cgit v1.3