diff options
| author | Michael Pratt <mpratt@google.com> | 2025-11-17 16:08:21 -0500 |
|---|---|---|
| committer | Gopher Robot <gobot@golang.org> | 2025-11-20 08:10:14 -0800 |
| commit | a18aff805706bfdaeb9aca042111fae32f9f8b61 (patch) | |
| tree | d2dbb7b60f0dc01346d7f28444aade79efee655a /src/runtime/proc_test.go | |
| parent | 829779f4fe7e002b959a2f4966aa9e21c59e418c (diff) | |
| download | go-a18aff805706bfdaeb9aca042111fae32f9f8b61.tar.xz | |
runtime: select GC mark workers during start-the-world
When the GC starts today, procresize and startTheWorldWithSema don't
consider the additional Ps required to run the mark workers. procresize
and startTheWorldWithSema resume only the Ps necessary to run the normal
user goroutines.
Once those Ps start, findRunnable and findRunnableGCWorker determine
that a GC worker is necessary and run the worker instead, calling wakep
to wake another P to run the original user goroutine.
This is unfortunate because it disrupts the intentional placement of Ps
on Ms that procresize does. It also has the unfortunate side effect of
slightly delaying start-the-world time, as it takes several sequential
wakeps to get all Ps started.
To address this, procresize explicitly assigns GC mark workers to Ps
before starting the world. The assignment occurs _after_ selecting
runnable Ps, so that we prefer to select Ps that were previously idle.
Note that if fewer than 25% of Ps are idle then we won't be able to
assign all dedicated workers, and some of the Ps intended for user
goroutines will convert to dedicated workers once they reach
findRunnableGCWorker.
Also note that stack scanning temporarily suspends the goroutine. Resume
occurs through ready, which will move the goroutine to the local runq of
the P that did the scan. Thus there is still a source of migration at
some point during the GC.
For #65694.
Cq-Include-Trybots: luci.golang.try:gotip-linux-amd64-longtest
Change-Id: I6a6a636c51f39f4f4bc716aa87de68f6ebe163a5
Reviewed-on: https://go-review.googlesource.com/c/go/+/721002
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Michael Pratt <mpratt@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Diffstat (limited to 'src/runtime/proc_test.go')
| -rw-r--r-- | src/runtime/proc_test.go | 50 |
1 files changed, 42 insertions, 8 deletions
diff --git a/src/runtime/proc_test.go b/src/runtime/proc_test.go index b3084f4895..35a1aeab1f 100644 --- a/src/runtime/proc_test.go +++ b/src/runtime/proc_test.go @@ -1221,7 +1221,7 @@ func TestTraceSTW(t *testing.T) { var errors int for i := range runs { - err := runTestTracesSTW(t, i) + err := runTestTracesSTW(t, i, "TraceSTW", "stop-the-world (read mem stats)") if err != nil { t.Logf("Run %d failed: %v", i, err) errors++ @@ -1235,7 +1235,43 @@ func TestTraceSTW(t *testing.T) { } } -func runTestTracesSTW(t *testing.T, run int) (err error) { +// TestTraceGCSTW verifies that goroutines continue running on the same M and P +// after a GC STW. +func TestTraceGCSTW(t *testing.T) { + // Very similar to TestTraceSTW, but using a STW that starts the GC. + // When the GC starts, the background GC mark workers start running, + // which provide an additional source of disturbance to the scheduler. + // + // procresize assigns GC workers to previously-idle Ps to avoid + // changing what the previously-running Ps are doing. + + if testing.Short() { + t.Skip("skipping in -short mode") + } + + if runtime.NumCPU() < 8 { + t.Skip("This test sets GOMAXPROCS=8 and wants to avoid thread descheduling as much as possible. Skip on machines with less than 8 CPUs") + } + + const runs = 50 + + var errors int + for i := range runs { + err := runTestTracesSTW(t, i, "TraceGCSTW", "stop-the-world (GC sweep termination)") + if err != nil { + t.Logf("Run %d failed: %v", i, err) + errors++ + } + } + + pct := float64(errors)/float64(runs) + t.Logf("Errors: %d/%d = %f%%", errors, runs, 100*pct) + if pct > 0.25 { + t.Errorf("Error rate too high") + } +} + +func runTestTracesSTW(t *testing.T, run int, name, stwType string) (err error) { t.Logf("Run %d", run) // By default, TSAN sleeps for 1s at exit to allow background @@ -1243,7 +1279,7 @@ func runTestTracesSTW(t *testing.T, run int) (err error) { // much, since we are running 50 iterations, so disable the sleep. // // Outside of race mode, GORACE does nothing. - buf := []byte(runTestProg(t, "testprog", "TraceSTW", "GORACE=atexit_sleep_ms=0")) + buf := []byte(runTestProg(t, "testprog", name, "GORACE=atexit_sleep_ms=0")) // We locally "fail" the run (return an error) if the trace exhibits // unwanted scheduling. i.e., the target goroutines did not remain on @@ -1253,7 +1289,7 @@ func runTestTracesSTW(t *testing.T, run int) (err error) { // occur, such as a trace parse error. defer func() { if err != nil || t.Failed() { - testtrace.Dump(t, fmt.Sprintf("TestTraceSTW-run%d", run), []byte(buf), false) + testtrace.Dump(t, fmt.Sprintf("Test%s-run%d", name, run), []byte(buf), false) } }() @@ -1509,12 +1545,10 @@ findEnd: break findEnd case trace.EventRangeBegin: r := ev.Range() - if r.Name == "stop-the-world (read mem stats)" { + if r.Name == stwType { // Note when we see the STW begin. This is not // load bearing; it's purpose is simply to fail - // the test if we manage to remove the STW from - // ReadMemStat, so we remember to change this - // test to add some new source of STW. + // the test if we accidentally remove the STW. stwSeen = true } } |
