From 4ebf295b0b1740caac6302cc824ebd0f6175c1d5 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Fri, 24 Oct 2025 15:14:59 -0400 Subject: runtime: prefer to restart Ps on the same M after STW Today, Ps jump around arbitrarily across STW. Instead, try to keep the P on the previous M it ran on. In the future, we'll likely want to try to expand this beyond STW to create a more general affinity for specific Ms. For this to be useful, the Ps need to have runnable Gs. Today, STW preemption goes through goschedImpl, which places the G on the global run queue. If that was the only G then the P won't have runnable goroutines anymore. It makes more sense to keep the G with its P across STW anyway, so add a special case to goschedImpl for that. On my machine, this CL reduces the error rate in TestTraceSTW from 99.8% to 1.9%. As a nearly 2% error rate shows, there are still cases where this best effort scheduling doesn't work. The most obvious is that while procresize assigns Ps back to their original M, startTheWorldWithSema calls wakep to start a spinning M. The spinning M may steal a goroutine from another P if that P is too slow to start. For #65694. Change-Id: I6a6a636c0969c587d039b68bc68ea16c74ff1fc9 Reviewed-on: https://go-review.googlesource.com/c/go/+/714801 Reviewed-by: Michael Knyszek Auto-Submit: Michael Pratt LUCI-TryBot-Result: Go LUCI --- src/runtime/testdata/testprog/stw_mexit.go | 69 +++++++++++++++++++++ src/runtime/testdata/testprog/stw_trace.go | 99 ++++++++++++++++++++++++++++++ 2 files changed, 168 insertions(+) create mode 100644 src/runtime/testdata/testprog/stw_mexit.go create mode 100644 src/runtime/testdata/testprog/stw_trace.go (limited to 'src/runtime/testdata') diff --git a/src/runtime/testdata/testprog/stw_mexit.go b/src/runtime/testdata/testprog/stw_mexit.go new file mode 100644 index 0000000000..b022ef4777 --- /dev/null +++ b/src/runtime/testdata/testprog/stw_mexit.go @@ -0,0 +1,69 @@ +// Copyright 2025 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "runtime" +) + +func init() { + register("mexitSTW", mexitSTW) +} + +// Stress test for pp.oldm pointing to an exited M. +// +// If pp.oldm points to an exited M it should be ignored and another M used +// instead. To stress: +// +// 1. Start and exit many threads (thus setting oldm on some P). +// 2. Meanwhile, frequently stop the world. +// +// If procresize incorrectly attempts to assign a P to an exited M, likely +// failure modes are: +// +// 1. Crash in startTheWorldWithSema attempting to access the M, if it is nil. +// +// 2. Memory corruption elsewhere after startTheWorldWithSema writes to the M, +// if it is not nil, but is freed and reused for another allocation. +// +// 3. Hang on a subsequent stop the world waiting for the P to stop, if the M +// object is valid, but the M is exited, because startTheWorldWithSema didn't +// actually wake anything to run the P. The P is _Pidle, but not in the pidle +// list, thus stopTheWorldWithSema will wait for it to actively stop. +// +// For this to go wrong, an exited M must fail to clear mp.self and must leave +// the M on the sched.midle list. +// +// Similar to TraceSTW. +func mexitSTW() { + // Ensure we have multiple Ps, but not too many, as we want the + // runnable goroutines likely to run on Ps with oldm set. + runtime.GOMAXPROCS(4) + + // Background busy work so there is always something runnable. + for i := range 2 { + go traceSTWTarget(i) + } + + // Wait for children to start running. + ping.Store(1) + for pong[0].Load() != 1 {} + for pong[1].Load() != 1 {} + + for range 100 { + // Exit a thread. The last P to run this will have it in oldm. 
+ go func() { + runtime.LockOSThread() + }() + + // STW + var ms runtime.MemStats + runtime.ReadMemStats(&ms) + } + + stop.Store(true) + + println("OK") +} diff --git a/src/runtime/testdata/testprog/stw_trace.go b/src/runtime/testdata/testprog/stw_trace.go new file mode 100644 index 0000000000..0fed55b875 --- /dev/null +++ b/src/runtime/testdata/testprog/stw_trace.go @@ -0,0 +1,99 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "context" + "log" + "os" + "runtime" + "runtime/debug" + "runtime/trace" + "sync/atomic" +) + +func init() { + register("TraceSTW", TraceSTW) +} + +// The parent writes to ping and waits for the children to write back +// via pong to show that they are running. +var ping atomic.Uint32 +var pong [2]atomic.Uint32 + +// Tell runners to stop. +var stop atomic.Bool + +func traceSTWTarget(i int) { + for !stop.Load() { + // Async preemption often takes 100ms+ to preempt this loop on + // windows-386. This makes the test flaky, as the traceReadCPU + // timer often fires by the time STW finishes, jumbling the + // goroutine scheduling. As a workaround, ensure we have a + // morestack call for prompt preemption. + ensureMorestack() + + pong[i].Store(ping.Load()) + } +} + +func TraceSTW() { + ctx := context.Background() + + // The idea here is to have 2 target goroutines that are constantly + // running. When the world restarts after STW, we expect these + // goroutines to continue execution on the same M and P. + // + // Set GOMAXPROCS=4 to make room for the 2 target goroutines, 1 parent, + // and 1 slack for potential misscheduling. + // + // Disable the GC because GC STW generally moves goroutines (see + // https://go.dev/issue/65694). Alternatively, we could just ignore the + // trace if the GC runs. 
+ runtime.GOMAXPROCS(4) + debug.SetGCPercent(0) + + if err := trace.Start(os.Stdout); err != nil { + log.Fatalf("failed to start tracing: %v", err) + } + defer trace.Stop() + + for i := range 2 { + go traceSTWTarget(i) + } + + // Wait for children to start running. + ping.Store(1) + for pong[0].Load() != 1 {} + for pong[1].Load() != 1 {} + + trace.Log(ctx, "TraceSTW", "start") + + // STW + var ms runtime.MemStats + runtime.ReadMemStats(&ms) + + // Make sure to run long enough for the children to schedule again + // after STW. + ping.Store(2) + for pong[0].Load() != 2 {} + for pong[1].Load() != 2 {} + + trace.Log(ctx, "TraceSTW", "end") + + stop.Store(true) +} + +// Manually insert a morestack call. Leaf functions can omit morestack, but +// non-leaf functions should include them. + +//go:noinline +func ensureMorestack() { + ensureMorestack1() +} + +//go:noinline +func ensureMorestack1() { +} -- cgit v1.3-5-g9baa