From eb3c6a93c3236bbde5dee6cc5bd4ca9f8ab1647a Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Mon, 10 Aug 2020 20:02:22 +0000 Subject: runtime: disable stack shrinking in activeStackChans race window Currently activeStackChans is set before a goroutine blocks on a channel operation in an unlockf passed to gopark. The trouble is that the unlockf is called *after* the G's status is changed, and the G's status is what is used by a concurrent mark worker (calling suspendG) to determine that a G has successfully been suspended. In this window between the status change and unlockf, the mark worker could try to shrink the G's stack, and in particular observe that activeStackChans is false. This observation will cause the mark worker to *not* synchronize with concurrent channel operations when it should, and so updating pointers in the sudog for the blocked goroutine (which may point to the goroutine's stack) races with channel operations which may also manipulate the pointer (read it, dereference it, update it, etc.). Fix the problem by adding a new atomically-updated flag to the g struct called parkingOnChan, which is non-zero in the race window above. Then, in isShrinkStackSafe, check if parkingOnChan is zero. The race is resolved like so: * Blocking G sets parkingOnChan, then changes status in gopark. * Mark worker successfully suspends blocking G. * If the mark worker observes parkingOnChan is non-zero when checking isShrinkStackSafe, then it's not safe to shrink (we're in the race window). * If the mark worker observes parkingOnChan as zero, then because the mark worker observed the G status change, it can be sure that gopark's unlockf completed, and gp.activeStackChans will be correct. The risk of this change is low, since although it reduces the number of places that stack shrinking is allowed, the window here is incredibly small. Essentially, every place that it might crash now is replaced with no shrink. This change adds a test, but the race window is so small that it's hard to trigger without a well-placed sleep in park_m. Also, this change fixes stackGrowRecursive in proc_test.go to actually allocate a 128-byte stack frame. It turns out the compiler was destructuring the "pad" field and only allocating one uint64 on the stack. Fixes #40641. Change-Id: I7dfbe7d460f6972b8956116b137bc13bc24464e8 Reviewed-on: https://go-review.googlesource.com/c/go/+/247050 Run-TryBot: Michael Knyszek TryBot-Result: Go Bot Reviewed-by: Michael Pratt Trust: Michael Knyszek --- src/runtime/chan_test.go | 56 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'src/runtime/chan_test.go') diff --git a/src/runtime/chan_test.go b/src/runtime/chan_test.go index 039a086e9b..756bbbeccf 100644 --- a/src/runtime/chan_test.go +++ b/src/runtime/chan_test.go @@ -623,6 +623,62 @@ func TestShrinkStackDuringBlockedSend(t *testing.T) { <-done } +func TestNoShrinkStackWhileParking(t *testing.T) { + // The goal of this test is to trigger a "racy sudog adjustment" + // throw. Basically, there's a window between when a goroutine + // becomes available for preemption for stack scanning (and thus, + // stack shrinking) but before the goroutine has fully parked on a + // channel. See issue 40641 for more details on the problem. + // + // The way we try to induce this failure is to set up two + // goroutines: a sender and a reciever that communicate across + // a channel. We try to set up a situation where the sender + // grows its stack temporarily then *fully* blocks on a channel + // often. Meanwhile a GC is triggered so that we try to get a + // mark worker to shrink the sender's stack and race with the + // sender parking. + // + // Unfortunately the race window here is so small that we + // either need a ridiculous number of iterations, or we add + // "usleep(1000)" to park_m, just before the unlockf call. + const n = 10 + send := func(c chan<- int, done chan struct{}) { + for i := 0; i < n; i++ { + c <- i + // Use lots of stack briefly so that + // the GC is going to want to shrink us + // when it scans us. Make sure not to + // do any function calls otherwise + // in order to avoid us shrinking ourselves + // when we're preempted. + stackGrowthRecursive(20) + } + done <- struct{}{} + } + recv := func(c <-chan int, done chan struct{}) { + for i := 0; i < n; i++ { + // Sleep here so that the sender always + // fully blocks. + time.Sleep(10 * time.Microsecond) + <-c + } + done <- struct{}{} + } + for i := 0; i < n*20; i++ { + c := make(chan int) + done := make(chan struct{}) + go recv(c, done) + go send(c, done) + // Wait a little bit before triggering + // the GC to make sure the sender and + // reciever have gotten into their groove. + time.Sleep(50 * time.Microsecond) + runtime.GC() + <-done + <-done + } +} + func TestSelectDuplicateChannel(t *testing.T) { // This test makes sure we can queue a G on // the same channel multiple times. -- cgit v1.3