diff options
| author | Lasse Folger <lassefolger@google.com> | 2023-11-24 15:30:07 +0000 |
|---|---|---|
| committer | Gopher Robot <gobot@golang.org> | 2023-11-27 16:49:24 +0000 |
| commit | 6317fb41e920cc593620d4b9a55376472fc02c07 (patch) | |
| tree | 5008e5b12850909a494a2bc9e186fa78d85a620a /src/testing/testing.go | |
| parent | d961b12be9001cf8dbf8f52847607dbf84d94f8d (diff) | |
| download | go-6317fb41e920cc593620d4b9a55376472fc02c07.tar.xz | |
Revert "testing: simplify concurrency and cleanup logic"
reverts CL 506755
Reason for revert: This leads to deadlocks of tests in Google internal testing.
For #64402
Change-Id: I78329fc9dcc2377e7e880b264ac1d18d577ef99a
Reviewed-on: https://go-review.googlesource.com/c/go/+/544895
Auto-Submit: Bryan Mills <bcmills@google.com>
Auto-Submit: Lasse Folger <lassefolger@google.com>
Reviewed-by: Bryan Mills <bcmills@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Bryan Mills <bcmills@google.com>
Diffstat (limited to 'src/testing/testing.go')
| -rw-r--r-- | src/testing/testing.go | 402 |
1 file changed, 201 insertions(+), 201 deletions(-)
diff --git a/src/testing/testing.go b/src/testing/testing.go index 96f71c89b9..ed8b3630f1 100644 --- a/src/testing/testing.go +++ b/src/testing/testing.go @@ -480,7 +480,7 @@ var ( cpuList []int testlogFile *os.File - anyFailed atomic.Bool + numFailed atomic.Uint32 // number of test failures running sync.Map // map[string]time.Time of running, unpaused tests ) @@ -593,40 +593,38 @@ const maxStackLen = 50 // captures common methods such as Errorf. type common struct { mu sync.RWMutex // guards this group of fields - output []byte // output generated by test or benchmark - w io.Writer // output to which flushToParent should write - ranAnyLeaf bool // Test or benchmark ran to completion without calling Run itself, or had a subtest that did so. + output []byte // Output generated by test or benchmark. + w io.Writer // For flushToParent. + ran bool // Test or benchmark (or one of its subtests) was executed. failed bool // Test or benchmark has failed. skipped bool // Test or benchmark has been skipped. - goexiting bool // Test function is attempting to abort by goexit (implies failed || skipped). done bool // Test is finished and all subtests have completed. helperPCs map[uintptr]struct{} // functions to be skipped when writing file/line info helperNames map[string]struct{} // helperPCs converted to function names cleanups []func() // optional functions to be called at the end of the test - cleanupName string // name of the cleanup function currently running - cleanupPc []uintptr // stack trace at the point where Cleanup was called + cleanupName string // Name of the cleanup function. + cleanupPc []uintptr // The stack trace at the point where Cleanup was called. + finished bool // Test function has completed. + inFuzzFn bool // Whether the fuzz target, if this is one, is running. - chatty *chattyPrinter // copy of chattyPrinter, if the chatty flag is set - bench bool // Current test is a benchmark. 
- runner string // function name of tRunner running the test - isParallel bool // Test is running in parallel. - inFuzzFn bool // Fuzz target, if this is one, is running. - inCleanup bool // Cleanup callbacks, if any, are running. + chatty *chattyPrinter // A copy of chattyPrinter, if the chatty flag is set. + bench bool // Whether the current test is a benchmark. + hasSub atomic.Bool // whether there are sub-benchmarks. + cleanupStarted atomic.Bool // Registered cleanup callbacks have started to execute + runner string // Function name of tRunner running the test. + isParallel bool // Whether the test is parallel. - parent *common - level int // nesting depth of test or benchmark - creator []uintptr // if level > 0, the stack trace at the point where the parent called t.Run - name string // name of test or benchmark - start time.Time // time test or benchmark started or resumed - duration time.Duration // time in the test function, excluding time blocked in t.Parallel - runParallel chan struct{} // Closed when parallel subtests may start. Nil when T.Parallel is not present (B) or not usable (when fuzzing). - doneOrParallel chan struct{} // Closed when the test is either blocked in Parallel or done running. + parent *common + level int // Nesting depth of test or benchmark. + creator []uintptr // If level > 0, the stack trace at the point where the parent called t.Run. + name string // Name of test or benchmark. + start time.Time // Time test or benchmark started + duration time.Duration + barrier chan bool // To signal parallel subtests they may start. Nil when T.Parallel is not present (B) or not usable (when fuzzing). + signal chan bool // To signal a test is done. + sub []*T // Queue of subtests to be run in parallel. - hasSub atomic.Bool // Test or benchmark has subtests or sub-benchmarks. - parallelSubtests sync.WaitGroup - runMu sync.Mutex // Held during calls to Run to prevent the total number of active subtests from exceeding the parallelism limit. 
- - lastRaceErrors atomic.Int64 // max value of race.Errors seen during the test or its subtests + lastRaceErrors atomic.Int64 // Max value of race.Errors seen during the test or its subtests. raceErrorLogged atomic.Bool tempDirMu sync.Mutex @@ -933,13 +931,13 @@ func (c *common) Name() string { return c.name } -func (c *common) setRanLeaf() { +func (c *common) setRan() { if c.parent != nil { - c.parent.setRanLeaf() + c.parent.setRan() } c.mu.Lock() defer c.mu.Unlock() - c.ranAnyLeaf = true + c.ran = true } // Fail marks the function as having failed but continues execution. @@ -954,7 +952,6 @@ func (c *common) Fail() { panic("Fail in goroutine after " + c.name + " has completed") } c.failed = true - anyFailed.Store(true) } // Failed reports whether the function has failed. @@ -1003,7 +1000,7 @@ func (c *common) FailNow() { // a top-of-stack deferred function now, we know that the send // only happens after any other stacked defers have completed. c.mu.Lock() - c.goexiting = true + c.finished = true c.mu.Unlock() runtime.Goexit() } @@ -1118,7 +1115,7 @@ func (c *common) SkipNow() { c.checkFuzzFn("SkipNow") c.mu.Lock() c.skipped = true - c.goexiting = true + c.finished = true c.mu.Unlock() runtime.Goexit() } @@ -1321,8 +1318,8 @@ const ( // If ph is recoverAndReturnPanic, it will catch panics, and return the // recovered value if any. func (c *common) runCleanup(ph panicHandling) (panicVal any) { - c.inCleanup = true - defer func() { c.inCleanup = false }() + c.cleanupStarted.Store(true) + defer c.cleanupStarted.Store(false) if ph == recoverAndReturnPanic { defer func() { @@ -1449,7 +1446,8 @@ func (t *T) Parallel() { if t.isEnvSet { panic("testing: t.Parallel called after t.Setenv; cannot set environment variables in parallel tests") } - if t.parent.runParallel == nil { + t.isParallel = true + if t.parent.barrier == nil { // T.Parallel has no effect when fuzzing. 
// Multiple processes may run in parallel, but only one input can run at a // time per process so we can attribute crashes to specific inputs. @@ -1462,7 +1460,7 @@ func (t *T) Parallel() { t.duration += time.Since(t.start) // Add to the list of tests to be released by the parent. - t.parent.parallelSubtests.Add(1) + t.parent.sub = append(t.parent.sub, t) // Report any races during execution of this test up to this point. // @@ -1481,19 +1479,9 @@ func (t *T) Parallel() { } running.Delete(t.name) - t.isParallel = true - - // Release the parent test to run. We can't just use parallelSem tokens for - // this because some callers (notably TestParallelSub) expect to be able to - // call Run from multiple goroutines and have those calls succeed. - // - // Instead, we close a channel to unblock the parent's call to Run, allowing - // it to resume. Then, we wait for it to finish and unblock its parallel - // subtests, and acquire a parallel run token so that we don't run too many of - // the subtests together all at once. - close(t.doneOrParallel) - <-t.parent.runParallel - t.context.acquireParallel() + t.signal <- true // Release calling test. + <-t.parent.barrier // Wait for the parent test to complete. + t.context.waitParallel() if t.chatty != nil { t.chatty.Updatef(t.name, "=== CONT %s\n", t.name) @@ -1550,13 +1538,19 @@ var errNilPanicOrGoexit = errors.New("test executed panic(nil) or runtime.Goexit func tRunner(t *T, fn func(t *T)) { t.runner = callerName(0) - returned := false // When this goroutine is done, either because fn(t) // returned normally or because a test failure triggered // a call to runtime.Goexit, record the duration and send // a signal saying that the test is done. defer func() { + t.checkRaces() + + // TODO(#61034): This is the wrong place for this check. + if t.Failed() { + numFailed.Add(1) + } + // Check if the test panicked or Goexited inappropriately. // // If this happens in a normal test, print output but continue panicking. 
@@ -1565,120 +1559,132 @@ func tRunner(t *T, fn func(t *T)) { // If this happens while fuzzing, recover from the panic and treat it like a // normal failure. It's important that the process keeps running in order to // find short inputs that cause panics. - panicVal := recover() - if !returned && !t.goexiting && panicVal == nil { - panicVal = errNilPanicOrGoexit + err := recover() + signal := true + + t.mu.RLock() + finished := t.finished + t.mu.RUnlock() + if !finished && err == nil { + err = errNilPanicOrGoexit for p := t.parent; p != nil; p = p.parent { p.mu.RLock() - pGoexiting := p.goexiting + finished = p.finished p.mu.RUnlock() - if pGoexiting { - t.Errorf("%v: subtest may have called FailNow on a parent test", panicVal) - panicVal = nil + if finished { + if !t.isParallel { + t.Errorf("%v: subtest may have called FailNow on a parent test", err) + err = nil + } + signal = false break } } } - if panicVal != nil && t.context.isFuzzing { + if err != nil && t.context.isFuzzing { prefix := "panic: " - if panicVal == errNilPanicOrGoexit { + if err == errNilPanicOrGoexit { prefix = "" } - t.Errorf("%s%s\n%s\n", prefix, panicVal, string(debug.Stack())) - panicVal = nil - } - - if panicVal != nil { - // Mark the test as failed so that Cleanup functions can see its correct status. - t.Fail() - } else if t.runParallel != nil { - // Run parallel subtests. - - // Check for races before starting parallel subtests, so that if a - // parallel subtest *also* triggers a data race we will report the two - // races to the two tests and not attribute all of them to the subtest. - t.checkRaces() - - if t.isParallel { - // Release our own parallel token first, so that it is available for - // subtests to acquire. - t.context.releaseParallel() - } - close(t.runParallel) - t.parallelSubtests.Wait() - if t.isParallel { - // Re-acquire a parallel token to limit concurrent cleanup. 
- t.context.acquireParallel() - } + t.Errorf("%s%s\n%s\n", prefix, err, string(debug.Stack())) + t.mu.Lock() + t.finished = true + t.mu.Unlock() + err = nil } // Use a deferred call to ensure that we report that the test is // complete even if a cleanup function calls t.FailNow. See issue 41355. + didPanic := false defer func() { - cleanupPanic := recover() - if panicVal == nil { - panicVal = cleanupPanic - } - // Only report that the test is complete if it doesn't panic, // as otherwise the test binary can exit before the panic is // reported to the user. See issue 41479. - if panicVal != nil { - // Flush the output log up to the root before dying. - for root := &t.common; root.parent != nil; root = root.parent { - root.mu.Lock() - root.duration += time.Since(root.start) - d := root.duration - root.mu.Unlock() - root.flushToParent(root.name, "--- FAIL: %s (%s)\n", root.name, fmtDuration(d)) + if didPanic { + return + } + if err != nil { + panic(err) + } + running.Delete(t.name) + t.signal <- signal + }() - // Since the parent will never finish running, do its cleanup now. - // Run the cleanup in a fresh goroutine in case it calls runtime.Goexit, - // which we cannot recover. - cleanupDone := make(chan struct{}) - go func() { - defer close(cleanupDone) - if r := root.parent.runCleanup(recoverAndReturnPanic); r != nil { - fmt.Fprintf(root.parent.w, "cleanup panicked with %v", r) - } - }() - <-cleanupDone + doPanic := func(err any) { + t.Fail() + if r := t.runCleanup(recoverAndReturnPanic); r != nil { + t.Logf("cleanup panicked with %v", r) + } + // Flush the output log up to the root before dying. 
+ for root := &t.common; root.parent != nil; root = root.parent { + root.mu.Lock() + root.duration += time.Since(root.start) + d := root.duration + root.mu.Unlock() + root.flushToParent(root.name, "--- FAIL: %s (%s)\n", root.name, fmtDuration(d)) + if r := root.parent.runCleanup(recoverAndReturnPanic); r != nil { + fmt.Fprintf(root.parent.w, "cleanup panicked with %v", r) } - panic(panicVal) } + didPanic = true + panic(err) + } + if err != nil { + doPanic(err) + } - t.checkRaces() - t.duration += time.Since(t.start) - t.report() + t.duration += time.Since(t.start) - // Do not lock t.done to allow race detector to detect race in case - // the user does not appropriately synchronize a goroutine. - t.done = true - if t.parent != nil && !t.hasSub.Load() { - t.setRanLeaf() + if len(t.sub) > 0 { + // Run parallel subtests. + // Decrease the running count for this test. + t.context.release() + // Release the parallel subtests. + close(t.barrier) + // Wait for subtests to complete. + for _, sub := range t.sub { + <-sub.signal } - - running.Delete(t.name) - if t.isParallel { - t.context.releaseParallel() - t.parent.parallelSubtests.Done() - } else { - close(t.doneOrParallel) + cleanupStart := time.Now() + err := t.runCleanup(recoverAndReturnPanic) + t.duration += time.Since(cleanupStart) + if err != nil { + doPanic(err) } - }() + t.checkRaces() + if !t.isParallel { + // Reacquire the count for sequential tests. See comment in Run. + t.context.waitParallel() + } + } else if t.isParallel { + // Only release the count for this test if it was run as a parallel + // test. See comment in Run method. + t.context.release() + } + t.report() // Report after all subtests have finished. - t.runCleanup(normalPanic) + // Do not lock t.done to allow race detector to detect race in case + // the user does not appropriately synchronize a goroutine. 
+ t.done = true + if t.parent != nil && !t.hasSub.Load() { + t.setRan() + } + }() + defer func() { + if len(t.sub) == 0 { + t.runCleanup(normalPanic) + } }() - // Run the actual test function. t.start = time.Now() t.resetRaces() fn(t) - // Code beyond this point will not be executed when FailNow or SkipNow - // is invoked. - returned = true + // code beyond here will not be executed when FailNow is invoked + t.mu.Lock() + t.finished = true + t.mu.Unlock() } // Run runs f as a subtest of t called name. It runs f in a separate goroutine @@ -1688,7 +1694,7 @@ func tRunner(t *T, fn func(t *T)) { // Run may be called simultaneously from multiple goroutines, but all such calls // must return before the outer test function for t returns. func (t *T) Run(name string, f func(t *T)) bool { - if t.inCleanup { + if t.cleanupStarted.Load() { panic("testing: t.Run called during t.Cleanup") } @@ -1702,56 +1708,40 @@ func (t *T) Run(name string, f func(t *T)) bool { // continue walking the stack into the parent test. var pc [maxStackLen]uintptr n := runtime.Callers(2, pc[:]) - sub := &T{ + t = &T{ common: common{ - runParallel: make(chan struct{}), - doneOrParallel: make(chan struct{}), - name: testName, - parent: &t.common, - level: t.level + 1, - creator: pc[:n], - chatty: t.chatty, + barrier: make(chan bool), + signal: make(chan bool, 1), + name: testName, + parent: &t.common, + level: t.level + 1, + creator: pc[:n], + chatty: t.chatty, }, context: t.context, } - sub.w = indenter{&sub.common} - - // Ensure that only one non-parallel subtest runs at a time so that we don't - // exceed the -parallel setting due to concurrent calls. - // (Run may be called concurrently even if the test is not marked parallel — - // see TestParallelSub.) - t.runMu.Lock() - defer t.runMu.Unlock() - - if t.isParallel { - // Release our parallelism token for the subtest to use - // for its own parallel subtests. 
- t.context.releaseParallel() - defer t.context.acquireParallel() - } + t.w = indenter{&t.common} - if sub.chatty != nil { - sub.chatty.Updatef(sub.name, "=== RUN %s\n", sub.name) + if t.chatty != nil { + t.chatty.Updatef(t.name, "=== RUN %s\n", t.name) } - running.Store(sub.name, time.Now()) + running.Store(t.name, time.Now()) // Instead of reducing the running count of this test before calling the // tRunner and increasing it afterwards, we rely on tRunner keeping the // count correct. This ensures that a sequence of sequential tests runs // without being preempted, even when their parent is a parallel test. This // may especially reduce surprises if *parallel == 1. - go tRunner(sub, f) - <-sub.doneOrParallel - if t.goexiting { - // The parent test (t) thinks it called runtime.Goexit, but here we are - // still running. It is likely that this subtest called FailNow or SkipNow - // on the t instead of sub, so propagate the Goexit to the parent goroutine. + go tRunner(t, f) + if !<-t.signal { + // At this point, it is likely that FailNow was called on one of the + // parent tests by one of the subtests. Continue aborting up the chain. runtime.Goexit() } if t.chatty != nil && t.chatty.json { - t.chatty.Updatef(t.name, "=== NAME %s\n", t.name) + t.chatty.Updatef(t.parent.name, "=== NAME %s\n", t.parent.name) } - return !sub.failed + return !t.failed } // Deadline reports the time at which the test binary will have @@ -1775,43 +1765,53 @@ type testContext struct { // does not match). isFuzzing bool - // parallelSem is a counting semaphore to limit concurrency of Parallel tests. - // It has a capacity equal to the parallel flag. - // Send a token to acquire; receive to release. - // Non-parallel tests do not require a token. - parallelSem chan token -} + mu sync.Mutex -// A token is a semaphore token. -type token struct{} + // Channel used to signal tests that are ready to be run in parallel. 
+ startParallel chan bool -// newTestContext returns a new testContext with the given parallelism and matcher. -func newTestContext(maxParallel int, m *matcher) *testContext { - tc := &testContext{ - match: m, - parallelSem: make(chan token, maxParallel), - } - return tc + // running is the number of tests currently running in parallel. + // This does not include tests that are waiting for subtests to complete. + running int + + // numWaiting is the number tests waiting to be run in parallel. + numWaiting int + + // maxParallel is a copy of the parallel flag. + maxParallel int } -// acquireParallel blocks until it can obtain a semaphore token for running a -// parallel test. -func (c *testContext) acquireParallel() { - c.parallelSem <- token{} +func newTestContext(maxParallel int, m *matcher) *testContext { + return &testContext{ + match: m, + startParallel: make(chan bool), + maxParallel: maxParallel, + running: 1, // Set the count to 1 for the main (sequential) test. + } } -// releaseParallel returns a semaphore token obtained by acquireParallel. -func (c *testContext) releaseParallel() { - select { - case <-c.parallelSem: - default: - panic("testing: internal error: no parallel token to release") +func (c *testContext) waitParallel() { + c.mu.Lock() + if c.running < c.maxParallel { + c.running++ + c.mu.Unlock() + return } + c.numWaiting++ + c.mu.Unlock() + <-c.startParallel } -// running returns the number of semaphore tokens outstanding. -func (c *testContext) running() int { - return len(c.parallelSem) +func (c *testContext) release() { + c.mu.Lock() + if c.numWaiting == 0 { + c.running-- + c.mu.Unlock() + return + } + c.numWaiting-- + c.mu.Unlock() + c.startParallel <- true // Pick a waiting test to be run. } // No one should be using func Main anymore. 
@@ -2132,9 +2132,9 @@ func runTests(matchString func(pat, str string) (bool, error), tests []InternalT ctx.deadline = deadline t := &T{ common: common{ - doneOrParallel: make(chan struct{}), - runParallel: make(chan struct{}), - w: os.Stdout, + signal: make(chan bool, 1), + barrier: make(chan bool), + w: os.Stdout, }, context: ctx, } @@ -2147,12 +2147,12 @@ func runTests(matchString func(pat, str string) (bool, error), tests []InternalT } }) select { - case <-t.doneOrParallel: + case <-t.signal: default: - panic("internal error: tRunner exited without closing t.doneOrParallel") + panic("internal error: tRunner exited without sending on t.signal") } ok = ok && !t.Failed() - ran = ran || t.ranAnyLeaf + ran = ran || t.ran } } return ran, ok @@ -2390,5 +2390,5 @@ func parseCpuList() { } func shouldFailFast() bool { - return *failFast && anyFailed.Load() + return *failFast && numFailed.Load() > 0 } |
