aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/mgcmark.go
diff options
context:
space:
mode:
authorAustin Clements <austin@google.com>2016-03-04 11:58:26 -0500
committerAustin Clements <austin@google.com>2016-04-26 23:40:13 +0000
commit2a889b9d931e58166350f785b16edc51e28ef19b (patch)
tree6cfac1e2a45e4dc7fdb557eca7edca37974e8635 /src/runtime/mgcmark.go
parent5b765ce310c594276ea919a9cb455cc894fee999 (diff)
downloadgo-2a889b9d931e58166350f785b16edc51e28ef19b.tar.xz
runtime: make stack re-scan O(# dirty stacks)
Currently the stack re-scan during mark termination is O(# stacks) because we enqueue a root marking job for every goroutine. It takes ~34ns to process this root marking job for a valid (clean) stack, so at around 300k goroutines we exceed the 10ms pause goal. A non-trivial portion of this time is spent simply taking the cache miss to check the gcscanvalid flag, so simply optimizing the path that handles clean stacks can only improve this so much. Fix this by keeping an explicit list of goroutines with dirty stacks that need to be rescanned. When a goroutine first transitions to running after a stack scan and marks its stack dirty, it adds itself to this list. We enqueue root marking jobs only for the goroutines in this list, so this improves stack re-scanning asymptotically by completely eliminating time spent on clean goroutines. This reduces mark termination time for 500k idle goroutines from 15ms to 238µs. Overall performance effect is negligible. name \ 95%ile-time/markTerm old new delta IdleGs/gs:500000/gomaxprocs:12 15000µs ± 0% 238µs ± 5% -98.41% (p=0.000 n=10+10) name old time/op new time/op delta XBenchGarbage-12 2.30ms ± 3% 2.29ms ± 1% -0.43% (p=0.049 n=17+18) name old time/op new time/op delta BinaryTree17-12 2.57s ± 3% 2.59s ± 2% ~ (p=0.141 n=19+20) Fannkuch11-12 2.09s ± 0% 2.10s ± 1% +0.53% (p=0.000 n=19+19) FmtFprintfEmpty-12 45.3ns ± 3% 45.2ns ± 2% ~ (p=0.845 n=20+20) FmtFprintfString-12 129ns ± 0% 127ns ± 0% -1.55% (p=0.000 n=16+16) FmtFprintfInt-12 123ns ± 0% 119ns ± 1% -3.24% (p=0.000 n=19+19) FmtFprintfIntInt-12 195ns ± 1% 189ns ± 1% -3.11% (p=0.000 n=17+17) FmtFprintfPrefixedInt-12 193ns ± 1% 187ns ± 1% -3.06% (p=0.000 n=19+19) FmtFprintfFloat-12 254ns ± 0% 255ns ± 1% +0.35% (p=0.001 n=14+17) FmtManyArgs-12 781ns ± 0% 770ns ± 0% -1.48% (p=0.000 n=16+19) GobDecode-12 7.00ms ± 1% 6.98ms ± 1% ~ (p=0.563 n=19+19) GobEncode-12 5.91ms ± 1% 5.92ms ± 0% ~ (p=0.118 n=19+18) Gzip-12 219ms ± 1% 215ms ± 1% -1.81% (p=0.000 n=18+18) Gunzip-12 37.2ms ± 0% 37.4ms ± 0% +0.45% (p=0.000 n=17+19) HTTPClientServer-12 76.9µs ± 3% 77.5µs ± 2% +0.81% (p=0.030 n=20+19) JSONEncode-12 15.0ms ± 0% 14.8ms ± 1% -0.88% (p=0.001 n=15+19) JSONDecode-12 50.6ms ± 0% 53.2ms ± 2% +5.07% (p=0.000 n=17+19) Mandelbrot200-12 4.05ms ± 0% 4.05ms ± 1% ~ (p=0.581 n=16+17) GoParse-12 3.34ms ± 1% 3.30ms ± 1% -1.21% (p=0.000 n=15+20) RegexpMatchEasy0_32-12 69.6ns ± 1% 69.8ns ± 2% ~ (p=0.566 n=19+19) RegexpMatchEasy0_1K-12 238ns ± 1% 236ns ± 0% -0.91% (p=0.000 n=17+13) RegexpMatchEasy1_32-12 69.8ns ± 1% 70.0ns ± 1% +0.23% (p=0.026 n=17+16) RegexpMatchEasy1_1K-12 371ns ± 1% 363ns ± 1% -2.07% (p=0.000 n=19+19) RegexpMatchMedium_32-12 107ns ± 2% 106ns ± 1% -0.51% (p=0.031 n=18+20) RegexpMatchMedium_1K-12 33.0µs ± 0% 32.9µs ± 0% -0.30% (p=0.004 n=16+16) RegexpMatchHard_32-12 1.70µs ± 0% 1.70µs ± 0% +0.45% (p=0.000 n=16+17) RegexpMatchHard_1K-12 51.1µs ± 2% 51.4µs ± 1% +0.53% (p=0.000 n=17+19) Revcomp-12 378ms ± 1% 385ms ± 1% +1.92% (p=0.000 n=19+18) Template-12 64.3ms ± 2% 65.0ms ± 2% +1.09% (p=0.001 n=19+19) TimeParse-12 315ns ± 1% 317ns ± 2% ~ (p=0.108 n=18+20) TimeFormat-12 360ns ± 1% 337ns ± 0% -6.30% (p=0.000 n=18+13) [Geo mean] 51.8µs 51.6µs -0.48% Change-Id: Icf8994671476840e3998236e15407a505d4c760c Reviewed-on: https://go-review.googlesource.com/20700 Reviewed-by: Rick Hudson <rlh@golang.org> Run-TryBot: Austin Clements <austin@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
Diffstat (limited to 'src/runtime/mgcmark.go')
-rw-r--r--src/runtime/mgcmark.go133
1 files changed, 114 insertions, 19 deletions
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index bad7c7e92b..7f481dee22 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -32,6 +32,8 @@ const (
//
// The caller must have call gcCopySpans().
//
+// The world must be stopped.
+//
//go:nowritebarrier
func gcMarkRootPrepare() {
// Compute how many data and BSS root blocks there are.
@@ -63,24 +65,31 @@ func gcMarkRootPrepare() {
// after concurrent mark. In STW GC, this will happen
// during mark termination.
work.nSpanRoots = (len(work.spans) + rootBlockSpans - 1) / rootBlockSpans
+
+ // On the first markroot, we need to scan all Gs. Gs
+ // may be created after this point, but it's okay that
+ // we ignore them because they begin life without any
+ // roots, so there's nothing to scan, and any roots
+ // they create during the concurrent phase will be
+ // scanned during mark termination. During mark
+ // termination, allglen isn't changing, so we'll scan
+ // all Gs.
+ work.nStackRoots = int(atomic.Loaduintptr(&allglen))
+ work.nRescanRoots = 0
} else {
// We've already scanned span roots and kept the scan
// up-to-date during concurrent mark.
work.nSpanRoots = 0
- }
- // Snapshot of allglen. During concurrent scan, we just need
- // to be consistent about how many markroot jobs we create and
- // how many Gs we check. Gs may be created after this point,
- // but it's okay that we ignore them because they begin life
- // without any roots, so there's nothing to scan, and any
- // roots they create during the concurrent phase will be
- // scanned during mark termination. During mark termination,
- // allglen isn't changing, so we'll scan all Gs.
- work.nStackRoots = int(atomic.Loaduintptr(&allglen))
+ // On the second pass of markroot, we're just scanning
+ // dirty stacks. It's safe to access rescan since the
+ // world is stopped.
+ work.nStackRoots = 0
+ work.nRescanRoots = len(work.rescan.list)
+ }
work.markrootNext = 0
- work.markrootJobs = uint32(fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots)
+ work.markrootJobs = uint32(fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots + work.nRescanRoots)
}
// gcMarkRootCheck checks that all roots have been scanned. It is
@@ -92,11 +101,24 @@ func gcMarkRootCheck() {
}
lock(&allglock)
- // Check that gc work is done.
- for i := 0; i < work.nStackRoots; i++ {
- gp := allgs[i]
- if !gp.gcscandone {
- throw("scan missed a g")
+ // Check that stacks have been scanned.
+ if gcphase == _GCmarktermination {
+ for i := 0; i < len(allgs); i++ {
+ gp := allgs[i]
+ if !(gp.gcscandone && gp.gcscanvalid) && readgstatus(gp) != _Gdead {
+ println("gp", gp, "goid", gp.goid,
+ "status", readgstatus(gp),
+ "gcscandone", gp.gcscandone,
+ "gcscanvalid", gp.gcscanvalid)
+ throw("scan missed a g")
+ }
+ }
+ } else {
+ for i := 0; i < work.nStackRoots; i++ {
+ gp := allgs[i]
+ if !gp.gcscandone {
+ throw("scan missed a g")
+ }
}
}
unlock(&allglock)
@@ -109,12 +131,18 @@ var oneptrmask = [...]uint8{1}
//
// Preemption must be disabled (because this uses a gcWork).
//
+// nowritebarrier is only advisory here.
+//
//go:nowritebarrier
func markroot(gcw *gcWork, i uint32) {
+ // TODO(austin): This is a bit ridiculous. Compute and store
+ // the bases in gcMarkRootPrepare instead of the counts.
baseData := uint32(fixedRootCount)
baseBSS := baseData + uint32(work.nDataRoots)
baseSpans := baseBSS + uint32(work.nBSSRoots)
baseStacks := baseSpans + uint32(work.nSpanRoots)
+ baseRescan := baseStacks + uint32(work.nStackRoots)
+ end := baseRescan + uint32(work.nRescanRoots)
// Note: if you add a case here, please also update heapdump.go:dumproots.
switch {
@@ -151,10 +179,14 @@ func markroot(gcw *gcWork, i uint32) {
default:
// the rest is scanning goroutine stacks
- if uintptr(i-baseStacks) >= allglen {
+ var gp *g
+ if baseStacks <= i && i < baseRescan {
+ gp = allgs[i-baseStacks]
+ } else if baseRescan <= i && i < end {
+ gp = work.rescan.list[i-baseRescan].ptr()
+ } else {
throw("markroot: bad index")
}
- gp := allgs[i-baseStacks]
// remember when we've first observed the G blocked
// needed only to output in traceback
@@ -163,13 +195,14 @@ func markroot(gcw *gcWork, i uint32) {
gp.waitsince = work.tstart
}
- if gcphase != _GCmarktermination && gp.startpc == gcBgMarkWorkerPC {
+ if gcphase != _GCmarktermination && gp.startpc == gcBgMarkWorkerPC && readgstatus(gp) != _Gdead {
// GC background workers may be
// non-preemptible, so we may deadlock if we
// try to scan them during a concurrent phase.
// They also have tiny stacks, so just ignore
// them until mark termination.
gp.gcscandone = true
+ queueRescan(gp)
break
}
@@ -721,6 +754,14 @@ func scanstack(gp *g) {
gcw.dispose()
}
gcUnlockStackBarriers(gp)
+ if gcphase == _GCmark {
+ // gp may have added itself to the rescan list between
+ // when GC started and now. It's clean now, so remove
+ // it. This isn't safe during mark termination because
+ // mark termination is consuming this list, but it's
+ // also not necessary.
+ dequeueRescan(gp)
+ }
gp.gcscanvalid = true
}
@@ -797,6 +838,60 @@ func scanframeworker(frame *stkframe, cache *pcvalueCache, gcw *gcWork) {
}
}
+// queueRescan adds gp to the stack rescan list and clears
+// gp.gcscanvalid. The caller must own gp and ensure that gp isn't
+// already on the rescan list.
+func queueRescan(gp *g) {
+ if gcphase == _GCoff {
+ gp.gcscanvalid = false
+ return
+ }
+ if gp.gcRescan != -1 {
+ throw("g already on rescan list")
+ }
+
+ lock(&work.rescan.lock)
+ gp.gcscanvalid = false
+
+ // Recheck gcphase under the lock in case there was a phase change.
+ if gcphase == _GCoff {
+ unlock(&work.rescan.lock)
+ return
+ }
+ if len(work.rescan.list) == cap(work.rescan.list) {
+ throw("rescan list overflow")
+ }
+ n := len(work.rescan.list)
+ gp.gcRescan = int32(n)
+ work.rescan.list = work.rescan.list[:n+1]
+ work.rescan.list[n].set(gp)
+ unlock(&work.rescan.lock)
+}
+
+// dequeueRescan removes gp from the stack rescan list, if gp is on
+// the rescan list. The caller must own gp.
+func dequeueRescan(gp *g) {
+ if gp.gcRescan == -1 {
+ return
+ }
+ if gcphase == _GCoff {
+ gp.gcRescan = -1
+ return
+ }
+
+ lock(&work.rescan.lock)
+ if work.rescan.list[gp.gcRescan].ptr() != gp {
+ throw("bad dequeueRescan")
+ }
+ // Careful: gp may itself be the last G on the list.
+ last := work.rescan.list[len(work.rescan.list)-1]
+ work.rescan.list[gp.gcRescan] = last
+ last.ptr().gcRescan = gp.gcRescan
+ gp.gcRescan = -1
+ work.rescan.list = work.rescan.list[:len(work.rescan.list)-1]
+ unlock(&work.rescan.lock)
+}
+
type gcDrainFlags int
const (