diff options
| author | Dmitry Vyukov <dvyukov@google.com> | 2016-04-04 16:22:38 +0200 |
|---|---|---|
| committer | Dmitry Vyukov <dvyukov@google.com> | 2016-04-05 15:52:03 +0000 |
| commit | 475d113b53024fe7a35cea0f620b30f97cd0810f (patch) | |
| tree | 0712f9f200f0e8707b9c7d4a72a63c882d41a7e7 /src/runtime/proc.go | |
| parent | 3b246fa863dbd91588c5920969f9fd0ce8362129 (diff) | |
| download | go-475d113b53024fe7a35cea0f620b30f97cd0810f.tar.xz | |
runtime: don't burn CPU unnecessarily
Two GC-related functions, scang and casgstatus, wait in an active spin loop.
Active spinning is never a good idea in user-space. Once we wait several
times more than the expected wait time, something unexpected is happenning
(e.g. the thread we are waiting for is descheduled or handling a page fault)
and we need to yield to OS scheduler. Moreover, the expected wait time is
very high for these functions: scang wait time can be tens of milliseconds,
casgstatus can be hundreds of microseconds. It does not make sense to spin
even for that time.
go install -a std profile on a 4-core machine shows that 11% of time is spent
in the active spin in scang:
6.12% compile compile [.] runtime.scang
3.27% compile compile [.] runtime.readgstatus
1.72% compile compile [.] runtime/internal/atomic.Load
The active spin also increases tail latency in the case of the slightest
oversubscription: GC goroutines spend whole quantum in the loop instead of
executing user code.
Here is scang wait time histogram during go install -a std:
13707.0000 - 1815442.7667 [ 118]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎...
1815442.7667 - 3617178.5333 [ 9]: ∎∎∎∎∎∎∎∎∎
3617178.5333 - 5418914.3000 [ 11]: ∎∎∎∎∎∎∎∎∎∎∎
5418914.3000 - 7220650.0667 [ 5]: ∎∎∎∎∎
7220650.0667 - 9022385.8333 [ 12]: ∎∎∎∎∎∎∎∎∎∎∎∎
9022385.8333 - 10824121.6000 [ 13]: ∎∎∎∎∎∎∎∎∎∎∎∎∎
10824121.6000 - 12625857.3667 [ 15]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
12625857.3667 - 14427593.1333 [ 18]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
14427593.1333 - 16229328.9000 [ 18]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
16229328.9000 - 18031064.6667 [ 32]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
18031064.6667 - 19832800.4333 [ 28]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
19832800.4333 - 21634536.2000 [ 6]: ∎∎∎∎∎∎
21634536.2000 - 23436271.9667 [ 15]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
23436271.9667 - 25238007.7333 [ 11]: ∎∎∎∎∎∎∎∎∎∎∎
25238007.7333 - 27039743.5000 [ 27]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
27039743.5000 - 28841479.2667 [ 20]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
28841479.2667 - 30643215.0333 [ 10]: ∎∎∎∎∎∎∎∎∎∎
30643215.0333 - 32444950.8000 [ 7]: ∎∎∎∎∎∎∎
32444950.8000 - 34246686.5667 [ 4]: ∎∎∎∎
34246686.5667 - 36048422.3333 [ 4]: ∎∎∎∎
36048422.3333 - 37850158.1000 [ 1]: ∎
37850158.1000 - 39651893.8667 [ 5]: ∎∎∎∎∎
39651893.8667 - 41453629.6333 [ 2]: ∎∎
41453629.6333 - 43255365.4000 [ 2]: ∎∎
43255365.4000 - 45057101.1667 [ 2]: ∎∎
45057101.1667 - 46858836.9333 [ 1]: ∎
46858836.9333 - 48660572.7000 [ 2]: ∎∎
48660572.7000 - 50462308.4667 [ 3]: ∎∎∎
50462308.4667 - 52264044.2333 [ 2]: ∎∎
52264044.2333 - 54065780.0000 [ 2]: ∎∎
and the zoomed-in first part:
13707.0000 - 19916.7667 [ 2]: ∎∎
19916.7667 - 26126.5333 [ 2]: ∎∎
26126.5333 - 32336.3000 [ 9]: ∎∎∎∎∎∎∎∎∎
32336.3000 - 38546.0667 [ 8]: ∎∎∎∎∎∎∎∎
38546.0667 - 44755.8333 [ 12]: ∎∎∎∎∎∎∎∎∎∎∎∎
44755.8333 - 50965.6000 [ 10]: ∎∎∎∎∎∎∎∎∎∎
50965.6000 - 57175.3667 [ 5]: ∎∎∎∎∎
57175.3667 - 63385.1333 [ 6]: ∎∎∎∎∎∎
63385.1333 - 69594.9000 [ 5]: ∎∎∎∎∎
69594.9000 - 75804.6667 [ 6]: ∎∎∎∎∎∎
75804.6667 - 82014.4333 [ 6]: ∎∎∎∎∎∎
82014.4333 - 88224.2000 [ 4]: ∎∎∎∎
88224.2000 - 94433.9667 [ 1]: ∎
94433.9667 - 100643.7333 [ 1]: ∎
100643.7333 - 106853.5000 [ 2]: ∎∎
106853.5000 - 113063.2667 [ 0]:
113063.2667 - 119273.0333 [ 2]: ∎∎
119273.0333 - 125482.8000 [ 2]: ∎∎
125482.8000 - 131692.5667 [ 1]: ∎
131692.5667 - 137902.3333 [ 1]: ∎
137902.3333 - 144112.1000 [ 0]:
144112.1000 - 150321.8667 [ 2]: ∎∎
150321.8667 - 156531.6333 [ 1]: ∎
156531.6333 - 162741.4000 [ 1]: ∎
162741.4000 - 168951.1667 [ 0]:
168951.1667 - 175160.9333 [ 0]:
175160.9333 - 181370.7000 [ 1]: ∎
181370.7000 - 187580.4667 [ 1]: ∎
187580.4667 - 193790.2333 [ 2]: ∎∎
193790.2333 - 200000.0000 [ 0]:
Here is casgstatus wait time histogram:
631.0000 - 5276.6333 [ 3]: ∎∎∎
5276.6333 - 9922.2667 [ 5]: ∎∎∎∎∎
9922.2667 - 14567.9000 [ 2]: ∎∎
14567.9000 - 19213.5333 [ 6]: ∎∎∎∎∎∎
19213.5333 - 23859.1667 [ 5]: ∎∎∎∎∎
23859.1667 - 28504.8000 [ 6]: ∎∎∎∎∎∎
28504.8000 - 33150.4333 [ 6]: ∎∎∎∎∎∎
33150.4333 - 37796.0667 [ 2]: ∎∎
37796.0667 - 42441.7000 [ 1]: ∎
42441.7000 - 47087.3333 [ 3]: ∎∎∎
47087.3333 - 51732.9667 [ 0]:
51732.9667 - 56378.6000 [ 1]: ∎
56378.6000 - 61024.2333 [ 0]:
61024.2333 - 65669.8667 [ 0]:
65669.8667 - 70315.5000 [ 0]:
70315.5000 - 74961.1333 [ 1]: ∎
74961.1333 - 79606.7667 [ 0]:
79606.7667 - 84252.4000 [ 0]:
84252.4000 - 88898.0333 [ 0]:
88898.0333 - 93543.6667 [ 0]:
93543.6667 - 98189.3000 [ 0]:
98189.3000 - 102834.9333 [ 0]:
102834.9333 - 107480.5667 [ 1]: ∎
107480.5667 - 112126.2000 [ 0]:
112126.2000 - 116771.8333 [ 0]:
116771.8333 - 121417.4667 [ 0]:
121417.4667 - 126063.1000 [ 0]:
126063.1000 - 130708.7333 [ 0]:
130708.7333 - 135354.3667 [ 0]:
135354.3667 - 140000.0000 [ 1]: ∎
Ideally we eliminate the waiting by switching to async
state machine for GC, but for now just yield to OS scheduler
after a reasonable wait time.
To choose yielding parameters I've measured
golang.org/x/benchmarks/http tail latencies with different yield
delays and oversubscription levels.
With no oversubscription (to the degree possible):
scang yield delay = 1, casgstatus yield delay = 1
Latency-50 1.41ms ±15% 1.41ms ± 5% ~ (p=0.611 n=13+12)
Latency-95 5.21ms ± 2% 5.15ms ± 2% -1.15% (p=0.012 n=13+13)
Latency-99 7.16ms ± 2% 7.05ms ± 2% -1.54% (p=0.002 n=13+13)
Latency-999 10.7ms ± 9% 10.2ms ±10% -5.46% (p=0.004 n=12+13)
scang yield delay = 5000, casgstatus yield delay = 3000
Latency-50 1.41ms ±15% 1.41ms ± 8% ~ (p=0.511 n=13+13)
Latency-95 5.21ms ± 2% 5.14ms ± 2% -1.23% (p=0.006 n=13+13)
Latency-99 7.16ms ± 2% 7.02ms ± 2% -1.94% (p=0.000 n=13+13)
Latency-999 10.7ms ± 9% 10.1ms ± 8% -6.14% (p=0.000 n=12+13)
scang yield delay = 10000, casgstatus yield delay = 5000
Latency-50 1.41ms ±15% 1.45ms ± 6% ~ (p=0.724 n=13+13)
Latency-95 5.21ms ± 2% 5.18ms ± 1% ~ (p=0.287 n=13+13)
Latency-99 7.16ms ± 2% 7.05ms ± 2% -1.64% (p=0.002 n=13+13)
Latency-999 10.7ms ± 9% 10.0ms ± 5% -6.72% (p=0.000 n=12+13)
scang yield delay = 30000, casgstatus yield delay = 10000
Latency-50 1.41ms ±15% 1.51ms ± 7% +6.57% (p=0.002 n=13+13)
Latency-95 5.21ms ± 2% 5.21ms ± 2% ~ (p=0.960 n=13+13)
Latency-99 7.16ms ± 2% 7.06ms ± 2% -1.50% (p=0.012 n=13+13)
Latency-999 10.7ms ± 9% 10.0ms ± 6% -6.49% (p=0.000 n=12+13)
scang yield delay = 100000, casgstatus yield delay = 50000
Latency-50 1.41ms ±15% 1.53ms ± 6% +8.48% (p=0.000 n=13+12)
Latency-95 5.21ms ± 2% 5.23ms ± 2% ~ (p=0.287 n=13+13)
Latency-99 7.16ms ± 2% 7.08ms ± 2% -1.21% (p=0.004 n=13+13)
Latency-999 10.7ms ± 9% 9.9ms ± 3% -7.99% (p=0.000 n=12+12)
scang yield delay = 200000, casgstatus yield delay = 100000
Latency-50 1.41ms ±15% 1.47ms ± 5% ~ (p=0.072 n=13+13)
Latency-95 5.21ms ± 2% 5.17ms ± 2% ~ (p=0.091 n=13+13)
Latency-99 7.16ms ± 2% 7.02ms ± 2% -1.99% (p=0.000 n=13+13)
Latency-999 10.7ms ± 9% 9.9ms ± 5% -7.86% (p=0.000 n=12+13)
With slight oversubscription (another instance of http benchmark
was running in background with reduced GOMAXPROCS):
scang yield delay = 1, casgstatus yield delay = 1
Latency-50 840µs ± 3% 804µs ± 3% -4.37% (p=0.000 n=15+18)
Latency-95 6.52ms ± 4% 6.03ms ± 4% -7.51% (p=0.000 n=18+18)
Latency-99 10.8ms ± 7% 10.0ms ± 4% -7.33% (p=0.000 n=18+14)
Latency-999 18.0ms ± 9% 16.8ms ± 7% -6.84% (p=0.000 n=18+18)
scang yield delay = 5000, casgstatus yield delay = 3000
Latency-50 840µs ± 3% 809µs ± 3% -3.71% (p=0.000 n=15+17)
Latency-95 6.52ms ± 4% 6.11ms ± 4% -6.29% (p=0.000 n=18+18)
Latency-99 10.8ms ± 7% 9.9ms ± 6% -7.55% (p=0.000 n=18+18)
Latency-999 18.0ms ± 9% 16.5ms ±11% -8.49% (p=0.000 n=18+18)
scang yield delay = 10000, casgstatus yield delay = 5000
Latency-50 840µs ± 3% 823µs ± 5% -2.06% (p=0.002 n=15+18)
Latency-95 6.52ms ± 4% 6.32ms ± 3% -3.05% (p=0.000 n=18+18)
Latency-99 10.8ms ± 7% 10.2ms ± 4% -5.22% (p=0.000 n=18+18)
Latency-999 18.0ms ± 9% 16.7ms ±10% -7.09% (p=0.000 n=18+18)
scang yield delay = 30000, casgstatus yield delay = 10000
Latency-50 840µs ± 3% 836µs ± 5% ~ (p=0.442 n=15+18)
Latency-95 6.52ms ± 4% 6.39ms ± 3% -2.00% (p=0.000 n=18+18)
Latency-99 10.8ms ± 7% 10.2ms ± 6% -5.15% (p=0.000 n=18+17)
Latency-999 18.0ms ± 9% 16.6ms ± 8% -7.48% (p=0.000 n=18+18)
scang yield delay = 100000, casgstatus yield delay = 50000
Latency-50 840µs ± 3% 836µs ± 6% ~ (p=0.401 n=15+18)
Latency-95 6.52ms ± 4% 6.40ms ± 4% -1.79% (p=0.010 n=18+18)
Latency-99 10.8ms ± 7% 10.2ms ± 5% -4.95% (p=0.000 n=18+18)
Latency-999 18.0ms ± 9% 16.5ms ±14% -8.17% (p=0.000 n=18+18)
scang yield delay = 200000, casgstatus yield delay = 100000
Latency-50 840µs ± 3% 828µs ± 2% -1.49% (p=0.001 n=15+17)
Latency-95 6.52ms ± 4% 6.38ms ± 4% -2.04% (p=0.001 n=18+18)
Latency-99 10.8ms ± 7% 10.2ms ± 4% -4.77% (p=0.000 n=18+18)
Latency-999 18.0ms ± 9% 16.9ms ± 9% -6.23% (p=0.000 n=18+18)
With significant oversubscription (background http benchmark
was running with full GOMAXPROCS):
scang yield delay = 1, casgstatus yield delay = 1
Latency-50 1.32ms ±12% 1.30ms ±13% ~ (p=0.454 n=14+14)
Latency-95 16.3ms ±10% 15.3ms ± 7% -6.29% (p=0.001 n=14+14)
Latency-99 29.4ms ±10% 27.9ms ± 5% -5.04% (p=0.001 n=14+12)
Latency-999 49.9ms ±19% 45.9ms ± 5% -8.00% (p=0.008 n=14+13)
scang yield delay = 5000, casgstatus yield delay = 3000
Latency-50 1.32ms ±12% 1.29ms ± 9% ~ (p=0.227 n=14+14)
Latency-95 16.3ms ±10% 15.4ms ± 5% -5.27% (p=0.002 n=14+14)
Latency-99 29.4ms ±10% 27.9ms ± 6% -5.16% (p=0.001 n=14+14)
Latency-999 49.9ms ±19% 46.8ms ± 8% -6.21% (p=0.050 n=14+14)
scang yield delay = 10000, casgstatus yield delay = 5000
Latency-50 1.32ms ±12% 1.35ms ± 9% ~ (p=0.401 n=14+14)
Latency-95 16.3ms ±10% 15.0ms ± 4% -7.67% (p=0.000 n=14+14)
Latency-99 29.4ms ±10% 27.4ms ± 5% -6.98% (p=0.000 n=14+14)
Latency-999 49.9ms ±19% 44.7ms ± 5% -10.56% (p=0.000 n=14+11)
scang yield delay = 30000, casgstatus yield delay = 10000
Latency-50 1.32ms ±12% 1.36ms ±10% ~ (p=0.246 n=14+14)
Latency-95 16.3ms ±10% 14.9ms ± 5% -8.31% (p=0.000 n=14+14)
Latency-99 29.4ms ±10% 27.4ms ± 7% -6.70% (p=0.000 n=14+14)
Latency-999 49.9ms ±19% 44.9ms ±15% -10.13% (p=0.003 n=14+14)
scang yield delay = 100000, casgstatus yield delay = 50000
Latency-50 1.32ms ±12% 1.41ms ± 9% +6.37% (p=0.008 n=14+13)
Latency-95 16.3ms ±10% 15.1ms ± 8% -7.45% (p=0.000 n=14+14)
Latency-99 29.4ms ±10% 27.5ms ±12% -6.67% (p=0.002 n=14+14)
Latency-999 49.9ms ±19% 45.9ms ±16% -8.06% (p=0.019 n=14+14)
scang yield delay = 200000, casgstatus yield delay = 100000
Latency-50 1.32ms ±12% 1.42ms ±10% +7.21% (p=0.003 n=14+14)
Latency-95 16.3ms ±10% 15.0ms ± 7% -7.59% (p=0.000 n=14+14)
Latency-99 29.4ms ±10% 27.3ms ± 8% -7.20% (p=0.000 n=14+14)
Latency-999 49.9ms ±19% 44.8ms ± 8% -10.21% (p=0.001 n=14+13)
All numbers are on 8 cores and with GOGC=10 (http benchmark has
tiny heap, few goroutines and low allocation rate, so by default
GC barely affects tail latency).
10us/5us yield delays seem to provide a reasonable compromise
and give 5-10% tail latency reduction. That's what used in this change.
go install -a std results on 4 core machine:
name old time/op new time/op delta
Time 8.39s ± 2% 7.94s ± 2% -5.34% (p=0.000 n=47+49)
UserTime 24.6s ± 2% 22.9s ± 2% -6.76% (p=0.000 n=49+49)
SysTime 1.77s ± 9% 1.89s ±11% +7.00% (p=0.000 n=49+49)
CpuLoad 315ns ± 2% 313ns ± 1% -0.59% (p=0.000 n=49+48) # %CPU
MaxRSS 97.1ms ± 4% 97.5ms ± 9% ~ (p=0.838 n=46+49) # bytes
Update #14396
Update #14189
Change-Id: I3f4109bf8f7fd79b39c466576690a778232055a2
Reviewed-on: https://go-review.googlesource.com/21503
Run-TryBot: Dmitry Vyukov <dvyukov@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
Reviewed-by: Austin Clements <austin@google.com>
Diffstat (limited to 'src/runtime/proc.go')
| -rw-r--r-- | src/runtime/proc.go | 37 |
1 files changed, 35 insertions, 2 deletions
diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 2cc29df434..5145c84aea 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -721,9 +721,13 @@ func casgstatus(gp *g, oldval, newval uint32) { throw("casgstatus") } + // See http://golang.org/cl/21503 for justification of the yield delay. + const yieldDelay = 5 * 1000 + var nextYield int64 + // loop if gp->atomicstatus is in a scan state giving // GC time to finish and change the state to oldval. - for !atomic.Cas(&gp.atomicstatus, oldval, newval) { + for i := 0; !atomic.Cas(&gp.atomicstatus, oldval, newval); i++ { if oldval == _Gwaiting && gp.atomicstatus == _Grunnable { systemstack(func() { throw("casgstatus: waiting for Gwaiting but is Grunnable") @@ -736,6 +740,18 @@ func casgstatus(gp *g, oldval, newval uint32) { // gcphasework(gp) // }) // } + // But meanwhile just yield. + if i == 0 { + nextYield = nanotime() + yieldDelay + } + if nanotime() < nextYield { + for x := 0; x < 10 && gp.atomicstatus != oldval; x++ { + procyield(1) + } + } else { + osyield() + nextYield = nanotime() + yieldDelay/2 + } } if newval == _Grunning { gp.gcscanvalid = false @@ -773,12 +789,17 @@ func scang(gp *g) { gp.gcscandone = false + // See http://golang.org/cl/21503 for justification of the yield delay. + const yieldDelay = 10 * 1000 + var nextYield int64 + // Endeavor to get gcscandone set to true, // either by doing the stack scan ourselves or by coercing gp to scan itself. // gp.gcscandone can transition from false to true when we're not looking // (if we asked for preemption), so any time we lock the status using // castogscanstatus we have to double-check that the scan is still not done. - for !gp.gcscandone { +loop: + for i := 0; !gp.gcscandone; i++ { switch s := readgstatus(gp); s { default: dumpgstatus(gp) @@ -787,6 +808,7 @@ func scang(gp *g) { case _Gdead: // No stack. gp.gcscandone = true + break loop case _Gcopystack: // Stack being switched. Go around again. @@ -802,6 +824,7 @@ func scang(gp *g) { gp.gcscandone = true } restartg(gp) + break loop } case _Gscanwaiting: @@ -827,6 +850,16 @@ func scang(gp *g) { casfrom_Gscanstatus(gp, _Gscanrunning, _Grunning) } } + + if i == 0 { + nextYield = nanotime() + yieldDelay + } + if nanotime() < nextYield { + procyield(10) + } else { + osyield() + nextYield = nanotime() + yieldDelay/2 + } } gp.preemptscan = false // cancel scan request if no longer needed |
