aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/proc.go
diff options
context:
space:
mode:
authorMichael Anthony Knyszek <mknyszek@google.com>2024-03-25 17:50:13 +0000
committerGopher Robot <gobot@golang.org>2024-04-08 21:43:16 +0000
commite995aa95cb5f379c1df5d5511ee09970261d877f (patch)
tree5b348fd16dd9bac0453d2530027981b2b958e54c /src/runtime/proc.go
parentd4395ebc0c90a11a5f31fab0808baf3cb25d6ba8 (diff)
downloadgo-e995aa95cb5f379c1df5d5511ee09970261d877f.tar.xz
runtime: account for _Pgcstop in GC CPU pause time in a fine-grained way
The previous CL, CL 570257, made it so that STW time no longer overlapped with other CPU time tracking. However, what we lost was insight into the CPU time spent _stopping_ the world, which can be just as important. There's pretty much no easy way to measure this indirectly, so this CL implements a direct measurement: whenever a P enters _Pgcstop, it writes down what time it did so. stopTheWorld then accumulates all the time deltas between when it finished stopping the world and each P's stop time into a total additional pause time. The GC pause cases then accumulate this number into the metrics. This should cause minimal additional overhead in stopping the world. GC STWs already take on the order of 10s to 100s of microseconds. Even for 100 Ps, the extra `nanotime` call per P is only 1500ns of additional CPU time. This is likely to be much less in actual pause latency, since it all happens concurrently. Change-Id: Icf190ffea469cd35ebaf0b2587bf6358648c8554 Reviewed-on: https://go-review.googlesource.com/c/go/+/574215 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Nicolas Hillegeer <aktau@google.com> Auto-Submit: Michael Knyszek <mknyszek@google.com>
Diffstat (limited to 'src/runtime/proc.go')
-rw-r--r--src/runtime/proc.go25
1 files changed, 23 insertions, 2 deletions
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 1aad8e3c63..6a3c786e36 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -1325,6 +1325,7 @@ type worldStop struct {
reason stwReason
startedStopping int64
finishedStopping int64
+ stoppingCPUTime int64
}
// Temporary variable for stopTheWorld, when it can't write to the stack.
@@ -1480,6 +1481,7 @@ func stopTheWorldWithSema(reason stwReason) worldStop {
preemptall()
// stop current P
gp.m.p.ptr().status = _Pgcstop // Pgcstop is only diagnostic.
+ gp.m.p.ptr().gcStopTime = start
sched.stopwait--
// try to retake all P's in Psyscall status
trace = traceAcquire()
@@ -1491,6 +1493,7 @@ func stopTheWorldWithSema(reason stwReason) worldStop {
trace.ProcSteal(pp, false)
}
pp.syscalltick++
+ pp.gcStopTime = nanotime()
sched.stopwait--
}
}
@@ -1506,6 +1509,7 @@ func stopTheWorldWithSema(reason stwReason) worldStop {
break
}
pp.status = _Pgcstop
+ pp.gcStopTime = nanotime()
sched.stopwait--
}
wait := sched.stopwait > 0
@@ -1531,7 +1535,11 @@ func stopTheWorldWithSema(reason stwReason) worldStop {
sched.stwStoppingTimeOther.record(startTime)
}
- // sanity checks
+ // Double-check we actually stopped everything, and all the invariants hold.
+ // Also accumulate all the time spent by each P in _Pgcstop up to the point
+ // where everything was stopped. This will be accumulated into the total pause
+ // CPU time by the caller.
+ stoppingCPUTime := int64(0)
bad := ""
if sched.stopwait != 0 {
bad = "stopTheWorld: not stopped (stopwait != 0)"
@@ -1540,6 +1548,11 @@ func stopTheWorldWithSema(reason stwReason) worldStop {
if pp.status != _Pgcstop {
bad = "stopTheWorld: not stopped (status != _Pgcstop)"
}
+ if pp.gcStopTime == 0 && bad == "" {
+ bad = "stopTheWorld: broken CPU time accounting"
+ }
+ stoppingCPUTime += finish - pp.gcStopTime
+ pp.gcStopTime = 0
}
}
if freezing.Load() {
@@ -1556,7 +1569,12 @@ func stopTheWorldWithSema(reason stwReason) worldStop {
worldStopped()
- return worldStop{reason: reason, startedStopping: start, finishedStopping: finish}
+ return worldStop{
+ reason: reason,
+ startedStopping: start,
+ finishedStopping: finish,
+ stoppingCPUTime: stoppingCPUTime,
+ }
}
// reason is the same STW reason passed to stopTheWorld. start is the start
@@ -2945,6 +2963,7 @@ func handoffp(pp *p) {
lock(&sched.lock)
if sched.gcwaiting.Load() {
pp.status = _Pgcstop
+ pp.gcStopTime = nanotime()
sched.stopwait--
if sched.stopwait == 0 {
notewakeup(&sched.stopnote)
@@ -3087,6 +3106,7 @@ func gcstopm() {
pp := releasep()
lock(&sched.lock)
pp.status = _Pgcstop
+ pp.gcStopTime = nanotime()
sched.stopwait--
if sched.stopwait == 0 {
notewakeup(&sched.stopnote)
@@ -4404,6 +4424,7 @@ func entersyscall_gcwait() {
}
traceRelease(trace)
}
+ pp.gcStopTime = nanotime()
pp.syscalltick++
if sched.stopwait--; sched.stopwait == 0 {
notewakeup(&sched.stopnote)