Diffstat (limited to 'src')
 src/runtime/crash_test.go     |   3
 src/runtime/heapdump.go       |   8
 src/runtime/metrics.go        |  11
 src/runtime/mprof.go          |  13
 src/runtime/mstats.go         |   4
 src/runtime/os_windows.go     |   4
 src/runtime/preempt.go        |   2
 src/runtime/proc.go           | 671
 src/runtime/runtime2.go       |  60
 src/runtime/traceevent.go     |   7
 src/runtime/traceruntime.go   |  26
 src/runtime/tracestatus.go    |  11
 12 files changed, 508 insertions(+), 312 deletions(-)
diff --git a/src/runtime/crash_test.go b/src/runtime/crash_test.go
index 2b8ca549ad..7d02363cc9 100644
--- a/src/runtime/crash_test.go
+++ b/src/runtime/crash_test.go
@@ -788,6 +788,9 @@ func TestPanicInlined(t *testing.T) {
// Test for issues #3934 and #20018.
// We want to delay exiting until a panic print is complete.
func TestPanicRace(t *testing.T) {
+ if race.Enabled {
+ t.Skip("racefini exits program before main can delay exiting")
+ }
testenv.MustHaveGoRun(t)
exe, err := buildTestProg(t, "testprog")
diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go
index 9aaee8f90c..3671c65ab7 100644
--- a/src/runtime/heapdump.go
+++ b/src/runtime/heapdump.go
@@ -412,6 +412,14 @@ func dumpgs() {
forEachG(func(gp *g) {
status := readgstatus(gp) // The world is stopped so gp will not be in a scan state.
switch status {
+ case _Grunning:
+ // Only dump the goroutine if it's _Grunning during a syscall. This is safe
+ // because the goroutine will just park without mutating its stack, since
+ // the world is stopped.
+ if gp.syscallsp != 0 {
+ dumpgoroutine(gp)
+ }
+ fallthrough
default:
print("runtime: unexpected G.status ", hex(status), "\n")
throw("dumpgs in STW - bad status")
diff --git a/src/runtime/metrics.go b/src/runtime/metrics.go
index 6ee2c28549..2710bf6865 100644
--- a/src/runtime/metrics.go
+++ b/src/runtime/metrics.go
@@ -818,9 +818,12 @@ func (a *schedStatsAggregate) compute() {
a.gCreated += p.goroutinesCreated
switch p.status {
case _Prunning:
- a.gRunning++
- case _Psyscall:
- a.gNonGo++
+ if thread, ok := setBlockOnExitSyscall(p); ok {
+ thread.resume()
+ a.gNonGo++
+ } else {
+ a.gRunning++
+ }
case _Pgcstop:
// The world is stopping or stopped.
// This is fine. The results will be
@@ -847,7 +850,7 @@ func (a *schedStatsAggregate) compute() {
// Global run queue.
a.gRunnable += uint64(sched.runq.size)
- // Account for Gs that are in _Gsyscall without a P in _Psyscall.
+ // Account for Gs that are in _Gsyscall without a P.
nGsyscallNoP := sched.nGsyscallNoP.Load()
// nGsyscallNoP can go negative during temporary races.
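
With _Psyscall gone, "this P is in a syscall" is no longer a stored P state; it is derived from the attached goroutine, and setBlockOnExitSyscall followed by an immediate resume serves as a race-free probe. A small self-contained model of the derived classification, with hypothetical types standing in for the runtime's:

package main

import "fmt"

type gStatus int

const (
	gRunning gStatus = iota
	gSyscall
)

type goroutine struct{ status gStatus }

type processor struct {
	running bool
	curg    *goroutine
}

// countsAsNonGo models the classification in schedStatsAggregate.compute:
// a running P whose goroutine sits in a syscall is doing non-Go work.
// (The real code makes this race-free by pinning the goroutine with
// setBlockOnExitSyscall and resuming it immediately.)
func countsAsNonGo(pp *processor) bool {
	return pp.running && pp.curg != nil && pp.curg.status == gSyscall
}

func main() {
	fmt.Println(countsAsNonGo(&processor{running: true, curg: &goroutine{status: gSyscall}})) // true
	fmt.Println(countsAsNonGo(&processor{running: true, curg: &goroutine{status: gRunning}})) // false
}
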
diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go
index 19e90fbb33..d745d5f5b9 100644
--- a/src/runtime/mprof.go
+++ b/src/runtime/mprof.go
@@ -1535,7 +1535,18 @@ func doRecordGoroutineProfile(gp1 *g, pcbuf []uintptr) {
// everything else, we just don't record the stack in the profile.
return
}
- if readgstatus(gp1) == _Grunning {
+ // Double-check that we didn't make a grave mistake. If the G is running then in
+ // general, we cannot safely read its stack.
+ //
+ // However, there is one case where it's OK. There's a small window of time in
+ // exitsyscall where a goroutine could be in _Grunning as it's exiting a syscall.
+ // This is OK because the goroutine will not exit the syscall until it passes through
+ // a call to tryRecordGoroutineProfile. (An explicit one on the fast path, an
+ // implicit one via the scheduler on the slow path.)
+ //
+ // This is also why it's safe to check syscallsp here. The syscall path mutates
+ // syscallsp only after passing through tryRecordGoroutineProfile.
+ if readgstatus(gp1) == _Grunning && gp1.syscallsp == 0 {
print("doRecordGoroutineProfile gp1=", gp1.goid, "\n")
throw("cannot read stack of running goroutine")
}
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index e34f0b10ea..febfb69a3a 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -914,7 +914,7 @@ type cpuStats struct {
ScavengeTotalTime int64
IdleTime int64 // Time Ps spent in _Pidle.
- UserTime int64 // Time Ps spent in _Prunning or _Psyscall that's not any of the above.
+ UserTime int64 // Time Ps spent in _Prunning that's not any of the above.
TotalTime int64 // GOMAXPROCS * (monotonic wall clock time elapsed)
}
@@ -976,7 +976,7 @@ func (s *cpuStats) accumulate(now int64, gcMarkPhase bool) {
// Compute userTime. We compute this indirectly as everything that's not the above.
//
// Since time spent in _Pgcstop is covered by gcPauseTime, and time spent in _Pidle
- // is covered by idleTime, what we're left with is time spent in _Prunning and _Psyscall,
- // the latter of which is fine because the P will either go idle or get used for something
+ // is covered by idleTime, what we're left with is time spent in _Prunning,
+ // which is fine because the P will either go idle or get used for something
// else via sysmon. Meanwhile if we subtract GC time from whatever's left, we get non-GC
// _Prunning time. Note that this still leaves time spent in sweeping and in the scheduler,
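
The userTime computation above is a subtraction over the whole CPU budget. A toy sketch of that identity with made-up numbers; the names only loosely mirror cpuStats and are illustrative:

package main

import "fmt"

func main() {
	const (
		gomaxprocs = 4
		elapsedNS  = 1_000_000 // wall-clock interval
	)
	total := int64(gomaxprocs * elapsedNS) // TotalTime
	gcPause := int64(200_000)              // time Ps spent in _Pgcstop
	idle := int64(800_000)                 // time Ps spent in _Pidle
	gcWork := int64(500_000)               // dedicated/assist/fractional GC time
	scavenge := int64(100_000)

	// With _Psyscall gone, the remainder is non-GC _Prunning time only.
	user := total - gcPause - idle - gcWork - scavenge
	fmt.Println("user time (ns):", user) // 2400000
}
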
diff --git a/src/runtime/os_windows.go b/src/runtime/os_windows.go
index 2ae625580d..1688ba910e 100644
--- a/src/runtime/os_windows.go
+++ b/src/runtime/os_windows.go
@@ -191,8 +191,8 @@ type mOS struct {
// complete.
//
// TODO(austin): We may not need this if preemption were more
- // tightly synchronized on the G/P status and preemption
- // blocked transition into _Gsyscall/_Psyscall.
+ // tightly synchronized on the G status and preemption
+ // blocked transition into _Gsyscall.
preemptExtLock uint32
}
diff --git a/src/runtime/preempt.go b/src/runtime/preempt.go
index 1044a6d720..447c7399fc 100644
--- a/src/runtime/preempt.go
+++ b/src/runtime/preempt.go
@@ -286,7 +286,7 @@ func resumeG(state suspendGState) {
//
//go:nosplit
func canPreemptM(mp *m) bool {
- return mp.locks == 0 && mp.mallocing == 0 && mp.preemptoff == "" && mp.p.ptr().status == _Prunning
+ return mp.locks == 0 && mp.mallocing == 0 && mp.preemptoff == "" && mp.p.ptr().status == _Prunning && mp.curg != nil && readgstatus(mp.curg)&^_Gscan != _Gsyscall
}
//go:generate go run mkpreempt.go
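
The new condition in canPreemptM masks off the scan bit before comparing against _Gsyscall, since a goroutine may have its scan bit set at any moment. A tiny self-contained model of that masking, with hypothetical constants (the values happen to match the runtime's encoding, but treat them as illustrative):

package main

import "fmt"

// The scan bit is OR'd into a goroutine's status word while a scanner
// holds it; &^ (AND NOT) strips it to recover the base state.
const (
	gSyscall uint32 = 3
	gScan    uint32 = 0x1000
)

// inSyscall models the new check in canPreemptM: a goroutine counts as
// in a syscall whether or not its scan bit happens to be set.
func inSyscall(status uint32) bool {
	return status&^gScan == gSyscall
}

func main() {
	fmt.Println(inSyscall(gSyscall))         // true
	fmt.Println(inSyscall(gSyscall | gScan)) // true: scan bit masked off
	fmt.Println(inSyscall(gScan | 2))        // false: _Grunning under scan
}
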
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 6c16effc95..081b9a2825 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -1654,29 +1654,21 @@ func stopTheWorldWithSema(reason stwReason) worldStop {
sched.stopwait = gomaxprocs
sched.gcwaiting.Store(true)
preemptall()
- // stop current P
+
+ // Stop current P.
gp.m.p.ptr().status = _Pgcstop // Pgcstop is only diagnostic.
gp.m.p.ptr().gcStopTime = start
sched.stopwait--
- // try to retake all P's in Psyscall status
- trace = traceAcquire()
+
+ // Try to retake all P's in syscalls.
for _, pp := range allp {
- s := pp.status
- if s == _Psyscall && atomic.Cas(&pp.status, s, _Pgcstop) {
- if trace.ok() {
- trace.ProcSteal(pp, false)
- }
- sched.nGsyscallNoP.Add(1)
- pp.syscalltick++
- pp.gcStopTime = nanotime()
- sched.stopwait--
+ if thread, ok := setBlockOnExitSyscall(pp); ok {
+ thread.gcstopP()
+ thread.resume()
}
}
- if trace.ok() {
- traceRelease(trace)
- }
- // stop idle P's
+ // Stop idle Ps.
now := nanotime()
for {
pp, _ := pidleget(now)
@@ -1690,7 +1682,7 @@ func stopTheWorldWithSema(reason stwReason) worldStop {
wait := sched.stopwait > 0
unlock(&sched.lock)
- // wait for remaining P's to stop voluntarily
+ // Wait for remaining Ps to stop voluntarily.
if wait {
for {
// wait for 100us, then try to re-preempt in case of any races
@@ -2156,9 +2148,9 @@ func forEachPInternal(fn func(*p)) {
}
preemptall()
- // Any P entering _Pidle or _Psyscall from now on will observe
+ // Any P entering _Pidle or a system call from now on will observe
// p.runSafePointFn == 1 and will call runSafePointFn when
- // changing its status to _Pidle/_Psyscall.
+ // changing its status to _Pidle.
// Run safe point function for all idle Ps. sched.pidle will
// not change because we hold sched.lock.
@@ -2175,25 +2167,19 @@ func forEachPInternal(fn func(*p)) {
// Run fn for the current P.
fn(pp)
- // Force Ps currently in _Psyscall into _Pidle and hand them
+ // Force Ps currently in a system call into _Pidle and hand them
// off to induce safe point function execution.
for _, p2 := range allp {
- s := p2.status
-
// We need to be fine-grained about tracing here, since handoffp
// might call into the tracer, and the tracer is non-reentrant.
- trace := traceAcquire()
- if s == _Psyscall && p2.runSafePointFn == 1 && atomic.Cas(&p2.status, s, _Pidle) {
- if trace.ok() {
- // It's important that we traceRelease before we call handoffp, which may also traceAcquire.
- trace.ProcSteal(p2, false)
- traceRelease(trace)
- }
- sched.nGsyscallNoP.Add(1)
- p2.syscalltick++
+ if atomic.Load(&p2.runSafePointFn) != 1 {
+ // Already ran it.
+ continue
+ }
+ if thread, ok := setBlockOnExitSyscall(p2); ok {
+ thread.takeP()
+ thread.resume()
handoffp(p2)
- } else if trace.ok() {
- traceRelease(trace)
}
}
@@ -2234,9 +2220,9 @@ func forEachPInternal(fn func(*p)) {
// }
//
// runSafePointFn must be checked on any transition in to _Pidle or
-// _Psyscall to avoid a race where forEachP sees that the P is running
-// just before the P goes into _Pidle/_Psyscall and neither forEachP
-// nor the P run the safe-point function.
+// when entering a system call to avoid a race where forEachP sees
+// that the P is running just before the P goes idle or enters a
+// system call, and neither forEachP nor the P runs the safe-point function.
func runSafePointFn() {
p := getg().m.p.ptr()
// Resolve the race between forEachP running the safe-point
@@ -2571,7 +2557,7 @@ func oneNewExtraM() {
// So that the destructor would invoke dropm while the non-Go thread is exiting.
// This is much faster since it avoids expensive signal-related syscalls.
//
-// This always runs without a P, so //go:nowritebarrierrec is required.
+// This may run without a P, so //go:nowritebarrierrec is required.
//
// This may run with a different stack than was recorded in g0 (there is no
// call to callbackUpdateSystemStack prior to dropm), so this must be
@@ -4566,7 +4552,6 @@ func save(pc, sp, bp uintptr) {
//
//go:nosplit
func reentersyscall(pc, sp, bp uintptr) {
- trace := traceAcquire()
gp := getg()
// Disable preemption because during this function g is in Gsyscall status,
@@ -4580,17 +4565,23 @@ func reentersyscall(pc, sp, bp uintptr) {
gp.stackguard0 = stackPreempt
gp.throwsplit = true
+ // Copy the syscalltick over so we can identify if the P got stolen later.
+ gp.m.syscalltick = gp.m.p.ptr().syscalltick
+
+ pp := gp.m.p.ptr()
+ if pp.runSafePointFn != 0 {
+ // runSafePointFn may stack split if run on this stack
+ systemstack(runSafePointFn)
+ }
+ gp.m.oldp.set(pp)
+
// Leave SP around for GC and traceback.
save(pc, sp, bp)
gp.syscallsp = sp
gp.syscallpc = pc
gp.syscallbp = bp
- casgstatus(gp, _Grunning, _Gsyscall)
- if staticLockRanking {
- // When doing static lock ranking casgstatus can call
- // systemstack which clobbers g.sched.
- save(pc, sp, bp)
- }
+
+ // Double-check sp and bp.
if gp.syscallsp < gp.stack.lo || gp.stack.hi < gp.syscallsp {
systemstack(func() {
print("entersyscall inconsistent sp ", hex(gp.syscallsp), " [", hex(gp.stack.lo), ",", hex(gp.stack.hi), "]\n")
@@ -4603,40 +4594,45 @@ func reentersyscall(pc, sp, bp uintptr) {
throw("entersyscall")
})
}
-
+ trace := traceAcquire()
if trace.ok() {
+ // Emit a trace event. Notably, actually emitting the event must happen before
+ // the casgstatus because it mutates the P, but the traceLocker must be held
+ // across the casgstatus so the transition is atomic with respect to the event.
systemstack(func() {
trace.GoSysCall()
- traceRelease(trace)
})
- // systemstack itself clobbers g.sched.{pc,sp} and we might
- // need them later when the G is genuinely blocked in a
- // syscall
+ // systemstack clobbered gp.sched, so restore it.
save(pc, sp, bp)
}
-
- if sched.sysmonwait.Load() {
- systemstack(entersyscall_sysmon)
+ if sched.gcwaiting.Load() {
+ // Optimization: If there's a pending STW, do the equivalent of
+ // entersyscallblock here at the last minute and immediately give
+ // away our P.
+ systemstack(func() {
+ entersyscallHandleGCWait(trace)
+ })
+ // systemstack clobbered gp.sched, so restore it.
save(pc, sp, bp)
}
-
- if gp.m.p.ptr().runSafePointFn != 0 {
- // runSafePointFn may stack split if run on this stack
- systemstack(runSafePointFn)
+ // As soon as we switch to _Gsyscall, we are in danger of losing our P.
+ // We must not touch it after this point.
+ casgstatus(gp, _Grunning, _Gsyscall)
+ if staticLockRanking {
+ // casgstatus clobbers gp.sched via systemstack under staticLockRanking. Restore it.
save(pc, sp, bp)
}
-
- gp.m.syscalltick = gp.m.p.ptr().syscalltick
- pp := gp.m.p.ptr()
- pp.m = 0
- gp.m.oldp.set(pp)
- gp.m.p = 0
- atomic.Store(&pp.status, _Psyscall)
- if sched.gcwaiting.Load() {
- systemstack(entersyscall_gcwait)
+ if trace.ok() {
+ // N.B. We don't need to go on the systemstack because traceRelease is very
+ // carefully recursively nosplit. This also means we don't need to worry
+ // about clobbering gp.sched.
+ traceRelease(trace)
+ }
+ if sched.sysmonwait.Load() {
+ systemstack(entersyscallWakeSysmon)
+ // systemstack clobbered gp.sched, so restore it.
save(pc, sp, bp)
}
-
gp.m.locks--
}
@@ -4663,7 +4659,7 @@ func entersyscall() {
reentersyscall(sys.GetCallerPC(), sys.GetCallerSP(), fp)
}
-func entersyscall_sysmon() {
+func entersyscallWakeSysmon() {
lock(&sched.lock)
if sched.sysmonwait.Load() {
sched.sysmonwait.Store(false)
@@ -4672,25 +4668,19 @@ func entersyscall_sysmon() {
unlock(&sched.lock)
}
-func entersyscall_gcwait() {
+func entersyscallHandleGCWait(trace traceLocker) {
gp := getg()
- pp := gp.m.oldp.ptr()
lock(&sched.lock)
- trace := traceAcquire()
- if sched.stopwait > 0 && atomic.Cas(&pp.status, _Psyscall, _Pgcstop) {
+ if sched.stopwait > 0 {
+ // Set our P to _Pgcstop so the STW can take it.
+ pp := gp.m.p.ptr()
+ pp.m = 0
+ gp.m.p = 0
+ atomic.Store(&pp.status, _Pgcstop)
+
if trace.ok() {
- // This is a steal in the new tracer. While it's very likely
- // that we were the ones to put this P into _Psyscall, between
- // then and now it's totally possible it had been stolen and
- // then put back into _Psyscall for us to acquire here. In such
- // case ProcStop would be incorrect.
- //
- // TODO(mknyszek): Consider emitting a ProcStop instead when
- // gp.m.syscalltick == pp.syscalltick, since then we know we never
- // lost the P.
- trace.ProcSteal(pp, true)
- traceRelease(trace)
+ trace.ProcStop(pp)
}
sched.nGsyscallNoP.Add(1)
pp.gcStopTime = nanotime()
@@ -4698,8 +4688,6 @@ func entersyscall_gcwait() {
if sched.stopwait--; sched.stopwait == 0 {
notewakeup(&sched.stopnote)
}
- } else if trace.ok() {
- traceRelease(trace)
}
unlock(&sched.lock)
}
@@ -4744,6 +4732,22 @@ func entersyscallblock() {
throw("entersyscallblock")
})
}
+
+ // Once we switch to _Gsyscall, we can't safely touch
+ // our P anymore, so we need to hand it off beforehand.
+ // The tracer also needs to see the syscall before the P
+ // handoff, so the order here must be (1) trace,
+ // (2) handoff, (3) _Gsyscall switch.
+ trace := traceAcquire()
+ systemstack(func() {
+ if trace.ok() {
+ trace.GoSysCall()
+ }
+ handoffp(releasep())
+ })
+ // <--
+ // Caution: we're in a small window where we are in _Grunning without a P.
+ // -->
casgstatus(gp, _Grunning, _Gsyscall)
if gp.syscallsp < gp.stack.lo || gp.stack.hi < gp.syscallsp {
systemstack(func() {
@@ -4757,8 +4761,11 @@ func entersyscallblock() {
throw("entersyscallblock")
})
}
-
- systemstack(entersyscallblock_handoff)
+ if trace.ok() {
+ systemstack(func() {
+ traceRelease(trace)
+ })
+ }
// Resave for traceback during blocked call.
save(sys.GetCallerPC(), sys.GetCallerSP(), getcallerfp())
@@ -4766,15 +4773,6 @@ func entersyscallblock() {
gp.m.locks--
}
-func entersyscallblock_handoff() {
- trace := traceAcquire()
- if trace.ok() {
- trace.GoSysCall()
- traceRelease(trace)
- }
- handoffp(releasep())
-}
-
// The goroutine g exited its system call.
// Arrange for it to run on a cpu again.
// This is called only from the go syscall library, not
@@ -4802,13 +4800,87 @@ func exitsyscall() {
if sys.GetCallerSP() > gp.syscallsp {
throw("exitsyscall: syscall frame is no longer valid")
}
-
gp.waitsince = 0
+
+ if sched.stopwait == freezeStopWait {
+ // Wedge ourselves if there's an outstanding freezetheworld.
+ // If we transition to running, we might end up with our traceback
+ // being taken twice.
+ systemstack(func() {
+ lock(&deadlock)
+ lock(&deadlock)
+ })
+ }
+
+ // Optimistically assume we're going to keep running, and switch to running.
+ // Before this point, our P wiring is not ours. Once we get past this point,
+ // we can access our P if we have it, otherwise we lost it.
+ //
+ // N.B. Because we're transitioning to _Grunning here, traceAcquire doesn't
+ // need to be held ahead of time. We're effectively atomic with respect to
+ // the tracer because we're non-preemptible and in the runtime. It can't stop
+ // us to read a bad status.
+ casgstatus(gp, _Gsyscall, _Grunning)
+
+ // Caution: we're in a window where we may be in _Grunning without a P.
+ // Either we will grab a P or call exitsyscall0, where we'll switch to
+ // _Grunnable.
+
+ // Grab and clear our old P.
oldp := gp.m.oldp.ptr()
- gp.m.oldp = 0
- if exitsyscallfast(oldp) {
- // When exitsyscallfast returns success, we have a P so can now use
- // write barriers
+ gp.m.oldp.set(nil)
+
+ // Check if we still have a P, and if not, try to acquire an idle P.
+ pp := gp.m.p.ptr()
+ if pp != nil {
+ // Fast path: we still have our P. Just emit a syscall exit event.
+ if trace := traceAcquire(); trace.ok() {
+ systemstack(func() {
+ // In truth, we never lost the P, but syscalltick is
+ // used to indicate whether the P should be treated as
+ // lost anyway. For example, when syscalltick is trashed
+ // by dropm.
+ //
+ // TODO(mknyszek): Consider a more explicit mechanism for this.
+ // Then syscalltick doesn't need to be trashed, and can be used
+ // exclusively by sysmon for deciding when it's time to retake.
+ if pp.syscalltick == gp.m.syscalltick {
+ trace.GoSysExit(false)
+ } else {
+ // Since we need to pretend we lost the P, but nobody ever
+ // took it, we need a ProcSteal event to model the loss.
+ // Then, continue with everything else we'd do if we lost
+ // the P.
+ trace.ProcSteal(pp)
+ trace.ProcStart()
+ trace.GoSysExit(true)
+ trace.GoStart()
+ }
+ traceRelease(trace)
+ })
+ }
+ } else {
+ // Slow path: we lost our P. Try to get another one.
+ systemstack(func() {
+ // Try to get some other P.
+ if pp := exitsyscallTryGetP(oldp); pp != nil {
+ // Install the P.
+ acquirepNoTrace(pp)
+
+ // We're going to start running again, so emit all the relevant events.
+ if trace := traceAcquire(); trace.ok() {
+ trace.ProcStart()
+ trace.GoSysExit(true)
+ trace.GoStart()
+ traceRelease(trace)
+ }
+ }
+ })
+ pp = gp.m.p.ptr()
+ }
+
+ // If we have a P, clean up and exit.
+ if pp != nil {
if goroutineProfile.active {
// Make sure that gp has had its stack written out to the goroutine
// profile, exactly as it was when the goroutine profiler first
@@ -4817,41 +4889,19 @@ func exitsyscall() {
tryRecordGoroutineProfileWB(gp)
})
}
- trace := traceAcquire()
- if trace.ok() {
- lostP := oldp != gp.m.p.ptr() || gp.m.syscalltick != gp.m.p.ptr().syscalltick
- systemstack(func() {
- // Write out syscall exit eagerly.
- //
- // It's important that we write this *after* we know whether we
- // lost our P or not (determined by exitsyscallfast).
- trace.GoSysExit(lostP)
- if lostP {
- // We lost the P at some point, even though we got it back here.
- // Trace that we're starting again, because there was a tracev2.GoSysBlock
- // call somewhere in exitsyscallfast (indicating that this goroutine
- // had blocked) and we're about to start running again.
- trace.GoStart()
- }
- })
- }
- // There's a cpu for us, so we can run.
- gp.m.p.ptr().syscalltick++
- // We need to cas the status and scan before resuming...
- casgstatus(gp, _Gsyscall, _Grunning)
- if trace.ok() {
- traceRelease(trace)
- }
+
+ // Increment the syscalltick for P, since we're exiting a syscall.
+ pp.syscalltick++
// Garbage collector isn't running (since we are),
// so okay to clear syscallsp.
gp.syscallsp = 0
gp.m.locks--
if gp.preempt {
- // restore the preemption request in case we've cleared it in newstack
+ // Restore the preemption request in case we cleared it in newstack.
gp.stackguard0 = stackPreempt
} else {
- // otherwise restore the real stackGuard, we've spoiled it in entersyscall/entersyscallblock
+ // Otherwise restore the real stackGuard, we clobbered it in entersyscall/entersyscallblock.
gp.stackguard0 = gp.stack.lo + stackGuard
}
gp.throwsplit = false
@@ -4860,14 +4910,13 @@ func exitsyscall() {
// Scheduling of this goroutine is disabled.
Gosched()
}
-
return
}
-
+ // Slowest path: We couldn't get a P, so call into the scheduler.
gp.m.locks--
// Call the scheduler.
- mcall(exitsyscall0)
+ mcall(exitsyscallNoP)
// Scheduler returned, so we're allowed to run now.
// Delete the syscallsp information that we left for
@@ -4880,78 +4929,38 @@ func exitsyscall() {
gp.throwsplit = false
}
-//go:nosplit
-func exitsyscallfast(oldp *p) bool {
- // Freezetheworld sets stopwait but does not retake P's.
- if sched.stopwait == freezeStopWait {
- return false
- }
-
- // Try to re-acquire the last P.
- trace := traceAcquire()
- if oldp != nil && oldp.status == _Psyscall && atomic.Cas(&oldp.status, _Psyscall, _Pidle) {
- // There's a cpu for us, so we can run.
- wirep(oldp)
- exitsyscallfast_reacquired(trace)
- if trace.ok() {
- traceRelease(trace)
+// exitsyscallTryGetP is exitsyscall's attempt to get any P, if it's
+// missing one. Returns the P on success, or nil on failure.
+//
+// Must execute on the systemstack because exitsyscall is nosplit.
+//
+//go:systemstack
+func exitsyscallTryGetP(oldp *p) *p {
+ // Try to steal our old P back.
+ if oldp != nil {
+ if thread, ok := setBlockOnExitSyscall(oldp); ok {
+ thread.takeP()
+ thread.resume()
+ sched.nGsyscallNoP.Add(-1) // takeP adds 1.
+ return oldp
}
- return true
- }
- if trace.ok() {
- traceRelease(trace)
}
- // Try to get any other idle P.
+ // Try to get an idle P.
if sched.pidle != 0 {
- var ok bool
- systemstack(func() {
- ok = exitsyscallfast_pidle()
- })
- if ok {
- return true
+ lock(&sched.lock)
+ pp, _ := pidleget(0)
+ if pp != nil && sched.sysmonwait.Load() {
+ sched.sysmonwait.Store(false)
+ notewakeup(&sched.sysmonnote)
}
- }
- return false
-}
-
-// exitsyscallfast_reacquired is the exitsyscall path on which this G
-// has successfully reacquired the P it was running on before the
-// syscall.
-//
-//go:nosplit
-func exitsyscallfast_reacquired(trace traceLocker) {
- gp := getg()
- if gp.m.syscalltick != gp.m.p.ptr().syscalltick {
- if trace.ok() {
- // The p was retaken and then enter into syscall again (since gp.m.syscalltick has changed).
- // tracev2.GoSysBlock for this syscall was already emitted,
- // but here we effectively retake the p from the new syscall running on the same p.
- systemstack(func() {
- // We're stealing the P. It's treated
- // as if it temporarily stopped running. Then, start running.
- trace.ProcSteal(gp.m.p.ptr(), true)
- trace.ProcStart()
- })
+ unlock(&sched.lock)
+ if pp != nil {
+ sched.nGsyscallNoP.Add(-1)
+ return pp
}
- gp.m.p.ptr().syscalltick++
- }
-}
-
-func exitsyscallfast_pidle() bool {
- lock(&sched.lock)
- pp, _ := pidleget(0)
- if pp != nil && sched.sysmonwait.Load() {
- sched.sysmonwait.Store(false)
- notewakeup(&sched.sysmonnote)
}
- unlock(&sched.lock)
- if pp != nil {
- sched.nGsyscallNoP.Add(-1)
- acquirep(pp)
- return true
- }
- return false
+ return nil
}
// exitsyscall slow path on g0.
@@ -4960,11 +4969,10 @@ func exitsyscallfast_pidle() bool {
// Called via mcall, so gp is the calling g from this M.
//
//go:nowritebarrierrec
-func exitsyscall0(gp *g) {
- var trace traceLocker
+func exitsyscallNoP(gp *g) {
traceExitingSyscall()
- trace = traceAcquire()
- casgstatus(gp, _Gsyscall, _Grunnable)
+ trace := traceAcquire()
+ casgstatus(gp, _Grunning, _Grunnable)
traceExitedSyscall()
if trace.ok() {
// Write out syscall exit eagerly.
@@ -6021,6 +6029,21 @@ func procresize(nprocs int32) *p {
//
//go:yeswritebarrierrec
func acquirep(pp *p) {
+ // Do the work.
+ acquirepNoTrace(pp)
+
+ // Emit the event.
+ trace := traceAcquire()
+ if trace.ok() {
+ trace.ProcStart()
+ traceRelease(trace)
+ }
+}
+
+// Internals of acquirep, just skipping the trace events.
+//
+//go:yeswritebarrierrec
+func acquirepNoTrace(pp *p) {
// Do the part that isn't allowed to have write barriers.
wirep(pp)
@@ -6029,12 +6052,6 @@ func acquirep(pp *p) {
// Perform deferred mcache flush before this P can allocate
// from a potentially stale mcache.
pp.mcache.prepareForSweep()
-
- trace := traceAcquire()
- if trace.ok() {
- trace.ProcStart()
- traceRelease(trace)
- }
}
// wirep is the first step of acquirep, which actually associates the
@@ -6382,73 +6399,205 @@ func retake(now int64) uint32 {
// temporarily drop the allpLock. Hence, we need to re-fetch
// allp each time around the loop.
for i := 0; i < len(allp); i++ {
+ // Quickly filter out non-running Ps. Running Ps are either
+ // in a syscall or are actually executing. Idle Ps don't
+ // need to be retaken.
+ //
+ // This is best-effort, so it's OK that it's racy. Our target
+ // is to retake Ps that have been running or in a syscall for
+ // a long time (milliseconds), so the state has plenty of time
+ // to stabilize.
pp := allp[i]
- if pp == nil {
- // This can happen if procresize has grown
+ if pp == nil || atomic.Load(&pp.status) != _Prunning {
+ // pp can be nil if procresize has grown
// allp but not yet created new Ps.
continue
}
pd := &pp.sysmontick
- s := pp.status
sysretake := false
- if s == _Prunning || s == _Psyscall {
- // Preempt G if it's running on the same schedtick for
- // too long. This could be from a single long-running
- // goroutine or a sequence of goroutines run via
- // runnext, which share a single schedtick time slice.
- t := int64(pp.schedtick)
- if int64(pd.schedtick) != t {
- pd.schedtick = uint32(t)
- pd.schedwhen = now
- } else if pd.schedwhen+forcePreemptNS <= now {
- preemptone(pp)
- // In case of syscall, preemptone() doesn't
- // work, because there is no M wired to P.
- sysretake = true
- }
+
+ // Preempt G if it's running on the same schedtick for
+ // too long. This could be from a single long-running
+ // goroutine or a sequence of goroutines run via
+ // runnext, which share a single schedtick time slice.
+ schedt := int64(pp.schedtick)
+ if int64(pd.schedtick) != schedt {
+ pd.schedtick = uint32(schedt)
+ pd.schedwhen = now
+ } else if pd.schedwhen+forcePreemptNS <= now {
+ preemptone(pp)
+ // If pp is in a syscall, preemptone doesn't work.
+ // Neither the goroutine nor the thread can respond
+ // to a preemption request because they're not in Go
+ // code, so we need to take the P ourselves.
+ sysretake = true
}
- if s == _Psyscall {
- // Retake P from syscall if it's there for more than 1 sysmon tick (at least 20us).
- t := int64(pp.syscalltick)
- if !sysretake && int64(pd.syscalltick) != t {
- pd.syscalltick = uint32(t)
- pd.syscallwhen = now
- continue
- }
- // On the one hand we don't want to retake Ps if there is no other work to do,
- // but on the other hand we want to retake them eventually
- // because they can prevent the sysmon thread from deep sleep.
- if runqempty(pp) && sched.nmspinning.Load()+sched.npidle.Load() > 0 && pd.syscallwhen+10*1000*1000 > now {
- continue
- }
- // Drop allpLock so we can take sched.lock.
- unlock(&allpLock)
- // Need to decrement number of idle locked M's
- // (pretending that one more is running) before the CAS.
- // Otherwise the M from which we retake can exit the syscall,
- // increment nmidle and report deadlock.
- incidlelocked(-1)
- trace := traceAcquire()
- if atomic.Cas(&pp.status, s, _Pidle) {
- if trace.ok() {
- trace.ProcSteal(pp, false)
- traceRelease(trace)
- }
- sched.nGsyscallNoP.Add(1)
- n++
- pp.syscalltick++
- handoffp(pp)
- } else if trace.ok() {
- traceRelease(trace)
- }
- incidlelocked(1)
- lock(&allpLock)
+
+ // Drop allpLock so we can take sched.lock.
+ unlock(&allpLock)
+
+ // Need to decrement number of idle locked M's (pretending that
+ // one more is running) before we take the P and resume.
+ // Otherwise the M from which we retake can exit the syscall,
+ // increment nmidle and report deadlock.
+ //
+ // Can't call incidlelocked once we setBlockOnExitSyscall, due
+ // to a lock ordering violation between sched.lock and _Gscan.
+ incidlelocked(-1)
+
+ // Try to prevent the P from continuing in the syscall, if it's in one at all.
+ thread, ok := setBlockOnExitSyscall(pp)
+ if !ok {
+ // Not in a syscall, or something changed out from under us.
+ goto done
}
+
+ // Retake the P if it's there for more than 1 sysmon tick (at least 20us).
+ if syst := int64(pp.syscalltick); !sysretake && int64(pd.syscalltick) != syst {
+ pd.syscalltick = uint32(syst)
+ pd.syscallwhen = now
+ thread.resume()
+ goto done
+ }
+
+ // On the one hand we don't want to retake Ps if there is no other work to do,
+ // but on the other hand we want to retake them eventually
+ // because they can prevent the sysmon thread from deep sleep.
+ if runqempty(pp) && sched.nmspinning.Load()+sched.npidle.Load() > 0 && pd.syscallwhen+10*1000*1000 > now {
+ thread.resume()
+ goto done
+ }
+
+ // Take the P. Note: because we have the scan bit, the goroutine
+ // is at worst stuck spinning in exitsyscall.
+ thread.takeP()
+ thread.resume()
+ n++
+
+ // Handoff the P for some other thread to run it.
+ handoffp(pp)
+
+ // The P has been handed off to another thread, so the risk of
+ // a false deadlock report while we hold onto it is gone.
+ done:
+ incidlelocked(1)
+ lock(&allpLock)
}
unlock(&allpLock)
return uint32(n)
}
+// syscallingThread represents a thread in a system call that temporarily
+// cannot advance out of the system call.
+type syscallingThread struct {
+ gp *g
+ mp *m
+ pp *p
+ status uint32
+}
+
+// setBlockOnExitSyscall prevents pp's thread from advancing out of
+// exitsyscall. On success, returns the g/m/p state of the thread
+// and true. At that point, the caller owns the g/m/p links referenced,
+// the goroutine is in _Gsyscall and is prevented from transitioning out
+// of it. On failure, it returns false, and none of these guarantees are
+// made.
+//
+// Callers must call resume on the resulting thread state once they're
+// done with the thread; otherwise it will remain blocked forever.
+//
+// This function races with state changes on pp, and thus may fail
+// if pp is not in a system call, or exits a system call concurrently
+// with this function. However, this function is safe to call without
+// any additional synchronization.
+func setBlockOnExitSyscall(pp *p) (syscallingThread, bool) {
+ if pp.status != _Prunning {
+ return syscallingThread{}, false
+ }
+ // Be very careful here: these reads are intentionally racy.
+ // Once we notice the G is in _Gsyscall, acquire its scan bit,
+ // and validate that it's still connected to the *same* M and P,
+ // we can actually get to work. Holding the scan bit will prevent
+ // the G from exiting the syscall.
+ //
+ // Our goal here is to interrupt long syscalls. If it turns out
+ // that we're wrong and the G switched to another syscall while
+ // we were trying to do this, that's completely fine. It's
+ // probably making more frequent syscalls and the typical
+ // preemption paths should be effective.
+ mp := pp.m.ptr()
+ if mp == nil {
+ // Nothing to do.
+ return syscallingThread{}, false
+ }
+ gp := mp.curg
+ if gp == nil {
+ // Nothing to do.
+ return syscallingThread{}, false
+ }
+ status := readgstatus(gp) &^ _Gscan
+
+ // A goroutine is considered in a syscall, and may have a corresponding
+ // P, if it's in _Gsyscall *or* _Gdeadextra. In the latter case, it's an
+ // extra M goroutine.
+ if status != _Gsyscall && status != _Gdeadextra {
+ // Not in a syscall, nothing to do.
+ return syscallingThread{}, false
+ }
+ if !castogscanstatus(gp, status, status|_Gscan) {
+ // Not in _Gsyscall or _Gdeadextra anymore. Nothing to do.
+ return syscallingThread{}, false
+ }
+ if gp.m != mp || gp.m.p.ptr() != pp {
+ // This is not what we originally observed. Nothing to do.
+ casfrom_Gscanstatus(gp, status|_Gscan, status)
+ return syscallingThread{}, false
+ }
+ return syscallingThread{gp, mp, pp, status}, true
+}
+
+// gcstopP unwires the P attached to the syscalling thread
+// and moves it into the _Pgcstop state.
+//
+// The caller must be stopping the world.
+func (s syscallingThread) gcstopP() {
+ assertLockHeld(&sched.lock)
+
+ s.releaseP(_Pgcstop)
+ s.pp.gcStopTime = nanotime()
+ sched.stopwait--
+}
+
+// takeP unwires the P attached to the syscalling thread
+// and moves it into the _Pidle state.
+func (s syscallingThread) takeP() {
+ s.releaseP(_Pidle)
+}
+
+// releaseP unwires the P from the syscalling thread, moving
+// it to the provided state. Callers should prefer to use
+// takeP and gcstopP.
+func (s syscallingThread) releaseP(state uint32) {
+ if state != _Pidle && state != _Pgcstop {
+ throw("attempted to release P into a bad state")
+ }
+ trace := traceAcquire()
+ s.pp.m = 0
+ s.mp.p = 0
+ atomic.Store(&s.pp.status, state)
+ if trace.ok() {
+ trace.ProcSteal(s.pp)
+ traceRelease(trace)
+ }
+ sched.nGsyscallNoP.Add(1)
+ s.pp.syscalltick++
+}
+
+// resume allows a syscalling thread to advance beyond exitsyscall.
+func (s syscallingThread) resume() {
+ casfrom_Gscanstatus(s.gp, s.status|_Gscan, s.status)
+}
+
// Tell all goroutines that they have been preempted and they should stop.
// This function is purely best-effort. It can fail to inform a goroutine if a
// processor just started running it.
@@ -6486,6 +6635,10 @@ func preemptone(pp *p) bool {
if gp == nil || gp == mp.g0 {
return false
}
+ if readgstatus(gp)&^_Gscan == _Gsyscall {
+ // Don't bother trying to preempt a goroutine in a syscall.
+ return false
+ }
gp.preempt = true
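
All of the call sites above (stopTheWorldWithSema, forEachPInternal, retake, exitsyscallTryGetP) reduce to the same pin/steal/resume sequence around setBlockOnExitSyscall. A compact runnable model of that protocol, with hypothetical types; the scan bit doubles as a lock that keeps the goroutine stuck in exitsyscall while its P is taken:

package main

import (
	"fmt"
	"sync/atomic"
)

const (
	gSyscall uint32 = 3
	gScan    uint32 = 0x1000
)

type goroutine struct{ status atomic.Uint32 }
type processor struct{ owner *goroutine }

// pin tries to block gp from leaving its syscall by setting the scan
// bit. It fails if gp already moved on, mirroring the racy-but-safe
// checks in setBlockOnExitSyscall.
func pin(gp *goroutine) bool {
	return gp.status.CompareAndSwap(gSyscall, gSyscall|gScan)
}

// resume clears the scan bit, letting gp proceed past exitsyscall.
func resume(gp *goroutine) { gp.status.Store(gSyscall) }

func main() {
	gp := &goroutine{}
	gp.status.Store(gSyscall)
	pp := &processor{owner: gp}

	if pin(gp) {
		// Safe window: gp cannot exit the syscall, so stealing its P
		// (the takeP step) cannot race with the goroutine rewiring it.
		pp.owner = nil
		resume(gp)
	}
	fmt.Println("stolen:", pp.owner == nil) // stolen: true
}

The real implementation additionally re-validates the g/m/p wiring after acquiring the scan bit, since all of the initial reads are intentionally racy.
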
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index 721fae0b28..b346337d60 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -42,12 +42,16 @@ const (
// _Grunning means this goroutine may execute user code. The
// stack is owned by this goroutine. It is not on a run queue.
- // It is assigned an M and a P (g.m and g.m.p are valid).
+ // It is assigned an M (g.m is valid) and it usually has a P
+ // (g.m.p is valid), but there are small windows of time where
+ // it might not, namely upon entering and exiting _Gsyscall.
_Grunning // 2
// _Gsyscall means this goroutine is executing a system call.
// It is not executing user code. The stack is owned by this
// goroutine. It is not on a run queue. It is assigned an M.
+ // It may have a P attached, but it does not own it. Code
+ // executing in this state must not touch g.m.p.
_Gsyscall // 3
// _Gwaiting means this goroutine is blocked in the runtime.
@@ -131,22 +135,15 @@ const (
// run user code or the scheduler. Only the M that owns this P
// is allowed to change the P's status from _Prunning. The M
// may transition the P to _Pidle (if it has no more work to
- // do), _Psyscall (when entering a syscall), or _Pgcstop (to
- // halt for the GC). The M may also hand ownership of the P
- // off directly to another M (e.g., to schedule a locked G).
+ // do), or _Pgcstop (to halt for the GC). The M may also hand
+ // ownership of the P off directly to another M (for example,
+ // to schedule a locked G).
_Prunning
- // _Psyscall means a P is not running user code. It has
- // affinity to an M in a syscall but is not owned by it and
- // may be stolen by another M. This is similar to _Pidle but
- // uses lightweight transitions and maintains M affinity.
- //
- // Leaving _Psyscall must be done with a CAS, either to steal
- // or retake the P. Note that there's an ABA hazard: even if
- // an M successfully CASes its original P back to _Prunning
- // after a syscall, it must understand the P may have been
- // used by another M in the interim.
- _Psyscall
+ // _Psyscall_unused is a now-defunct state for a P. A P is
+ // identified as "in a system call" by looking at the goroutine's
+ // state.
+ _Psyscall_unused
// _Pgcstop means a P is halted for STW and owned by the M
// that stopped the world. The M that stopped the world
@@ -620,18 +617,27 @@ type m struct {
morebuf gobuf // gobuf arg to morestack
divmod uint32 // div/mod denominator for arm - known to liblink (cmd/internal/obj/arm/obj5.go)
- // Fields not known to debuggers.
- procid uint64 // for debuggers, but offset not hard-coded
- gsignal *g // signal-handling g
- goSigStack gsignalStack // Go-allocated signal handling stack
- sigmask sigset // storage for saved signal mask
- tls [tlsSlots]uintptr // thread-local storage (for x86 extern register)
- mstartfn func()
- curg *g // current running goroutine
- caughtsig guintptr // goroutine running during fatal signal
- p puintptr // attached p for executing go code (nil if not executing go code)
- nextp puintptr
- oldp puintptr // the p that was attached before executing a syscall
+ // Fields whose offsets are not known to debuggers.
+
+ procid uint64 // for debuggers, but offset not hard-coded
+ gsignal *g // signal-handling g
+ goSigStack gsignalStack // Go-allocated signal handling stack
+ sigmask sigset // storage for saved signal mask
+ tls [tlsSlots]uintptr // thread-local storage (for x86 extern register)
+ mstartfn func()
+ curg *g // current running goroutine
+ caughtsig guintptr // goroutine running during fatal signal
+
+ // p is the currently attached P for executing Go code, nil if not executing user Go code.
+ //
+ // A non-nil p implies exclusive ownership of the P, unless curg is in _Gsyscall.
+ // In _Gsyscall the scheduler may mutate this instead. The point of synchronization
+ // is the _Gscan bit on curg's status. The scheduler must arrange to prevent curg
+ // from transitioning out of _Gsyscall if it intends to mutate p.
+ p puintptr
+
+ nextp puintptr // The next P to install before executing. Implies exclusive ownership of this P.
+ oldp puintptr // The P that was attached before executing a syscall.
id int64
mallocing int32
throwing throwType
diff --git a/src/runtime/traceevent.go b/src/runtime/traceevent.go
index b0bc4c017d..8651275476 100644
--- a/src/runtime/traceevent.go
+++ b/src/runtime/traceevent.go
@@ -44,6 +44,13 @@ func (tl traceLocker) eventWriter(goStatus tracev2.GoStatus, procStatus tracev2.
if gp := tl.mp.curg; gp != nil && !gp.trace.statusWasTraced(tl.gen) && gp.trace.acquireStatus(tl.gen) {
tl.writer().writeGoStatus(gp.goid, int64(tl.mp.procid), goStatus, gp.inMarkAssist, 0 /* no stack */).end()
}
+ return tl.rawEventWriter()
+}
+
+// rawEventWriter creates a new traceEventWriter without emitting any status events.
+//
+// It is the caller's responsibility to emit any status events, if necessary.
+func (tl traceLocker) rawEventWriter() traceEventWriter {
return traceEventWriter{tl}
}
diff --git a/src/runtime/traceruntime.go b/src/runtime/traceruntime.go
index 7df6f68b70..d3f82f6fce 100644
--- a/src/runtime/traceruntime.go
+++ b/src/runtime/traceruntime.go
@@ -534,10 +534,8 @@ func (tl traceLocker) GoSysExit(lostP bool) {
// ProcSteal indicates that our current M stole a P from another M.
//
-// inSyscall indicates that we're stealing the P from a syscall context.
-//
// The caller must have ownership of pp.
-func (tl traceLocker) ProcSteal(pp *p, inSyscall bool) {
+func (tl traceLocker) ProcSteal(pp *p) {
// Grab the M ID we stole from.
mStolenFrom := pp.trace.mSyscallID
pp.trace.mSyscallID = -1
@@ -561,7 +559,7 @@ func (tl traceLocker) ProcSteal(pp *p, inSyscall bool) {
// In the latter, we're a goroutine in a syscall.
goStatus := tracev2.GoRunning
procStatus := tracev2.ProcRunning
- if inSyscall {
+ if tl.mp.curg != nil && tl.mp.curg.syscallsp != 0 {
goStatus = tracev2.GoSyscall
procStatus = tracev2.ProcSyscallAbandoned
}
@@ -595,19 +593,27 @@ func (tl traceLocker) GoCreateSyscall(gp *g) {
// N.B. We should never trace a status for this goroutine (which we're currently running on),
// since we want this to appear like goroutine creation.
gp.trace.setStatusTraced(tl.gen)
- tl.eventWriter(tracev2.GoBad, tracev2.ProcBad).event(tracev2.EvGoCreateSyscall, traceArg(gp.goid))
+
+ // We might have a P left over on the thread from the last cgo callback,
+ // but in a syscall context, it is NOT ours. Act as if we do not have a P,
+ // and don't record a status.
+ tl.rawEventWriter().event(tracev2.EvGoCreateSyscall, traceArg(gp.goid))
}
// GoDestroySyscall indicates that a goroutine has transitioned from GoSyscall to dead.
//
-// Must not have a P.
-//
// This occurs when Go code returns back to C. On pthread platforms it occurs only when
// the C thread is destroyed.
func (tl traceLocker) GoDestroySyscall() {
- // N.B. If we trace a status here, we must never have a P, and we must be on a goroutine
- // that is in the syscall state.
- tl.eventWriter(tracev2.GoSyscall, tracev2.ProcBad).event(tracev2.EvGoDestroySyscall)
+ // Write the status for the goroutine if necessary.
+ if gp := tl.mp.curg; gp != nil && !gp.trace.statusWasTraced(tl.gen) && gp.trace.acquireStatus(tl.gen) {
+ tl.writer().writeGoStatus(gp.goid, int64(tl.mp.procid), tracev2.GoSyscall, false, 0 /* no stack */).end()
+ }
+
+ // We might have a P left over on the thread from the last cgo callback,
+ // but in a syscall context, it is NOT ours. Act as if we do not have a P,
+ // and don't record a status.
+ tl.rawEventWriter().event(tracev2.EvGoDestroySyscall)
}
// To access runtime functions from runtime/trace.
diff --git a/src/runtime/tracestatus.go b/src/runtime/tracestatus.go
index f09bdfd355..6343126faf 100644
--- a/src/runtime/tracestatus.go
+++ b/src/runtime/tracestatus.go
@@ -62,15 +62,14 @@ func (w traceWriter) writeProcStatusForP(pp *p, inSTW bool) traceWriter {
}
case _Prunning:
status = tracev2.ProcRunning
- // There's a short window wherein the goroutine may have entered _Gsyscall
- // but it still owns the P (it's not in _Psyscall yet). The goroutine entering
- // _Gsyscall is the tracer's signal that the P its bound to is also in a syscall,
- // so we need to emit a status that matches. See #64318.
+ // A P is considered to be in a syscall if its attached G is. Since we fully
+ // own the P, the goroutine isn't going to transition, so we can trivially
+ // check whether it is in a syscall. This used to be just a small problematic
+ // window, but it is now the default since _Psyscall no longer exists. See
+ // #64318 for the history of why this was needed while _Psyscall existed.
if w.mp.p.ptr() == pp && w.mp.curg != nil && readgstatus(w.mp.curg)&^_Gscan == _Gsyscall {
status = tracev2.ProcSyscall
}
- case _Psyscall:
- status = tracev2.ProcSyscall
default:
throw("attempt to trace invalid or unsupported P status")
}