From 02b8e6978a86c2f4f3a604e8b05014d127f4020a Mon Sep 17 00:00:00 2001
From: Jeremy Jackins
Date: Wed, 13 Apr 2016 18:16:21 +0900
Subject: runtime: find a home for orphaned comments

These comments were left behind after runtime.h was converted from C
to Go. I examined the original code and tried to move these to the
places that make the most sense.

Change-Id: I8769d60234c0113d682f9de3bd8d6c34c450c188
Reviewed-on: https://go-review.googlesource.com/21969
Reviewed-by: Matthew Dempsky
Run-TryBot: Matthew Dempsky
TryBot-Result: Gobot Gobot
---
 src/runtime/runtime2.go | 107 ++++++++++++++++--------------------------------
 1 file changed, 35 insertions(+), 72 deletions(-)

(limited to 'src/runtime/runtime2.go')

diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index e0137f7e97..0fdea400de 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -10,9 +10,7 @@ import (
 	"unsafe"
 )
 
-/*
- * defined constants
- */
+// defined constants
 const (
 	// G status
 	//
@@ -99,6 +97,10 @@ const (
 	_Pdead
 )
 
+// Mutual exclusion locks. In the uncontended case,
+// as fast as spin locks (just a few user-level instructions),
+// but on the contention path they sleep in the kernel.
+// A zeroed Mutex is unlocked (no need to initialize each lock).
 type mutex struct {
 	// Futex-based impl treats it as uint32 key,
 	// while sema-based impl as M* waitm.
@@ -106,6 +108,26 @@ type mutex struct {
 	key uintptr
 }
 
+// sleep and wakeup on one-time events.
+// before any calls to notesleep or notewakeup,
+// must call noteclear to initialize the Note.
+// then, exactly one thread can call notesleep
+// and exactly one thread can call notewakeup (once).
+// once notewakeup has been called, the notesleep
+// will return. future notesleep will return immediately.
+// subsequent noteclear must be called only after
+// previous notesleep has returned, e.g. it's disallowed
+// to call noteclear straight after notewakeup.
+//
+// notetsleep is like notesleep but wakes up after
+// a given number of nanoseconds even if the event
+// has not yet happened. if a goroutine uses notetsleep to
+// wake up early, it must wait to call noteclear until it
+// can be sure that no other goroutine is calling
+// notewakeup.
+//
+// notesleep/notetsleep are generally called on g0,
+// notetsleepg is similar to notetsleep but is called on user g.
 type note struct {
 	// Futex-based impl treats it as uint32 key,
 	// while sema-based impl as M* waitm.
 	key uintptr
 }
@@ -397,8 +419,8 @@ type m struct {
 	waittraceskip int
 	startingtrace bool
 	syscalltick   uint32
-	//#ifdef GOOS_windows
-	thread        uintptr // thread handle
+	thread        uintptr // thread handle
+
 	// these are here because they are too large to be on the stack
 	// of low-level NOSPLIT functions.
 	libcall   libcall
@@ -406,7 +428,7 @@ type m struct {
 	libcallsp uintptr
 	libcallg  guintptr
 	syscall   libcall // stores syscall parameters on windows
-	//#endif
+
 	mOS
 }
@@ -530,10 +552,10 @@ type schedt struct {
 	totaltime int64 // ∫gomaxprocs dt up to procresizetime
 }
 
-// The m->locked word holds two pieces of state counting active calls to LockOSThread/lockOSThread.
+// The m.locked word holds two pieces of state counting active calls to LockOSThread/lockOSThread.
 // The low bit (LockExternal) is a boolean reporting whether any LockOSThread call is active.
 // External locks are not recursive; a second lock is silently ignored.
-// The upper bits of m->locked record the nesting depth of calls to lockOSThread +// The upper bits of m.locked record the nesting depth of calls to lockOSThread // (counting up by LockInternal), popped by unlockOSThread (counting down by LockInternal). // Internal locks can be recursive. For instance, a lock for cgo can occur while the main // goroutine is holding the lock during the initialization phase. @@ -603,13 +625,6 @@ type forcegcstate struct { idle uint32 } -/* - * known to compiler - */ -const ( - _Structrnd = sys.RegSize -) - // startup_random_data holds random bytes initialized at startup. These come from // the ELF AT_RANDOM auxiliary vector (vdso_linux_amd64.go or os_linux_386.go). var startupRandomData []byte @@ -635,9 +650,7 @@ func extendRandom(r []byte, n int) { } } -/* - * deferred subroutine calls - */ +// deferred subroutine calls type _defer struct { siz int32 started bool @@ -648,9 +661,7 @@ type _defer struct { link *_defer } -/* - * panics - */ +// panics type _panic struct { argp unsafe.Pointer // pointer to arguments of deferred call run during panic; cannot move - known to liblink arg interface{} // argument to panic @@ -659,10 +670,7 @@ type _panic struct { aborted bool // the panic was aborted } -/* - * stack traces - */ - +// stack traces type stkframe struct { fn *_func // function being run pc uintptr // program counter within fn @@ -682,10 +690,8 @@ const ( _TraceJumpStack // if traceback is on a systemstack, resume trace at g that called into it ) -const ( - // The maximum number of frames we print for a traceback - _TracebackMaxFrames = 100 -) +// The maximum number of frames we print for a traceback +const _TracebackMaxFrames = 100 var ( emptystring string @@ -716,46 +722,3 @@ var ( islibrary bool // -buildmode=c-shared isarchive bool // -buildmode=c-archive ) - -/* - * mutual exclusion locks. in the uncontended case, - * as fast as spin locks (just a few user-level instructions), - * but on the contention path they sleep in the kernel. - * a zeroed Mutex is unlocked (no need to initialize each lock). - */ - -/* - * sleep and wakeup on one-time events. - * before any calls to notesleep or notewakeup, - * must call noteclear to initialize the Note. - * then, exactly one thread can call notesleep - * and exactly one thread can call notewakeup (once). - * once notewakeup has been called, the notesleep - * will return. future notesleep will return immediately. - * subsequent noteclear must be called only after - * previous notesleep has returned, e.g. it's disallowed - * to call noteclear straight after notewakeup. - * - * notetsleep is like notesleep but wakes up after - * a given number of nanoseconds even if the event - * has not yet happened. if a goroutine uses notetsleep to - * wake up early, it must wait to call noteclear until it - * can be sure that no other goroutine is calling - * notewakeup. - * - * notesleep/notetsleep are generally called on g0, - * notetsleepg is similar to notetsleep but is called on user g. - */ -// bool runtime·notetsleep(Note*, int64); // false - timeout -// bool runtime·notetsleepg(Note*, int64); // false - timeout - -/* - * Lock-free stack. - * Initialize uint64 head to 0, compare with 0 to test for emptiness. - * The stack does not keep pointers to nodes, - * so they can be garbage collected if there are no other pointers to nodes. - */ - -// for mmap, we only pass the lower 32 bits of file offset to the -// assembly routine; the higher bits (if required), should be provided -// by the assembly routine as 0. 
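To make the note protocol above concrete: a runnable, user-space analogue of the one-time-event discipline the comment describes. The runtime's noteclear/notesleep/notewakeup are unexported, so this stand-in models them with a channel; the names and types here are illustrative only, not the runtime's implementation:

	package main

	// note is a toy analogue of the runtime's note: a one-time event.
	type note struct{ c chan struct{} }

	func noteclear(n *note)  { n.c = make(chan struct{}) } // must run before any sleep/wakeup
	func notewakeup(n *note) { close(n.c) }                // exactly one wakeup per clear
	func notesleep(n *note)  { <-n.c }                     // returns once the wakeup happened

	func main() {
		var n note
		noteclear(&n)
		go notewakeup(&n) // the signaling thread
		notesleep(&n)     // blocks until the wakeup; later sleeps return immediately
		// Only now, after notesleep has returned, may the note be
		// reused: calling noteclear right after notewakeup is illegal.
		noteclear(&n)
	}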
-- cgit v1.3

From a3703618eadeb74b60f2cb9a23fabe178d4b141d Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov
Date: Tue, 5 Apr 2016 15:29:14 +0200
Subject: runtime: use per-goroutine sequence numbers in tracer

Currently the tracer uses a global sequencer, which introduces a
significant slowdown on parallel machines (up to 10x). Replace the
global sequencer with per-goroutine sequencers. Assigning per-goroutine
sequence numbers to only 3 types of events (start, unblock and syscall
exit) is enough to restore a consistent partial ordering of all events.
Even these events don't need sequence numbers all the time (if a
goroutine starts on the same P where it was unblocked, the start does
not need a sequence number). The burden of restoring the order is put
on the trace parser. Details of the algorithm are described in the
comments.

On an http benchmark with GOMAXPROCS=48:
no tracing: 5026 ns/op
tracing: 27803 ns/op (+453%)
with this change: 6369 ns/op (+26%, mostly for traceback)

Trace size is also reduced by ~22%. Average event size before:
4.63 bytes/event, after: 3.62 bytes/event.

Besides running the trace tests, I've also tested with manually broken
cputicks (random skew for each event, per-P skew and episodic random
skew). In all cases the broken timestamps were detected and there were
no test failures.

Change-Id: I078bde421ccc386a66f6c2051ab207bcd5613efa
Reviewed-on: https://go-review.googlesource.com/21512
Run-TryBot: Dmitry Vyukov
Reviewed-by: Austin Clements
TryBot-Result: Gobot Gobot
---
 src/internal/trace/order.go       | 278 ++++++++++++++++++++++++++++++++++++++
 src/internal/trace/parser.go      | 208 +++++++++++++++-------------
 src/internal/trace/parser_test.go |   8 +-
 src/runtime/proc.go               |  25 +---
 src/runtime/runtime2.go           |  21 +--
 src/runtime/trace.go              | 121 +++++++++--------
 6 files changed, 478 insertions(+), 183 deletions(-)
 create mode 100644 src/internal/trace/order.go

(limited to 'src/runtime/runtime2.go')

diff --git a/src/internal/trace/order.go b/src/internal/trace/order.go
new file mode 100644
index 0000000000..f9ec44c745
--- /dev/null
+++ b/src/internal/trace/order.go
@@ -0,0 +1,278 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package trace
+
+import (
+	"fmt"
+	"sort"
+)
+
+type eventBatch struct {
+	events   []*Event
+	selected bool
+}
+
+type orderEvent struct {
+	ev    *Event
+	batch int
+	g     uint64
+	init  gState
+	next  gState
+}
+
+type gStatus int
+
+type gState struct {
+	seq    uint64
+	status gStatus
+}
+
+const (
+	gDead gStatus = iota
+	gRunnable
+	gRunning
+	gWaiting
+
+	unordered = ^uint64(0)
+	garbage   = ^uint64(0) - 1
+	noseq     = ^uint64(0)
+	seqinc    = ^uint64(0) - 1
+)
+
+// order1007 merges a set of per-P event batches into a single, consistent stream.
+// The high level idea is as follows. Events within an individual batch are in
+// correct order, because they are emitted by a single P. So we need to produce
+// a correct interleaving of the batches. To do this we take first unmerged event
+// from each batch (frontier). Then choose subset that is "ready" to be merged,
+// that is, events for which all dependencies are already merged. Then we choose
+// event with the lowest timestamp from the subset, merge it and repeat.
+// This approach ensures that we form a consistent stream even if timestamps are
+// incorrect (condition observed on some machines).
+func order1007(m map[int][]*Event) (events []*Event, err error) { + pending := 0 + var batches []*eventBatch + for _, v := range m { + pending += len(v) + batches = append(batches, &eventBatch{v, false}) + } + gs := make(map[uint64]gState) + var frontier []orderEvent + for ; pending != 0; pending-- { + for i, b := range batches { + if b.selected || len(b.events) == 0 { + continue + } + ev := b.events[0] + g, init, next := stateTransition(ev) + if !transitionReady(g, gs[g], init) { + continue + } + frontier = append(frontier, orderEvent{ev, i, g, init, next}) + b.events = b.events[1:] + b.selected = true + // Get rid of "Local" events, they are intended merely for ordering. + switch ev.Type { + case EvGoStartLocal: + ev.Type = EvGoStart + case EvGoUnblockLocal: + ev.Type = EvGoUnblock + case EvGoSysExitLocal: + ev.Type = EvGoSysExit + } + } + if len(frontier) == 0 { + return nil, fmt.Errorf("no consistent ordering of events possible") + } + sort.Sort(orderEventList(frontier)) + f := frontier[0] + frontier[0] = frontier[len(frontier)-1] + frontier = frontier[:len(frontier)-1] + events = append(events, f.ev) + transition(gs, f.g, f.init, f.next) + if !batches[f.batch].selected { + panic("frontier batch is not selected") + } + batches[f.batch].selected = false + } + + // At this point we have a consistent stream of events. + // Make sure time stamps respect the ordering. + // The tests will skip (not fail) the test case if they see this error. + if !sort.IsSorted(eventList(events)) { + return nil, ErrTimeOrder + } + + // The last part is giving correct timestamps to EvGoSysExit events. + // The problem with EvGoSysExit is that actual syscall exit timestamp (ev.Args[2]) + // is potentially acquired long before event emission. So far we've used + // timestamp of event emission (ev.Ts). + // We could not set ev.Ts = ev.Args[2] earlier, because it would produce + // seemingly broken timestamps (misplaced event). + // We also can't simply update the timestamp and resort events, because + // if timestamps are broken we will misplace the event and later report + // logically broken trace (instead of reporting broken timestamps). + lastSysBlock := make(map[uint64]int64) + for _, ev := range events { + switch ev.Type { + case EvGoSysBlock, EvGoInSyscall: + lastSysBlock[ev.G] = ev.Ts + case EvGoSysExit: + ts := int64(ev.Args[2]) + if ts == 0 { + continue + } + block := lastSysBlock[ev.G] + if block == 0 { + return nil, fmt.Errorf("stray syscall exit") + } + if ts < block { + return nil, ErrTimeOrder + } + ev.Ts = ts + } + } + sort.Sort(eventList(events)) + + return +} + +// stateTransition returns goroutine state (sequence and status) when the event +// becomes ready for merging (init) and the goroutine state after the event (next). +func stateTransition(ev *Event) (g uint64, init, next gState) { + switch ev.Type { + case EvGoCreate: + g = ev.Args[0] + init = gState{0, gDead} + next = gState{1, gRunnable} + case EvGoWaiting, EvGoInSyscall: + g = ev.G + init = gState{1, gRunnable} + next = gState{2, gWaiting} + case EvGoStart: + g = ev.G + init = gState{ev.Args[1], gRunnable} + next = gState{ev.Args[1] + 1, gRunning} + case EvGoStartLocal: + // noseq means that this event is ready for merging as soon as + // frontier reaches it (EvGoStartLocal is emitted on the same P + // as the corresponding EvGoCreate/EvGoUnblock, and thus the latter + // is already merged). + // seqinc is a stub for cases when event increments g sequence, + // but since we don't know current seq we also don't know next seq. 
+ g = ev.G + init = gState{noseq, gRunnable} + next = gState{seqinc, gRunning} + case EvGoBlock, EvGoBlockSend, EvGoBlockRecv, EvGoBlockSelect, + EvGoBlockSync, EvGoBlockCond, EvGoBlockNet, EvGoSleep, EvGoSysBlock: + g = ev.G + init = gState{noseq, gRunning} + next = gState{noseq, gWaiting} + case EvGoSched, EvGoPreempt: + g = ev.G + init = gState{noseq, gRunning} + next = gState{noseq, gRunnable} + case EvGoUnblock, EvGoSysExit: + g = ev.Args[0] + init = gState{ev.Args[1], gWaiting} + next = gState{ev.Args[1] + 1, gRunnable} + case EvGoUnblockLocal, EvGoSysExitLocal: + g = ev.Args[0] + init = gState{noseq, gWaiting} + next = gState{seqinc, gRunnable} + case EvGCStart: + g = garbage + init = gState{ev.Args[0], gDead} + next = gState{ev.Args[0] + 1, gDead} + default: + // no ordering requirements + g = unordered + } + return +} + +func transitionReady(g uint64, curr, init gState) bool { + return g == unordered || (init.seq == noseq || init.seq == curr.seq) && init.status == curr.status +} + +func transition(gs map[uint64]gState, g uint64, init, next gState) { + if g == unordered { + return + } + curr := gs[g] + if !transitionReady(g, curr, init) { + panic("event sequences are broken") + } + switch next.seq { + case noseq: + next.seq = curr.seq + case seqinc: + next.seq = curr.seq + 1 + } + gs[g] = next +} + +// order1005 merges a set of per-P event batches into a single, consistent stream. +func order1005(m map[int][]*Event) (events []*Event, err error) { + for _, batch := range m { + events = append(events, batch...) + } + for _, ev := range events { + if ev.Type == EvGoSysExit { + // EvGoSysExit emission is delayed until the thread has a P. + // Give it the real sequence number and time stamp. + ev.seq = int64(ev.Args[1]) + if ev.Args[2] != 0 { + ev.Ts = int64(ev.Args[2]) + } + } + } + sort.Sort(eventSeqList(events)) + if !sort.IsSorted(eventList(events)) { + return nil, ErrTimeOrder + } + return +} + +type orderEventList []orderEvent + +func (l orderEventList) Len() int { + return len(l) +} + +func (l orderEventList) Less(i, j int) bool { + return l[i].ev.Ts < l[j].ev.Ts +} + +func (l orderEventList) Swap(i, j int) { + l[i], l[j] = l[j], l[i] +} + +type eventList []*Event + +func (l eventList) Len() int { + return len(l) +} + +func (l eventList) Less(i, j int) bool { + return l[i].Ts < l[j].Ts +} + +func (l eventList) Swap(i, j int) { + l[i], l[j] = l[j], l[i] +} + +type eventSeqList []*Event + +func (l eventSeqList) Len() int { + return len(l) +} + +func (l eventSeqList) Less(i, j int) bool { + return l[i].seq < l[j].seq +} + +func (l eventSeqList) Swap(i, j int) { + l[i], l[j] = l[j], l[i] +} diff --git a/src/internal/trace/parser.go b/src/internal/trace/parser.go index d279ddeacf..e6f29445c1 100644 --- a/src/internal/trace/parser.go +++ b/src/internal/trace/parser.go @@ -11,7 +11,6 @@ import ( "io" "os" "os/exec" - "sort" "strconv" "strings" ) @@ -135,7 +134,12 @@ func readTrace(r io.Reader) (ver int, events []rawEvent, strings map[uint64]stri } off += n typ := buf[0] << 2 >> 2 - narg := buf[0] >> 6 + narg := buf[0]>>6 + 1 + inlineArgs := byte(4) + if ver < 1007 { + narg++ + inlineArgs++ + } if typ == EvNone || typ >= EvCount || EventDescriptions[typ].minVersion > ver { err = fmt.Errorf("unknown event type %v at offset 0x%x", typ, off0) return @@ -180,8 +184,8 @@ func readTrace(r io.Reader) (ver int, events []rawEvent, strings map[uint64]stri continue } ev := rawEvent{typ: typ, off: off0} - if narg < 3 { - for i := 0; i < int(narg)+2; i++ { // sequence number and time stamp are 
present but not counted in narg + if narg < inlineArgs { + for i := 0; i < int(narg); i++ { var v uint64 v, off, err = readVal(r, off) if err != nil { @@ -191,7 +195,7 @@ func readTrace(r io.Reader) (ver int, events []rawEvent, strings map[uint64]stri ev.args = append(ev.args, v) } } else { - // If narg == 3, the first value is length of the event in bytes. + // More than inlineArgs args, the first value is length of the event in bytes. var v uint64 v, off, err = readVal(r, off) if err != nil { @@ -250,34 +254,30 @@ func parseEvents(ver int, rawEvents []rawEvent, strings map[uint64]string) (even var lastP int lastGs := make(map[int]uint64) // last goroutine running on P stacks = make(map[uint64][]*Frame) + batches := make(map[int][]*Event) // events by P for _, raw := range rawEvents { desc := EventDescriptions[raw.typ] if desc.Name == "" { err = fmt.Errorf("missing description for event type %v", raw.typ) return } - if raw.typ != EvStack { - narg := len(desc.Args) - if desc.Stack { - narg++ - } - if raw.typ != EvBatch && raw.typ != EvFrequency && raw.typ != EvTimerGoroutine { - narg++ // sequence number - narg++ // timestamp - } - if len(raw.args) != narg { - err = fmt.Errorf("%v has wrong number of arguments at offset 0x%x: want %v, got %v", - desc.Name, raw.off, narg, len(raw.args)) - return - } + narg := argNum(raw, ver) + if len(raw.args) != narg { + err = fmt.Errorf("%v has wrong number of arguments at offset 0x%x: want %v, got %v", + desc.Name, raw.off, narg, len(raw.args)) + return } switch raw.typ { case EvBatch: lastGs[lastP] = lastG lastP = int(raw.args[0]) lastG = lastGs[lastP] - lastSeq = int64(raw.args[1]) - lastTs = int64(raw.args[2]) + if ver < 1007 { + lastSeq = int64(raw.args[1]) + lastTs = int64(raw.args[2]) + } else { + lastTs = int64(raw.args[1]) + } case EvFrequency: ticksPerSec = int64(raw.args[0]) if ticksPerSec <= 0 { @@ -328,18 +328,26 @@ func parseEvents(ver int, rawEvents []rawEvent, strings map[uint64]string) (even } default: e := &Event{Off: raw.off, Type: raw.typ, P: lastP, G: lastG} - e.seq = lastSeq + int64(raw.args[0]) - e.Ts = lastTs + int64(raw.args[1]) - lastSeq = e.seq - lastTs = e.Ts - for i := range desc.Args { - e.Args[i] = raw.args[i+2] + var argOffset int + if ver < 1007 { + e.seq = lastSeq + int64(raw.args[0]) + e.Ts = lastTs + int64(raw.args[1]) + lastSeq = e.seq + argOffset = 2 + } else { + e.Ts = lastTs + int64(raw.args[0]) + argOffset = 1 } - if desc.Stack { - e.StkID = raw.args[len(desc.Args)+2] + lastTs = e.Ts + for i := argOffset; i < narg; i++ { + if i == narg-1 && desc.Stack { + e.StkID = raw.args[i] + } else { + e.Args[i-argOffset] = raw.args[i] + } } switch raw.typ { - case EvGoStart: + case EvGoStart, EvGoStartLocal: lastG = e.Args[0] e.G = lastG case EvGCStart, EvGCDone, EvGCScanStart, EvGCScanDone: @@ -349,28 +357,30 @@ func parseEvents(ver int, rawEvents []rawEvent, strings map[uint64]string) (even EvGoBlockSelect, EvGoBlockSync, EvGoBlockCond, EvGoBlockNet, EvGoSysBlock: lastG = 0 - case EvGoSysExit: - // EvGoSysExit emission is delayed until the thread has a P. - // Give it the real sequence number and time stamp. - e.seq = int64(e.Args[1]) - if e.Args[2] != 0 { - e.Ts = int64(e.Args[2]) - } + case EvGoSysExit, EvGoWaiting, EvGoInSyscall: + e.G = e.Args[0] } - events = append(events, e) + batches[lastP] = append(batches[lastP], e) } } - if len(events) == 0 { + if len(batches) == 0 { err = fmt.Errorf("trace is empty") return } - - // Sort by sequence number and translate cpu ticks to real time. 
- sort.Sort(eventList(events)) if ticksPerSec == 0 { err = fmt.Errorf("no EvFrequency event") return } + if ver < 1007 { + events, err = order1005(batches) + } else { + events, err = order1007(batches) + } + if err != nil { + return + } + + // Translate cpu ticks to real time. minTs := events[0].Ts // Use floating point to avoid integer overflows. freq := 1e9 / float64(ticksPerSec) @@ -382,7 +392,6 @@ func parseEvents(ver int, rawEvents []rawEvent, strings map[uint64]string) (even } if ev.Type == EvGoSysExit { ev.P = SyscallP - ev.G = ev.Args[0] } } @@ -543,19 +552,15 @@ func postProcessTrace(ver int, events []*Event) error { p.evSweep.Link = ev p.evSweep = nil case EvGoWaiting: - g1 := gs[ev.Args[0]] - if g1.state != gRunnable { - return fmt.Errorf("g %v is not runnable before EvGoWaiting (offset %v, time %v)", ev.Args[0], ev.Off, ev.Ts) + if g.state != gRunnable { + return fmt.Errorf("g %v is not runnable before EvGoWaiting (offset %v, time %v)", ev.G, ev.Off, ev.Ts) } - g1.state = gWaiting - gs[ev.Args[0]] = g1 + g.state = gWaiting case EvGoInSyscall: - g1 := gs[ev.Args[0]] - if g1.state != gRunnable { - return fmt.Errorf("g %v is not runnable before EvGoInSyscall (offset %v, time %v)", ev.Args[0], ev.Off, ev.Ts) + if g.state != gRunnable { + return fmt.Errorf("g %v is not runnable before EvGoInSyscall (offset %v, time %v)", ev.G, ev.Off, ev.Ts) } - g1.state = gWaiting - gs[ev.Args[0]] = g1 + g.state = gWaiting case EvGoCreate: if err := checkRunning(p, g, ev, true); err != nil { return err @@ -666,18 +671,6 @@ func postProcessTrace(ver int, events []*Event) error { // TODO(dvyukov): restore stacks for EvGoStart events. // TODO(dvyukov): test that all EvGoStart events has non-nil Link. - // Last, after all the other consistency checks, - // make sure time stamps respect sequence numbers. - // The tests will skip (not fail) the test case if they see this error, - // so check everything else that could possibly be wrong first. - lastTs := int64(0) - for _, ev := range events { - if ev.Ts < lastTs { - return ErrTimeOrder - } - lastTs = ev.Ts - } - return nil } @@ -773,30 +766,51 @@ func readVal(r io.Reader, off0 int) (v uint64, off int, err error) { return 0, 0, fmt.Errorf("bad value at offset 0x%x", off0) } -type eventList []*Event - -func (l eventList) Len() int { - return len(l) -} - -func (l eventList) Less(i, j int) bool { - return l[i].seq < l[j].seq +// Print dumps events to stdout. For debugging. +func Print(events []*Event) { + for _, ev := range events { + PrintEvent(ev) + } } -func (l eventList) Swap(i, j int) { - l[i], l[j] = l[j], l[i] +// PrintEvent dumps the event to stdout. For debugging. +func PrintEvent(ev *Event) { + desc := EventDescriptions[ev.Type] + fmt.Printf("%v %v p=%v g=%v off=%v", ev.Ts, desc.Name, ev.P, ev.G, ev.Off) + for i, a := range desc.Args { + fmt.Printf(" %v=%v", a, ev.Args[i]) + } + fmt.Printf("\n") } -// Print dumps events to stdout. For debugging. -func Print(events []*Event) { - for _, ev := range events { - desc := EventDescriptions[ev.Type] - fmt.Printf("%v %v p=%v g=%v off=%v", ev.Ts, desc.Name, ev.P, ev.G, ev.Off) - for i, a := range desc.Args { - fmt.Printf(" %v=%v", a, ev.Args[i]) +// argNum returns total number of args for the event accounting for timestamps, +// sequence numbers and differences between trace format versions. 
+func argNum(raw rawEvent, ver int) int { + desc := EventDescriptions[raw.typ] + if raw.typ == EvStack { + return len(raw.args) + } + narg := len(desc.Args) + if desc.Stack { + narg++ + } + switch raw.typ { + case EvBatch, EvFrequency, EvTimerGoroutine: + if ver < 1007 { + narg++ // there was an unused arg before 1.7 + } + case EvGCStart, EvGoStart, EvGoUnblock: + if ver < 1007 { + narg-- // 1.7 added an additional seq arg + } + fallthrough + default: + narg++ // timestamp + if ver < 1007 { + narg++ // sequence } - fmt.Printf("\n") } + return narg } // Event types in the trace. @@ -809,21 +823,21 @@ const ( EvGomaxprocs = 4 // current value of GOMAXPROCS [timestamp, GOMAXPROCS, stack id] EvProcStart = 5 // start of P [timestamp, thread id] EvProcStop = 6 // stop of P [timestamp] - EvGCStart = 7 // GC start [timestamp, stack id] + EvGCStart = 7 // GC start [timestamp, seq, stack id] EvGCDone = 8 // GC done [timestamp] EvGCScanStart = 9 // GC scan start [timestamp] EvGCScanDone = 10 // GC scan done [timestamp] EvGCSweepStart = 11 // GC sweep start [timestamp, stack id] EvGCSweepDone = 12 // GC sweep done [timestamp] EvGoCreate = 13 // goroutine creation [timestamp, new goroutine id, new stack id, stack id] - EvGoStart = 14 // goroutine starts running [timestamp, goroutine id] + EvGoStart = 14 // goroutine starts running [timestamp, goroutine id, seq] EvGoEnd = 15 // goroutine ends [timestamp] EvGoStop = 16 // goroutine stops (like in select{}) [timestamp, stack] EvGoSched = 17 // goroutine calls Gosched [timestamp, stack] EvGoPreempt = 18 // goroutine is preempted [timestamp, stack] EvGoSleep = 19 // goroutine calls Sleep [timestamp, stack] EvGoBlock = 20 // goroutine blocks [timestamp, stack] - EvGoUnblock = 21 // goroutine is unblocked [timestamp, goroutine id, stack] + EvGoUnblock = 21 // goroutine is unblocked [timestamp, goroutine id, seq, stack] EvGoBlockSend = 22 // goroutine blocks on chan send [timestamp, stack] EvGoBlockRecv = 23 // goroutine blocks on chan recv [timestamp, stack] EvGoBlockSelect = 24 // goroutine blocks on select [timestamp, stack] @@ -831,7 +845,7 @@ const ( EvGoBlockCond = 26 // goroutine blocks on Cond [timestamp, stack] EvGoBlockNet = 27 // goroutine blocks on network [timestamp, stack] EvGoSysCall = 28 // syscall enter [timestamp, stack] - EvGoSysExit = 29 // syscall exit [timestamp, goroutine id, real timestamp] + EvGoSysExit = 29 // syscall exit [timestamp, goroutine id, seq, real timestamp] EvGoSysBlock = 30 // syscall blocks [timestamp] EvGoWaiting = 31 // denotes that goroutine is blocked when tracing starts [timestamp, goroutine id] EvGoInSyscall = 32 // denotes that goroutine is in syscall when tracing starts [timestamp, goroutine id] @@ -840,7 +854,10 @@ const ( EvTimerGoroutine = 35 // denotes timer goroutine [timer goroutine id] EvFutileWakeup = 36 // denotes that the previous wakeup of this goroutine was futile [timestamp] EvString = 37 // string dictionary entry [ID, length, string] - EvCount = 38 + EvGoStartLocal = 38 // goroutine starts running on the same P as the last event [timestamp, goroutine id] + EvGoUnblockLocal = 39 // goroutine is unblocked on the same P as the last event [timestamp, goroutine id, stack] + EvGoSysExitLocal = 40 // syscall exit on the same P as the last event [timestamp, goroutine id, real timestamp] + EvCount = 41 ) var EventDescriptions = [EvCount]struct { @@ -850,27 +867,27 @@ var EventDescriptions = [EvCount]struct { Args []string }{ EvNone: {"None", 1005, false, []string{}}, - EvBatch: {"Batch", 1005, false, 
[]string{"p", "seq", "ticks"}}, - EvFrequency: {"Frequency", 1005, false, []string{"freq", "unused"}}, + EvBatch: {"Batch", 1005, false, []string{"p", "ticks"}}, // in 1.5 format it was {"p", "seq", "ticks"} + EvFrequency: {"Frequency", 1005, false, []string{"freq"}}, // in 1.5 format it was {"freq", "unused"} EvStack: {"Stack", 1005, false, []string{"id", "siz"}}, EvGomaxprocs: {"Gomaxprocs", 1005, true, []string{"procs"}}, EvProcStart: {"ProcStart", 1005, false, []string{"thread"}}, EvProcStop: {"ProcStop", 1005, false, []string{}}, - EvGCStart: {"GCStart", 1005, true, []string{}}, + EvGCStart: {"GCStart", 1005, true, []string{"seq"}}, // in 1.5 format it was {} EvGCDone: {"GCDone", 1005, false, []string{}}, EvGCScanStart: {"GCScanStart", 1005, false, []string{}}, EvGCScanDone: {"GCScanDone", 1005, false, []string{}}, EvGCSweepStart: {"GCSweepStart", 1005, true, []string{}}, EvGCSweepDone: {"GCSweepDone", 1005, false, []string{}}, EvGoCreate: {"GoCreate", 1005, true, []string{"g", "stack"}}, - EvGoStart: {"GoStart", 1005, false, []string{"g"}}, + EvGoStart: {"GoStart", 1005, false, []string{"g", "seq"}}, // in 1.5 format it was {"g"} EvGoEnd: {"GoEnd", 1005, false, []string{}}, EvGoStop: {"GoStop", 1005, true, []string{}}, EvGoSched: {"GoSched", 1005, true, []string{}}, EvGoPreempt: {"GoPreempt", 1005, true, []string{}}, EvGoSleep: {"GoSleep", 1005, true, []string{}}, EvGoBlock: {"GoBlock", 1005, true, []string{}}, - EvGoUnblock: {"GoUnblock", 1005, true, []string{"g"}}, + EvGoUnblock: {"GoUnblock", 1005, true, []string{"g", "seq"}}, // in 1.5 format it was {"g"} EvGoBlockSend: {"GoBlockSend", 1005, true, []string{}}, EvGoBlockRecv: {"GoBlockRecv", 1005, true, []string{}}, EvGoBlockSelect: {"GoBlockSelect", 1005, true, []string{}}, @@ -884,7 +901,10 @@ var EventDescriptions = [EvCount]struct { EvGoInSyscall: {"GoInSyscall", 1005, false, []string{"g"}}, EvHeapAlloc: {"HeapAlloc", 1005, false, []string{"mem"}}, EvNextGC: {"NextGC", 1005, false, []string{"mem"}}, - EvTimerGoroutine: {"TimerGoroutine", 1005, false, []string{"g", "unused"}}, + EvTimerGoroutine: {"TimerGoroutine", 1005, false, []string{"g"}}, // in 1.5 format it was {"g", "unused"} EvFutileWakeup: {"FutileWakeup", 1005, false, []string{}}, EvString: {"String", 1007, false, []string{}}, + EvGoStartLocal: {"GoStartLocal", 1007, false, []string{"g"}}, + EvGoUnblockLocal: {"GoUnblockLocal", 1007, true, []string{"g"}}, + EvGoSysExitLocal: {"GoSysExitLocal", 1007, false, []string{"g", "ts"}}, } diff --git a/src/internal/trace/parser_test.go b/src/internal/trace/parser_test.go index 337d5a85d7..340f106484 100644 --- a/src/internal/trace/parser_test.go +++ b/src/internal/trace/parser_test.go @@ -89,10 +89,10 @@ func TestParseVersion(t *testing.T) { func TestTimestampOverflow(t *testing.T) { // Test that parser correctly handles large timestamps (long tracing). 
w := newWriter() - w.emit(EvBatch, 0, 0, 0) - w.emit(EvFrequency, 1e9, 0) + w.emit(EvBatch, 0, 0) + w.emit(EvFrequency, 1e9) for ts := uint64(1); ts < 1e16; ts *= 2 { - w.emit(EvGoCreate, 1, ts, ts, 1, 0) + w.emit(EvGoCreate, ts, ts, 0, 0) } if _, err := Parse(w, ""); err != nil { t.Fatalf("failed to parse: %v", err) @@ -110,7 +110,7 @@ func newWriter() *writer { } func (w *writer) emit(typ byte, args ...uint64) { - nargs := byte(len(args)) - 2 + nargs := byte(len(args)) - 1 if nargs > 3 { nargs = 3 } diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 98a986cd63..a847823da4 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -1796,23 +1796,7 @@ func execute(gp *g, inheritTime bool) { // GoSysExit has to happen when we have a P, but before GoStart. // So we emit it here. if gp.syscallsp != 0 && gp.sysblocktraced { - // Since gp.sysblocktraced is true, we must emit an event. - // There is a race between the code that initializes sysexitseq - // and sysexitticks (in exitsyscall, which runs without a P, - // and therefore is not stopped with the rest of the world) - // and the code that initializes a new trace. - // The recorded sysexitseq and sysexitticks must therefore - // be treated as "best effort". If they are valid for this trace, - // then great, use them for greater accuracy. - // But if they're not valid for this trace, assume that the - // trace was started after the actual syscall exit (but before - // we actually managed to start the goroutine, aka right now), - // and assign a fresh time stamp to keep the log consistent. - seq, ts := gp.sysexitseq, gp.sysexitticks - if seq == 0 || int64(seq)-int64(trace.seqStart) < 0 { - seq, ts = tracestamp() - } - traceGoSysExit(seq, ts) + traceGoSysExit(gp.sysexitticks) } traceGoStart() } @@ -2481,7 +2465,6 @@ func exitsyscall(dummy int32) { } _g_.sysexitticks = 0 - _g_.sysexitseq = 0 if trace.enabled { // Wait till traceGoSysBlock event is emitted. // This ensures consistency of the trace (the goroutine is started after it is blocked). @@ -2492,7 +2475,7 @@ func exitsyscall(dummy int32) { // Tracing code can invoke write barriers that cannot run without a P. // So instead we remember the syscall exit time and emit the event // in execute when we have a P. - _g_.sysexitseq, _g_.sysexitticks = tracestamp() + _g_.sysexitticks = cputicks() } _g_.m.locks-- @@ -2540,7 +2523,7 @@ func exitsyscallfast() bool { // Denote blocking of the new syscall. traceGoSysBlock(_g_.m.p.ptr()) // Denote completion of the current syscall. 
- traceGoSysExit(tracestamp()) + traceGoSysExit(0) }) } _g_.m.p.ptr().syscalltick++ @@ -2564,7 +2547,7 @@ func exitsyscallfast() bool { osyield() } } - traceGoSysExit(tracestamp()) + traceGoSysExit(0) } }) if ok { diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 0fdea400de..8cfe6b06e6 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -332,16 +332,17 @@ type g struct { waitsince int64 // approx time when the g become blocked waitreason string // if status==Gwaiting schedlink guintptr - preempt bool // preemption signal, duplicates stackguard0 = stackpreempt - paniconfault bool // panic (instead of crash) on unexpected fault address - preemptscan bool // preempted g does scan for gc - gcscandone bool // g has scanned stack; protected by _Gscan bit in status - gcscanvalid bool // false at start of gc cycle, true if G has not run since last scan - throwsplit bool // must not split stack - raceignore int8 // ignore race detection events - sysblocktraced bool // StartTrace has emitted EvGoInSyscall about this goroutine - sysexitticks int64 // cputicks when syscall has returned (for tracing) - sysexitseq uint64 // trace seq when syscall has returned (for tracing) + preempt bool // preemption signal, duplicates stackguard0 = stackpreempt + paniconfault bool // panic (instead of crash) on unexpected fault address + preemptscan bool // preempted g does scan for gc + gcscandone bool // g has scanned stack; protected by _Gscan bit in status + gcscanvalid bool // false at start of gc cycle, true if G has not run since last scan + throwsplit bool // must not split stack + raceignore int8 // ignore race detection events + sysblocktraced bool // StartTrace has emitted EvGoInSyscall about this goroutine + sysexitticks int64 // cputicks when syscall has returned (for tracing) + traceseq uint64 // trace event sequencer + tracelastp puintptr // last P emitted an event for this goroutine lockedm *m sig uint32 writebuf []byte diff --git a/src/runtime/trace.go b/src/runtime/trace.go index 06fbdfac94..092f941f0c 100644 --- a/src/runtime/trace.go +++ b/src/runtime/trace.go @@ -13,7 +13,6 @@ package runtime import ( - "runtime/internal/atomic" "runtime/internal/sys" "unsafe" ) @@ -27,21 +26,21 @@ const ( traceEvGomaxprocs = 4 // current value of GOMAXPROCS [timestamp, GOMAXPROCS, stack id] traceEvProcStart = 5 // start of P [timestamp, thread id] traceEvProcStop = 6 // stop of P [timestamp] - traceEvGCStart = 7 // GC start [timestamp, stack id] + traceEvGCStart = 7 // GC start [timestamp, seq, stack id] traceEvGCDone = 8 // GC done [timestamp] traceEvGCScanStart = 9 // GC scan start [timestamp] traceEvGCScanDone = 10 // GC scan done [timestamp] traceEvGCSweepStart = 11 // GC sweep start [timestamp, stack id] traceEvGCSweepDone = 12 // GC sweep done [timestamp] traceEvGoCreate = 13 // goroutine creation [timestamp, new goroutine id, new stack id, stack id] - traceEvGoStart = 14 // goroutine starts running [timestamp, goroutine id] + traceEvGoStart = 14 // goroutine starts running [timestamp, goroutine id, seq] traceEvGoEnd = 15 // goroutine ends [timestamp] traceEvGoStop = 16 // goroutine stops (like in select{}) [timestamp, stack] traceEvGoSched = 17 // goroutine calls Gosched [timestamp, stack] traceEvGoPreempt = 18 // goroutine is preempted [timestamp, stack] traceEvGoSleep = 19 // goroutine calls Sleep [timestamp, stack] traceEvGoBlock = 20 // goroutine blocks [timestamp, stack] - traceEvGoUnblock = 21 // goroutine is unblocked [timestamp, goroutine id, stack] + 
traceEvGoUnblock = 21 // goroutine is unblocked [timestamp, goroutine id, seq, stack] traceEvGoBlockSend = 22 // goroutine blocks on chan send [timestamp, stack] traceEvGoBlockRecv = 23 // goroutine blocks on chan recv [timestamp, stack] traceEvGoBlockSelect = 24 // goroutine blocks on select [timestamp, stack] @@ -49,7 +48,7 @@ const ( traceEvGoBlockCond = 26 // goroutine blocks on Cond [timestamp, stack] traceEvGoBlockNet = 27 // goroutine blocks on network [timestamp, stack] traceEvGoSysCall = 28 // syscall enter [timestamp, stack] - traceEvGoSysExit = 29 // syscall exit [timestamp, goroutine id, real timestamp] + traceEvGoSysExit = 29 // syscall exit [timestamp, goroutine id, seq, real timestamp] traceEvGoSysBlock = 30 // syscall blocks [timestamp] traceEvGoWaiting = 31 // denotes that goroutine is blocked when tracing starts [timestamp, goroutine id] traceEvGoInSyscall = 32 // denotes that goroutine is in syscall when tracing starts [timestamp, goroutine id] @@ -58,7 +57,10 @@ const ( traceEvTimerGoroutine = 35 // denotes timer goroutine [timer goroutine id] traceEvFutileWakeup = 36 // denotes that the previous wakeup of this goroutine was futile [timestamp] traceEvString = 37 // string dictionary entry [ID, length, string] - traceEvCount = 38 + traceEvGoStartLocal = 38 // goroutine starts running on the same P as the last event [timestamp, goroutine id] + traceEvGoUnblockLocal = 39 // goroutine is unblocked on the same P as the last event [timestamp, goroutine id, stack] + traceEvGoSysExitLocal = 40 // syscall exit on the same P as the last event [timestamp, goroutine id, real timestamp] + traceEvCount = 41 ) const ( @@ -105,6 +107,7 @@ var trace struct { ticksEnd int64 // cputicks when tracing was stopped timeStart int64 // nanotime when tracing was started timeEnd int64 // nanotime when tracing was stopped + seqGC uint64 // GC start/done sequencer reading traceBufPtr // buffer currently handed off to user empty traceBufPtr // stack of empty buffers fullHead traceBufPtr // queue of full buffers @@ -122,31 +125,9 @@ var trace struct { buf traceBufPtr // global trace buffer, used when running without a p } -var traceseq uint64 // global trace sequence number - -// tracestamp returns a consistent sequence number, time stamp pair -// for use in a trace. We need to make sure that time stamp ordering -// (assuming synchronized CPUs) and sequence ordering match. -// To do that, we increment traceseq, grab ticks, and increment traceseq again. -// We treat odd traceseq as a sign that another thread is in the middle -// of the sequence and spin until it is done. -// Not splitting stack to avoid preemption, just in case the call sites -// that used to call xadd64 and cputicks are sensitive to that. -//go:nosplit -func tracestamp() (seq uint64, ts int64) { - seq = atomic.Load64(&traceseq) - for seq&1 != 0 || !atomic.Cas64(&traceseq, seq, seq+1) { - seq = atomic.Load64(&traceseq) - } - ts = cputicks() - atomic.Store64(&traceseq, seq+2) - return seq >> 1, ts -} - // traceBufHeader is per-P tracing buffer. 
type traceBufHeader struct { link traceBufPtr // in trace.empty/full - lastSeq uint64 // sequence number of last event lastTicks uint64 // when we wrote the last event pos int // next write offset in arr stk [traceStackSize]uintptr // scratch buffer for traceback @@ -194,13 +175,6 @@ func StartTrace() error { return errorString("tracing is already enabled") } - trace.seqStart, trace.ticksStart = tracestamp() - trace.timeStart = nanotime() - trace.headerWritten = false - trace.footerWritten = false - trace.strings = make(map[string]uint64) - trace.stringSeq = 0 - // Can't set trace.enabled yet. While the world is stopped, exitsyscall could // already emit a delayed event (see exitTicks in exitsyscall) if we set trace.enabled here. // That would lead to an inconsistent trace: @@ -213,12 +187,15 @@ func StartTrace() error { for _, gp := range allgs { status := readgstatus(gp) if status != _Gdead { - traceGoCreate(gp, gp.startpc) + traceGoCreate(gp, gp.startpc) // also resets gp.traceseq/tracelastp } if status == _Gwaiting { + // traceEvGoWaiting is implied to have seq=1. + gp.traceseq++ traceEvent(traceEvGoWaiting, -1, uint64(gp.goid)) } if status == _Gsyscall { + gp.traceseq++ traceEvent(traceEvGoInSyscall, -1, uint64(gp.goid)) } else { gp.sysblocktraced = false @@ -226,6 +203,17 @@ func StartTrace() error { } traceProcStart() traceGoStart() + // Note: ticksStart needs to be set after we emit traceEvGoInSyscall events. + // If we do it the other way around, it is possible that exitsyscall will + // query sysexitticks after ticksStart but before traceEvGoInSyscall timestamp. + // It will lead to a false conclusion that cputicks is broken. + trace.ticksStart = cputicks() + trace.timeStart = nanotime() + trace.headerWritten = false + trace.footerWritten = false + trace.strings = make(map[string]uint64) + trace.stringSeq = 0 + trace.seqGC = 0 _g_.m.startingtrace = false trace.enabled = true @@ -382,11 +370,9 @@ func ReadTrace() []byte { var data []byte data = append(data, traceEvFrequency|0<= 0 { @@ -525,7 +506,6 @@ func traceEvent(ev byte, skip int, args ...uint64) { buf.varint(0) lenp = &buf.arr[buf.pos-1] } - buf.varint(seqDiff) buf.varint(tickDiff) for _, a := range args { buf.varint(a) @@ -892,7 +872,8 @@ func traceProcStop(pp *p) { } func traceGCStart() { - traceEvent(traceEvGCStart, 3) + traceEvent(traceEvGCStart, 3, trace.seqGC) + trace.seqGC++ } func traceGCDone() { @@ -916,13 +897,23 @@ func traceGCSweepDone() { } func traceGoCreate(newg *g, pc uintptr) { + newg.traceseq = 0 + newg.tracelastp = getg().m.p // +PCQuantum because traceFrameForPC expects return PCs and subtracts PCQuantum. 
id := trace.stackTab.put([]uintptr{pc + sys.PCQuantum}) traceEvent(traceEvGoCreate, 2, uint64(newg.goid), uint64(id)) } func traceGoStart() { - traceEvent(traceEvGoStart, -1, uint64(getg().m.curg.goid)) + _g_ := getg().m.curg + _p_ := _g_.m.p + _g_.traceseq++ + if _g_.tracelastp == _p_ { + traceEvent(traceEvGoStartLocal, -1, uint64(_g_.goid)) + } else { + _g_.tracelastp = _p_ + traceEvent(traceEvGoStart, -1, uint64(_g_.goid), _g_.traceseq) + } } func traceGoEnd() { @@ -930,10 +921,14 @@ func traceGoEnd() { } func traceGoSched() { + _g_ := getg() + _g_.tracelastp = _g_.m.p traceEvent(traceEvGoSched, 1) } func traceGoPreempt() { + _g_ := getg() + _g_.tracelastp = _g_.m.p traceEvent(traceEvGoPreempt, 1) } @@ -945,19 +940,37 @@ func traceGoPark(traceEv byte, skip int, gp *g) { } func traceGoUnpark(gp *g, skip int) { - traceEvent(traceEvGoUnblock, skip, uint64(gp.goid)) + _p_ := getg().m.p + gp.traceseq++ + if gp.tracelastp == _p_ { + traceEvent(traceEvGoUnblockLocal, skip, uint64(gp.goid)) + } else { + gp.tracelastp = _p_ + traceEvent(traceEvGoUnblock, skip, uint64(gp.goid), gp.traceseq) + } } func traceGoSysCall() { traceEvent(traceEvGoSysCall, 1) } -func traceGoSysExit(seq uint64, ts int64) { - if int64(seq)-int64(trace.seqStart) < 0 { - // The timestamp was obtained during a previous tracing session, ignore. - return - } - traceEvent(traceEvGoSysExit, -1, uint64(getg().m.curg.goid), seq, uint64(ts)/traceTickDiv) +func traceGoSysExit(ts int64) { + if ts != 0 && ts < trace.ticksStart { + // There is a race between the code that initializes sysexitticks + // (in exitsyscall, which runs without a P, and therefore is not + // stopped with the rest of the world) and the code that initializes + // a new trace. The recorded sysexitticks must therefore be treated + // as "best effort". If they are valid for this trace, then great, + // use them for greater accuracy. But if they're not valid for this + // trace, assume that the trace was started after the actual syscall + // exit (but before we actually managed to start the goroutine, + // aka right now), and assign a fresh time stamp to keep the log consistent. + ts = 0 + } + _g_ := getg().m.curg + _g_.traceseq++ + _g_.tracelastp = _g_.m.p + traceEvent(traceEvGoSysExit, -1, uint64(_g_.goid), _g_.traceseq, uint64(ts)/traceTickDiv) } func traceGoSysBlock(pp *p) { -- cgit v1.3 From 1a2cf91f5e9e3dfb0873e61ed6907cc365857f6c Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Fri, 11 Mar 2016 16:27:51 -0500 Subject: runtime: split gfree list into with-stacks and without-stacks Currently all free Gs are added to one list. Split this into two lists: one for free Gs with cached stacks and one for Gs without cached stacks. This lets us preferentially allocate Gs that already have a stack, but more importantly, it sets us up to free cached G stacks concurrently. 
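A toy, self-contained version of the two-list idea (hypothetical node/pool types standing in for g and schedt; not part of this patch): allocation prefers objects that still hold a buffer, mirroring how gfget below prefers Gs with cached stacks.

	package main

	import "fmt"

	// node stands in for a g; buf stands in for its cached stack.
	type node struct {
		buf  []byte
		next *node
	}

	// pool stands in for the split sched.gfreeStack/gfreeNoStack lists.
	type pool struct {
		withBuf *node // free nodes that kept their buffer
		noBuf   *node // free nodes whose buffer was released
	}

	func (p *pool) put(n *node) {
		if n.buf == nil {
			n.next = p.noBuf
			p.noBuf = n
		} else {
			n.next = p.withBuf
			p.withBuf = n
		}
	}

	// get prefers nodes that already have a buffer, saving an allocation.
	func (p *pool) get() *node {
		if n := p.withBuf; n != nil {
			p.withBuf = n.next
			return n
		}
		if n := p.noBuf; n != nil {
			p.noBuf = n.next
			return n
		}
		return &node{}
	}

	func main() {
		p := &pool{}
		p.put(&node{buf: make([]byte, 8)})
		p.put(&node{})
		fmt.Println(p.get().buf != nil) // true: the buffered node is handed out first
	}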
Change-Id: Idbe486f708997e1c9d166662995283f02d1eeb3c Reviewed-on: https://go-review.googlesource.com/20664 Reviewed-by: Rick Hudson Run-TryBot: Austin Clements TryBot-Result: Gobot Gobot --- src/runtime/proc.go | 34 ++++++++++++++++++++++++++-------- src/runtime/runtime2.go | 7 ++++--- 2 files changed, 30 insertions(+), 11 deletions(-) (limited to 'src/runtime/runtime2.go') diff --git a/src/runtime/proc.go b/src/runtime/proc.go index a847823da4..9c840882b6 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -2798,8 +2798,13 @@ func gfput(_p_ *p, gp *g) { _p_.gfreecnt-- gp = _p_.gfree _p_.gfree = gp.schedlink.ptr() - gp.schedlink.set(sched.gfree) - sched.gfree = gp + if gp.stack.lo == 0 { + gp.schedlink.set(sched.gfreeNoStack) + sched.gfreeNoStack = gp + } else { + gp.schedlink.set(sched.gfreeStack) + sched.gfreeStack = gp + } sched.ngfree++ } unlock(&sched.gflock) @@ -2811,12 +2816,20 @@ func gfput(_p_ *p, gp *g) { func gfget(_p_ *p) *g { retry: gp := _p_.gfree - if gp == nil && sched.gfree != nil { + if gp == nil && (sched.gfreeStack != nil || sched.gfreeNoStack != nil) { lock(&sched.gflock) - for _p_.gfreecnt < 32 && sched.gfree != nil { + for _p_.gfreecnt < 32 { + if sched.gfreeStack != nil { + // Prefer Gs with stacks. + gp = sched.gfreeStack + sched.gfreeStack = gp.schedlink.ptr() + } else if sched.gfreeNoStack != nil { + gp = sched.gfreeNoStack + sched.gfreeNoStack = gp.schedlink.ptr() + } else { + break + } _p_.gfreecnt++ - gp = sched.gfree - sched.gfree = gp.schedlink.ptr() sched.ngfree-- gp.schedlink.set(_p_.gfree) _p_.gfree = gp @@ -2853,8 +2866,13 @@ func gfpurge(_p_ *p) { _p_.gfreecnt-- gp := _p_.gfree _p_.gfree = gp.schedlink.ptr() - gp.schedlink.set(sched.gfree) - sched.gfree = gp + if gp.stack.lo == 0 { + gp.schedlink.set(sched.gfreeNoStack) + sched.gfreeNoStack = gp + } else { + gp.schedlink.set(sched.gfreeStack) + sched.gfreeStack = gp + } sched.ngfree++ } unlock(&sched.gflock) diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 8cfe6b06e6..0a988ce469 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -523,9 +523,10 @@ type schedt struct { runqsize int32 // Global cache of dead G's. - gflock mutex - gfree *g - ngfree int32 + gflock mutex + gfreeStack *g + gfreeNoStack *g + ngfree int32 // Central cache of sudog structs. sudoglock mutex -- cgit v1.3 From 2a889b9d931e58166350f785b16edc51e28ef19b Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Fri, 4 Mar 2016 11:58:26 -0500 Subject: runtime: make stack re-scan O(# dirty stacks) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the stack re-scan during mark termination is O(# stacks) because we enqueue a root marking job for every goroutine. It takes ~34ns to process this root marking job for a valid (clean) stack, so at around 300k goroutines we exceed the 10ms pause goal. A non-trivial portion of this time is spent simply taking the cache miss to check the gcscanvalid flag, so simply optimizing the path that handles clean stacks can only improve this so much. Fix this by keeping an explicit list of goroutines with dirty stacks that need to be rescanned. When a goroutine first transitions to running after a stack scan and marks its stack dirty, it adds itself to this list. We enqueue root marking jobs only for the goroutines in this list, so this improves stack re-scanning asymptotically by completely eliminating time spent on clean goroutines. This reduces mark termination time for 500k idle goroutines from 15ms to 238µs. 
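The list bookkeeping this requires is O(1) in both directions. A minimal, runnable sketch of the index-back-pointer technique (illustrative names, not part of the change; in the patch the index lives in g.gcRescan and the list in work.rescan.list): removal swaps the last element into the vacated slot and updates that element's stored index.

	package main

	import "fmt"

	// item carries its own index into list.items; -1 means "not queued".
	type item struct {
		name string
		idx  int
	}

	type list struct{ items []*item }

	func (l *list) add(it *item) {
		it.idx = len(l.items)
		l.items = append(l.items, it)
	}

	// remove deletes it in O(1) by swapping the last element into its slot.
	func (l *list) remove(it *item) {
		if it.idx == -1 {
			return // not on the list
		}
		last := l.items[len(l.items)-1]
		l.items[it.idx] = last
		last.idx = it.idx
		it.idx = -1
		l.items = l.items[:len(l.items)-1]
	}

	func main() {
		l := &list{}
		a, b := &item{"a", -1}, &item{"b", -1}
		l.add(a)
		l.add(b)
		l.remove(a)
		fmt.Println(l.items[0].name, b.idx) // "b 0": b was swapped into a's slot
	}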
Overall performance effect is negligible. name \ 95%ile-time/markTerm old new delta IdleGs/gs:500000/gomaxprocs:12 15000µs ± 0% 238µs ± 5% -98.41% (p=0.000 n=10+10) name old time/op new time/op delta XBenchGarbage-12 2.30ms ± 3% 2.29ms ± 1% -0.43% (p=0.049 n=17+18) name old time/op new time/op delta BinaryTree17-12 2.57s ± 3% 2.59s ± 2% ~ (p=0.141 n=19+20) Fannkuch11-12 2.09s ± 0% 2.10s ± 1% +0.53% (p=0.000 n=19+19) FmtFprintfEmpty-12 45.3ns ± 3% 45.2ns ± 2% ~ (p=0.845 n=20+20) FmtFprintfString-12 129ns ± 0% 127ns ± 0% -1.55% (p=0.000 n=16+16) FmtFprintfInt-12 123ns ± 0% 119ns ± 1% -3.24% (p=0.000 n=19+19) FmtFprintfIntInt-12 195ns ± 1% 189ns ± 1% -3.11% (p=0.000 n=17+17) FmtFprintfPrefixedInt-12 193ns ± 1% 187ns ± 1% -3.06% (p=0.000 n=19+19) FmtFprintfFloat-12 254ns ± 0% 255ns ± 1% +0.35% (p=0.001 n=14+17) FmtManyArgs-12 781ns ± 0% 770ns ± 0% -1.48% (p=0.000 n=16+19) GobDecode-12 7.00ms ± 1% 6.98ms ± 1% ~ (p=0.563 n=19+19) GobEncode-12 5.91ms ± 1% 5.92ms ± 0% ~ (p=0.118 n=19+18) Gzip-12 219ms ± 1% 215ms ± 1% -1.81% (p=0.000 n=18+18) Gunzip-12 37.2ms ± 0% 37.4ms ± 0% +0.45% (p=0.000 n=17+19) HTTPClientServer-12 76.9µs ± 3% 77.5µs ± 2% +0.81% (p=0.030 n=20+19) JSONEncode-12 15.0ms ± 0% 14.8ms ± 1% -0.88% (p=0.001 n=15+19) JSONDecode-12 50.6ms ± 0% 53.2ms ± 2% +5.07% (p=0.000 n=17+19) Mandelbrot200-12 4.05ms ± 0% 4.05ms ± 1% ~ (p=0.581 n=16+17) GoParse-12 3.34ms ± 1% 3.30ms ± 1% -1.21% (p=0.000 n=15+20) RegexpMatchEasy0_32-12 69.6ns ± 1% 69.8ns ± 2% ~ (p=0.566 n=19+19) RegexpMatchEasy0_1K-12 238ns ± 1% 236ns ± 0% -0.91% (p=0.000 n=17+13) RegexpMatchEasy1_32-12 69.8ns ± 1% 70.0ns ± 1% +0.23% (p=0.026 n=17+16) RegexpMatchEasy1_1K-12 371ns ± 1% 363ns ± 1% -2.07% (p=0.000 n=19+19) RegexpMatchMedium_32-12 107ns ± 2% 106ns ± 1% -0.51% (p=0.031 n=18+20) RegexpMatchMedium_1K-12 33.0µs ± 0% 32.9µs ± 0% -0.30% (p=0.004 n=16+16) RegexpMatchHard_32-12 1.70µs ± 0% 1.70µs ± 0% +0.45% (p=0.000 n=16+17) RegexpMatchHard_1K-12 51.1µs ± 2% 51.4µs ± 1% +0.53% (p=0.000 n=17+19) Revcomp-12 378ms ± 1% 385ms ± 1% +1.92% (p=0.000 n=19+18) Template-12 64.3ms ± 2% 65.0ms ± 2% +1.09% (p=0.001 n=19+19) TimeParse-12 315ns ± 1% 317ns ± 2% ~ (p=0.108 n=18+20) TimeFormat-12 360ns ± 1% 337ns ± 0% -6.30% (p=0.000 n=18+13) [Geo mean] 51.8µs 51.6µs -0.48% Change-Id: Icf8994671476840e3998236e15407a505d4c760c Reviewed-on: https://go-review.googlesource.com/20700 Reviewed-by: Rick Hudson Run-TryBot: Austin Clements TryBot-Result: Gobot Gobot --- src/runtime/mgc.go | 18 ++++++- src/runtime/mgcmark.go | 133 +++++++++++++++++++++++++++++++++++++++++------- src/runtime/proc.go | 33 +++++++++++- src/runtime/runtime2.go | 11 +++- 4 files changed, 171 insertions(+), 24 deletions(-) (limited to 'src/runtime/runtime2.go') diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go index 425ed3a160..328ff4cd88 100644 --- a/src/runtime/mgc.go +++ b/src/runtime/mgc.go @@ -762,7 +762,7 @@ var work struct { alldone note // Number of roots of various root types. Set by gcMarkRootPrepare. - nDataRoots, nBSSRoots, nSpanRoots, nStackRoots int + nDataRoots, nBSSRoots, nSpanRoots, nStackRoots, nRescanRoots int // markrootDone indicates that roots have been marked at least // once during the current GC cycle. This is checked by root @@ -830,6 +830,14 @@ var work struct { head, tail guintptr } + // rescan is a list of G's that need to be rescanned during + // mark termination. A G adds itself to this list when it + // first invalidates its stack scan. + rescan struct { + lock mutex + list []guintptr + } + // Timing/utilization stats for this cycle. 
stwprocs, maxprocs int32 tSweepTerm, tMark, tMarkTerm, tEnd int64 // nanotime() of phase start @@ -1736,14 +1744,22 @@ func gcCopySpans() { func gcResetMarkState() { // This may be called during a concurrent phase, so make sure // allgs doesn't change. + if !(gcphase == _GCoff || gcphase == _GCmarktermination) { + // Accessing gcRescan is unsafe. + throw("bad GC phase") + } lock(&allglock) for _, gp := range allgs { gp.gcscandone = false // set to true in gcphasework gp.gcscanvalid = false // stack has not been scanned + gp.gcRescan = -1 gp.gcAssistBytes = 0 } unlock(&allglock) + // Clear rescan list. + work.rescan.list = work.rescan.list[:0] + work.bytesMarked = 0 work.initialHeapLive = memstats.heap_live work.markrootDone = false diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go index bad7c7e92b..7f481dee22 100644 --- a/src/runtime/mgcmark.go +++ b/src/runtime/mgcmark.go @@ -32,6 +32,8 @@ const ( // // The caller must have call gcCopySpans(). // +// The world must be stopped. +// //go:nowritebarrier func gcMarkRootPrepare() { // Compute how many data and BSS root blocks there are. @@ -63,24 +65,31 @@ func gcMarkRootPrepare() { // after concurrent mark. In STW GC, this will happen // during mark termination. work.nSpanRoots = (len(work.spans) + rootBlockSpans - 1) / rootBlockSpans + + // On the first markroot, we need to scan all Gs. Gs + // may be created after this point, but it's okay that + // we ignore them because they begin life without any + // roots, so there's nothing to scan, and any roots + // they create during the concurrent phase will be + // scanned during mark termination. During mark + // termination, allglen isn't changing, so we'll scan + // all Gs. + work.nStackRoots = int(atomic.Loaduintptr(&allglen)) + work.nRescanRoots = 0 } else { // We've already scanned span roots and kept the scan // up-to-date during concurrent mark. work.nSpanRoots = 0 - } - // Snapshot of allglen. During concurrent scan, we just need - // to be consistent about how many markroot jobs we create and - // how many Gs we check. Gs may be created after this point, - // but it's okay that we ignore them because they begin life - // without any roots, so there's nothing to scan, and any - // roots they create during the concurrent phase will be - // scanned during mark termination. During mark termination, - // allglen isn't changing, so we'll scan all Gs. - work.nStackRoots = int(atomic.Loaduintptr(&allglen)) + // On the second pass of markroot, we're just scanning + // dirty stacks. It's safe to access rescan since the + // world is stopped. + work.nStackRoots = 0 + work.nRescanRoots = len(work.rescan.list) + } work.markrootNext = 0 - work.markrootJobs = uint32(fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots) + work.markrootJobs = uint32(fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots + work.nRescanRoots) } // gcMarkRootCheck checks that all roots have been scanned. It is @@ -92,11 +101,24 @@ func gcMarkRootCheck() { } lock(&allglock) - // Check that gc work is done. - for i := 0; i < work.nStackRoots; i++ { - gp := allgs[i] - if !gp.gcscandone { - throw("scan missed a g") + // Check that stacks have been scanned. 
+ if gcphase == _GCmarktermination { + for i := 0; i < len(allgs); i++ { + gp := allgs[i] + if !(gp.gcscandone && gp.gcscanvalid) && readgstatus(gp) != _Gdead { + println("gp", gp, "goid", gp.goid, + "status", readgstatus(gp), + "gcscandone", gp.gcscandone, + "gcscanvalid", gp.gcscanvalid) + throw("scan missed a g") + } + } + } else { + for i := 0; i < work.nStackRoots; i++ { + gp := allgs[i] + if !gp.gcscandone { + throw("scan missed a g") + } } } unlock(&allglock) @@ -109,12 +131,18 @@ var oneptrmask = [...]uint8{1} // // Preemption must be disabled (because this uses a gcWork). // +// nowritebarrier is only advisory here. +// //go:nowritebarrier func markroot(gcw *gcWork, i uint32) { + // TODO(austin): This is a bit ridiculous. Compute and store + // the bases in gcMarkRootPrepare instead of the counts. baseData := uint32(fixedRootCount) baseBSS := baseData + uint32(work.nDataRoots) baseSpans := baseBSS + uint32(work.nBSSRoots) baseStacks := baseSpans + uint32(work.nSpanRoots) + baseRescan := baseStacks + uint32(work.nStackRoots) + end := baseRescan + uint32(work.nRescanRoots) // Note: if you add a case here, please also update heapdump.go:dumproots. switch { @@ -151,10 +179,14 @@ func markroot(gcw *gcWork, i uint32) { default: // the rest is scanning goroutine stacks - if uintptr(i-baseStacks) >= allglen { + var gp *g + if baseStacks <= i && i < baseRescan { + gp = allgs[i-baseStacks] + } else if baseRescan <= i && i < end { + gp = work.rescan.list[i-baseRescan].ptr() + } else { throw("markroot: bad index") } - gp := allgs[i-baseStacks] // remember when we've first observed the G blocked // needed only to output in traceback @@ -163,13 +195,14 @@ func markroot(gcw *gcWork, i uint32) { gp.waitsince = work.tstart } - if gcphase != _GCmarktermination && gp.startpc == gcBgMarkWorkerPC { + if gcphase != _GCmarktermination && gp.startpc == gcBgMarkWorkerPC && readgstatus(gp) != _Gdead { // GC background workers may be // non-preemptible, so we may deadlock if we // try to scan them during a concurrent phase. // They also have tiny stacks, so just ignore // them until mark termination. gp.gcscandone = true + queueRescan(gp) break } @@ -721,6 +754,14 @@ func scanstack(gp *g) { gcw.dispose() } gcUnlockStackBarriers(gp) + if gcphase == _GCmark { + // gp may have added itself to the rescan list between + // when GC started and now. It's clean now, so remove + // it. This isn't safe during mark termination because + // mark termination is consuming this list, but it's + // also not necessary. + dequeueRescan(gp) + } gp.gcscanvalid = true } @@ -797,6 +838,60 @@ func scanframeworker(frame *stkframe, cache *pcvalueCache, gcw *gcWork) { } } +// queueRescan adds gp to the stack rescan list and clears +// gp.gcscanvalid. The caller must own gp and ensure that gp isn't +// already on the rescan list. +func queueRescan(gp *g) { + if gcphase == _GCoff { + gp.gcscanvalid = false + return + } + if gp.gcRescan != -1 { + throw("g already on rescan list") + } + + lock(&work.rescan.lock) + gp.gcscanvalid = false + + // Recheck gcphase under the lock in case there was a phase change. + if gcphase == _GCoff { + unlock(&work.rescan.lock) + return + } + if len(work.rescan.list) == cap(work.rescan.list) { + throw("rescan list overflow") + } + n := len(work.rescan.list) + gp.gcRescan = int32(n) + work.rescan.list = work.rescan.list[:n+1] + work.rescan.list[n].set(gp) + unlock(&work.rescan.lock) +} + +// dequeueRescan removes gp from the stack rescan list, if gp is on +// the rescan list. The caller must own gp. 
+func dequeueRescan(gp *g) { + if gp.gcRescan == -1 { + return + } + if gcphase == _GCoff { + gp.gcRescan = -1 + return + } + + lock(&work.rescan.lock) + if work.rescan.list[gp.gcRescan].ptr() != gp { + throw("bad dequeueRescan") + } + // Careful: gp may itself be the last G on the list. + last := work.rescan.list[len(work.rescan.list)-1] + work.rescan.list[gp.gcRescan] = last + last.ptr().gcRescan = gp.gcRescan + gp.gcRescan = -1 + work.rescan.list = work.rescan.list[:len(work.rescan.list)-1] + unlock(&work.rescan.lock) +} + type gcDrainFlags int const ( diff --git a/src/runtime/proc.go b/src/runtime/proc.go index dcdc7bedb8..ee732e3cf7 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -402,6 +402,16 @@ func allgadd(gp *g) { lock(&allglock) allgs = append(allgs, gp) allglen = uintptr(len(allgs)) + + // Grow GC rescan list if necessary. + if len(allgs) > cap(work.rescan.list) { + lock(&work.rescan.lock) + l := work.rescan.list + // Let append do the heavy lifting, but keep the + // length the same. + work.rescan.list = append(l[:cap(l)], 0)[:len(l)] + unlock(&work.rescan.lock) + } unlock(&allglock) } @@ -754,8 +764,9 @@ func casgstatus(gp *g, oldval, newval uint32) { nextYield = nanotime() + yieldDelay/2 } } - if newval == _Grunning { - gp.gcscanvalid = false + if newval == _Grunning && gp.gcscanvalid { + // Run queueRescan on the system stack so it has more space. + systemstack(func() { queueRescan(gp) }) } } @@ -1405,6 +1416,8 @@ func newextram() { gp.syscallpc = gp.sched.pc gp.syscallsp = gp.sched.sp gp.stktopsp = gp.sched.sp + gp.gcscanvalid = true // fresh G, so no dequeueRescan necessary + gp.gcRescan = -1 // malg returns status as Gidle, change to Gsyscall before adding to allg // where GC will see it. casgstatus(gp, _Gidle, _Gsyscall) @@ -2210,6 +2223,10 @@ func goexit0(gp *g) { gp.waitreason = "" gp.param = nil + // Note that gp's stack scan is now "valid" because it has no + // stack. We could dequeueRescan, but that takes a lock and + // isn't really necessary. + gp.gcscanvalid = true dropg() if _g_.m.locked&^_LockExternal != 0 { @@ -2700,6 +2717,7 @@ func newproc1(fn *funcval, argp *uint8, narg int32, nret int32, callerpc uintptr if newg == nil { newg = malg(_StackMin) casgstatus(newg, _Gidle, _Gdead) + newg.gcRescan = -1 allgadd(newg) // publishes with a g->status of Gdead so GC scanner doesn't look at uninitialized stack. } if newg.stack.hi == 0 { @@ -2733,6 +2751,17 @@ func newproc1(fn *funcval, argp *uint8, narg int32, nret int32, callerpc uintptr if isSystemGoroutine(newg) { atomic.Xadd(&sched.ngsys, +1) } + // The stack is dirty from the argument frame, so queue it for + // scanning. Do this before setting it to runnable so we still + // own the G. If we're recycling a G, it may already be on the + // rescan list. + if newg.gcRescan == -1 { + queueRescan(newg) + } else { + // The recycled G is already on the rescan list. Just + // mark the stack dirty. 
+ newg.gcscanvalid = false + } casgstatus(newg, _Gdead, _Grunnable) if _p_.goidcache == _p_.goidcacheend { diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 0a988ce469..d35b897c3e 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -336,7 +336,7 @@ type g struct { paniconfault bool // panic (instead of crash) on unexpected fault address preemptscan bool // preempted g does scan for gc gcscandone bool // g has scanned stack; protected by _Gscan bit in status - gcscanvalid bool // false at start of gc cycle, true if G has not run since last scan + gcscanvalid bool // false at start of gc cycle, true if G has not run since last scan; transition from true to false by calling queueRescan and false to true by calling dequeueRescan throwsplit bool // must not split stack raceignore int8 // ignore race detection events sysblocktraced bool // StartTrace has emitted EvGoInSyscall about this goroutine @@ -354,7 +354,14 @@ type g struct { racectx uintptr waiting *sudog // sudog structures this g is waiting on (that have a valid elem ptr); in lock order - // Per-G gcController state + // Per-G GC state + + // gcRescan is this G's index in work.rescan.list. If this is + // -1, this G is not on the rescan list. + // + // If gcphase != _GCoff and this G is visible to the garbage + // collector, writes to this are protected by work.rescan.lock. + gcRescan int32 // gcAssistBytes is this G's GC assist credit in terms of // bytes allocated. If this is positive, then the G has credit -- cgit v1.3
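Finally, a self-contained toy version of the frontier merge that order1007 above performs, showing how per-goroutine sequence numbers restore the correct order even when per-P timestamps are skewed. The event type and the single per-goroutine counter are deliberate simplifications of the real gState tracking; all names here are illustrative.

	package main

	import "fmt"

	// ev is a simplified trace event: seq is the goroutine sequence
	// number the event requires before it may be merged.
	type ev struct {
		ts   int64
		g    int
		seq  int
		name string
	}

	func main() {
		// Two per-P batches with skewed clocks: P1's GoStart(g1) carries a
		// smaller timestamp than the GoUnblock(g1) on P0 that enables it.
		batches := [][]ev{
			{{ts: 100, g: 1, seq: 5, name: "GoUnblock"}},
			{{ts: 90, g: 1, seq: 6, name: "GoStart"}},
		}
		gseq := map[int]int{1: 5} // current sequence per goroutine
		for remaining := 2; remaining > 0; remaining-- {
			// Frontier: the first event of each batch. Of the events whose
			// sequence matches, merge the one with the lowest timestamp.
			best := -1
			for i, b := range batches {
				if len(b) == 0 || b[0].seq != gseq[b[0].g] {
					continue // empty, or not ready to merge yet
				}
				if best == -1 || b[0].ts < batches[best][0].ts {
					best = i
				}
			}
			if best == -1 {
				panic("no consistent ordering of events possible")
			}
			e := batches[best][0]
			batches[best] = batches[best][1:]
			gseq[e.g] = e.seq + 1
			fmt.Println(e.name, "ts =", e.ts)
		}
		// Prints GoUnblock (ts=100) before GoStart (ts=90): the merge trusts
		// sequence numbers, not the skewed timestamps.
	}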