| | | |
|---|---|---|
| author | Russ Cox <rsc@golang.org> | 2017-12-06 00:35:28 -0500 |
| committer | Russ Cox <rsc@golang.org> | 2017-12-06 01:03:36 -0500 |
| commit | 185e6094fd968b35b80e56aad1286c66bb2cc261 (patch) | |
| tree | 411babe570d6faa1e99251a9167123afd07407d2 /src/runtime | |
| parent | c36033a379a4907fb75309416ffcf2904e613ab9 (diff) | |
| parent | a032f74bf0b40a94669159e7d7e96722eb76199b (diff) | |
| download | go-185e6094fd968b35b80e56aad1286c66bb2cc261.tar.xz | |
[dev.boringcrypto] all: merge master (nearly Go 1.10 beta 1) into dev.boringcrypto
This is a git merge of master into dev.boringcrypto.
The branch was previously based on release-branch.go1.9,
so there are a handful of spurious conflicts that would
also arise if trying to merge master into release-branch.go1.9
(which we never do). Those have all been resolved by taking
the original file from master, discarding any Go 1.9-specific
edits.
all.bash passes on darwin/amd64, which is to say without
actually using BoringCrypto.
Go 1.10-related fixes to BoringCrypto itself will be in a followup CL.
This CL is just the merge.
Change-Id: I4c97711fec0fb86761913dcde28d25c001246c35
Diffstat (limited to 'src/runtime')
303 files changed, 9047 insertions, 4778 deletions
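Among the non-BoringCrypto changes carried in by this merge, the src/runtime/append_test.go hunk below folds the flat BenchmarkGrowSlice* functions into nested sub-benchmarks. The following is a minimal, standalone sketch of that testing.B.Run pattern, not the CL's exact code; the package name is illustrative and only two of the CL's element types are shown:

```go
package append_bench_test // illustrative name, not part of the CL

import "testing"

// BenchmarkGrow groups related slice-growth cases as sub-benchmarks.
// Each b.Run invocation runs its closure as an independent benchmark
// with its own b.N and its own timer, so results are reported as
// BenchmarkGrow/Byte, BenchmarkGrow/Int, and so on.
func BenchmarkGrow(b *testing.B) {
	b.Run("Byte", func(b *testing.B) {
		x := make([]byte, 9) // one-time setup; negligible next to b.N iterations
		for i := 0; i < b.N; i++ {
			_ = append([]byte(nil), x...)
		}
	})
	b.Run("Int", func(b *testing.B) {
		x := make([]int, 9)
		for i := 0; i < b.N; i++ {
			_ = append([]int(nil), x...)
		}
	})
}
```

Sub-benchmarks like these can also be selected individually, e.g. `go test -bench=Grow/Byte -run='^$'`.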
diff --git a/src/runtime/alg.go b/src/runtime/alg.go index 8d388da5a2..89125f48ba 100644 --- a/src/runtime/alg.go +++ b/src/runtime/alg.go @@ -47,26 +47,25 @@ type typeAlg struct { func memhash0(p unsafe.Pointer, h uintptr) uintptr { return h } + func memhash8(p unsafe.Pointer, h uintptr) uintptr { return memhash(p, h, 1) } + func memhash16(p unsafe.Pointer, h uintptr) uintptr { return memhash(p, h, 2) } -func memhash32(p unsafe.Pointer, h uintptr) uintptr { - return memhash(p, h, 4) -} -func memhash64(p unsafe.Pointer, h uintptr) uintptr { - return memhash(p, h, 8) -} + func memhash128(p unsafe.Pointer, h uintptr) uintptr { return memhash(p, h, 16) } -// memhash_varlen is defined in assembly because it needs access -// to the closure. It appears here to provide an argument -// signature for the assembly routine. -func memhash_varlen(p unsafe.Pointer, h uintptr) uintptr +//go:nosplit +func memhash_varlen(p unsafe.Pointer, h uintptr) uintptr { + ptr := getclosureptr() + size := *(*uintptr)(unsafe.Pointer(ptr + unsafe.Sizeof(h))) + return memhash(p, h, size) +} var algarray = [alg_max]typeAlg{ alg_NOEQ: {nil, nil}, diff --git a/src/runtime/append_test.go b/src/runtime/append_test.go index 6bd8f3bd95..ef1e812c0d 100644 --- a/src/runtime/append_test.go +++ b/src/runtime/append_test.go @@ -18,42 +18,52 @@ func BenchmarkMakeSlice(b *testing.B) { } } -func BenchmarkGrowSliceBytes(b *testing.B) { - b.StopTimer() - var x = make([]byte, 9) - b.StartTimer() - for i := 0; i < b.N; i++ { - _ = append([]byte(nil), x...) - } -} - -func BenchmarkGrowSliceInts(b *testing.B) { - b.StopTimer() - var x = make([]int, 9) - b.StartTimer() - for i := 0; i < b.N; i++ { - _ = append([]int(nil), x...) - } -} - -func BenchmarkGrowSlicePtr(b *testing.B) { - b.StopTimer() - var x = make([]*byte, 9) - b.StartTimer() - for i := 0; i < b.N; i++ { - _ = append([]*byte(nil), x...) - } -} +type ( + struct24 struct{ a, b, c int64 } + struct32 struct{ a, b, c, d int64 } + struct40 struct{ a, b, c, d, e int64 } +) -type struct24 struct{ a, b, c int64 } +func BenchmarkGrowSlice(b *testing.B) { + b.Run("Byte", func(b *testing.B) { + x := make([]byte, 9) + for i := 0; i < b.N; i++ { + _ = append([]byte(nil), x...) + } + }) + b.Run("Int", func(b *testing.B) { + x := make([]int, 9) + for i := 0; i < b.N; i++ { + _ = append([]int(nil), x...) + } + }) + b.Run("Ptr", func(b *testing.B) { + x := make([]*byte, 9) + for i := 0; i < b.N; i++ { + _ = append([]*byte(nil), x...) + } + }) + b.Run("Struct", func(b *testing.B) { + b.Run("24", func(b *testing.B) { + x := make([]struct24, 9) + for i := 0; i < b.N; i++ { + _ = append([]struct24(nil), x...) + } + }) + b.Run("32", func(b *testing.B) { + x := make([]struct32, 9) + for i := 0; i < b.N; i++ { + _ = append([]struct32(nil), x...) + } + }) + b.Run("40", func(b *testing.B) { + x := make([]struct40, 9) + for i := 0; i < b.N; i++ { + _ = append([]struct40(nil), x...) + } + }) -func BenchmarkGrowSliceStruct24Bytes(b *testing.B) { - b.StopTimer() - var x = make([]struct24, 9) - b.StartTimer() - for i := 0; i < b.N; i++ { - _ = append([]struct24(nil), x...) - } + }) } func BenchmarkAppend(b *testing.B) { diff --git a/src/runtime/asm_386.s b/src/runtime/asm_386.s index 5bbf2866f3..80a145187c 100644 --- a/src/runtime/asm_386.s +++ b/src/runtime/asm_386.s @@ -7,10 +7,93 @@ #include "funcdata.h" #include "textflag.h" +// _rt0_386 is common startup code for most 386 systems when using +// internal linking. 
This is the entry point for the program from the +// kernel for an ordinary -buildmode=exe program. The stack holds the +// number of arguments and the C-style argv. +TEXT _rt0_386(SB),NOSPLIT,$8 + MOVL 8(SP), AX // argc + LEAL 12(SP), BX // argv + MOVL AX, 0(SP) + MOVL BX, 4(SP) + JMP runtime·rt0_go(SB) + +// _rt0_386_lib is common startup code for most 386 systems when +// using -buildmode=c-archive or -buildmode=c-shared. The linker will +// arrange to invoke this function as a global constructor (for +// c-archive) or when the shared library is loaded (for c-shared). +// We expect argc and argv to be passed on the stack following the +// usual C ABI. +TEXT _rt0_386_lib(SB),NOSPLIT,$0 + PUSHL BP + MOVL SP, BP + PUSHL BX + PUSHL SI + PUSHL DI + + MOVL 8(BP), AX + MOVL AX, _rt0_386_lib_argc<>(SB) + MOVL 12(BP), AX + MOVL AX, _rt0_386_lib_argv<>(SB) + + // Synchronous initialization. + CALL runtime·libpreinit(SB) + + SUBL $8, SP + + // Create a new thread to do the runtime initialization. + MOVL _cgo_sys_thread_create(SB), AX + TESTL AX, AX + JZ nocgo + + // Align stack to call C function. + // We moved SP to BP above, but BP was clobbered by the libpreinit call. + MOVL SP, BP + ANDL $~15, SP + + MOVL $_rt0_386_lib_go(SB), BX + MOVL BX, 0(SP) + MOVL $0, 4(SP) + + CALL AX + + MOVL BP, SP + + JMP restore + +nocgo: + MOVL $0x800000, 0(SP) // stacksize = 8192KB + MOVL $_rt0_386_lib_go(SB), AX + MOVL AX, 4(SP) // fn + CALL runtime·newosproc0(SB) + +restore: + ADDL $8, SP + POPL DI + POPL SI + POPL BX + POPL BP + RET + +// _rt0_386_lib_go initializes the Go runtime. +// This is started in a separate thread by _rt0_386_lib. +TEXT _rt0_386_lib_go(SB),NOSPLIT,$8 + MOVL _rt0_386_lib_argc<>(SB), AX + MOVL AX, 0(SP) + MOVL _rt0_386_lib_argv<>(SB), AX + MOVL AX, 4(SP) + JMP runtime·rt0_go(SB) + +DATA _rt0_386_lib_argc<>(SB)/4, $0 +GLOBL _rt0_386_lib_argc<>(SB),NOPTR, $4 +DATA _rt0_386_lib_argv<>(SB)/4, $0 +GLOBL _rt0_386_lib_argv<>(SB),NOPTR, $4 + TEXT runtime·rt0_go(SB),NOSPLIT,$0 - // copy arguments forward on an even stack - MOVL argc+0(FP), AX - MOVL argv+4(FP), BX + // Copy arguments forward on an even stack. + // Users of this function jump to it, they don't call it. + MOVL 0(SP), AX + MOVL 4(SP), BX SUBL $128, SP // plenty of scratch ANDL $~15, SP MOVL AX, 120(SP) // save argc, argv away @@ -279,18 +362,6 @@ TEXT runtime·gosave(SB), NOSPLIT, $0-4 // restore state from Gobuf; longjmp TEXT runtime·gogo(SB), NOSPLIT, $8-4 MOVL buf+0(FP), BX // gobuf - - // If ctxt is not nil, invoke deletion barrier before overwriting. - MOVL gobuf_ctxt(BX), DX - TESTL DX, DX - JZ nilctxt - LEAL gobuf_ctxt(BX), AX - MOVL AX, 0(SP) - MOVL $0, 4(SP) - CALL runtime·writebarrierptr_prewrite(SB) - MOVL buf+0(FP), BX - -nilctxt: MOVL gobuf_g(BX), DX MOVL 0(DX), CX // make sure g != nil get_tls(CX) @@ -403,11 +474,12 @@ switch: RET noswitch: - // already on system stack, just call directly + // already on system stack; tail call the function + // Using a tail call here cleans up tracebacks since we won't stop + // at an intermediate systemstack. MOVL DI, DX MOVL 0(DI), DI - CALL DI - RET + JMP DI /* * support for morestack @@ -453,7 +525,7 @@ TEXT runtime·morestack(SB),NOSPLIT,$0-0 MOVL SI, (g_sched+gobuf_g)(SI) LEAL 4(SP), AX // f's SP MOVL AX, (g_sched+gobuf_sp)(SI) - // newstack will fill gobuf.ctxt. + MOVL DX, (g_sched+gobuf_ctxt)(SI) // Call newstack on m->g0's stack. 
MOVL m_g0(BX), BP @@ -461,10 +533,8 @@ TEXT runtime·morestack(SB),NOSPLIT,$0-0 MOVL (g_sched+gobuf_sp)(BP), AX MOVL -4(AX), BX // fault if CALL would, before smashing SP MOVL AX, SP - PUSHL DX // ctxt argument CALL runtime·newstack(SB) MOVL $0, 0x1003 // crash if newstack returns - POPL DX // keep balance check happy RET TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0-0 @@ -849,12 +919,6 @@ TEXT runtime·stackcheck(SB), NOSPLIT, $0-0 INT $3 RET -TEXT runtime·getcallerpc(SB),NOSPLIT,$4-8 - MOVL argp+0(FP),AX // addr of first arg - MOVL -4(AX),AX // get calling pc - MOVL AX, ret+4(FP) - RET - // func cputicks() int64 TEXT runtime·cputicks(SB),NOSPLIT,$0-8 CMPB runtime·support_sse2(SB), $1 @@ -885,23 +949,6 @@ TEXT runtime·ldt0setup(SB),NOSPLIT,$16-0 TEXT runtime·emptyfunc(SB),0,$0-0 RET -// memhash_varlen(p unsafe.Pointer, h seed) uintptr -// redirects to memhash(p, h, size) using the size -// stored in the closure. -TEXT runtime·memhash_varlen(SB),NOSPLIT,$16-12 - GO_ARGS - NO_LOCAL_POINTERS - MOVL p+0(FP), AX - MOVL h+4(FP), BX - MOVL 4(DX), CX - MOVL AX, 0(SP) - MOVL BX, 4(SP) - MOVL CX, 8(SP) - CALL runtime·memhash(SB) - MOVL 12(SP), AX - MOVL AX, ret+8(FP) - RET - // hash function using AES hardware instructions TEXT runtime·aeshash(SB),NOSPLIT,$0-16 MOVL p+0(FP), AX // ptr to data @@ -1323,23 +1370,6 @@ eq: MOVB $1, ret+8(FP) RET -// eqstring tests whether two strings are equal. -// The compiler guarantees that strings passed -// to eqstring have equal length. -// See runtime_test.go:eqstring_generic for -// equivalent Go code. -TEXT runtime·eqstring(SB),NOSPLIT,$0-17 - MOVL s1_base+0(FP), SI - MOVL s2_base+8(FP), DI - CMPL SI, DI - JEQ same - MOVL s1_len+4(FP), BX - LEAL ret+16(FP), AX - JMP runtime·memeqbody(SB) -same: - MOVB $1, ret+16(FP) - RET - TEXT bytes·Equal(SB),NOSPLIT,$0-25 MOVL a_len+4(FP), BX MOVL b_len+16(FP), CX @@ -1637,19 +1667,6 @@ TEXT runtime·goexit(SB),NOSPLIT,$0-0 // traceback from goexit1 must hit code range of goexit BYTE $0x90 // NOP -// Prefetching doesn't seem to help. -TEXT runtime·prefetcht0(SB),NOSPLIT,$0-4 - RET - -TEXT runtime·prefetcht1(SB),NOSPLIT,$0-4 - RET - -TEXT runtime·prefetcht2(SB),NOSPLIT,$0-4 - RET - -TEXT runtime·prefetchnta(SB),NOSPLIT,$0-4 - RET - // Add a module's moduledata to the linked list of moduledata objects. This // is called from .init_array by a function generated in the linker and so // follows the platform ABI wrt register preservation -- it only touches AX, diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index 6405be92de..576a61ca6c 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -7,6 +7,83 @@ #include "funcdata.h" #include "textflag.h" +// _rt0_amd64 is common startup code for most amd64 systems when using +// internal linking. This is the entry point for the program from the +// kernel for an ordinary -buildmode=exe program. The stack holds the +// number of arguments and the C-style argv. +TEXT _rt0_amd64(SB),NOSPLIT,$-8 + MOVQ 0(SP), DI // argc + LEAQ 8(SP), SI // argv + JMP runtime·rt0_go(SB) + +// main is common startup code for most amd64 systems when using +// external linking. The C startup code will call the symbol "main" +// passing argc and argv in the usual C ABI registers DI and SI. +TEXT main(SB),NOSPLIT,$-8 + JMP runtime·rt0_go(SB) + +// _rt0_amd64_lib is common startup code for most amd64 systems when +// using -buildmode=c-archive or -buildmode=c-shared. 
The linker will +// arrange to invoke this function as a global constructor (for +// c-archive) or when the shared library is loaded (for c-shared). +// We expect argc and argv to be passed in the usual C ABI registers +// DI and SI. +TEXT _rt0_amd64_lib(SB),NOSPLIT,$0x50 + // Align stack per ELF ABI requirements. + MOVQ SP, AX + ANDQ $~15, SP + // Save C ABI callee-saved registers, as caller may need them. + MOVQ BX, 0x10(SP) + MOVQ BP, 0x18(SP) + MOVQ R12, 0x20(SP) + MOVQ R13, 0x28(SP) + MOVQ R14, 0x30(SP) + MOVQ R15, 0x38(SP) + MOVQ AX, 0x40(SP) + + MOVQ DI, _rt0_amd64_lib_argc<>(SB) + MOVQ SI, _rt0_amd64_lib_argv<>(SB) + + // Synchronous initialization. + CALL runtime·libpreinit(SB) + + // Create a new thread to finish Go runtime initialization. + MOVQ _cgo_sys_thread_create(SB), AX + TESTQ AX, AX + JZ nocgo + MOVQ $_rt0_amd64_lib_go(SB), DI + MOVQ $0, SI + CALL AX + JMP restore + +nocgo: + MOVQ $0x800000, 0(SP) // stacksize + MOVQ $_rt0_amd64_lib_go(SB), AX + MOVQ AX, 8(SP) // fn + CALL runtime·newosproc0(SB) + +restore: + MOVQ 0x10(SP), BX + MOVQ 0x18(SP), BP + MOVQ 0x20(SP), R12 + MOVQ 0x28(SP), R13 + MOVQ 0x30(SP), R14 + MOVQ 0x38(SP), R15 + MOVQ 0x40(SP), SP + RET + +// _rt0_amd64_lib_go initializes the Go runtime. +// This is started in a separate thread by _rt0_amd64_lib. +TEXT _rt0_amd64_lib_go(SB),NOSPLIT,$0 + MOVQ _rt0_amd64_lib_argc<>(SB), DI + MOVQ _rt0_amd64_lib_argv<>(SB), SI + JMP runtime·rt0_go(SB) + +DATA _rt0_amd64_lib_argc<>(SB)/8, $0 +GLOBL _rt0_amd64_lib_argc<>(SB),NOPTR, $8 +DATA _rt0_amd64_lib_argv<>(SB)/8, $0 +GLOBL _rt0_amd64_lib_argv<>(SB),NOPTR, $8 + TEXT runtime·rt0_go(SB),NOSPLIT,$0 // copy arguments forward on an even stack MOVQ DI, AX // argc @@ -227,18 +304,6 @@ TEXT runtime·gosave(SB), NOSPLIT, $0-8 // restore state from Gobuf; longjmp TEXT runtime·gogo(SB), NOSPLIT, $16-8 MOVQ buf+0(FP), BX // gobuf - - // If ctxt is not nil, invoke deletion barrier before overwriting. - MOVQ gobuf_ctxt(BX), AX - TESTQ AX, AX - JZ nilctxt - LEAQ gobuf_ctxt(BX), AX - MOVQ AX, 0(SP) - MOVQ $0, 8(SP) - CALL runtime·writebarrierptr_prewrite(SB) - MOVQ buf+0(FP), BX - -nilctxt: MOVQ gobuf_g(BX), DX MOVQ 0(DX), CX // make sure g != nil get_tls(CX) @@ -354,11 +419,12 @@ switch: RET noswitch: - // already on m stack, just call directly + // already on m stack; tail call the function + // Using a tail call here cleans up tracebacks since we won't stop + // at an intermediate systemstack. MOVQ DI, DX MOVQ 0(DI), DI - CALL DI - RET + JMP DI /* * support for morestack @@ -405,16 +471,14 @@ TEXT runtime·morestack(SB),NOSPLIT,$0-0 LEAQ 8(SP), AX // f's SP MOVQ AX, (g_sched+gobuf_sp)(SI) MOVQ BP, (g_sched+gobuf_bp)(SI) - // newstack will fill gobuf.ctxt. + MOVQ DX, (g_sched+gobuf_ctxt)(SI) // Call newstack on m->g0's stack. MOVQ m_g0(BX), BX MOVQ BX, g(CX) MOVQ (g_sched+gobuf_sp)(BX), SP - PUSHQ DX // ctxt argument CALL runtime·newstack(SB) MOVQ $0, 0x1003 // crash if newstack returns - POPQ DX // keep balance check happy RET // morestack but not preserving ctxt. @@ -833,12 +897,6 @@ TEXT runtime·stackcheck(SB), NOSPLIT, $0-0 INT $3 RET -TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16 - MOVQ argp+0(FP),AX // addr of first arg - MOVQ -8(AX),AX // get calling pc - MOVQ AX, ret+8(FP) - RET - // func cputicks() int64 TEXT runtime·cputicks(SB),NOSPLIT,$0-0 CMPB runtime·lfenceBeforeRdtsc(SB), $1 @@ -854,23 +912,6 @@ done: MOVQ AX, ret+0(FP) RET -// memhash_varlen(p unsafe.Pointer, h seed) uintptr -// redirects to memhash(p, h, size) using the size -// stored in the closure. 
-TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24 - GO_ARGS - NO_LOCAL_POINTERS - MOVQ p+0(FP), AX - MOVQ h+8(FP), BX - MOVQ 8(DX), CX - MOVQ AX, 0(SP) - MOVQ BX, 8(SP) - MOVQ CX, 16(SP) - CALL runtime·memhash(SB) - MOVQ 24(SP), AX - MOVQ AX, ret+16(FP) - RET - // hash function using AES hardware instructions TEXT runtime·aeshash(SB),NOSPLIT,$0-32 MOVQ p+0(FP), AX // ptr to data @@ -1343,23 +1384,6 @@ eq: MOVB $1, ret+16(FP) RET -// eqstring tests whether two strings are equal. -// The compiler guarantees that strings passed -// to eqstring have equal length. -// See runtime_test.go:eqstring_generic for -// equivalent Go code. -TEXT runtime·eqstring(SB),NOSPLIT,$0-33 - MOVQ s1_base+0(FP), SI - MOVQ s2_base+16(FP), DI - CMPQ SI, DI - JEQ eq - MOVQ s1_len+8(FP), BX - LEAQ ret+32(FP), AX - JMP runtime·memeqbody(SB) -eq: - MOVB $1, ret+32(FP) - RET - // a in SI // b in DI // count in BX @@ -2339,26 +2363,6 @@ TEXT runtime·goexit(SB),NOSPLIT,$0-0 // traceback from goexit1 must hit code range of goexit BYTE $0x90 // NOP -TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8 - MOVQ addr+0(FP), AX - PREFETCHT0 (AX) - RET - -TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8 - MOVQ addr+0(FP), AX - PREFETCHT1 (AX) - RET - -TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8 - MOVQ addr+0(FP), AX - PREFETCHT2 (AX) - RET - -TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8 - MOVQ addr+0(FP), AX - PREFETCHNTA (AX) - RET - // This is called from .init_array and follows the platform, not Go, ABI. TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0 PUSHQ R15 // The access to global variables below implicitly uses R15, which is callee-save @@ -2367,3 +2371,87 @@ TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0 MOVQ DI, runtime·lastmoduledatap(SB) POPQ R15 RET + +// gcWriteBarrier performs a heap pointer write and informs the GC. +// +// gcWriteBarrier does NOT follow the Go ABI. It takes two arguments: +// - DI is the destination of the write +// - AX is the value being written at DI +// It clobbers FLAGS. It does not clobber any general-purpose registers, +// but may clobber others (e.g., SSE registers). +TEXT runtime·gcWriteBarrier(SB),NOSPLIT,$120 + // Save the registers clobbered by the fast path. This is slightly + // faster than having the caller spill these. + MOVQ R14, 104(SP) + MOVQ R13, 112(SP) + // TODO: Consider passing g.m.p in as an argument so they can be shared + // across a sequence of write barriers. + get_tls(R13) + MOVQ g(R13), R13 + MOVQ g_m(R13), R13 + MOVQ m_p(R13), R13 + MOVQ (p_wbBuf+wbBuf_next)(R13), R14 + // Increment wbBuf.next position. + LEAQ 16(R14), R14 + MOVQ R14, (p_wbBuf+wbBuf_next)(R13) + CMPQ R14, (p_wbBuf+wbBuf_end)(R13) + // Record the write. + MOVQ AX, -16(R14) // Record value + MOVQ (DI), R13 // TODO: This turns bad writes into bad reads. + MOVQ R13, -8(R14) // Record *slot + // Is the buffer full? (flags set in CMPQ above) + JEQ flush +ret: + MOVQ 104(SP), R14 + MOVQ 112(SP), R13 + // Do the write. + MOVQ AX, (DI) + RET + +flush: + // Save all general purpose registers since these could be + // clobbered by wbBufFlush and were not saved by the caller. + // It is possible for wbBufFlush to clobber other registers + // (e.g., SSE registers), but the compiler takes care of saving + // those in the caller if necessary. This strikes a balance + // with registers that are likely to be used. + // + // We don't have type information for these, but all code under + // here is NOSPLIT, so nothing will observe these. 
+ // + // TODO: We could strike a different balance; e.g., saving X0 + // and not saving GP registers that are less likely to be used. + MOVQ DI, 0(SP) // Also first argument to wbBufFlush + MOVQ AX, 8(SP) // Also second argument to wbBufFlush + MOVQ BX, 16(SP) + MOVQ CX, 24(SP) + MOVQ DX, 32(SP) + // DI already saved + MOVQ SI, 40(SP) + MOVQ BP, 48(SP) + MOVQ R8, 56(SP) + MOVQ R9, 64(SP) + MOVQ R10, 72(SP) + MOVQ R11, 80(SP) + MOVQ R12, 88(SP) + // R13 already saved + // R14 already saved + MOVQ R15, 96(SP) + + // This takes arguments DI and AX + CALL runtime·wbBufFlush(SB) + + MOVQ 0(SP), DI + MOVQ 8(SP), AX + MOVQ 16(SP), BX + MOVQ 24(SP), CX + MOVQ 32(SP), DX + MOVQ 40(SP), SI + MOVQ 48(SP), BP + MOVQ 56(SP), R8 + MOVQ 64(SP), R9 + MOVQ 72(SP), R10 + MOVQ 80(SP), R11 + MOVQ 88(SP), R12 + MOVQ 96(SP), R15 + JMP ret diff --git a/src/runtime/asm_amd64p32.s b/src/runtime/asm_amd64p32.s index 6367b3fef4..7fee79aefb 100644 --- a/src/runtime/asm_amd64p32.s +++ b/src/runtime/asm_amd64p32.s @@ -198,18 +198,6 @@ TEXT runtime·gosave(SB), NOSPLIT, $0-4 // restore state from Gobuf; longjmp TEXT runtime·gogo(SB), NOSPLIT, $8-4 MOVL buf+0(FP), BX // gobuf - - // If ctxt is not nil, invoke deletion barrier before overwriting. - MOVL gobuf_ctxt(BX), DX - TESTL DX, DX - JZ nilctxt - LEAL gobuf_ctxt(BX), AX - MOVL AX, 0(SP) - MOVL $0, 4(SP) - CALL runtime·writebarrierptr_prewrite(SB) - MOVL buf+0(FP), BX - -nilctxt: MOVL gobuf_g(BX), DX MOVL 0(DX), CX // make sure g != nil get_tls(CX) @@ -318,10 +306,11 @@ switch: noswitch: // already on m stack, just call directly + // Using a tail call here cleans up tracebacks since we won't stop + // at an intermediate systemstack. MOVL DI, DX MOVL 0(DI), DI - CALL DI - RET + JMP DI /* * support for morestack @@ -368,16 +357,14 @@ TEXT runtime·morestack(SB),NOSPLIT,$0-0 MOVL SI, (g_sched+gobuf_g)(SI) LEAL 8(SP), AX // f's SP MOVL AX, (g_sched+gobuf_sp)(SI) - // newstack will fill gobuf.ctxt. + MOVL DX, (g_sched+gobuf_ctxt)(SI) // Call newstack on m->g0's stack. MOVL m_g0(BX), BX MOVL BX, g(CX) MOVL (g_sched+gobuf_sp)(BX), SP - PUSHQ DX // ctxt argument CALL runtime·newstack(SB) MOVL $0, 0x1003 // crash if newstack returns - POPQ DX // keep balance check happy RET // morestack trampolines @@ -559,30 +546,6 @@ TEXT runtime·stackcheck(SB), NOSPLIT, $0-0 MOVL 0, AX RET -TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT,$0-8 - MOVL ptr+0(FP), DI - MOVL n+4(FP), CX - MOVQ CX, BX - ANDQ $3, BX - SHRQ $2, CX - MOVQ $0, AX - CLD - REP - STOSL - MOVQ BX, CX - REP - STOSB - // Note: we zero only 4 bytes at a time so that the tail is at most - // 3 bytes. That guarantees that we aren't zeroing pointers with STOSB. - // See issue 13160. - RET - -TEXT runtime·getcallerpc(SB),NOSPLIT,$8-12 - MOVL argp+0(FP),AX // addr of first arg - MOVL -8(AX),AX // get calling pc - MOVL AX, ret+8(FP) - RET - // int64 runtime·cputicks(void) TEXT runtime·cputicks(SB),NOSPLIT,$0-0 RDTSC @@ -591,23 +554,6 @@ TEXT runtime·cputicks(SB),NOSPLIT,$0-0 MOVQ AX, ret+0(FP) RET -// memhash_varlen(p unsafe.Pointer, h seed) uintptr -// redirects to memhash(p, h, size) using the size -// stored in the closure. 
-TEXT runtime·memhash_varlen(SB),NOSPLIT,$24-12 - GO_ARGS - NO_LOCAL_POINTERS - MOVL p+0(FP), AX - MOVL h+4(FP), BX - MOVL 4(DX), CX - MOVL AX, 0(SP) - MOVL BX, 4(SP) - MOVL CX, 8(SP) - CALL runtime·memhash(SB) - MOVL 16(SP), AX - MOVL AX, ret+8(FP) - RET - // hash function using AES hardware instructions // For now, our one amd64p32 system (NaCl) does not // support using AES instructions, so have not bothered to @@ -658,24 +604,6 @@ eq: MOVB $1, ret+8(FP) RET -// eqstring tests whether two strings are equal. -// The compiler guarantees that strings passed -// to eqstring have equal length. -// See runtime_test.go:eqstring_generic for -// equivalent Go code. -TEXT runtime·eqstring(SB),NOSPLIT,$0-17 - MOVL s1_base+0(FP), SI - MOVL s2_base+8(FP), DI - CMPL SI, DI - JEQ same - MOVL s1_len+4(FP), BX - CALL runtime·memeqbody(SB) - MOVB AX, ret+16(FP) - RET -same: - MOVB $1, ret+16(FP) - RET - // a in SI // b in DI // count in BX @@ -1042,27 +970,6 @@ TEXT runtime·goexit(SB),NOSPLIT,$0-0 // traceback from goexit1 must hit code range of goexit BYTE $0x90 // NOP -TEXT runtime·prefetcht0(SB),NOSPLIT,$0-4 - MOVL addr+0(FP), AX - PREFETCHT0 (AX) - RET - -TEXT runtime·prefetcht1(SB),NOSPLIT,$0-4 - MOVL addr+0(FP), AX - PREFETCHT1 (AX) - RET - - -TEXT runtime·prefetcht2(SB),NOSPLIT,$0-4 - MOVL addr+0(FP), AX - PREFETCHT2 (AX) - RET - -TEXT runtime·prefetchnta(SB),NOSPLIT,$0-4 - MOVL addr+0(FP), AX - PREFETCHNTA (AX) - RET - TEXT ·checkASM(SB),NOSPLIT,$0-1 MOVB $1, ret+0(FP) RET diff --git a/src/runtime/asm_arm.s b/src/runtime/asm_arm.s index 09b6759749..306984e8f7 100644 --- a/src/runtime/asm_arm.s +++ b/src/runtime/asm_arm.s @@ -7,14 +7,112 @@ #include "funcdata.h" #include "textflag.h" +// _rt0_arm is common startup code for most ARM systems when using +// internal linking. This is the entry point for the program from the +// kernel for an ordinary -buildmode=exe program. The stack holds the +// number of arguments and the C-style argv. +TEXT _rt0_arm(SB),NOSPLIT,$-4 + MOVW (R13), R0 // argc + MOVW $4(R13), R1 // argv + B runtime·rt0_go(SB) + +// main is common startup code for most ARM systems when using +// external linking. The C startup code will call the symbol "main" +// passing argc and argv in the usual C ABI registers R0 and R1. +TEXT main(SB),NOSPLIT,$-4 + B runtime·rt0_go(SB) + +// _rt0_arm_lib is common startup code for most ARM systems when +// using -buildmode=c-archive or -buildmode=c-shared. The linker will +// arrange to invoke this function as a global constructor (for +// c-archive) or when the shared library is loaded (for c-shared). +// We expect argc and argv to be passed in the usual C ABI registers +// R0 and R1. +TEXT _rt0_arm_lib(SB),NOSPLIT,$104 + // Preserve callee-save registers. Raspberry Pi's dlopen(), for example, + // actually cares that R11 is preserved. + MOVW R4, 12(R13) + MOVW R5, 16(R13) + MOVW R6, 20(R13) + MOVW R7, 24(R13) + MOVW R8, 28(R13) + MOVW R11, 32(R13) + + // Skip floating point registers on GOARM < 6. + MOVB runtime·goarm(SB), R11 + CMP $6, R11 + BLT skipfpsave + MOVD F8, (32+8*1)(R13) + MOVD F9, (32+8*2)(R13) + MOVD F10, (32+8*3)(R13) + MOVD F11, (32+8*4)(R13) + MOVD F12, (32+8*5)(R13) + MOVD F13, (32+8*6)(R13) + MOVD F14, (32+8*7)(R13) + MOVD F15, (32+8*8)(R13) +skipfpsave: + // Save argc/argv. + MOVW R0, _rt0_arm_lib_argc<>(SB) + MOVW R1, _rt0_arm_lib_argv<>(SB) + + // Synchronous initialization. + CALL runtime·libpreinit(SB) + + // Create a new thread to do the runtime initialization. 
+ MOVW _cgo_sys_thread_create(SB), R2 + CMP $0, R2 + BEQ nocgo + MOVW $_rt0_arm_lib_go<>(SB), R0 + MOVW $0, R1 + BL (R2) + B rr +nocgo: + MOVW $0x800000, R0 // stacksize = 8192KB + MOVW $_rt0_arm_lib_go<>(SB), R1 // fn + MOVW R0, 4(R13) + MOVW R1, 8(R13) + BL runtime·newosproc0(SB) +rr: + // Restore callee-save registers and return. + MOVB runtime·goarm(SB), R11 + CMP $6, R11 + BLT skipfprest + MOVD (32+8*1)(R13), F8 + MOVD (32+8*2)(R13), F9 + MOVD (32+8*3)(R13), F10 + MOVD (32+8*4)(R13), F11 + MOVD (32+8*5)(R13), F12 + MOVD (32+8*6)(R13), F13 + MOVD (32+8*7)(R13), F14 + MOVD (32+8*8)(R13), F15 +skipfprest: + MOVW 12(R13), R4 + MOVW 16(R13), R5 + MOVW 20(R13), R6 + MOVW 24(R13), R7 + MOVW 28(R13), R8 + MOVW 32(R13), R11 + RET + +// _rt0_arm_lib_go initializes the Go runtime. +// This is started in a separate thread by _rt0_arm_lib. +TEXT _rt0_arm_lib_go<>(SB),NOSPLIT,$8 + MOVW _rt0_arm_lib_argc<>(SB), R0 + MOVW _rt0_arm_lib_argv<>(SB), R1 + B runtime·rt0_go(SB) + +DATA _rt0_arm_lib_argc<>(SB)/4,$0 +GLOBL _rt0_arm_lib_argc<>(SB),NOPTR,$4 +DATA _rt0_arm_lib_argv<>(SB)/4,$0 +GLOBL _rt0_arm_lib_argv<>(SB),NOPTR,$4 + // using frame size $-4 means do not save LR on stack. +// argc is in R0, argv is in R1. TEXT runtime·rt0_go(SB),NOSPLIT,$-4 MOVW $0xcafebabe, R12 // copy arguments forward on an even stack // use R13 instead of SP to avoid linker rewriting the offsets - MOVW 0(R13), R0 // argc - MOVW 4(R13), R1 // argv SUB $64, R13 // plenty of scratch AND $~7, R13 MOVW R0, 60(R13) // save argc, argv away @@ -129,19 +227,6 @@ TEXT runtime·gosave(SB),NOSPLIT,$-4-4 // restore state from Gobuf; longjmp TEXT runtime·gogo(SB),NOSPLIT,$8-4 MOVW buf+0(FP), R1 - - // If ctxt is not nil, invoke deletion barrier before overwriting. - MOVW gobuf_ctxt(R1), R0 - CMP $0, R0 - B.EQ nilctxt - MOVW $gobuf_ctxt(R1), R0 - MOVW R0, 4(R13) - MOVW $0, R0 - MOVW R0, 8(R13) - BL runtime·writebarrierptr_prewrite(SB) - MOVW buf+0(FP), R1 - -nilctxt: MOVW gobuf_g(R1), R0 BL setg<>(SB) @@ -273,10 +358,12 @@ switch: RET noswitch: + // Using a tail call here cleans up tracebacks since we won't stop + // at an intermediate systemstack. MOVW R0, R7 MOVW 0(R0), R0 - BL (R0) - RET + MOVW.P 4(R13), R14 // restore LR + B (R0) /* * support for morestack @@ -314,7 +401,7 @@ TEXT runtime·morestack(SB),NOSPLIT,$-4-0 MOVW R13, (g_sched+gobuf_sp)(g) MOVW LR, (g_sched+gobuf_pc)(g) MOVW R3, (g_sched+gobuf_lr)(g) - // newstack will fill gobuf.ctxt. + MOVW R7, (g_sched+gobuf_ctxt)(g) // Called from f. // Set m->morebuf to f's caller. @@ -328,8 +415,7 @@ TEXT runtime·morestack(SB),NOSPLIT,$-4-0 BL setg<>(SB) MOVW (g_sched+gobuf_sp)(g), R13 MOVW $0, R0 - MOVW.W R0, -8(R13) // create a call frame on g0 - MOVW R7, 4(R13) // ctxt argument + MOVW.W R0, -4(R13) // create a call frame on g0 (saved LR) BL runtime·newstack(SB) // Not reached, but make sure the return PC from the call to newstack @@ -677,9 +763,9 @@ TEXT setg<>(SB),NOSPLIT,$-4-0 MOVW g, R0 RET -TEXT runtime·getcallerpc(SB),NOSPLIT,$4-8 - MOVW 8(R13), R0 // LR saved by caller - MOVW R0, ret+4(FP) +TEXT runtime·getcallerpc(SB),NOSPLIT,$-4-4 + MOVW 0(R13), R0 // LR saved by caller + MOVW R0, ret+0(FP) RET TEXT runtime·emptyfunc(SB),0,$0-0 @@ -719,23 +805,6 @@ TEXT runtime·aeshashstr(SB),NOSPLIT,$-4-0 MOVW $0, R0 MOVW (R0), R1 -// memhash_varlen(p unsafe.Pointer, h seed) uintptr -// redirects to memhash(p, h, size) using the size -// stored in the closure. 
-TEXT runtime·memhash_varlen(SB),NOSPLIT,$16-12 - GO_ARGS - NO_LOCAL_POINTERS - MOVW p+0(FP), R0 - MOVW h+4(FP), R1 - MOVW 4(R7), R2 - MOVW R0, 4(R13) - MOVW R1, 8(R13) - MOVW R2, 12(R13) - BL runtime·memhash(SB) - MOVW 16(R13), R0 - MOVW R0, ret+8(FP) - RET - // memequal(p, q unsafe.Pointer, size uintptr) bool TEXT runtime·memequal(SB),NOSPLIT,$-4-13 MOVW a+0(FP), R1 @@ -830,31 +899,6 @@ samebytes: MOVW R0, (R7) RET -// eqstring tests whether two strings are equal. -// The compiler guarantees that strings passed -// to eqstring have equal length. -// See runtime_test.go:eqstring_generic for -// equivalent Go code. -TEXT runtime·eqstring(SB),NOSPLIT,$-4-17 - MOVW s1_base+0(FP), R2 - MOVW s2_base+8(FP), R3 - MOVW $1, R8 - MOVB R8, ret+16(FP) - CMP R2, R3 - RET.EQ - MOVW s1_len+4(FP), R0 - ADD R2, R0, R6 -loop: - CMP R2, R6 - RET.EQ - MOVBU.P 1(R2), R4 - MOVBU.P 1(R3), R5 - CMP R4, R5 - BEQ loop - MOVW $0, R8 - MOVB R8, ret+16(FP) - RET - // TODO: share code with memequal? TEXT bytes·Equal(SB),NOSPLIT,$0-25 MOVW a_len+4(FP), R1 @@ -973,18 +1017,6 @@ TEXT runtime·goexit(SB),NOSPLIT,$-4-0 // traceback from goexit1 must hit code range of goexit MOVW R0, R0 // NOP -TEXT runtime·prefetcht0(SB),NOSPLIT,$0-4 - RET - -TEXT runtime·prefetcht1(SB),NOSPLIT,$0-4 - RET - -TEXT runtime·prefetcht2(SB),NOSPLIT,$0-4 - RET - -TEXT runtime·prefetchnta(SB),NOSPLIT,$0-4 - RET - // x -> x/1000000, x%1000000, called from Go with args, results on stack. TEXT runtime·usplit(SB),NOSPLIT,$0-12 MOVW x+0(FP), R0 diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s index 30ecec7675..9bf0646c8d 100644 --- a/src/runtime/asm_arm64.s +++ b/src/runtime/asm_arm64.s @@ -122,18 +122,6 @@ TEXT runtime·gosave(SB), NOSPLIT, $-8-8 // restore state from Gobuf; longjmp TEXT runtime·gogo(SB), NOSPLIT, $24-8 MOVD buf+0(FP), R5 - - // If ctxt is not nil, invoke deletion barrier before overwriting. - MOVD gobuf_ctxt(R5), R0 - CMP $0, R0 - BEQ nilctxt - MOVD $gobuf_ctxt(R5), R0 - MOVD R0, 8(RSP) - MOVD ZR, 16(RSP) - BL runtime·writebarrierptr_prewrite(SB) - MOVD buf+0(FP), R5 - -nilctxt: MOVD gobuf_g(R5), g BL runtime·save_g(SB) @@ -251,9 +239,11 @@ switch: noswitch: // already on m stack, just call directly + // Using a tail call here cleans up tracebacks since we won't stop + // at an intermediate systemstack. MOVD 0(R26), R3 // code pointer - BL (R3) - RET + MOVD.P 16(RSP), R30 // restore LR + B (R3) /* * support for morestack @@ -289,7 +279,7 @@ TEXT runtime·morestack(SB),NOSPLIT,$-8-0 MOVD R0, (g_sched+gobuf_sp)(g) MOVD LR, (g_sched+gobuf_pc)(g) MOVD R3, (g_sched+gobuf_lr)(g) - // newstack will fill gobuf.ctxt. + MOVD R26, (g_sched+gobuf_ctxt)(g) // Called from f. // Set m->morebuf to f's callers. 
@@ -303,8 +293,7 @@ TEXT runtime·morestack(SB),NOSPLIT,$-8-0 BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R0 MOVD R0, RSP - MOVD.W $0, -16(RSP) // create a call frame on g0 - MOVD R26, 8(RSP) // ctxt argument + MOVD.W $0, -16(RSP) // create a call frame on g0 (saved LR; keep 16-aligned) BL runtime·newstack(SB) // Not reached, but make sure the return PC from the call to newstack @@ -368,16 +357,26 @@ TEXT NAME(SB), WRAPPER, $MAXSIZE-24; \ NO_LOCAL_POINTERS; \ /* copy arguments to stack */ \ MOVD arg+16(FP), R3; \ - MOVWU argsize+24(FP), R4; \ - MOVD RSP, R5; \ - ADD $(8-1), R5; \ - SUB $1, R3; \ - ADD R5, R4; \ - CMP R5, R4; \ - BEQ 4(PC); \ - MOVBU.W 1(R3), R6; \ - MOVBU.W R6, 1(R5); \ - B -4(PC); \ + MOVWU argsize+24(FP), R4; \ + ADD $8, RSP, R5; \ + BIC $0xf, R4, R6; \ + CBZ R6, 6(PC); \ + /* if R6=(argsize&~15) != 0 */ \ + ADD R6, R5, R6; \ + /* copy 16 bytes a time */ \ + LDP.P 16(R3), (R7, R8); \ + STP.P (R7, R8), 16(R5); \ + CMP R5, R6; \ + BNE -3(PC); \ + AND $0xf, R4, R6; \ + CBZ R6, 6(PC); \ + /* if R6=(argsize&15) != 0 */ \ + ADD R6, R5, R6; \ + /* copy 1 byte a time for the rest */ \ + MOVBU.P 1(R3), R7; \ + MOVBU.P R7, 1(R5); \ + CMP R5, R6; \ + BNE -3(PC); \ /* call function */ \ MOVD f+8(FP), R26; \ MOVD (R26), R0; \ @@ -704,52 +703,27 @@ TEXT setg_gcc<>(SB),NOSPLIT,$8 MOVD savedR27-8(SP), R27 RET -TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16 - MOVD 16(RSP), R0 // LR saved by caller - MOVD R0, ret+8(FP) +TEXT runtime·getcallerpc(SB),NOSPLIT,$-8-8 + MOVD 0(RSP), R0 // LR saved by caller + MOVD R0, ret+0(FP) RET TEXT runtime·abort(SB),NOSPLIT,$-8-0 B (ZR) UNDEF -// memhash_varlen(p unsafe.Pointer, h seed) uintptr -// redirects to memhash(p, h, size) using the size -// stored in the closure. -TEXT runtime·memhash_varlen(SB),NOSPLIT,$40-24 - GO_ARGS - NO_LOCAL_POINTERS - MOVD p+0(FP), R3 - MOVD h+8(FP), R4 - MOVD 8(R26), R5 - MOVD R3, 8(RSP) - MOVD R4, 16(RSP) - MOVD R5, 24(RSP) - BL runtime·memhash(SB) - MOVD 32(RSP), R3 - MOVD R3, ret+16(FP) - RET - -// memequal(p, q unsafe.Pointer, size uintptr) bool +// memequal(a, b unsafe.Pointer, size uintptr) bool TEXT runtime·memequal(SB),NOSPLIT,$-8-25 - MOVD a+0(FP), R1 + MOVD size+16(FP), R1 + // short path to handle 0-byte case + CBZ R1, equal + MOVD a+0(FP), R0 MOVD b+8(FP), R2 - MOVD size+16(FP), R3 - ADD R1, R3, R6 + MOVD $ret+24(FP), R8 + B runtime·memeqbody<>(SB) +equal: MOVD $1, R0 MOVB R0, ret+24(FP) - CMP R1, R2 - BEQ done -loop: - CMP R1, R6 - BEQ done - MOVBU.P 1(R1), R4 - MOVBU.P 1(R2), R5 - CMP R4, R5 - BEQ loop - - MOVB $0, ret+24(FP) -done: RET // memequal_varlen(a, b unsafe.Pointer) bool @@ -823,102 +797,234 @@ samebytes: MOVD R4, (R7) RET -// eqstring tests whether two strings are equal. -// The compiler guarantees that strings passed -// to eqstring have equal length. -// See runtime_test.go:eqstring_generic for -// equivalent Go code. 
-TEXT runtime·eqstring(SB),NOSPLIT,$0-33 - MOVD s1_base+0(FP), R0 - MOVD s1_len+8(FP), R1 - MOVD s2_base+16(FP), R2 - ADD R0, R1 // end -loop: - CMP R0, R1 - BEQ equal // reaches the end - MOVBU.P 1(R0), R4 - MOVBU.P 1(R2), R5 - CMP R4, R5 - BEQ loop -notequal: - MOVB ZR, ret+32(FP) - RET -equal: - MOVD $1, R0 - MOVB R0, ret+32(FP) - RET - // // functions for other packages // TEXT bytes·IndexByte(SB),NOSPLIT,$0-40 MOVD b+0(FP), R0 - MOVD b_len+8(FP), R1 - MOVBU c+24(FP), R2 // byte to find - MOVD R0, R4 // store base for later - ADD R0, R1 // end -loop: - CMP R0, R1 - BEQ notfound - MOVBU.P 1(R0), R3 - CMP R2, R3 - BNE loop - - SUB $1, R0 // R0 will be one beyond the position we want - SUB R4, R0 // remove base - MOVD R0, ret+32(FP) - RET - -notfound: - MOVD $-1, R0 - MOVD R0, ret+32(FP) - RET + MOVD b_len+8(FP), R2 + MOVBU c+24(FP), R1 + MOVD $ret+32(FP), R8 + B runtime·indexbytebody<>(SB) TEXT strings·IndexByte(SB),NOSPLIT,$0-32 MOVD s+0(FP), R0 - MOVD s_len+8(FP), R1 - MOVBU c+16(FP), R2 // byte to find - MOVD R0, R4 // store base for later - ADD R0, R1 // end + MOVD s_len+8(FP), R2 + MOVBU c+16(FP), R1 + MOVD $ret+24(FP), R8 + B runtime·indexbytebody<>(SB) + +// input: +// R0: data +// R1: byte to search +// R2: data len +// R8: address to put result +TEXT runtime·indexbytebody<>(SB),NOSPLIT,$0 + // Core algorithm: + // For each 32-byte chunk we calculate a 64-bit syndrome value, + // with two bits per byte. For each tuple, bit 0 is set if the + // relevant byte matched the requested character and bit 1 is + // not used (faster than using a 32bit syndrome). Since the bits + // in the syndrome reflect exactly the order in which things occur + // in the original string, counting trailing zeros allows to + // identify exactly which byte has matched. + + CBZ R2, fail + MOVD R0, R11 + // Magic constant 0x40100401 allows us to identify + // which lane matches the requested byte. + // 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24)) + // Different bytes have different bit masks (i.e: 1, 4, 16, 64) + MOVD $0x40100401, R5 + VMOV R1, V0.B16 + // Work with aligned 32-byte chunks + BIC $0x1f, R0, R3 + VMOV R5, V5.S4 + ANDS $0x1f, R0, R9 + AND $0x1f, R2, R10 + BEQ loop + + // Input string is not 32-byte aligned. We calculate the + // syndrome value for the aligned 32 bytes block containing + // the first bytes and mask off the irrelevant part. + VLD1.P (R3), [V1.B16, V2.B16] + SUB $0x20, R9, R4 + ADDS R4, R2, R2 + VCMEQ V0.B16, V1.B16, V3.B16 + VCMEQ V0.B16, V2.B16, V4.B16 + VAND V5.B16, V3.B16, V3.B16 + VAND V5.B16, V4.B16, V4.B16 + VADDP V4.B16, V3.B16, V6.B16 // 256->128 + VADDP V6.B16, V6.B16, V6.B16 // 128->64 + VMOV V6.D[0], R6 + // Clear the irrelevant lower bits + LSL $1, R9, R4 + LSR R4, R6, R6 + LSL R4, R6, R6 + // The first block can also be the last + BLS masklast + // Have we found something already? 
+ CBNZ R6, tail + loop: - CMP R0, R1 - BEQ notfound - MOVBU.P 1(R0), R3 - CMP R2, R3 - BNE loop + VLD1.P (R3), [V1.B16, V2.B16] + SUBS $0x20, R2, R2 + VCMEQ V0.B16, V1.B16, V3.B16 + VCMEQ V0.B16, V2.B16, V4.B16 + // If we're out of data we finish regardless of the result + BLS end + // Use a fast check for the termination condition + VORR V4.B16, V3.B16, V6.B16 + VADDP V6.D2, V6.D2, V6.D2 + VMOV V6.D[0], R6 + // We're not out of data, loop if we haven't found the character + CBZ R6, loop + +end: + // Termination condition found, let's calculate the syndrome value + VAND V5.B16, V3.B16, V3.B16 + VAND V5.B16, V4.B16, V4.B16 + VADDP V4.B16, V3.B16, V6.B16 + VADDP V6.B16, V6.B16, V6.B16 + VMOV V6.D[0], R6 + // Only do the clear for the last possible block with less than 32 bytes + // Condition flags come from SUBS in the loop + BHS tail + +masklast: + // Clear the irrelevant upper bits + ADD R9, R10, R4 + AND $0x1f, R4, R4 + SUB $0x20, R4, R4 + NEG R4<<1, R4 + LSL R4, R6, R6 + LSR R4, R6, R6 - SUB $1, R0 // R0 will be one beyond the position we want - SUB R4, R0 // remove base - MOVD R0, ret+24(FP) +tail: + // Check that we have found a character + CBZ R6, fail + // Count the trailing zeros using bit reversing + RBIT R6, R6 + // Compensate the last post-increment + SUB $0x20, R3, R3 + // And count the leading zeros + CLZ R6, R6 + // R6 is twice the offset into the fragment + ADD R6>>1, R3, R0 + // Compute the offset result + SUB R11, R0, R0 + MOVD R0, (R8) RET -notfound: +fail: MOVD $-1, R0 - MOVD R0, ret+24(FP) + MOVD R0, (R8) RET -// TODO: share code with memequal? +// Equal(a, b []byte) bool TEXT bytes·Equal(SB),NOSPLIT,$0-49 MOVD a_len+8(FP), R1 MOVD b_len+32(FP), R3 - CMP R1, R3 // unequal lengths are not equal - BNE notequal + CMP R1, R3 + // unequal lengths are not equal + BNE not_equal + // short path to handle 0-byte case + CBZ R1, equal MOVD a+0(FP), R0 MOVD b+24(FP), R2 - ADD R0, R1 // end -loop: - CMP R0, R1 - BEQ equal // reaches the end - MOVBU.P 1(R0), R4 - MOVBU.P 1(R2), R5 - CMP R4, R5 - BEQ loop -notequal: + MOVD $ret+48(FP), R8 + B runtime·memeqbody<>(SB) +equal: + MOVD $1, R0 + MOVB R0, ret+48(FP) + RET +not_equal: MOVB ZR, ret+48(FP) RET + +// input: +// R0: pointer a +// R1: data len +// R2: pointer b +// R8: address to put result +TEXT runtime·memeqbody<>(SB),NOSPLIT,$0 + CMP $1, R1 + // handle 1-byte special case for better performance + BEQ one + CMP $16, R1 + // handle specially if length < 16 + BLO tail + BIC $0x3f, R1, R3 + CBZ R3, chunk16 + // work with 64-byte chunks + ADD R3, R0, R6 // end of chunks +chunk64_loop: + VLD1.P (R0), [V0.D2, V1.D2, V2.D2, V3.D2] + VLD1.P (R2), [V4.D2, V5.D2, V6.D2, V7.D2] + VCMEQ V0.D2, V4.D2, V8.D2 + VCMEQ V1.D2, V5.D2, V9.D2 + VCMEQ V2.D2, V6.D2, V10.D2 + VCMEQ V3.D2, V7.D2, V11.D2 + VAND V8.B16, V9.B16, V8.B16 + VAND V8.B16, V10.B16, V8.B16 + VAND V8.B16, V11.B16, V8.B16 + CMP R0, R6 + VMOV V8.D[0], R4 + VMOV V8.D[1], R5 + CBZ R4, not_equal + CBZ R5, not_equal + BNE chunk64_loop + AND $0x3f, R1, R1 + CBZ R1, equal +chunk16: + // work with 16-byte chunks + BIC $0xf, R1, R3 + CBZ R3, tail + ADD R3, R0, R6 // end of chunks +chunk16_loop: + VLD1.P (R0), [V0.D2] + VLD1.P (R2), [V1.D2] + VCMEQ V0.D2, V1.D2, V2.D2 + CMP R0, R6 + VMOV V2.D[0], R4 + VMOV V2.D[1], R5 + CBZ R4, not_equal + CBZ R5, not_equal + BNE chunk16_loop + AND $0xf, R1, R1 + CBZ R1, equal +tail: + // special compare of tail with length < 16 + TBZ $3, R1, lt_8 + MOVD.P 8(R0), R4 + MOVD.P 8(R2), R5 + CMP R4, R5 + BNE not_equal +lt_8: + TBZ $2, R1, lt_4 + MOVWU.P 4(R0), R4 
+ MOVWU.P 4(R2), R5 + CMP R4, R5 + BNE not_equal +lt_4: + TBZ $1, R1, lt_2 + MOVHU.P 2(R0), R4 + MOVHU.P 2(R2), R5 + CMP R4, R5 + BNE not_equal +lt_2: + TBZ $0, R1, equal +one: + MOVBU (R0), R4 + MOVBU (R2), R5 + CMP R4, R5 + BNE not_equal equal: MOVD $1, R0 - MOVB R0, ret+48(FP) + MOVB R0, (R8) + RET +not_equal: + MOVB ZR, (R8) RET TEXT runtime·return0(SB), NOSPLIT, $0 @@ -931,19 +1037,6 @@ TEXT runtime·goexit(SB),NOSPLIT,$-8-0 MOVD R0, R0 // NOP BL runtime·goexit1(SB) // does not return -// TODO(aram): use PRFM here. -TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8 - RET - -TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8 - RET - -TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8 - RET - -TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8 - RET - TEXT runtime·sigreturn(SB),NOSPLIT,$0-0 RET diff --git a/src/runtime/asm_mips64x.s b/src/runtime/asm_mips64x.s index 57d45785f1..12cea00adc 100644 --- a/src/runtime/asm_mips64x.s +++ b/src/runtime/asm_mips64x.s @@ -108,17 +108,6 @@ TEXT runtime·gosave(SB), NOSPLIT, $-8-8 // restore state from Gobuf; longjmp TEXT runtime·gogo(SB), NOSPLIT, $16-8 MOVV buf+0(FP), R3 - - // If ctxt is not nil, invoke deletion barrier before overwriting. - MOVV gobuf_ctxt(R3), R1 - BEQ R1, nilctxt - MOVV $gobuf_ctxt(R3), R1 - MOVV R1, 8(R29) - MOVV R0, 16(R29) - JAL runtime·writebarrierptr_prewrite(SB) - MOVV buf+0(FP), R3 - -nilctxt: MOVV gobuf_g(R3), g // make sure g is not nil JAL runtime·save_g(SB) @@ -225,9 +214,12 @@ switch: noswitch: // already on m stack, just call directly + // Using a tail call here cleans up tracebacks since we won't stop + // at an intermediate systemstack. MOVV 0(REGCTXT), R4 // code pointer - JAL (R4) - RET + MOVV 0(R29), R31 // restore LR + ADDV $8, R29 + JMP (R4) /* * support for morestack @@ -260,7 +252,7 @@ TEXT runtime·morestack(SB),NOSPLIT,$-8-0 MOVV R29, (g_sched+gobuf_sp)(g) MOVV R31, (g_sched+gobuf_pc)(g) MOVV R3, (g_sched+gobuf_lr)(g) - // newstack will fill gobuf.ctxt. + MOVV REGCTXT, (g_sched+gobuf_ctxt)(g) // Called from f. // Set m->morebuf to f's caller. @@ -273,9 +265,8 @@ TEXT runtime·morestack(SB),NOSPLIT,$-8-0 JAL runtime·save_g(SB) MOVV (g_sched+gobuf_sp)(g), R29 // Create a stack frame on g0 to call newstack. - MOVV R0, -16(R29) // Zero saved LR in frame - ADDV $-16, R29 - MOVV REGCTXT, 8(R29) // ctxt argument + MOVV R0, -8(R29) // Zero saved LR in frame + ADDV $-8, R29 JAL runtime·newstack(SB) // Not reached, but make sure the return PC from the call to newstack @@ -616,32 +607,15 @@ TEXT setg_gcc<>(SB),NOSPLIT,$0-0 JAL runtime·save_g(SB) RET -TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16 - MOVV 16(R29), R1 // LR saved by caller - MOVV R1, ret+8(FP) +TEXT runtime·getcallerpc(SB),NOSPLIT,$-8-8 + MOVV 0(R29), R1 // LR saved by caller + MOVV R1, ret+0(FP) RET TEXT runtime·abort(SB),NOSPLIT,$-8-0 MOVW (R0), R0 UNDEF -// memhash_varlen(p unsafe.Pointer, h seed) uintptr -// redirects to memhash(p, h, size) using the size -// stored in the closure. -TEXT runtime·memhash_varlen(SB),NOSPLIT,$40-24 - GO_ARGS - NO_LOCAL_POINTERS - MOVV p+0(FP), R1 - MOVV h+8(FP), R2 - MOVV 8(REGCTXT), R3 - MOVV R1, 8(R29) - MOVV R2, 16(R29) - MOVV R3, 24(R29) - JAL runtime·memhash(SB) - MOVV 32(R29), R1 - MOVV R1, ret+16(FP) - RET - // AES hashing not implemented for mips64 TEXT runtime·aeshash(SB),NOSPLIT,$-8-0 MOVW (R0), R1 @@ -696,31 +670,6 @@ eq: MOVB R1, ret+16(FP) RET -// eqstring tests whether two strings are equal. -// The compiler guarantees that strings passed -// to eqstring have equal length. -// See runtime_test.go:eqstring_generic for -// equivalent Go code. 
-TEXT runtime·eqstring(SB),NOSPLIT,$0-33 - MOVV s1_base+0(FP), R1 - MOVV s2_base+16(FP), R2 - MOVV $1, R3 - MOVB R3, ret+32(FP) - BNE R1, R2, 2(PC) - RET - MOVV s1_len+8(FP), R3 - ADDV R1, R3, R4 -loop: - BNE R1, R4, 2(PC) - RET - MOVBU (R1), R6 - ADDV $1, R1 - MOVBU (R2), R7 - ADDV $1, R2 - BEQ R6, R7, loop - MOVB R0, ret+32(FP) - RET - // TODO: share code with memequal? TEXT bytes·Equal(SB),NOSPLIT,$0-49 MOVV a_len+8(FP), R3 @@ -823,18 +772,6 @@ TEXT runtime·goexit(SB),NOSPLIT,$-8-0 // traceback from goexit1 must hit code range of goexit NOR R0, R0 // NOP -TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8 - RET - -TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8 - RET - -TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8 - RET - -TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8 - RET - TEXT ·checkASM(SB),NOSPLIT,$0-1 MOVW $1, R1 MOVB R1, ret+0(FP) diff --git a/src/runtime/asm_mipsx.s b/src/runtime/asm_mipsx.s index 536c3156b5..bba6a9501d 100644 --- a/src/runtime/asm_mipsx.s +++ b/src/runtime/asm_mipsx.s @@ -109,17 +109,6 @@ TEXT runtime·gosave(SB),NOSPLIT,$-4-4 // restore state from Gobuf; longjmp TEXT runtime·gogo(SB),NOSPLIT,$8-4 MOVW buf+0(FP), R3 - - // If ctxt is not nil, invoke deletion barrier before overwriting. - MOVW gobuf_ctxt(R3), R1 - BEQ R1, nilctxt - MOVW $gobuf_ctxt(R3), R1 - MOVW R1, 4(R29) - MOVW R0, 8(R29) - JAL runtime·writebarrierptr_prewrite(SB) - MOVW buf+0(FP), R3 - -nilctxt: MOVW gobuf_g(R3), g // make sure g is not nil JAL runtime·save_g(SB) @@ -226,9 +215,12 @@ switch: noswitch: // already on m stack, just call directly + // Using a tail call here cleans up tracebacks since we won't stop + // at an intermediate systemstack. MOVW 0(REGCTXT), R4 // code pointer - JAL (R4) - RET + MOVW 0(R29), R31 // restore LR + ADD $4, R29 + JMP (R4) /* * support for morestack @@ -261,7 +253,7 @@ TEXT runtime·morestack(SB),NOSPLIT,$-4-0 MOVW R29, (g_sched+gobuf_sp)(g) MOVW R31, (g_sched+gobuf_pc)(g) MOVW R3, (g_sched+gobuf_lr)(g) - // newstack will fill gobuf.ctxt. + MOVW REGCTXT, (g_sched+gobuf_ctxt)(g) // Called from f. // Set m->morebuf to f's caller. @@ -274,9 +266,8 @@ TEXT runtime·morestack(SB),NOSPLIT,$-4-0 JAL runtime·save_g(SB) MOVW (g_sched+gobuf_sp)(g), R29 // Create a stack frame on g0 to call newstack. - MOVW R0, -8(R29) // Zero saved LR in frame - ADDU $-8, R29 - MOVW REGCTXT, 4(R29) // ctxt argument + MOVW R0, -4(R29) // Zero saved LR in frame + ADDU $-4, R29 JAL runtime·newstack(SB) // Not reached, but make sure the return PC from the call to newstack @@ -619,31 +610,14 @@ TEXT setg_gcc<>(SB),NOSPLIT,$0 JAL runtime·save_g(SB) RET -TEXT runtime·getcallerpc(SB),NOSPLIT,$4-8 - MOVW 8(R29), R1 // LR saved by caller - MOVW R1, ret+4(FP) +TEXT runtime·getcallerpc(SB),NOSPLIT,$-4-4 + MOVW 0(R29), R1 // LR saved by caller + MOVW R1, ret+0(FP) RET TEXT runtime·abort(SB),NOSPLIT,$0-0 UNDEF -// memhash_varlen(p unsafe.Pointer, h seed) uintptr -// redirects to memhash(p, h, size) using the size -// stored in the closure. -TEXT runtime·memhash_varlen(SB),NOSPLIT,$16-12 - GO_ARGS - NO_LOCAL_POINTERS - MOVW p+0(FP), R1 - MOVW h+4(FP), R2 - MOVW 4(REGCTXT), R3 - MOVW R1, 4(R29) - MOVW R2, 8(R29) - MOVW R3, 12(R29) - JAL runtime·memhash(SB) - MOVW 16(R29), R1 - MOVW R1, ret+8(FP) - RET - // Not implemented. TEXT runtime·aeshash(SB),NOSPLIT,$0 UNDEF @@ -712,31 +686,6 @@ eq: MOVB R1, ret+8(FP) RET -// eqstring tests whether two strings are equal. -// The compiler guarantees that strings passed -// to eqstring have equal length. -// See runtime_test.go:eqstring_generic for -// equivalent Go code. 
-TEXT runtime·eqstring(SB),NOSPLIT,$0-17 - MOVW s1_base+0(FP), R1 - MOVW s2_base+8(FP), R2 - MOVW $1, R3 - MOVBU R3, ret+16(FP) - BNE R1, R2, 2(PC) - RET - MOVW s1_len+4(FP), R3 - ADDU R1, R3, R4 -loop: - BNE R1, R4, 2(PC) - RET - MOVBU (R1), R6 - ADDU $1, R1 - MOVBU (R2), R7 - ADDU $1, R2 - BEQ R6, R7, loop - MOVB R0, ret+16(FP) - RET - TEXT bytes·Equal(SB),NOSPLIT,$0-25 MOVW a_len+4(FP), R3 MOVW b_len+16(FP), R4 @@ -903,18 +852,6 @@ TEXT runtime·goexit(SB),NOSPLIT,$-4-0 // traceback from goexit1 must hit code range of goexit NOR R0, R0 // NOP -TEXT runtime·prefetcht0(SB),NOSPLIT,$0-4 - RET - -TEXT runtime·prefetcht1(SB),NOSPLIT,$0-4 - RET - -TEXT runtime·prefetcht2(SB),NOSPLIT,$0-4 - RET - -TEXT runtime·prefetchnta(SB),NOSPLIT,$0-4 - RET - TEXT ·checkASM(SB),NOSPLIT,$0-1 MOVW $1, R1 MOVB R1, ret+0(FP) diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s index 616861ea7d..e02ca16907 100644 --- a/src/runtime/asm_ppc64x.s +++ b/src/runtime/asm_ppc64x.s @@ -133,18 +133,6 @@ TEXT runtime·gosave(SB), NOSPLIT|NOFRAME, $0-8 // restore state from Gobuf; longjmp TEXT runtime·gogo(SB), NOSPLIT, $16-8 MOVD buf+0(FP), R5 - - // If ctxt is not nil, invoke deletion barrier before overwriting. - MOVD gobuf_ctxt(R5), R3 - CMP R0, R3 - BEQ nilctxt - MOVD $gobuf_ctxt(R5), R3 - MOVD R3, FIXED_FRAME+0(R1) - MOVD R0, FIXED_FRAME+8(R1) - BL runtime·writebarrierptr_prewrite(SB) - MOVD buf+0(FP), R5 - -nilctxt: MOVD gobuf_g(R5), g // make sure g is not nil BL runtime·save_g(SB) @@ -277,6 +265,9 @@ switch: noswitch: // already on m stack, just call directly + // On other arches we do a tail call here, but it appears to be + // impossible to tail call a function pointer in shared mode on + // ppc64 because the caller is responsible for restoring the TOC. MOVD 0(R11), R12 // code pointer MOVD R12, CTR BL (CTR) @@ -317,7 +308,7 @@ TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0 MOVD LR, R8 MOVD R8, (g_sched+gobuf_pc)(g) MOVD R5, (g_sched+gobuf_lr)(g) - // newstack will fill gobuf.ctxt. + MOVD R11, (g_sched+gobuf_ctxt)(g) // Called from f. // Set m->morebuf to f's caller. @@ -329,8 +320,7 @@ TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0 MOVD m_g0(R7), g BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R1 - MOVDU R0, -(FIXED_FRAME+8)(R1) // create a call frame on g0 - MOVD R11, FIXED_FRAME+0(R1) // ctxt argument + MOVDU R0, -(FIXED_FRAME+0)(R1) // create a call frame on g0 BL runtime·newstack(SB) // Not reached, but make sure the return PC from the call to newstack @@ -714,9 +704,9 @@ TEXT setg_gcc<>(SB),NOSPLIT|NOFRAME,$0-0 MOVD R4, LR RET -TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16 - MOVD FIXED_FRAME+8(R1), R3 // LR saved by caller - MOVD R3, ret+8(FP) +TEXT runtime·getcallerpc(SB),NOSPLIT|NOFRAME,$0-8 + MOVD 0(R1), R3 // LR saved by caller + MOVD R3, ret+0(FP) RET TEXT runtime·abort(SB),NOSPLIT|NOFRAME,$0-0 @@ -738,23 +728,6 @@ TEXT runtime·cputicks(SB),NOSPLIT,$0-8 MOVD R3, ret+0(FP) RET -// memhash_varlen(p unsafe.Pointer, h seed) uintptr -// redirects to memhash(p, h, size) using the size -// stored in the closure. 
-TEXT runtime·memhash_varlen(SB),NOSPLIT,$40-24 - GO_ARGS - NO_LOCAL_POINTERS - MOVD p+0(FP), R3 - MOVD h+8(FP), R4 - MOVD 8(R11), R5 - MOVD R3, FIXED_FRAME+0(R1) - MOVD R4, FIXED_FRAME+8(R1) - MOVD R5, FIXED_FRAME+16(R1) - BL runtime·memhash(SB) - MOVD FIXED_FRAME+24(R1), R3 - MOVD R3, ret+16(FP) - RET - // AES hashing not implemented for ppc64 TEXT runtime·aeshash(SB),NOSPLIT|NOFRAME,$0-0 MOVW (R0), R1 @@ -1074,24 +1047,6 @@ equal: MOVD $1, R9 RET -// eqstring tests whether two strings are equal. -// The compiler guarantees that strings passed -// to eqstring have equal length. -// See runtime_test.go:eqstring_generic for -// equivalent Go code. -TEXT runtime·eqstring(SB),NOSPLIT,$0-33 - MOVD s1_base+0(FP), R3 - MOVD s2_base+16(FP), R4 - MOVD $1, R5 - MOVB R5, ret+32(FP) - CMP R3, R4 - BNE 2(PC) - RET - MOVD s1_len+8(FP), R5 - BL runtime·memeqbody(SB) - MOVB R9, ret+32(FP) - RET - TEXT bytes·Equal(SB),NOSPLIT,$0-49 MOVD a_len+8(FP), R4 MOVD b_len+32(FP), R5 @@ -1129,24 +1084,17 @@ TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32 TEXT runtime·indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0 DCBT (R3) // Prepare cache line. - MOVD R3,R10 // Save base address for calculating the index later. + MOVD R3,R17 // Save base address for calculating the index later. RLDICR $0,R3,$60,R8 // Align address to doubleword boundary in R8. RLDIMI $8,R5,$48,R5 // Replicating the byte across the register. - - // Calculate last acceptable address and check for possible overflow - // using a saturated add. - // Overflows set last acceptable address to 0xffffffffffffffff. - ADD R4,R3,R7 - SUBC R3,R7,R6 - SUBE R0,R0,R9 - MOVW R9,R6 - OR R6,R7,R7 + ADD R4,R3,R7 // Last acceptable address in R7. RLDIMI $16,R5,$32,R5 CMPU R4,$32 // Check if it's a small string (<32 bytes). Those will be processed differently. MOVD $-1,R9 - WORD $0x54661EB8 // Calculate padding in R6 (rlwinm r6,r3,3,26,28). + WORD $0x54661EB8 // Calculate padding in R6 (rlwinm r6,r3,3,26,28). RLDIMI $32,R5,$0,R5 + MOVD R7,R10 // Save last acceptable address in R10 for later. ADD $-1,R7,R7 #ifdef GOARCH_ppc64le SLD R6,R9,R9 // Prepare mask for Little Endian @@ -1155,56 +1103,142 @@ TEXT runtime·indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0 #endif BLE small_string // Jump to the small string case if it's <32 bytes. - // Case for length >32 bytes + // If we are 64-byte aligned, branch to qw_align just to get the auxiliary values + // in V0, V1 and V10, then branch to the preloop. + ANDCC $63,R3,R11 + BEQ CR0,qw_align + RLDICL $0,R3,$61,R11 + MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8. CMPB R12,R5,R3 // Check for a match. AND R9,R3,R3 // Mask bytes below s_base - RLDICL $0,R7,$61,R4 // length-1 + RLDICL $0,R7,$61,R6 // length-1 RLDICR $0,R7,$60,R7 // Last doubleword in R7 CMPU R3,$0,CR7 // If we have a match, jump to the final computation BNE CR7,done + ADD $8,R8,R8 + ADD $-8,R4,R4 + ADD R4,R11,R4 - // Check for doubleword alignment and jump to the loop setup if aligned. - MOVFL R8,CR7 - BC 12,28,loop_setup + // Check for quadword alignment + ANDCC $15,R8,R11 + BEQ CR0,qw_align - // Not aligned, so handle the second doubleword - MOVDU 8(R8),R12 + // Not aligned, so handle the next doubleword + MOVD 0(R8),R12 CMPB R12,R5,R3 CMPU R3,$0,CR7 BNE CR7,done + ADD $8,R8,R8 + ADD $-8,R4,R4 -loop_setup: - // We are now aligned to a 16-byte boundary. We will load two doublewords - // per loop iteration. The last doubleword is in R7, so our loop counter - // starts at (R7-R8)/16. 
- SUB R8,R7,R6 - SRD $4,R6,R6 - MOVD R6,CTR + // Either quadword aligned or 64-byte at this point. We can use LVX. +qw_align: + + // Set up auxiliary data for the vectorized algorithm. + VSPLTISB $0,V0 // Replicate 0 across V0 + VSPLTISB $3,V10 // Use V10 as control for VBPERMQ + MTVRD R5,V1 + LVSL (R0+R0),V11 + VSLB V11,V10,V10 + VSPLTB $7,V1,V1 // Replicate byte across V1 + CMPU R4, $64 // If len <= 64, don't use the vectorized loop + BLE tail + + // We will load 4 quardwords per iteration in the loop, so check for + // 64-byte alignment. If 64-byte aligned, then branch to the preloop. + ANDCC $63,R8,R11 + BEQ CR0,preloop + + // Not 64-byte aligned. Load one quadword at a time until aligned. + LVX (R8+R0),V4 + VCMPEQUBCC V1,V4,V6 // Check for byte in V4 + BNE CR6,found_qw_align + ADD $16,R8,R8 + ADD $-16,R4,R4 + + ANDCC $63,R8,R11 + BEQ CR0,preloop + LVX (R8+R0),V4 + VCMPEQUBCC V1,V4,V6 // Check for byte in V4 + BNE CR6,found_qw_align + ADD $16,R8,R8 + ADD $-16,R4,R4 + + ANDCC $63,R8,R11 + BEQ CR0,preloop + LVX (R8+R0),V4 + VCMPEQUBCC V1,V4,V6 // Check for byte in V4 + BNE CR6,found_qw_align + ADD $-16,R4,R4 + ADD $16,R8,R8 - // Note: when we have an align directive, align this loop to 32 bytes so - // it fits in a single icache sector. + // 64-byte aligned. Prepare for the main loop. +preloop: + CMPU R4,$64 + BLE tail // If len <= 64, don't use the vectorized loop + + // We are now aligned to a 64-byte boundary. We will load 4 quadwords + // per loop iteration. The last doubleword is in R10, so our loop counter + // starts at (R10-R8)/64. + SUB R8,R10,R6 + SRD $6,R6,R9 // Loop counter in R9 + MOVD R9,CTR + + MOVD $16,R11 // Load offsets for the vector loads + MOVD $32,R9 + MOVD $48,R7 + + // Main loop we will load 64 bytes per iteration loop: - // Load two doublewords, then compare and merge in a single register. We - // will check two doublewords per iteration, then find out which of them - // contains the byte later. This speeds up the search. - MOVD 8(R8),R12 - MOVDU 16(R8),R11 - CMPB R12,R5,R3 - CMPB R11,R5,R9 - OR R3,R9,R6 - CMPU R6,$0,CR7 - BNE CR7,found - BC 16,0,loop + LVX (R8+R0),V2 // Load 4 16-byte vectors + LVX (R11+R8),V3 + LVX (R9+R8),V4 + LVX (R7+R8),V5 + VCMPEQUB V1,V2,V6 // Look for byte in each vector + VCMPEQUB V1,V3,V7 + VCMPEQUB V1,V4,V8 + VCMPEQUB V1,V5,V9 + VOR V6,V7,V11 // Compress the result in a single vector + VOR V8,V9,V12 + VOR V11,V12,V11 + VCMPEQUBCC V0,V11,V11 // Check for byte + BGE CR6,found + ADD $64,R8,R8 + BC 16,0,loop // bdnz loop - // Counter zeroed, but we may have another doubleword to read - CMPU R8,R7 - BEQ notfound + // Handle the tailing bytes or R4 <= 64 + RLDICL $0,R6,$58,R4 +tail: + CMPU R4,$0 + BEQ notfound + LVX (R8+R0),V4 + VCMPEQUBCC V1,V4,V6 + BNE CR6,found_qw_align + ADD $16,R8,R8 + CMPU R4,$16,CR6 + BLE CR6,notfound + ADD $-16,R4,R4 - MOVDU 8(R8),R12 - CMPB R12,R5,R3 - CMPU R3,$0,CR6 - BNE CR6,done + LVX (R8+R0),V4 + VCMPEQUBCC V1,V4,V6 + BNE CR6,found_qw_align + ADD $16,R8,R8 + CMPU R4,$16,CR6 + BLE CR6,notfound + ADD $-16,R4,R4 + + LVX (R8+R0),V4 + VCMPEQUBCC V1,V4,V6 + BNE CR6,found_qw_align + ADD $16,R8,R8 + CMPU R4,$16,CR6 + BLE CR6,notfound + ADD $-16,R4,R4 + + LVX (R8+R0),V4 + VCMPEQUBCC V1,V4,V6 + BNE CR6,found_qw_align notfound: MOVD $-1,R3 @@ -1212,15 +1246,68 @@ notfound: RET found: - // One of the doublewords from the loop contains the byte we are looking - // for. Check the first doubleword and adjust the address if found. 
- CMPU R3,$0,CR6 - ADD $-8,R8,R8 - BNE CR6,done + // We will now compress the results into a single doubleword, + // so it can be moved to a GPR for the final index calculation. - // Not found, so it must be in the second doubleword of the merged pair. - MOVD R9,R3 - ADD $8,R8,R8 + // The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the + // first bit of each byte into bits 48-63. + VBPERMQ V6,V10,V6 + VBPERMQ V7,V10,V7 + VBPERMQ V8,V10,V8 + VBPERMQ V9,V10,V9 + + // Shift each 16-bit component into its correct position for + // merging into a single doubleword. +#ifdef GOARCH_ppc64le + VSLDOI $2,V7,V7,V7 + VSLDOI $4,V8,V8,V8 + VSLDOI $6,V9,V9,V9 +#else + VSLDOI $6,V6,V6,V6 + VSLDOI $4,V7,V7,V7 + VSLDOI $2,V8,V8,V8 +#endif + + // Merge V6-V9 into a single doubleword and move to a GPR. + VOR V6,V7,V11 + VOR V8,V9,V4 + VOR V4,V11,V4 + MFVRD V4,R3 + +#ifdef GOARCH_ppc64le + ADD $-1,R3,R11 + ANDN R3,R11,R11 + POPCNTD R11,R11 // Count trailing zeros (Little Endian). +#else + CNTLZD R3,R11 // Count leading zeros (Big Endian). +#endif + ADD R8,R11,R3 // Calculate byte address + +return: + SUB R17,R3 + MOVD R3,(R14) + RET + +found_qw_align: + // Use the same algorithm as above. Compress the result into + // a single doubleword and move it to a GPR for the final + // calculation. + VBPERMQ V6,V10,V6 + +#ifdef GOARCH_ppc64le + MFVRD V6,R3 + ADD $-1,R3,R11 + ANDN R3,R11,R11 + POPCNTD R11,R11 +#else + VSLDOI $6,V6,V6,V6 + MFVRD V6,R3 + CNTLZD R3,R11 +#endif + ADD R8,R11,R3 + CMPU R11,R4 + BLT return + BR notfound done: // At this point, R3 has 0xFF in the same position as the byte we are @@ -1236,17 +1323,10 @@ done: CMPU R8,R7 // Check if we are at the last doubleword. SRD $3,R11 // Convert trailing zeros to bytes. ADD R11,R8,R3 - CMPU R11,R4,CR7 // If at the last doubleword, check the byte offset. + CMPU R11,R6,CR7 // If at the last doubleword, check the byte offset. BNE return BLE CR7,return - MOVD $-1,R3 - MOVD R3,(R14) - RET - -return: - SUB R10,R3 // Calculate index. - MOVD R3,(R14) - RET + BR notfound small_string: // We unroll this loop for better performance. @@ -1257,9 +1337,9 @@ small_string: CMPB R12,R5,R3 // Check for a match. AND R9,R3,R3 // Mask bytes below s_base. CMPU R3,$0,CR7 // If we have a match, jump to the final computation. - RLDICL $0,R7,$61,R4 // length-1 + RLDICL $0,R7,$61,R6 // length-1 RLDICR $0,R7,$60,R7 // Last doubleword in R7. - CMPU R8,R7 + CMPU R8,R7 BNE CR7,done BEQ notfound // Hit length. 
@@ -1287,34 +1367,70 @@ small_string: MOVDU 8(R8),R12 CMPB R12,R5,R3 CMPU R3,$0,CR6 - CMPU R8,R7 BNE CR6,done BR notfound TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40 MOVD s1_base+0(FP), R5 - MOVD s1_len+8(FP), R3 MOVD s2_base+16(FP), R6 + MOVD s1_len+8(FP), R3 + CMP R5,R6,CR7 MOVD s2_len+24(FP), R4 MOVD $ret+32(FP), R7 + CMP R3,R4,CR6 + BEQ CR7,equal + +notequal: #ifdef GOARCH_ppc64le BR cmpbodyLE<>(SB) #else BR cmpbodyBE<>(SB) #endif +equal: + BEQ CR6,done + MOVD $1, R8 + BGT CR6,greater + NEG R8 + +greater: + MOVD R8, (R7) + RET + +done: + MOVD $0, (R7) + RET + TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56 MOVD s1+0(FP), R5 - MOVD s1+8(FP), R3 MOVD s2+24(FP), R6 + MOVD s1+8(FP), R3 + CMP R5,R6,CR7 MOVD s2+32(FP), R4 MOVD $ret+48(FP), R7 + CMP R3,R4,CR6 + BEQ CR7,equal + #ifdef GOARCH_ppc64le BR cmpbodyLE<>(SB) #else BR cmpbodyBE<>(SB) #endif +equal: + BEQ CR6,done + MOVD $1, R8 + BGT CR6,greater + NEG R8 + +greater: + MOVD R8, (R7) + RET + +done: + MOVD $0, (R7) + RET + TEXT runtime·return0(SB), NOSPLIT, $0 MOVW $0, R3 RET @@ -1353,18 +1469,6 @@ TEXT runtime·goexit(SB),NOSPLIT|NOFRAME,$0-0 // traceback from goexit1 must hit code range of goexit MOVD R0, R0 // NOP -TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8 - RET - -TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8 - RET - -TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8 - RET - -TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8 - RET - TEXT runtime·sigreturn(SB),NOSPLIT,$0-0 RET diff --git a/src/runtime/asm_s390x.s b/src/runtime/asm_s390x.s index 20e740b927..6b71830557 100644 --- a/src/runtime/asm_s390x.s +++ b/src/runtime/asm_s390x.s @@ -7,6 +7,83 @@ #include "funcdata.h" #include "textflag.h" +// _rt0_s390x_lib is common startup code for s390x systems when +// using -buildmode=c-archive or -buildmode=c-shared. The linker will +// arrange to invoke this function as a global constructor (for +// c-archive) or when the shared library is loaded (for c-shared). +// We expect argc and argv to be passed in the usual C ABI registers +// R2 and R3. +TEXT _rt0_s390x_lib(SB), NOSPLIT|NOFRAME, $0 + STMG R6, R15, 48(R15) + MOVD R2, _rt0_s390x_lib_argc<>(SB) + MOVD R3, _rt0_s390x_lib_argv<>(SB) + + // Save R6-R15 in the register save area of the calling function. + STMG R6, R15, 48(R15) + + // Allocate 80 bytes on the stack. + MOVD $-80(R15), R15 + + // Save F8-F15 in our stack frame. + FMOVD F8, 16(R15) + FMOVD F9, 24(R15) + FMOVD F10, 32(R15) + FMOVD F11, 40(R15) + FMOVD F12, 48(R15) + FMOVD F13, 56(R15) + FMOVD F14, 64(R15) + FMOVD F15, 72(R15) + + // Synchronous initialization. + MOVD $runtime·libpreinit(SB), R1 + BL R1 + + // Create a new thread to finish Go runtime initialization. + MOVD _cgo_sys_thread_create(SB), R1 + CMP R1, $0 + BEQ nocgo + MOVD $_rt0_s390x_lib_go(SB), R2 + MOVD $0, R3 + BL R1 + BR restore + +nocgo: + MOVD $0x800000, R1 // stacksize + MOVD R1, 0(R15) + MOVD $_rt0_s390x_lib_go(SB), R1 + MOVD R1, 8(R15) // fn + MOVD $runtime·newosproc(SB), R1 + BL R1 + +restore: + // Restore F8-F15 from our stack frame. + FMOVD 16(R15), F8 + FMOVD 24(R15), F9 + FMOVD 32(R15), F10 + FMOVD 40(R15), F11 + FMOVD 48(R15), F12 + FMOVD 56(R15), F13 + FMOVD 64(R15), F14 + FMOVD 72(R15), F15 + MOVD $80(R15), R15 + + // Restore R6-R15. + LMG 48(R15), R6, R15 + RET + +// _rt0_s390x_lib_go initializes the Go runtime. +// This is started in a separate thread by _rt0_s390x_lib. 
+TEXT _rt0_s390x_lib_go(SB), NOSPLIT|NOFRAME, $0 + MOVD _rt0_s390x_lib_argc<>(SB), R2 + MOVD _rt0_s390x_lib_argv<>(SB), R3 + MOVD $runtime·rt0_go(SB), R1 + BR R1 + +DATA _rt0_s390x_lib_argc<>(SB)/8, $0 +GLOBL _rt0_s390x_lib_argc<>(SB), NOPTR, $8 +DATA _rt0_s90x_lib_argv<>(SB)/8, $0 +GLOBL _rt0_s390x_lib_argv<>(SB), NOPTR, $8 + TEXT runtime·rt0_go(SB),NOSPLIT,$0 // R2 = argc; R3 = argv; R11 = temp; R13 = g; R15 = stack pointer // C TLS base pointer in AR0:AR1 @@ -116,17 +193,6 @@ TEXT runtime·gosave(SB), NOSPLIT, $-8-8 // restore state from Gobuf; longjmp TEXT runtime·gogo(SB), NOSPLIT, $16-8 MOVD buf+0(FP), R5 - - // If ctxt is not nil, invoke deletion barrier before overwriting. - MOVD gobuf_ctxt(R5), R1 - CMPBEQ R1, $0, nilctxt - MOVD $gobuf_ctxt(R5), R1 - MOVD R1, 8(R15) - MOVD R0, 16(R15) - BL runtime·writebarrierptr_prewrite(SB) - MOVD buf+0(FP), R5 - -nilctxt: MOVD gobuf_g(R5), g // make sure g is not nil BL runtime·save_g(SB) @@ -235,9 +301,12 @@ switch: noswitch: // already on m stack, just call directly + // Using a tail call here cleans up tracebacks since we won't stop + // at an intermediate systemstack. MOVD 0(R12), R3 // code pointer - BL (R3) - RET + MOVD 0(R15), LR // restore LR + ADD $8, R15 + BR (R3) /* * support for morestack @@ -272,7 +341,7 @@ TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0 MOVD LR, R8 MOVD R8, (g_sched+gobuf_pc)(g) MOVD R5, (g_sched+gobuf_lr)(g) - // newstack will fill gobuf.ctxt. + MOVD R12, (g_sched+gobuf_ctxt)(g) // Called from f. // Set m->morebuf to f's caller. @@ -285,9 +354,8 @@ TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0 BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R15 // Create a stack frame on g0 to call newstack. - MOVD $0, -16(R15) // Zero saved LR in frame - SUB $16, R15 - MOVD R12, 8(R15) // ctxt argument + MOVD $0, -8(R15) // Zero saved LR in frame + SUB $8, R15 BL runtime·newstack(SB) // Not reached, but make sure the return PC from the call to newstack @@ -656,9 +724,9 @@ TEXT setg_gcc<>(SB),NOSPLIT|NOFRAME,$0-0 MOVD R1, LR RET -TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16 - MOVD 16(R15), R3 // LR saved by caller - MOVD R3, ret+8(FP) +TEXT runtime·getcallerpc(SB),NOSPLIT|NOFRAME,$0-8 + MOVD 0(R15), R3 // LR saved by caller + MOVD R3, ret+0(FP) RET TEXT runtime·abort(SB),NOSPLIT|NOFRAME,$0-0 @@ -678,23 +746,6 @@ TEXT runtime·cputicks(SB),NOSPLIT,$0-8 MOVD R3, ret+0(FP) RET -// memhash_varlen(p unsafe.Pointer, h seed) uintptr -// redirects to memhash(p, h, size) using the size -// stored in the closure. -TEXT runtime·memhash_varlen(SB),NOSPLIT,$40-24 - GO_ARGS - NO_LOCAL_POINTERS - MOVD p+0(FP), R3 - MOVD h+8(FP), R4 - MOVD 8(R12), R5 - MOVD R3, 8(R15) - MOVD R4, 16(R15) - MOVD R5, 24(R15) - BL runtime·memhash(SB) - MOVD 32(R15), R3 - MOVD R3, ret+16(FP) - RET - // AES hashing not implemented for s390x TEXT runtime·aeshash(SB),NOSPLIT|NOFRAME,$0-0 MOVW (R0), R15 @@ -721,18 +772,6 @@ TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-17 LA ret+16(FP), R7 BR runtime·memeqbody(SB) -// eqstring tests whether two strings are equal. -// The compiler guarantees that strings passed -// to eqstring have equal length. -// See runtime_test.go:eqstring_generic for -// equivalent Go code. 
-TEXT runtime·eqstring(SB),NOSPLIT|NOFRAME,$0-33 - MOVD s1_base+0(FP), R3 - MOVD s1_len+8(FP), R6 - MOVD s2_base+16(FP), R5 - LA ret+32(FP), R7 - BR runtime·memeqbody(SB) - TEXT bytes·Equal(SB),NOSPLIT|NOFRAME,$0-49 MOVD a_len+8(FP), R2 MOVD b_len+32(FP), R6 @@ -949,23 +988,12 @@ TEXT runtime·goexit(SB),NOSPLIT|NOFRAME,$0-0 // traceback from goexit1 must hit code range of goexit BYTE $0x07; BYTE $0x00; // 2-byte nop -TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8 - RET - -TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8 - RET - -TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8 - RET - -TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8 - RET - TEXT runtime·sigreturn(SB),NOSPLIT,$0-0 RET TEXT ·publicationBarrier(SB),NOSPLIT|NOFRAME,$0-0 - SYNC + // Stores are already ordered on s390x, so this is just a + // compile barrier. RET TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40 diff --git a/src/runtime/cgo/asm_386.s b/src/runtime/cgo/asm_386.s index dc8897d353..7293c20bf8 100644 --- a/src/runtime/cgo/asm_386.s +++ b/src/runtime/cgo/asm_386.s @@ -7,26 +7,23 @@ // Called by C code generated by cmd/cgo. // func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr) // Saves C callee-saved registers and calls fn with three arguments. -TEXT crosscall2(SB),NOSPLIT,$0 - PUSHL BP - MOVL SP, BP - PUSHL BX - PUSHL SI - PUSHL DI - - SUBL $12, SP - MOVL 20(BP), AX +TEXT crosscall2(SB),NOSPLIT,$28-16 + MOVL BP, 24(SP) + MOVL BX, 20(SP) + MOVL SI, 16(SP) + MOVL DI, 12(SP) + + MOVL ctxt+12(FP), AX MOVL AX, 8(SP) - MOVL 16(BP), AX + MOVL n+8(FP), AX MOVL AX, 4(SP) - MOVL 12(BP), AX + MOVL a+4(FP), AX MOVL AX, 0(SP) - MOVL 8(BP), AX + MOVL fn+0(FP), AX CALL AX - ADDL $12, SP - - POPL DI - POPL SI - POPL BX - POPL BP + + MOVL 12(SP), DI + MOVL 16(SP), SI + MOVL 20(SP), BX + MOVL 24(SP), BP RET diff --git a/src/runtime/cgo/asm_amd64.s b/src/runtime/cgo/asm_amd64.s index 541bd9ea01..0e33fc4796 100644 --- a/src/runtime/cgo/asm_amd64.s +++ b/src/runtime/cgo/asm_amd64.s @@ -7,14 +7,12 @@ // Called by C code generated by cmd/cgo. // func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr) // Saves C callee-saved registers and calls fn with three arguments. -TEXT crosscall2(SB),NOSPLIT,$0 #ifndef GOOS_windows - SUBQ $0x58, SP /* keeps stack pointer 32-byte aligned */ +TEXT crosscall2(SB),NOSPLIT,$0x50-0 /* keeps stack pointer 32-byte aligned */ #else - SUBQ $0x118, SP /* also need to save xmm6 - xmm15 */ +TEXT crosscall2(SB),NOSPLIT,$0x110-0 /* also need to save xmm6 - xmm15 */ #endif MOVQ BX, 0x18(SP) - MOVQ BP, 0x20(SP) MOVQ R12, 0x28(SP) MOVQ R13, 0x30(SP) MOVQ R14, 0x38(SP) @@ -62,15 +60,9 @@ TEXT crosscall2(SB),NOSPLIT,$0 #endif MOVQ 0x18(SP), BX - MOVQ 0x20(SP), BP MOVQ 0x28(SP), R12 MOVQ 0x30(SP), R13 MOVQ 0x38(SP), R14 MOVQ 0x40(SP), R15 -#ifndef GOOS_windows - ADDQ $0x58, SP -#else - ADDQ $0x118, SP -#endif RET diff --git a/src/runtime/cgo/asm_mipsx.s b/src/runtime/cgo/asm_mipsx.s index dd16af6fbe..2483bdd7d4 100644 --- a/src/runtime/cgo/asm_mipsx.s +++ b/src/runtime/cgo/asm_mipsx.s @@ -20,7 +20,11 @@ TEXT crosscall2(SB),NOSPLIT,$-4 // Space for 9 caller-saved GPR + LR + 6 caller-saved FPR. // O32 ABI allows us to smash 16 bytes argument area of caller frame. +#ifndef GOMIPS_softfloat SUBU $(4*14+8*6-16), R29 +#else + SUBU $(4*14-16), R29 // For soft-float, no FPR. 
+#endif MOVW R5, (4*1)(R29) MOVW R6, (4*2)(R29) MOVW R7, (4*3)(R29) @@ -34,14 +38,14 @@ TEXT crosscall2(SB),NOSPLIT,$-4 MOVW R23, (4*11)(R29) MOVW g, (4*12)(R29) MOVW R31, (4*13)(R29) - +#ifndef GOMIPS_softfloat MOVD F20, (4*14)(R29) MOVD F22, (4*14+8*1)(R29) MOVD F24, (4*14+8*2)(R29) MOVD F26, (4*14+8*3)(R29) MOVD F28, (4*14+8*4)(R29) MOVD F30, (4*14+8*5)(R29) - +#endif JAL runtime·load_g(SB) JAL (R4) @@ -55,7 +59,7 @@ TEXT crosscall2(SB),NOSPLIT,$-4 MOVW (4*11)(R29), R23 MOVW (4*12)(R29), g MOVW (4*13)(R29), R31 - +#ifndef GOMIPS_softfloat MOVD (4*14)(R29), F20 MOVD (4*14+8*1)(R29), F22 MOVD (4*14+8*2)(R29), F24 @@ -64,4 +68,7 @@ TEXT crosscall2(SB),NOSPLIT,$-4 MOVD (4*14+8*5)(R29), F30 ADDU $(4*14+8*6-16), R29 +#else + ADDU $(4*14-16), R29 +#endif RET diff --git a/src/runtime/cgo/gcc_android_386.c b/src/runtime/cgo/gcc_android_386.c index 23a15f1c87..28f553c446 100644 --- a/src/runtime/cgo/gcc_android_386.c +++ b/src/runtime/cgo/gcc_android_386.c @@ -36,7 +36,7 @@ inittls(void) */ ntofree = 0; for(;;) { - if(pthread_key_create(&k, nil) < 0) { + if(pthread_key_create(&k, nil) != 0) { fprintf(stderr, "runtime/cgo: pthread_key_create failed\n"); abort(); } @@ -77,7 +77,10 @@ threadentry(void *v) ts = *(ThreadStart*)v; free(v); - pthread_setspecific(k1, (void*)ts.g); + if (pthread_setspecific(k1, (void*)ts.g) != 0) { + fprintf(stderr, "runtime/cgo: pthread_setspecific failed\n"); + abort(); + } crosscall_386(ts.fn); return nil; diff --git a/src/runtime/cgo/gcc_android_amd64.c b/src/runtime/cgo/gcc_android_amd64.c index e006c49bcf..6f92d90dd4 100644 --- a/src/runtime/cgo/gcc_android_amd64.c +++ b/src/runtime/cgo/gcc_android_amd64.c @@ -41,7 +41,7 @@ inittls(void) */ ntofree = 0; for(;;) { - if(pthread_key_create(&k, nil) < 0) { + if(pthread_key_create(&k, nil) != 0) { fprintf(stderr, "runtime/cgo: pthread_key_create failed\n"); abort(); } @@ -82,7 +82,10 @@ threadentry(void *v) ts = *(ThreadStart*)v; free(v); - pthread_setspecific(k1, (void*)ts.g); + if (pthread_setspecific(k1, (void*)ts.g) != 0) { + fprintf(stderr, "runtime/cgo: pthread_setspecific failed\n"); + abort(); + } crosscall_amd64(ts.fn); return nil; diff --git a/src/runtime/cgo/gcc_darwin_386.c b/src/runtime/cgo/gcc_darwin_386.c index 4ab3267d64..7d3c55cd56 100644 --- a/src/runtime/cgo/gcc_darwin_386.c +++ b/src/runtime/cgo/gcc_darwin_386.c @@ -39,8 +39,8 @@ inittls(void) * * The linker and runtime hard-code this constant offset * from %gs where we expect to find g. - * Known to ../../../liblink/sym.c:/468 - * and to ../sys_darwin_386.s:/468 + * Known to src/cmd/link/internal/ld/sym.go:/0x468 + * and to src/runtime/sys_darwin_386.s:/0x468 * * This is truly disgusting and a bit fragile, but taking care * of it here protects the rest of the system from damage. 
@@ -64,7 +64,7 @@ inittls(void) */ ntofree = 0; for(;;) { - if(pthread_key_create(&k, nil) < 0) { + if(pthread_key_create(&k, nil) != 0) { fprintf(stderr, "runtime/cgo: pthread_key_create failed\n"); abort(); } @@ -142,7 +142,10 @@ threadentry(void *v) ts = *(ThreadStart*)v; free(v); - pthread_setspecific(k1, (void*)ts.g); + if (pthread_setspecific(k1, (void*)ts.g) != 0) { + fprintf(stderr, "runtime/cgo: pthread_setspecific failed\n"); + abort(); + } crosscall_386(ts.fn); return nil; diff --git a/src/runtime/cgo/gcc_darwin_amd64.c b/src/runtime/cgo/gcc_darwin_amd64.c index 181d0ab490..c57608c675 100644 --- a/src/runtime/cgo/gcc_darwin_amd64.c +++ b/src/runtime/cgo/gcc_darwin_amd64.c @@ -28,14 +28,14 @@ inittls(void) * * The linker and runtime hard-code this constant offset * from %gs where we expect to find g. - * Known to ../../../liblink/sym.c:/8a0 - * and to ../sys_darwin_amd64.s:/8a0 + * Known to src/cmd/link/internal/ld/sym.go:/0x8a0 + * and to src/runtime/sys_darwin_amd64.s:/0x8a0 * * As disgusting as on the 386; same justification. */ ntofree = 0; for(;;) { - if(pthread_key_create(&k, nil) < 0) { + if(pthread_key_create(&k, nil) != 0) { fprintf(stderr, "runtime/cgo: pthread_key_create failed\n"); abort(); } @@ -113,7 +113,10 @@ threadentry(void *v) ts = *(ThreadStart*)v; free(v); - pthread_setspecific(k1, (void*)ts.g); + if (pthread_setspecific(k1, (void*)ts.g) != 0) { + fprintf(stderr, "runtime/cgo: pthread_setspecific failed\n"); + abort(); + } crosscall_amd64(ts.fn); return nil; diff --git a/src/runtime/cgo/gcc_libinit.c b/src/runtime/cgo/gcc_libinit.c index 31594addce..3dc5bde4cc 100644 --- a/src/runtime/cgo/gcc_libinit.c +++ b/src/runtime/cgo/gcc_libinit.c @@ -98,6 +98,10 @@ _cgo_try_pthread_create(pthread_t* thread, const pthread_attr_t* attr, void* (*p for (tries = 0; tries < 20; tries++) { err = pthread_create(thread, attr, pfn, arg); + if (err == 0) { + pthread_detach(*thread); + return 0; + } if (err != EAGAIN) { return err; } diff --git a/src/runtime/cgo/gcc_mipsx.S b/src/runtime/cgo/gcc_mipsx.S index c51c36a9b7..54f4b8201a 100644 --- a/src/runtime/cgo/gcc_mipsx.S +++ b/src/runtime/cgo/gcc_mipsx.S @@ -14,8 +14,11 @@ .globl crosscall1 .set noat crosscall1: +#ifndef __mips_soft_float addiu $29, $29, -88 - +#else + addiu $29, $29, -40 // For soft-float, no need to make room for FP registers +#endif sw $31, 0($29) sw $16, 4($29) sw $17, 8($29) @@ -27,14 +30,14 @@ crosscall1: sw $23, 32($29) sw $30, 36($29) +#ifndef __mips_soft_float sdc1 $f20, 40($29) sdc1 $f22, 48($29) sdc1 $f24, 56($29) sdc1 $f26, 64($29) sdc1 $f28, 72($29) sdc1 $f30, 80($29) - - +#endif move $20, $4 // save R4 move $4, $6 jalr $5 // call setg_gcc @@ -49,16 +52,20 @@ crosscall1: lw $22, 28($29) lw $23, 32($29) lw $30, 36($29) +#ifndef __mips_soft_float ldc1 $f20, 40($29) ldc1 $f22, 48($29) ldc1 $f24, 56($29) ldc1 $f26, 64($29) ldc1 $f28, 72($29) ldc1 $f30, 80($29) - +#endif lw $31, 0($29) - +#ifndef __mips_soft_float addiu $29, $29, 88 +#else + addiu $29, $29, 40 +#endif jr $31 .set at diff --git a/src/runtime/cgo/gcc_mmap.c b/src/runtime/cgo/gcc_mmap.c index 29acd3c185..5cf6bdf8cf 100644 --- a/src/runtime/cgo/gcc_mmap.c +++ b/src/runtime/cgo/gcc_mmap.c @@ -11,7 +11,7 @@ #include "libcgo.h" -void * +uintptr_t x_cgo_mmap(void *addr, uintptr_t length, int32_t prot, int32_t flags, int32_t fd, uint32_t offset) { void *p; @@ -20,9 +20,9 @@ x_cgo_mmap(void *addr, uintptr_t length, int32_t prot, int32_t flags, int32_t fd _cgo_tsan_release(); if (p == MAP_FAILED) { /* This is what the Go code expects on failure. 
*/ - p = (void *) (uintptr_t) errno; + return (uintptr_t)errno; } - return p; + return (uintptr_t)p; } void diff --git a/src/runtime/cgo/gcc_signal2_darwin_armx.c b/src/runtime/cgo/gcc_signal2_darwin_armx.c new file mode 100644 index 0000000000..54b7e32658 --- /dev/null +++ b/src/runtime/cgo/gcc_signal2_darwin_armx.c @@ -0,0 +1,13 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build lldb +// +build darwin +// +build arm arm64 + +// Used by gcc_signal_darwin_armx.c when doing the test build during cgo. +// We hope that for real binaries the definition provided by Go will take precedence +// and the linker will drop this .o file altogether, which is why this definition +// is all by itself in its own file. +void __attribute__((weak)) xx_cgo_panicmem(void) {} diff --git a/src/runtime/cgo/gcc_signal_darwin_armx.c b/src/runtime/cgo/gcc_signal_darwin_armx.c index a2d520bce8..3ab1d8b0d6 100644 --- a/src/runtime/cgo/gcc_signal_darwin_armx.c +++ b/src/runtime/cgo/gcc_signal_darwin_armx.c @@ -39,7 +39,8 @@ #include "libcgo.h" #include "libcgo_unix.h" -uintptr_t x_cgo_panicmem; +void xx_cgo_panicmem(void); +uintptr_t x_cgo_panicmem = (uintptr_t)xx_cgo_panicmem; static pthread_mutex_t mach_exception_handler_port_set_mu; static mach_port_t mach_exception_handler_port_set = MACH_PORT_NULL; diff --git a/src/runtime/cgo/gcc_signal_darwin_lldb.c b/src/runtime/cgo/gcc_signal_darwin_lldb.c index 12cc388400..54d91f6390 100644 --- a/src/runtime/cgo/gcc_signal_darwin_lldb.c +++ b/src/runtime/cgo/gcc_signal_darwin_lldb.c @@ -8,7 +8,5 @@ #include <stdint.h> -uintptr_t x_cgo_panicmem; - void darwin_arm_init_thread_exception_port() {} void darwin_arm_init_mach_exception_handler() {} diff --git a/src/runtime/cgo/gcc_util.c b/src/runtime/cgo/gcc_util.c index 2d5382a8f0..3fcb48cc8d 100644 --- a/src/runtime/cgo/gcc_util.c +++ b/src/runtime/cgo/gcc_util.c @@ -29,6 +29,10 @@ void(* const _cgo_yield)() = NULL; #include <string.h> +char x_cgo_yield_strncpy_src = 0; +char x_cgo_yield_strncpy_dst = 0; +size_t x_cgo_yield_strncpy_n = 0; + /* Stub for allowing libc interceptors to execute. @@ -50,9 +54,14 @@ x_cgo_yield() So we choose strncpy(_, _, 0): it requires an extra header, but it's standard and should be very efficient. + + GCC 7 has an unfortunate habit of optimizing out strncpy calls (see + https://golang.org/issue/21196), so the arguments here need to be global + variables with external linkage in order to ensure that the call traps all the + way down into libc. */ - char nothing = 0; - strncpy(¬hing, ¬hing, 0); + strncpy(&x_cgo_yield_strncpy_dst, &x_cgo_yield_strncpy_src, + x_cgo_yield_strncpy_n); } void(* const _cgo_yield)() = &x_cgo_yield; diff --git a/src/runtime/cgo/signal_darwin_arm.s b/src/runtime/cgo/signal_darwin_arm.s index ee5c3d3476..f886e4bc06 100644 --- a/src/runtime/cgo/signal_darwin_arm.s +++ b/src/runtime/cgo/signal_darwin_arm.s @@ -4,13 +4,13 @@ #include "textflag.h" -// panicmem is the entrypoint for SIGSEGV as intercepted via a +// xx_cgo_panicmem is the entrypoint for SIGSEGV as intercepted via a // mach thread port as EXC_BAD_ACCESS. As the segfault may have happened -// in C code, we first need to load_g then call panicmem. +// in C code, we first need to load_g then call xx_cgo_panicmem. 
// // R1 - LR at moment of fault // R2 - PC at moment of fault -TEXT ·panicmem(SB),NOSPLIT,$-4 +TEXT xx_cgo_panicmem(SB),NOSPLIT,$-4 // If in external C code, we need to load the g register. BL runtime·load_g(SB) CMP $0, g diff --git a/src/runtime/cgo/signal_darwin_arm64.s b/src/runtime/cgo/signal_darwin_arm64.s index 75aefd4b95..17781cf496 100644 --- a/src/runtime/cgo/signal_darwin_arm64.s +++ b/src/runtime/cgo/signal_darwin_arm64.s @@ -4,13 +4,13 @@ #include "textflag.h" -// panicmem is the entrypoint for SIGSEGV as intercepted via a +// xx_cgo_panicmem is the entrypoint for SIGSEGV as intercepted via a // mach thread port as EXC_BAD_ACCESS. As the segfault may have happened -// in C code, we first need to load_g then call panicmem. +// in C code, we first need to load_g then call xx_cgo_panicmem. // // R1 - LR at moment of fault // R2 - PC at moment of fault -TEXT ·panicmem(SB),NOSPLIT,$-8 +TEXT xx_cgo_panicmem(SB),NOSPLIT,$-8 // If in external C code, we need to load the g register. BL runtime·load_g(SB) CMP $0, g diff --git a/src/runtime/cgo/signal_darwin_armx.go b/src/runtime/cgo/signal_darwin_armx.go index 9f6741eb08..9f4b462415 100644 --- a/src/runtime/cgo/signal_darwin_armx.go +++ b/src/runtime/cgo/signal_darwin_armx.go @@ -7,29 +7,7 @@ package cgo -import "unsafe" +import _ "unsafe" -//go:cgo_import_static x_cgo_panicmem -//go:linkname x_cgo_panicmem x_cgo_panicmem -var x_cgo_panicmem uintptr - -// use a pointer to avoid relocation of external symbol in __TEXT -// make linker happy -var _cgo_panicmem = &x_cgo_panicmem - -// TODO(crawshaw): move this into x_cgo_init, it will not run until -// runtime has finished loading, which may be after its use. -func init() { - *_cgo_panicmem = funcPC(panicmem) -} - -func funcPC(f interface{}) uintptr { - var ptrSize = unsafe.Sizeof(uintptr(0)) - return **(**uintptr)(add(unsafe.Pointer(&f), ptrSize)) -} - -func add(p unsafe.Pointer, x uintptr) unsafe.Pointer { - return unsafe.Pointer(uintptr(p) + x) -} - -func panicmem() +//go:cgo_export_static xx_cgo_panicmem xx_cgo_panicmem +func xx_cgo_panicmem() diff --git a/src/runtime/cgo_mmap.go b/src/runtime/cgo_mmap.go index aa531b9020..b7c70c6fff 100644 --- a/src/runtime/cgo_mmap.go +++ b/src/runtime/cgo_mmap.go @@ -20,19 +20,21 @@ var _cgo_mmap unsafe.Pointer //go:linkname _cgo_munmap _cgo_munmap var _cgo_munmap unsafe.Pointer -func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) unsafe.Pointer { +func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) (unsafe.Pointer, int) { if _cgo_mmap != nil { // Make ret a uintptr so that writing to it in the // function literal does not trigger a write barrier. // A write barrier here could break because of the way // that mmap uses the same value both as a pointer and // an errno value. - // TODO: Fix mmap to return two values. var ret uintptr systemstack(func() { ret = callCgoMmap(addr, n, prot, flags, fd, off) }) - return unsafe.Pointer(ret) + if ret < 4096 { + return nil, int(ret) + } + return unsafe.Pointer(ret), 0 } return sysMmap(addr, n, prot, flags, fd, off) } @@ -46,7 +48,7 @@ func munmap(addr unsafe.Pointer, n uintptr) { } // sysMmap calls the mmap system call. It is implemented in assembly. -func sysMmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) unsafe.Pointer +func sysMmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) (p unsafe.Pointer, err int) // callCgoMmap calls the mmap function in the runtime/cgo package // using the GCC calling convention. 
It is implemented in assembly. diff --git a/src/runtime/cgocall.go b/src/runtime/cgocall.go index 755269ebd2..02c4cb3622 100644 --- a/src/runtime/cgocall.go +++ b/src/runtime/cgocall.go @@ -8,9 +8,9 @@ // runtime.cgocall(_cgo_Cfunc_f, frame), where _cgo_Cfunc_f is a // gcc-compiled function written by cgo. // -// runtime.cgocall (below) locks g to m, calls entersyscall -// so as not to block other goroutines or the garbage collector, -// and then calls runtime.asmcgocall(_cgo_Cfunc_f, frame). +// runtime.cgocall (below) calls entersyscall so as not to block +// other goroutines or the garbage collector, and then calls +// runtime.asmcgocall(_cgo_Cfunc_f, frame). // // runtime.asmcgocall (in asm_$GOARCH.s) switches to the m->g0 stack // (assumed to be an operating system-allocated stack, so safe to run @@ -104,13 +104,9 @@ func cgocall(fn, arg unsafe.Pointer) int32 { racereleasemerge(unsafe.Pointer(&racecgosync)) } - // Lock g to m to ensure we stay on the same stack if we do a - // cgo callback. In case of panic, unwindm calls endcgo. - lockOSThread() mp := getg().m mp.ncgocall++ mp.ncgo++ - mp.incgo = true // Reset traceback. mp.cgoCallers[0] = 0 @@ -130,7 +126,14 @@ func cgocall(fn, arg unsafe.Pointer) int32 { // and then re-enter the "system call" reusing the PC and SP // saved by entersyscall here. entersyscall(0) + + mp.incgo = true errno := asmcgocall(fn, arg) + + // Call endcgo before exitsyscall because exitsyscall may + // reschedule us on to a different M. + endcgo(mp) + exitsyscall(0) // From the garbage collector's perspective, time can move @@ -145,8 +148,8 @@ func cgocall(fn, arg unsafe.Pointer) int32 { // GC by forcing them to stay live across this time warp. KeepAlive(fn) KeepAlive(arg) + KeepAlive(mp) - endcgo(mp) return errno } @@ -158,8 +161,6 @@ func endcgo(mp *m) { if raceenabled { raceacquire(unsafe.Pointer(&racecgosync)) } - - unlockOSThread() // invalidates mp } // Call from C back to Go. @@ -171,6 +172,12 @@ func cgocallbackg(ctxt uintptr) { exit(2) } + // The call from C is on gp.m's g0 stack, so we must ensure + // that we stay on that M. We have to do this before calling + // exitsyscall, since it would otherwise be free to move us to + // a different M. The call to unlockOSThread is in unwindm. + lockOSThread() + // Save current syscall parameters, so m.syscall can be // used again if callback decide to make syscall. syscall := gp.m.syscall @@ -186,6 +193,10 @@ func cgocallbackg(ctxt uintptr) { cgocallbackg1(ctxt) + // At this point unlockOSThread has been called. + // The following code must not change to a different m. + // This is enforced by checking incgo in the schedule function. + gp.m.incgo = true // going back to cgo call reentersyscall(savedpc, uintptr(savedsp)) @@ -321,32 +332,35 @@ func cgocallbackg1(ctxt uintptr) { } func unwindm(restore *bool) { - if !*restore { - return - } - // Restore sp saved by cgocallback during - // unwind of g's stack (see comment at top of file). - mp := acquirem() - sched := &mp.g0.sched - switch GOARCH { - default: - throw("unwindm not implemented") - case "386", "amd64", "arm", "ppc64", "ppc64le", "mips64", "mips64le", "s390x", "mips", "mipsle": - sched.sp = *(*uintptr)(unsafe.Pointer(sched.sp + sys.MinFrameSize)) - case "arm64": - sched.sp = *(*uintptr)(unsafe.Pointer(sched.sp + 16)) - } + if *restore { + // Restore sp saved by cgocallback during + // unwind of g's stack (see comment at top of file). 
+ mp := acquirem() + sched := &mp.g0.sched + switch GOARCH { + default: + throw("unwindm not implemented") + case "386", "amd64", "arm", "ppc64", "ppc64le", "mips64", "mips64le", "s390x", "mips", "mipsle": + sched.sp = *(*uintptr)(unsafe.Pointer(sched.sp + sys.MinFrameSize)) + case "arm64": + sched.sp = *(*uintptr)(unsafe.Pointer(sched.sp + 16)) + } - // Call endcgo to do the accounting that cgocall will not have a - // chance to do during an unwind. - // - // In the case where a a Go call originates from C, ncgo is 0 - // and there is no matching cgocall to end. - if mp.ncgo > 0 { - endcgo(mp) + // Call endcgo to do the accounting that cgocall will not have a + // chance to do during an unwind. + // + // In the case where a Go call originates from C, ncgo is 0 + // and there is no matching cgocall to end. + if mp.ncgo > 0 { + endcgo(mp) + } + + releasem(mp) } - releasem(mp) + // Undo the call to lockOSThread in cgocallbackg. + // We must still stay on the same m. + unlockOSThread() } // called from assembly @@ -580,10 +594,8 @@ func cgoCheckUnknownPointer(p unsafe.Pointer, msg string) (base, i uintptr) { // No more possible pointers. break } - if hbits.isPointer() { - if cgoIsGoPointer(*(*unsafe.Pointer)(unsafe.Pointer(base + i))) { - panic(errorString(msg)) - } + if hbits.isPointer() && cgoIsGoPointer(*(*unsafe.Pointer)(unsafe.Pointer(base + i))) { + panic(errorString(msg)) } hbits = hbits.next() } diff --git a/src/runtime/cgocheck.go b/src/runtime/cgocheck.go index 61aaa0a8f7..ea1ab974c3 100644 --- a/src/runtime/cgocheck.go +++ b/src/runtime/cgocheck.go @@ -16,6 +16,10 @@ const cgoWriteBarrierFail = "Go pointer stored into non-Go memory" // cgoCheckWriteBarrier is called whenever a pointer is stored into memory. // It throws if the program is storing a Go pointer into non-Go memory. +// +// This is called from the write barrier, so its entire call tree must +// be nosplit. +// //go:nosplit //go:nowritebarrier func cgoCheckWriteBarrier(dst *uintptr, src uintptr) { diff --git a/src/runtime/chan.go b/src/runtime/chan.go index 6294678d4a..41ae803574 100644 --- a/src/runtime/chan.go +++ b/src/runtime/chan.go @@ -55,11 +55,19 @@ type waitq struct { } //go:linkname reflect_makechan reflect.makechan -func reflect_makechan(t *chantype, size int64) *hchan { +func reflect_makechan(t *chantype, size int) *hchan { return makechan(t, size) } -func makechan(t *chantype, size int64) *hchan { +func makechan64(t *chantype, size int64) *hchan { + if int64(int(size)) != size { + panic(plainError("makechan: size out of range")) + } + + return makechan(t, int(size)) +} + +func makechan(t *chantype, size int) *hchan { elem := t.elem // compiler checks this but be safe. @@ -69,29 +77,33 @@ func makechan(t *chantype, size int64) *hchan { if hchanSize%maxAlign != 0 || elem.align > maxAlign { throw("makechan: bad alignment") } - if size < 0 || int64(uintptr(size)) != size || (elem.size > 0 && uintptr(size) > (_MaxMem-hchanSize)/elem.size) { + + if size < 0 || uintptr(size) > maxSliceCap(elem.size) || uintptr(size)*elem.size > _MaxMem-hchanSize { panic(plainError("makechan: size out of range")) } + // Hchan does not contain pointers interesting for GC when elements stored in buf do not contain pointers. + // buf points into the same allocation, elemtype is persistent. + // SudoG's are referenced from their owning thread so they can't be collected. + // TODO(dvyukov,rlh): Rethink when collector can move allocated objects. 
var c *hchan - if elem.kind&kindNoPointers != 0 || size == 0 { - // Allocate memory in one call. - // Hchan does not contain pointers interesting for GC in this case: - // buf points into the same allocation, elemtype is persistent. - // SudoG's are referenced from their owning thread so they can't be collected. - // TODO(dvyukov,rlh): Rethink when collector can move allocated objects. + switch { + case size == 0 || elem.size == 0: + // Queue or element size is zero. + c = (*hchan)(mallocgc(hchanSize, nil, true)) + // Race detector uses this location for synchronization. + c.buf = unsafe.Pointer(c) + case elem.kind&kindNoPointers != 0: + // Elements do not contain pointers. + // Allocate hchan and buf in one call. c = (*hchan)(mallocgc(hchanSize+uintptr(size)*elem.size, nil, true)) - if size > 0 && elem.size != 0 { - c.buf = add(unsafe.Pointer(c), hchanSize) - } else { - // race detector uses this location for synchronization - // Also prevents us from pointing beyond the allocation (see issue 9401). - c.buf = unsafe.Pointer(c) - } - } else { + c.buf = add(unsafe.Pointer(c), hchanSize) + default: + // Elements contain pointers. c = new(hchan) - c.buf = newarray(elem, int(size)) + c.buf = mallocgc(uintptr(size)*elem.size, elem, true) } + c.elemsize = uint16(elem.size) c.elemtype = elem c.dataqsiz = uint(size) @@ -110,7 +122,7 @@ func chanbuf(c *hchan, i uint) unsafe.Pointer { // entry point for c <- x from compiled code //go:nosplit func chansend1(c *hchan, elem unsafe.Pointer) { - chansend(c, elem, true, getcallerpc(unsafe.Pointer(&c))) + chansend(c, elem, true, getcallerpc()) } /* @@ -214,7 +226,7 @@ func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool { mysg.elem = ep mysg.waitlink = nil mysg.g = gp - mysg.selectdone = nil + mysg.isSelect = false mysg.c = c gp.waiting = mysg gp.param = nil @@ -322,7 +334,7 @@ func closechan(c *hchan) { } if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&c)) + callerpc := getcallerpc() racewritepc(unsafe.Pointer(c), callerpc, funcPC(closechan)) racerelease(unsafe.Pointer(c)) } @@ -499,7 +511,7 @@ func chanrecv(c *hchan, ep unsafe.Pointer, block bool) (selected, received bool) mysg.waitlink = nil gp.waiting = mysg mysg.g = gp - mysg.selectdone = nil + mysg.isSelect = false mysg.c = c gp.param = nil c.recvq.enqueue(mysg) @@ -594,7 +606,7 @@ func recv(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) { // } // func selectnbsend(c *hchan, elem unsafe.Pointer) (selected bool) { - return chansend(c, elem, false, getcallerpc(unsafe.Pointer(&c))) + return chansend(c, elem, false, getcallerpc()) } // compiler implements @@ -644,7 +656,7 @@ func selectnbrecv2(elem unsafe.Pointer, received *bool, c *hchan) (selected bool //go:linkname reflect_chansend reflect.chansend func reflect_chansend(c *hchan, elem unsafe.Pointer, nb bool) (selected bool) { - return chansend(c, elem, !nb, getcallerpc(unsafe.Pointer(&c))) + return chansend(c, elem, !nb, getcallerpc()) } //go:linkname reflect_chanrecv reflect.chanrecv @@ -703,10 +715,16 @@ func (q *waitq) dequeue() *sudog { sgp.next = nil // mark as removed (see dequeueSudog) } - // if sgp participates in a select and is already signaled, ignore it - if sgp.selectdone != nil { - // claim the right to signal - if *sgp.selectdone != 0 || !atomic.Cas(sgp.selectdone, 0, 1) { + // if a goroutine was put on this queue because of a + // select, there is a small window between the goroutine + // being woken up by a different case and it grabbing the + // channel locks. 
Once it has the lock + // it removes itself from the queue, so we won't see it after that. + // We use a flag in the G struct to tell us when someone + // else has won the race to signal this goroutine but the goroutine + // hasn't removed itself from the queue yet. + if sgp.isSelect { + if !atomic.Cas(&sgp.g.selectDone, 0, 1) { continue } } diff --git a/src/runtime/chan_test.go b/src/runtime/chan_test.go index 0c94cf1a63..b6188f5e87 100644 --- a/src/runtime/chan_test.go +++ b/src/runtime/chan_test.go @@ -5,6 +5,7 @@ package runtime_test import ( + "internal/testenv" "math" "runtime" "sync" @@ -433,6 +434,9 @@ func TestSelectStress(t *testing.T) { func TestSelectFairness(t *testing.T) { const trials = 10000 + if runtime.GOOS == "linux" && runtime.GOARCH == "ppc64le" { + testenv.SkipFlaky(t, 22047) + } c1 := make(chan byte, trials+1) c2 := make(chan byte, trials+1) for i := 0; i < trials+1; i++ { @@ -726,6 +730,55 @@ done: <-ready2 } +type struct0 struct{} + +func BenchmarkMakeChan(b *testing.B) { + b.Run("Byte", func(b *testing.B) { + var x chan byte + for i := 0; i < b.N; i++ { + x = make(chan byte, 8) + } + close(x) + }) + b.Run("Int", func(b *testing.B) { + var x chan int + for i := 0; i < b.N; i++ { + x = make(chan int, 8) + } + close(x) + }) + b.Run("Ptr", func(b *testing.B) { + var x chan *byte + for i := 0; i < b.N; i++ { + x = make(chan *byte, 8) + } + close(x) + }) + b.Run("Struct", func(b *testing.B) { + b.Run("0", func(b *testing.B) { + var x chan struct0 + for i := 0; i < b.N; i++ { + x = make(chan struct0, 8) + } + close(x) + }) + b.Run("32", func(b *testing.B) { + var x chan struct32 + for i := 0; i < b.N; i++ { + x = make(chan struct32, 8) + } + close(x) + }) + b.Run("40", func(b *testing.B) { + var x chan struct40 + for i := 0; i < b.N; i++ { + x = make(chan struct40, 8) + } + close(x) + }) + }) +} + func BenchmarkChanNonblocking(b *testing.B) { myc := make(chan int) b.RunParallel(func(pb *testing.PB) { diff --git a/src/runtime/cputicks.go b/src/runtime/cputicks.go index ccc3947bb2..de97d5b6fa 100644 --- a/src/runtime/cputicks.go +++ b/src/runtime/cputicks.go @@ -11,6 +11,6 @@ package runtime -// careful: cputicks is not guaranteed to be monotonic! In particular, we have +// careful: cputicks is not guaranteed to be monotonic! In particular, we have // noticed drift between cpus on certain os/arch combinations. See issue 8976. 
func cputicks() int64 diff --git a/src/runtime/crash_cgo_test.go b/src/runtime/crash_cgo_test.go index a5cbbad69b..50b634dda4 100644 --- a/src/runtime/crash_cgo_test.go +++ b/src/runtime/crash_cgo_test.go @@ -13,6 +13,7 @@ import ( "os" "os/exec" "runtime" + "strconv" "strings" "testing" "time" @@ -113,7 +114,7 @@ func TestCgoExternalThreadSIGPROF(t *testing.T) { t.Fatal(err) } - got, err := testEnv(exec.Command(exe, "CgoExternalThreadSIGPROF")).CombinedOutput() + got, err := testenv.CleanCmdEnv(exec.Command(exe, "CgoExternalThreadSIGPROF")).CombinedOutput() if err != nil { t.Fatalf("exit status: %v\n%s", err, got) } @@ -136,7 +137,7 @@ func TestCgoExternalThreadSignal(t *testing.T) { t.Fatal(err) } - got, err := testEnv(exec.Command(exe, "CgoExternalThreadSIGPROF")).CombinedOutput() + got, err := testenv.CleanCmdEnv(exec.Command(exe, "CgoExternalThreadSIGPROF")).CombinedOutput() if err != nil { t.Fatalf("exit status: %v\n%s", err, got) } @@ -203,14 +204,14 @@ func TestCgoCheckBytes(t *testing.T) { const tries = 10 var tot1, tot2 time.Duration for i := 0; i < tries; i++ { - cmd := testEnv(exec.Command(exe, "CgoCheckBytes")) + cmd := testenv.CleanCmdEnv(exec.Command(exe, "CgoCheckBytes")) cmd.Env = append(cmd.Env, "GODEBUG=cgocheck=0", fmt.Sprintf("GO_CGOCHECKBYTES_TRY=%d", i)) start := time.Now() cmd.Run() d1 := time.Since(start) - cmd = testEnv(exec.Command(exe, "CgoCheckBytes")) + cmd = testenv.CleanCmdEnv(exec.Command(exe, "CgoCheckBytes")) cmd.Env = append(cmd.Env, fmt.Sprintf("GO_CGOCHECKBYTES_TRY=%d", i)) start = time.Now() @@ -251,7 +252,7 @@ func TestCgoCCodeSIGPROF(t *testing.T) { func TestCgoCrashTraceback(t *testing.T) { t.Parallel() - if runtime.GOOS != "linux" || runtime.GOARCH != "amd64" { + if runtime.GOOS != "linux" || (runtime.GOARCH != "amd64" && runtime.GOARCH != "ppc64le") { t.Skipf("not yet supported on %s/%s", runtime.GOOS, runtime.GOARCH) } got := runTestProg(t, "testprogcgo", "CrashTraceback") @@ -273,7 +274,7 @@ func TestCgoTracebackContext(t *testing.T) { func testCgoPprof(t *testing.T, buildArg, runArg string) { t.Parallel() - if runtime.GOOS != "linux" || runtime.GOARCH != "amd64" { + if runtime.GOOS != "linux" || (runtime.GOARCH != "amd64" && runtime.GOARCH != "ppc64le") { t.Skipf("not yet supported on %s/%s", runtime.GOOS, runtime.GOARCH) } testenv.MustHaveGoRun(t) @@ -283,7 +284,7 @@ func testCgoPprof(t *testing.T, buildArg, runArg string) { t.Fatal(err) } - got, err := testEnv(exec.Command(exe, runArg)).CombinedOutput() + got, err := testenv.CleanCmdEnv(exec.Command(exe, runArg)).CombinedOutput() if err != nil { if testenv.Builder() == "linux-amd64-alpine" { // See Issue 18243 and Issue 19938. @@ -295,7 +296,7 @@ func testCgoPprof(t *testing.T, buildArg, runArg string) { defer os.Remove(fn) for try := 0; try < 2; try++ { - cmd := testEnv(exec.Command(testenv.GoToolPath(t), "tool", "pprof", "-top", "-nodecount=1")) + cmd := testenv.CleanCmdEnv(exec.Command(testenv.GoToolPath(t), "tool", "pprof", "-top", "-nodecount=1")) // Check that pprof works both with and without explicit executable on command line. 
if try == 0 { cmd.Args = append(cmd.Args, exe, fn) @@ -330,7 +331,7 @@ func TestCgoPprof(t *testing.T) { } func TestCgoPprofPIE(t *testing.T) { - testCgoPprof(t, "-ldflags=-extldflags=-pie", "CgoPprof") + testCgoPprof(t, "-buildmode=pie", "CgoPprof") } func TestCgoPprofThread(t *testing.T) { @@ -359,7 +360,7 @@ func TestRaceProf(t *testing.T) { t.Fatal(err) } - got, err := testEnv(exec.Command(exe, "CgoRaceprof")).CombinedOutput() + got, err := testenv.CleanCmdEnv(exec.Command(exe, "CgoRaceprof")).CombinedOutput() if err != nil { t.Fatal(err) } @@ -388,7 +389,7 @@ func TestRaceSignal(t *testing.T) { t.Fatal(err) } - got, err := testEnv(exec.Command(exe, "CgoRaceSignal")).CombinedOutput() + got, err := testenv.CleanCmdEnv(exec.Command(exe, "CgoRaceSignal")).CombinedOutput() if err != nil { t.Logf("%s\n", got) t.Fatal(err) @@ -411,3 +412,72 @@ func TestCgoNumGoroutine(t *testing.T) { t.Errorf("expected %q got %v", want, got) } } + +func TestCatchPanic(t *testing.T) { + t.Parallel() + switch runtime.GOOS { + case "plan9", "windows": + t.Skipf("no signals on %s", runtime.GOOS) + case "darwin": + if runtime.GOARCH == "amd64" { + t.Skipf("crash() on darwin/amd64 doesn't raise SIGABRT") + } + } + + testenv.MustHaveGoRun(t) + + exe, err := buildTestProg(t, "testprogcgo") + if err != nil { + t.Fatal(err) + } + + for _, early := range []bool{true, false} { + cmd := testenv.CleanCmdEnv(exec.Command(exe, "CgoCatchPanic")) + // Make sure a panic results in a crash. + cmd.Env = append(cmd.Env, "GOTRACEBACK=crash") + if early { + // Tell testprogcgo to install an early signal handler for SIGABRT + cmd.Env = append(cmd.Env, "CGOCATCHPANIC_EARLY_HANDLER=1") + } + if out, err := cmd.CombinedOutput(); err != nil { + t.Errorf("testprogcgo CgoCatchPanic failed: %v\n%s", err, out) + } + } +} + +func TestCgoLockOSThreadExit(t *testing.T) { + switch runtime.GOOS { + case "plan9", "windows": + t.Skipf("no pthreads on %s", runtime.GOOS) + } + t.Parallel() + testLockOSThreadExit(t, "testprogcgo") +} + +func TestWindowsStackMemoryCgo(t *testing.T) { + if runtime.GOOS != "windows" { + t.Skip("skipping windows specific test") + } + testenv.SkipFlaky(t, 22575) + o := runTestProg(t, "testprogcgo", "StackMemory") + stackUsage, err := strconv.Atoi(o) + if err != nil { + t.Fatalf("Failed to read stack usage: %v", err) + } + if expected, got := 100<<10, stackUsage; got > expected { + t.Fatalf("expected < %d bytes of memory per thread, got %d", expected, got) + } +} + +func TestSigStackSwapping(t *testing.T) { + switch runtime.GOOS { + case "plan9", "windows": + t.Skip("no sigaltstack on %s", runtime.GOOS) + } + t.Parallel() + got := runTestProg(t, "testprogcgo", "SigStack") + want := "OK\n" + if got != want { + t.Errorf("expected %q got %v", want, got) + } +} diff --git a/src/runtime/crash_test.go b/src/runtime/crash_test.go index 7753809d45..9588ddd4de 100644 --- a/src/runtime/crash_test.go +++ b/src/runtime/crash_test.go @@ -32,25 +32,6 @@ func TestMain(m *testing.M) { os.Exit(status) } -func testEnv(cmd *exec.Cmd) *exec.Cmd { - if cmd.Env != nil { - panic("environment already set") - } - for _, env := range os.Environ() { - // Exclude GODEBUG from the environment to prevent its output - // from breaking tests that are trying to parse other command output. - if strings.HasPrefix(env, "GODEBUG=") { - continue - } - // Exclude GOTRACEBACK for the same reason. 
- if strings.HasPrefix(env, "GOTRACEBACK=") { - continue - } - cmd.Env = append(cmd.Env, env) - } - return cmd -} - var testprog struct { sync.Mutex dir string @@ -62,7 +43,11 @@ type buildexe struct { err error } -func runTestProg(t *testing.T, binary, name string) string { +func runTestProg(t *testing.T, binary, name string, env ...string) string { + if *flagQuick { + t.Skip("-quick") + } + testenv.MustHaveGoBuild(t) exe, err := buildTestProg(t, binary) @@ -70,7 +55,11 @@ func runTestProg(t *testing.T, binary, name string) string { t.Fatal(err) } - cmd := testEnv(exec.Command(exe, name)) + cmd := testenv.CleanCmdEnv(exec.Command(exe, name)) + cmd.Env = append(cmd.Env, env...) + if testing.Short() { + cmd.Env = append(cmd.Env, "RUNTIME_TEST_SHORT=1") + } var b bytes.Buffer cmd.Stdout = &b cmd.Stderr = &b @@ -111,6 +100,10 @@ func runTestProg(t *testing.T, binary, name string) string { } func buildTestProg(t *testing.T, binary string, flags ...string) (string, error) { + if *flagQuick { + t.Skip("-quick") + } + checkStaleRuntime(t) testprog.Lock() @@ -139,7 +132,7 @@ func buildTestProg(t *testing.T, binary string, flags ...string) (string, error) exe := filepath.Join(testprog.dir, name+".exe") cmd := exec.Command(testenv.GoToolPath(t), append([]string{"build", "-o", exe}, flags...)...) cmd.Dir = "testdata/" + binary - out, err := testEnv(cmd).CombinedOutput() + out, err := testenv.CleanCmdEnv(cmd).CombinedOutput() if err != nil { target.err = fmt.Errorf("building %s %v: %v\n%s", binary, flags, err, out) testprog.target[name] = target @@ -158,14 +151,14 @@ var ( func checkStaleRuntime(t *testing.T) { staleRuntimeOnce.Do(func() { // 'go run' uses the installed copy of runtime.a, which may be out of date. - out, err := testEnv(exec.Command(testenv.GoToolPath(t), "list", "-f", "{{.Stale}}", "runtime")).CombinedOutput() + out, err := testenv.CleanCmdEnv(exec.Command(testenv.GoToolPath(t), "list", "-gcflags=all="+os.Getenv("GO_GCFLAGS"), "-f", "{{.Stale}}", "runtime")).CombinedOutput() if err != nil { staleRuntimeErr = fmt.Errorf("failed to execute 'go list': %v\n%v", err, string(out)) return } if string(out) != "false\n" { t.Logf("go list -f {{.Stale}} runtime:\n%s", out) - out, err := testEnv(exec.Command(testenv.GoToolPath(t), "list", "-f", "{{.StaleReason}}", "runtime")).CombinedOutput() + out, err := testenv.CleanCmdEnv(exec.Command(testenv.GoToolPath(t), "list", "-gcflags=all="+os.Getenv("GO_GCFLAGS"), "-f", "{{.StaleReason}}", "runtime")).CombinedOutput() if err != nil { t.Logf("go list -f {{.StaleReason}} failed: %v", err) } @@ -468,7 +461,7 @@ func TestMemPprof(t *testing.T) { t.Fatal(err) } - got, err := testEnv(exec.Command(exe, "MemProf")).CombinedOutput() + got, err := testenv.CleanCmdEnv(exec.Command(exe, "MemProf")).CombinedOutput() if err != nil { t.Fatal(err) } @@ -476,7 +469,7 @@ func TestMemPprof(t *testing.T) { defer os.Remove(fn) for try := 0; try < 2; try++ { - cmd := testEnv(exec.Command(testenv.GoToolPath(t), "tool", "pprof", "-alloc_space", "-top")) + cmd := testenv.CleanCmdEnv(exec.Command(testenv.GoToolPath(t), "tool", "pprof", "-alloc_space", "-top")) // Check that pprof works both with and without explicit executable on command line. 
if try == 0 { cmd.Args = append(cmd.Args, exe, fn) @@ -586,7 +579,7 @@ func TestPanicRace(t *testing.T) { const tries = 10 retry: for i := 0; i < tries; i++ { - got, err := testEnv(exec.Command(exe, "PanicRace")).CombinedOutput() + got, err := testenv.CleanCmdEnv(exec.Command(exe, "PanicRace")).CombinedOutput() if err == nil { t.Logf("try %d: program exited successfully, should have failed", i+1) continue diff --git a/src/runtime/crash_unix_test.go b/src/runtime/crash_unix_test.go index cbaa1f65fe..af9e6430da 100644 --- a/src/runtime/crash_unix_test.go +++ b/src/runtime/crash_unix_test.go @@ -65,13 +65,13 @@ func TestCrashDumpsAllThreads(t *testing.T) { cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", "a.exe") cmd.Dir = dir - out, err := testEnv(cmd).CombinedOutput() + out, err := testenv.CleanCmdEnv(cmd).CombinedOutput() if err != nil { t.Fatalf("building source: %v\n%s", err, out) } cmd = exec.Command(filepath.Join(dir, "a.exe")) - cmd = testEnv(cmd) + cmd = testenv.CleanCmdEnv(cmd) cmd.Env = append(cmd.Env, "GOTRACEBACK=crash") // Set GOGC=off. Because of golang.org/issue/10958, the tight @@ -184,7 +184,7 @@ func TestPanicSystemstack(t *testing.T) { t.Parallel() cmd := exec.Command(os.Args[0], "testPanicSystemstackInternal") - cmd = testEnv(cmd) + cmd = testenv.CleanCmdEnv(cmd) cmd.Env = append(cmd.Env, "GOTRACEBACK=crash") pr, pw, err := os.Pipe() if err != nil { @@ -249,7 +249,7 @@ func TestSignalExitStatus(t *testing.T) { if err != nil { t.Fatal(err) } - err = testEnv(exec.Command(exe, "SignalExitStatus")).Run() + err = testenv.CleanCmdEnv(exec.Command(exe, "SignalExitStatus")).Run() if err == nil { t.Error("test program succeeded unexpectedly") } else if ee, ok := err.(*exec.ExitError); !ok { diff --git a/src/runtime/debug.go b/src/runtime/debug.go index 0e798fc6f5..feacfb6026 100644 --- a/src/runtime/debug.go +++ b/src/runtime/debug.go @@ -15,9 +15,6 @@ import ( // The number of logical CPUs on the local machine can be queried with NumCPU. // This call will go away when the scheduler improves. 
func GOMAXPROCS(n int) int { - if n > _MaxGomaxprocs { - n = _MaxGomaxprocs - } lock(&sched.lock) ret := int(gomaxprocs) unlock(&sched.lock) diff --git a/src/runtime/defs1_netbsd_386.go b/src/runtime/defs1_netbsd_386.go index 66f07ce5a5..c26f417a02 100644 --- a/src/runtime/defs1_netbsd_386.go +++ b/src/runtime/defs1_netbsd_386.go @@ -79,6 +79,7 @@ const ( _EV_CLEAR = 0x20 _EV_RECEIPT = 0 _EV_ERROR = 0x4000 + _EV_EOF = 0x8000 _EVFILT_READ = 0x0 _EVFILT_WRITE = 0x1 ) diff --git a/src/runtime/defs1_netbsd_amd64.go b/src/runtime/defs1_netbsd_amd64.go index 9e314718f3..0704cd4fb3 100644 --- a/src/runtime/defs1_netbsd_amd64.go +++ b/src/runtime/defs1_netbsd_amd64.go @@ -79,6 +79,7 @@ const ( _EV_CLEAR = 0x20 _EV_RECEIPT = 0 _EV_ERROR = 0x4000 + _EV_EOF = 0x8000 _EVFILT_READ = 0x0 _EVFILT_WRITE = 0x1 ) diff --git a/src/runtime/defs1_netbsd_arm.go b/src/runtime/defs1_netbsd_arm.go index db8e4c63fc..d2a13ad4b0 100644 --- a/src/runtime/defs1_netbsd_arm.go +++ b/src/runtime/defs1_netbsd_arm.go @@ -79,6 +79,7 @@ const ( _EV_CLEAR = 0x20 _EV_RECEIPT = 0 _EV_ERROR = 0x4000 + _EV_EOF = 0x8000 _EVFILT_READ = 0x0 _EVFILT_WRITE = 0x1 ) diff --git a/src/runtime/defs_darwin.go b/src/runtime/defs_darwin.go index 78df4e7ac8..f7d65e700d 100644 --- a/src/runtime/defs_darwin.go +++ b/src/runtime/defs_darwin.go @@ -139,6 +139,7 @@ const ( EV_CLEAR = C.EV_CLEAR EV_RECEIPT = C.EV_RECEIPT EV_ERROR = C.EV_ERROR + EV_EOF = C.EV_EOF EVFILT_READ = C.EVFILT_READ EVFILT_WRITE = C.EVFILT_WRITE ) diff --git a/src/runtime/defs_darwin_386.go b/src/runtime/defs_darwin_386.go index 1a5967b24b..f6dbcc519c 100644 --- a/src/runtime/defs_darwin_386.go +++ b/src/runtime/defs_darwin_386.go @@ -118,6 +118,7 @@ const ( _EV_CLEAR = 0x20 _EV_RECEIPT = 0x40 _EV_ERROR = 0x4000 + _EV_EOF = 0x8000 _EVFILT_READ = -0x1 _EVFILT_WRITE = -0x2 ) diff --git a/src/runtime/defs_darwin_amd64.go b/src/runtime/defs_darwin_amd64.go index a4ab090d51..245fe158c7 100644 --- a/src/runtime/defs_darwin_amd64.go +++ b/src/runtime/defs_darwin_amd64.go @@ -118,6 +118,7 @@ const ( _EV_CLEAR = 0x20 _EV_RECEIPT = 0x40 _EV_ERROR = 0x4000 + _EV_EOF = 0x8000 _EVFILT_READ = -0x1 _EVFILT_WRITE = -0x2 ) diff --git a/src/runtime/defs_darwin_arm.go b/src/runtime/defs_darwin_arm.go index 3f8dbbf254..f89aee6775 100644 --- a/src/runtime/defs_darwin_arm.go +++ b/src/runtime/defs_darwin_arm.go @@ -120,6 +120,7 @@ const ( _EV_CLEAR = 0x20 _EV_RECEIPT = 0x40 _EV_ERROR = 0x4000 + _EV_EOF = 0x8000 _EVFILT_READ = -0x1 _EVFILT_WRITE = -0x2 ) diff --git a/src/runtime/defs_darwin_arm64.go b/src/runtime/defs_darwin_arm64.go index c25a41b749..a0ca7f1703 100644 --- a/src/runtime/defs_darwin_arm64.go +++ b/src/runtime/defs_darwin_arm64.go @@ -118,6 +118,7 @@ const ( _EV_CLEAR = 0x20 _EV_RECEIPT = 0x40 _EV_ERROR = 0x4000 + _EV_EOF = 0x8000 _EVFILT_READ = -0x1 _EVFILT_WRITE = -0x2 ) diff --git a/src/runtime/defs_dragonfly.go b/src/runtime/defs_dragonfly.go index ed00be0f44..95014fe6e7 100644 --- a/src/runtime/defs_dragonfly.go +++ b/src/runtime/defs_dragonfly.go @@ -103,6 +103,7 @@ const ( EV_DELETE = C.EV_DELETE EV_CLEAR = C.EV_CLEAR EV_ERROR = C.EV_ERROR + EV_EOF = C.EV_EOF EVFILT_READ = C.EVFILT_READ EVFILT_WRITE = C.EVFILT_WRITE ) diff --git a/src/runtime/defs_dragonfly_amd64.go b/src/runtime/defs_dragonfly_amd64.go index fc70103286..c30da805cc 100644 --- a/src/runtime/defs_dragonfly_amd64.go +++ b/src/runtime/defs_dragonfly_amd64.go @@ -82,6 +82,7 @@ const ( _EV_DELETE = 0x2 _EV_CLEAR = 0x20 _EV_ERROR = 0x4000 + _EV_EOF = 0x8000 _EVFILT_READ = -0x1 _EVFILT_WRITE = -0x2 ) diff --git 
a/src/runtime/defs_freebsd.go b/src/runtime/defs_freebsd.go index 0a11d09db2..9d55111786 100644 --- a/src/runtime/defs_freebsd.go +++ b/src/runtime/defs_freebsd.go @@ -125,6 +125,7 @@ const ( EV_CLEAR = C.EV_CLEAR EV_RECEIPT = C.EV_RECEIPT EV_ERROR = C.EV_ERROR + EV_EOF = C.EV_EOF EVFILT_READ = C.EVFILT_READ EVFILT_WRITE = C.EVFILT_WRITE ) diff --git a/src/runtime/defs_freebsd_386.go b/src/runtime/defs_freebsd_386.go index 92b05503a3..49bcbb12a2 100644 --- a/src/runtime/defs_freebsd_386.go +++ b/src/runtime/defs_freebsd_386.go @@ -95,6 +95,7 @@ const ( _EV_CLEAR = 0x20 _EV_RECEIPT = 0x40 _EV_ERROR = 0x4000 + _EV_EOF = 0x8000 _EVFILT_READ = -0x1 _EVFILT_WRITE = -0x2 ) diff --git a/src/runtime/defs_freebsd_amd64.go b/src/runtime/defs_freebsd_amd64.go index 645e2053f2..0e1c6752d6 100644 --- a/src/runtime/defs_freebsd_amd64.go +++ b/src/runtime/defs_freebsd_amd64.go @@ -95,6 +95,7 @@ const ( _EV_CLEAR = 0x20 _EV_RECEIPT = 0x40 _EV_ERROR = 0x4000 + _EV_EOF = 0x8000 _EVFILT_READ = -0x1 _EVFILT_WRITE = -0x2 ) diff --git a/src/runtime/defs_freebsd_arm.go b/src/runtime/defs_freebsd_arm.go index c8a198fb4a..71684fe9f8 100644 --- a/src/runtime/defs_freebsd_arm.go +++ b/src/runtime/defs_freebsd_arm.go @@ -95,6 +95,7 @@ const ( _EV_CLEAR = 0x20 _EV_RECEIPT = 0x40 _EV_ERROR = 0x4000 + _EV_EOF = 0x8000 _EVFILT_READ = -0x1 _EVFILT_WRITE = -0x2 ) diff --git a/src/runtime/defs_netbsd.go b/src/runtime/defs_netbsd.go index 56db1f0159..41aa07af98 100644 --- a/src/runtime/defs_netbsd.go +++ b/src/runtime/defs_netbsd.go @@ -105,6 +105,7 @@ const ( EV_CLEAR = C.EV_CLEAR EV_RECEIPT = 0 EV_ERROR = C.EV_ERROR + EV_EOF = C.EV_EOF EVFILT_READ = C.EVFILT_READ EVFILT_WRITE = C.EVFILT_WRITE ) diff --git a/src/runtime/defs_openbsd.go b/src/runtime/defs_openbsd.go index 7e721504e6..9ff13dfcbf 100644 --- a/src/runtime/defs_openbsd.go +++ b/src/runtime/defs_openbsd.go @@ -100,6 +100,7 @@ const ( EV_DELETE = C.EV_DELETE EV_CLEAR = C.EV_CLEAR EV_ERROR = C.EV_ERROR + EV_EOF = C.EV_EOF EVFILT_READ = C.EVFILT_READ EVFILT_WRITE = C.EVFILT_WRITE ) diff --git a/src/runtime/defs_openbsd_386.go b/src/runtime/defs_openbsd_386.go index ce08111dea..1185530964 100644 --- a/src/runtime/defs_openbsd_386.go +++ b/src/runtime/defs_openbsd_386.go @@ -80,6 +80,7 @@ const ( _EV_DELETE = 0x2 _EV_CLEAR = 0x20 _EV_ERROR = 0x4000 + _EV_EOF = 0x8000 _EVFILT_READ = -0x1 _EVFILT_WRITE = -0x2 ) diff --git a/src/runtime/defs_openbsd_amd64.go b/src/runtime/defs_openbsd_amd64.go index ea0709809a..4bb8eac08f 100644 --- a/src/runtime/defs_openbsd_amd64.go +++ b/src/runtime/defs_openbsd_amd64.go @@ -80,6 +80,7 @@ const ( _EV_DELETE = 0x2 _EV_CLEAR = 0x20 _EV_ERROR = 0x4000 + _EV_EOF = 0x8000 _EVFILT_READ = -0x1 _EVFILT_WRITE = -0x2 ) diff --git a/src/runtime/defs_openbsd_arm.go b/src/runtime/defs_openbsd_arm.go index b0fb639c72..38b77c92d0 100644 --- a/src/runtime/defs_openbsd_arm.go +++ b/src/runtime/defs_openbsd_arm.go @@ -80,6 +80,7 @@ const ( _EV_DELETE = 0x2 _EV_CLEAR = 0x20 _EV_ERROR = 0x4000 + _EV_EOF = 0x8000 _EVFILT_READ = -0x1 _EVFILT_WRITE = -0x2 ) diff --git a/src/runtime/duff_amd64.s b/src/runtime/duff_amd64.s index a1112a4b59..44dc75d297 100644 --- a/src/runtime/duff_amd64.s +++ b/src/runtime/duff_amd64.s @@ -9,97 +9,97 @@ TEXT runtime·duffzero(SB), NOSPLIT, $0-0 MOVUPS X0,16(DI) MOVUPS X0,32(DI) MOVUPS X0,48(DI) - ADDQ $64,DI + LEAQ 64(DI),DI MOVUPS X0,(DI) MOVUPS X0,16(DI) MOVUPS X0,32(DI) MOVUPS X0,48(DI) - ADDQ $64,DI + LEAQ 64(DI),DI MOVUPS X0,(DI) MOVUPS X0,16(DI) MOVUPS X0,32(DI) MOVUPS X0,48(DI) - ADDQ $64,DI + LEAQ 64(DI),DI MOVUPS 
X0,(DI) MOVUPS X0,16(DI) MOVUPS X0,32(DI) MOVUPS X0,48(DI) - ADDQ $64,DI + LEAQ 64(DI),DI MOVUPS X0,(DI) MOVUPS X0,16(DI) MOVUPS X0,32(DI) MOVUPS X0,48(DI) - ADDQ $64,DI + LEAQ 64(DI),DI MOVUPS X0,(DI) MOVUPS X0,16(DI) MOVUPS X0,32(DI) MOVUPS X0,48(DI) - ADDQ $64,DI + LEAQ 64(DI),DI MOVUPS X0,(DI) MOVUPS X0,16(DI) MOVUPS X0,32(DI) MOVUPS X0,48(DI) - ADDQ $64,DI + LEAQ 64(DI),DI MOVUPS X0,(DI) MOVUPS X0,16(DI) MOVUPS X0,32(DI) MOVUPS X0,48(DI) - ADDQ $64,DI + LEAQ 64(DI),DI MOVUPS X0,(DI) MOVUPS X0,16(DI) MOVUPS X0,32(DI) MOVUPS X0,48(DI) - ADDQ $64,DI + LEAQ 64(DI),DI MOVUPS X0,(DI) MOVUPS X0,16(DI) MOVUPS X0,32(DI) MOVUPS X0,48(DI) - ADDQ $64,DI + LEAQ 64(DI),DI MOVUPS X0,(DI) MOVUPS X0,16(DI) MOVUPS X0,32(DI) MOVUPS X0,48(DI) - ADDQ $64,DI + LEAQ 64(DI),DI MOVUPS X0,(DI) MOVUPS X0,16(DI) MOVUPS X0,32(DI) MOVUPS X0,48(DI) - ADDQ $64,DI + LEAQ 64(DI),DI MOVUPS X0,(DI) MOVUPS X0,16(DI) MOVUPS X0,32(DI) MOVUPS X0,48(DI) - ADDQ $64,DI + LEAQ 64(DI),DI MOVUPS X0,(DI) MOVUPS X0,16(DI) MOVUPS X0,32(DI) MOVUPS X0,48(DI) - ADDQ $64,DI + LEAQ 64(DI),DI MOVUPS X0,(DI) MOVUPS X0,16(DI) MOVUPS X0,32(DI) MOVUPS X0,48(DI) - ADDQ $64,DI + LEAQ 64(DI),DI MOVUPS X0,(DI) MOVUPS X0,16(DI) MOVUPS X0,32(DI) MOVUPS X0,48(DI) - ADDQ $64,DI + LEAQ 64(DI),DI RET diff --git a/src/runtime/duff_arm64.s b/src/runtime/duff_arm64.s index 60a0e26cd3..21619ff910 100644 --- a/src/runtime/duff_arm64.s +++ b/src/runtime/duff_arm64.s @@ -5,134 +5,70 @@ #include "textflag.h" TEXT runtime·duffzero(SB), NOSPLIT, $-8-0 - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W 
ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) - MOVD.W ZR, 8(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP.P (ZR, ZR), 16(R16) + STP (ZR, ZR), (R16) RET TEXT runtime·duffcopy(SB), NOSPLIT, $0-0 diff --git a/src/runtime/error.go b/src/runtime/error.go index eafcc9b173..16f3e53a47 100644 --- a/src/runtime/error.go +++ b/src/runtime/error.go @@ -126,34 +126,31 @@ func printany(i interface{}) { //go:linkname stringsIndexByte strings.IndexByte func stringsIndexByte(s string, c byte) int -// called from generated code +// panicwrap generates a panic for a call to a wrapped value method +// with a nil pointer receiver. +// +// It is called from the generated wrapper code. func panicwrap() { - pc := make([]uintptr, 1) - n := Callers(2, pc) - if n == 0 { - throw("panicwrap: Callers failed") - } - frames := CallersFrames(pc) - frame, _ := frames.Next() - name := frame.Function + pc := getcallerpc() + name := funcname(findfunc(pc)) // name is something like "main.(*T).F". // We want to extract pkg ("main"), typ ("T"), and meth ("F"). // Do it by finding the parens. 
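The rewritten panicwrap below recovers the wrapper's name from the caller PC and splits it at the parentheses rather than walking the stack with CallersFrames. As a standalone sketch of that splitting, with a purely illustrative name and using the exported strings.IndexByte that the runtime linknames as stringsIndexByte:

package main

import (
	"fmt"
	"strings"
)

func main() {
	name := "main.(*T).F" // hypothetical wrapper name
	i := strings.IndexByte(name, '(')
	pkg := name[:i-1] // "main"
	name = name[i+2:] // "T).F"
	i = strings.IndexByte(name, ')')
	typ, meth := name[:i], name[i+2:] // "T", "F"
	fmt.Println(pkg, typ, meth)       // main T F
}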
i := stringsIndexByte(name, '(') if i < 0 { - throw("panicwrap: no ( in " + frame.Function) + throw("panicwrap: no ( in " + name) } pkg := name[:i-1] if i+2 >= len(name) || name[i-1:i+2] != ".(*" { - throw("panicwrap: unexpected string after package name: " + frame.Function) + throw("panicwrap: unexpected string after package name: " + name) } name = name[i+2:] i = stringsIndexByte(name, ')') if i < 0 { - throw("panicwrap: no ) in " + frame.Function) + throw("panicwrap: no ) in " + name) } if i+2 >= len(name) || name[i:i+2] != ")." { - throw("panicwrap: unexpected string after type name: " + frame.Function) + throw("panicwrap: unexpected string after type name: " + name) } typ := name[:i] meth := name[i+2:] diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index c929bd4618..385c569ed8 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -152,12 +152,19 @@ func RunSchedLocalQueueEmptyTest(iters int) { } } -var StringHash = stringHash -var BytesHash = bytesHash -var Int32Hash = int32Hash -var Int64Hash = int64Hash -var EfaceHash = efaceHash -var IfaceHash = ifaceHash +var ( + StringHash = stringHash + BytesHash = bytesHash + Int32Hash = int32Hash + Int64Hash = int64Hash + MemHash = memhash + MemHash32 = memhash32 + MemHash64 = memhash64 + EfaceHash = efaceHash + IfaceHash = ifaceHash +) + +var UseAeshash = &useAeshash func MemclrBytes(b []byte) { s := (*slice)(unsafe.Pointer(&b)) @@ -369,3 +376,40 @@ func (rw *RWMutex) Lock() { func (rw *RWMutex) Unlock() { rw.rw.unlock() } + +func MapBucketsCount(m map[int]int) int { + h := *(**hmap)(unsafe.Pointer(&m)) + return 1 << h.B +} + +func MapBucketsPointerIsNil(m map[int]int) bool { + h := *(**hmap)(unsafe.Pointer(&m)) + return h.buckets == nil +} + +func LockOSCounts() (external, internal uint32) { + g := getg() + if g.m.lockedExt+g.m.lockedInt == 0 { + if g.lockedm != 0 { + panic("lockedm on non-locked goroutine") + } + } else { + if g.lockedm == 0 { + panic("nil lockedm on locked goroutine") + } + } + return g.m.lockedExt, g.m.lockedInt +} + +//go:noinline +func TracebackSystemstack(stk []uintptr, i int) int { + if i == 0 { + pc, sp := getcallerpc(), getcallersp(unsafe.Pointer(&stk)) + return gentraceback(pc, sp, 0, getg(), 0, &stk[0], len(stk), nil, nil, _TraceJumpStack) + } + n := 0 + systemstack(func() { + n = TracebackSystemstack(stk, i-1) + }) + return n +} diff --git a/src/runtime/extern.go b/src/runtime/extern.go index 6e6c674d96..2c20e0d8af 100644 --- a/src/runtime/extern.go +++ b/src/runtime/extern.go @@ -178,11 +178,11 @@ func Caller(skip int) (pc uintptr, file string, line int, ok bool) { // We asked for one extra, so skip that one. If this is sigpanic, // stepping over this frame will set up state in Frames so the // next frame is correct. - callers, _, ok = stackExpander.next(callers) + callers, _, ok = stackExpander.next(callers, true) if !ok { return } - _, frame, _ := stackExpander.next(callers) + _, frame, _ := stackExpander.next(callers, true) pc = frame.PC file = frame.File line = frame.Line @@ -212,8 +212,8 @@ func Callers(skip int, pc []uintptr) int { return callers(skip, pc) } -// GOROOT returns the root of the Go tree. -// It uses the GOROOT environment variable, if set, +// GOROOT returns the root of the Go tree. It uses the +// GOROOT environment variable, if set at process start, // or else the root used during the Go build. 
func GOROOT() string { s := gogetenv("GOROOT") diff --git a/src/runtime/gc_test.go b/src/runtime/gc_test.go index 0620f2d61e..1b1db25b17 100644 --- a/src/runtime/gc_test.go +++ b/src/runtime/gc_test.go @@ -10,6 +10,7 @@ import ( "reflect" "runtime" "runtime/debug" + "sync/atomic" "testing" "time" "unsafe" @@ -515,3 +516,126 @@ func TestUserForcedGC(t *testing.T) { t.Fatalf("runtime.GC() was not accounted in NumForcedGC") } } + +func writeBarrierBenchmark(b *testing.B, f func()) { + runtime.GC() + var ms runtime.MemStats + runtime.ReadMemStats(&ms) + //b.Logf("heap size: %d MB", ms.HeapAlloc>>20) + + // Keep GC running continuously during the benchmark, which in + // turn keeps the write barrier on continuously. + var stop uint32 + done := make(chan bool) + go func() { + for atomic.LoadUint32(&stop) == 0 { + runtime.GC() + } + close(done) + }() + defer func() { + atomic.StoreUint32(&stop, 1) + <-done + }() + + b.ResetTimer() + f() + b.StopTimer() +} + +func BenchmarkWriteBarrier(b *testing.B) { + if runtime.GOMAXPROCS(-1) < 2 { + // We don't want GC to take our time. + b.Skip("need GOMAXPROCS >= 2") + } + + // Construct a large tree both so the GC runs for a while and + // so we have a data structure to manipulate the pointers of. + type node struct { + l, r *node + } + var wbRoots []*node + var mkTree func(level int) *node + mkTree = func(level int) *node { + if level == 0 { + return nil + } + n := &node{mkTree(level - 1), mkTree(level - 1)} + if level == 10 { + // Seed GC with enough early pointers so it + // doesn't accidentally switch to mark 2 when + // it only has the top of the tree. + wbRoots = append(wbRoots, n) + } + return n + } + const depth = 22 // 64 MB + root := mkTree(22) + + writeBarrierBenchmark(b, func() { + var stack [depth]*node + tos := -1 + + // There are two write barriers per iteration, so i+=2. + for i := 0; i < b.N; i += 2 { + if tos == -1 { + stack[0] = root + tos = 0 + } + + // Perform one step of reversing the tree. + n := stack[tos] + if n.l == nil { + tos-- + } else { + n.l, n.r = n.r, n.l + stack[tos] = n.l + stack[tos+1] = n.r + tos++ + } + + if i%(1<<12) == 0 { + // Avoid non-preemptible loops (see issue #10958). + runtime.Gosched() + } + } + }) + + runtime.KeepAlive(wbRoots) +} + +func BenchmarkBulkWriteBarrier(b *testing.B) { + if runtime.GOMAXPROCS(-1) < 2 { + // We don't want GC to take our time. + b.Skip("need GOMAXPROCS >= 2") + } + + // Construct a large set of objects we can copy around. + const heapSize = 64 << 20 + type obj [16]*byte + ptrs := make([]*obj, heapSize/unsafe.Sizeof(obj{})) + for i := range ptrs { + ptrs[i] = new(obj) + } + + writeBarrierBenchmark(b, func() { + const blockSize = 1024 + var pos int + for i := 0; i < b.N; i += blockSize { + // Rotate block. 
+ block := ptrs[pos : pos+blockSize] + first := block[0] + copy(block, block[1:]) + block[blockSize-1] = first + + pos += blockSize + if pos+blockSize > len(ptrs) { + pos = 0 + } + + runtime.Gosched() + } + }) + + runtime.KeepAlive(ptrs) +} diff --git a/src/runtime/hash32.go b/src/runtime/hash32.go index be59076635..5574923911 100644 --- a/src/runtime/hash32.go +++ b/src/runtime/hash32.go @@ -81,6 +81,32 @@ tail: return uintptr(h) } +func memhash32(p unsafe.Pointer, seed uintptr) uintptr { + h := uint32(seed + 4*hashkey[0]) + h ^= readUnaligned32(p) + h = rotl_15(h*m1) * m2 + h ^= h >> 17 + h *= m3 + h ^= h >> 13 + h *= m4 + h ^= h >> 16 + return uintptr(h) +} + +func memhash64(p unsafe.Pointer, seed uintptr) uintptr { + h := uint32(seed + 8*hashkey[0]) + h ^= readUnaligned32(p) + h = rotl_15(h*m1) * m2 + h ^= readUnaligned32(add(p, 4)) + h = rotl_15(h*m1) * m2 + h ^= h >> 17 + h *= m3 + h ^= h >> 13 + h *= m4 + h ^= h >> 16 + return uintptr(h) +} + // Note: in order to get the compiler to issue rotl instructions, we // need to constant fold the shift amount by hand. // TODO: convince the compiler to issue rotl instructions after inlining. diff --git a/src/runtime/hash64.go b/src/runtime/hash64.go index d61f114475..3cf3f4629b 100644 --- a/src/runtime/hash64.go +++ b/src/runtime/hash64.go @@ -81,6 +81,28 @@ tail: return uintptr(h) } +func memhash32(p unsafe.Pointer, seed uintptr) uintptr { + h := uint64(seed + 4*hashkey[0]) + v := uint64(readUnaligned32(p)) + h ^= v + h ^= v << 32 + h = rotl_31(h*m1) * m2 + h ^= h >> 29 + h *= m3 + h ^= h >> 32 + return uintptr(h) +} + +func memhash64(p unsafe.Pointer, seed uintptr) uintptr { + h := uint64(seed + 8*hashkey[0]) + h ^= uint64(readUnaligned32(p)) | uint64(readUnaligned32(add(p, 4)))<<32 + h = rotl_31(h*m1) * m2 + h ^= h >> 29 + h *= m3 + h ^= h >> 32 + return uintptr(h) +} + // Note: in order to get the compiler to issue rotl instructions, we // need to constant fold the shift amount by hand. // TODO: convince the compiler to issue rotl instructions after inlining. diff --git a/src/runtime/hash_test.go b/src/runtime/hash_test.go index a6f3cdbdbe..1400579cda 100644 --- a/src/runtime/hash_test.go +++ b/src/runtime/hash_test.go @@ -14,6 +14,40 @@ import ( "unsafe" ) +func TestMemHash32Equality(t *testing.T) { + if *UseAeshash { + t.Skip("skipping since AES hash implementation is used") + } + var b [4]byte + r := rand.New(rand.NewSource(1234)) + seed := uintptr(r.Uint64()) + for i := 0; i < 100; i++ { + randBytes(r, b[:]) + got := MemHash32(unsafe.Pointer(&b), seed) + want := MemHash(unsafe.Pointer(&b), seed, 4) + if got != want { + t.Errorf("MemHash32(%x, %v) = %v; want %v", b, seed, got, want) + } + } +} + +func TestMemHash64Equality(t *testing.T) { + if *UseAeshash { + t.Skip("skipping since AES hash implementation is used") + } + var b [8]byte + r := rand.New(rand.NewSource(1234)) + seed := uintptr(r.Uint64()) + for i := 0; i < 100; i++ { + randBytes(r, b[:]) + got := MemHash64(unsafe.Pointer(&b), seed) + want := MemHash(unsafe.Pointer(&b), seed, 8) + if got != want { + t.Errorf("MemHash64(%x, %v) = %v; want %v", b, seed, got, want) + } + } +} + // Smhasher is a torture test for hash functions. // https://code.google.com/p/smhasher/ // This code is a port of some of the Smhasher tests to Go. 
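Both the 32-bit (hash32.go) and 64-bit (hash64.go) fallback hashers gain memhash32 and memhash64 above, and the two equality tests added to hash_test.go check that they agree with the generic memhash whenever the AES hash is not in use. Those tests can be run on their own with, for example:

	go test -run 'TestMemHash32Equality|TestMemHash64Equality' runtime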
diff --git a/src/runtime/hashmap.go b/src/runtime/hashmap.go index 11ce0cbc4b..dee5dd5816 100644 --- a/src/runtime/hashmap.go +++ b/src/runtime/hashmap.go @@ -64,8 +64,10 @@ const ( bucketCntBits = 3 bucketCnt = 1 << bucketCntBits - // Maximum average load of a bucket that triggers growth. - loadFactor = 6.5 + // Maximum average load of a bucket that triggers growth is 6.5. + // Represent as loadFactorNum/loadFactDen, to allow integer math. + loadFactorNum = 13 + loadFactorDen = 2 // Maximum key or value size to keep inline (instead of mallocing per element). // Must fit in a uint8. @@ -124,12 +126,13 @@ type mapextra struct { // If both key and value do not contain pointers and are inline, then we mark bucket // type as containing no pointers. This avoids scanning such maps. // However, bmap.overflow is a pointer. In order to keep overflow buckets - // alive, we store pointers to all overflow buckets in hmap.overflow. - // Overflow is used only if key and value do not contain pointers. - // overflow[0] contains overflow buckets for hmap.buckets. - // overflow[1] contains overflow buckets for hmap.oldbuckets. + // alive, we store pointers to all overflow buckets in hmap.overflow and h.map.oldoverflow. + // overflow and oldoverflow are only used if key and value do not contain pointers. + // overflow contains overflow buckets for hmap.buckets. + // oldoverflow contains overflow buckets for hmap.oldbuckets. // The indirection allows to store a pointer to the slice in hiter. - overflow [2]*[]*bmap + overflow *[]*bmap + oldoverflow *[]*bmap // nextOverflow holds a pointer to a free overflow bucket. nextOverflow *bmap @@ -158,7 +161,8 @@ type hiter struct { h *hmap buckets unsafe.Pointer // bucket ptr at hash_iter initialization time bptr *bmap // current bucket - overflow [2]*[]*bmap // keeps overflow buckets alive + overflow *[]*bmap // keeps overflow buckets of hmap.buckets alive + oldoverflow *[]*bmap // keeps overflow buckets of hmap.oldbuckets alive startBucket uintptr // bucket iteration started at offset uint8 // intra-bucket offset to start from during iteration (should be big enough to hold bucketCnt-1) wrapped bool // already wrapped around from end of bucket array to beginning @@ -168,6 +172,28 @@ type hiter struct { checkBucket uintptr } +// bucketShift returns 1<<b, optimized for code generation. +func bucketShift(b uint8) uintptr { + if sys.GoarchAmd64|sys.GoarchAmd64p32|sys.Goarch386 != 0 { + b &= sys.PtrSize*8 - 1 // help x86 archs remove shift overflow checks + } + return uintptr(1) << b +} + +// bucketMask returns 1<<b - 1, optimized for code generation. +func bucketMask(b uint8) uintptr { + return bucketShift(b) - 1 +} + +// tophash calculates the tophash value for hash. +func tophash(hash uintptr) uint8 { + top := uint8(hash >> (sys.PtrSize*8 - 8)) + if top < minTopHash { + top += minTopHash + } + return top +} + func evacuated(b *bmap) bool { h := b.tophash[0] return h > empty && h < minTopHash @@ -181,6 +207,10 @@ func (b *bmap) setoverflow(t *maptype, ovf *bmap) { *(**bmap)(add(unsafe.Pointer(b), uintptr(t.bucketsize)-sys.PtrSize)) = ovf } +func (b *bmap) keys() unsafe.Pointer { + return add(unsafe.Pointer(b), dataOffset) +} + // incrnoverflow increments h.noverflow. // noverflow counts the number of overflow buckets. // This is used to trigger same-size map growth. 
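The bucketShift, bucketMask, and tophash helpers above, together with the loadFactorNum/loadFactorDen pair, replace repeated shift expressions and the old floating-point 6.5 load-factor comparison. A self-contained sketch of the arithmetic with an illustrative bucket exponent (not runtime code):

package main

import "fmt"

func bucketShift(b uint8) uintptr { return uintptr(1) << b }
func bucketMask(b uint8) uintptr  { return bucketShift(b) - 1 }

// overLoad mirrors the integer form of the test: 13/2 == 6.5, so
// count > 13*((1<<B)/2) means the average bucket load would exceed 6.5.
func overLoad(count int, B uint8) bool {
	return count > 8 && uintptr(count) > 13*(bucketShift(B)/2)
}

func main() {
	const B = 5                   // 32 buckets, purely illustrative
	fmt.Println(bucketMask(B))    // 31
	fmt.Println(overLoad(208, B)) // false: 208 == 6.5*32 exactly
	fmt.Println(overLoad(209, B)) // true: growth would be triggered
}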
@@ -229,7 +259,7 @@ func (h *hmap) newoverflow(t *maptype, b *bmap) *bmap { h.incrnoverflow() if t.bucket.kind&kindNoPointers != 0 { h.createOverflow() - *h.extra.overflow[0] = append(*h.extra.overflow[0], ovf) + *h.extra.overflow = append(*h.extra.overflow, ovf) } b.setoverflow(t, ovf) return ovf @@ -239,97 +269,69 @@ func (h *hmap) createOverflow() { if h.extra == nil { h.extra = new(mapextra) } - if h.extra.overflow[0] == nil { - h.extra.overflow[0] = new([]*bmap) + if h.extra.overflow == nil { + h.extra.overflow = new([]*bmap) } } -// makemap implements a Go map creation make(map[k]v, hint) +func makemap64(t *maptype, hint int64, h *hmap) *hmap { + if int64(int(hint)) != hint { + hint = 0 + } + return makemap(t, int(hint), h) +} + +// makehmap_small implements Go map creation for make(map[k]v) and +// make(map[k]v, hint) when hint is known to be at most bucketCnt +// at compile time and the map needs to be allocated on the heap. +func makemap_small() *hmap { + h := new(hmap) + h.hash0 = fastrand() + return h +} + +// makemap implements Go map creation for make(map[k]v, hint). // If the compiler has determined that the map or the first bucket // can be created on the stack, h and/or bucket may be non-nil. // If h != nil, the map can be created directly in h. -// If bucket != nil, bucket can be used as the first bucket. -func makemap(t *maptype, hint int64, h *hmap, bucket unsafe.Pointer) *hmap { - if sz := unsafe.Sizeof(hmap{}); sz > 48 || sz != t.hmap.size { +// If h.buckets != nil, bucket pointed to can be used as the first bucket. +func makemap(t *maptype, hint int, h *hmap) *hmap { + // The size of hmap should be 48 bytes on 64 bit + // and 28 bytes on 32 bit platforms. + if sz := unsafe.Sizeof(hmap{}); sz != 8+5*sys.PtrSize { println("runtime: sizeof(hmap) =", sz, ", t.hmap.size =", t.hmap.size) throw("bad hmap size") } - if hint < 0 || hint > int64(maxSliceCap(t.bucket.size)) { + if hint < 0 || hint > int(maxSliceCap(t.bucket.size)) { hint = 0 } - if !ismapkey(t.key) { - throw("runtime.makemap: unsupported map key type") - } - - // check compiler's and reflect's math - if t.key.size > maxKeySize && (!t.indirectkey || t.keysize != uint8(sys.PtrSize)) || - t.key.size <= maxKeySize && (t.indirectkey || t.keysize != uint8(t.key.size)) { - throw("key size wrong") - } - if t.elem.size > maxValueSize && (!t.indirectvalue || t.valuesize != uint8(sys.PtrSize)) || - t.elem.size <= maxValueSize && (t.indirectvalue || t.valuesize != uint8(t.elem.size)) { - throw("value size wrong") - } - - // invariants we depend on. We should probably check these at compile time - // somewhere, but for now we'll do it here. 
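makemap_small and makemap64 above split map construction by hint: makemap64 simply treats a hint that does not fit in int as 0, and makemap_small covers maps whose hint is known at compile time to be at most bucketCnt. Roughly which entry point a given make call reaches (a hedged illustration of compiler behavior, not part of the patch):

package main

func main() {
	m1 := make(map[string]int)       // no hint; if heap-allocated, makemap_small is enough
	m2 := make(map[string]int, 4)    // hint known to be <= bucketCnt (8): also a makemap_small candidate
	m3 := make(map[string]int, 1000) // larger hint: the full makemap path with an int hint
	_, _, _ = m1, m2, m3
}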
- if t.key.align > bucketCnt { - throw("key align too big") - } - if t.elem.align > bucketCnt { - throw("value align too big") - } - if t.key.size%uintptr(t.key.align) != 0 { - throw("key size not a multiple of key align") - } - if t.elem.size%uintptr(t.elem.align) != 0 { - throw("value size not a multiple of value align") - } - if bucketCnt < 8 { - throw("bucketsize too small for proper alignment") - } - if dataOffset%uintptr(t.key.align) != 0 { - throw("need padding in bucket (key)") - } - if dataOffset%uintptr(t.elem.align) != 0 { - throw("need padding in bucket (value)") + // initialize Hmap + if h == nil { + h = (*hmap)(newobject(t.hmap)) } + h.hash0 = fastrand() // find size parameter which will hold the requested # of elements B := uint8(0) - for ; overLoadFactor(hint, B); B++ { + for overLoadFactor(hint, B) { + B++ } + h.B = B // allocate initial hash table // if B == 0, the buckets field is allocated lazily later (in mapassign) // If hint is large zeroing this memory could take a while. - buckets := bucket - var extra *mapextra - if B != 0 { + if h.B != 0 { var nextOverflow *bmap - buckets, nextOverflow = makeBucketArray(t, B) + h.buckets, nextOverflow = makeBucketArray(t, h.B) if nextOverflow != nil { - extra = new(mapextra) - extra.nextOverflow = nextOverflow + h.extra = new(mapextra) + h.extra.nextOverflow = nextOverflow } } - // initialize Hmap - if h == nil { - h = (*hmap)(newobject(t.hmap)) - } - h.count = 0 - h.B = B - h.extra = extra - h.flags = 0 - h.hash0 = fastrand() - h.buckets = buckets - h.oldbuckets = nil - h.nevacuate = 0 - h.noverflow = 0 - return h } @@ -340,7 +342,7 @@ func makemap(t *maptype, hint int64, h *hmap, bucket unsafe.Pointer) *hmap { // hold onto it for very long. func mapaccess1(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() pc := funcPC(mapaccess1) racereadpc(unsafe.Pointer(h), callerpc, pc) raceReadObjectPC(t.key, key, callerpc, pc) @@ -356,7 +358,7 @@ func mapaccess1(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { } alg := t.key.alg hash := alg.hash(key, uintptr(h.hash0)) - m := uintptr(1)<<h.B - 1 + m := bucketMask(h.B) b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) if c := h.oldbuckets; c != nil { if !h.sameSizeGrow() { @@ -368,11 +370,8 @@ func mapaccess1(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { b = oldb } } - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - for { + top := tophash(hash) + for ; b != nil; b = b.overflow(t) { for i := uintptr(0); i < bucketCnt; i++ { if b.tophash[i] != top { continue @@ -389,16 +388,13 @@ func mapaccess1(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { return v } } - b = b.overflow(t) - if b == nil { - return unsafe.Pointer(&zeroVal[0]) - } } + return unsafe.Pointer(&zeroVal[0]) } func mapaccess2(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, bool) { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() pc := funcPC(mapaccess2) racereadpc(unsafe.Pointer(h), callerpc, pc) raceReadObjectPC(t.key, key, callerpc, pc) @@ -414,7 +410,7 @@ func mapaccess2(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, bool) } alg := t.key.alg hash := alg.hash(key, uintptr(h.hash0)) - m := uintptr(1)<<h.B - 1 + m := bucketMask(h.B) b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + (hash&m)*uintptr(t.bucketsize))) if c := h.oldbuckets; c != nil { if !h.sameSizeGrow() 
{ @@ -426,11 +422,8 @@ func mapaccess2(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, bool) b = oldb } } - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - for { + top := tophash(hash) + for ; b != nil; b = b.overflow(t) { for i := uintptr(0); i < bucketCnt; i++ { if b.tophash[i] != top { continue @@ -447,11 +440,8 @@ func mapaccess2(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, bool) return v, true } } - b = b.overflow(t) - if b == nil { - return unsafe.Pointer(&zeroVal[0]), false - } } + return unsafe.Pointer(&zeroVal[0]), false } // returns both key and value. Used by map iterator @@ -461,7 +451,7 @@ func mapaccessK(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, unsafe } alg := t.key.alg hash := alg.hash(key, uintptr(h.hash0)) - m := uintptr(1)<<h.B - 1 + m := bucketMask(h.B) b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + (hash&m)*uintptr(t.bucketsize))) if c := h.oldbuckets; c != nil { if !h.sameSizeGrow() { @@ -473,11 +463,8 @@ func mapaccessK(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, unsafe b = oldb } } - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - for { + top := tophash(hash) + for ; b != nil; b = b.overflow(t) { for i := uintptr(0); i < bucketCnt; i++ { if b.tophash[i] != top { continue @@ -494,11 +481,8 @@ func mapaccessK(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, unsafe return k, v } } - b = b.overflow(t) - if b == nil { - return nil, nil - } } + return nil, nil } func mapaccess1_fat(t *maptype, h *hmap, key, zero unsafe.Pointer) unsafe.Pointer { @@ -523,7 +507,7 @@ func mapassign(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { panic(plainError("assignment to entry in nil map")) } if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() pc := funcPC(mapassign) racewritepc(unsafe.Pointer(h), callerpc, pc) raceReadObjectPC(t.key, key, callerpc, pc) @@ -542,19 +526,16 @@ func mapassign(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { h.flags |= hashWriting if h.buckets == nil { - h.buckets = newarray(t.bucket, 1) + h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) } again: - bucket := hash & (uintptr(1)<<h.B - 1) + bucket := hash & bucketMask(h.B) if h.growing() { growWork(t, h, bucket) } b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } + top := tophash(hash) var inserti *uint8 var insertk unsafe.Pointer @@ -594,7 +575,7 @@ again: // If we hit the max load factor or we have too many overflow buckets, // and we're not already in the middle of growing, start growing. - if !h.growing() && (overLoadFactor(int64(h.count), h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { + if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { hashGrow(t, h) goto again // Growing the table invalidates everything, so try again } @@ -634,7 +615,7 @@ done: func mapdelete(t *maptype, h *hmap, key unsafe.Pointer) { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() pc := funcPC(mapdelete) racewritepc(unsafe.Pointer(h), callerpc, pc) raceReadObjectPC(t.key, key, callerpc, pc) @@ -656,16 +637,14 @@ func mapdelete(t *maptype, h *hmap, key unsafe.Pointer) { // in which case we have not actually done a write (delete). 
h.flags |= hashWriting - bucket := hash & (uintptr(1)<<h.B - 1) + bucket := hash & bucketMask(h.B) if h.growing() { growWork(t, h, bucket) } - b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - for { + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) + top := tophash(hash) +search: + for ; b != nil; b = b.overflow(t) { for i := uintptr(0); i < bucketCnt; i++ { if b.tophash[i] != top { continue @@ -678,53 +657,44 @@ func mapdelete(t *maptype, h *hmap, key unsafe.Pointer) { if !alg.equal(key, k2) { continue } + // Only clear key if there are pointers in it. if t.indirectkey { *(*unsafe.Pointer)(k) = nil - } else { - typedmemclr(t.key, k) + } else if t.key.kind&kindNoPointers == 0 { + memclrHasPointers(k, t.key.size) } - v := unsafe.Pointer(uintptr(unsafe.Pointer(b)) + dataOffset + bucketCnt*uintptr(t.keysize) + i*uintptr(t.valuesize)) - if t.indirectvalue { - *(*unsafe.Pointer)(v) = nil - } else { - typedmemclr(t.elem, v) + // Only clear value if there are pointers in it. + if t.indirectvalue || t.elem.kind&kindNoPointers == 0 { + v := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.valuesize)) + if t.indirectvalue { + *(*unsafe.Pointer)(v) = nil + } else { + memclrHasPointers(v, t.elem.size) + } } b.tophash[i] = empty h.count-- - goto done - } - b = b.overflow(t) - if b == nil { - goto done + break search } } -done: if h.flags&hashWriting == 0 { throw("concurrent map writes") } h.flags &^= hashWriting } +// mapiterinit initializes the hiter struct used for ranging over maps. +// The hiter struct pointed to by 'it' is allocated on the stack +// by the compilers order pass or on the heap by reflect_mapiterinit. +// Both need to have zeroed hiter since the struct contains pointers. func mapiterinit(t *maptype, h *hmap, it *hiter) { - // Clear pointer fields so garbage collector does not complain. - it.key = nil - it.value = nil - it.t = nil - it.h = nil - it.buckets = nil - it.bptr = nil - it.overflow[0] = nil - it.overflow[1] = nil - if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapiterinit)) } if h == nil || h.count == 0 { - it.key = nil - it.value = nil return } @@ -744,6 +714,7 @@ func mapiterinit(t *maptype, h *hmap, it *hiter) { // while we are iterating. h.createOverflow() it.overflow = h.extra.overflow + it.oldoverflow = h.extra.oldoverflow } // decide where to start @@ -751,16 +722,14 @@ func mapiterinit(t *maptype, h *hmap, it *hiter) { if h.B > 31-bucketCntBits { r += uintptr(fastrand()) << 31 } - it.startBucket = r & (uintptr(1)<<h.B - 1) + it.startBucket = r & bucketMask(h.B) it.offset = uint8(r >> h.B & (bucketCnt - 1)) // iterator state it.bucket = it.startBucket - it.wrapped = false - it.bptr = nil // Remember we have an iterator. - // Can run concurrently with another hash_iter_init(). + // Can run concurrently with another mapiterinit(). 
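mapiterinit above seeds the walk with fastrand(), choosing a random startBucket and intra-bucket offset, which is why range order over a map varies from run to run. A tiny demonstration (the printed order is intentionally unspecified):

package main

import "fmt"

func main() {
	m := map[string]int{"a": 1, "b": 2, "c": 3}
	for k, v := range m { // the starting bucket and offset are randomized
		fmt.Println(k, v)
	}
}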
if old := h.flags; old&(iterator|oldIterator) != iterator|oldIterator { atomic.Or8(&h.flags, iterator|oldIterator) } @@ -771,7 +740,7 @@ func mapiterinit(t *maptype, h *hmap, it *hiter) { func mapiternext(it *hiter) { h := it.h if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&it)) + callerpc := getcallerpc() racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapiternext)) } if h.flags&hashWriting != 0 { @@ -810,7 +779,7 @@ next: checkBucket = noCheck } bucket++ - if bucket == uintptr(1)<<it.B { + if bucket == bucketShift(it.B) { bucket = 0 it.wrapped = true } @@ -818,90 +787,75 @@ next: } for ; i < bucketCnt; i++ { offi := (i + it.offset) & (bucketCnt - 1) + if b.tophash[offi] == empty || b.tophash[offi] == evacuatedEmpty { + continue + } k := add(unsafe.Pointer(b), dataOffset+uintptr(offi)*uintptr(t.keysize)) + if t.indirectkey { + k = *((*unsafe.Pointer)(k)) + } v := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+uintptr(offi)*uintptr(t.valuesize)) - if b.tophash[offi] != empty && b.tophash[offi] != evacuatedEmpty { - if checkBucket != noCheck && !h.sameSizeGrow() { - // Special case: iterator was started during a grow to a larger size - // and the grow is not done yet. We're working on a bucket whose - // oldbucket has not been evacuated yet. Or at least, it wasn't - // evacuated when we started the bucket. So we're iterating - // through the oldbucket, skipping any keys that will go - // to the other new bucket (each oldbucket expands to two - // buckets during a grow). - k2 := k - if t.indirectkey { - k2 = *((*unsafe.Pointer)(k2)) - } - if t.reflexivekey || alg.equal(k2, k2) { - // If the item in the oldbucket is not destined for - // the current new bucket in the iteration, skip it. - hash := alg.hash(k2, uintptr(h.hash0)) - if hash&(uintptr(1)<<it.B-1) != checkBucket { - continue - } - } else { - // Hash isn't repeatable if k != k (NaNs). We need a - // repeatable and randomish choice of which direction - // to send NaNs during evacuation. We'll use the low - // bit of tophash to decide which way NaNs go. - // NOTE: this case is why we need two evacuate tophash - // values, evacuatedX and evacuatedY, that differ in - // their low bit. - if checkBucket>>(it.B-1) != uintptr(b.tophash[offi]&1) { - continue - } - } - } - if b.tophash[offi] != evacuatedX && b.tophash[offi] != evacuatedY { - // this is the golden data, we can return it. - if t.indirectkey { - k = *((*unsafe.Pointer)(k)) - } - it.key = k - if t.indirectvalue { - v = *((*unsafe.Pointer)(v)) + if checkBucket != noCheck && !h.sameSizeGrow() { + // Special case: iterator was started during a grow to a larger size + // and the grow is not done yet. We're working on a bucket whose + // oldbucket has not been evacuated yet. Or at least, it wasn't + // evacuated when we started the bucket. So we're iterating + // through the oldbucket, skipping any keys that will go + // to the other new bucket (each oldbucket expands to two + // buckets during a grow). + if t.reflexivekey || alg.equal(k, k) { + // If the item in the oldbucket is not destined for + // the current new bucket in the iteration, skip it. + hash := alg.hash(k, uintptr(h.hash0)) + if hash&bucketMask(it.B) != checkBucket { + continue } - it.value = v } else { - // The hash table has grown since the iterator was started. - // The golden data for this key is now somewhere else. - k2 := k - if t.indirectkey { - k2 = *((*unsafe.Pointer)(k2)) - } - if t.reflexivekey || alg.equal(k2, k2) { - // Check the current hash table for the data. 
- // This code handles the case where the key - // has been deleted, updated, or deleted and reinserted. - // NOTE: we need to regrab the key as it has potentially been - // updated to an equal() but not identical key (e.g. +0.0 vs -0.0). - rk, rv := mapaccessK(t, h, k2) - if rk == nil { - continue // key has been deleted - } - it.key = rk - it.value = rv - } else { - // if key!=key then the entry can't be deleted or - // updated, so we can just return it. That's lucky for - // us because when key!=key we can't look it up - // successfully in the current table. - it.key = k2 - if t.indirectvalue { - v = *((*unsafe.Pointer)(v)) - } - it.value = v + // Hash isn't repeatable if k != k (NaNs). We need a + // repeatable and randomish choice of which direction + // to send NaNs during evacuation. We'll use the low + // bit of tophash to decide which way NaNs go. + // NOTE: this case is why we need two evacuate tophash + // values, evacuatedX and evacuatedY, that differ in + // their low bit. + if checkBucket>>(it.B-1) != uintptr(b.tophash[offi]&1) { + continue } } - it.bucket = bucket - if it.bptr != b { // avoid unnecessary write barrier; see issue 14921 - it.bptr = b + } + if (b.tophash[offi] != evacuatedX && b.tophash[offi] != evacuatedY) || + !(t.reflexivekey || alg.equal(k, k)) { + // This is the golden data, we can return it. + // OR + // key!=key, so the entry can't be deleted or updated, so we can just return it. + // That's lucky for us because when key!=key we can't look it up successfully. + it.key = k + if t.indirectvalue { + v = *((*unsafe.Pointer)(v)) } - it.i = i + 1 - it.checkBucket = checkBucket - return + it.value = v + } else { + // The hash table has grown since the iterator was started. + // The golden data for this key is now somewhere else. + // Check the current hash table for the data. + // This code handles the case where the key + // has been deleted, updated, or deleted and reinserted. + // NOTE: we need to regrab the key as it has potentially been + // updated to an equal() but not identical key (e.g. +0.0 vs -0.0). + rk, rv := mapaccessK(t, h, k) + if rk == nil { + continue // key has been deleted + } + it.key = rk + it.value = rv + } + it.bucket = bucket + if it.bptr != b { // avoid unnecessary write barrier; see issue 14921 + it.bptr = b } + it.i = i + 1 + it.checkBucket = checkBucket + return } b = b.overflow(t) i = 0 @@ -909,7 +863,7 @@ next: } func makeBucketArray(t *maptype, b uint8) (buckets unsafe.Pointer, nextOverflow *bmap) { - base := uintptr(1 << b) + base := bucketShift(b) nbuckets := base // For small b, overflow buckets are unlikely. // Avoid the overhead of the calculation. @@ -917,7 +871,7 @@ func makeBucketArray(t *maptype, b uint8) (buckets unsafe.Pointer, nextOverflow // Add on the estimated number of overflow buckets // required to insert the median number of elements // used with this value of b. - nbuckets += 1 << (b - 4) + nbuckets += bucketShift(b - 4) sz := t.bucket.size * nbuckets up := roundupsize(sz) if up != sz { @@ -943,7 +897,7 @@ func hashGrow(t *maptype, h *hmap) { // Otherwise, there are too many overflow buckets, // so keep the same number of buckets and "grow" laterally. 
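For a concrete feel of the overflow preallocation in makeBucketArray above: with b = 8 the base array holds 256 buckets and bucketShift(b-4) adds another 16, i.e. roughly one preallocated overflow bucket per 16 regular buckets, before roundupsize may enlarge the allocation further.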
bigger := uint8(1) - if !overLoadFactor(int64(h.count), h.B) { + if !overLoadFactor(h.count+1, h.B) { bigger = 0 h.flags |= sameSizeGrow } @@ -962,13 +916,13 @@ func hashGrow(t *maptype, h *hmap) { h.nevacuate = 0 h.noverflow = 0 - if h.extra != nil && h.extra.overflow[0] != nil { + if h.extra != nil && h.extra.overflow != nil { // Promote current overflow buckets to the old generation. - if h.extra.overflow[1] != nil { - throw("overflow is not nil") + if h.extra.oldoverflow != nil { + throw("oldoverflow is not nil") } - h.extra.overflow[1] = h.extra.overflow[0] - h.extra.overflow[0] = nil + h.extra.oldoverflow = h.extra.overflow + h.extra.overflow = nil } if nextOverflow != nil { if h.extra == nil { @@ -982,9 +936,8 @@ func hashGrow(t *maptype, h *hmap) { } // overLoadFactor reports whether count items placed in 1<<B buckets is over loadFactor. -func overLoadFactor(count int64, B uint8) bool { - // TODO: rewrite to use integer math and comparison? - return count >= bucketCnt && float32(count) >= loadFactor*float32((uint64(1)<<B)) +func overLoadFactor(count int, B uint8) bool { + return count > bucketCnt && uintptr(count) > loadFactorNum*(bucketShift(B)/loadFactorDen) } // tooManyOverflowBuckets reports whether noverflow buckets is too many for a map with 1<<B buckets. @@ -995,10 +948,11 @@ func tooManyOverflowBuckets(noverflow uint16, B uint8) bool { // If the threshold is too high, maps that grow and shrink can hold on to lots of unused memory. // "too many" means (approximately) as many overflow buckets as regular buckets. // See incrnoverflow for more details. - if B < 16 { - return noverflow >= uint16(1)<<B + if B > 15 { + B = 15 } - return noverflow >= 1<<15 + // The compiler doesn't see here that B < 16; mask B to generate shorter shift code. + return noverflow >= uint16(1)<<(B&15) } // growing reports whether h is growing. The growth may be to the same size or bigger. @@ -1017,7 +971,7 @@ func (h *hmap) noldbuckets() uintptr { if !h.sameSizeGrow() { oldB-- } - return uintptr(1) << oldB + return bucketShift(oldB) } // oldbucketmask provides a mask that can be applied to calculate n % noldbuckets(). @@ -1041,32 +995,37 @@ func bucketEvacuated(t *maptype, h *hmap, bucket uintptr) bool { return evacuated(b) } +// evacDst is an evacuation destination. +type evacDst struct { + b *bmap // current destination bucket + i int // key/val index into b + k unsafe.Pointer // pointer to current key storage + v unsafe.Pointer // pointer to current value storage +} + func evacuate(t *maptype, h *hmap, oldbucket uintptr) { b := (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize))) newbit := h.noldbuckets() - alg := t.key.alg if !evacuated(b) { // TODO: reuse overflow buckets instead of using new ones, if there // is no iterator using the old buckets. (If !oldIterator.) - var ( - x, y *bmap // current low/high buckets in new map - xi, yi int // key/val indices into x and y - xk, yk unsafe.Pointer // pointers to current x and y key storage - xv, yv unsafe.Pointer // pointers to current x and y value storage - ) - x = (*bmap)(add(h.buckets, oldbucket*uintptr(t.bucketsize))) - xi = 0 - xk = add(unsafe.Pointer(x), dataOffset) - xv = add(xk, bucketCnt*uintptr(t.keysize)) + // xy contains the x and y (low and high) evacuation destinations. 
+ var xy [2]evacDst + x := &xy[0] + x.b = (*bmap)(add(h.buckets, oldbucket*uintptr(t.bucketsize))) + x.k = add(unsafe.Pointer(x.b), dataOffset) + x.v = add(x.k, bucketCnt*uintptr(t.keysize)) + if !h.sameSizeGrow() { // Only calculate y pointers if we're growing bigger. // Otherwise GC can see bad pointers. - y = (*bmap)(add(h.buckets, (oldbucket+newbit)*uintptr(t.bucketsize))) - yi = 0 - yk = add(unsafe.Pointer(y), dataOffset) - yv = add(yk, bucketCnt*uintptr(t.keysize)) + y := &xy[1] + y.b = (*bmap)(add(h.buckets, (oldbucket+newbit)*uintptr(t.bucketsize))) + y.k = add(unsafe.Pointer(y.b), dataOffset) + y.v = add(y.k, bucketCnt*uintptr(t.keysize)) } + for ; b != nil; b = b.overflow(t) { k := add(unsafe.Pointer(b), dataOffset) v := add(k, bucketCnt*uintptr(t.keysize)) @@ -1083,122 +1042,102 @@ func evacuate(t *maptype, h *hmap, oldbucket uintptr) { if t.indirectkey { k2 = *((*unsafe.Pointer)(k2)) } - useX := true + var useY uint8 if !h.sameSizeGrow() { // Compute hash to make our evacuation decision (whether we need // to send this key/value to bucket x or bucket y). - hash := alg.hash(k2, uintptr(h.hash0)) - if h.flags&iterator != 0 { - if !t.reflexivekey && !alg.equal(k2, k2) { - // If key != key (NaNs), then the hash could be (and probably - // will be) entirely different from the old hash. Moreover, - // it isn't reproducible. Reproducibility is required in the - // presence of iterators, as our evacuation decision must - // match whatever decision the iterator made. - // Fortunately, we have the freedom to send these keys either - // way. Also, tophash is meaningless for these kinds of keys. - // We let the low bit of tophash drive the evacuation decision. - // We recompute a new random tophash for the next level so - // these keys will get evenly distributed across all buckets - // after multiple grows. - if top&1 != 0 { - hash |= newbit - } else { - hash &^= newbit - } - top = uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } + hash := t.key.alg.hash(k2, uintptr(h.hash0)) + if h.flags&iterator != 0 && !t.reflexivekey && !t.key.alg.equal(k2, k2) { + // If key != key (NaNs), then the hash could be (and probably + // will be) entirely different from the old hash. Moreover, + // it isn't reproducible. Reproducibility is required in the + // presence of iterators, as our evacuation decision must + // match whatever decision the iterator made. + // Fortunately, we have the freedom to send these keys either + // way. Also, tophash is meaningless for these kinds of keys. + // We let the low bit of tophash drive the evacuation decision. + // We recompute a new random tophash for the next level so + // these keys will get evenly distributed across all buckets + // after multiple grows. 
+ useY = top & 1 + top = tophash(hash) + } else { + if hash&newbit != 0 { + useY = 1 } } - useX = hash&newbit == 0 } - if useX { - b.tophash[i] = evacuatedX - if xi == bucketCnt { - newx := h.newoverflow(t, x) - x = newx - xi = 0 - xk = add(unsafe.Pointer(x), dataOffset) - xv = add(xk, bucketCnt*uintptr(t.keysize)) - } - x.tophash[xi] = top - if t.indirectkey { - *(*unsafe.Pointer)(xk) = k2 // copy pointer - } else { - typedmemmove(t.key, xk, k) // copy value - } - if t.indirectvalue { - *(*unsafe.Pointer)(xv) = *(*unsafe.Pointer)(v) - } else { - typedmemmove(t.elem, xv, v) - } - xi++ - xk = add(xk, uintptr(t.keysize)) - xv = add(xv, uintptr(t.valuesize)) + + if evacuatedX+1 != evacuatedY { + throw("bad evacuatedN") + } + + b.tophash[i] = evacuatedX + useY // evacuatedX + 1 == evacuatedY + dst := &xy[useY] // evacuation destination + + if dst.i == bucketCnt { + dst.b = h.newoverflow(t, dst.b) + dst.i = 0 + dst.k = add(unsafe.Pointer(dst.b), dataOffset) + dst.v = add(dst.k, bucketCnt*uintptr(t.keysize)) + } + dst.b.tophash[dst.i&(bucketCnt-1)] = top // mask dst.i as an optimization, to avoid a bounds check + if t.indirectkey { + *(*unsafe.Pointer)(dst.k) = k2 // copy pointer } else { - b.tophash[i] = evacuatedY - if yi == bucketCnt { - newy := h.newoverflow(t, y) - y = newy - yi = 0 - yk = add(unsafe.Pointer(y), dataOffset) - yv = add(yk, bucketCnt*uintptr(t.keysize)) - } - y.tophash[yi] = top - if t.indirectkey { - *(*unsafe.Pointer)(yk) = k2 - } else { - typedmemmove(t.key, yk, k) - } - if t.indirectvalue { - *(*unsafe.Pointer)(yv) = *(*unsafe.Pointer)(v) - } else { - typedmemmove(t.elem, yv, v) - } - yi++ - yk = add(yk, uintptr(t.keysize)) - yv = add(yv, uintptr(t.valuesize)) + typedmemmove(t.key, dst.k, k) // copy value } + if t.indirectvalue { + *(*unsafe.Pointer)(dst.v) = *(*unsafe.Pointer)(v) + } else { + typedmemmove(t.elem, dst.v, v) + } + dst.i++ + // These updates might push these pointers past the end of the + // key or value arrays. That's ok, as we have the overflow pointer + // at the end of the bucket to protect against pointing past the + // end of the bucket. + dst.k = add(dst.k, uintptr(t.keysize)) + dst.v = add(dst.v, uintptr(t.valuesize)) } } // Unlink the overflow buckets & clear key/value to help GC. - if h.flags&oldIterator == 0 { - b = (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize))) + if h.flags&oldIterator == 0 && t.bucket.kind&kindNoPointers == 0 { + b := add(h.oldbuckets, oldbucket*uintptr(t.bucketsize)) // Preserve b.tophash because the evacuation // state is maintained there. - if t.bucket.kind&kindNoPointers == 0 { - memclrHasPointers(add(unsafe.Pointer(b), dataOffset), uintptr(t.bucketsize)-dataOffset) - } else { - memclrNoHeapPointers(add(unsafe.Pointer(b), dataOffset), uintptr(t.bucketsize)-dataOffset) - } + ptr := add(b, dataOffset) + n := uintptr(t.bucketsize) - dataOffset + memclrHasPointers(ptr, n) } } - // Advance evacuation mark if oldbucket == h.nevacuate { - h.nevacuate = oldbucket + 1 - // Experiments suggest that 1024 is overkill by at least an order of magnitude. - // Put it in there as a safeguard anyway, to ensure O(1) behavior. - stop := h.nevacuate + 1024 - if stop > newbit { - stop = newbit - } - for h.nevacuate != stop && bucketEvacuated(t, h, h.nevacuate) { - h.nevacuate++ - } - if h.nevacuate == newbit { // newbit == # of oldbuckets - // Growing is all done. Free old main bucket array. - h.oldbuckets = nil - // Can discard old overflow buckets as well. 
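The evacuate rewrite above funnels both destinations through the evacDst pair xy and encodes the choice as useY (0 or 1), relying on evacuatedX+1 == evacuatedY so the old slot can be marked with evacuatedX + useY. A standalone toy of the split decision itself, showing which half of the doubled table an old entry lands in (all values illustrative):

package main

import "fmt"

func main() {
	const oldB = 3               // 8 old buckets, illustrative
	newbit := uintptr(1) << oldB // the hash bit exposed by doubling
	for _, hash := range []uintptr{0x05, 0x0d, 0x15} {
		oldBucket := hash & (newbit - 1) // bucket in the old table
		useY := uintptr(0)
		if hash&newbit != 0 { // the same test evacuate applies
			useY = 1
		}
		newBucket := oldBucket + useY*newbit // x keeps the index, y adds newbit
		fmt.Println(oldBucket, "->", newBucket)
	}
}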
- // If they are still referenced by an iterator, - // then the iterator holds a pointers to the slice. - if h.extra != nil { - h.extra.overflow[1] = nil - } - h.flags &^= sameSizeGrow + advanceEvacuationMark(h, t, newbit) + } +} + +func advanceEvacuationMark(h *hmap, t *maptype, newbit uintptr) { + h.nevacuate++ + // Experiments suggest that 1024 is overkill by at least an order of magnitude. + // Put it in there as a safeguard anyway, to ensure O(1) behavior. + stop := h.nevacuate + 1024 + if stop > newbit { + stop = newbit + } + for h.nevacuate != stop && bucketEvacuated(t, h, h.nevacuate) { + h.nevacuate++ + } + if h.nevacuate == newbit { // newbit == # of oldbuckets + // Growing is all done. Free old main bucket array. + h.oldbuckets = nil + // Can discard old overflow buckets as well. + // If they are still referenced by an iterator, + // then the iterator holds a pointers to the slice. + if h.extra != nil { + h.extra.oldoverflow = nil } + h.flags &^= sameSizeGrow } } @@ -1210,7 +1149,45 @@ func ismapkey(t *_type) bool { //go:linkname reflect_makemap reflect.makemap func reflect_makemap(t *maptype, cap int) *hmap { - return makemap(t, int64(cap), nil, nil) + // Check invariants and reflects math. + if sz := unsafe.Sizeof(hmap{}); sz != t.hmap.size { + println("runtime: sizeof(hmap) =", sz, ", t.hmap.size =", t.hmap.size) + throw("bad hmap size") + } + if !ismapkey(t.key) { + throw("runtime.reflect_makemap: unsupported map key type") + } + if t.key.size > maxKeySize && (!t.indirectkey || t.keysize != uint8(sys.PtrSize)) || + t.key.size <= maxKeySize && (t.indirectkey || t.keysize != uint8(t.key.size)) { + throw("key size wrong") + } + if t.elem.size > maxValueSize && (!t.indirectvalue || t.valuesize != uint8(sys.PtrSize)) || + t.elem.size <= maxValueSize && (t.indirectvalue || t.valuesize != uint8(t.elem.size)) { + throw("value size wrong") + } + if t.key.align > bucketCnt { + throw("key align too big") + } + if t.elem.align > bucketCnt { + throw("value align too big") + } + if t.key.size%uintptr(t.key.align) != 0 { + throw("key size not a multiple of key align") + } + if t.elem.size%uintptr(t.elem.align) != 0 { + throw("value size not a multiple of value align") + } + if bucketCnt < 8 { + throw("bucketsize too small for proper alignment") + } + if dataOffset%uintptr(t.key.align) != 0 { + throw("need padding in bucket (key)") + } + if dataOffset%uintptr(t.elem.align) != 0 { + throw("need padding in bucket (value)") + } + + return makemap(t, cap, nil) } //go:linkname reflect_mapaccess reflect.mapaccess @@ -1257,7 +1234,7 @@ func reflect_maplen(h *hmap) int { return 0 } if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&h)) + callerpc := getcallerpc() racereadpc(unsafe.Pointer(h), callerpc, funcPC(reflect_maplen)) } return h.count diff --git a/src/runtime/hashmap_fast.go b/src/runtime/hashmap_fast.go index 67b9787909..2de381412b 100644 --- a/src/runtime/hashmap_fast.go +++ b/src/runtime/hashmap_fast.go @@ -11,7 +11,7 @@ import ( func mapaccess1_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess1_fast32)) } if h == nil || h.count == 0 { @@ -26,7 +26,7 @@ func mapaccess1_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer { b = (*bmap)(h.buckets) } else { hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) - m := uintptr(1)<<h.B - 1 + m := bucketMask(h.B) b = (*bmap)(add(h.buckets, 
(hash&m)*uintptr(t.bucketsize))) if c := h.oldbuckets; c != nil { if !h.sameSizeGrow() { @@ -39,28 +39,19 @@ func mapaccess1_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer { } } } - for { - for i := uintptr(0); i < bucketCnt; i++ { - k := *((*uint32)(add(unsafe.Pointer(b), dataOffset+i*4))) - if k != key { - continue + for ; b != nil; b = b.overflow(t) { + for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 4) { + if *(*uint32)(k) == key && b.tophash[i] != empty { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize)) } - x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check - if x == empty { - continue - } - return add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize)) - } - b = b.overflow(t) - if b == nil { - return unsafe.Pointer(&zeroVal[0]) } } + return unsafe.Pointer(&zeroVal[0]) } func mapaccess2_fast32(t *maptype, h *hmap, key uint32) (unsafe.Pointer, bool) { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess2_fast32)) } if h == nil || h.count == 0 { @@ -75,7 +66,7 @@ func mapaccess2_fast32(t *maptype, h *hmap, key uint32) (unsafe.Pointer, bool) { b = (*bmap)(h.buckets) } else { hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) - m := uintptr(1)<<h.B - 1 + m := bucketMask(h.B) b = (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) if c := h.oldbuckets; c != nil { if !h.sameSizeGrow() { @@ -88,28 +79,19 @@ func mapaccess2_fast32(t *maptype, h *hmap, key uint32) (unsafe.Pointer, bool) { } } } - for { - for i := uintptr(0); i < bucketCnt; i++ { - k := *((*uint32)(add(unsafe.Pointer(b), dataOffset+i*4))) - if k != key { - continue - } - x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check - if x == empty { - continue + for ; b != nil; b = b.overflow(t) { + for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 4) { + if *(*uint32)(k) == key && b.tophash[i] != empty { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize)), true } - return add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize)), true - } - b = b.overflow(t) - if b == nil { - return unsafe.Pointer(&zeroVal[0]), false } } + return unsafe.Pointer(&zeroVal[0]), false } func mapaccess1_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess1_fast64)) } if h == nil || h.count == 0 { @@ -124,7 +106,7 @@ func mapaccess1_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer { b = (*bmap)(h.buckets) } else { hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) - m := uintptr(1)<<h.B - 1 + m := bucketMask(h.B) b = (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) if c := h.oldbuckets; c != nil { if !h.sameSizeGrow() { @@ -137,28 +119,19 @@ func mapaccess1_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer { } } } - for { - for i := uintptr(0); i < bucketCnt; i++ { - k := *((*uint64)(add(unsafe.Pointer(b), dataOffset+i*8))) - if k != key { - continue + for ; b != nil; b = b.overflow(t) { + for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 8) { + if *(*uint64)(k) == key && b.tophash[i] != empty { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.valuesize)) } - x := *((*uint8)(add(unsafe.Pointer(b), 
i))) // b.tophash[i] without the bounds check - if x == empty { - continue - } - return add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.valuesize)) - } - b = b.overflow(t) - if b == nil { - return unsafe.Pointer(&zeroVal[0]) } } + return unsafe.Pointer(&zeroVal[0]) } func mapaccess2_fast64(t *maptype, h *hmap, key uint64) (unsafe.Pointer, bool) { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess2_fast64)) } if h == nil || h.count == 0 { @@ -173,7 +146,7 @@ func mapaccess2_fast64(t *maptype, h *hmap, key uint64) (unsafe.Pointer, bool) { b = (*bmap)(h.buckets) } else { hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) - m := uintptr(1)<<h.B - 1 + m := bucketMask(h.B) b = (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) if c := h.oldbuckets; c != nil { if !h.sameSizeGrow() { @@ -186,28 +159,19 @@ func mapaccess2_fast64(t *maptype, h *hmap, key uint64) (unsafe.Pointer, bool) { } } } - for { - for i := uintptr(0); i < bucketCnt; i++ { - k := *((*uint64)(add(unsafe.Pointer(b), dataOffset+i*8))) - if k != key { - continue - } - x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check - if x == empty { - continue + for ; b != nil; b = b.overflow(t) { + for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 8) { + if *(*uint64)(k) == key && b.tophash[i] != empty { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.valuesize)), true } - return add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.valuesize)), true - } - b = b.overflow(t) - if b == nil { - return unsafe.Pointer(&zeroVal[0]), false } } + return unsafe.Pointer(&zeroVal[0]), false } func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess1_faststr)) } if h == nil || h.count == 0 { @@ -222,13 +186,9 @@ func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer { b := (*bmap)(h.buckets) if key.len < 32 { // short key, doing lots of comparisons is ok - for i := uintptr(0); i < bucketCnt; i++ { - x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check - if x == empty { - continue - } - k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize)) - if k.len != key.len { + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*sys.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || b.tophash[i] == empty { continue } if k.str == key.str || memequal(k.str, key.str, uintptr(key.len)) { @@ -239,13 +199,9 @@ func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer { } // long key, try not to do more comparisons than necessary keymaybe := uintptr(bucketCnt) - for i := uintptr(0); i < bucketCnt; i++ { - x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check - if x == empty { - continue - } - k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize)) - if k.len != key.len { + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*sys.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || b.tophash[i] == empty { continue } if k.str == key.str { @@ -275,7 +231,7 @@ func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer { } dohash: hash := 
t.key.alg.hash(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0)) - m := uintptr(1)<<h.B - 1 + m := bucketMask(h.B) b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) if c := h.oldbuckets; c != nil { if !h.sameSizeGrow() { @@ -287,34 +243,24 @@ dohash: b = oldb } } - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - for { - for i := uintptr(0); i < bucketCnt; i++ { - x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check - if x != top { - continue - } - k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize)) - if k.len != key.len { + top := tophash(hash) + for ; b != nil; b = b.overflow(t) { + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*sys.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || b.tophash[i] != top { continue } if k.str == key.str || memequal(k.str, key.str, uintptr(key.len)) { return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize)) } } - b = b.overflow(t) - if b == nil { - return unsafe.Pointer(&zeroVal[0]) - } } + return unsafe.Pointer(&zeroVal[0]) } func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess2_faststr)) } if h == nil || h.count == 0 { @@ -329,13 +275,9 @@ func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) { b := (*bmap)(h.buckets) if key.len < 32 { // short key, doing lots of comparisons is ok - for i := uintptr(0); i < bucketCnt; i++ { - x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check - if x == empty { - continue - } - k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize)) - if k.len != key.len { + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*sys.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || b.tophash[i] == empty { continue } if k.str == key.str || memequal(k.str, key.str, uintptr(key.len)) { @@ -346,13 +288,9 @@ func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) { } // long key, try not to do more comparisons than necessary keymaybe := uintptr(bucketCnt) - for i := uintptr(0); i < bucketCnt; i++ { - x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check - if x == empty { - continue - } - k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize)) - if k.len != key.len { + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*sys.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || b.tophash[i] == empty { continue } if k.str == key.str { @@ -382,7 +320,7 @@ func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) { } dohash: hash := t.key.alg.hash(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0)) - m := uintptr(1)<<h.B - 1 + m := bucketMask(h.B) b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) if c := h.oldbuckets; c != nil { if !h.sameSizeGrow() { @@ -394,37 +332,113 @@ dohash: b = oldb } } - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash + top := tophash(hash) + for ; b != nil; b = b.overflow(t) { + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*sys.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || b.tophash[i] != top { + continue + } + if k.str 
== key.str || memequal(k.str, key.str, uintptr(key.len)) { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize)), true + } + } + } + return unsafe.Pointer(&zeroVal[0]), false +} + +func mapassign_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer { + if h == nil { + panic(plainError("assignment to entry in nil map")) + } + if raceenabled { + callerpc := getcallerpc() + racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapassign_fast32)) + } + if h.flags&hashWriting != 0 { + throw("concurrent map writes") + } + hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) + + // Set hashWriting after calling alg.hash for consistency with mapassign. + h.flags |= hashWriting + + if h.buckets == nil { + h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) + } + +again: + bucket := hash & bucketMask(h.B) + if h.growing() { + growWork_fast32(t, h, bucket) } + b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) + + var insertb *bmap + var inserti uintptr + var insertk unsafe.Pointer + for { for i := uintptr(0); i < bucketCnt; i++ { - x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check - if x != top { + if b.tophash[i] == empty { + if insertb == nil { + inserti = i + insertb = b + } continue } - k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize)) - if k.len != key.len { + k := *((*uint32)(add(unsafe.Pointer(b), dataOffset+i*4))) + if k != key { continue } - if k.str == key.str || memequal(k.str, key.str, uintptr(key.len)) { - return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize)), true - } + inserti = i + insertb = b + goto done } - b = b.overflow(t) - if b == nil { - return unsafe.Pointer(&zeroVal[0]), false + ovf := b.overflow(t) + if ovf == nil { + break } + b = ovf + } + + // Did not find mapping for key. Allocate new cell & add entry. + + // If we hit the max load factor or we have too many overflow buckets, + // and we're not already in the middle of growing, start growing. + if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { + hashGrow(t, h) + goto again // Growing the table invalidates everything, so try again + } + + if insertb == nil { + // all current buckets are full, allocate a new one. 
+ insertb = h.newoverflow(t, b) + inserti = 0 // not necessary, but avoids needlessly spilling inserti } + insertb.tophash[inserti&(bucketCnt-1)] = tophash(hash) // mask inserti to avoid bounds checks + + insertk = add(unsafe.Pointer(insertb), dataOffset+inserti*4) + // store new key at insert position + *(*uint32)(insertk) = key + + h.count++ + +done: + val := add(unsafe.Pointer(insertb), dataOffset+bucketCnt*4+inserti*uintptr(t.valuesize)) + if h.flags&hashWriting == 0 { + throw("concurrent map writes") + } + h.flags &^= hashWriting + return val } -func mapassign_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer { +func mapassign_fast32ptr(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { if h == nil { panic(plainError("assignment to entry in nil map")) } if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapassign_fast32)) } if h.flags&hashWriting != 0 { @@ -436,38 +450,35 @@ func mapassign_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer { h.flags |= hashWriting if h.buckets == nil { - h.buckets = newarray(t.bucket, 1) + h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) } again: - bucket := hash & (uintptr(1)<<h.B - 1) + bucket := hash & bucketMask(h.B) if h.growing() { - growWork(t, h, bucket) + growWork_fast32(t, h, bucket) } b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - var inserti *uint8 + var insertb *bmap + var inserti uintptr var insertk unsafe.Pointer - var val unsafe.Pointer + for { for i := uintptr(0); i < bucketCnt; i++ { - if b.tophash[i] != top { - if b.tophash[i] == empty && inserti == nil { - inserti = &b.tophash[i] - insertk = add(unsafe.Pointer(b), dataOffset+i*4) - val = add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize)) + if b.tophash[i] == empty { + if insertb == nil { + inserti = i + insertb = b } continue } - k := *((*uint32)(add(unsafe.Pointer(b), dataOffset+i*4))) + k := *((*unsafe.Pointer)(add(unsafe.Pointer(b), dataOffset+i*4))) if k != key { continue } - val = add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize)) + inserti = i + insertb = b goto done } ovf := b.overflow(t) @@ -481,25 +492,26 @@ again: // If we hit the max load factor or we have too many overflow buckets, // and we're not already in the middle of growing, start growing. - if !h.growing() && (overLoadFactor(int64(h.count), h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { + if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { hashGrow(t, h) goto again // Growing the table invalidates everything, so try again } - if inserti == nil { + if insertb == nil { // all current buckets are full, allocate a new one. 
- newb := h.newoverflow(t, b) - inserti = &newb.tophash[0] - insertk = add(unsafe.Pointer(newb), dataOffset) - val = add(insertk, bucketCnt*4) + insertb = h.newoverflow(t, b) + inserti = 0 // not necessary, but avoids needlessly spilling inserti } + insertb.tophash[inserti&(bucketCnt-1)] = tophash(hash) // mask inserti to avoid bounds checks + + insertk = add(unsafe.Pointer(insertb), dataOffset+inserti*4) + // store new key at insert position + *(*unsafe.Pointer)(insertk) = key - // store new key/value at insert position - typedmemmove(t.key, insertk, unsafe.Pointer(&key)) - *inserti = top h.count++ done: + val := add(unsafe.Pointer(insertb), dataOffset+bucketCnt*4+inserti*uintptr(t.valuesize)) if h.flags&hashWriting == 0 { throw("concurrent map writes") } @@ -512,7 +524,7 @@ func mapassign_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer { panic(plainError("assignment to entry in nil map")) } if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapassign_fast64)) } if h.flags&hashWriting != 0 { @@ -524,30 +536,26 @@ func mapassign_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer { h.flags |= hashWriting if h.buckets == nil { - h.buckets = newarray(t.bucket, 1) + h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) } again: - bucket := hash & (uintptr(1)<<h.B - 1) + bucket := hash & bucketMask(h.B) if h.growing() { - growWork(t, h, bucket) + growWork_fast64(t, h, bucket) } b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - var inserti *uint8 + var insertb *bmap + var inserti uintptr var insertk unsafe.Pointer - var val unsafe.Pointer + for { for i := uintptr(0); i < bucketCnt; i++ { - if b.tophash[i] != top { - if b.tophash[i] == empty && inserti == nil { - inserti = &b.tophash[i] - insertk = add(unsafe.Pointer(b), dataOffset+i*8) - val = add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.valuesize)) + if b.tophash[i] == empty { + if insertb == nil { + insertb = b + inserti = i } continue } @@ -555,7 +563,8 @@ again: if k != key { continue } - val = add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.valuesize)) + insertb = b + inserti = i goto done } ovf := b.overflow(t) @@ -569,25 +578,26 @@ again: // If we hit the max load factor or we have too many overflow buckets, // and we're not already in the middle of growing, start growing. - if !h.growing() && (overLoadFactor(int64(h.count), h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { + if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { hashGrow(t, h) goto again // Growing the table invalidates everything, so try again } - if inserti == nil { + if insertb == nil { // all current buckets are full, allocate a new one. 
- newb := h.newoverflow(t, b) - inserti = &newb.tophash[0] - insertk = add(unsafe.Pointer(newb), dataOffset) - val = add(insertk, bucketCnt*8) + insertb = h.newoverflow(t, b) + inserti = 0 // not necessary, but avoids needlessly spilling inserti } + insertb.tophash[inserti&(bucketCnt-1)] = tophash(hash) // mask inserti to avoid bounds checks + + insertk = add(unsafe.Pointer(insertb), dataOffset+inserti*8) + // store new key at insert position + *(*uint64)(insertk) = key - // store new key/value at insert position - typedmemmove(t.key, insertk, unsafe.Pointer(&key)) - *inserti = top h.count++ done: + val := add(unsafe.Pointer(insertb), dataOffset+bucketCnt*8+inserti*uintptr(t.valuesize)) if h.flags&hashWriting == 0 { throw("concurrent map writes") } @@ -595,48 +605,131 @@ done: return val } -func mapassign_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer { +func mapassign_fast64ptr(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { if h == nil { panic(plainError("assignment to entry in nil map")) } if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&t)) - racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapassign_faststr)) + callerpc := getcallerpc() + racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapassign_fast64)) } if h.flags&hashWriting != 0 { throw("concurrent map writes") } - key := stringStructOf(&ky) - hash := t.key.alg.hash(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0)) + hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) // Set hashWriting after calling alg.hash for consistency with mapassign. h.flags |= hashWriting if h.buckets == nil { - h.buckets = newarray(t.bucket, 1) + h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) } again: - bucket := hash & (uintptr(1)<<h.B - 1) + bucket := hash & bucketMask(h.B) if h.growing() { - growWork(t, h, bucket) + growWork_fast64(t, h, bucket) } b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash + + var insertb *bmap + var inserti uintptr + var insertk unsafe.Pointer + + for { + for i := uintptr(0); i < bucketCnt; i++ { + if b.tophash[i] == empty { + if insertb == nil { + insertb = b + inserti = i + } + continue + } + k := *((*unsafe.Pointer)(add(unsafe.Pointer(b), dataOffset+i*8))) + if k != key { + continue + } + insertb = b + inserti = i + goto done + } + ovf := b.overflow(t) + if ovf == nil { + break + } + b = ovf } - var inserti *uint8 + // Did not find mapping for key. Allocate new cell & add entry. + + // If we hit the max load factor or we have too many overflow buckets, + // and we're not already in the middle of growing, start growing. + if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { + hashGrow(t, h) + goto again // Growing the table invalidates everything, so try again + } + + if insertb == nil { + // all current buckets are full, allocate a new one. 
+ insertb = h.newoverflow(t, b) + inserti = 0 // not necessary, but avoids needlessly spilling inserti + } + insertb.tophash[inserti&(bucketCnt-1)] = tophash(hash) // mask inserti to avoid bounds checks + + insertk = add(unsafe.Pointer(insertb), dataOffset+inserti*8) + // store new key at insert position + *(*unsafe.Pointer)(insertk) = key + + h.count++ + +done: + val := add(unsafe.Pointer(insertb), dataOffset+bucketCnt*8+inserti*uintptr(t.valuesize)) + if h.flags&hashWriting == 0 { + throw("concurrent map writes") + } + h.flags &^= hashWriting + return val +} + +func mapassign_faststr(t *maptype, h *hmap, s string) unsafe.Pointer { + if h == nil { + panic(plainError("assignment to entry in nil map")) + } + if raceenabled { + callerpc := getcallerpc() + racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapassign_faststr)) + } + if h.flags&hashWriting != 0 { + throw("concurrent map writes") + } + key := stringStructOf(&s) + hash := t.key.alg.hash(noescape(unsafe.Pointer(&s)), uintptr(h.hash0)) + + // Set hashWriting after calling alg.hash for consistency with mapassign. + h.flags |= hashWriting + + if h.buckets == nil { + h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) + } + +again: + bucket := hash & bucketMask(h.B) + if h.growing() { + growWork_faststr(t, h, bucket) + } + b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) + top := tophash(hash) + + var insertb *bmap + var inserti uintptr var insertk unsafe.Pointer - var val unsafe.Pointer + for { for i := uintptr(0); i < bucketCnt; i++ { if b.tophash[i] != top { - if b.tophash[i] == empty && inserti == nil { - inserti = &b.tophash[i] - insertk = add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize)) - val = add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.valuesize)) + if b.tophash[i] == empty && insertb == nil { + insertb = b + inserti = i } continue } @@ -648,7 +741,8 @@ again: continue } // already have a mapping for key. Update it. - val = add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize)) + inserti = i + insertb = b goto done } ovf := b.overflow(t) @@ -662,25 +756,25 @@ again: // If we hit the max load factor or we have too many overflow buckets, // and we're not already in the middle of growing, start growing. - if !h.growing() && (overLoadFactor(int64(h.count), h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { + if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { hashGrow(t, h) goto again // Growing the table invalidates everything, so try again } - if inserti == nil { + if insertb == nil { // all current buckets are full, allocate a new one. 
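The assign paths above index the tophash array as insertb.tophash[inserti&(bucketCnt-1)], with a comment that the mask exists to avoid bounds checks. A small illustration of that idiom, under the assumption (true for the runtime's maps) that bucketCnt is a power of two:

// Sketch only: with a power-of-two array length, idx & (len-1) is provably in
// range, so the compiler can drop the bounds check; the map code above leans
// on this with inserti&(bucketCnt-1) and dst.i&(bucketCnt-1).
package main

import "fmt"

const bucketCnt = 8 // 8 cells per bucket in the runtime's map implementation

func main() {
	var tophash [bucketCnt]uint8
	i := uintptr(13)                // any value, possibly >= bucketCnt
	tophash[i&(bucketCnt-1)] = 0xab // masked index, no bounds check needed
	fmt.Println(tophash)
}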
- newb := h.newoverflow(t, b) - inserti = &newb.tophash[0] - insertk = add(unsafe.Pointer(newb), dataOffset) - val = add(insertk, bucketCnt*2*sys.PtrSize) + insertb = h.newoverflow(t, b) + inserti = 0 // not necessary, but avoids needlessly spilling inserti } + insertb.tophash[inserti&(bucketCnt-1)] = top // mask inserti to avoid bounds checks - // store new key/value at insert position + insertk = add(unsafe.Pointer(insertb), dataOffset+inserti*2*sys.PtrSize) + // store new key at insert position *((*stringStruct)(insertk)) = *key - *inserti = top h.count++ done: + val := add(unsafe.Pointer(insertb), dataOffset+bucketCnt*2*sys.PtrSize+inserti*uintptr(t.valuesize)) if h.flags&hashWriting == 0 { throw("concurrent map writes") } @@ -690,7 +784,7 @@ done: func mapdelete_fast32(t *maptype, h *hmap, key uint32) { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapdelete_fast32)) } if h == nil || h.count == 0 { @@ -705,38 +799,32 @@ func mapdelete_fast32(t *maptype, h *hmap, key uint32) { // Set hashWriting after calling alg.hash for consistency with mapdelete h.flags |= hashWriting - bucket := hash & (uintptr(1)<<h.B - 1) + bucket := hash & bucketMask(h.B) if h.growing() { - growWork(t, h, bucket) - } - b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash + growWork_fast32(t, h, bucket) } - for { - for i := uintptr(0); i < bucketCnt; i++ { - if b.tophash[i] != top { + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) +search: + for ; b != nil; b = b.overflow(t) { + for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 4) { + if key != *(*uint32)(k) || b.tophash[i] == empty { continue } - k := (*uint32)(add(unsafe.Pointer(b), dataOffset+i*4)) - if key != *k { - continue + // Only clear key if there are pointers in it. + if t.key.kind&kindNoPointers == 0 { + memclrHasPointers(k, t.key.size) + } + // Only clear value if there are pointers in it. 
+ if t.elem.kind&kindNoPointers == 0 { + v := add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize)) + memclrHasPointers(v, t.elem.size) } - typedmemclr(t.key, unsafe.Pointer(k)) - v := unsafe.Pointer(uintptr(unsafe.Pointer(b)) + dataOffset + bucketCnt*4 + i*uintptr(t.valuesize)) - typedmemclr(t.elem, v) b.tophash[i] = empty h.count-- - goto done - } - b = b.overflow(t) - if b == nil { - goto done + break search } } -done: if h.flags&hashWriting == 0 { throw("concurrent map writes") } @@ -745,7 +833,7 @@ done: func mapdelete_fast64(t *maptype, h *hmap, key uint64) { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapdelete_fast64)) } if h == nil || h.count == 0 { @@ -760,38 +848,32 @@ func mapdelete_fast64(t *maptype, h *hmap, key uint64) { // Set hashWriting after calling alg.hash for consistency with mapdelete h.flags |= hashWriting - bucket := hash & (uintptr(1)<<h.B - 1) + bucket := hash & bucketMask(h.B) if h.growing() { - growWork(t, h, bucket) + growWork_fast64(t, h, bucket) } - b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - for { - for i := uintptr(0); i < bucketCnt; i++ { - if b.tophash[i] != top { + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) +search: + for ; b != nil; b = b.overflow(t) { + for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 8) { + if key != *(*uint64)(k) || b.tophash[i] == empty { continue } - k := (*uint64)(add(unsafe.Pointer(b), dataOffset+i*8)) - if key != *k { - continue + // Only clear key if there are pointers in it. + if t.key.kind&kindNoPointers == 0 { + memclrHasPointers(k, t.key.size) + } + // Only clear value if there are pointers in it. 
+ if t.elem.kind&kindNoPointers == 0 { + v := add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.valuesize)) + memclrHasPointers(v, t.elem.size) } - typedmemclr(t.key, unsafe.Pointer(k)) - v := unsafe.Pointer(uintptr(unsafe.Pointer(b)) + dataOffset + bucketCnt*8 + i*uintptr(t.valuesize)) - typedmemclr(t.elem, v) b.tophash[i] = empty h.count-- - goto done - } - b = b.overflow(t) - if b == nil { - goto done + break search } } -done: if h.flags&hashWriting == 0 { throw("concurrent map writes") } @@ -800,7 +882,7 @@ done: func mapdelete_faststr(t *maptype, h *hmap, ky string) { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapdelete_faststr)) } if h == nil || h.count == 0 { @@ -816,43 +898,340 @@ func mapdelete_faststr(t *maptype, h *hmap, ky string) { // Set hashWriting after calling alg.hash for consistency with mapdelete h.flags |= hashWriting - bucket := hash & (uintptr(1)<<h.B - 1) + bucket := hash & bucketMask(h.B) if h.growing() { - growWork(t, h, bucket) - } - b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash + growWork_faststr(t, h, bucket) } - for { - for i := uintptr(0); i < bucketCnt; i++ { - if b.tophash[i] != top { - continue - } - k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize)) - if k.len != key.len { + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) + top := tophash(hash) +search: + for ; b != nil; b = b.overflow(t) { + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*sys.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || b.tophash[i] != top { continue } if k.str != key.str && !memequal(k.str, key.str, uintptr(key.len)) { continue } - typedmemclr(t.key, unsafe.Pointer(k)) - v := unsafe.Pointer(uintptr(unsafe.Pointer(b)) + dataOffset + bucketCnt*2*sys.PtrSize + i*uintptr(t.valuesize)) - typedmemclr(t.elem, v) + // Clear key's pointer. + k.str = nil + // Only clear value if there are pointers in it. + if t.elem.kind&kindNoPointers == 0 { + v := add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize)) + memclrHasPointers(v, t.elem.size) + } b.tophash[i] = empty h.count-- - goto done - } - b = b.overflow(t) - if b == nil { - goto done + break search } } -done: if h.flags&hashWriting == 0 { throw("concurrent map writes") } h.flags &^= hashWriting } + +func growWork_fast32(t *maptype, h *hmap, bucket uintptr) { + // make sure we evacuate the oldbucket corresponding + // to the bucket we're about to use + evacuate_fast32(t, h, bucket&h.oldbucketmask()) + + // evacuate one more oldbucket to make progress on growing + if h.growing() { + evacuate_fast32(t, h, h.nevacuate) + } +} + +func evacuate_fast32(t *maptype, h *hmap, oldbucket uintptr) { + b := (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize))) + newbit := h.noldbuckets() + if !evacuated(b) { + // TODO: reuse overflow buckets instead of using new ones, if there + // is no iterator using the old buckets. (If !oldIterator.) + + // xy contains the x and y (low and high) evacuation destinations. + var xy [2]evacDst + x := &xy[0] + x.b = (*bmap)(add(h.buckets, oldbucket*uintptr(t.bucketsize))) + x.k = add(unsafe.Pointer(x.b), dataOffset) + x.v = add(x.k, bucketCnt*4) + + if !h.sameSizeGrow() { + // Only calculate y pointers if we're growing bigger. + // Otherwise GC can see bad pointers. 
+ y := &xy[1] + y.b = (*bmap)(add(h.buckets, (oldbucket+newbit)*uintptr(t.bucketsize))) + y.k = add(unsafe.Pointer(y.b), dataOffset) + y.v = add(y.k, bucketCnt*4) + } + + for ; b != nil; b = b.overflow(t) { + k := add(unsafe.Pointer(b), dataOffset) + v := add(k, bucketCnt*4) + for i := 0; i < bucketCnt; i, k, v = i+1, add(k, 4), add(v, uintptr(t.valuesize)) { + top := b.tophash[i] + if top == empty { + b.tophash[i] = evacuatedEmpty + continue + } + if top < minTopHash { + throw("bad map state") + } + var useY uint8 + if !h.sameSizeGrow() { + // Compute hash to make our evacuation decision (whether we need + // to send this key/value to bucket x or bucket y). + hash := t.key.alg.hash(k, uintptr(h.hash0)) + if hash&newbit != 0 { + useY = 1 + } + } + + b.tophash[i] = evacuatedX + useY // evacuatedX + 1 == evacuatedY, enforced in makemap + dst := &xy[useY] // evacuation destination + + if dst.i == bucketCnt { + dst.b = h.newoverflow(t, dst.b) + dst.i = 0 + dst.k = add(unsafe.Pointer(dst.b), dataOffset) + dst.v = add(dst.k, bucketCnt*4) + } + dst.b.tophash[dst.i&(bucketCnt-1)] = top // mask dst.i as an optimization, to avoid a bounds check + + // Copy key. + if sys.PtrSize == 4 && t.key.kind&kindNoPointers == 0 && writeBarrier.enabled { + writebarrierptr((*uintptr)(dst.k), *(*uintptr)(k)) + } else { + *(*uint32)(dst.k) = *(*uint32)(k) + } + + typedmemmove(t.elem, dst.v, v) + dst.i++ + // These updates might push these pointers past the end of the + // key or value arrays. That's ok, as we have the overflow pointer + // at the end of the bucket to protect against pointing past the + // end of the bucket. + dst.k = add(dst.k, 4) + dst.v = add(dst.v, uintptr(t.valuesize)) + } + } + // Unlink the overflow buckets & clear key/value to help GC. + if h.flags&oldIterator == 0 && t.bucket.kind&kindNoPointers == 0 { + b := add(h.oldbuckets, oldbucket*uintptr(t.bucketsize)) + // Preserve b.tophash because the evacuation + // state is maintained there. + ptr := add(b, dataOffset) + n := uintptr(t.bucketsize) - dataOffset + memclrHasPointers(ptr, n) + } + } + + if oldbucket == h.nevacuate { + advanceEvacuationMark(h, t, newbit) + } +} + +func growWork_fast64(t *maptype, h *hmap, bucket uintptr) { + // make sure we evacuate the oldbucket corresponding + // to the bucket we're about to use + evacuate_fast64(t, h, bucket&h.oldbucketmask()) + + // evacuate one more oldbucket to make progress on growing + if h.growing() { + evacuate_fast64(t, h, h.nevacuate) + } +} + +func evacuate_fast64(t *maptype, h *hmap, oldbucket uintptr) { + b := (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize))) + newbit := h.noldbuckets() + if !evacuated(b) { + // TODO: reuse overflow buckets instead of using new ones, if there + // is no iterator using the old buckets. (If !oldIterator.) + + // xy contains the x and y (low and high) evacuation destinations. + var xy [2]evacDst + x := &xy[0] + x.b = (*bmap)(add(h.buckets, oldbucket*uintptr(t.bucketsize))) + x.k = add(unsafe.Pointer(x.b), dataOffset) + x.v = add(x.k, bucketCnt*8) + + if !h.sameSizeGrow() { + // Only calculate y pointers if we're growing bigger. + // Otherwise GC can see bad pointers. 
+ y := &xy[1] + y.b = (*bmap)(add(h.buckets, (oldbucket+newbit)*uintptr(t.bucketsize))) + y.k = add(unsafe.Pointer(y.b), dataOffset) + y.v = add(y.k, bucketCnt*8) + } + + for ; b != nil; b = b.overflow(t) { + k := add(unsafe.Pointer(b), dataOffset) + v := add(k, bucketCnt*8) + for i := 0; i < bucketCnt; i, k, v = i+1, add(k, 8), add(v, uintptr(t.valuesize)) { + top := b.tophash[i] + if top == empty { + b.tophash[i] = evacuatedEmpty + continue + } + if top < minTopHash { + throw("bad map state") + } + var useY uint8 + if !h.sameSizeGrow() { + // Compute hash to make our evacuation decision (whether we need + // to send this key/value to bucket x or bucket y). + hash := t.key.alg.hash(k, uintptr(h.hash0)) + if hash&newbit != 0 { + useY = 1 + } + } + + b.tophash[i] = evacuatedX + useY // evacuatedX + 1 == evacuatedY, enforced in makemap + dst := &xy[useY] // evacuation destination + + if dst.i == bucketCnt { + dst.b = h.newoverflow(t, dst.b) + dst.i = 0 + dst.k = add(unsafe.Pointer(dst.b), dataOffset) + dst.v = add(dst.k, bucketCnt*8) + } + dst.b.tophash[dst.i&(bucketCnt-1)] = top // mask dst.i as an optimization, to avoid a bounds check + + // Copy key. + if t.key.kind&kindNoPointers == 0 && writeBarrier.enabled { + if sys.PtrSize == 8 { + writebarrierptr((*uintptr)(dst.k), *(*uintptr)(k)) + } else { + // There are three ways to squeeze at least one 32 bit pointer into 64 bits. + // Give up and call typedmemmove. + typedmemmove(t.key, dst.k, k) + } + } else { + *(*uint64)(dst.k) = *(*uint64)(k) + } + + typedmemmove(t.elem, dst.v, v) + dst.i++ + // These updates might push these pointers past the end of the + // key or value arrays. That's ok, as we have the overflow pointer + // at the end of the bucket to protect against pointing past the + // end of the bucket. + dst.k = add(dst.k, 8) + dst.v = add(dst.v, uintptr(t.valuesize)) + } + } + // Unlink the overflow buckets & clear key/value to help GC. + if h.flags&oldIterator == 0 && t.bucket.kind&kindNoPointers == 0 { + b := add(h.oldbuckets, oldbucket*uintptr(t.bucketsize)) + // Preserve b.tophash because the evacuation + // state is maintained there. + ptr := add(b, dataOffset) + n := uintptr(t.bucketsize) - dataOffset + memclrHasPointers(ptr, n) + } + } + + if oldbucket == h.nevacuate { + advanceEvacuationMark(h, t, newbit) + } +} + +func growWork_faststr(t *maptype, h *hmap, bucket uintptr) { + // make sure we evacuate the oldbucket corresponding + // to the bucket we're about to use + evacuate_faststr(t, h, bucket&h.oldbucketmask()) + + // evacuate one more oldbucket to make progress on growing + if h.growing() { + evacuate_faststr(t, h, h.nevacuate) + } +} + +func evacuate_faststr(t *maptype, h *hmap, oldbucket uintptr) { + b := (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize))) + newbit := h.noldbuckets() + if !evacuated(b) { + // TODO: reuse overflow buckets instead of using new ones, if there + // is no iterator using the old buckets. (If !oldIterator.) + + // xy contains the x and y (low and high) evacuation destinations. + var xy [2]evacDst + x := &xy[0] + x.b = (*bmap)(add(h.buckets, oldbucket*uintptr(t.bucketsize))) + x.k = add(unsafe.Pointer(x.b), dataOffset) + x.v = add(x.k, bucketCnt*2*sys.PtrSize) + + if !h.sameSizeGrow() { + // Only calculate y pointers if we're growing bigger. + // Otherwise GC can see bad pointers. 
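The evacuate_fast32/fast64/faststr specializations above pick between the x and y destinations with hash&newbit, where newbit is the old bucket count. A standalone illustration of that decision, with hypothetical hash values:

// Sketch only: when a map doubles, a key in old bucket i lands in new bucket i
// ("x") or i+oldBucketCount ("y") according to the newly significant hash bit,
// which is what hash&newbit tests in the evacuate_* functions above.
package main

import "fmt"

func main() {
	const oldB = 3               // the old table had 2^3 = 8 buckets
	newbit := uintptr(1) << oldB // bit that becomes part of the mask after growing
	for _, hash := range []uintptr{0x12, 0x1a, 0x3f, 0x07} {
		oldBucket := hash & (newbit - 1)
		newBucket := oldBucket
		if hash&newbit != 0 { // "useY" in the runtime code
			newBucket += newbit
		}
		fmt.Printf("hash %#02x: old bucket %d -> new bucket %d\n", hash, oldBucket, newBucket)
	}
}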
+ y := &xy[1] + y.b = (*bmap)(add(h.buckets, (oldbucket+newbit)*uintptr(t.bucketsize))) + y.k = add(unsafe.Pointer(y.b), dataOffset) + y.v = add(y.k, bucketCnt*2*sys.PtrSize) + } + + for ; b != nil; b = b.overflow(t) { + k := add(unsafe.Pointer(b), dataOffset) + v := add(k, bucketCnt*2*sys.PtrSize) + for i := 0; i < bucketCnt; i, k, v = i+1, add(k, 2*sys.PtrSize), add(v, uintptr(t.valuesize)) { + top := b.tophash[i] + if top == empty { + b.tophash[i] = evacuatedEmpty + continue + } + if top < minTopHash { + throw("bad map state") + } + var useY uint8 + if !h.sameSizeGrow() { + // Compute hash to make our evacuation decision (whether we need + // to send this key/value to bucket x or bucket y). + hash := t.key.alg.hash(k, uintptr(h.hash0)) + if hash&newbit != 0 { + useY = 1 + } + } + + b.tophash[i] = evacuatedX + useY // evacuatedX + 1 == evacuatedY, enforced in makemap + dst := &xy[useY] // evacuation destination + + if dst.i == bucketCnt { + dst.b = h.newoverflow(t, dst.b) + dst.i = 0 + dst.k = add(unsafe.Pointer(dst.b), dataOffset) + dst.v = add(dst.k, bucketCnt*2*sys.PtrSize) + } + dst.b.tophash[dst.i&(bucketCnt-1)] = top // mask dst.i as an optimization, to avoid a bounds check + + // Copy key. + *(*string)(dst.k) = *(*string)(k) + + typedmemmove(t.elem, dst.v, v) + dst.i++ + // These updates might push these pointers past the end of the + // key or value arrays. That's ok, as we have the overflow pointer + // at the end of the bucket to protect against pointing past the + // end of the bucket. + dst.k = add(dst.k, 2*sys.PtrSize) + dst.v = add(dst.v, uintptr(t.valuesize)) + } + } + // Unlink the overflow buckets & clear key/value to help GC. + // Unlink the overflow buckets & clear key/value to help GC. + if h.flags&oldIterator == 0 && t.bucket.kind&kindNoPointers == 0 { + b := add(h.oldbuckets, oldbucket*uintptr(t.bucketsize)) + // Preserve b.tophash because the evacuation + // state is maintained there. + ptr := add(b, dataOffset) + n := uintptr(t.bucketsize) - dataOffset + memclrHasPointers(ptr, n) + } + } + + if oldbucket == h.nevacuate { + advanceEvacuationMark(h, t, newbit) + } +} diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go index 35f6124643..2b51758ae1 100644 --- a/src/runtime/heapdump.go +++ b/src/runtime/heapdump.go @@ -200,7 +200,6 @@ func dumptype(t *_type) { // dump an object func dumpobj(obj unsafe.Pointer, size uintptr, bv bitvector) { - dumpbvtypes(&bv, obj) dumpint(tagObject) dumpint(uint64(uintptr(obj))) dumpmemrange(obj, size) @@ -261,14 +260,9 @@ func dumpframe(s *stkframe, arg unsafe.Pointer) bool { } stkmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps)) - // Dump any types we will need to resolve Efaces. 
- if child.args.n >= 0 { - dumpbvtypes(&child.args, unsafe.Pointer(s.sp+child.argoff)) - } var bv bitvector if stkmap != nil && stkmap.n > 0 { bv = stackmapdata(stkmap, pcdata) - dumpbvtypes(&bv, unsafe.Pointer(s.varp-uintptr(bv.n*sys.PtrSize))) } else { bv.n = -1 } @@ -423,14 +417,12 @@ func finq_callback(fn *funcval, obj unsafe.Pointer, nret uintptr, fint *_type, o func dumproots() { // TODO(mwhudson): dump datamask etc from all objects // data segment - dumpbvtypes(&firstmoduledata.gcdatamask, unsafe.Pointer(firstmoduledata.data)) dumpint(tagData) dumpint(uint64(firstmoduledata.data)) dumpmemrange(unsafe.Pointer(firstmoduledata.data), firstmoduledata.edata-firstmoduledata.data) dumpfields(firstmoduledata.gcdatamask) // bss segment - dumpbvtypes(&firstmoduledata.gcbssmask, unsafe.Pointer(firstmoduledata.bss)) dumpint(tagBSS) dumpint(uint64(firstmoduledata.bss)) dumpmemrange(unsafe.Pointer(firstmoduledata.bss), firstmoduledata.ebss-firstmoduledata.bss) @@ -677,16 +669,6 @@ func dumpfields(bv bitvector) { dumpint(fieldKindEol) } -// The heap dump reader needs to be able to disambiguate -// Eface entries. So it needs to know every type that might -// appear in such an entry. The following routine accomplishes that. -// TODO(rsc, khr): Delete - no longer possible. - -// Dump all the types that appear in the type field of -// any Eface described by this bit vector. -func dumpbvtypes(bv *bitvector, base unsafe.Pointer) { -} - func makeheapobjbv(p uintptr, size uintptr) bitvector { // Extend the temp buffer if necessary. nptr := size / sys.PtrSize diff --git a/src/runtime/iface.go b/src/runtime/iface.go index 58ed61e3aa..7c5d3a05b2 100644 --- a/src/runtime/iface.go +++ b/src/runtime/iface.go @@ -10,21 +10,24 @@ import ( "unsafe" ) -const ( - hashSize = 1009 -) +const itabInitSize = 512 var ( - ifaceLock mutex // lock for accessing hash - hash [hashSize]*itab + itabLock mutex // lock for accessing itab table + itabTable = &itabTableInit // pointer to current table + itabTableInit = itabTableType{size: itabInitSize} // starter table ) -func itabhash(inter *interfacetype, typ *_type) uint32 { +//Note: change the formula in the mallocgc call in itabAdd if you change these fields. +type itabTableType struct { + size uintptr // length of entries array. Always a power of 2. + count uintptr // current number of filled entries. + entries [itabInitSize]*itab // really [size] large +} + +func itabHashFunc(inter *interfacetype, typ *_type) uintptr { // compiler has provided some good hash codes for us. - h := inter.typ.hash - h += 17 * typ.hash - // TODO(rsc): h += 23 * x.mhash ? - return h % hashSize + return uintptr(inter.typ.hash ^ typ.hash) } func getitab(inter *interfacetype, typ *_type, canfail bool) *itab { @@ -41,50 +44,137 @@ func getitab(inter *interfacetype, typ *_type, canfail bool) *itab { panic(&TypeAssertionError{"", typ.string(), inter.typ.string(), name.name()}) } - h := itabhash(inter, typ) - - // look twice - once without lock, once with. - // common case will be no lock contention. var m *itab - var locked int - for locked = 0; locked < 2; locked++ { - if locked != 0 { - lock(&ifaceLock) - } - for m = (*itab)(atomic.Loadp(unsafe.Pointer(&hash[h]))); m != nil; m = m.link { - if m.inter == inter && m._type == typ { - if m.bad { - if !canfail { - // this can only happen if the conversion - // was already done once using the , ok form - // and we have a cached negative result. 
- // the cached result doesn't record which - // interface function was missing, so try - // adding the itab again, which will throw an error. - additab(m, locked != 0, false) - } - m = nil - } - if locked != 0 { - unlock(&ifaceLock) - } - return m - } - } + + // First, look in the existing table to see if we can find the itab we need. + // This is by far the most common case, so do it without locks. + // Use atomic to ensure we see any previous writes done by the thread + // that updates the itabTable field (with atomic.Storep in itabAdd). + t := (*itabTableType)(atomic.Loadp(unsafe.Pointer(&itabTable))) + if m = t.find(inter, typ); m != nil { + goto finish } + // Not found. Grab the lock and try again. + lock(&itabLock) + if m = itabTable.find(inter, typ); m != nil { + unlock(&itabLock) + goto finish + } + + // Entry doesn't exist yet. Make a new entry & add it. m = (*itab)(persistentalloc(unsafe.Sizeof(itab{})+uintptr(len(inter.mhdr)-1)*sys.PtrSize, 0, &memstats.other_sys)) m.inter = inter m._type = typ - additab(m, true, canfail) - unlock(&ifaceLock) - if m.bad { + m.init() + itabAdd(m) + unlock(&itabLock) +finish: + if m.fun[0] != 0 { + return m + } + if canfail { return nil } - return m + // this can only happen if the conversion + // was already done once using the , ok form + // and we have a cached negative result. + // The cached result doesn't record which + // interface function was missing, so initialize + // the itab again to get the missing function name. + panic(&TypeAssertionError{concreteString: typ.string(), assertedString: inter.typ.string(), missingMethod: m.init()}) +} + +// find finds the given interface/type pair in t. +// Returns nil if the given interface/type pair isn't present. +func (t *itabTableType) find(inter *interfacetype, typ *_type) *itab { + // Implemented using quadratic probing. + // Probe sequence is h(i) = h0 + i*(i+1)/2 mod 2^k. + // We're guaranteed to hit all table entries using this probe sequence. + mask := t.size - 1 + h := itabHashFunc(inter, typ) & mask + for i := uintptr(1); ; i++ { + p := (**itab)(add(unsafe.Pointer(&t.entries), h*sys.PtrSize)) + // Use atomic read here so if we see m != nil, we also see + // the initializations of the fields of m. + // m := *p + m := (*itab)(atomic.Loadp(unsafe.Pointer(p))) + if m == nil { + return nil + } + if m.inter == inter && m._type == typ { + return m + } + h += i + h &= mask + } +} + +// itabAdd adds the given itab to the itab hash table. +// itabLock must be held. +func itabAdd(m *itab) { + t := itabTable + if t.count >= 3*(t.size/4) { // 75% load factor + // Grow hash table. + // t2 = new(itabTableType) + some additional entries + // We lie and tell malloc we want pointer-free memory because + // all the pointed-to values are not in the heap. + t2 := (*itabTableType)(mallocgc((2+2*t.size)*sys.PtrSize, nil, true)) + t2.size = t.size * 2 + + // Copy over entries. + // Note: while copying, other threads may look for an itab and + // fail to find it. That's ok, they will then try to get the itab lock + // and as a consequence wait until this copying is complete. + iterate_itabs(t2.add) + if t2.count != t.count { + throw("mismatched count during itab table copy") + } + // Publish new hash table. Use an atomic write: see comment in getitab. + atomicstorep(unsafe.Pointer(&itabTable), unsafe.Pointer(t2)) + // Adopt the new table as our own. + t = itabTable + // Note: the old table can be GC'ed here. 
+ } + t.add(m) } -func additab(m *itab, locked, canfail bool) { +// add adds the given itab to itab table t. +// itabLock must be held. +func (t *itabTableType) add(m *itab) { + // See comment in find about the probe sequence. + // Insert new itab in the first empty spot in the probe sequence. + mask := t.size - 1 + h := itabHashFunc(m.inter, m._type) & mask + for i := uintptr(1); ; i++ { + p := (**itab)(add(unsafe.Pointer(&t.entries), h*sys.PtrSize)) + m2 := *p + if m2 == m { + // A given itab may be used in more than one module + // and thanks to the way global symbol resolution works, the + // pointed-to itab may already have been inserted into the + // global 'hash'. + return + } + if m2 == nil { + // Use atomic write here so if a reader sees m, it also + // sees the correctly initialized fields of m. + // NoWB is ok because m is not in heap memory. + // *p = m + atomic.StorepNoWB(unsafe.Pointer(p), unsafe.Pointer(m)) + t.count++ + return + } + h += i + h &= mask + } +} + +// init fills in the m.fun array with all the code pointers for +// the m.inter/m._type pair. If the type does not implement the interface, +// it sets m.fun[0] to 0 and returns the name of an interface function that is missing. +// It is ok to call this multiple times on the same m, even concurrently. +func (m *itab) init() string { inter := m.inter typ := m._type x := typ.uncommon() @@ -97,6 +187,7 @@ func additab(m *itab, locked, canfail bool) { nt := int(x.mcount) xmhdr := (*[1 << 16]method)(add(unsafe.Pointer(x), uintptr(x.moff)))[:nt:nt] j := 0 +imethods: for k := 0; k < ni; k++ { i := &inter.mhdr[k] itype := inter.typ.typeOff(i.ityp) @@ -119,45 +210,26 @@ func additab(m *itab, locked, canfail bool) { ifn := typ.textOff(t.ifn) *(*unsafe.Pointer)(add(unsafe.Pointer(&m.fun[0]), uintptr(k)*sys.PtrSize)) = ifn } - goto nextimethod + continue imethods } } } // didn't find method - if !canfail { - if locked { - unlock(&ifaceLock) - } - panic(&TypeAssertionError{"", typ.string(), inter.typ.string(), iname}) - } - m.bad = true - break - nextimethod: + m.fun[0] = 0 + return iname } - if !locked { - throw("invalid itab locking") - } - h := itabhash(inter, typ) - m.link = hash[h] - m.inhash = true - atomicstorep(unsafe.Pointer(&hash[h]), unsafe.Pointer(m)) + m.hash = typ.hash + return "" } func itabsinit() { - lock(&ifaceLock) + lock(&itabLock) for _, md := range activeModules() { for _, i := range md.itablinks { - // itablinks is a slice of pointers to the itabs used in this - // module. A given itab may be used in more than one module - // and thanks to the way global symbol resolution works, the - // pointed-to itab may already have been inserted into the - // global 'hash'. - if !i.inhash { - additab(i, true, false) - } + itabAdd(i) } } - unlock(&ifaceLock) + unlock(&itabLock) } // panicdottypeE is called when doing an e.(T) conversion and the conversion fails. 
@@ -200,7 +272,7 @@ func panicnildottype(want *_type) { func convT2E(t *_type, elem unsafe.Pointer) (e eface) { if raceenabled { - raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&t)), funcPC(convT2E)) + raceReadObjectPC(t, elem, getcallerpc(), funcPC(convT2E)) } if msanenabled { msanread(elem, t.size) @@ -216,7 +288,7 @@ func convT2E(t *_type, elem unsafe.Pointer) (e eface) { func convT2E16(t *_type, elem unsafe.Pointer) (e eface) { if raceenabled { - raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&t)), funcPC(convT2E16)) + raceReadObjectPC(t, elem, getcallerpc(), funcPC(convT2E16)) } if msanenabled { msanread(elem, t.size) @@ -235,7 +307,7 @@ func convT2E16(t *_type, elem unsafe.Pointer) (e eface) { func convT2E32(t *_type, elem unsafe.Pointer) (e eface) { if raceenabled { - raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&t)), funcPC(convT2E32)) + raceReadObjectPC(t, elem, getcallerpc(), funcPC(convT2E32)) } if msanenabled { msanread(elem, t.size) @@ -254,7 +326,7 @@ func convT2E32(t *_type, elem unsafe.Pointer) (e eface) { func convT2E64(t *_type, elem unsafe.Pointer) (e eface) { if raceenabled { - raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&t)), funcPC(convT2E64)) + raceReadObjectPC(t, elem, getcallerpc(), funcPC(convT2E64)) } if msanenabled { msanread(elem, t.size) @@ -273,7 +345,7 @@ func convT2E64(t *_type, elem unsafe.Pointer) (e eface) { func convT2Estring(t *_type, elem unsafe.Pointer) (e eface) { if raceenabled { - raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&t)), funcPC(convT2Estring)) + raceReadObjectPC(t, elem, getcallerpc(), funcPC(convT2Estring)) } if msanenabled { msanread(elem, t.size) @@ -292,7 +364,7 @@ func convT2Estring(t *_type, elem unsafe.Pointer) (e eface) { func convT2Eslice(t *_type, elem unsafe.Pointer) (e eface) { if raceenabled { - raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&t)), funcPC(convT2Eslice)) + raceReadObjectPC(t, elem, getcallerpc(), funcPC(convT2Eslice)) } if msanenabled { msanread(elem, t.size) @@ -311,7 +383,7 @@ func convT2Eslice(t *_type, elem unsafe.Pointer) (e eface) { func convT2Enoptr(t *_type, elem unsafe.Pointer) (e eface) { if raceenabled { - raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&t)), funcPC(convT2Enoptr)) + raceReadObjectPC(t, elem, getcallerpc(), funcPC(convT2Enoptr)) } if msanenabled { msanread(elem, t.size) @@ -326,7 +398,7 @@ func convT2Enoptr(t *_type, elem unsafe.Pointer) (e eface) { func convT2I(tab *itab, elem unsafe.Pointer) (i iface) { t := tab._type if raceenabled { - raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&tab)), funcPC(convT2I)) + raceReadObjectPC(t, elem, getcallerpc(), funcPC(convT2I)) } if msanenabled { msanread(elem, t.size) @@ -341,7 +413,7 @@ func convT2I(tab *itab, elem unsafe.Pointer) (i iface) { func convT2I16(tab *itab, elem unsafe.Pointer) (i iface) { t := tab._type if raceenabled { - raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&tab)), funcPC(convT2I16)) + raceReadObjectPC(t, elem, getcallerpc(), funcPC(convT2I16)) } if msanenabled { msanread(elem, t.size) @@ -361,7 +433,7 @@ func convT2I16(tab *itab, elem unsafe.Pointer) (i iface) { func convT2I32(tab *itab, elem unsafe.Pointer) (i iface) { t := tab._type if raceenabled { - raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&tab)), funcPC(convT2I32)) + raceReadObjectPC(t, elem, getcallerpc(), funcPC(convT2I32)) } if msanenabled { msanread(elem, t.size) @@ -381,7 +453,7 @@ func convT2I32(tab *itab, elem unsafe.Pointer) (i iface) { func convT2I64(tab *itab, elem unsafe.Pointer) 
(i iface) { t := tab._type if raceenabled { - raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&tab)), funcPC(convT2I64)) + raceReadObjectPC(t, elem, getcallerpc(), funcPC(convT2I64)) } if msanenabled { msanread(elem, t.size) @@ -401,7 +473,7 @@ func convT2I64(tab *itab, elem unsafe.Pointer) (i iface) { func convT2Istring(tab *itab, elem unsafe.Pointer) (i iface) { t := tab._type if raceenabled { - raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&tab)), funcPC(convT2Istring)) + raceReadObjectPC(t, elem, getcallerpc(), funcPC(convT2Istring)) } if msanenabled { msanread(elem, t.size) @@ -421,7 +493,7 @@ func convT2Istring(tab *itab, elem unsafe.Pointer) (i iface) { func convT2Islice(tab *itab, elem unsafe.Pointer) (i iface) { t := tab._type if raceenabled { - raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&tab)), funcPC(convT2Islice)) + raceReadObjectPC(t, elem, getcallerpc(), funcPC(convT2Islice)) } if msanenabled { msanread(elem, t.size) @@ -441,7 +513,7 @@ func convT2Islice(tab *itab, elem unsafe.Pointer) (i iface) { func convT2Inoptr(tab *itab, elem unsafe.Pointer) (i iface) { t := tab._type if raceenabled { - raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&tab)), funcPC(convT2Inoptr)) + raceReadObjectPC(t, elem, getcallerpc(), funcPC(convT2Inoptr)) } if msanenabled { msanread(elem, t.size) @@ -533,9 +605,13 @@ func reflect_ifaceE2I(inter *interfacetype, e eface, dst *iface) { } func iterate_itabs(fn func(*itab)) { - for _, h := range &hash { - for ; h != nil; h = h.link { - fn(h) + // Note: only runs during stop the world or with itabLock held, + // so no other locks/atomics needed. + t := itabTable + for i := uintptr(0); i < t.size; i++ { + m := *(**itab)(add(unsafe.Pointer(&t.entries), i*sys.PtrSize)) + if m != nil { + fn(m) } } } diff --git a/src/runtime/internal/atomic/atomic_test.go b/src/runtime/internal/atomic/atomic_test.go index 879a82f9c8..b697aa8bd3 100644 --- a/src/runtime/internal/atomic/atomic_test.go +++ b/src/runtime/internal/atomic/atomic_test.go @@ -52,7 +52,7 @@ func TestXadduintptr(t *testing.T) { // Tests that xadduintptr correctly updates 64-bit values. The place where // we actually do so is mstats.go, functions mSysStat{Inc,Dec}. func TestXadduintptrOnUint64(t *testing.T) { - if sys.BigEndian != 0 { + if sys.BigEndian { // On big endian architectures, we never use xadduintptr to update // 64-bit values and hence we skip the test. 
(Note that functions // mSysStat{Inc,Dec} in mstats.go have explicit checks for diff --git a/src/runtime/internal/sys/arch_386.go b/src/runtime/internal/sys/arch_386.go index 61d6722cca..5fb1fba02b 100644 --- a/src/runtime/internal/sys/arch_386.go +++ b/src/runtime/internal/sys/arch_386.go @@ -6,7 +6,7 @@ package sys const ( ArchFamily = I386 - BigEndian = 0 + BigEndian = false CacheLineSize = 64 DefaultPhysPageSize = GoosNacl*65536 + (1-GoosNacl)*4096 // 4k normally; 64k on NaCl PCQuantum = 1 diff --git a/src/runtime/internal/sys/arch_amd64.go b/src/runtime/internal/sys/arch_amd64.go index 1f2114a736..2f32bc469f 100644 --- a/src/runtime/internal/sys/arch_amd64.go +++ b/src/runtime/internal/sys/arch_amd64.go @@ -6,7 +6,7 @@ package sys const ( ArchFamily = AMD64 - BigEndian = 0 + BigEndian = false CacheLineSize = 64 DefaultPhysPageSize = 4096 PCQuantum = 1 diff --git a/src/runtime/internal/sys/arch_amd64p32.go b/src/runtime/internal/sys/arch_amd64p32.go index 07798557de..c560907c67 100644 --- a/src/runtime/internal/sys/arch_amd64p32.go +++ b/src/runtime/internal/sys/arch_amd64p32.go @@ -6,7 +6,7 @@ package sys const ( ArchFamily = AMD64 - BigEndian = 0 + BigEndian = false CacheLineSize = 64 DefaultPhysPageSize = 65536*GoosNacl + 4096*(1-GoosNacl) PCQuantum = 1 diff --git a/src/runtime/internal/sys/arch_arm.go b/src/runtime/internal/sys/arch_arm.go index 899010bfa1..f383d82027 100644 --- a/src/runtime/internal/sys/arch_arm.go +++ b/src/runtime/internal/sys/arch_arm.go @@ -6,7 +6,7 @@ package sys const ( ArchFamily = ARM - BigEndian = 0 + BigEndian = false CacheLineSize = 32 DefaultPhysPageSize = 65536 PCQuantum = 4 diff --git a/src/runtime/internal/sys/arch_arm64.go b/src/runtime/internal/sys/arch_arm64.go index 2d57ddae19..cb83ecc445 100644 --- a/src/runtime/internal/sys/arch_arm64.go +++ b/src/runtime/internal/sys/arch_arm64.go @@ -6,8 +6,8 @@ package sys const ( ArchFamily = ARM64 - BigEndian = 0 - CacheLineSize = 32 + BigEndian = false + CacheLineSize = 64 DefaultPhysPageSize = 65536 PCQuantum = 4 Int64Align = 8 diff --git a/src/runtime/internal/sys/arch_mips.go b/src/runtime/internal/sys/arch_mips.go index 65fc4f8a60..e12f32d0ee 100644 --- a/src/runtime/internal/sys/arch_mips.go +++ b/src/runtime/internal/sys/arch_mips.go @@ -6,7 +6,7 @@ package sys const ( ArchFamily = MIPS - BigEndian = 1 + BigEndian = true CacheLineSize = 32 DefaultPhysPageSize = 65536 PCQuantum = 4 diff --git a/src/runtime/internal/sys/arch_mips64.go b/src/runtime/internal/sys/arch_mips64.go index 0f6de74e6f..973ec10e17 100644 --- a/src/runtime/internal/sys/arch_mips64.go +++ b/src/runtime/internal/sys/arch_mips64.go @@ -6,7 +6,7 @@ package sys const ( ArchFamily = MIPS64 - BigEndian = 1 + BigEndian = true CacheLineSize = 32 DefaultPhysPageSize = 16384 PCQuantum = 4 diff --git a/src/runtime/internal/sys/arch_mips64le.go b/src/runtime/internal/sys/arch_mips64le.go index 4ced35bfde..e96d962f36 100644 --- a/src/runtime/internal/sys/arch_mips64le.go +++ b/src/runtime/internal/sys/arch_mips64le.go @@ -6,7 +6,7 @@ package sys const ( ArchFamily = MIPS64 - BigEndian = 0 + BigEndian = false CacheLineSize = 32 DefaultPhysPageSize = 16384 PCQuantum = 4 diff --git a/src/runtime/internal/sys/arch_mipsle.go b/src/runtime/internal/sys/arch_mipsle.go index 33e9764037..25742ae9d3 100644 --- a/src/runtime/internal/sys/arch_mipsle.go +++ b/src/runtime/internal/sys/arch_mipsle.go @@ -6,7 +6,7 @@ package sys const ( ArchFamily = MIPS - BigEndian = 0 + BigEndian = false CacheLineSize = 32 DefaultPhysPageSize = 65536 PCQuantum = 4 diff 
--git a/src/runtime/internal/sys/arch_ppc64.go b/src/runtime/internal/sys/arch_ppc64.go index 80595ee195..a538bbdec0 100644 --- a/src/runtime/internal/sys/arch_ppc64.go +++ b/src/runtime/internal/sys/arch_ppc64.go @@ -6,7 +6,7 @@ package sys const ( ArchFamily = PPC64 - BigEndian = 1 + BigEndian = true CacheLineSize = 128 DefaultPhysPageSize = 65536 PCQuantum = 4 diff --git a/src/runtime/internal/sys/arch_ppc64le.go b/src/runtime/internal/sys/arch_ppc64le.go index f68e777055..aa50689181 100644 --- a/src/runtime/internal/sys/arch_ppc64le.go +++ b/src/runtime/internal/sys/arch_ppc64le.go @@ -6,7 +6,7 @@ package sys const ( ArchFamily = PPC64 - BigEndian = 0 + BigEndian = false CacheLineSize = 128 DefaultPhysPageSize = 65536 PCQuantum = 4 diff --git a/src/runtime/internal/sys/arch_s390x.go b/src/runtime/internal/sys/arch_s390x.go index 4ec4bf8fec..e42c420a54 100644 --- a/src/runtime/internal/sys/arch_s390x.go +++ b/src/runtime/internal/sys/arch_s390x.go @@ -6,7 +6,7 @@ package sys const ( ArchFamily = S390X - BigEndian = 1 + BigEndian = true CacheLineSize = 256 DefaultPhysPageSize = 4096 PCQuantum = 2 diff --git a/src/runtime/internal/sys/gengoos.go b/src/runtime/internal/sys/gengoos.go index 4c45c0af02..a9f86256bf 100644 --- a/src/runtime/internal/sys/gengoos.go +++ b/src/runtime/internal/sys/gengoos.go @@ -30,7 +30,7 @@ func main() { if strings.HasPrefix(line, goosPrefix) { text, err := strconv.Unquote(strings.TrimPrefix(line, goosPrefix)) if err != nil { - log.Fatalf("parsing goosList %#q: %v", strings.TrimPrefix(line, goosPrefix), err) + log.Fatalf("parsing goosList: %v", err) } gooses = strings.Fields(text) } @@ -45,10 +45,11 @@ func main() { for _, target := range gooses { var buf bytes.Buffer - fmt.Fprintf(&buf, "// generated by gengoos.go using 'go generate'\n\n") + fmt.Fprintf(&buf, "// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.\n\n") if target == "linux" { - fmt.Fprintf(&buf, "// +build !android\n\n") // must explicitly exclude android for linux + fmt.Fprintf(&buf, "// +build !android\n") // must explicitly exclude android for linux } + fmt.Fprintf(&buf, "// +build %s\n\n", target) // must explicitly include target for bootstrapping purposes fmt.Fprintf(&buf, "package sys\n\n") fmt.Fprintf(&buf, "const GOOS = `%s`\n\n", target) for _, goos := range gooses { @@ -66,7 +67,8 @@ func main() { for _, target := range goarches { var buf bytes.Buffer - fmt.Fprintf(&buf, "// generated by gengoos.go using 'go generate'\n\n") + fmt.Fprintf(&buf, "// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.\n\n") + fmt.Fprintf(&buf, "// +build %s\n\n", target) // must explicitly include target for bootstrapping purposes fmt.Fprintf(&buf, "package sys\n\n") fmt.Fprintf(&buf, "const GOARCH = `%s`\n\n", target) for _, goarch := range goarches { diff --git a/src/runtime/internal/sys/sys.go b/src/runtime/internal/sys/sys.go index 586a763717..9d9ac4507f 100644 --- a/src/runtime/internal/sys/sys.go +++ b/src/runtime/internal/sys/sys.go @@ -6,9 +6,9 @@ // constants used by the runtime. package sys -// The next line makes 'go generate' write the zgen_*.go files with +// The next line makes 'go generate' write the zgo*.go files with // per-OS and per-arch information, including constants -// named goos_$GOOS and goarch_$GOARCH for every +// named Goos$GOOS and Goarch$GOARCH for every // known GOOS and GOARCH. The constant is 1 on the // current system, 0 otherwise; multiplying by them is // useful for defining GOOS- or GOARCH-specific constants. 
diff --git a/src/runtime/internal/sys/zgoarch_386.go b/src/runtime/internal/sys/zgoarch_386.go index 3bcf83b8e3..b07abbedc6 100644 --- a/src/runtime/internal/sys/zgoarch_386.go +++ b/src/runtime/internal/sys/zgoarch_386.go @@ -1,4 +1,6 @@ -// generated by gengoos.go using 'go generate' +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build 386 package sys diff --git a/src/runtime/internal/sys/zgoarch_amd64.go b/src/runtime/internal/sys/zgoarch_amd64.go index 699f191fba..bfdcb00bd9 100644 --- a/src/runtime/internal/sys/zgoarch_amd64.go +++ b/src/runtime/internal/sys/zgoarch_amd64.go @@ -1,4 +1,6 @@ -// generated by gengoos.go using 'go generate' +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build amd64 package sys diff --git a/src/runtime/internal/sys/zgoarch_amd64p32.go b/src/runtime/internal/sys/zgoarch_amd64p32.go index cc2d658406..b61617d4d9 100644 --- a/src/runtime/internal/sys/zgoarch_amd64p32.go +++ b/src/runtime/internal/sys/zgoarch_amd64p32.go @@ -1,4 +1,6 @@ -// generated by gengoos.go using 'go generate' +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build amd64p32 package sys diff --git a/src/runtime/internal/sys/zgoarch_arm.go b/src/runtime/internal/sys/zgoarch_arm.go index a5fd789f13..79595d545a 100644 --- a/src/runtime/internal/sys/zgoarch_arm.go +++ b/src/runtime/internal/sys/zgoarch_arm.go @@ -1,4 +1,6 @@ -// generated by gengoos.go using 'go generate' +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build arm package sys diff --git a/src/runtime/internal/sys/zgoarch_arm64.go b/src/runtime/internal/sys/zgoarch_arm64.go index 084d2c7330..c839b8fc03 100644 --- a/src/runtime/internal/sys/zgoarch_arm64.go +++ b/src/runtime/internal/sys/zgoarch_arm64.go @@ -1,4 +1,6 @@ -// generated by gengoos.go using 'go generate' +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build arm64 package sys diff --git a/src/runtime/internal/sys/zgoarch_arm64be.go b/src/runtime/internal/sys/zgoarch_arm64be.go new file mode 100644 index 0000000000..58b4ef198b --- /dev/null +++ b/src/runtime/internal/sys/zgoarch_arm64be.go @@ -0,0 +1,28 @@ +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build arm64be + +package sys + +const GOARCH = `arm64be` + +const Goarch386 = 0 +const GoarchAmd64 = 0 +const GoarchAmd64p32 = 0 +const GoarchArm = 0 +const GoarchArmbe = 0 +const GoarchArm64 = 0 +const GoarchArm64be = 1 +const GoarchPpc64 = 0 +const GoarchPpc64le = 0 +const GoarchMips = 0 +const GoarchMipsle = 0 +const GoarchMips64 = 0 +const GoarchMips64le = 0 +const GoarchMips64p32 = 0 +const GoarchMips64p32le = 0 +const GoarchPpc = 0 +const GoarchS390 = 0 +const GoarchS390x = 0 +const GoarchSparc = 0 +const GoarchSparc64 = 0 diff --git a/src/runtime/internal/sys/zgoarch_armbe.go b/src/runtime/internal/sys/zgoarch_armbe.go new file mode 100644 index 0000000000..e9e2c314d8 --- /dev/null +++ b/src/runtime/internal/sys/zgoarch_armbe.go @@ -0,0 +1,28 @@ +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. 
+ +// +build armbe + +package sys + +const GOARCH = `armbe` + +const Goarch386 = 0 +const GoarchAmd64 = 0 +const GoarchAmd64p32 = 0 +const GoarchArm = 0 +const GoarchArmbe = 1 +const GoarchArm64 = 0 +const GoarchArm64be = 0 +const GoarchPpc64 = 0 +const GoarchPpc64le = 0 +const GoarchMips = 0 +const GoarchMipsle = 0 +const GoarchMips64 = 0 +const GoarchMips64le = 0 +const GoarchMips64p32 = 0 +const GoarchMips64p32le = 0 +const GoarchPpc = 0 +const GoarchS390 = 0 +const GoarchS390x = 0 +const GoarchSparc = 0 +const GoarchSparc64 = 0 diff --git a/src/runtime/internal/sys/zgoarch_mips.go b/src/runtime/internal/sys/zgoarch_mips.go index 2f733d2788..b0bf4ffec3 100644 --- a/src/runtime/internal/sys/zgoarch_mips.go +++ b/src/runtime/internal/sys/zgoarch_mips.go @@ -1,4 +1,6 @@ -// generated by gengoos.go using 'go generate' +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build mips package sys diff --git a/src/runtime/internal/sys/zgoarch_mips64.go b/src/runtime/internal/sys/zgoarch_mips64.go index 2ad62bd68c..093e88ceaa 100644 --- a/src/runtime/internal/sys/zgoarch_mips64.go +++ b/src/runtime/internal/sys/zgoarch_mips64.go @@ -1,4 +1,6 @@ -// generated by gengoos.go using 'go generate' +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build mips64 package sys diff --git a/src/runtime/internal/sys/zgoarch_mips64le.go b/src/runtime/internal/sys/zgoarch_mips64le.go index 047c8b425a..3bad7cfd38 100644 --- a/src/runtime/internal/sys/zgoarch_mips64le.go +++ b/src/runtime/internal/sys/zgoarch_mips64le.go @@ -1,4 +1,6 @@ -// generated by gengoos.go using 'go generate' +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build mips64le package sys diff --git a/src/runtime/internal/sys/zgoarch_mips64p32.go b/src/runtime/internal/sys/zgoarch_mips64p32.go new file mode 100644 index 0000000000..c5f69fc687 --- /dev/null +++ b/src/runtime/internal/sys/zgoarch_mips64p32.go @@ -0,0 +1,28 @@ +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build mips64p32 + +package sys + +const GOARCH = `mips64p32` + +const Goarch386 = 0 +const GoarchAmd64 = 0 +const GoarchAmd64p32 = 0 +const GoarchArm = 0 +const GoarchArmbe = 0 +const GoarchArm64 = 0 +const GoarchArm64be = 0 +const GoarchPpc64 = 0 +const GoarchPpc64le = 0 +const GoarchMips = 0 +const GoarchMipsle = 0 +const GoarchMips64 = 0 +const GoarchMips64le = 0 +const GoarchMips64p32 = 1 +const GoarchMips64p32le = 0 +const GoarchPpc = 0 +const GoarchS390 = 0 +const GoarchS390x = 0 +const GoarchSparc = 0 +const GoarchSparc64 = 0 diff --git a/src/runtime/internal/sys/zgoarch_mips64p32le.go b/src/runtime/internal/sys/zgoarch_mips64p32le.go new file mode 100644 index 0000000000..014ef84ce8 --- /dev/null +++ b/src/runtime/internal/sys/zgoarch_mips64p32le.go @@ -0,0 +1,28 @@ +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. 
+ +// +build mips64p32le + +package sys + +const GOARCH = `mips64p32le` + +const Goarch386 = 0 +const GoarchAmd64 = 0 +const GoarchAmd64p32 = 0 +const GoarchArm = 0 +const GoarchArmbe = 0 +const GoarchArm64 = 0 +const GoarchArm64be = 0 +const GoarchPpc64 = 0 +const GoarchPpc64le = 0 +const GoarchMips = 0 +const GoarchMipsle = 0 +const GoarchMips64 = 0 +const GoarchMips64le = 0 +const GoarchMips64p32 = 0 +const GoarchMips64p32le = 1 +const GoarchPpc = 0 +const GoarchS390 = 0 +const GoarchS390x = 0 +const GoarchSparc = 0 +const GoarchSparc64 = 0 diff --git a/src/runtime/internal/sys/zgoarch_mipsle.go b/src/runtime/internal/sys/zgoarch_mipsle.go index 95f3d5aab9..75814be787 100644 --- a/src/runtime/internal/sys/zgoarch_mipsle.go +++ b/src/runtime/internal/sys/zgoarch_mipsle.go @@ -1,4 +1,6 @@ -// generated by gengoos.go using 'go generate' +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build mipsle package sys diff --git a/src/runtime/internal/sys/zgoarch_ppc.go b/src/runtime/internal/sys/zgoarch_ppc.go new file mode 100644 index 0000000000..2a891b8477 --- /dev/null +++ b/src/runtime/internal/sys/zgoarch_ppc.go @@ -0,0 +1,28 @@ +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build ppc + +package sys + +const GOARCH = `ppc` + +const Goarch386 = 0 +const GoarchAmd64 = 0 +const GoarchAmd64p32 = 0 +const GoarchArm = 0 +const GoarchArmbe = 0 +const GoarchArm64 = 0 +const GoarchArm64be = 0 +const GoarchPpc64 = 0 +const GoarchPpc64le = 0 +const GoarchMips = 0 +const GoarchMipsle = 0 +const GoarchMips64 = 0 +const GoarchMips64le = 0 +const GoarchMips64p32 = 0 +const GoarchMips64p32le = 0 +const GoarchPpc = 1 +const GoarchS390 = 0 +const GoarchS390x = 0 +const GoarchSparc = 0 +const GoarchSparc64 = 0 diff --git a/src/runtime/internal/sys/zgoarch_ppc64.go b/src/runtime/internal/sys/zgoarch_ppc64.go index 748b5b562c..847db4bdb2 100644 --- a/src/runtime/internal/sys/zgoarch_ppc64.go +++ b/src/runtime/internal/sys/zgoarch_ppc64.go @@ -1,4 +1,6 @@ -// generated by gengoos.go using 'go generate' +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build ppc64 package sys diff --git a/src/runtime/internal/sys/zgoarch_ppc64le.go b/src/runtime/internal/sys/zgoarch_ppc64le.go index d3dcba467d..5195797b29 100644 --- a/src/runtime/internal/sys/zgoarch_ppc64le.go +++ b/src/runtime/internal/sys/zgoarch_ppc64le.go @@ -1,4 +1,6 @@ -// generated by gengoos.go using 'go generate' +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build ppc64le package sys diff --git a/src/runtime/internal/sys/zgoarch_s390.go b/src/runtime/internal/sys/zgoarch_s390.go new file mode 100644 index 0000000000..cd215da577 --- /dev/null +++ b/src/runtime/internal/sys/zgoarch_s390.go @@ -0,0 +1,28 @@ +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. 
+ +// +build s390 + +package sys + +const GOARCH = `s390` + +const Goarch386 = 0 +const GoarchAmd64 = 0 +const GoarchAmd64p32 = 0 +const GoarchArm = 0 +const GoarchArmbe = 0 +const GoarchArm64 = 0 +const GoarchArm64be = 0 +const GoarchPpc64 = 0 +const GoarchPpc64le = 0 +const GoarchMips = 0 +const GoarchMipsle = 0 +const GoarchMips64 = 0 +const GoarchMips64le = 0 +const GoarchMips64p32 = 0 +const GoarchMips64p32le = 0 +const GoarchPpc = 0 +const GoarchS390 = 1 +const GoarchS390x = 0 +const GoarchSparc = 0 +const GoarchSparc64 = 0 diff --git a/src/runtime/internal/sys/zgoarch_s390x.go b/src/runtime/internal/sys/zgoarch_s390x.go index 1ead5d573c..b9368ff4e3 100644 --- a/src/runtime/internal/sys/zgoarch_s390x.go +++ b/src/runtime/internal/sys/zgoarch_s390x.go @@ -1,4 +1,6 @@ -// generated by gengoos.go using 'go generate' +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build s390x package sys diff --git a/src/runtime/internal/sys/zgoarch_sparc.go b/src/runtime/internal/sys/zgoarch_sparc.go new file mode 100644 index 0000000000..e9afe0131b --- /dev/null +++ b/src/runtime/internal/sys/zgoarch_sparc.go @@ -0,0 +1,28 @@ +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build sparc + +package sys + +const GOARCH = `sparc` + +const Goarch386 = 0 +const GoarchAmd64 = 0 +const GoarchAmd64p32 = 0 +const GoarchArm = 0 +const GoarchArmbe = 0 +const GoarchArm64 = 0 +const GoarchArm64be = 0 +const GoarchPpc64 = 0 +const GoarchPpc64le = 0 +const GoarchMips = 0 +const GoarchMipsle = 0 +const GoarchMips64 = 0 +const GoarchMips64le = 0 +const GoarchMips64p32 = 0 +const GoarchMips64p32le = 0 +const GoarchPpc = 0 +const GoarchS390 = 0 +const GoarchS390x = 0 +const GoarchSparc = 1 +const GoarchSparc64 = 0 diff --git a/src/runtime/internal/sys/zgoarch_sparc64.go b/src/runtime/internal/sys/zgoarch_sparc64.go new file mode 100644 index 0000000000..b6004efe1c --- /dev/null +++ b/src/runtime/internal/sys/zgoarch_sparc64.go @@ -0,0 +1,28 @@ +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build sparc64 + +package sys + +const GOARCH = `sparc64` + +const Goarch386 = 0 +const GoarchAmd64 = 0 +const GoarchAmd64p32 = 0 +const GoarchArm = 0 +const GoarchArmbe = 0 +const GoarchArm64 = 0 +const GoarchArm64be = 0 +const GoarchPpc64 = 0 +const GoarchPpc64le = 0 +const GoarchMips = 0 +const GoarchMipsle = 0 +const GoarchMips64 = 0 +const GoarchMips64le = 0 +const GoarchMips64p32 = 0 +const GoarchMips64p32le = 0 +const GoarchPpc = 0 +const GoarchS390 = 0 +const GoarchS390x = 0 +const GoarchSparc = 0 +const GoarchSparc64 = 1 diff --git a/src/runtime/internal/sys/zgoos_android.go b/src/runtime/internal/sys/zgoos_android.go index 6503b15246..01ebe753c5 100644 --- a/src/runtime/internal/sys/zgoos_android.go +++ b/src/runtime/internal/sys/zgoos_android.go @@ -1,4 +1,6 @@ -// generated by gengoos.go using 'go generate' +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build android package sys @@ -15,3 +17,4 @@ const GoosOpenbsd = 0 const GoosPlan9 = 0 const GoosSolaris = 0 const GoosWindows = 0 +const GoosZos = 0 diff --git a/src/runtime/internal/sys/zgoos_darwin.go b/src/runtime/internal/sys/zgoos_darwin.go index 6a285984bd..1303d71ee9 100644 --- a/src/runtime/internal/sys/zgoos_darwin.go +++ b/src/runtime/internal/sys/zgoos_darwin.go @@ -1,4 +1,6 @@ -// generated by gengoos.go using 'go generate' +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. 
+ +// +build darwin package sys @@ -15,3 +17,4 @@ const GoosOpenbsd = 0 const GoosPlan9 = 0 const GoosSolaris = 0 const GoosWindows = 0 +const GoosZos = 0 diff --git a/src/runtime/internal/sys/zgoos_dragonfly.go b/src/runtime/internal/sys/zgoos_dragonfly.go index 886ac2698f..64325c7a42 100644 --- a/src/runtime/internal/sys/zgoos_dragonfly.go +++ b/src/runtime/internal/sys/zgoos_dragonfly.go @@ -1,4 +1,6 @@ -// generated by gengoos.go using 'go generate' +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build dragonfly package sys @@ -15,3 +17,4 @@ const GoosOpenbsd = 0 const GoosPlan9 = 0 const GoosSolaris = 0 const GoosWindows = 0 +const GoosZos = 0 diff --git a/src/runtime/internal/sys/zgoos_freebsd.go b/src/runtime/internal/sys/zgoos_freebsd.go index 0bf2403eab..37449713f9 100644 --- a/src/runtime/internal/sys/zgoos_freebsd.go +++ b/src/runtime/internal/sys/zgoos_freebsd.go @@ -1,4 +1,6 @@ -// generated by gengoos.go using 'go generate' +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build freebsd package sys @@ -15,3 +17,4 @@ const GoosOpenbsd = 0 const GoosPlan9 = 0 const GoosSolaris = 0 const GoosWindows = 0 +const GoosZos = 0 diff --git a/src/runtime/internal/sys/zgoos_linux.go b/src/runtime/internal/sys/zgoos_linux.go index c8664db15d..c726465350 100644 --- a/src/runtime/internal/sys/zgoos_linux.go +++ b/src/runtime/internal/sys/zgoos_linux.go @@ -1,6 +1,7 @@ -// generated by gengoos.go using 'go generate' +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. // +build !android +// +build linux package sys @@ -17,3 +18,4 @@ const GoosOpenbsd = 0 const GoosPlan9 = 0 const GoosSolaris = 0 const GoosWindows = 0 +const GoosZos = 0 diff --git a/src/runtime/internal/sys/zgoos_nacl.go b/src/runtime/internal/sys/zgoos_nacl.go index 054122638a..53b394c631 100644 --- a/src/runtime/internal/sys/zgoos_nacl.go +++ b/src/runtime/internal/sys/zgoos_nacl.go @@ -1,4 +1,6 @@ -// generated by gengoos.go using 'go generate' +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build nacl package sys @@ -15,3 +17,4 @@ const GoosOpenbsd = 0 const GoosPlan9 = 0 const GoosSolaris = 0 const GoosWindows = 0 +const GoosZos = 0 diff --git a/src/runtime/internal/sys/zgoos_netbsd.go b/src/runtime/internal/sys/zgoos_netbsd.go index 5c509a1250..8bfdf45d4a 100644 --- a/src/runtime/internal/sys/zgoos_netbsd.go +++ b/src/runtime/internal/sys/zgoos_netbsd.go @@ -1,4 +1,6 @@ -// generated by gengoos.go using 'go generate' +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build netbsd package sys @@ -15,3 +17,4 @@ const GoosOpenbsd = 0 const GoosPlan9 = 0 const GoosSolaris = 0 const GoosWindows = 0 +const GoosZos = 0 diff --git a/src/runtime/internal/sys/zgoos_openbsd.go b/src/runtime/internal/sys/zgoos_openbsd.go index dc43157d49..fc6acb761c 100644 --- a/src/runtime/internal/sys/zgoos_openbsd.go +++ b/src/runtime/internal/sys/zgoos_openbsd.go @@ -1,4 +1,6 @@ -// generated by gengoos.go using 'go generate' +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. 
+ +// +build openbsd package sys @@ -15,3 +17,4 @@ const GoosOpenbsd = 1 const GoosPlan9 = 0 const GoosSolaris = 0 const GoosWindows = 0 +const GoosZos = 0 diff --git a/src/runtime/internal/sys/zgoos_plan9.go b/src/runtime/internal/sys/zgoos_plan9.go index 4b0934f77a..75baeb34f6 100644 --- a/src/runtime/internal/sys/zgoos_plan9.go +++ b/src/runtime/internal/sys/zgoos_plan9.go @@ -1,4 +1,6 @@ -// generated by gengoos.go using 'go generate' +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build plan9 package sys @@ -15,3 +17,4 @@ const GoosOpenbsd = 0 const GoosPlan9 = 1 const GoosSolaris = 0 const GoosWindows = 0 +const GoosZos = 0 diff --git a/src/runtime/internal/sys/zgoos_solaris.go b/src/runtime/internal/sys/zgoos_solaris.go index 42511a36ad..c18f34f398 100644 --- a/src/runtime/internal/sys/zgoos_solaris.go +++ b/src/runtime/internal/sys/zgoos_solaris.go @@ -1,4 +1,6 @@ -// generated by gengoos.go using 'go generate' +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build solaris package sys @@ -15,3 +17,4 @@ const GoosOpenbsd = 0 const GoosPlan9 = 0 const GoosSolaris = 1 const GoosWindows = 0 +const GoosZos = 0 diff --git a/src/runtime/internal/sys/zgoos_windows.go b/src/runtime/internal/sys/zgoos_windows.go index d77f62c396..b9f0d4e584 100644 --- a/src/runtime/internal/sys/zgoos_windows.go +++ b/src/runtime/internal/sys/zgoos_windows.go @@ -1,4 +1,6 @@ -// generated by gengoos.go using 'go generate' +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build windows package sys @@ -15,3 +17,4 @@ const GoosOpenbsd = 0 const GoosPlan9 = 0 const GoosSolaris = 0 const GoosWindows = 1 +const GoosZos = 0 diff --git a/src/runtime/internal/sys/zgoos_zos.go b/src/runtime/internal/sys/zgoos_zos.go new file mode 100644 index 0000000000..2563ebea23 --- /dev/null +++ b/src/runtime/internal/sys/zgoos_zos.go @@ -0,0 +1,20 @@ +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build zos + +package sys + +const GOOS = `zos` + +const GoosAndroid = 0 +const GoosDarwin = 0 +const GoosDragonfly = 0 +const GoosFreebsd = 0 +const GoosLinux = 0 +const GoosNacl = 0 +const GoosNetbsd = 0 +const GoosOpenbsd = 0 +const GoosPlan9 = 0 +const GoosSolaris = 0 +const GoosWindows = 0 +const GoosZos = 1 diff --git a/src/runtime/lock_sema.go b/src/runtime/lock_sema.go index 5b0169d572..b41f805cee 100644 --- a/src/runtime/lock_sema.go +++ b/src/runtime/lock_sema.go @@ -71,7 +71,7 @@ Loop: // for this lock, chained through m->nextwaitm. // Queue this M. for { - gp.m.nextwaitm = v &^ locked + gp.m.nextwaitm = muintptr(v &^ locked) if atomic.Casuintptr(&l.key, v, uintptr(unsafe.Pointer(gp.m))|locked) { break } @@ -103,8 +103,8 @@ func unlock(l *mutex) { } else { // Other M's are waiting for the lock. // Dequeue an M. - mp = (*m)(unsafe.Pointer(v &^ locked)) - if atomic.Casuintptr(&l.key, v, mp.nextwaitm) { + mp = muintptr(v &^ locked).ptr() + if atomic.Casuintptr(&l.key, v, uintptr(mp.nextwaitm)) { // Dequeued an M. Wake it. semawakeup(mp) break @@ -140,7 +140,7 @@ func notewakeup(n *note) { case v == 0: // Nothing was waiting. Done. case v == locked: - // Two notewakeups! Not allowed. + // Two notewakeups! Not allowed. throw("notewakeup - double wakeup") default: // Must be the waiting m. Wake it up. 
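The lock_sema.go hunk above switches m.nextwaitm from a raw uintptr to the muintptr type while keeping the same word-packing idea: the low bit of the mutex key carries the `locked` flag, and `v &^ locked` recovers the pointer to the waiting M. The following self-contained sketch shows that low-bit tagging idiom with a hypothetical waiter type; it is not the runtime's implementation, and the uintptr round-trip is only safe here because the object stays reachable through a regular pointer.

```go
// Sketch of packing a flag into the low bit of a pointer-sized word, as the
// mutex code above does with its `locked` bit. The waiter type and constant
// are hypothetical stand-ins for the runtime's m/muintptr.
package main

import (
	"fmt"
	"unsafe"
)

const locked uintptr = 1 // low tag bit; free because waiter is word-aligned

type waiter struct {
	name string
}

// pack stores the pointer and the flag in one word.
func pack(w *waiter, flag bool) uintptr {
	v := uintptr(unsafe.Pointer(w))
	if flag {
		v |= locked
	}
	return v
}

// unpack recovers the pointer (v &^ locked) and the flag (v & locked).
func unpack(v uintptr) (*waiter, bool) {
	return (*waiter)(unsafe.Pointer(v &^ locked)), v&locked != 0
}

func main() {
	w := &waiter{name: "m1"} // w keeps the object alive across the round trip
	v := pack(w, true)
	p, isLocked := unpack(v)
	fmt.Println(p.name, isLocked) // m1 true
}
```

Outside the runtime, converting a uintptr back to a pointer like this is generally unsafe (the GC does not trace uintptrs); the runtime can do it for m structures because they are not heap-allocated, which is also why the diff introduces the dedicated muintptr type rather than reusing unsafe.Pointer.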
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 0ebd2c0ab2..72b8f40b96 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -529,9 +529,8 @@ func nextFreeFast(s *mspan) gclinkptr { } s.allocCache >>= uint(theBit + 1) s.freeindex = freeidx - v := gclinkptr(result*s.elemsize + s.base()) s.allocCount++ - return v + return gclinkptr(result*s.elemsize + s.base()) } } return 0 @@ -847,6 +846,9 @@ func reflect_unsafe_New(typ *_type) unsafe.Pointer { // newarray allocates an array of n elements of type typ. func newarray(typ *_type, n int) unsafe.Pointer { + if n == 1 { + return mallocgc(typ.size, typ, true) + } if n < 0 || uintptr(n) > maxSliceCap(typ.size) { panic(plainError("runtime: allocation size out of range")) } @@ -863,11 +865,13 @@ func profilealloc(mp *m, x unsafe.Pointer, size uintptr) { mProf_Malloc(x, size) } -// nextSample returns the next sampling point for heap profiling. -// It produces a random variable with a geometric distribution and -// mean MemProfileRate. This is done by generating a uniformly -// distributed random number and applying the cumulative distribution -// function for an exponential. +// nextSample returns the next sampling point for heap profiling. The goal is +// to sample allocations on average every MemProfileRate bytes, but with a +// completely random distribution over the allocation timeline; this +// corresponds to a Poisson process with parameter MemProfileRate. In Poisson +// processes, the distance between two samples follows the exponential +// distribution (exp(MemProfileRate)), so the best return value is a random +// number taken from an exponential distribution whose mean is MemProfileRate. func nextSample() int32 { if GOOS == "plan9" { // Plan 9 doesn't support floating point in note handler. @@ -876,25 +880,29 @@ func nextSample() int32 { } } - period := MemProfileRate + return fastexprand(MemProfileRate) +} - // make nextSample not overflow. Maximum possible step is - // -ln(1/(1<<kRandomBitCount)) * period, approximately 20 * period. +// fastexprand returns a random number from an exponential distribution with +// the specified mean. +func fastexprand(mean int) int32 { + // Avoid overflow. Maximum possible step is + // -ln(1/(1<<randomBitCount)) * mean, approximately 20 * mean. switch { - case period > 0x7000000: - period = 0x7000000 - case period == 0: + case mean > 0x7000000: + mean = 0x7000000 + case mean == 0: return 0 } - // Let m be the sample rate, - // the probability distribution function is m*exp(-mx), so the CDF is - // p = 1 - exp(-mx), so - // q = 1 - p == exp(-mx) - // log_e(q) = -mx - // -log_e(q)/m = x - // x = -log_e(q) * period - // x = log_2(q) * (-log_e(2)) * period ; Using log_2 for efficiency + // Take a random sample of the exponential distribution exp(-mean*x). 
+ // The probability distribution function is mean*exp(-mean*x), so the CDF is + // p = 1 - exp(-mean*x), so + // q = 1 - p == exp(-mean*x) + // log_e(q) = -mean*x + // -log_e(q)/mean = x + // x = -log_e(q) * mean + // x = log_2(q) * (-log_e(2)) * mean ; Using log_2 for efficiency const randomBitCount = 26 q := fastrand()%(1<<randomBitCount) + 1 qlog := fastlog2(float64(q)) - randomBitCount @@ -902,7 +910,7 @@ func nextSample() int32 { qlog = 0 } const minusLog2 = -0.6931471805599453 // -ln(2) - return int32(qlog*(minusLog2*float64(period))) + 1 + return int32(qlog*(minusLog2*float64(mean))) + 1 } // nextSampleNoFP is similar to nextSample, but uses older, @@ -920,7 +928,7 @@ func nextSampleNoFP() int32 { } type persistentAlloc struct { - base unsafe.Pointer + base *notInHeap off uintptr } @@ -937,17 +945,17 @@ var globalAlloc struct { // // Consider marking persistentalloc'd types go:notinheap. func persistentalloc(size, align uintptr, sysStat *uint64) unsafe.Pointer { - var p unsafe.Pointer + var p *notInHeap systemstack(func() { p = persistentalloc1(size, align, sysStat) }) - return p + return unsafe.Pointer(p) } // Must run on system stack because stack growth can (re)invoke it. // See issue 9174. //go:systemstack -func persistentalloc1(size, align uintptr, sysStat *uint64) unsafe.Pointer { +func persistentalloc1(size, align uintptr, sysStat *uint64) *notInHeap { const ( chunk = 256 << 10 maxBlock = 64 << 10 // VM reservation granularity is 64K on windows @@ -968,7 +976,7 @@ func persistentalloc1(size, align uintptr, sysStat *uint64) unsafe.Pointer { } if size >= maxBlock { - return sysAlloc(size, sysStat) + return (*notInHeap)(sysAlloc(size, sysStat)) } mp := acquirem() @@ -981,7 +989,7 @@ func persistentalloc1(size, align uintptr, sysStat *uint64) unsafe.Pointer { } persistent.off = round(persistent.off, align) if persistent.off+size > chunk || persistent.base == nil { - persistent.base = sysAlloc(chunk, &memstats.other_sys) + persistent.base = (*notInHeap)(sysAlloc(chunk, &memstats.other_sys)) if persistent.base == nil { if persistent == &globalAlloc.persistentAlloc { unlock(&globalAlloc.mutex) @@ -990,7 +998,7 @@ func persistentalloc1(size, align uintptr, sysStat *uint64) unsafe.Pointer { } persistent.off = 0 } - p := add(persistent.base, persistent.off) + p := persistent.base.add(persistent.off) persistent.off += size releasem(mp) if persistent == &globalAlloc.persistentAlloc { @@ -1003,3 +1011,19 @@ func persistentalloc1(size, align uintptr, sysStat *uint64) unsafe.Pointer { } return p } + +// notInHeap is off-heap memory allocated by a lower-level allocator +// like sysAlloc or persistentAlloc. +// +// In general, it's better to use real types marked as go:notinheap, +// but this serves as a generic type for situations where that isn't +// possible (like in the allocators). +// +// TODO: Use this as the return type of sysAlloc, persistentAlloc, etc? +// +//go:notinheap +type notInHeap struct{} + +func (p *notInHeap) add(bytes uintptr) *notInHeap { + return (*notInHeap)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + bytes)) +} diff --git a/src/runtime/malloc_test.go b/src/runtime/malloc_test.go index d9487eed3a..93aa56dbd5 100644 --- a/src/runtime/malloc_test.go +++ b/src/runtime/malloc_test.go @@ -46,9 +46,6 @@ func TestMemStats(t *testing.T) { } // Of the uint fields, HeapReleased, HeapIdle can be 0. // PauseTotalNs can be 0 if timer resolution is poor. - // - // TODO: Test that GCCPUFraction is <= 0.99. This currently - // fails on windows/386. 
(Issue #19319) fields := map[string][]func(interface{}) error{ "Alloc": {nz, le(1e10)}, "TotalAlloc": {nz, le(1e11)}, "Sys": {nz, le(1e10)}, "Lookups": {nz, le(1e10)}, "Mallocs": {nz, le(1e10)}, "Frees": {nz, le(1e10)}, @@ -61,7 +58,7 @@ func TestMemStats(t *testing.T) { "NextGC": {nz, le(1e10)}, "LastGC": {nz}, "PauseTotalNs": {le(1e11)}, "PauseNs": nil, "PauseEnd": nil, "NumGC": {nz, le(1e9)}, "NumForcedGC": {nz, le(1e9)}, - "GCCPUFraction": nil, "EnableGC": {eq(true)}, "DebugGC": {eq(false)}, + "GCCPUFraction": {le(0.99)}, "EnableGC": {eq(true)}, "DebugGC": {eq(false)}, "BySize": nil, } diff --git a/src/runtime/map_test.go b/src/runtime/map_test.go index 81f05a0613..6ed655de0a 100644 --- a/src/runtime/map_test.go +++ b/src/runtime/map_test.go @@ -244,7 +244,7 @@ func testConcurrentReadsAfterGrowth(t *testing.T, useReflect bool) { numGrowStep := 250 numReader := 16 if testing.Short() { - numLoop, numGrowStep = 2, 500 + numLoop, numGrowStep = 2, 100 } for i := 0; i < numLoop; i++ { m := make(map[int]int, 0) @@ -596,6 +596,134 @@ func TestIgnoreBogusMapHint(t *testing.T) { } } +var mapSink map[int]int + +var mapBucketTests = [...]struct { + n int // n is the number of map elements + noescape int // number of expected buckets for non-escaping map + escape int // number of expected buckets for escaping map +}{ + {-(1 << 30), 1, 1}, + {-1, 1, 1}, + {0, 1, 1}, + {1, 1, 1}, + {8, 1, 1}, + {9, 2, 2}, + {13, 2, 2}, + {14, 4, 4}, + {26, 4, 4}, +} + +func TestMapBuckets(t *testing.T) { + // Test that maps of different sizes have the right number of buckets. + // Non-escaping maps with small buckets (like map[int]int) never + // have a nil bucket pointer due to starting with preallocated buckets + // on the stack. Escaping maps start with a non-nil bucket pointer if + // hint size is above bucketCnt and thereby have more than one bucket. + // These tests depend on bucketCnt and loadFactor* in hashmap.go. 
+ t.Run("mapliteral", func(t *testing.T) { + for _, tt := range mapBucketTests { + localMap := map[int]int{} + if runtime.MapBucketsPointerIsNil(localMap) { + t.Errorf("no escape: buckets pointer is nil for non-escaping map") + } + for i := 0; i < tt.n; i++ { + localMap[i] = i + } + if got := runtime.MapBucketsCount(localMap); got != tt.noescape { + t.Errorf("no escape: n=%d want %d buckets, got %d", tt.n, tt.noescape, got) + } + escapingMap := map[int]int{} + if count := runtime.MapBucketsCount(escapingMap); count > 1 && runtime.MapBucketsPointerIsNil(escapingMap) { + t.Errorf("escape: buckets pointer is nil for n=%d buckets", count) + } + for i := 0; i < tt.n; i++ { + escapingMap[i] = i + } + if got := runtime.MapBucketsCount(escapingMap); got != tt.escape { + t.Errorf("escape n=%d want %d buckets, got %d", tt.n, tt.escape, got) + } + mapSink = escapingMap + } + }) + t.Run("nohint", func(t *testing.T) { + for _, tt := range mapBucketTests { + localMap := make(map[int]int) + if runtime.MapBucketsPointerIsNil(localMap) { + t.Errorf("no escape: buckets pointer is nil for non-escaping map") + } + for i := 0; i < tt.n; i++ { + localMap[i] = i + } + if got := runtime.MapBucketsCount(localMap); got != tt.noescape { + t.Errorf("no escape: n=%d want %d buckets, got %d", tt.n, tt.noescape, got) + } + escapingMap := make(map[int]int) + if count := runtime.MapBucketsCount(escapingMap); count > 1 && runtime.MapBucketsPointerIsNil(escapingMap) { + t.Errorf("escape: buckets pointer is nil for n=%d buckets", count) + } + for i := 0; i < tt.n; i++ { + escapingMap[i] = i + } + if got := runtime.MapBucketsCount(escapingMap); got != tt.escape { + t.Errorf("escape: n=%d want %d buckets, got %d", tt.n, tt.escape, got) + } + mapSink = escapingMap + } + }) + t.Run("makemap", func(t *testing.T) { + for _, tt := range mapBucketTests { + localMap := make(map[int]int, tt.n) + if runtime.MapBucketsPointerIsNil(localMap) { + t.Errorf("no escape: buckets pointer is nil for non-escaping map") + } + for i := 0; i < tt.n; i++ { + localMap[i] = i + } + if got := runtime.MapBucketsCount(localMap); got != tt.noescape { + t.Errorf("no escape: n=%d want %d buckets, got %d", tt.n, tt.noescape, got) + } + escapingMap := make(map[int]int, tt.n) + if count := runtime.MapBucketsCount(escapingMap); count > 1 && runtime.MapBucketsPointerIsNil(escapingMap) { + t.Errorf("escape: buckets pointer is nil for n=%d buckets", count) + } + for i := 0; i < tt.n; i++ { + escapingMap[i] = i + } + if got := runtime.MapBucketsCount(escapingMap); got != tt.escape { + t.Errorf("escape: n=%d want %d buckets, got %d", tt.n, tt.escape, got) + } + mapSink = escapingMap + } + }) + t.Run("makemap64", func(t *testing.T) { + for _, tt := range mapBucketTests { + localMap := make(map[int]int, int64(tt.n)) + if runtime.MapBucketsPointerIsNil(localMap) { + t.Errorf("no escape: buckets pointer is nil for non-escaping map") + } + for i := 0; i < tt.n; i++ { + localMap[i] = i + } + if got := runtime.MapBucketsCount(localMap); got != tt.noescape { + t.Errorf("no escape: n=%d want %d buckets, got %d", tt.n, tt.noescape, got) + } + escapingMap := make(map[int]int, tt.n) + if count := runtime.MapBucketsCount(escapingMap); count > 1 && runtime.MapBucketsPointerIsNil(escapingMap) { + t.Errorf("escape: buckets pointer is nil for n=%d buckets", count) + } + for i := 0; i < tt.n; i++ { + escapingMap[i] = i + } + if got := runtime.MapBucketsCount(escapingMap); got != tt.escape { + t.Errorf("escape: n=%d want %d buckets, got %d", tt.n, tt.escape, got) + } + mapSink = 
escapingMap + } + }) + +} + func benchmarkMapPop(b *testing.B, n int) { m := map[int]int{} for i := 0; i < b.N; i++ { @@ -617,14 +745,38 @@ func BenchmarkMapPop100(b *testing.B) { benchmarkMapPop(b, 100) } func BenchmarkMapPop1000(b *testing.B) { benchmarkMapPop(b, 1000) } func BenchmarkMapPop10000(b *testing.B) { benchmarkMapPop(b, 10000) } +var testNonEscapingMapVariable int = 8 + func TestNonEscapingMap(t *testing.T) { n := testing.AllocsPerRun(1000, func() { + m := map[int]int{} + m[0] = 0 + }) + if n != 0 { + t.Fatalf("mapliteral: want 0 allocs, got %v", n) + } + n = testing.AllocsPerRun(1000, func() { m := make(map[int]int) m[0] = 0 }) if n != 0 { - t.Fatalf("want 0 allocs, got %v", n) + t.Fatalf("no hint: want 0 allocs, got %v", n) + } + n = testing.AllocsPerRun(1000, func() { + m := make(map[int]int, 8) + m[0] = 0 + }) + if n != 0 { + t.Fatalf("with small hint: want 0 allocs, got %v", n) + } + n = testing.AllocsPerRun(1000, func() { + m := make(map[int]int, testNonEscapingMapVariable) + m[0] = 0 + }) + if n != 0 { + t.Fatalf("with variable hint: want 0 allocs, got %v", n) } + } func benchmarkMapAssignInt32(b *testing.B, n int) { @@ -635,12 +787,16 @@ func benchmarkMapAssignInt32(b *testing.B, n int) { } func benchmarkMapDeleteInt32(b *testing.B, n int) { - a := make(map[int32]int) - for i := 0; i < n*b.N; i++ { - a[int32(i)] = i - } + a := make(map[int32]int, n) b.ResetTimer() - for i := 0; i < n*b.N; i = i + n { + for i := 0; i < b.N; i++ { + if len(a) == 0 { + b.StopTimer() + for j := i; j < i+n; j++ { + a[int32(j)] = j + } + b.StartTimer() + } delete(a, int32(i)) } } @@ -653,12 +809,16 @@ func benchmarkMapAssignInt64(b *testing.B, n int) { } func benchmarkMapDeleteInt64(b *testing.B, n int) { - a := make(map[int64]int) - for i := 0; i < n*b.N; i++ { - a[int64(i)] = i - } + a := make(map[int64]int, n) b.ResetTimer() - for i := 0; i < n*b.N; i = i + n { + for i := 0; i < b.N; i++ { + if len(a) == 0 { + b.StopTimer() + for j := i; j < i+n; j++ { + a[int64(j)] = j + } + b.StartTimer() + } delete(a, int64(i)) } } @@ -676,17 +836,23 @@ func benchmarkMapAssignStr(b *testing.B, n int) { } func benchmarkMapDeleteStr(b *testing.B, n int) { - k := make([]string, n*b.N) - for i := 0; i < n*b.N; i++ { - k[i] = strconv.Itoa(i) - } - a := make(map[string]int) - for i := 0; i < n*b.N; i++ { - a[k[i]] = i + i2s := make([]string, n) + for i := 0; i < n; i++ { + i2s[i] = strconv.Itoa(i) } + a := make(map[string]int, n) b.ResetTimer() - for i := 0; i < n*b.N; i = i + n { - delete(a, k[i]) + k := 0 + for i := 0; i < b.N; i++ { + if len(a) == 0 { + b.StopTimer() + for j := 0; j < n; j++ { + a[i2s[j]] = j + } + k = i + b.StartTimer() + } + delete(a, i2s[i-k]) } } @@ -705,7 +871,7 @@ func BenchmarkMapAssign(b *testing.B) { } func BenchmarkMapDelete(b *testing.B) { - b.Run("Int32", runWith(benchmarkMapDeleteInt32, 1, 2, 4)) - b.Run("Int64", runWith(benchmarkMapDeleteInt64, 1, 2, 4)) - b.Run("Str", runWith(benchmarkMapDeleteStr, 1, 2, 4)) + b.Run("Int32", runWith(benchmarkMapDeleteInt32, 100, 1000, 10000)) + b.Run("Int64", runWith(benchmarkMapDeleteInt64, 100, 1000, 10000)) + b.Run("Str", runWith(benchmarkMapDeleteStr, 100, 1000, 10000)) } diff --git a/src/runtime/mbarrier.go b/src/runtime/mbarrier.go index 3713c50c39..e28bdb8b8d 100644 --- a/src/runtime/mbarrier.go +++ b/src/runtime/mbarrier.go @@ -182,6 +182,8 @@ func gcmarkwb_m(slot *uintptr, ptr uintptr) { func writebarrierptr_prewrite1(dst *uintptr, src uintptr) { mp := acquirem() if mp.inwb || mp.dying > 0 { + // We explicitly allow write barriers 
in startpanic_m, + // since we're going down anyway. Ignore them here. releasem(mp) return } @@ -237,6 +239,10 @@ func writebarrierptr_prewrite(dst *uintptr, src uintptr) { // typedmemmove copies a value of type t to dst from src. // Must be nosplit, see #16026. +// +// TODO: Perfect for go:nosplitrec since we can't have a safe point +// anywhere in the bulk barrier or memmove. +// //go:nosplit func typedmemmove(typ *_type, dst, src unsafe.Pointer) { if typ.kind&kindNoPointers == 0 { @@ -258,8 +264,8 @@ func typedmemmove(typ *_type, dst, src unsafe.Pointer) { //go:linkname reflect_typedmemmove reflect.typedmemmove func reflect_typedmemmove(typ *_type, dst, src unsafe.Pointer) { if raceenabled { - raceWriteObjectPC(typ, dst, getcallerpc(unsafe.Pointer(&typ)), funcPC(reflect_typedmemmove)) - raceReadObjectPC(typ, src, getcallerpc(unsafe.Pointer(&typ)), funcPC(reflect_typedmemmove)) + raceWriteObjectPC(typ, dst, getcallerpc(), funcPC(reflect_typedmemmove)) + raceReadObjectPC(typ, src, getcallerpc(), funcPC(reflect_typedmemmove)) } if msanenabled { msanwrite(dst, typ.size) @@ -320,8 +326,12 @@ func typedslicecopy(typ *_type, dst, src slice) int { dstp := dst.array srcp := src.array + // The compiler emits calls to typedslicecopy before + // instrumentation runs, so unlike the other copying and + // assignment operations, it's not instrumented in the calling + // code and needs its own instrumentation. if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&typ)) + callerpc := getcallerpc() pc := funcPC(slicecopy) racewriterangepc(dstp, uintptr(n)*typ.size, callerpc, pc) racereadrangepc(srcp, uintptr(n)*typ.size, callerpc, pc) @@ -339,41 +349,13 @@ func typedslicecopy(typ *_type, dst, src slice) int { // compiler only emits calls to typedslicecopy for types with pointers, // and growslice and reflect_typedslicecopy check for pointers // before calling typedslicecopy. - if !writeBarrier.needed { - memmove(dstp, srcp, uintptr(n)*typ.size) - return n + size := uintptr(n) * typ.size + if writeBarrier.needed { + bulkBarrierPreWrite(uintptr(dstp), uintptr(srcp), size) } - - systemstack(func() { - if uintptr(srcp) < uintptr(dstp) && uintptr(srcp)+uintptr(n)*typ.size > uintptr(dstp) { - // Overlap with src before dst. - // Copy backward, being careful not to move dstp/srcp - // out of the array they point into. - dstp = add(dstp, uintptr(n-1)*typ.size) - srcp = add(srcp, uintptr(n-1)*typ.size) - i := 0 - for { - typedmemmove(typ, dstp, srcp) - if i++; i >= n { - break - } - dstp = add(dstp, -typ.size) - srcp = add(srcp, -typ.size) - } - } else { - // Copy forward, being careful not to move dstp/srcp - // out of the array they point into. - i := 0 - for { - typedmemmove(typ, dstp, srcp) - if i++; i >= n { - break - } - dstp = add(dstp, typ.size) - srcp = add(srcp, typ.size) - } - } - }) + // See typedmemmove for a discussion of the race between the + // barrier and memmove. 
+ memmove(dstp, srcp, size) return n } @@ -390,7 +372,7 @@ func reflect_typedslicecopy(elemType *_type, dst, src slice) int { size := uintptr(n) * elemType.size if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&elemType)) + callerpc := getcallerpc() pc := funcPC(reflect_typedslicecopy) racewriterangepc(dst.array, size, callerpc, pc) racereadrangepc(src.array, size, callerpc, pc) diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go index 2a9f1b83e5..6e2f12db15 100644 --- a/src/runtime/mbitmap.go +++ b/src/runtime/mbitmap.go @@ -449,11 +449,6 @@ func heapBitsForObject(p, refBase, refOff uintptr) (base uintptr, hbits heapBits return } -// prefetch the bits. -func (h heapBits) prefetch() { - prefetchnta(uintptr(unsafe.Pointer((h.bitp)))) -} - // next returns the heapBits describing the next pointer-sized word in memory. // That is, if h describes address p, h.next() describes p+ptrSize. // Note that next does not modify h. The caller must record the result. @@ -528,12 +523,13 @@ func (h heapBits) setCheckmarked(size uintptr) { atomic.Or8(h.bitp, bitScan<<(heapBitsShift+h.shift)) } -// bulkBarrierPreWrite executes writebarrierptr_prewrite1 +// bulkBarrierPreWrite executes a write barrier // for every pointer slot in the memory range [src, src+size), // using pointer/scalar information from [dst, dst+size). // This executes the write barriers necessary before a memmove. // src, dst, and size must be pointer-aligned. // The range [dst, dst+size) must lie within a single object. +// It does not perform the actual writes. // // As a special case, src == 0 indicates that this is being used for a // memclr. bulkBarrierPreWrite will pass 0 for the src of each write @@ -583,12 +579,15 @@ func bulkBarrierPreWrite(dst, src, size uintptr) { return } + buf := &getg().m.p.ptr().wbBuf h := heapBitsForAddr(dst) if src == 0 { for i := uintptr(0); i < size; i += sys.PtrSize { if h.isPointer() { dstx := (*uintptr)(unsafe.Pointer(dst + i)) - writebarrierptr_prewrite1(dstx, 0) + if !buf.putFast(*dstx, 0) { + wbBufFlush(nil, 0) + } } h = h.next() } @@ -597,7 +596,9 @@ func bulkBarrierPreWrite(dst, src, size uintptr) { if h.isPointer() { dstx := (*uintptr)(unsafe.Pointer(dst + i)) srcx := (*uintptr)(unsafe.Pointer(src + i)) - writebarrierptr_prewrite1(dstx, *srcx) + if !buf.putFast(*dstx, *srcx) { + wbBufFlush(nil, 0) + } } h = h.next() } @@ -617,6 +618,7 @@ func bulkBarrierBitmap(dst, src, size, maskOffset uintptr, bits *uint8) { bits = addb(bits, word/8) mask := uint8(1) << (word % 8) + buf := &getg().m.p.ptr().wbBuf for i := uintptr(0); i < size; i += sys.PtrSize { if mask == 0 { bits = addb(bits, 1) @@ -630,10 +632,14 @@ func bulkBarrierBitmap(dst, src, size, maskOffset uintptr, bits *uint8) { if *bits&mask != 0 { dstx := (*uintptr)(unsafe.Pointer(dst + i)) if src == 0 { - writebarrierptr_prewrite1(dstx, 0) + if !buf.putFast(*dstx, 0) { + wbBufFlush(nil, 0) + } } else { srcx := (*uintptr)(unsafe.Pointer(src + i)) - writebarrierptr_prewrite1(dstx, *srcx) + if !buf.putFast(*dstx, *srcx) { + wbBufFlush(nil, 0) + } } } mask <<= 1 diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go index 96fb273337..6c24650dac 100644 --- a/src/runtime/mcache.go +++ b/src/runtime/mcache.go @@ -104,7 +104,7 @@ func freemcache(c *mcache) { // Gets a span that has a free object in it and assigns it // to be the cached span for the given sizeclass. Returns this span. 
-func (c *mcache) refill(spc spanClass) *mspan { +func (c *mcache) refill(spc spanClass) { _g_ := getg() _g_.m.locks++ @@ -131,7 +131,6 @@ func (c *mcache) refill(spc spanClass) *mspan { c.alloc[spc] = s _g_.m.locks-- - return s } func (c *mcache) releaseAll() { diff --git a/src/runtime/mem_bsd.go b/src/runtime/mem_bsd.go index e0d234715f..23872b9a63 100644 --- a/src/runtime/mem_bsd.go +++ b/src/runtime/mem_bsd.go @@ -15,8 +15,8 @@ import ( // which prevents us from allocating more stack. //go:nosplit func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { - v := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) - if uintptr(v) < 4096 { + v, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) + if err != 0 { return nil } mSysStatInc(sysStat, n) @@ -51,8 +51,8 @@ func sysReserve(v unsafe.Pointer, n uintptr, reserved *bool) unsafe.Pointer { return v } - p := mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, -1, 0) - if uintptr(p) < 4096 { + p, err := mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, -1, 0) + if err != 0 { return nil } *reserved = true @@ -76,22 +76,22 @@ func sysMap(v unsafe.Pointer, n uintptr, reserved bool, sysStat *uint64) { // to do this - we do not on other platforms. flags |= _MAP_FIXED } - p := mmap(v, n, _PROT_READ|_PROT_WRITE, flags, -1, 0) - if uintptr(p) == _ENOMEM || (GOOS == "solaris" && uintptr(p) == _sunosEAGAIN) { + p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, flags, -1, 0) + if err == _ENOMEM || (GOOS == "solaris" && err == _sunosEAGAIN) { throw("runtime: out of memory") } - if p != v { - print("runtime: address space conflict: map(", v, ") = ", p, "\n") + if p != v || err != 0 { + print("runtime: address space conflict: map(", v, ") = ", p, "(err ", err, ")\n") throw("runtime: address space conflict") } return } - p := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0) - if uintptr(p) == _ENOMEM || (GOOS == "solaris" && uintptr(p) == _sunosEAGAIN) { + p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0) + if err == _ENOMEM || (GOOS == "solaris" && err == _sunosEAGAIN) { throw("runtime: out of memory") } - if p != v { + if p != v || err != 0 { throw("runtime: cannot map pages in arena address space") } } diff --git a/src/runtime/mem_darwin.go b/src/runtime/mem_darwin.go index 3f1c4d76f3..e41452a2c0 100644 --- a/src/runtime/mem_darwin.go +++ b/src/runtime/mem_darwin.go @@ -10,8 +10,8 @@ import "unsafe" // which prevents us from allocating more stack. 
//go:nosplit func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { - v := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) - if uintptr(v) < 4096 { + v, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) + if err != 0 { return nil } mSysStatInc(sysStat, n) @@ -40,8 +40,8 @@ func sysFault(v unsafe.Pointer, n uintptr) { func sysReserve(v unsafe.Pointer, n uintptr, reserved *bool) unsafe.Pointer { *reserved = true - p := mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, -1, 0) - if uintptr(p) < 4096 { + p, err := mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, -1, 0) + if err != 0 { return nil } return p @@ -53,11 +53,11 @@ const ( func sysMap(v unsafe.Pointer, n uintptr, reserved bool, sysStat *uint64) { mSysStatInc(sysStat, n) - p := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0) - if uintptr(p) == _ENOMEM { + p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0) + if err == _ENOMEM { throw("runtime: out of memory") } - if p != v { + if p != v || err != 0 { throw("runtime: cannot map pages in arena address space") } } diff --git a/src/runtime/mem_linux.go b/src/runtime/mem_linux.go index 094658de51..16f44439f1 100644 --- a/src/runtime/mem_linux.go +++ b/src/runtime/mem_linux.go @@ -41,30 +41,30 @@ func addrspace_free(v unsafe.Pointer, n uintptr) bool { return true } -func mmap_fixed(v unsafe.Pointer, n uintptr, prot, flags, fd int32, offset uint32) unsafe.Pointer { - p := mmap(v, n, prot, flags, fd, offset) +func mmap_fixed(v unsafe.Pointer, n uintptr, prot, flags, fd int32, offset uint32) (unsafe.Pointer, int) { + p, err := mmap(v, n, prot, flags, fd, offset) // On some systems, mmap ignores v without // MAP_FIXED, so retry if the address space is free. if p != v && addrspace_free(v, n) { - if uintptr(p) > 4096 { + if err == 0 { munmap(p, n) } - p = mmap(v, n, prot, flags|_MAP_FIXED, fd, offset) + p, err = mmap(v, n, prot, flags|_MAP_FIXED, fd, offset) } - return p + return p, err } // Don't split the stack as this method may be invoked without a valid G, which // prevents us from allocating more stack. //go:nosplit func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { - p := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) - if uintptr(p) < 4096 { - if uintptr(p) == _EACCES { + p, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) + if err != 0 { + if err == _EACCES { print("runtime: mmap: access denied\n") exit(2) } - if uintptr(p) == _EAGAIN { + if err == _EAGAIN { print("runtime: mmap: too much locked memory (check 'ulimit -l').\n") exit(2) } @@ -186,9 +186,9 @@ func sysReserve(v unsafe.Pointer, n uintptr, reserved *bool) unsafe.Pointer { // if we can reserve at least 64K and check the assumption in SysMap. // Only user-mode Linux (UML) rejects these requests. 
if sys.PtrSize == 8 && uint64(n) > 1<<32 { - p := mmap_fixed(v, 64<<10, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, -1, 0) - if p != v { - if uintptr(p) >= 4096 { + p, err := mmap_fixed(v, 64<<10, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, -1, 0) + if p != v || err != 0 { + if err == 0 { munmap(p, 64<<10) } return nil @@ -198,8 +198,8 @@ func sysReserve(v unsafe.Pointer, n uintptr, reserved *bool) unsafe.Pointer { return v } - p := mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, -1, 0) - if uintptr(p) < 4096 { + p, err := mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, -1, 0) + if err != 0 { return nil } *reserved = true @@ -211,22 +211,22 @@ func sysMap(v unsafe.Pointer, n uintptr, reserved bool, sysStat *uint64) { // On 64-bit, we don't actually have v reserved, so tread carefully. if !reserved { - p := mmap_fixed(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) - if uintptr(p) == _ENOMEM { + p, err := mmap_fixed(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) + if err == _ENOMEM { throw("runtime: out of memory") } - if p != v { - print("runtime: address space conflict: map(", v, ") = ", p, "\n") + if p != v || err != 0 { + print("runtime: address space conflict: map(", v, ") = ", p, " (err ", err, ")\n") throw("runtime: address space conflict") } return } - p := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0) - if uintptr(p) == _ENOMEM { + p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0) + if err == _ENOMEM { throw("runtime: out of memory") } - if p != v { + if p != v || err != 0 { throw("runtime: cannot map pages in arena address space") } } diff --git a/src/runtime/memclr_amd64p32.s b/src/runtime/memclr_amd64p32.s new file mode 100644 index 0000000000..26171bfd4a --- /dev/null +++ b/src/runtime/memclr_amd64p32.s @@ -0,0 +1,23 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT,$0-8 + MOVL ptr+0(FP), DI + MOVL n+4(FP), CX + MOVQ CX, BX + ANDQ $3, BX + SHRQ $2, CX + MOVQ $0, AX + CLD + REP + STOSL + MOVQ BX, CX + REP + STOSB + // Note: we zero only 4 bytes at a time so that the tail is at most + // 3 bytes. That guarantees that we aren't zeroing pointers with STOSB. + // See issue 13160. + RET diff --git a/src/runtime/memclr_arm64.s b/src/runtime/memclr_arm64.s index 9d756bcf6d..bf954e047f 100644 --- a/src/runtime/memclr_arm64.s +++ b/src/runtime/memclr_arm64.s @@ -6,32 +6,54 @@ // void runtime·memclrNoHeapPointers(void*, uintptr) TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT,$0-16 - MOVD ptr+0(FP), R3 - MOVD n+8(FP), R4 - // TODO(mwhudson): this is written this way to avoid tickling - // warnings from addpool when written as AND $7, R4, R6 (see - // https://golang.org/issue/12708) - AND $~7, R4, R5 // R5 is N&~7 - SUB R5, R4, R6 // R6 is N&7 + MOVD ptr+0(FP), R0 + MOVD n+8(FP), R1 + // If size is less than 16 bytes, use tail_zero to zero what remains + CMP $16, R1 + BLT tail_zero + // Get buffer offset into 16 byte aligned address for better performance + ANDS $15, R0, ZR + BNE unaligned_to_16 +aligned_to_16: + LSR $4, R1, R2 +zero_by_16: + STP.P (ZR, ZR), 16(R0) + SUBS $1, R2, R2 + BNE zero_by_16 - CMP $0, R5 - BEQ nowords + ANDS $15, R1, R1 + BEQ ending - ADD R3, R5, R5 + // Zero buffer with size=R1 < 16 +tail_zero: + TBZ $3, R1, tail_zero_4 + MOVD.P ZR, 8(R0) -wordloop: // TODO: Optimize for unaligned ptr. 
- MOVD.P $0, 8(R3) - CMP R3, R5 - BNE wordloop -nowords: - CMP $0, R6 - BEQ done +tail_zero_4: + TBZ $2, R1, tail_zero_2 + MOVW.P ZR, 4(R0) - ADD R3, R6, R6 +tail_zero_2: + TBZ $1, R1, tail_zero_1 + MOVH.P ZR, 2(R0) -byteloop: - MOVBU.P $0, 1(R3) - CMP R3, R6 - BNE byteloop -done: +tail_zero_1: + TBZ $0, R1, ending + MOVB ZR, (R0) + +ending: RET + +unaligned_to_16: + MOVD R0, R2 +head_loop: + MOVBU.P ZR, 1(R0) + ANDS $15, R0, ZR + BNE head_loop + // Adjust length for what remains + SUB R2, R0, R3 + SUB R3, R1 + // If size is less than 16 bytes, use tail_zero to zero what remains + CMP $16, R1 + BLT tail_zero + B aligned_to_16 diff --git a/src/runtime/memmove_nacl_amd64p32.s b/src/runtime/memmove_amd64p32.s index 13907a90b2..13907a90b2 100644 --- a/src/runtime/memmove_nacl_amd64p32.s +++ b/src/runtime/memmove_amd64p32.s diff --git a/src/runtime/memmove_test.go b/src/runtime/memmove_test.go index 74b8753b5f..62de604e69 100644 --- a/src/runtime/memmove_test.go +++ b/src/runtime/memmove_test.go @@ -9,6 +9,7 @@ import ( "encoding/binary" "fmt" "internal/race" + "internal/testenv" . "runtime" "testing" ) @@ -88,6 +89,10 @@ func TestMemmoveAlias(t *testing.T) { } func TestMemmoveLarge0x180000(t *testing.T) { + if testing.Short() && testenv.Builder() == "" { + t.Skip("-short") + } + t.Parallel() if race.Enabled { t.Skip("skipping large memmove test under race detector") @@ -96,6 +101,10 @@ func TestMemmoveLarge0x180000(t *testing.T) { } func TestMemmoveOverlapLarge0x120000(t *testing.T) { + if testing.Short() && testenv.Builder() == "" { + t.Skip("-short") + } + t.Parallel() if race.Enabled { t.Skip("skipping large memmove test under race detector") diff --git a/src/runtime/mfinal.go b/src/runtime/mfinal.go index a8729b1aa4..c11a6f15a4 100644 --- a/src/runtime/mfinal.go +++ b/src/runtime/mfinal.go @@ -461,11 +461,7 @@ func findObject(v unsafe.Pointer) (s *mspan, x unsafe.Pointer, n uintptr) { return } -// Mark KeepAlive as noinline so that the current compiler will ensure -// that the argument is alive at the point of the function call. -// If it were inlined, it would disappear, and there would be nothing -// keeping the argument alive. Perhaps a future compiler will recognize -// runtime.KeepAlive specially and do something more efficient. +// Mark KeepAlive as noinline so that it is easily detectable as an intrinsic. //go:noinline // KeepAlive marks its argument as currently reachable. @@ -487,4 +483,11 @@ func findObject(v unsafe.Pointer) (s *mspan, x unsafe.Pointer, n uintptr) { // Without the KeepAlive call, the finalizer could run at the start of // syscall.Read, closing the file descriptor before syscall.Read makes // the actual system call. -func KeepAlive(interface{}) {} +func KeepAlive(x interface{}) { + // Introduce a use of x that the compiler can't eliminate. + // This makes sure x is alive on entry. We need x to be alive + // on entry for "defer runtime.KeepAlive(x)"; see issue 21402. + if cgoAlwaysFalse { + println(x) + } +} diff --git a/src/runtime/mfinal_test.go b/src/runtime/mfinal_test.go index e9e3601de6..3ca8d31c60 100644 --- a/src/runtime/mfinal_test.go +++ b/src/runtime/mfinal_test.go @@ -241,3 +241,24 @@ var ( Foo2 = &Object2{} Foo1 = &Object1{} ) + +func TestDeferKeepAlive(t *testing.T) { + if *flagQuick { + t.Skip("-quick") + } + + // See issue 21402. + t.Parallel() + type T *int // needs to be a pointer base type to avoid tinyalloc and its never-finalized behavior. 
+ x := new(T) + finRun := false + runtime.SetFinalizer(x, func(x *T) { + finRun = true + }) + defer runtime.KeepAlive(x) + runtime.GC() + time.Sleep(time.Second) + if finRun { + t.Errorf("finalizer ran prematurely") + } +} diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go index b708720322..ab90c289a5 100644 --- a/src/runtime/mgc.go +++ b/src/runtime/mgc.go @@ -230,6 +230,24 @@ func setGCPercent(in int32) (out int32) { // Update pacing in response to gcpercent change. gcSetTriggerRatio(memstats.triggerRatio) unlock(&mheap_.lock) + + // If we just disabled GC, wait for any concurrent GC to + // finish so we always return with no GC running. + if in < 0 { + // Disable phase transitions. + lock(&work.sweepWaiters.lock) + if gcphase == _GCmark { + // GC is active. Wait until we reach sweeping. + gp := getg() + gp.schedlink = work.sweepWaiters.head + work.sweepWaiters.head.set(gp) + goparkunlock(&work.sweepWaiters.lock, "wait for GC cycle", traceEvGoBlock, 1) + } else { + // GC isn't active. + unlock(&work.sweepWaiters.lock) + } + } + return out } @@ -299,10 +317,10 @@ const ( // gcMarkWorkerFractionalMode indicates that a P is currently // running the "fractional" mark worker. The fractional worker - // is necessary when GOMAXPROCS*gcGoalUtilization is not an - // integer. The fractional worker should run until it is + // is necessary when GOMAXPROCS*gcBackgroundUtilization is not + // an integer. The fractional worker should run until it is // preempted and will be scheduled to pick up the fractional - // part of GOMAXPROCS*gcGoalUtilization. + // part of GOMAXPROCS*gcBackgroundUtilization. gcMarkWorkerFractionalMode // gcMarkWorkerIdleMode indicates that a P is running the mark @@ -396,23 +414,18 @@ type gcControllerState struct { assistBytesPerWork float64 // fractionalUtilizationGoal is the fraction of wall clock - // time that should be spent in the fractional mark worker. - // For example, if the overall mark utilization goal is 25% - // and GOMAXPROCS is 6, one P will be a dedicated mark worker - // and this will be set to 0.5 so that 50% of the time some P - // is in a fractional mark worker. This is computed at the - // beginning of each cycle. + // time that should be spent in the fractional mark worker on + // each P that isn't running a dedicated worker. + // + // For example, if the utilization goal is 25% and there are + // no dedicated workers, this will be 0.25. If there goal is + // 25%, there is one dedicated worker, and GOMAXPROCS is 5, + // this will be 0.05 to make up the missing 5%. + // + // If this is zero, no fractional workers are needed. fractionalUtilizationGoal float64 _ [sys.CacheLineSize]byte - - // fractionalMarkWorkersNeeded is the number of fractional - // mark workers that need to be started. This is either 0 or - // 1. This is potentially updated atomically at every - // scheduling point (hence it gets its own cache line). - fractionalMarkWorkersNeeded int64 - - _ [sys.CacheLineSize]byte } // startCycle resets the GC controller's state and computes estimates @@ -453,23 +466,33 @@ func (c *gcControllerState) startCycle() { memstats.next_gc = memstats.heap_live + 1024*1024 } - // Compute the total mark utilization goal and divide it among - // dedicated and fractional workers. 
- totalUtilizationGoal := float64(gomaxprocs) * gcGoalUtilization - c.dedicatedMarkWorkersNeeded = int64(totalUtilizationGoal) - c.fractionalUtilizationGoal = totalUtilizationGoal - float64(c.dedicatedMarkWorkersNeeded) - if c.fractionalUtilizationGoal > 0 { - c.fractionalMarkWorkersNeeded = 1 + // Compute the background mark utilization goal. In general, + // this may not come out exactly. We round the number of + // dedicated workers so that the utilization is closest to + // 25%. For small GOMAXPROCS, this would introduce too much + // error, so we add fractional workers in that case. + totalUtilizationGoal := float64(gomaxprocs) * gcBackgroundUtilization + c.dedicatedMarkWorkersNeeded = int64(totalUtilizationGoal + 0.5) + utilError := float64(c.dedicatedMarkWorkersNeeded)/totalUtilizationGoal - 1 + const maxUtilError = 0.3 + if utilError < -maxUtilError || utilError > maxUtilError { + // Rounding put us more than 30% off our goal. With + // gcBackgroundUtilization of 25%, this happens for + // GOMAXPROCS<=3 or GOMAXPROCS=6. Enable fractional + // workers to compensate. + if float64(c.dedicatedMarkWorkersNeeded) > totalUtilizationGoal { + // Too many dedicated workers. + c.dedicatedMarkWorkersNeeded-- + } + c.fractionalUtilizationGoal = (totalUtilizationGoal - float64(c.dedicatedMarkWorkersNeeded)) / float64(gomaxprocs) } else { - c.fractionalMarkWorkersNeeded = 0 + c.fractionalUtilizationGoal = 0 } // Clear per-P state - for _, p := range &allp { - if p == nil { - break - } + for _, p := range allp { p.gcAssistTime = 0 + p.gcFractionalMarkTime = 0 } // Compute initial values for controls that are updated @@ -482,7 +505,7 @@ func (c *gcControllerState) startCycle() { work.initialHeapLive>>20, "->", memstats.next_gc>>20, " MB)", " workers=", c.dedicatedMarkWorkersNeeded, - "+", c.fractionalMarkWorkersNeeded, "\n") + "+", c.fractionalUtilizationGoal, "\n") } } @@ -495,47 +518,73 @@ func (c *gcControllerState) startCycle() { // is when assists are enabled and the necessary statistics are // available). func (c *gcControllerState) revise() { - // Compute the expected scan work remaining. + gcpercent := gcpercent + if gcpercent < 0 { + // If GC is disabled but we're running a forced GC, + // act like GOGC is huge for the below calculations. + gcpercent = 100000 + } + live := atomic.Load64(&memstats.heap_live) + + var heapGoal, scanWorkExpected int64 + if live <= memstats.next_gc { + // We're under the soft goal. Pace GC to complete at + // next_gc assuming the heap is in steady-state. + heapGoal = int64(memstats.next_gc) + + // Compute the expected scan work remaining. + // + // This is estimated based on the expected + // steady-state scannable heap. For example, with + // GOGC=100, only half of the scannable heap is + // expected to be live, so that's what we target. + // + // (This is a float calculation to avoid overflowing on + // 100*heap_scan.) + scanWorkExpected = int64(float64(memstats.heap_scan) * 100 / float64(100+gcpercent)) + } else { + // We're past the soft goal. Pace GC so that in the + // worst case it will complete by the hard goal. + const maxOvershoot = 1.1 + heapGoal = int64(float64(memstats.next_gc) * maxOvershoot) + + // Compute the upper bound on the scan work remaining. + scanWorkExpected = int64(memstats.heap_scan) + } + + // Compute the remaining scan work estimate. // // Note that we currently count allocations during GC as both // scannable heap (heap_scan) and scan work completed - // (scanWork), so this difference won't be changed by - // allocations during GC. 
- // - // This particular estimate is a strict upper bound on the - // possible remaining scan work for the current heap. - // You might consider dividing this by 2 (or by - // (100+GOGC)/100) to counter this over-estimation, but - // benchmarks show that this has almost no effect on mean - // mutator utilization, heap size, or assist time and it - // introduces the danger of under-estimating and letting the - // mutator outpace the garbage collector. - scanWorkExpected := int64(memstats.heap_scan) - c.scanWork - if scanWorkExpected < 1000 { + // (scanWork), so allocation will change this difference will + // slowly in the soft regime and not at all in the hard + // regime. + scanWorkRemaining := scanWorkExpected - c.scanWork + if scanWorkRemaining < 1000 { // We set a somewhat arbitrary lower bound on // remaining scan work since if we aim a little high, // we can miss by a little. // // We *do* need to enforce that this is at least 1, // since marking is racy and double-scanning objects - // may legitimately make the expected scan work - // negative. - scanWorkExpected = 1000 + // may legitimately make the remaining scan work + // negative, even in the hard goal regime. + scanWorkRemaining = 1000 } // Compute the heap distance remaining. - heapDistance := int64(memstats.next_gc) - int64(atomic.Load64(&memstats.heap_live)) - if heapDistance <= 0 { + heapRemaining := heapGoal - int64(live) + if heapRemaining <= 0 { // This shouldn't happen, but if it does, avoid // dividing by zero or setting the assist negative. - heapDistance = 1 + heapRemaining = 1 } // Compute the mutator assist ratio so by the time the mutator // allocates the remaining heap bytes up to next_gc, it will // have done (or stolen) the remaining amount of scan work. - c.assistWorkPerByte = float64(scanWorkExpected) / float64(heapDistance) - c.assistBytesPerWork = float64(heapDistance) / float64(scanWorkExpected) + c.assistWorkPerByte = float64(scanWorkRemaining) / float64(heapRemaining) + c.assistBytesPerWork = float64(heapRemaining) / float64(scanWorkRemaining) } // endCycle computes the trigger ratio for the next cycle. @@ -569,7 +618,7 @@ func (c *gcControllerState) endCycle() float64 { assistDuration := nanotime() - c.markStartTime // Assume background mark hit its utilization goal. - utilization := gcGoalUtilization + utilization := gcBackgroundUtilization // Add assist utilization; avoid divide by zero. if assistDuration > 0 { utilization += float64(c.assistTime) / float64(assistDuration*int64(gomaxprocs)) @@ -688,51 +737,20 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g { // This P is now dedicated to marking until the end of // the concurrent mark phase. _p_.gcMarkWorkerMode = gcMarkWorkerDedicatedMode + } else if c.fractionalUtilizationGoal == 0 { + // No need for fractional workers. + return nil } else { - if !decIfPositive(&c.fractionalMarkWorkersNeeded) { - // No more workers are need right now. - return nil - } - - // This P has picked the token for the fractional worker. - // Is the GC currently under or at the utilization goal? - // If so, do more work. - // - // We used to check whether doing one time slice of work - // would remain under the utilization goal, but that has the - // effect of delaying work until the mutator has run for - // enough time slices to pay for the work. During those time - // slices, write barriers are enabled, so the mutator is running slower. 
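The soft/hard goal split in revise above boils down to two inputs: how much scan work is believed to remain and how much heap headroom is left before the goal. The following standalone sketch (paceAssists is an invented name; the figures in main are hypothetical) walks one example of each regime.

package main

import "fmt"

// paceAssists is an illustrative reduction of the new revise logic:
// under the soft goal (next_gc) we target the steady-state live heap;
// past it we allow at most 10% overshoot and assume all remaining
// scannable heap must be scanned.
func paceAssists(live, nextGC, heapScan, scanDone, gcpercent int64) float64 {
	var heapGoal, scanWorkExpected int64
	if live <= nextGC {
		heapGoal = nextGC
		scanWorkExpected = int64(float64(heapScan) * 100 / float64(100+gcpercent))
	} else {
		heapGoal = int64(float64(nextGC) * 1.1)
		scanWorkExpected = heapScan
	}
	scanWorkRemaining := scanWorkExpected - scanDone
	if scanWorkRemaining < 1000 {
		scanWorkRemaining = 1000
	}
	heapRemaining := heapGoal - live
	if heapRemaining <= 0 {
		heapRemaining = 1
	}
	return float64(scanWorkRemaining) / float64(heapRemaining)
}

func main() {
	// Hypothetical numbers: 90 MB live, 100 MB goal, 60 MB scannable, 10 MB scanned.
	fmt.Println("under soft goal:", paceAssists(90<<20, 100<<20, 60<<20, 10<<20, 100))
	// Past the soft goal: 105 MB live against the same 100 MB next_gc.
	fmt.Println("past soft goal: ", paceAssists(105<<20, 100<<20, 60<<20, 10<<20, 100))
}

Past the soft goal the headroom shrinks while the expected remaining scan work grows, so the assist ratio rises sharply; that is the intended backpressure.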
- // Now instead we do the work whenever we're under or at the - // utilization work and pay for it by letting the mutator run later. - // This doesn't change the overall utilization averages, but it - // front loads the GC work so that the GC finishes earlier and - // write barriers can be turned off sooner, effectively giving - // the mutator a faster machine. + // Is this P behind on the fractional utilization + // goal? // - // The old, slower behavior can be restored by setting - // gcForcePreemptNS = forcePreemptNS. - const gcForcePreemptNS = 0 - - // TODO(austin): We could fast path this and basically - // eliminate contention on c.fractionalMarkWorkersNeeded by - // precomputing the minimum time at which it's worth - // next scheduling the fractional worker. Then Ps - // don't have to fight in the window where we've - // passed that deadline and no one has started the - // worker yet. - // - // TODO(austin): Shorter preemption interval for mark - // worker to improve fairness and give this - // finer-grained control over schedule? - now := nanotime() - gcController.markStartTime - then := now + gcForcePreemptNS - timeUsed := c.fractionalMarkTime + gcForcePreemptNS - if then > 0 && float64(timeUsed)/float64(then) > c.fractionalUtilizationGoal { - // Nope, we'd overshoot the utilization goal - atomic.Xaddint64(&c.fractionalMarkWorkersNeeded, +1) + // This should be kept in sync with pollFractionalWorkerExit. + delta := nanotime() - gcController.markStartTime + if delta > 0 && float64(_p_.gcFractionalMarkTime)/float64(delta) > c.fractionalUtilizationGoal { + // Nope. No need to run a fractional worker. return nil } + // Run a fractional worker. _p_.gcMarkWorkerMode = gcMarkWorkerFractionalMode } @@ -745,6 +763,24 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g { return gp } +// pollFractionalWorkerExit returns true if a fractional mark worker +// should self-preempt. It assumes it is called from the fractional +// worker. +func pollFractionalWorkerExit() bool { + // This should be kept in sync with the fractional worker + // scheduler logic in findRunnableGCWorker. + now := nanotime() + delta := now - gcController.markStartTime + if delta <= 0 { + return true + } + p := getg().m.p.ptr() + selfTime := p.gcFractionalMarkTime + (now - p.gcMarkWorkerStartTime) + // Add some slack to the utilization goal so that the + // fractional worker isn't behind again the instant it exits. + return float64(selfTime)/float64(delta) > 1.2*gcController.fractionalUtilizationGoal +} + // gcSetTriggerRatio sets the trigger ratio and updates everything // derived from it: the absolute trigger, the heap goal, mark pacing, // and sweep pacing. @@ -859,9 +895,22 @@ func gcSetTriggerRatio(triggerRatio float64) { } } -// gcGoalUtilization is the goal CPU utilization for background +// gcGoalUtilization is the goal CPU utilization for // marking as a fraction of GOMAXPROCS. -const gcGoalUtilization = 0.25 +const gcGoalUtilization = 0.30 + +// gcBackgroundUtilization is the fixed CPU utilization for background +// marking. It must be <= gcGoalUtilization. The difference between +// gcGoalUtilization and gcBackgroundUtilization will be made up by +// mark assists. The scheduler will aim to use within 50% of this +// goal. +// +// Setting this to < gcGoalUtilization avoids saturating the trigger +// feedback controller when there are no assists, which allows it to +// better control CPU and heap growth. 
However, the larger the gap, +// the more mutator assists are expected to happen, which impact +// mutator latency. +const gcBackgroundUtilization = 0.25 // gcCreditSlack is the amount of scan work credit that can can // accumulate locally before updating gcController.scanWork and, @@ -1238,7 +1287,7 @@ func gcStart(mode gcMode, trigger gcTrigger) { } } - // Ok, we're doing it! Stop everybody else + // Ok, we're doing it! Stop everybody else semacquire(&worldsema) if trace.enabled { @@ -1251,7 +1300,12 @@ func gcStart(mode gcMode, trigger gcTrigger) { gcResetMarkState() - work.stwprocs, work.maxprocs = gcprocs(), gomaxprocs + work.stwprocs, work.maxprocs = gomaxprocs, gomaxprocs + if work.stwprocs > ncpu { + // This is used to compute CPU time of the STW phases, + // so it can't be more than ncpu, even if GOMAXPROCS is. + work.stwprocs = ncpu + } work.heap0 = atomic.Load64(&memstats.heap_live) work.pauseNS = 0 work.mode = mode @@ -1259,6 +1313,9 @@ func gcStart(mode gcMode, trigger gcTrigger) { now := nanotime() work.tSweepTerm = now work.pauseStart = now + if trace.enabled { + traceGCSTWStart(1) + } systemstack(stopTheWorldWithSema) // Finish sweep before we start concurrent scan. systemstack(func() { @@ -1311,11 +1368,17 @@ func gcStart(mode gcMode, trigger gcTrigger) { gcController.markStartTime = now // Concurrent mark. - systemstack(startTheWorldWithSema) - now = nanotime() + systemstack(func() { + now = startTheWorldWithSema(trace.enabled) + }) work.pauseNS += now - work.pauseStart work.tMark = now } else { + if trace.enabled { + // Switch to mark termination STW. + traceGCSTWDone() + traceGCSTWStart(0) + } t := nanotime() work.tMark, work.tMarkTerm = t, t work.heapGoal = work.heap0 @@ -1358,7 +1421,8 @@ top: // TODO(austin): Should dedicated workers keep an eye on this // and exit gcDrain promptly? atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, -0xffffffff) - atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, -0xffffffff) + prevFractionalGoal := gcController.fractionalUtilizationGoal + gcController.fractionalUtilizationGoal = 0 if !gcBlackenPromptly { // Transition from mark 1 to mark 2. @@ -1385,6 +1449,7 @@ top: // workers have exited their loop so we can // start new mark 2 workers. forEachP(func(_p_ *p) { + wbBufFlush1(_p_) _p_.gcw.dispose() }) }) @@ -1401,7 +1466,7 @@ top: // Now we can start up mark 2 workers. atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 0xffffffff) - atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, 0xffffffff) + gcController.fractionalUtilizationGoal = prevFractionalGoal incnwait := atomic.Xadd(&work.nwait, +1) if incnwait == work.nproc && !gcMarkWorkAvailable(nil) { @@ -1416,6 +1481,9 @@ top: work.tMarkTerm = now work.pauseStart = now getg().m.preemptoff = "gcing" + if trace.enabled { + traceGCSTWStart(0) + } systemstack(stopTheWorldWithSema) // The gcphase is _GCmark, it will transition to _GCmarktermination // below. The important thing is that the wb remains active until @@ -1576,7 +1644,7 @@ func gcMarkTermination(nextTriggerRatio float64) { // so events don't leak into the wrong cycle. mProf_NextCycle() - systemstack(startTheWorldWithSema) + systemstack(func() { startTheWorldWithSema(true) }) // Flush the heap profile so we can start a new cycle next GC. // This is relatively expensive, so we don't do it with the @@ -1650,10 +1718,7 @@ func gcMarkTermination(nextTriggerRatio float64) { func gcBgMarkStartWorkers() { // Background marking is performed by per-P G's. Ensure that // each P has a background GC G. 
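The new split between gcGoalUtilization (0.30) and gcBackgroundUtilization (0.25) leaves roughly 5% of CPU to be claimed by assists, and the fractional worker introduced above self-preempts on a per-P time budget rather than a global token. A reduced sketch of that exit check follows (shouldExitFractionalWorker is an invented name; the goal value is only an example).

package main

import (
	"fmt"
	"time"
)

// Assumed illustrative value; the real goal lives in gcController and
// depends on GOMAXPROCS (see the computation earlier in this diff).
const fractionalUtilizationGoal = 0.05

// shouldExitFractionalWorker mirrors pollFractionalWorkerExit above: a
// fractional worker preempts itself once the fraction of wall-clock time
// this P has spent in fractional marking exceeds the goal, with 20%
// slack so it isn't behind again the instant it exits.
func shouldExitFractionalWorker(markStart, workerStart time.Time, prevFractionalMark time.Duration) bool {
	delta := time.Since(markStart)
	if delta <= 0 {
		return true
	}
	selfTime := prevFractionalMark + time.Since(workerStart)
	return float64(selfTime)/float64(delta) > 1.2*fractionalUtilizationGoal
}

func main() {
	markStart := time.Now().Add(-100 * time.Millisecond)
	workerStart := time.Now().Add(-2 * time.Millisecond)
	fmt.Println(shouldExitFractionalWorker(markStart, workerStart, 5*time.Millisecond))
}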
- for _, p := range &allp { - if p == nil || p.status == _Pdead { - break - } + for _, p := range allp { if p.gcBgMarkWorker == 0 { go gcBgMarkWorker(p) notetsleepg(&work.bgMarkReady, -1) @@ -1753,6 +1818,7 @@ func gcBgMarkWorker(_p_ *p) { } startTime := nanotime() + _p_.gcMarkWorkerStartTime = startTime decnwait := atomic.Xadd(&work.nwait, -1) if decnwait == work.nproc { @@ -1794,7 +1860,7 @@ func gcBgMarkWorker(_p_ *p) { // without preemption. gcDrain(&_p_.gcw, gcDrainNoBlock|gcDrainFlushBgCredit) case gcMarkWorkerFractionalMode: - gcDrain(&_p_.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit) + gcDrain(&_p_.gcw, gcDrainFractional|gcDrainUntilPreempt|gcDrainFlushBgCredit) case gcMarkWorkerIdleMode: gcDrain(&_p_.gcw, gcDrainIdle|gcDrainUntilPreempt|gcDrainFlushBgCredit) } @@ -1819,7 +1885,7 @@ func gcBgMarkWorker(_p_ *p) { atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 1) case gcMarkWorkerFractionalMode: atomic.Xaddint64(&gcController.fractionalMarkTime, duration) - atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, 1) + atomic.Xaddint64(&_p_.gcFractionalMarkTime, duration) case gcMarkWorkerIdleMode: atomic.Xaddint64(&gcController.idleMarkTime, duration) } @@ -1917,10 +1983,6 @@ func gcMark(start_time int64) { work.helperDrainBlock = true } - if trace.enabled { - traceGCScanStart() - } - if work.nproc > 1 { noteclear(&work.alldone) helpgc(int32(work.nproc)) @@ -1954,8 +2016,8 @@ func gcMark(start_time int64) { // Double-check that all gcWork caches are empty. This should // be ensured by mark 2 before we enter mark termination. - for i := 0; i < int(gomaxprocs); i++ { - gcw := &allp[i].gcw + for _, p := range allp { + gcw := &p.gcw if !gcw.empty() { throw("P has cached GC work at end of mark termination") } @@ -1964,10 +2026,6 @@ func gcMark(start_time int64) { } } - if trace.enabled { - traceGCScanDone() - } - cachestats() // Update the marked heap stat. @@ -2097,18 +2155,19 @@ func clearpools() { unlock(&sched.deferlock) } -// Timing - -//go:nowritebarrier +// gchelper runs mark termination tasks on Ps other than the P +// coordinating mark termination. +// +// The caller is responsible for ensuring that this has a P to run on, +// even though it's running during STW. Because of this, it's allowed +// to have write barriers. +// +//go:yeswritebarrierrec func gchelper() { _g_ := getg() _g_.m.traceback = 2 gchelperstart() - if trace.enabled { - traceGCScanStart() - } - // Parallel mark over GC roots and heap if gcphase == _GCmarktermination { gcw := &_g_.m.p.ptr().gcw @@ -2120,10 +2179,6 @@ func gchelper() { gcw.dispose() } - if trace.enabled { - traceGCScanDone() - } - nproc := atomic.Load(&work.nproc) // work.nproc can change right after we increment work.ndone if atomic.Xadd(&work.ndone, +1) == nproc-1 { notewakeup(&work.alldone) @@ -2142,6 +2197,8 @@ func gchelperstart() { } } +// Timing + // itoaDiv formats val/(10**dec) into buf. func itoaDiv(buf []byte, val uint64, dec int) []byte { i := len(buf) - 1 diff --git a/src/runtime/mgclarge.go b/src/runtime/mgclarge.go index 757e88d1d9..fe437bf5e8 100644 --- a/src/runtime/mgclarge.go +++ b/src/runtime/mgclarge.go @@ -164,11 +164,10 @@ func (root *mTreap) insert(span *mspan) { } } -func (root *mTreap) removeNode(t *treapNode) *mspan { +func (root *mTreap) removeNode(t *treapNode) { if t.spanKey.npages != t.npagesKey { throw("span and treap node npages do not match") } - result := t.spanKey // Rotate t down to be leaf of tree for removal, respecting priorities. 
for t.right != nil || t.left != nil { @@ -192,7 +191,6 @@ func (root *mTreap) removeNode(t *treapNode) *mspan { t.spanKey = nil t.npagesKey = 0 mheap_.treapalloc.free(unsafe.Pointer(t)) - return result } // remove searches for, finds, removes from the treap, and returns the smallest diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go index 9029d19d43..5664390eae 100644 --- a/src/runtime/mgcmark.go +++ b/src/runtime/mgcmark.go @@ -34,13 +34,13 @@ const ( // span base. maxObletBytes = 128 << 10 - // idleCheckThreshold specifies how many units of work to do - // between run queue checks in an idle worker. Assuming a scan + // drainCheckThreshold specifies how many units of work to do + // between self-preemption checks in gcDrain. Assuming a scan // rate of 1 MB/ms, this is ~100 µs. Lower values have higher // overhead in the scan loop (the scheduler check may perform // a syscall, so its overhead is nontrivial). Higher values // make the system less responsive to incoming work. - idleCheckThreshold = 100000 + drainCheckThreshold = 100000 ) // gcMarkRootPrepare queues root scanning jobs (stacks, globals, and @@ -770,6 +770,13 @@ func scanstack(gp *g, gcw *gcWork) { shrinkstack(gp) } + // Scan the saved context register. This is effectively a live + // register that gets moved back and forth between the + // register and sched.ctxt without a write barrier. + if gp.sched.ctxt != nil { + scanblock(uintptr(unsafe.Pointer(&gp.sched.ctxt)), sys.PtrSize, &oneptrmask[0], gcw) + } + // Scan the stack. var cache pcvalueCache scanframe := func(frame *stkframe, unused unsafe.Pointer) bool { @@ -861,6 +868,7 @@ const ( gcDrainNoBlock gcDrainFlushBgCredit gcDrainIdle + gcDrainFractional // gcDrainBlock means neither gcDrainUntilPreempt or // gcDrainNoBlock. It is the default, but callers should use @@ -877,6 +885,10 @@ const ( // If flags&gcDrainIdle != 0, gcDrain returns when there is other work // to do. This implies gcDrainNoBlock. // +// If flags&gcDrainFractional != 0, gcDrain self-preempts when +// pollFractionalWorkerExit() returns true. This implies +// gcDrainNoBlock. +// // If flags&gcDrainNoBlock != 0, gcDrain returns as soon as it is // unable to get more work. Otherwise, it will block until all // blocking calls are blocked in gcDrain. @@ -893,14 +905,24 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) { gp := getg().m.curg preemptible := flags&gcDrainUntilPreempt != 0 - blocking := flags&(gcDrainUntilPreempt|gcDrainIdle|gcDrainNoBlock) == 0 + blocking := flags&(gcDrainUntilPreempt|gcDrainIdle|gcDrainFractional|gcDrainNoBlock) == 0 flushBgCredit := flags&gcDrainFlushBgCredit != 0 idle := flags&gcDrainIdle != 0 initScanWork := gcw.scanWork - // idleCheck is the scan work at which to perform the next - // idle check with the scheduler. - idleCheck := initScanWork + idleCheckThreshold + + // checkWork is the scan work before performing the next + // self-preempt check. + checkWork := int64(1<<63 - 1) + var check func() bool + if flags&(gcDrainIdle|gcDrainFractional) != 0 { + checkWork = initScanWork + drainCheckThreshold + if idle { + check = pollWork + } else if flags&gcDrainFractional != 0 { + check = pollFractionalWorkerExit + } + } // Drain root marking jobs. 
if work.markrootNext < work.markrootJobs { @@ -910,7 +932,7 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) { break } markroot(gcw, job) - if idle && pollWork() { + if check != nil && check() { goto done } } @@ -951,12 +973,12 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) { gcFlushBgCredit(gcw.scanWork - initScanWork) initScanWork = 0 } - idleCheck -= gcw.scanWork + checkWork -= gcw.scanWork gcw.scanWork = 0 - if idle && idleCheck <= 0 { - idleCheck += idleCheckThreshold - if pollWork() { + if checkWork <= 0 { + checkWork += drainCheckThreshold + if check != nil && check() { break } } @@ -1212,6 +1234,9 @@ func shade(b uintptr) { // obj is the start of an object with mark mbits. // If it isn't already marked, mark it and enqueue into gcw. // base and off are for debugging only and could be removed. +// +// See also wbBufFlush1, which partially duplicates this logic. +// //go:nowritebarrierrec func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork, objIndex uintptr) { // obj should be start of allocation, and so must be at least pointer-aligned. @@ -1356,10 +1381,7 @@ func gcmarknewobject(obj, size, scanSize uintptr) { // // The world must be stopped. func gcMarkTinyAllocs() { - for _, p := range &allp { - if p == nil || p.status == _Pdead { - break - } + for _, p := range allp { c := p.mcache if c == nil || c.tiny == 0 { continue diff --git a/src/runtime/mgcwork.go b/src/runtime/mgcwork.go index 461679b934..c6634fc78c 100644 --- a/src/runtime/mgcwork.go +++ b/src/runtime/mgcwork.go @@ -85,6 +85,13 @@ type gcWork struct { scanWork int64 } +// Most of the methods of gcWork are go:nowritebarrierrec because the +// write barrier itself can invoke gcWork methods but the methods are +// not generally re-entrant. Hence, if a gcWork method invoked the +// write barrier while the gcWork was in an inconsistent state, and +// the write barrier in turn invoked a gcWork method, it could +// permanently corrupt the gcWork. + func (w *gcWork) init() { w.wbuf1 = getempty() wbuf2 := trygetfull() @@ -96,7 +103,7 @@ func (w *gcWork) init() { // put enqueues a pointer for the garbage collector to trace. // obj must point to the beginning of a heap object or an oblet. -//go:nowritebarrier +//go:nowritebarrierrec func (w *gcWork) put(obj uintptr) { flushed := false wbuf := w.wbuf1 @@ -129,7 +136,7 @@ func (w *gcWork) put(obj uintptr) { // putFast does a put and returns true if it can be done quickly // otherwise it returns false and the caller needs to call put. -//go:nowritebarrier +//go:nowritebarrierrec func (w *gcWork) putFast(obj uintptr) bool { wbuf := w.wbuf1 if wbuf == nil { @@ -143,12 +150,45 @@ func (w *gcWork) putFast(obj uintptr) bool { return true } +// putBatch performs a put on every pointer in obj. See put for +// constraints on these pointers. +// +//go:nowritebarrierrec +func (w *gcWork) putBatch(obj []uintptr) { + if len(obj) == 0 { + return + } + + flushed := false + wbuf := w.wbuf1 + if wbuf == nil { + w.init() + wbuf = w.wbuf1 + } + + for len(obj) > 0 { + for wbuf.nobj == len(wbuf.obj) { + putfull(wbuf) + w.wbuf1, w.wbuf2 = w.wbuf2, getempty() + wbuf = w.wbuf1 + flushed = true + } + n := copy(wbuf.obj[wbuf.nobj:], obj) + wbuf.nobj += n + obj = obj[n:] + } + + if flushed && gcphase == _GCmark { + gcController.enlistWorker() + } +} + // tryGet dequeues a pointer for the garbage collector to trace. // // If there are no pointers remaining in this gcWork or in the global // queue, tryGet returns 0. 
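gcDrain now shares one checkWork counter between the idle and fractional self-preemption checks, as the hunk above shows. A toy model of that accounting (drain and the 30000-unit jobs are illustrative; only drainCheckThreshold matches a runtime value):

package main

import "fmt"

const drainCheckThreshold = 100000

// drain models the checkWork bookkeeping in gcDrain: every
// drainCheckThreshold units of scan work it calls check(), and stops
// early if check reports that the worker should yield.
func drain(jobs []int64, check func() bool) (done int64) {
	checkWork := int64(drainCheckThreshold)
	for _, w := range jobs {
		done += w
		checkWork -= w
		if checkWork <= 0 {
			checkWork += drainCheckThreshold
			if check != nil && check() {
				break
			}
		}
	}
	return done
}

func main() {
	jobs := make([]int64, 10)
	for i := range jobs {
		jobs[i] = 30000 // pretend each object is 30k units of scan work
	}
	calls := 0
	done := drain(jobs, func() bool {
		calls++
		return calls >= 2 // pretend the second poll asks us to stop
	})
	fmt.Println("scan work done:", done, "polls:", calls)
}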
Note that there may still be pointers in // other gcWork instances or other caches. -//go:nowritebarrier +//go:nowritebarrierrec func (w *gcWork) tryGet() uintptr { wbuf := w.wbuf1 if wbuf == nil { @@ -177,7 +217,7 @@ func (w *gcWork) tryGet() uintptr { // tryGetFast dequeues a pointer for the garbage collector to trace // if one is readily available. Otherwise it returns 0 and // the caller is expected to call tryGet(). -//go:nowritebarrier +//go:nowritebarrierrec func (w *gcWork) tryGetFast() uintptr { wbuf := w.wbuf1 if wbuf == nil { @@ -194,7 +234,7 @@ func (w *gcWork) tryGetFast() uintptr { // get dequeues a pointer for the garbage collector to trace, blocking // if necessary to ensure all pointers from all queues and caches have // been retrieved. get returns 0 if there are no pointers remaining. -//go:nowritebarrier +//go:nowritebarrierrec func (w *gcWork) get() uintptr { wbuf := w.wbuf1 if wbuf == nil { @@ -228,7 +268,7 @@ func (w *gcWork) get() uintptr { // GC can inspect them. This helps reduce the mutator's // ability to hide pointers during the concurrent mark phase. // -//go:nowritebarrier +//go:nowritebarrierrec func (w *gcWork) dispose() { if wbuf := w.wbuf1; wbuf != nil { if wbuf.nobj == 0 { @@ -262,7 +302,7 @@ func (w *gcWork) dispose() { // balance moves some work that's cached in this gcWork back on the // global queue. -//go:nowritebarrier +//go:nowritebarrierrec func (w *gcWork) balance() { if w.wbuf1 == nil { return @@ -282,7 +322,7 @@ func (w *gcWork) balance() { } // empty returns true if w has no mark work available. -//go:nowritebarrier +//go:nowritebarrierrec func (w *gcWork) empty() bool { return w.wbuf1 == nil || (w.wbuf1.nobj == 0 && w.wbuf2.nobj == 0) } diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 893587e5d2..12cf29a01d 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -56,6 +56,12 @@ type mheap struct { // Internal pages map to an arbitrary span. // For pages that have never been allocated, spans entries are nil. // + // Modifications are protected by mheap.lock. Reads can be + // performed without locking, but ONLY from indexes that are + // known to contain in-use or stack spans. This means there + // must not be a safe-point between establishing that an + // address is live and looking it up in the spans array. + // // This is backed by a reserved region of the address space so // it can grow without moving. The memory up to len(spans) is // mapped. cap(spans) indicates the total reserved memory. @@ -154,6 +160,8 @@ type mheap struct { specialfinalizeralloc fixalloc // allocator for specialfinalizer* specialprofilealloc fixalloc // allocator for specialprofile* speciallock mutex // lock for special record allocators. + + unused *specialfinalizer // never set, just here to force the specialfinalizer type into DWARF } var mheap_ mheap @@ -311,6 +319,17 @@ func (s *mspan) layout() (size, n, total uintptr) { return } +// recordspan adds a newly allocated span to h.allspans. +// +// This only happens the first time a span is allocated from +// mheap.spanalloc (it is not called when a span is reused). +// +// Write barriers are disallowed here because it can be called from +// gcWork when allocating new workbufs. However, because it's an +// indirect call from the fixalloc initializer, the compiler can't see +// this. 
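putBatch above is essentially the put loop unrolled over a slice: fill the current workbuf, hand full buffers off, and keep copying. A self-contained toy version (workbuf here is a 4-slot stand-in, not the runtime type, and "flushing" just appends to a slice):

package main

import "fmt"

const bufLen = 4

// workbuf is a toy fixed-size buffer standing in for the runtime's
// workbuf type.
type workbuf struct {
	obj  [bufLen]uintptr
	nobj int
}

type toyGCWork struct {
	cur     *workbuf
	flushed []*workbuf
}

// putBatch mirrors the copy loop added to gcWork.putBatch: whenever the
// current buffer is full, hand it off and start a fresh one, then copy
// as much of the batch as fits.
func (w *toyGCWork) putBatch(obj []uintptr) {
	if w.cur == nil {
		w.cur = new(workbuf)
	}
	for len(obj) > 0 {
		for w.cur.nobj == len(w.cur.obj) {
			w.flushed = append(w.flushed, w.cur)
			w.cur = new(workbuf)
		}
		n := copy(w.cur.obj[w.cur.nobj:], obj)
		w.cur.nobj += n
		obj = obj[n:]
	}
}

func main() {
	var w toyGCWork
	ptrs := make([]uintptr, 10)
	for i := range ptrs {
		ptrs[i] = uintptr(0x1000 + i)
	}
	w.putBatch(ptrs)
	fmt.Println("full buffers handed off:", len(w.flushed), "left in current:", w.cur.nobj)
}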
+// +//go:nowritebarrierrec func recordspan(vh unsafe.Pointer, p unsafe.Pointer) { h := (*mheap)(vh) s := (*mspan)(p) @@ -331,12 +350,13 @@ func recordspan(vh unsafe.Pointer, p unsafe.Pointer) { copy(new, h.allspans) } oldAllspans := h.allspans - h.allspans = new + *(*notInHeapSlice)(unsafe.Pointer(&h.allspans)) = *(*notInHeapSlice)(unsafe.Pointer(&new)) if len(oldAllspans) != 0 { sysFree(unsafe.Pointer(&oldAllspans[0]), uintptr(cap(oldAllspans))*unsafe.Sizeof(oldAllspans[0]), &memstats.other_sys) } } - h.allspans = append(h.allspans, s) + h.allspans = h.allspans[:len(h.allspans)+1] + h.allspans[len(h.allspans)-1] = s } // A spanClass represents the size class and noscan-ness of a span. @@ -857,7 +877,7 @@ HaveSpan: // Large spans have a minimum size of 1MByte. The maximum number of large spans to support // 1TBytes is 1 million, experimentation using random sizes indicates that the depth of // the tree is less that 2x that of a perfectly balanced tree. For 1TByte can be referenced -// by a perfectly balanced tree with a a depth of 20. Twice that is an acceptable 40. +// by a perfectly balanced tree with a depth of 20. Twice that is an acceptable 40. func (h *mheap) isLargeSpan(npages uintptr) bool { return npages >= uintptr(len(h.free)) } @@ -1123,34 +1143,35 @@ func scavengelist(list *mSpanList, now, limit uint64) uintptr { var sumreleased uintptr for s := list.first; s != nil; s = s.next { - if (now-uint64(s.unusedsince)) > limit && s.npreleased != s.npages { - start := s.base() - end := start + s.npages<<_PageShift - if physPageSize > _PageSize { - // We can only release pages in - // physPageSize blocks, so round start - // and end in. (Otherwise, madvise - // will round them *out* and release - // more memory than we want.) - start = (start + physPageSize - 1) &^ (physPageSize - 1) - end &^= physPageSize - 1 - if end <= start { - // start and end don't span a - // whole physical page. - continue - } - } - len := end - start - - released := len - (s.npreleased << _PageShift) - if physPageSize > _PageSize && released == 0 { + if (now-uint64(s.unusedsince)) <= limit || s.npreleased == s.npages { + continue + } + start := s.base() + end := start + s.npages<<_PageShift + if physPageSize > _PageSize { + // We can only release pages in + // physPageSize blocks, so round start + // and end in. (Otherwise, madvise + // will round them *out* and release + // more memory than we want.) + start = (start + physPageSize - 1) &^ (physPageSize - 1) + end &^= physPageSize - 1 + if end <= start { + // start and end don't span a + // whole physical page. 
continue } - memstats.heap_released += uint64(released) - sumreleased += released - s.npreleased = len >> _PageShift - sysUnused(unsafe.Pointer(start), len) } + len := end - start + + released := len - (s.npreleased << _PageShift) + if physPageSize > _PageSize && released == 0 { + continue + } + memstats.heap_released += uint64(released) + sumreleased += released + s.npreleased = len >> _PageShift + sysUnused(unsafe.Pointer(start), len) } return sumreleased } diff --git a/src/runtime/mkduff.go b/src/runtime/mkduff.go index d15f1f7346..fb7cbc28fd 100644 --- a/src/runtime/mkduff.go +++ b/src/runtime/mkduff.go @@ -70,7 +70,7 @@ func zeroAMD64(w io.Writer) { fmt.Fprintln(w, "\tMOVUPS\tX0,16(DI)") fmt.Fprintln(w, "\tMOVUPS\tX0,32(DI)") fmt.Fprintln(w, "\tMOVUPS\tX0,48(DI)") - fmt.Fprintln(w, "\tADDQ\t$64,DI") + fmt.Fprintln(w, "\tLEAQ\t64(DI),DI") // We use lea instead of add, to avoid clobbering flags fmt.Fprintln(w) } fmt.Fprintln(w, "\tRET") @@ -151,12 +151,13 @@ func copyARM(w io.Writer) { func zeroARM64(w io.Writer) { // ZR: always zero - // R16 (aka REGRT1): ptr to memory to be zeroed - 8 + // R16 (aka REGRT1): ptr to memory to be zeroed // On return, R16 points to the last zeroed dword. fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $-8-0") - for i := 0; i < 128; i++ { - fmt.Fprintln(w, "\tMOVD.W\tZR, 8(R16)") + for i := 0; i < 63; i++ { + fmt.Fprintln(w, "\tSTP.P\t(ZR, ZR), 16(R16)") } + fmt.Fprintln(w, "\tSTP\t(ZR, ZR), (R16)") fmt.Fprintln(w, "\tRET") } diff --git a/src/runtime/mksizeclasses.go b/src/runtime/mksizeclasses.go index 0cb2b33a8c..b146dbcd6c 100644 --- a/src/runtime/mksizeclasses.go +++ b/src/runtime/mksizeclasses.go @@ -24,8 +24,8 @@ // In practice, only one of the wastes comes into play for a // given size (sizes < 512 waste mainly on the round-up, // sizes > 512 waste mainly on the page chopping). -// -// TODO(rsc): Compute max waste for any given size. +// For really small sizes, alignment constraints force the +// overhead higher. package main @@ -242,15 +242,18 @@ nextk: } func printComment(w io.Writer, classes []class) { - fmt.Fprintf(w, "// %-5s %-9s %-10s %-7s %-11s\n", "class", "bytes/obj", "bytes/span", "objects", "waste bytes") + fmt.Fprintf(w, "// %-5s %-9s %-10s %-7s %-10s %-9s\n", "class", "bytes/obj", "bytes/span", "objects", "tail waste", "max waste") + prevSize := 0 for i, c := range classes { if i == 0 { continue } spanSize := c.npages * pageSize objects := spanSize / c.size - waste := spanSize - c.size*(spanSize/c.size) - fmt.Fprintf(w, "// %5d %9d %10d %7d %11d\n", i, c.size, spanSize, objects, waste) + tailWaste := spanSize - c.size*(spanSize/c.size) + maxWaste := float64((c.size-prevSize-1)*objects+tailWaste) / float64(spanSize) + prevSize = c.size + fmt.Fprintf(w, "// %5d %9d %10d %7d %10d %8.2f%%\n", i, c.size, spanSize, objects, tailWaste, 100*maxWaste) } fmt.Fprintf(w, "\n") } diff --git a/src/runtime/mmap.go b/src/runtime/mmap.go index 62f3780db8..e1333c62fe 100644 --- a/src/runtime/mmap.go +++ b/src/runtime/mmap.go @@ -16,7 +16,8 @@ import "unsafe" // We only pass the lower 32 bits of file offset to the // assembly routine; the higher bits (if required), should be provided // by the assembly routine as 0. -func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) unsafe.Pointer +// The err result is an OS error code such as ENOMEM. +func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) (p unsafe.Pointer, err int) // munmap calls the munmap system call. It is implemented in assembly. 
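The scavenging hunk above keeps the old physical-page rounding but flattens the control flow with early continues. The rounding itself, extracted into a standalone helper (roundToPhysPages is an invented name; the sizes in main are hypothetical):

package main

import "fmt"

// roundToPhysPages mirrors the scavengelist rounding: madvise can only
// release whole physical pages, so round start up and end down, and skip
// the span if that leaves nothing to release.
func roundToPhysPages(start, end, physPageSize uintptr) (uintptr, uintptr, bool) {
	start = (start + physPageSize - 1) &^ (physPageSize - 1)
	end &^= physPageSize - 1
	if end <= start {
		return 0, 0, false // start and end don't span a whole physical page
	}
	return start, end, true
}

func main() {
	// Hypothetical 8 KB runtime page on a system with 16 KB physical pages.
	s, e, ok := roundToPhysPages(0x10000+8192, 0x10000+8192+8192, 16384)
	fmt.Println(s, e, ok) // a single 8 KB page can't be released: ok=false
}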
func munmap(addr unsafe.Pointer, n uintptr) diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go index 2bd09b6a26..259473c9ce 100644 --- a/src/runtime/mprof.go +++ b/src/runtime/mprof.go @@ -596,7 +596,7 @@ func record(r *MemProfileRecord, b *bucket) { r.AllocObjects = int64(mp.active.allocs) r.FreeObjects = int64(mp.active.frees) if raceenabled { - racewriterangepc(unsafe.Pointer(&r.Stack0[0]), unsafe.Sizeof(r.Stack0), getcallerpc(unsafe.Pointer(&r)), funcPC(MemProfile)) + racewriterangepc(unsafe.Pointer(&r.Stack0[0]), unsafe.Sizeof(r.Stack0), getcallerpc(), funcPC(MemProfile)) } if msanenabled { msanwrite(unsafe.Pointer(&r.Stack0[0]), unsafe.Sizeof(r.Stack0)) @@ -644,7 +644,7 @@ func BlockProfile(p []BlockProfileRecord) (n int, ok bool) { r.Count = bp.count r.Cycles = bp.cycles if raceenabled { - racewriterangepc(unsafe.Pointer(&r.Stack0[0]), unsafe.Sizeof(r.Stack0), getcallerpc(unsafe.Pointer(&p)), funcPC(BlockProfile)) + racewriterangepc(unsafe.Pointer(&r.Stack0[0]), unsafe.Sizeof(r.Stack0), getcallerpc(), funcPC(BlockProfile)) } if msanenabled { msanwrite(unsafe.Pointer(&r.Stack0[0]), unsafe.Sizeof(r.Stack0)) @@ -741,7 +741,7 @@ func GoroutineProfile(p []StackRecord) (n int, ok bool) { // Save current goroutine. sp := getcallersp(unsafe.Pointer(&p)) - pc := getcallerpc(unsafe.Pointer(&p)) + pc := getcallerpc() systemstack(func() { saveg(pc, sp, gp, &r[0]) }) @@ -786,7 +786,7 @@ func Stack(buf []byte, all bool) int { if len(buf) > 0 { gp := getg() sp := getcallersp(unsafe.Pointer(&buf)) - pc := getcallerpc(unsafe.Pointer(&buf)) + pc := getcallerpc() systemstack(func() { g0 := getg() // Force traceback=1 to override GOTRACEBACK setting, @@ -826,7 +826,7 @@ func tracealloc(p unsafe.Pointer, size uintptr, typ *_type) { } if gp.m.curg == nil || gp == gp.m.curg { goroutineheader(gp) - pc := getcallerpc(unsafe.Pointer(&p)) + pc := getcallerpc() sp := getcallersp(unsafe.Pointer(&p)) systemstack(func() { traceback(pc, sp, 0, gp) @@ -846,7 +846,7 @@ func tracefree(p unsafe.Pointer, size uintptr) { gp.m.traceback = 2 print("tracefree(", p, ", ", hex(size), ")\n") goroutineheader(gp) - pc := getcallerpc(unsafe.Pointer(&p)) + pc := getcallerpc() sp := getcallersp(unsafe.Pointer(&p)) systemstack(func() { traceback(pc, sp, 0, gp) diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go index 1cb44a15dd..53caefc0fe 100644 --- a/src/runtime/mstats.go +++ b/src/runtime/mstats.go @@ -589,12 +589,13 @@ func updatememstats() { memstats.heap_objects = memstats.nmalloc - memstats.nfree } +// cachestats flushes all mcache stats. +// +// The world must be stopped. +// //go:nowritebarrier func cachestats() { - for _, p := range &allp { - if p == nil { - break - } + for _, p := range allp { c := p.mcache if c == nil { continue @@ -610,9 +611,6 @@ func cachestats() { //go:nowritebarrier func flushmcache(i int) { p := allp[i] - if p == nil { - return - } c := p.mcache if c == nil { return @@ -666,7 +664,7 @@ func purgecachedstats(c *mcache) { // overflow errors. //go:nosplit func mSysStatInc(sysStat *uint64, n uintptr) { - if sys.BigEndian != 0 { + if sys.BigEndian { atomic.Xadd64(sysStat, int64(n)) return } @@ -680,7 +678,7 @@ func mSysStatInc(sysStat *uint64, n uintptr) { // mSysStatInc apply. 
//go:nosplit func mSysStatDec(sysStat *uint64, n uintptr) { - if sys.BigEndian != 0 { + if sys.BigEndian { atomic.Xadd64(sysStat, -int64(n)) return } diff --git a/src/runtime/mwbbuf.go b/src/runtime/mwbbuf.go new file mode 100644 index 0000000000..2c06996210 --- /dev/null +++ b/src/runtime/mwbbuf.go @@ -0,0 +1,248 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This implements the write barrier buffer. The write barrier itself +// is gcWriteBarrier and is implemented in assembly. +// +// The write barrier has a fast path and a slow path. The fast path +// simply enqueues to a per-P write barrier buffer. It's written in +// assembly and doesn't clobber any general purpose registers, so it +// doesn't have the usual overheads of a Go call. +// +// When the buffer fills up, the write barrier invokes the slow path +// (wbBufFlush) to flush the buffer to the GC work queues. In this +// path, since the compiler didn't spill registers, we spill *all* +// registers and disallow any GC safe points that could observe the +// stack frame (since we don't know the types of the spilled +// registers). + +package runtime + +import ( + "runtime/internal/sys" + "unsafe" +) + +// testSmallBuf forces a small write barrier buffer to stress write +// barrier flushing. +const testSmallBuf = false + +// wbBuf is a per-P buffer of pointers queued by the write barrier. +// This buffer is flushed to the GC workbufs when it fills up and on +// various GC transitions. +// +// This is closely related to a "sequential store buffer" (SSB), +// except that SSBs are usually used for maintaining remembered sets, +// while this is used for marking. +type wbBuf struct { + // next points to the next slot in buf. It must not be a + // pointer type because it can point past the end of buf and + // must be updated without write barriers. + // + // This is a pointer rather than an index to optimize the + // write barrier assembly. + next uintptr + + // end points to just past the end of buf. It must not be a + // pointer type because it points past the end of buf and must + // be updated without write barriers. + end uintptr + + // buf stores a series of pointers to execute write barriers + // on. This must be a multiple of wbBufEntryPointers because + // the write barrier only checks for overflow once per entry. + buf [wbBufEntryPointers * wbBufEntries]uintptr +} + +const ( + // wbBufEntries is the number of write barriers between + // flushes of the write barrier buffer. + // + // This trades latency for throughput amortization. Higher + // values amortize flushing overhead more, but increase the + // latency of flushing. Higher values also increase the cache + // footprint of the buffer. + // + // TODO: What is the latency cost of this? Tune this value. + wbBufEntries = 256 + + // wbBufEntryPointers is the number of pointers added to the + // buffer by each write barrier. + wbBufEntryPointers = 2 +) + +// reset empties b by resetting its next and end pointers. +func (b *wbBuf) reset() { + start := uintptr(unsafe.Pointer(&b.buf[0])) + b.next = start + if gcBlackenPromptly || writeBarrier.cgo { + // Effectively disable the buffer by forcing a flush + // on every barrier. + b.end = uintptr(unsafe.Pointer(&b.buf[wbBufEntryPointers])) + } else if testSmallBuf { + // For testing, allow two barriers in the buffer. If + // we only did one, then barriers of non-heap pointers + // would be no-ops. 
This lets us combine a buffered + // barrier with a flush at a later time. + b.end = uintptr(unsafe.Pointer(&b.buf[2*wbBufEntryPointers])) + } else { + b.end = start + uintptr(len(b.buf))*unsafe.Sizeof(b.buf[0]) + } + + if (b.end-b.next)%(wbBufEntryPointers*unsafe.Sizeof(b.buf[0])) != 0 { + throw("bad write barrier buffer bounds") + } +} + +// putFast adds old and new to the write barrier buffer and returns +// false if a flush is necessary. Callers should use this as: +// +// buf := &getg().m.p.ptr().wbBuf +// if !buf.putFast(old, new) { +// wbBufFlush(...) +// } +// +// The arguments to wbBufFlush depend on whether the caller is doing +// its own cgo pointer checks. If it is, then this can be +// wbBufFlush(nil, 0). Otherwise, it must pass the slot address and +// new. +// +// Since buf is a per-P resource, the caller must ensure there are no +// preemption points while buf is in use. +// +// It must be nowritebarrierrec to because write barriers here would +// corrupt the write barrier buffer. It (and everything it calls, if +// it called anything) has to be nosplit to avoid scheduling on to a +// different P and a different buffer. +// +//go:nowritebarrierrec +//go:nosplit +func (b *wbBuf) putFast(old, new uintptr) bool { + p := (*[2]uintptr)(unsafe.Pointer(b.next)) + p[0] = old + p[1] = new + b.next += 2 * sys.PtrSize + return b.next != b.end +} + +// wbBufFlush flushes the current P's write barrier buffer to the GC +// workbufs. It is passed the slot and value of the write barrier that +// caused the flush so that it can implement cgocheck. +// +// This must not have write barriers because it is part of the write +// barrier implementation. +// +// This and everything it calls must be nosplit because 1) the stack +// contains untyped slots from gcWriteBarrier and 2) there must not be +// a GC safe point between the write barrier test in the caller and +// flushing the buffer. +// +// TODO: A "go:nosplitrec" annotation would be perfect for this. +// +//go:nowritebarrierrec +//go:nosplit +func wbBufFlush(dst *uintptr, src uintptr) { + if getg().m.dying > 0 { + // We're going down. Not much point in write barriers + // and this way we can allow write barriers in the + // panic path. + return + } + + if writeBarrier.cgo && dst != nil { + // This must be called from the stack that did the + // write. It's nosplit all the way down. + cgoCheckWriteBarrier(dst, src) + if !writeBarrier.needed { + // We were only called for cgocheck. + b := &getg().m.p.ptr().wbBuf + b.next = uintptr(unsafe.Pointer(&b.buf[0])) + return + } + } + + // Switch to the system stack so we don't have to worry about + // the untyped stack slots or safe points. + systemstack(func() { + wbBufFlush1(getg().m.p.ptr()) + }) +} + +// wbBufFlush1 flushes p's write barrier buffer to the GC work queue. +// +// This must not have write barriers because it is part of the write +// barrier implementation, so this may lead to infinite loops or +// buffer corruption. +// +// This must be non-preemptible because it uses the P's workbuf. +// +//go:nowritebarrierrec +//go:systemstack +func wbBufFlush1(_p_ *p) { + // Get the buffered pointers. + start := uintptr(unsafe.Pointer(&_p_.wbBuf.buf[0])) + n := (_p_.wbBuf.next - start) / unsafe.Sizeof(_p_.wbBuf.buf[0]) + ptrs := _p_.wbBuf.buf[:n] + + // Reset the buffer. + _p_.wbBuf.reset() + + if useCheckmark { + // Slow path for checkmark mode. + for _, ptr := range ptrs { + shade(ptr) + } + return + } + + // Mark all of the pointers in the buffer and record only the + // pointers we greyed. 
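The putFast contract documented above (enqueue into the per-P buffer, flush when it reports false) can be modelled without unsafe pointer arithmetic. The index-based sketch below keeps the same fill/flush behaviour; the real buffer tracks raw next/end addresses instead of an index so the assembly fast path in gcWriteBarrier stays a few instructions.

package main

import "fmt"

const (
	wbBufEntryPointers = 2
	wbBufEntries       = 4 // the real buffer uses 256
)

// toyWBBuf is an index-based stand-in for the per-P wbBuf.
type toyWBBuf struct {
	next int
	buf  [wbBufEntryPointers * wbBufEntries]uintptr
}

// putFast queues an (old, new) pointer pair and reports false when the
// buffer has just filled and the caller must flush (wbBufFlush in the
// runtime) before issuing another barrier.
func (b *toyWBBuf) putFast(old, new uintptr) bool {
	b.buf[b.next] = old
	b.buf[b.next+1] = new
	b.next += wbBufEntryPointers
	return b.next != len(b.buf)
}

func (b *toyWBBuf) flush() {
	fmt.Println("flushing", b.next/wbBufEntryPointers, "buffered barriers to the GC work queue")
	b.next = 0
}

func main() {
	var b toyWBBuf
	for i := 0; i < 10; i++ {
		if !b.putFast(uintptr(i), uintptr(100+i)) {
			b.flush()
		}
	}
}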
We use the buffer itself to temporarily + // record greyed pointers. + // + // TODO: Should scanobject/scanblock just stuff pointers into + // the wbBuf? Then this would become the sole greying path. + gcw := &_p_.gcw + pos := 0 + arenaStart := mheap_.arena_start + for _, ptr := range ptrs { + if ptr < arenaStart { + // nil pointers are very common, especially + // for the "old" values. Filter out these and + // other "obvious" non-heap pointers ASAP. + // + // TODO: Should we filter out nils in the fast + // path to reduce the rate of flushes? + continue + } + // TODO: This doesn't use hbits, so calling + // heapBitsForObject seems a little silly. We could + // easily separate this out since heapBitsForObject + // just calls heapBitsForAddr(obj) to get hbits. + obj, _, span, objIndex := heapBitsForObject(ptr, 0, 0) + if obj == 0 { + continue + } + // TODO: Consider making two passes where the first + // just prefetches the mark bits. + mbits := span.markBitsForIndex(objIndex) + if mbits.isMarked() { + continue + } + mbits.setMarked() + if span.spanclass.noscan() { + gcw.bytesMarked += uint64(span.elemsize) + continue + } + ptrs[pos] = obj + pos++ + } + + // Enqueue the greyed objects. + gcw.putBatch(ptrs[:pos]) + if gcphase == _GCmarktermination || gcBlackenPromptly { + // Ps aren't allowed to cache work during mark + // termination. + gcw.dispose() + } +} diff --git a/src/runtime/netpoll_kqueue.go b/src/runtime/netpoll_kqueue.go index 71de98bcd6..4d5d1a4ea8 100644 --- a/src/runtime/netpoll_kqueue.go +++ b/src/runtime/netpoll_kqueue.go @@ -88,10 +88,23 @@ retry: for i := 0; i < int(n); i++ { ev := &events[i] var mode int32 - if ev.filter == _EVFILT_READ { + switch ev.filter { + case _EVFILT_READ: mode += 'r' - } - if ev.filter == _EVFILT_WRITE { + + // On some systems when the read end of a pipe + // is closed the write end will not get a + // _EVFILT_WRITE event, but will get a + // _EVFILT_READ event with EV_EOF set. + // Note that setting 'w' here just means that we + // will wake up a goroutine waiting to write; + // that goroutine will try the write again, + // and the appropriate thing will happen based + // on what that write returns (success, EPIPE, EAGAIN). + if ev.flags&_EV_EOF != 0 { + mode += 'w' + } + case _EVFILT_WRITE: mode += 'w' } if mode != 0 { diff --git a/src/runtime/netpoll_windows.go b/src/runtime/netpoll_windows.go index 79dafb0279..134071f5e3 100644 --- a/src/runtime/netpoll_windows.go +++ b/src/runtime/netpoll_windows.go @@ -47,7 +47,7 @@ func netpolldescriptor() uintptr { func netpollopen(fd uintptr, pd *pollDesc) int32 { if stdcall4(_CreateIoCompletionPort, fd, iocphandle, 0, 0) == 0 { - return -int32(getlasterror()) + return int32(getlasterror()) } return 0 } diff --git a/src/runtime/os3_plan9.go b/src/runtime/os3_plan9.go index 5d4b5a6698..3b65a2c9ba 100644 --- a/src/runtime/os3_plan9.go +++ b/src/runtime/os3_plan9.go @@ -153,3 +153,6 @@ func setThreadCPUProfiler(hz int32) { // TODO: Enable profiling interrupts. getg().m.profilehz = hz } + +// gsignalStack is unused on Plan 9. +type gsignalStack struct{} diff --git a/src/runtime/os3_solaris.go b/src/runtime/os3_solaris.go index 067fb3bb0a..c53f6132ee 100644 --- a/src/runtime/os3_solaris.go +++ b/src/runtime/os3_solaris.go @@ -181,6 +181,12 @@ func newosproc(mp *m, _ unsafe.Pointer) { } } +func exitThread(wait *uint32) { + // We should never reach exitThread on Solaris because we let + // libc clean up threads. 
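wbBufFlush1 above filters the buffered pointers in place and reuses the surviving prefix for a single putBatch. A minimal sketch of that compaction step (compactGrey and needsGrey are invented names; the real code consults heapBitsForObject and the span mark bits instead of a callback):

package main

import "fmt"

// compactGrey walks the buffered pointers, drops the ones that don't
// need greying, and reuses the same slice to collect the survivors for
// one batched enqueue.
func compactGrey(ptrs []uintptr, needsGrey func(uintptr) bool) []uintptr {
	pos := 0
	for _, ptr := range ptrs {
		if ptr == 0 || !needsGrey(ptr) {
			continue // nil and already-marked pointers are skipped
		}
		ptrs[pos] = ptr
		pos++
	}
	return ptrs[:pos]
}

func main() {
	marked := map[uintptr]bool{0x2000: true}
	ptrs := []uintptr{0, 0x1000, 0x2000, 0x3000}
	grey := compactGrey(ptrs, func(p uintptr) bool { return !marked[p] })
	fmt.Println(grey) // [4096 12288]
}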
+ throw("exitThread") +} + var urandom_dev = []byte("/dev/urandom\x00") //go:nosplit @@ -396,12 +402,12 @@ func madvise(addr unsafe.Pointer, n uintptr, flags int32) { } //go:nosplit -func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) unsafe.Pointer { +func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) (unsafe.Pointer, int) { p, err := doMmap(uintptr(addr), n, uintptr(prot), uintptr(flags), uintptr(fd), uintptr(off)) if p == ^uintptr(0) { - return unsafe.Pointer(err) + return nil, int(err) } - return unsafe.Pointer(p) + return unsafe.Pointer(p), 0 } //go:nosplit diff --git a/src/runtime/os_darwin.go b/src/runtime/os_darwin.go index 15281674ae..4ab5a76373 100644 --- a/src/runtime/os_darwin.go +++ b/src/runtime/os_darwin.go @@ -135,7 +135,7 @@ func newosproc(mp *m, stk unsafe.Pointer) { // not safe to use after initialization as it does not pass an M as fnarg. // //go:nosplit -func newosproc0(stacksize uintptr, fn unsafe.Pointer, fnarg uintptr) { +func newosproc0(stacksize uintptr, fn uintptr) { stack := sysAlloc(stacksize, &memstats.stacks_sys) if stack == nil { write(2, unsafe.Pointer(&failallocatestack[0]), int32(len(failallocatestack))) @@ -145,7 +145,7 @@ func newosproc0(stacksize uintptr, fn unsafe.Pointer, fnarg uintptr) { var oset sigset sigprocmask(_SIG_SETMASK, &sigset_all, &oset) - errno := bsdthread_create(stk, fn, fnarg) + errno := bsdthread_create(stk, nil, fn) sigprocmask(_SIG_SETMASK, &oset, nil) if errno < 0 { @@ -188,7 +188,11 @@ func minit() { // Called from dropm to undo the effect of an minit. //go:nosplit func unminit() { - unminitSignals() + // The alternate signal stack is buggy on arm and arm64. + // See minit. + if GOARCH != "arm" && GOARCH != "arm64" { + unminitSignals() + } } // Mach IPC, to get at semaphores diff --git a/src/runtime/os_darwin_arm64.go b/src/runtime/os_darwin_arm64.go index 01285afa19..8de132d8e2 100644 --- a/src/runtime/os_darwin_arm64.go +++ b/src/runtime/os_darwin_arm64.go @@ -4,8 +4,6 @@ package runtime -var supportCRC32 = false - //go:nosplit func cputicks() int64 { // Currently cputicks() is used in blocking profiler and to seed runtime·fastrand(). diff --git a/src/runtime/os_freebsd.go b/src/runtime/os_freebsd.go index 7c989de109..31708e2454 100644 --- a/src/runtime/os_freebsd.go +++ b/src/runtime/os_freebsd.go @@ -69,15 +69,19 @@ func sysctlnametomib(name []byte, mib *[_CTL_MAXNAME]uint32) uint32 { } const ( - _CPU_SETSIZE_MAX = 32 // Limited by _MaxGomaxprocs(256) in runtime2.go. _CPU_CURRENT_PID = -1 // Current process ID. ) //go:noescape func cpuset_getaffinity(level int, which int, id int64, size int, mask *byte) int32 +//go:systemstack func getncpu() int32 { - var mask [_CPU_SETSIZE_MAX]byte + // Use a large buffer for the CPU mask. We're on the system + // stack, so this is fine, and we can't allocate memory for a + // dynamically-sized buffer at this point. 
+ const maxCPUs = 64 * 1024 + var mask [maxCPUs / 8]byte var mib [_CTL_MAXNAME]uint32 // According to FreeBSD's /usr/src/sys/kern/kern_cpuset.c, @@ -99,21 +103,20 @@ func getncpu() int32 { return 1 } - size := maxcpus / _NBBY - ptrsize := uint32(unsafe.Sizeof(uintptr(0))) - if size < ptrsize { - size = ptrsize + maskSize := int(maxcpus+7) / 8 + if maskSize < sys.PtrSize { + maskSize = sys.PtrSize } - if size > _CPU_SETSIZE_MAX { - return 1 + if maskSize > len(mask) { + maskSize = len(mask) } if cpuset_getaffinity(_CPU_LEVEL_WHICH, _CPU_WHICH_PID, _CPU_CURRENT_PID, - int(size), (*byte)(unsafe.Pointer(&mask[0]))) != 0 { + maskSize, (*byte)(unsafe.Pointer(&mask[0]))) != 0 { return 1 } n := int32(0) - for _, v := range mask[:size] { + for _, v := range mask[:maskSize] { for v != 0 { n += int32(v & 1) v >>= 1 diff --git a/src/runtime/os_linux.go b/src/runtime/os_linux.go index 78899737b6..98e7f52b9e 100644 --- a/src/runtime/os_linux.go +++ b/src/runtime/os_linux.go @@ -89,13 +89,13 @@ func getproccount() int32 { // buffers, but we don't have a dynamic memory allocator at the // moment, so that's a bit tricky and seems like overkill. const maxCPUs = 64 * 1024 - var buf [maxCPUs / (sys.PtrSize * 8)]uintptr + var buf [maxCPUs / 8]byte r := sched_getaffinity(0, unsafe.Sizeof(buf), &buf[0]) if r < 0 { return 1 } n := int32(0) - for _, v := range buf[:r/sys.PtrSize] { + for _, v := range buf[:r] { for v != 0 { n += int32(v & 1) v >>= 1 @@ -193,6 +193,8 @@ const ( var procAuxv = []byte("/proc/self/auxv\x00") +func mincore(addr unsafe.Pointer, n uintptr, dst *byte) int32 + func sysargs(argc int32, argv **byte) { n := argc + 1 @@ -206,45 +208,46 @@ func sysargs(argc int32, argv **byte) { // now argv+n is auxv auxv := (*[1 << 28]uintptr)(add(unsafe.Pointer(argv), uintptr(n)*sys.PtrSize)) - if sysauxv(auxv[:]) == 0 { - // In some situations we don't get a loader-provided - // auxv, such as when loaded as a library on Android. - // Fall back to /proc/self/auxv. - fd := open(&procAuxv[0], 0 /* O_RDONLY */, 0) - if fd < 0 { - // On Android, /proc/self/auxv might be unreadable (issue 9229), so we fallback to - // try using mincore to detect the physical page size. - // mincore should return EINVAL when address is not a multiple of system page size. - const size = 256 << 10 // size of memory region to allocate - p := mmap(nil, size, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) - if uintptr(p) < 4096 { - return - } - var n uintptr - for n = 4 << 10; n < size; n <<= 1 { - err := mincore(unsafe.Pointer(uintptr(p)+n), 1, &addrspace_vec[0]) - if err == 0 { - physPageSize = n - break - } - } - if physPageSize == 0 { - physPageSize = size - } - munmap(p, size) + if sysauxv(auxv[:]) != 0 { + return + } + // In some situations we don't get a loader-provided + // auxv, such as when loaded as a library on Android. + // Fall back to /proc/self/auxv. + fd := open(&procAuxv[0], 0 /* O_RDONLY */, 0) + if fd < 0 { + // On Android, /proc/self/auxv might be unreadable (issue 9229), so we fallback to + // try using mincore to detect the physical page size. + // mincore should return EINVAL when address is not a multiple of system page size. 
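Both the Linux getproccount change and the FreeBSD getncpu change above end with the same popcount over a byte mask filled in by the kernel. In isolation (countCPUs is an invented name; the mask in main is hypothetical):

package main

import "fmt"

// countCPUs counts the set bits in an affinity mask, one bit per
// allowed CPU, exactly as the loops in getproccount/getncpu do.
func countCPUs(mask []byte) int32 {
	n := int32(0)
	for _, v := range mask {
		for v != 0 {
			n += int32(v & 1)
			v >>= 1
		}
	}
	return n
}

func main() {
	// Hypothetical mask: CPUs 0-5 and 8 allowed.
	fmt.Println(countCPUs([]byte{0x3f, 0x01})) // 7
}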
+ const size = 256 << 10 // size of memory region to allocate + p, err := mmap(nil, size, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) + if err != 0 { return } - var buf [128]uintptr - n := read(fd, noescape(unsafe.Pointer(&buf[0])), int32(unsafe.Sizeof(buf))) - closefd(fd) - if n < 0 { - return + var n uintptr + for n = 4 << 10; n < size; n <<= 1 { + err := mincore(unsafe.Pointer(uintptr(p)+n), 1, &addrspace_vec[0]) + if err == 0 { + physPageSize = n + break + } + } + if physPageSize == 0 { + physPageSize = size } - // Make sure buf is terminated, even if we didn't read - // the whole file. - buf[len(buf)-2] = _AT_NULL - sysauxv(buf[:]) + munmap(p, size) + return + } + var buf [128]uintptr + n = read(fd, noescape(unsafe.Pointer(&buf[0])), int32(unsafe.Sizeof(buf))) + closefd(fd) + if n < 0 { + return } + // Make sure buf is terminated, even if we didn't read + // the whole file. + buf[len(buf)-2] = _AT_NULL + sysauxv(buf[:]) } func sysauxv(auxv []uintptr) int { @@ -382,7 +385,7 @@ func raise(sig uint32) func raiseproc(sig uint32) //go:noescape -func sched_getaffinity(pid, len uintptr, buf *uintptr) int32 +func sched_getaffinity(pid, len uintptr, buf *byte) int32 func osyield() //go:nosplit diff --git a/src/runtime/os_linux_arm64.go b/src/runtime/os_linux_arm64.go index 986a34135e..96827e7c9f 100644 --- a/src/runtime/os_linux_arm64.go +++ b/src/runtime/os_linux_arm64.go @@ -2,14 +2,22 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +// +build arm64 + package runtime -const ( - _ARM64_FEATURE_HAS_CRC32 = 0x80 -) +// For go:linkname +import _ "unsafe" var randomNumber uint32 -var supportCRC32 bool + +// arm64 doesn't have a 'cpuid' instruction equivalent and relies on +// HWCAP/HWCAP2 bits for hardware capabilities. + +//go:linkname cpu_hwcap internal/cpu.arm64_hwcap +//go:linkname cpu_hwcap2 internal/cpu.arm64_hwcap2 +var cpu_hwcap uint +var cpu_hwcap2 uint func archauxv(tag, val uintptr) { switch tag { @@ -20,7 +28,9 @@ func archauxv(tag, val uintptr) { randomNumber = uint32(startupRandomData[4]) | uint32(startupRandomData[5])<<8 | uint32(startupRandomData[6])<<16 | uint32(startupRandomData[7])<<24 case _AT_HWCAP: - supportCRC32 = val&_ARM64_FEATURE_HAS_CRC32 != 0 + cpu_hwcap = uint(val) + case _AT_HWCAP2: + cpu_hwcap2 = uint(val) } } diff --git a/src/runtime/os_linux_noauxv.go b/src/runtime/os_linux_noauxv.go index 5e9f03120d..db6e5a0530 100644 --- a/src/runtime/os_linux_noauxv.go +++ b/src/runtime/os_linux_noauxv.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !amd64,!arm,!arm64,!mips,!mipsle,!mips64,!mips64le,!s390x,!ppc64,!ppc64le +// +build !386,!amd64,!arm,!arm64,!mips,!mipsle,!mips64,!mips64le,!s390x,!ppc64,!ppc64le package runtime diff --git a/src/runtime/os_linux_ppc64x.go b/src/runtime/os_linux_ppc64x.go index b0da98b0bd..e37bfc453a 100644 --- a/src/runtime/os_linux_ppc64x.go +++ b/src/runtime/os_linux_ppc64x.go @@ -6,55 +6,22 @@ package runtime -import ( - "runtime/internal/sys" -) +// For go:linkname +import _ "unsafe" -const ( - // ISA level - // Go currently requires POWER5 as a minimum for ppc64, so we need - // to check for ISA 2.03 and beyond. 
- _PPC_FEATURE_POWER5_PLUS = 0x00020000 // ISA 2.03 (POWER5+) - _PPC_FEATURE_ARCH_2_05 = 0x00001000 // ISA 2.05 (POWER6) - _PPC_FEATURE_POWER6_EXT = 0x00000200 // mffgpr/mftgpr extension (POWER6x) - _PPC_FEATURE_ARCH_2_06 = 0x00000100 // ISA 2.06 (POWER7) - _PPC_FEATURE2_ARCH_2_07 = 0x80000000 // ISA 2.07 (POWER8) +// ppc64x doesn't have a 'cpuid' instruction equivalent and relies on +// HWCAP/HWCAP2 bits for hardware capabilities. - // Standalone capabilities - _PPC_FEATURE_HAS_ALTIVEC = 0x10000000 // SIMD/Vector unit - _PPC_FEATURE_HAS_VSX = 0x00000080 // Vector scalar unit -) - -type facilities struct { - _ [sys.CacheLineSize]byte - isPOWER5x bool // ISA 2.03 - isPOWER6 bool // ISA 2.05 - isPOWER6x bool // ISA 2.05 + mffgpr/mftgpr extension - isPOWER7 bool // ISA 2.06 - isPOWER8 bool // ISA 2.07 - hasVMX bool // Vector unit - hasVSX bool // Vector scalar unit - _ [sys.CacheLineSize]byte -} - -// cpu can be tested at runtime in go assembler code to check for -// a certain ISA level or hardware capability, for example: -// ·cpu+facilities_hasVSX(SB) for checking the availability of VSX -// or -// ·cpu+facilities_isPOWER7(SB) for checking if the processor implements -// ISA 2.06 instructions. -var cpu facilities +//go:linkname cpu_hwcap internal/cpu.ppc64x_hwcap +//go:linkname cpu_hwcap2 internal/cpu.ppc64x_hwcap2 +var cpu_hwcap uint +var cpu_hwcap2 uint func archauxv(tag, val uintptr) { switch tag { case _AT_HWCAP: - cpu.isPOWER5x = val&_PPC_FEATURE_POWER5_PLUS != 0 - cpu.isPOWER6 = val&_PPC_FEATURE_ARCH_2_05 != 0 - cpu.isPOWER6x = val&_PPC_FEATURE_POWER6_EXT != 0 - cpu.isPOWER7 = val&_PPC_FEATURE_ARCH_2_06 != 0 - cpu.hasVMX = val&_PPC_FEATURE_HAS_ALTIVEC != 0 - cpu.hasVSX = val&_PPC_FEATURE_HAS_VSX != 0 + cpu_hwcap = uint(val) case _AT_HWCAP2: - cpu.isPOWER8 = val&_PPC_FEATURE2_ARCH_2_07 != 0 + cpu_hwcap2 = uint(val) } } diff --git a/src/runtime/os_nacl.go b/src/runtime/os_nacl.go index 18e6ce6232..6830da4c4f 100644 --- a/src/runtime/os_nacl.go +++ b/src/runtime/os_nacl.go @@ -33,7 +33,7 @@ func nacl_thread_create(fn uintptr, stk, tls, xx unsafe.Pointer) int32 //go:noescape func nacl_nanosleep(ts, extra *timespec) int32 func nanotime() int64 -func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) unsafe.Pointer +func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) (p unsafe.Pointer, err int) func exit(code int32) func osyield() @@ -168,6 +168,9 @@ func newosproc(mp *m, stk unsafe.Pointer) { } } +//go:noescape +func exitThread(wait *uint32) + //go:nosplit func semacreate(mp *m) { if mp.waitsema != 0 { @@ -285,6 +288,9 @@ func sigenable(uint32) {} func sigignore(uint32) {} func closeonexec(int32) {} +// gsignalStack is unused on nacl. 
+type gsignalStack struct{} + var writelock uint32 // test-and-set spin lock for write /* diff --git a/src/runtime/os_netbsd.go b/src/runtime/os_netbsd.go index c26c3c9550..3778969318 100644 --- a/src/runtime/os_netbsd.go +++ b/src/runtime/os_netbsd.go @@ -21,6 +21,9 @@ const ( _UC_SIGMASK = 0x01 _UC_CPU = 0x04 + // From <sys/lwp.h> + _LWP_DETACHED = 0x00000040 + _EAGAIN = 35 ) @@ -55,7 +58,7 @@ func getcontext(ctxt unsafe.Pointer) func lwp_create(ctxt unsafe.Pointer, flags uintptr, lwpid unsafe.Pointer) int32 //go:noescape -func lwp_park(abstime *timespec, unpark int32, hint, unparkhint unsafe.Pointer) int32 +func lwp_park(clockid, flags int32, ts *timespec, unpark int32, hint, unparkhint unsafe.Pointer) int32 //go:noescape func lwp_unpark(lwp int32, hint unsafe.Pointer) int32 @@ -73,6 +76,9 @@ const ( _CLOCK_VIRTUAL = 1 _CLOCK_PROF = 2 _CLOCK_MONOTONIC = 3 + + _TIMER_RELTIME = 0 + _TIMER_ABSTIME = 1 ) var sigset_all = sigset{[4]uint32{^uint32(0), ^uint32(0), ^uint32(0), ^uint32(0)}} @@ -116,10 +122,9 @@ func semasleep(ns int64) int32 { // Compute sleep deadline. var tsp *timespec + var ts timespec if ns >= 0 { - var ts timespec var nsec int32 - ns += nanotime() ts.set_sec(timediv(ns, 1000000000, &nsec)) ts.set_nsec(nsec) tsp = &ts @@ -135,9 +140,18 @@ func semasleep(ns int64) int32 { } // Sleep until unparked by semawakeup or timeout. - ret := lwp_park(tsp, 0, unsafe.Pointer(&_g_.m.waitsemacount), nil) + ret := lwp_park(_CLOCK_MONOTONIC, _TIMER_RELTIME, tsp, 0, unsafe.Pointer(&_g_.m.waitsemacount), nil) if ret == _ETIMEDOUT { return -1 + } else if ret == _EINTR && ns >= 0 { + // Avoid sleeping forever if we keep getting + // interrupted (for example by the profiling + // timer). It would be if tsp upon return had the + // remaining time to sleep, but this is good enough. + var nsec int32 + ns /= 2 + ts.set_sec(timediv(ns, 1000000000, &nsec)) + ts.set_nsec(nsec) } } } @@ -182,7 +196,7 @@ func newosproc(mp *m, stk unsafe.Pointer) { lwp_mcontext_init(&uc.uc_mcontext, stk, mp, mp.g0, funcPC(netbsdMstart)) - ret := lwp_create(unsafe.Pointer(&uc), 0, unsafe.Pointer(&mp.procid)) + ret := lwp_create(unsafe.Pointer(&uc), _LWP_DETACHED, unsafe.Pointer(&mp.procid)) sigprocmask(_SIG_SETMASK, &oset, nil) if ret < 0 { print("runtime: failed to create new OS thread (have ", mcount()-1, " already; errno=", -ret, ")\n") diff --git a/src/runtime/os_plan9.go b/src/runtime/os_plan9.go index 45e881aa41..32fdabb29f 100644 --- a/src/runtime/os_plan9.go +++ b/src/runtime/os_plan9.go @@ -393,7 +393,7 @@ func postnote(pid uint64, msg []byte) int { } //go:nosplit -func exit(e int) { +func exit(e int32) { var status []byte if e == 0 { status = emptystatus @@ -421,6 +421,12 @@ func newosproc(mp *m, stk unsafe.Pointer) { } } +func exitThread(wait *uint32) { + // We should never reach exitThread on Plan 9 because we let + // the OS clean up threads. + throw("exitThread") +} + //go:nosplit func semacreate(mp *m) { } diff --git a/src/runtime/os_windows.go b/src/runtime/os_windows.go index 233cc165aa..025ff50a08 100644 --- a/src/runtime/os_windows.go +++ b/src/runtime/os_windows.go @@ -640,6 +640,9 @@ func newosproc(mp *m, stk unsafe.Pointer) { print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", getlasterror(), ")\n") throw("runtime.newosproc") } + + // Close thandle to avoid leaking the thread object if it exits. + stdcall1(_CloseHandle, thandle) } // Used by the C library build mode. 
On Linux this function would allocate a @@ -651,6 +654,12 @@ func newosproc0(mp *m, stk unsafe.Pointer) { newosproc(mp, stk) } +func exitThread(wait *uint32) { + // We should never reach exitThread on Windows because we let + // the OS clean up threads. + throw("exitThread") +} + // Called to initialize a new m (including the bootstrap m). // Called on the parent thread (main thread in case of bootstrap), can allocate memory. func mpreinit(mp *m) { @@ -701,7 +710,7 @@ func stdcall(fn stdFunction) uintptr { if mp.profilehz != 0 { // leave pc/sp for cpu profiler mp.libcallg.set(gp) - mp.libcallpc = getcallerpc(unsafe.Pointer(&fn)) + mp.libcallpc = getcallerpc() // sp must be the last, because once async cpu profiler finds // all three values to be non-zero, it will use them mp.libcallsp = getcallersp(unsafe.Pointer(&fn)) diff --git a/src/runtime/panic.go b/src/runtime/panic.go index 43bfdd7a1e..6fa99d6493 100644 --- a/src/runtime/panic.go +++ b/src/runtime/panic.go @@ -83,7 +83,7 @@ func deferproc(siz int32, fn *funcval) { // arguments of fn follow fn // Until the copy completes, we can only call nosplit routines. sp := getcallersp(unsafe.Pointer(&siz)) argp := uintptr(unsafe.Pointer(&fn)) + unsafe.Sizeof(fn) - callerpc := getcallerpc(unsafe.Pointer(&siz)) + callerpc := getcallerpc() d := newdefer(siz) if d._panic != nil { @@ -244,36 +244,47 @@ func freedefer(d *_defer) { freedeferfn() } sc := deferclass(uintptr(d.siz)) - if sc < uintptr(len(p{}.deferpool)) { - pp := getg().m.p.ptr() - if len(pp.deferpool[sc]) == cap(pp.deferpool[sc]) { - // Transfer half of local cache to the central cache. - // - // Take this slow path on the system stack so - // we don't grow freedefer's stack. - systemstack(func() { - var first, last *_defer - for len(pp.deferpool[sc]) > cap(pp.deferpool[sc])/2 { - n := len(pp.deferpool[sc]) - d := pp.deferpool[sc][n-1] - pp.deferpool[sc][n-1] = nil - pp.deferpool[sc] = pp.deferpool[sc][:n-1] - if first == nil { - first = d - } else { - last.link = d - } - last = d + if sc >= uintptr(len(p{}.deferpool)) { + return + } + pp := getg().m.p.ptr() + if len(pp.deferpool[sc]) == cap(pp.deferpool[sc]) { + // Transfer half of local cache to the central cache. + // + // Take this slow path on the system stack so + // we don't grow freedefer's stack. + systemstack(func() { + var first, last *_defer + for len(pp.deferpool[sc]) > cap(pp.deferpool[sc])/2 { + n := len(pp.deferpool[sc]) + d := pp.deferpool[sc][n-1] + pp.deferpool[sc][n-1] = nil + pp.deferpool[sc] = pp.deferpool[sc][:n-1] + if first == nil { + first = d + } else { + last.link = d } - lock(&sched.deferlock) - last.link = sched.deferpool[sc] - sched.deferpool[sc] = first - unlock(&sched.deferlock) - }) - } - *d = _defer{} - pp.deferpool[sc] = append(pp.deferpool[sc], d) + last = d + } + lock(&sched.deferlock) + last.link = sched.deferpool[sc] + sched.deferpool[sc] = first + unlock(&sched.deferlock) + }) } + + // These lines used to be simply `*d = _defer{}` but that + // started causing a nosplit stack overflow via typedmemmove. + d.siz = 0 + d.started = false + d.sp = 0 + d.pc = 0 + d.fn = nil + d._panic = nil + d.link = nil + + pp.deferpool[sc] = append(pp.deferpool[sc], d) } // Separate function so that it can split stack. @@ -336,7 +347,7 @@ func deferreturn(arg0 uintptr) { // Goexit terminates the goroutine that calls it. No other goroutine is affected. // Goexit runs all deferred calls before terminating the goroutine. 
Because Goexit -// is not panic, however, any recover calls in those deferred functions will return nil. +// is not a panic, any recover calls in those deferred functions will return nil. // // Calling Goexit from the main goroutine terminates that goroutine // without func main returning. Since func main has not returned, @@ -580,7 +591,7 @@ func startpanic() { //go:nosplit func dopanic(unused int) { - pc := getcallerpc(unsafe.Pointer(&unused)) + pc := getcallerpc() sp := getcallersp(unsafe.Pointer(&unused)) gp := getg() systemstack(func() { @@ -643,6 +654,12 @@ func recovery(gp *g) { gogo(&gp.sched) } +// startpanic_m implements unrecoverable panic. +// +// It can have write barriers because the write barrier explicitly +// ignores writes once dying > 0. +// +//go:yeswritebarrierrec func startpanic_m() { _g_ := getg() if mheap_.cachealloc.size == 0 { // very early @@ -679,7 +696,7 @@ func startpanic_m() { exit(4) fallthrough default: - // Can't even print! Just exit. + // Can't even print! Just exit. exit(5) } } diff --git a/src/runtime/plugin.go b/src/runtime/plugin.go index 682caacb21..5e05be71ec 100644 --- a/src/runtime/plugin.go +++ b/src/runtime/plugin.go @@ -7,22 +7,29 @@ package runtime import "unsafe" //go:linkname plugin_lastmoduleinit plugin.lastmoduleinit -func plugin_lastmoduleinit() (path string, syms map[string]interface{}, mismatchpkg string) { - md := firstmoduledata.next +func plugin_lastmoduleinit() (path string, syms map[string]interface{}, errstr string) { + var md *moduledata + for pmd := firstmoduledata.next; pmd != nil; pmd = pmd.next { + if pmd.bad { + md = nil // we only want the last module + continue + } + md = pmd + } if md == nil { throw("runtime: no plugin module data") } - for md.next != nil { - md = md.next + if md.pluginpath == "" { + throw("runtime: plugin has empty pluginpath") } if md.typemap != nil { - throw("runtime: plugin already initialized") + return "", nil, "plugin already loaded" } for _, pmd := range activeModules() { if pmd.pluginpath == md.pluginpath { - println("plugin: plugin", md.pluginpath, "already loaded") - throw("plugin: plugin already loaded") + md.bad = true + return "", nil, "plugin already loaded" } if inRange(pmd.text, pmd.etext, md.text, md.etext) || @@ -43,7 +50,8 @@ func plugin_lastmoduleinit() (path string, syms map[string]interface{}, mismatch } for _, pkghash := range md.pkghashes { if pkghash.linktimehash != *pkghash.runtimehash { - return "", nil, pkghash.modulename + md.bad = true + return "", nil, "plugin was built with a different version of package " + pkghash.modulename } } @@ -54,13 +62,11 @@ func plugin_lastmoduleinit() (path string, syms map[string]interface{}, mismatch pluginftabverify(md) moduledataverify1(md) - lock(&ifaceLock) + lock(&itabLock) for _, i := range md.itablinks { - if !i.inhash { - additab(i, true, false) - } + itabAdd(i) } - unlock(&ifaceLock) + unlock(&itabLock) // Build a map of symbol names to symbols. Here in the runtime // we fill out the first word of the interface, the type. 
We diff --git a/src/runtime/pprof/pprof.go b/src/runtime/pprof/pprof.go index 21ea25ce36..d3382a5589 100644 --- a/src/runtime/pprof/pprof.go +++ b/src/runtime/pprof/pprof.go @@ -18,7 +18,7 @@ // To add equivalent profiling support to a standalone program, add // code like the following to your main function: // -// var cpuprofile = flag.String("cpuprofile", "", "write cpu profile `file`") +// var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to `file`") // var memprofile = flag.String("memprofile", "", "write memory profile to `file`") // // func main() { @@ -319,7 +319,15 @@ func (p *Profile) WriteTo(w io.Writer, debug int) error { p.mu.Unlock() // Map order is non-deterministic; make output deterministic. - sort.Sort(stackProfile(all)) + sort.Slice(all, func(i, j int) bool { + t, u := all[i], all[j] + for k := 0; k < len(t) && k < len(u); k++ { + if t[k] != u[k] { + return t[k] < u[k] + } + } + return len(t) < len(u) + }) return printCountProfile(w, debug, p.name, stackProfile(all)) } @@ -328,16 +336,6 @@ type stackProfile [][]uintptr func (x stackProfile) Len() int { return len(x) } func (x stackProfile) Stack(i int) []uintptr { return x[i] } -func (x stackProfile) Swap(i, j int) { x[i], x[j] = x[j], x[i] } -func (x stackProfile) Less(i, j int) bool { - t, u := x[i], x[j] - for k := 0; k < len(t) && k < len(u); k++ { - if t[k] != u[k] { - return t[k] < u[k] - } - } - return len(t) < len(u) -} // A countProfile is a set of stack traces to be printed as counts // grouped by stack trace. There are multiple implementations: @@ -348,6 +346,41 @@ type countProfile interface { Stack(i int) []uintptr } +// printCountCycleProfile outputs block profile records (for block or mutex profiles) +// as the pprof-proto format output. Translations from cycle count to time duration +// are done because The proto expects count and time (nanoseconds) instead of count +// and the number of cycles for block, contention profiles. +func printCountCycleProfile(w io.Writer, countName, cycleName string, records []runtime.BlockProfileRecord) error { + // Output profile in protobuf form. + b := newProfileBuilder(w) + b.pbValueType(tagProfile_PeriodType, countName, "count") + b.pb.int64Opt(tagProfile_Period, 1) + b.pbValueType(tagProfile_SampleType, countName, "count") + b.pbValueType(tagProfile_SampleType, cycleName, "nanoseconds") + + cpuGHz := float64(runtime_cyclesPerSecond()) / 1e9 + + values := []int64{0, 0} + var locs []uint64 + for _, r := range records { + values[0] = int64(r.Count) + values[1] = int64(float64(r.Cycles) / cpuGHz) // to nanoseconds + locs = locs[:0] + for _, addr := range r.Stack() { + // For count profiles, all stack addresses are + // return PCs, which is what locForPC expects. + l := b.locForPC(addr) + if l == 0 { // runtime.goexit + continue + } + locs = append(locs, l) + } + b.pbSample(values, locs, nil) + } + b.build() + return nil +} + // printCountProfile prints a countProfile at the specified debug level. // The profile will be in compressed proto format unless debug is nonzero. func printCountProfile(w io.Writer, debug int, name string, p countProfile) error { @@ -476,6 +509,14 @@ func countHeap() int { // writeHeap writes the current runtime heap profile to w. func writeHeap(w io.Writer, debug int) error { + var memStats *runtime.MemStats + if debug != 0 { + // Read mem stats first, so that our other allocations + // do not appear in the statistics. 
+ memStats = new(runtime.MemStats) + runtime.ReadMemStats(memStats) + } + // Find out how many records there are (MemProfile(nil, true)), // allocate that many records, and get the data. // There's a race—more records might be added between @@ -538,8 +579,7 @@ func writeHeap(w io.Writer, debug int) error { // Print memstats information too. // Pprof will ignore, but useful for people - s := new(runtime.MemStats) - runtime.ReadMemStats(s) + s := memStats fmt.Fprintf(w, "\n# runtime.MemStats\n") fmt.Fprintf(w, "# Alloc = %d\n", s.Alloc) fmt.Fprintf(w, "# TotalAlloc = %d\n", s.TotalAlloc) @@ -765,14 +805,14 @@ func writeBlock(w io.Writer, debug int) error { sort.Slice(p, func(i, j int) bool { return p[i].Cycles > p[j].Cycles }) - b := bufio.NewWriter(w) - var tw *tabwriter.Writer - w = b - if debug > 0 { - tw = tabwriter.NewWriter(w, 1, 8, 1, '\t', 0) - w = tw + if debug <= 0 { + return printCountCycleProfile(w, "contentions", "delay", p) } + b := bufio.NewWriter(w) + tw := tabwriter.NewWriter(w, 1, 8, 1, '\t', 0) + w = tw + fmt.Fprintf(w, "--- contention:\n") fmt.Fprintf(w, "cycles/second=%v\n", runtime_cyclesPerSecond()) for i := range p { @@ -809,14 +849,14 @@ func writeMutex(w io.Writer, debug int) error { sort.Slice(p, func(i, j int) bool { return p[i].Cycles > p[j].Cycles }) - b := bufio.NewWriter(w) - var tw *tabwriter.Writer - w = b - if debug > 0 { - tw = tabwriter.NewWriter(w, 1, 8, 1, '\t', 0) - w = tw + if debug <= 0 { + return printCountCycleProfile(w, "contentions", "delay", p) } + b := bufio.NewWriter(w) + tw := tabwriter.NewWriter(w, 1, 8, 1, '\t', 0) + w = tw + fmt.Fprintf(w, "--- mutex:\n") fmt.Fprintf(w, "cycles/second=%v\n", runtime_cyclesPerSecond()) fmt.Fprintf(w, "sampling period=%d\n", runtime.SetMutexProfileFraction(-1)) diff --git a/src/runtime/pprof/pprof_test.go b/src/runtime/pprof/pprof_test.go index 955964c721..96fcfc9703 100644 --- a/src/runtime/pprof/pprof_test.go +++ b/src/runtime/pprof/pprof_test.go @@ -26,16 +26,18 @@ import ( "time" ) -func cpuHogger(f func() int, dur time.Duration) { +func cpuHogger(f func(x int) int, y *int, dur time.Duration) { // We only need to get one 100 Hz clock tick, so we've got // a large safety buffer. // But do at least 500 iterations (which should take about 100ms), // otherwise TestCPUProfileMultithreaded can fail if only one // thread is scheduled during the testing period. t0 := time.Now() + accum := *y for i := 0; i < 500 || time.Since(t0) < dur; i++ { - f() + accum = f(accum) } + *y = accum } var ( @@ -46,8 +48,8 @@ var ( // The actual CPU hogging function. // Must not call other functions nor access heap/globals in the loop, // otherwise under race detector the samples will be in the race runtime. 
-func cpuHog1() int { - foo := salt1 +func cpuHog1(x int) int { + foo := x for i := 0; i < 1e5; i++ { if foo > 0 { foo *= foo @@ -58,8 +60,8 @@ func cpuHog1() int { return foo } -func cpuHog2() int { - foo := salt2 +func cpuHog2(x int) int { + foo := x for i := 0; i < 1e5; i++ { if foo > 0 { foo *= foo @@ -72,7 +74,7 @@ func cpuHog2() int { func TestCPUProfile(t *testing.T) { testCPUProfile(t, []string{"runtime/pprof.cpuHog1"}, func(dur time.Duration) { - cpuHogger(cpuHog1, dur) + cpuHogger(cpuHog1, &salt1, dur) }) } @@ -81,29 +83,29 @@ func TestCPUProfileMultithreaded(t *testing.T) { testCPUProfile(t, []string{"runtime/pprof.cpuHog1", "runtime/pprof.cpuHog2"}, func(dur time.Duration) { c := make(chan int) go func() { - cpuHogger(cpuHog1, dur) + cpuHogger(cpuHog1, &salt1, dur) c <- 1 }() - cpuHogger(cpuHog2, dur) + cpuHogger(cpuHog2, &salt2, dur) <-c }) } func TestCPUProfileInlining(t *testing.T) { testCPUProfile(t, []string{"runtime/pprof.inlinedCallee", "runtime/pprof.inlinedCaller"}, func(dur time.Duration) { - cpuHogger(inlinedCaller, dur) + cpuHogger(inlinedCaller, &salt1, dur) }) } -func inlinedCaller() int { - inlinedCallee() - return 0 +func inlinedCaller(x int) int { + x = inlinedCallee(x) + return x } -func inlinedCallee() { +func inlinedCallee(x int) int { // We could just use cpuHog1, but for loops prevent inlining // right now. :( - foo := salt1 + foo := x i := 0 loop: if foo > 0 { @@ -114,7 +116,7 @@ loop: if i++; i < 1e5 { goto loop } - salt1 = foo + return foo } func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []*profile.Location, map[string][]string)) { @@ -177,9 +179,9 @@ func testCPUProfile(t *testing.T, need []string, f func(dur time.Duration)) { } } - if badOS[runtime.GOOS] { + switch runtime.GOOS { + case "darwin", "dragonfly", "netbsd", "solaris": t.Skipf("ignoring failure on %s; see golang.org/issue/13841", runtime.GOOS) - return } // Ignore the failure if the tests are running in a QEMU-based emulator, // QEMU is not perfect at emulating everything. @@ -187,7 +189,6 @@ func testCPUProfile(t *testing.T, need []string, f func(dur time.Duration)) { // IN_QEMU=1 indicates that the tests are running in QEMU. See issue 9605. if os.Getenv("IN_QEMU") == "1" { t.Skip("ignore the failure in QEMU; see golang.org/issue/9605") - return } t.FailNow() } @@ -394,59 +395,107 @@ func TestMathBigDivide(t *testing.T) { }) } -// Operating systems that are expected to fail the tests. See issue 13841. 
-var badOS = map[string]bool{ - "darwin": true, - "netbsd": true, - "plan9": true, - "dragonfly": true, - "solaris": true, -} - func TestBlockProfile(t *testing.T) { type TestCase struct { name string f func() + stk []string re string } tests := [...]TestCase{ - {"chan recv", blockChanRecv, ` + { + name: "chan recv", + f: blockChanRecv, + stk: []string{ + "runtime.chanrecv1", + "runtime/pprof.blockChanRecv", + "runtime/pprof.TestBlockProfile", + }, + re: ` [0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+ # 0x[0-9a-f]+ runtime\.chanrecv1\+0x[0-9a-f]+ .*/src/runtime/chan.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.blockChanRecv\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ `}, - {"chan send", blockChanSend, ` + { + name: "chan send", + f: blockChanSend, + stk: []string{ + "runtime.chansend1", + "runtime/pprof.blockChanSend", + "runtime/pprof.TestBlockProfile", + }, + re: ` [0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+ # 0x[0-9a-f]+ runtime\.chansend1\+0x[0-9a-f]+ .*/src/runtime/chan.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.blockChanSend\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ `}, - {"chan close", blockChanClose, ` + { + name: "chan close", + f: blockChanClose, + stk: []string{ + "runtime.chanrecv1", + "runtime/pprof.blockChanClose", + "runtime/pprof.TestBlockProfile", + }, + re: ` [0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+ # 0x[0-9a-f]+ runtime\.chanrecv1\+0x[0-9a-f]+ .*/src/runtime/chan.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.blockChanClose\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ `}, - {"select recv async", blockSelectRecvAsync, ` + { + name: "select recv async", + f: blockSelectRecvAsync, + stk: []string{ + "runtime.selectgo", + "runtime/pprof.blockSelectRecvAsync", + "runtime/pprof.TestBlockProfile", + }, + re: ` [0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+ # 0x[0-9a-f]+ runtime\.selectgo\+0x[0-9a-f]+ .*/src/runtime/select.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.blockSelectRecvAsync\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ `}, - {"select send sync", blockSelectSendSync, ` + { + name: "select send sync", + f: blockSelectSendSync, + stk: []string{ + "runtime.selectgo", + "runtime/pprof.blockSelectSendSync", + "runtime/pprof.TestBlockProfile", + }, + re: ` [0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+ # 0x[0-9a-f]+ runtime\.selectgo\+0x[0-9a-f]+ .*/src/runtime/select.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.blockSelectSendSync\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ `}, - {"mutex", blockMutex, ` + { + name: "mutex", + f: blockMutex, + stk: []string{ + "sync.(*Mutex).Lock", + "runtime/pprof.blockMutex", + "runtime/pprof.TestBlockProfile", + }, + re: ` [0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+ # 0x[0-9a-f]+ sync\.\(\*Mutex\)\.Lock\+0x[0-9a-f]+ .*/src/sync/mutex\.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.blockMutex\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ `}, - {"cond", blockCond, ` + { + name: "cond", + f: blockCond, + stk: []string{ + "sync.(*Cond).Wait", + "runtime/pprof.blockCond", + 
"runtime/pprof.TestBlockProfile", + }, + re: ` [0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+ # 0x[0-9a-f]+ sync\.\(\*Cond\)\.Wait\+0x[0-9a-f]+ .*/src/sync/cond\.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.blockCond\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ @@ -454,28 +503,84 @@ func TestBlockProfile(t *testing.T) { `}, } + // Generate block profile runtime.SetBlockProfileRate(1) defer runtime.SetBlockProfileRate(0) for _, test := range tests { test.f() } - var w bytes.Buffer - Lookup("block").WriteTo(&w, 1) - prof := w.String() - if !strings.HasPrefix(prof, "--- contention:\ncycles/second=") { - t.Fatalf("Bad profile header:\n%v", prof) - } + t.Run("debug=1", func(t *testing.T) { + var w bytes.Buffer + Lookup("block").WriteTo(&w, 1) + prof := w.String() - if strings.HasSuffix(prof, "#\t0x0\n\n") { - t.Errorf("Useless 0 suffix:\n%v", prof) + if !strings.HasPrefix(prof, "--- contention:\ncycles/second=") { + t.Fatalf("Bad profile header:\n%v", prof) + } + + if strings.HasSuffix(prof, "#\t0x0\n\n") { + t.Errorf("Useless 0 suffix:\n%v", prof) + } + + for _, test := range tests { + if !regexp.MustCompile(strings.Replace(test.re, "\t", "\t+", -1)).MatchString(prof) { + t.Errorf("Bad %v entry, expect:\n%v\ngot:\n%v", test.name, test.re, prof) + } + } + }) + + t.Run("proto", func(t *testing.T) { + // proto format + var w bytes.Buffer + Lookup("block").WriteTo(&w, 0) + p, err := profile.Parse(&w) + if err != nil { + t.Fatalf("failed to parse profile: %v", err) + } + t.Logf("parsed proto: %s", p) + if err := p.CheckValid(); err != nil { + t.Fatalf("invalid profile: %v", err) + } + + stks := stacks(p) + for _, test := range tests { + if !containsStack(stks, test.stk) { + t.Errorf("No matching stack entry for %v, want %+v", test.name, test.stk) + } + } + }) + +} + +func stacks(p *profile.Profile) (res [][]string) { + for _, s := range p.Sample { + var stk []string + for _, l := range s.Location { + for _, line := range l.Line { + stk = append(stk, line.Function.Name) + } + } + res = append(res, stk) } + return res +} - for _, test := range tests { - if !regexp.MustCompile(strings.Replace(test.re, "\t", "\t+", -1)).MatchString(prof) { - t.Fatalf("Bad %v entry, expect:\n%v\ngot:\n%v", test.name, test.re, prof) +func containsStack(got [][]string, want []string) bool { + for _, stk := range got { + if len(stk) < len(want) { + continue + } + for i, f := range want { + if f != stk[i] { + break + } + if i == len(want)-1 { + return true + } } } + return false } const blockDelay = 10 * time.Millisecond @@ -567,6 +672,8 @@ func blockCond() { } func TestMutexProfile(t *testing.T) { + // Generate mutex profile + old := runtime.SetMutexProfileFraction(1) defer runtime.SetMutexProfileFraction(old) if old != 0 { @@ -575,31 +682,57 @@ func TestMutexProfile(t *testing.T) { blockMutex() - var w bytes.Buffer - Lookup("mutex").WriteTo(&w, 1) - prof := w.String() + t.Run("debug=1", func(t *testing.T) { + var w bytes.Buffer + Lookup("mutex").WriteTo(&w, 1) + prof := w.String() + t.Logf("received profile: %v", prof) - if !strings.HasPrefix(prof, "--- mutex:\ncycles/second=") { - t.Errorf("Bad profile header:\n%v", prof) - } - prof = strings.Trim(prof, "\n") - lines := strings.Split(prof, "\n") - if len(lines) != 6 { - t.Errorf("expected 6 lines, got %d %q\n%s", len(lines), prof, prof) - } - if len(lines) < 6 { - return - } - // checking that the line is like "35258904 1 @ 0x48288d 0x47cd28 0x458931" - r2 := `^\d+ 1 @(?: 0x[[:xdigit:]]+)+` - //r2 := "^[0-9]+ 1 @ 0x[0-9a-f x]+$" - if ok, err := regexp.MatchString(r2, lines[3]); 
err != nil || !ok { - t.Errorf("%q didn't match %q", lines[3], r2) - } - r3 := "^#.*runtime/pprof.blockMutex.*$" - if ok, err := regexp.MatchString(r3, lines[5]); err != nil || !ok { - t.Errorf("%q didn't match %q", lines[5], r3) - } + if !strings.HasPrefix(prof, "--- mutex:\ncycles/second=") { + t.Errorf("Bad profile header:\n%v", prof) + } + prof = strings.Trim(prof, "\n") + lines := strings.Split(prof, "\n") + if len(lines) != 6 { + t.Errorf("expected 6 lines, got %d %q\n%s", len(lines), prof, prof) + } + if len(lines) < 6 { + return + } + // checking that the line is like "35258904 1 @ 0x48288d 0x47cd28 0x458931" + r2 := `^\d+ 1 @(?: 0x[[:xdigit:]]+)+` + //r2 := "^[0-9]+ 1 @ 0x[0-9a-f x]+$" + if ok, err := regexp.MatchString(r2, lines[3]); err != nil || !ok { + t.Errorf("%q didn't match %q", lines[3], r2) + } + r3 := "^#.*runtime/pprof.blockMutex.*$" + if ok, err := regexp.MatchString(r3, lines[5]); err != nil || !ok { + t.Errorf("%q didn't match %q", lines[5], r3) + } + t.Logf(prof) + }) + t.Run("proto", func(t *testing.T) { + // proto format + var w bytes.Buffer + Lookup("mutex").WriteTo(&w, 0) + p, err := profile.Parse(&w) + if err != nil { + t.Fatalf("failed to parse profile: %v", err) + } + t.Logf("parsed proto: %s", p) + if err := p.CheckValid(); err != nil { + t.Fatalf("invalid profile: %v", err) + } + + stks := stacks(p) + for _, want := range [][]string{ + {"sync.(*Mutex).Unlock", "runtime/pprof.blockMutex.func1"}, + } { + if !containsStack(stks, want) { + t.Errorf("No matching stack entry for %+v", want) + } + } + }) } func func1(c chan int) { <-c } @@ -712,7 +845,7 @@ func TestEmptyCallStack(t *testing.T) { func TestCPUProfileLabel(t *testing.T) { testCPUProfile(t, []string{"runtime/pprof.cpuHogger;key=value"}, func(dur time.Duration) { Do(context.Background(), Labels("key", "value"), func(context.Context) { - cpuHogger(cpuHog1, dur) + cpuHogger(cpuHog1, &salt1, dur) }) }) } @@ -725,14 +858,15 @@ func TestLabelRace(t *testing.T) { start := time.Now() var wg sync.WaitGroup for time.Since(start) < dur { + var salts [10]int for i := 0; i < 10; i++ { wg.Add(1) - go func() { + go func(j int) { Do(context.Background(), Labels("key", "value"), func(context.Context) { - cpuHogger(cpuHog1, time.Millisecond) + cpuHogger(cpuHog1, &salts[j], time.Millisecond) }) wg.Done() - }() + }(i) } wg.Wait() } diff --git a/src/runtime/print.go b/src/runtime/print.go index 8fa3d39905..a698fcb0e0 100644 --- a/src/runtime/print.go +++ b/src/runtime/print.go @@ -56,7 +56,7 @@ var debuglock mutex // The compiler emits calls to printlock and printunlock around // the multiple calls that implement a single Go print or println -// statement. Some of the print helpers (printsp, for example) +// statement. Some of the print helpers (printslice, for example) // call print recursively. There is also the problem of a crash // happening during the print routines and needing to acquire // the print lock to print information about the crash. 
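The `proto` subtests added to pprof_test.go above parse the binary (debug=0) profile output and look for expected call stacks via the new `stacks` and `containsStack` helpers. The sketch below performs the equivalent check outside the runtime tree; the `github.com/google/pprof/profile` import path and the choice of the goroutine profile are assumptions for illustration only, since the tests themselves use an in-tree copy of the profile package.

```go
package main

import (
	"bytes"
	"fmt"
	"runtime/pprof"

	"github.com/google/pprof/profile"
)

func main() {
	var buf bytes.Buffer
	// debug=0 selects the compressed proto format, the same mode the
	// "proto" subtests above exercise.
	if err := pprof.Lookup("goroutine").WriteTo(&buf, 0); err != nil {
		panic(err)
	}
	p, err := profile.Parse(&buf)
	if err != nil {
		panic(err)
	}
	if err := p.CheckValid(); err != nil {
		panic(err)
	}
	// Flatten each sample into a list of function names, the same shape
	// the stacks() helper above produces, so a containsStack-style
	// prefix match could be run over it.
	for _, s := range p.Sample {
		var stk []string
		for _, loc := range s.Location {
			for _, line := range loc.Line {
				stk = append(stk, line.Function.Name)
			}
		}
		fmt.Println(stk)
	}
}
```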
@@ -98,31 +98,31 @@ func gwrite(b []byte) { } func printsp() { - print(" ") + printstring(" ") } func printnl() { - print("\n") + printstring("\n") } func printbool(v bool) { if v { - print("true") + printstring("true") } else { - print("false") + printstring("false") } } func printfloat(v float64) { switch { case v != v: - print("NaN") + printstring("NaN") return case v+v == v && v > 0: - print("+Inf") + printstring("+Inf") return case v+v == v && v < 0: - print("-Inf") + printstring("-Inf") return } @@ -204,7 +204,7 @@ func printuint(v uint64) { func printint(v int64) { if v < 0 { - print("-") + printstring("-") v = -v } printuint(uint64(v)) diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 5787991f07..ff441badde 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -176,6 +176,9 @@ func main() { if _cgo_notify_runtime_init_done == nil { throw("_cgo_notify_runtime_init_done missing") } + // Start the template thread in case we enter Go from + // a C-created thread and need to create a new thread. + startTemplateThread() cgocall(_cgo_notify_runtime_init_done, nil) } @@ -252,9 +255,10 @@ func forcegchelper() { } } +//go:nosplit + // Gosched yields the processor, allowing other goroutines to run. It does not // suspend the current goroutine, so execution resumes automatically. -//go:nosplit func Gosched() { mcall(gosched_m) } @@ -342,8 +346,8 @@ func releaseSudog(s *sudog) { if s.elem != nil { throw("runtime: sudog with non-nil elem") } - if s.selectdone != nil { - throw("runtime: sudog with non-nil selectdone") + if s.isSelect { + throw("runtime: sudog with non-false isSelect") } if s.next != nil { throw("runtime: sudog with non-nil next") @@ -432,7 +436,7 @@ func badctxt() { func lockedOSThread() bool { gp := getg() - return gp.lockedm != nil && gp.m.lockedg != nil + return gp.lockedm != 0 && gp.m.lockedg != 0 } var ( @@ -498,13 +502,21 @@ func schedinit() { if n, ok := atoi32(gogetenv("GOMAXPROCS")); ok && n > 0 { procs = n } - if procs > _MaxGomaxprocs { - procs = _MaxGomaxprocs - } if procresize(procs) != nil { throw("unknown runnable goroutine during bootstrap") } + // For cgocheck > 1, we turn on the write barrier at all times + // and check all pointer writes. We can't do this until after + // procresize because the write barrier needs a P. + if debug.cgocheck > 1 { + writeBarrier.cgo = true + writeBarrier.enabled = true + for _, p := range allp { + p.wbBuf.reset() + } + } + if buildVersion == "" { // Condition should never trigger. This code just serves // to ensure runtime·buildVersion is kept in the resulting binary. 
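schedinit above no longer clamps the requested GOMAXPROCS to `_MaxGomaxprocs`; any positive value is accepted, and (as the procresize changes later in this diff show) `allp` now grows on demand. A minimal illustration, assuming nothing beyond the public `runtime.GOMAXPROCS` API:

```go
package main

import (
	"fmt"
	"runtime"
)

func main() {
	// Before this change, values above _MaxGomaxprocs were silently clamped.
	prev := runtime.GOMAXPROCS(2048)
	fmt.Println("previous GOMAXPROCS:", prev)
	fmt.Println("current GOMAXPROCS:", runtime.GOMAXPROCS(0)) // 0 only queries
}
```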
@@ -520,7 +532,7 @@ func dumpgstatus(gp *g) { func checkmcount() { // sched lock is held - if sched.mcount > sched.maxmcount { + if mcount() > sched.maxmcount { print("runtime: program exceeds ", sched.maxmcount, "-thread limit\n") throw("thread exhaustion") } @@ -534,15 +546,20 @@ func mcommoninit(mp *m) { callers(1, mp.createstack[:]) } - mp.fastrand = 0x49f6428a + uint32(mp.id) + uint32(cputicks()) - if mp.fastrand == 0 { - mp.fastrand = 0x49f6428a - } - lock(&sched.lock) - mp.id = sched.mcount - sched.mcount++ + if sched.mnext+1 < sched.mnext { + throw("runtime: thread ID overflow") + } + mp.id = sched.mnext + sched.mnext++ checkmcount() + + mp.fastrand[0] = 1597334677 * uint32(mp.id) + mp.fastrand[1] = uint32(cputicks()) + if mp.fastrand[0]|mp.fastrand[1] == 0 { + mp.fastrand[1] = 1 + } + mpreinit(mp) if mp.gsignal != nil { mp.gsignal.stackguard1 = mp.gsignal.stack.lo + _StackGuard @@ -765,8 +782,10 @@ func casgstatus(gp *g, oldval, newval uint32) { // _Grunning or _Grunning|_Gscan; either way, // we own gp.gcscanvalid, so it's safe to read. // gp.gcscanvalid must not be true when we are running. - print("runtime: casgstatus ", hex(oldval), "->", hex(newval), " gp.status=", hex(gp.atomicstatus), " gp.gcscanvalid=true\n") - throw("casgstatus") + systemstack(func() { + print("runtime: casgstatus ", hex(oldval), "->", hex(newval), " gp.status=", hex(gp.atomicstatus), " gp.gcscanvalid=true\n") + throw("casgstatus") + }) } // See http://golang.org/cl/21503 for justification of the yield delay. @@ -951,7 +970,7 @@ func stopTheWorld(reason string) { // startTheWorld undoes the effects of stopTheWorld. func startTheWorld() { - systemstack(startTheWorldWithSema) + systemstack(func() { startTheWorldWithSema(false) }) // worldsema must be held over startTheWorldWithSema to ensure // gomaxprocs cannot change while worldsema is held. semrelease(&worldsema) @@ -1001,8 +1020,7 @@ func stopTheWorldWithSema() { _g_.m.p.ptr().status = _Pgcstop // Pgcstop is only diagnostic. sched.stopwait-- // try to retake all P's in Psyscall status - for i := 0; i < int(gomaxprocs); i++ { - p := allp[i] + for _, p := range allp { s := p.status if s == _Psyscall && atomic.Cas(&p.status, s, _Pgcstop) { if trace.enabled { @@ -1042,8 +1060,7 @@ func stopTheWorldWithSema() { if sched.stopwait != 0 { bad = "stopTheWorld: not stopped (stopwait != 0)" } else { - for i := 0; i < int(gomaxprocs); i++ { - p := allp[i] + for _, p := range allp { if p.status != _Pgcstop { bad = "stopTheWorld: not stopped (status != _Pgcstop)" } @@ -1067,12 +1084,14 @@ func mhelpgc() { _g_.m.helpgc = -1 } -func startTheWorldWithSema() { +func startTheWorldWithSema(emitTraceEvent bool) int64 { _g_ := getg() - _g_.m.locks++ // disable preemption because it can be holding p in a local var - gp := netpoll(false) // non-blocking - injectglist(gp) + _g_.m.locks++ // disable preemption because it can be holding p in a local var + if netpollinited() { + gp := netpoll(false) // non-blocking + injectglist(gp) + } add := needaddgcproc() lock(&sched.lock) @@ -1107,6 +1126,12 @@ func startTheWorldWithSema() { } } + // Capture start-the-world time before doing clean-up tasks. + startTime := nanotime() + if emitTraceEvent { + traceGCSTWDone() + } + // Wakeup an additional proc in case we have excessive runnable goroutines // in local queues or in the global queue. If we don't, the proc will park itself. // If we have lots of excessive work, resetspinning will unpark additional procs as necessary. 
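mcommoninit above now seeds a two-word `fastrand` state per M and forces that state to be non-zero. The sketch below shows why the guard matters for an xorshift-style generator: the step function used here is one common variant chosen for illustration (the runtime's own step is not part of this diff), but any such generator maps the all-zero state to itself and would return 0 forever.

```go
package main

import "fmt"

type rng struct{ s0, s1 uint32 }

func newRNG(id, ticks uint32) *rng {
	// Mirrors the seeding added to mcommoninit above.
	r := &rng{s0: 1597334677 * id, s1: ticks}
	if r.s0|r.s1 == 0 {
		r.s1 = 1 // never start in the absorbing all-zero state
	}
	return r
}

func (r *rng) next() uint32 {
	// Illustrative xorshift step: a zero state stays zero, every other
	// state keeps producing non-trivial output.
	s1, s0 := r.s0, r.s1
	s1 ^= s1 << 17
	s1 = s1 ^ s0 ^ s1>>7 ^ s0>>16
	r.s0, r.s1 = s0, s1
	return s0 + s1
}

func main() {
	r := newRNG(0, 0) // both inputs zero: the guard kicks in
	fmt.Println(r.next(), r.next(), r.next())
}
```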
@@ -1128,14 +1153,25 @@ func startTheWorldWithSema() { if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack _g_.stackguard0 = stackPreempt } + + return startTime } // Called to start an M. +// +// This must not split the stack because we may not even have stack +// bounds set up yet. +// +// May run during STW (because it doesn't have a P yet), so write +// barriers are not allowed. +// //go:nosplit +//go:nowritebarrierrec func mstart() { _g_ := getg() - if _g_.stack.lo == 0 { + osStack := _g_.stack.lo == 0 + if osStack { // Initialize stack bounds from system stack. // Cgo may have left stack size in stack.hi. size := _g_.stack.hi @@ -1149,33 +1185,37 @@ func mstart() { // both Go and C functions with stack growth prologues. _g_.stackguard0 = _g_.stack.lo + _StackGuard _g_.stackguard1 = _g_.stackguard0 - mstart1() + mstart1(0) + + // Exit this thread. + if GOOS == "windows" || GOOS == "solaris" || GOOS == "plan9" { + // Window, Solaris and Plan 9 always system-allocate + // the stack, but put it in _g_.stack before mstart, + // so the logic above hasn't set osStack yet. + osStack = true + } + mexit(osStack) } -func mstart1() { +func mstart1(dummy int32) { _g_ := getg() if _g_ != _g_.m.g0 { throw("bad runtime·mstart") } - // Record top of stack for use by mcall. - // Once we call schedule we're never coming back, - // so other calls can reuse this stack space. - gosave(&_g_.m.g0.sched) - _g_.m.g0.sched.pc = ^uintptr(0) // make sure it is never used + // Record the caller for use as the top of stack in mcall and + // for terminating the thread. + // We're never coming back to mstart1 after we call schedule, + // so other calls can reuse the current frame. + save(getcallerpc(), getcallersp(unsafe.Pointer(&dummy))) asminit() minit() // Install signal handlers; after minit so that minit can // prepare the thread to be able to handle the signals. if _g_.m == &m0 { - // Create an extra M for callbacks on threads not created by Go. - if iscgo && !cgoHasExtraM { - cgoHasExtraM = true - newextram() - } - initsig(false) + mstartm0() } if fn := _g_.m.mstartfn; fn != nil { @@ -1192,6 +1232,114 @@ func mstart1() { schedule() } +// mstartm0 implements part of mstart1 that only runs on the m0. +// +// Write barriers are allowed here because we know the GC can't be +// running yet, so they'll be no-ops. +// +//go:yeswritebarrierrec +func mstartm0() { + // Create an extra M for callbacks on threads not created by Go. + if iscgo && !cgoHasExtraM { + cgoHasExtraM = true + newextram() + } + initsig(false) +} + +// mexit tears down and exits the current thread. +// +// Don't call this directly to exit the thread, since it must run at +// the top of the thread stack. Instead, use gogo(&_g_.m.g0.sched) to +// unwind the stack to the point that exits the thread. +// +// It is entered with m.p != nil, so write barriers are allowed. It +// will release the P before exiting. +// +//go:yeswritebarrierrec +func mexit(osStack bool) { + g := getg() + m := g.m + + if m == &m0 { + // This is the main thread. Just wedge it. + // + // On Linux, exiting the main thread puts the process + // into a non-waitable zombie state. On Plan 9, + // exiting the main thread unblocks wait even though + // other threads are still running. On Solaris we can + // neither exitThread nor return from mstart. Other + // bad things probably happen on other platforms. + // + // We could try to clean up this M more before wedging + // it, but that complicates signal handling. 
+ handoffp(releasep()) + lock(&sched.lock) + sched.nmfreed++ + checkdead() + unlock(&sched.lock) + notesleep(&m.park) + throw("locked m0 woke up") + } + + sigblock() + unminit() + + // Free the gsignal stack. + if m.gsignal != nil { + stackfree(m.gsignal.stack) + } + + // Remove m from allm. + lock(&sched.lock) + for pprev := &allm; *pprev != nil; pprev = &(*pprev).alllink { + if *pprev == m { + *pprev = m.alllink + goto found + } + } + throw("m not found in allm") +found: + if !osStack { + // Delay reaping m until it's done with the stack. + // + // If this is using an OS stack, the OS will free it + // so there's no need for reaping. + atomic.Store(&m.freeWait, 1) + // Put m on the free list, though it will not be reaped until + // freeWait is 0. Note that the free list must not be linked + // through alllink because some functions walk allm without + // locking, so may be using alllink. + m.freelink = sched.freem + sched.freem = m + } + unlock(&sched.lock) + + // Release the P. + handoffp(releasep()) + // After this point we must not have write barriers. + + // Invoke the deadlock detector. This must happen after + // handoffp because it may have started a new M to take our + // P's work. + lock(&sched.lock) + sched.nmfreed++ + checkdead() + unlock(&sched.lock) + + if osStack { + // Return from mstart and let the system thread + // library free the g0 stack and terminate the thread. + return + } + + // mstart is the thread's entry point, so there's nothing to + // return to. Exit the thread directly. exitThread will clear + // m.freeWait when it's done with the stack and the m can be + // reaped. + exitThread(&m.freeWait) +} + // forEachP calls fn(p) for every P p when p reaches a GC safe point. // If a P is currently executing code, this will bring the P to a GC // safe point and execute fn on that P. If the P is not executing code @@ -1215,7 +1363,7 @@ func forEachP(fn func(*p)) { sched.safePointFn = fn // Ask all Ps to run the safe point function. - for _, p := range allp[:gomaxprocs] { + for _, p := range allp { if p != _p_ { atomic.Store(&p.runSafePointFn, 1) } @@ -1243,8 +1391,7 @@ func forEachP(fn func(*p)) { // Force Ps currently in _Psyscall into _Pidle and hand them // off to induce safe point function execution. - for i := 0; i < int(gomaxprocs); i++ { - p := allp[i] + for _, p := range allp { s := p.status if s == _Psyscall && p.runSafePointFn == 1 && atomic.Cas(&p.status, s, _Pidle) { if trace.enabled { @@ -1273,8 +1420,7 @@ func forEachP(fn func(*p)) { if sched.safePointWait != 0 { throw("forEachP: not done") } - for i := 0; i < int(gomaxprocs); i++ { - p := allp[i] + for _, p := range allp { if p.runSafePointFn != 0 { throw("forEachP: P did not run fn") } @@ -1339,6 +1485,27 @@ func allocm(_p_ *p, fn func()) *m { if _g_.m.p == 0 { acquirep(_p_) // temporarily borrow p for mallocs in this function } + + // Release the free M list. We need to do this somewhere and + // this may free up a stack we can use. 
+ if sched.freem != nil { + lock(&sched.lock) + var newList *m + for freem := sched.freem; freem != nil; { + if freem.freeWait != 0 { + next := freem.freelink + freem.freelink = newList + newList = freem + freem = next + continue + } + stackfree(freem.g0.stack) + freem = freem.freelink + } + sched.freem = newList + unlock(&sched.lock) + } + mp := new(m) mp.mstartfn = fn mcommoninit(mp) @@ -1498,9 +1665,9 @@ func oneNewExtraM() { casgstatus(gp, _Gidle, _Gdead) gp.m = mp mp.curg = gp - mp.locked = _LockInternal - mp.lockedg = gp - gp.lockedm = mp + mp.lockedInt++ + mp.lockedg.set(gp) + gp.lockedm.set(mp) gp.goid = int64(atomic.Xadd64(&sched.goidgen, 1)) if raceenabled { gp.racectx = racegostart(funcPC(newextram) + sys.PCQuantum) @@ -1629,6 +1796,27 @@ func unlockextra(mp *m) { // around exec'ing while creating/destroying threads. See issue #19546. var execLock rwmutex +// newmHandoff contains a list of m structures that need new OS threads. +// This is used by newm in situations where newm itself can't safely +// start an OS thread. +var newmHandoff struct { + lock mutex + + // newm points to a list of M structures that need new OS + // threads. The list is linked through m.schedlink. + newm muintptr + + // waiting indicates that wake needs to be notified when an m + // is put on the list. + waiting bool + wake note + + // haveTemplateThread indicates that the templateThread has + // been started. This is not protected by lock. Use cas to set + // to 1. + haveTemplateThread uint32 +} + // Create a new m. It will start off with a call to fn, or else the scheduler. // fn needs to be static and not a heap allocated closure. // May run with m.p==nil, so write barriers are not allowed. @@ -1637,6 +1825,35 @@ func newm(fn func(), _p_ *p) { mp := allocm(_p_, fn) mp.nextp.set(_p_) mp.sigmask = initSigmask + if gp := getg(); gp != nil && gp.m != nil && (gp.m.lockedExt != 0 || gp.m.incgo) && GOOS != "plan9" { + // We're on a locked M or a thread that may have been + // started by C. The kernel state of this thread may + // be strange (the user may have locked it for that + // purpose). We don't want to clone that into another + // thread. Instead, ask a known-good thread to create + // the thread for us. + // + // This is disabled on Plan 9. See golang.org/issue/22227. + // + // TODO: This may be unnecessary on Windows, which + // doesn't model thread creation off fork. + lock(&newmHandoff.lock) + if newmHandoff.haveTemplateThread == 0 { + throw("on a locked thread with no template thread") + } + mp.schedlink = newmHandoff.newm + newmHandoff.newm.set(mp) + if newmHandoff.waiting { + newmHandoff.waiting = false + notewakeup(&newmHandoff.wake) + } + unlock(&newmHandoff.lock) + return + } + newm1(mp) +} + +func newm1(mp *m) { if iscgo { var ts cgothreadstart if _cgo_thread_start == nil { @@ -1658,6 +1875,56 @@ func newm(fn func(), _p_ *p) { execLock.runlock() } +// startTemplateThread starts the template thread if it is not already +// running. +// +// The calling thread must itself be in a known-good state. +func startTemplateThread() { + if !atomic.Cas(&newmHandoff.haveTemplateThread, 0, 1) { + return + } + newm(templateThread, nil) +} + +// tmeplateThread is a thread in a known-good state that exists solely +// to start new threads in known-good states when the calling thread +// may not be a a good state. +// +// Many programs never need this, so templateThread is started lazily +// when we first enter a state that might lead to running on a thread +// in an unknown state. 
+// +// templateThread runs on an M without a P, so it must not have write +// barriers. +// +//go:nowritebarrierrec +func templateThread() { + lock(&sched.lock) + sched.nmsys++ + checkdead() + unlock(&sched.lock) + + for { + lock(&newmHandoff.lock) + for newmHandoff.newm != 0 { + newm := newmHandoff.newm.ptr() + newmHandoff.newm = 0 + unlock(&newmHandoff.lock) + for newm != nil { + next := newm.schedlink.ptr() + newm.schedlink = 0 + newm1(newm) + newm = next + } + lock(&newmHandoff.lock) + } + newmHandoff.waiting = true + noteclear(&newmHandoff.wake) + unlock(&newmHandoff.lock) + notesleep(&newmHandoff.wake) + } +} + // Stops execution of the current m until new work is available. // Returns with acquired P. func stopm() { @@ -1680,7 +1947,9 @@ retry: notesleep(&_g_.m.park) noteclear(&_g_.m.park) if _g_.m.helpgc != 0 { + // helpgc() set _g_.m.p and _g_.m.mcache, so we have a P. gchelper() + // Undo the effects of helpgc(). _g_.m.helpgc = 0 _g_.m.mcache = nil _g_.m.p = 0 @@ -1814,7 +2083,7 @@ func wakep() { func stoplockedm() { _g_ := getg() - if _g_.m.lockedg == nil || _g_.m.lockedg.lockedm != _g_.m { + if _g_.m.lockedg == 0 || _g_.m.lockedg.ptr().lockedm.ptr() != _g_.m { throw("stoplockedm: inconsistent locking") } if _g_.m.p != 0 { @@ -1826,7 +2095,7 @@ func stoplockedm() { // Wait until another thread schedules lockedg again. notesleep(&_g_.m.park) noteclear(&_g_.m.park) - status := readgstatus(_g_.m.lockedg) + status := readgstatus(_g_.m.lockedg.ptr()) if status&^_Gscan != _Grunnable { print("runtime:stoplockedm: g is not Grunnable or Gscanrunnable\n") dumpgstatus(_g_) @@ -1842,7 +2111,7 @@ func stoplockedm() { func startlockedm(gp *g) { _g_ := getg() - mp := gp.lockedm + mp := gp.lockedm.ptr() if mp == _g_.m { throw("startlockedm: locked to me") } @@ -1968,11 +2237,12 @@ top: // Poll network. // This netpoll is only an optimization before we resort to stealing. - // We can safely skip it if there a thread blocked in netpoll already. - // If there is any kind of logical race with that blocked thread - // (e.g. it has already returned from netpoll, but does not set lastpoll yet), - // this thread will do blocking netpoll below anyway. - if netpollinited() && sched.lastpoll != 0 { + // We can safely skip it if there are no waiters or a thread is blocked + // in netpoll already. If there is any kind of logical race with that + // blocked thread (e.g. it has already returned from netpoll, but does + // not set lastpoll yet), this thread will do blocking netpoll below + // anyway. + if netpollinited() && atomic.Load(&netpollWaiters) > 0 && atomic.Load64(&sched.lastpoll) != 0 { if gp := netpoll(false); gp != nil { // non-blocking // netpoll returns list of goroutines linked by schedlink. injectglist(gp.schedlink.ptr()) @@ -2068,9 +2338,8 @@ stop: } // check all runqueues once again - for i := 0; i < int(gomaxprocs); i++ { - _p_ := allp[i] - if _p_ != nil && !runqempty(_p_) { + for _, _p_ := range allp { + if !runqempty(_p_) { lock(&sched.lock) _p_ = pidleget() unlock(&sched.lock) @@ -2209,9 +2478,15 @@ func schedule() { throw("schedule: holding locks") } - if _g_.m.lockedg != nil { + if _g_.m.lockedg != 0 { stoplockedm() - execute(_g_.m.lockedg, false) // Never returns. + execute(_g_.m.lockedg.ptr(), false) // Never returns. + } + + // We should not schedule away from a g that is executing a cgo call, + // since the cgo call is using the m's g0 stack. 
+ if _g_.m.incgo { + throw("schedule: in cgo") } top: @@ -2262,7 +2537,7 @@ top: resetspinning() } - if gp.lockedm != nil { + if gp.lockedm != 0 { // Hands off own p to the locked m, // then blocks waiting for a new p. startlockedm(gp) @@ -2381,8 +2656,9 @@ func goexit0(gp *g) { atomic.Xadd(&sched.ngsys, -1) } gp.m = nil - gp.lockedm = nil - _g_.m.lockedg = nil + locked := gp.lockedm != 0 + gp.lockedm = 0 + _g_.m.lockedg = 0 gp.paniconfault = false gp._defer = nil // should be true already but just in case. gp._panic = nil // non-nil for Goexit during panic. points at stack-allocated data. @@ -2392,17 +2668,37 @@ func goexit0(gp *g) { gp.labels = nil gp.timer = nil + if gcBlackenEnabled != 0 && gp.gcAssistBytes > 0 { + // Flush assist credit to the global pool. This gives + // better information to pacing if the application is + // rapidly creating an exiting goroutines. + scanCredit := int64(gcController.assistWorkPerByte * float64(gp.gcAssistBytes)) + atomic.Xaddint64(&gcController.bgScanCredit, scanCredit) + gp.gcAssistBytes = 0 + } + // Note that gp's stack scan is now "valid" because it has no // stack. gp.gcscanvalid = true dropg() - if _g_.m.locked&^_LockExternal != 0 { - print("invalid m->locked = ", _g_.m.locked, "\n") + if _g_.m.lockedInt != 0 { + print("invalid m->lockedInt = ", _g_.m.lockedInt, "\n") throw("internal lockOSThread error") } - _g_.m.locked = 0 + _g_.m.lockedExt = 0 gfput(_g_.m.p.ptr(), gp) + if locked { + // The goroutine may have locked this thread because + // it put it in an unusual kernel state. Kill it + // rather than returning it to the thread pool. + + // Return to mstart, which will release the P and exit + // the thread. + if GOOS != "plan9" { // See golang.org/issue/22227. + gogo(&_g_.m.g0.sched) + } + } schedule() } @@ -2532,7 +2828,7 @@ func reentersyscall(pc, sp uintptr) { // Standard syscall entry used by the go syscall library and normal cgo calls. //go:nosplit func entersyscall(dummy int32) { - reentersyscall(getcallerpc(unsafe.Pointer(&dummy)), getcallersp(unsafe.Pointer(&dummy))) + reentersyscall(getcallerpc(), getcallersp(unsafe.Pointer(&dummy))) } func entersyscall_sysmon() { @@ -2575,7 +2871,7 @@ func entersyscallblock(dummy int32) { _g_.m.p.ptr().syscalltick++ // Leave SP around for GC and traceback. - pc := getcallerpc(unsafe.Pointer(&dummy)) + pc := getcallerpc() sp := getcallersp(unsafe.Pointer(&dummy)) save(pc, sp) _g_.syscallsp = _g_.sched.sp @@ -2600,7 +2896,7 @@ func entersyscallblock(dummy int32) { systemstack(entersyscallblock_handoff) // Resave for traceback during blocked call. - save(getcallerpc(unsafe.Pointer(&dummy)), getcallersp(unsafe.Pointer(&dummy))) + save(getcallerpc(), getcallersp(unsafe.Pointer(&dummy))) _g_.m.locks-- } @@ -2639,7 +2935,9 @@ func exitsyscall(dummy int32) { oldp := _g_.m.p.ptr() if exitsyscallfast() { if _g_.m.mcache == nil { - throw("lost mcache") + systemstack(func() { + throw("lost mcache") + }) } if trace.enabled { if oldp != _g_.m.p.ptr() || _g_.m.syscalltick != _g_.m.p.ptr().syscalltick { @@ -2686,7 +2984,9 @@ func exitsyscall(dummy int32) { mcall(exitsyscall0) if _g_.m.mcache == nil { - throw("lost mcache") + systemstack(func() { + throw("lost mcache") + }) } // Scheduler returned, so we're allowed to run now. @@ -2810,7 +3110,7 @@ func exitsyscall0(gp *g) { acquirep(_p_) execute(gp, false) // Never returns. } - if _g_.m.lockedg != nil { + if _g_.m.lockedg != 0 { // Wait until another thread schedules gp and so m again. stoplockedm() execute(gp, false) // Never returns. 
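The `locked` handling added to goexit0 above means a goroutine that exits while still wired to its OS thread sends the thread back through mstart to mexit rather than returning it to the pool. A user-level illustration of the situation this protects against follows; the "unusual kernel state" is only described in a comment, not actually created.

```go
package main

import (
	"fmt"
	"runtime"
)

func main() {
	done := make(chan struct{})
	go func() {
		defer close(done)
		runtime.LockOSThread()
		// ... put the thread into a state unsuitable for other goroutines,
		// e.g. alter its signal or namespace state via syscalls ...
		// Returning without UnlockOSThread: with this change, the runtime
		// discards the thread instead of letting another goroutine
		// inherit its modified state.
	}()
	<-done
	fmt.Println("locked goroutine exited; its thread will not be reused")
}
```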
@@ -2928,17 +3228,16 @@ func malg(stacksize int32) *g { //go:nosplit func newproc(siz int32, fn *funcval) { argp := add(unsafe.Pointer(&fn), sys.PtrSize) - pc := getcallerpc(unsafe.Pointer(&siz)) + pc := getcallerpc() systemstack(func() { - newproc1(fn, (*uint8)(argp), siz, 0, pc) + newproc1(fn, (*uint8)(argp), siz, pc) }) } // Create a new g running fn with narg bytes of arguments starting -// at argp and returning nret bytes of results. callerpc is the -// address of the go statement that created this. The new g is put -// on the queue of g's waiting to run. -func newproc1(fn *funcval, argp *uint8, narg int32, nret int32, callerpc uintptr) *g { +// at argp. callerpc is the address of the go statement that created +// this. The new g is put on the queue of g's waiting to run. +func newproc1(fn *funcval, argp *uint8, narg int32, callerpc uintptr) { _g_ := getg() if fn == nil { @@ -2946,7 +3245,7 @@ func newproc1(fn *funcval, argp *uint8, narg int32, nret int32, callerpc uintptr throw("go of nil func value") } _g_.m.locks++ // disable preemption because it can be holding p in a local var - siz := narg + nret + siz := narg siz = (siz + 7) &^ 7 // We could allocate a larger initial stack if necessary. @@ -3041,7 +3340,6 @@ func newproc1(fn *funcval, argp *uint8, narg int32, nret int32, callerpc uintptr if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack _g_.stackguard0 = stackPreempt } - return newg } // Put on gfree list. @@ -3160,23 +3458,41 @@ func Breakpoint() { //go:nosplit func dolockOSThread() { _g_ := getg() - _g_.m.lockedg = _g_ - _g_.lockedm = _g_.m + _g_.m.lockedg.set(_g_) + _g_.lockedm.set(_g_.m) } //go:nosplit // LockOSThread wires the calling goroutine to its current operating system thread. -// Until the calling goroutine exits or calls UnlockOSThread, it will always -// execute in that thread, and no other goroutine can. +// The calling goroutine will always execute in that thread, +// and no other goroutine will execute in it, +// until the calling goroutine has made as many calls to +// UnlockOSThread as to LockOSThread. +// If the calling goroutine exits without unlocking the thread, +// the thread will be terminated. +// +// A goroutine should call LockOSThread before calling OS services or +// non-Go library functions that depend on per-thread state. func LockOSThread() { - getg().m.locked |= _LockExternal + if atomic.Load(&newmHandoff.haveTemplateThread) == 0 && GOOS != "plan9" { + // If we need to start a new thread from the locked + // thread, we need the template thread. Start it now + // while we're in a known-good state. + startTemplateThread() + } + _g_ := getg() + _g_.m.lockedExt++ + if _g_.m.lockedExt == 0 { + _g_.m.lockedExt-- + panic("LockOSThread nesting overflow") + } dolockOSThread() } //go:nosplit func lockOSThread() { - getg().m.locked += _LockInternal + getg().m.lockedInt++ dolockOSThread() } @@ -3186,29 +3502,43 @@ func lockOSThread() { //go:nosplit func dounlockOSThread() { _g_ := getg() - if _g_.m.locked != 0 { + if _g_.m.lockedInt != 0 || _g_.m.lockedExt != 0 { return } - _g_.m.lockedg = nil - _g_.lockedm = nil + _g_.m.lockedg = 0 + _g_.lockedm = 0 } //go:nosplit -// UnlockOSThread unwires the calling goroutine from its fixed operating system thread. -// If the calling goroutine has not called LockOSThread, UnlockOSThread is a no-op. +// UnlockOSThread undoes an earlier call to LockOSThread. 
+// If this drops the number of active LockOSThread calls on the +// calling goroutine to zero, it unwires the calling goroutine from +// its fixed operating system thread. +// If there are no active LockOSThread calls, this is a no-op. +// +// Before calling UnlockOSThread, the caller must ensure that the OS +// thread is suitable for running other goroutines. If the caller made +// any permanent changes to the state of the thread that would affect +// other goroutines, it should not call this function and thus leave +// the goroutine locked to the OS thread until the goroutine (and +// hence the thread) exits. func UnlockOSThread() { - getg().m.locked &^= _LockExternal + _g_ := getg() + if _g_.m.lockedExt == 0 { + return + } + _g_.m.lockedExt-- dounlockOSThread() } //go:nosplit func unlockOSThread() { _g_ := getg() - if _g_.m.locked < _LockInternal { + if _g_.m.lockedInt == 0 { systemstack(badunlockosthread) } - _g_.m.locked -= _LockInternal + _g_.m.lockedInt-- dounlockOSThread() } @@ -3218,10 +3548,7 @@ func badunlockosthread() { func gcount() int32 { n := int32(allglen) - sched.ngfree - int32(atomic.Load(&sched.ngsys)) - for _, _p_ := range &allp { - if _p_ == nil { - break - } + for _, _p_ := range allp { n -= _p_.gfreecnt } @@ -3234,7 +3561,7 @@ func gcount() int32 { } func mcount() int32 { - return sched.mcount + return int32(sched.mnext - sched.nmfreed) } var prof struct { @@ -3516,7 +3843,7 @@ func setcpuprofilerate(hz int32) { // Returns list of Ps with local work, they need to be scheduled by the caller. func procresize(nprocs int32) *p { old := gomaxprocs - if old < 0 || old > _MaxGomaxprocs || nprocs <= 0 || nprocs > _MaxGomaxprocs { + if old < 0 || nprocs <= 0 { throw("procresize: invalid arg") } if trace.enabled { @@ -3530,6 +3857,23 @@ func procresize(nprocs int32) *p { } sched.procresizetime = now + // Grow allp if necessary. + if nprocs > int32(len(allp)) { + // Synchronize with retake, which could be running + // concurrently since it doesn't run on a P. + lock(&allpLock) + if nprocs <= int32(cap(allp)) { + allp = allp[:nprocs] + } else { + nallp := make([]*p, nprocs) + // Copy everything up to allp's cap so we + // never lose old allocated Ps. + copy(nallp, allp[:cap(allp)]) + allp = nallp + } + unlock(&allpLock) + } + // initialize new P's for i := int32(0); i < nprocs; i++ { pp := allp[i] @@ -3541,6 +3885,7 @@ func procresize(nprocs int32) *p { for i := range pp.deferpool { pp.deferpool[i] = pp.deferpoolbuf[i][:0] } + pp.wbBuf.reset() atomicstorep(unsafe.Pointer(&allp[i]), unsafe.Pointer(pp)) } if pp.mcache == nil { @@ -3566,13 +3911,11 @@ func procresize(nprocs int32) *p { // free unused P's for i := nprocs; i < old; i++ { p := allp[i] - if trace.enabled { - if p == getg().m.p.ptr() { - // moving to p[0], pretend that we were descheduled - // and then scheduled again to keep the trace sane. - traceGoSched() - traceProcStop(p) - } + if trace.enabled && p == getg().m.p.ptr() { + // moving to p[0], pretend that we were descheduled + // and then scheduled again to keep the trace sane. + traceGoSched() + traceProcStop(p) } // move all runnable goroutines to the global queue for p.runqhead != p.runqtail { @@ -3598,6 +3941,11 @@ func procresize(nprocs int32) *p { // world is stopped. p.gcBgMarkWorker.set(nil) } + // Flush p's write barrier buffer. 
+ if gcphase != _GCoff { + wbBufFlush1(p) + p.gcw.dispose() + } for i := range p.sudogbuf { p.sudogbuf[i] = nil } @@ -3616,10 +3964,18 @@ func procresize(nprocs int32) *p { raceprocdestroy(p.racectx) p.racectx = 0 } + p.gcAssistTime = 0 p.status = _Pdead // can't free P itself because it can be referenced by an M in syscall } + // Trim allp. + if int32(len(allp)) != nprocs { + lock(&allpLock) + allp = allp[:nprocs] + unlock(&allpLock) + } + _g_ := getg() if _g_.m.p != 0 && _g_.m.p.ptr().id < nprocs { // continue to use the current P @@ -3691,7 +4047,7 @@ func acquirep1(_p_ *p) { throw("acquirep: already in go") } if _p_.m != 0 || _p_.status != _Pidle { - id := int32(0) + id := int64(0) if _p_.m != 0 { id = _p_.m.ptr().id } @@ -3736,6 +4092,7 @@ func incidlelocked(v int32) { // Check for deadlock situation. // The check is based on number of running M's, if 0 -> deadlock. +// sched.lock must be held. func checkdead() { // For -buildmode=c-shared or -buildmode=c-archive it's OK if // there are no running goroutines. The calling program is @@ -3752,13 +4109,12 @@ func checkdead() { return } - // -1 for sysmon - run := sched.mcount - sched.nmidle - sched.nmidlelocked - 1 + run := mcount() - sched.nmidle - sched.nmidlelocked - sched.nmsys if run > 0 { return } if run < 0 { - print("runtime: checkdead: nmidle=", sched.nmidle, " nmidlelocked=", sched.nmidlelocked, " mcount=", sched.mcount, "\n") + print("runtime: checkdead: nmidle=", sched.nmidle, " nmidlelocked=", sched.nmidlelocked, " mcount=", mcount(), " nmsys=", sched.nmsys, "\n") throw("checkdead: inconsistent counts") } @@ -3821,6 +4177,11 @@ var forcegcperiod int64 = 2 * 60 * 1e9 // //go:nowritebarrierrec func sysmon() { + lock(&sched.lock) + sched.nmsys++ + checkdead() + unlock(&sched.lock) + // If a heap span goes unused for 5 minutes after a garbage collection, // we hand it back to the operating system. scavengelimit := int64(5 * 60 * 1e9) @@ -3860,15 +4221,11 @@ func sysmon() { } shouldRelax := true if osRelaxMinNS > 0 { - lock(&timers.lock) - if timers.sleeping { - now := nanotime() - next := timers.sleepUntil - if next-now < osRelaxMinNS { - shouldRelax = false - } + next := timeSleepUntil() + now := nanotime() + if next-now < osRelaxMinNS { + shouldRelax = false } - unlock(&timers.lock) } if shouldRelax { osRelax(true) @@ -3892,7 +4249,7 @@ func sysmon() { // poll network if not polled for more than 10ms lastpoll := int64(atomic.Load64(&sched.lastpoll)) now := nanotime() - if lastpoll != 0 && lastpoll+10*1000*1000 < now { + if netpollinited() && lastpoll != 0 && lastpoll+10*1000*1000 < now { atomic.Cas64(&sched.lastpoll, uint64(lastpoll), uint64(now)) gp := netpoll(false) // non-blocking - returns list of goroutines if gp != nil { @@ -3949,9 +4306,17 @@ const forcePreemptNS = 10 * 1000 * 1000 // 10ms func retake(now int64) uint32 { n := 0 - for i := int32(0); i < gomaxprocs; i++ { + // Prevent allp slice changes. This lock will be completely + // uncontended unless we're already stopping the world. + lock(&allpLock) + // We can't use a range loop over allp because we may + // temporarily drop the allpLock. Hence, we need to re-fetch + // allp each time around the loop. + for i := 0; i < len(allp); i++ { _p_ := allp[i] if _p_ == nil { + // This can happen if procresize has grown + // allp but not yet created new Ps. 
continue } pd := &_p_.sysmontick @@ -3970,6 +4335,8 @@ func retake(now int64) uint32 { if runqempty(_p_) && atomic.Load(&sched.nmspinning)+atomic.Load(&sched.npidle) > 0 && pd.syscallwhen+10*1000*1000 > now { continue } + // Drop allpLock so we can take sched.lock. + unlock(&allpLock) // Need to decrement number of idle locked M's // (pretending that one more is running) before the CAS. // Otherwise the M from which we retake can exit the syscall, @@ -3985,6 +4352,7 @@ func retake(now int64) uint32 { handoffp(_p_) } incidlelocked(1) + lock(&allpLock) } else if s == _Prunning { // Preempt G if it's running for too long. t := int64(_p_.schedtick) @@ -3999,6 +4367,7 @@ func retake(now int64) uint32 { preemptone(_p_) } } + unlock(&allpLock) return uint32(n) } @@ -4009,9 +4378,8 @@ func retake(now int64) uint32 { // Returns true if preemption request was issued to at least one goroutine. func preemptall() bool { res := false - for i := int32(0); i < gomaxprocs; i++ { - _p_ := allp[i] - if _p_ == nil || _p_.status != _Prunning { + for _, _p_ := range allp { + if _p_.status != _Prunning { continue } if preemptone(_p_) { @@ -4060,23 +4428,19 @@ func schedtrace(detailed bool) { } lock(&sched.lock) - print("SCHED ", (now-starttime)/1e6, "ms: gomaxprocs=", gomaxprocs, " idleprocs=", sched.npidle, " threads=", sched.mcount, " spinningthreads=", sched.nmspinning, " idlethreads=", sched.nmidle, " runqueue=", sched.runqsize) + print("SCHED ", (now-starttime)/1e6, "ms: gomaxprocs=", gomaxprocs, " idleprocs=", sched.npidle, " threads=", mcount(), " spinningthreads=", sched.nmspinning, " idlethreads=", sched.nmidle, " runqueue=", sched.runqsize) if detailed { print(" gcwaiting=", sched.gcwaiting, " nmidlelocked=", sched.nmidlelocked, " stopwait=", sched.stopwait, " sysmonwait=", sched.sysmonwait, "\n") } // We must be careful while reading data from P's, M's and G's. // Even if we hold schedlock, most data can be changed concurrently. // E.g. (p->m ? p->m->id : -1) can crash if p->m changes from non-nil to nil. - for i := int32(0); i < gomaxprocs; i++ { - _p_ := allp[i] - if _p_ == nil { - continue - } + for i, _p_ := range allp { mp := _p_.m.ptr() h := atomic.Load(&_p_.runqhead) t := atomic.Load(&_p_.runqtail) if detailed { - id := int32(-1) + id := int64(-1) if mp != nil { id = mp.id } @@ -4089,7 +4453,7 @@ func schedtrace(detailed bool) { print("[") } print(t - h) - if i == gomaxprocs-1 { + if i == len(allp)-1 { print("]\n") } } @@ -4103,7 +4467,7 @@ func schedtrace(detailed bool) { for mp := allm; mp != nil; mp = mp.alllink { _p_ := mp.p.ptr() gp := mp.curg - lockedg := mp.lockedg + lockedg := mp.lockedg.ptr() id1 := int32(-1) if _p_ != nil { id1 = _p_.id @@ -4123,12 +4487,12 @@ func schedtrace(detailed bool) { for gi := 0; gi < len(allgs); gi++ { gp := allgs[gi] mp := gp.m - lockedm := gp.lockedm - id1 := int32(-1) + lockedm := gp.lockedm.ptr() + id1 := int64(-1) if mp != nil { id1 = mp.id } - id2 := int32(-1) + id2 := int64(-1) if lockedm != nil { id2 = lockedm.id } @@ -4410,22 +4774,25 @@ func runqgrab(_p_ *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool if stealRunNextG { // Try to steal from _p_.runnext. if next := _p_.runnext; next != 0 { - // Sleep to ensure that _p_ isn't about to run the g we - // are about to steal. - // The important use case here is when the g running on _p_ - // ready()s another g and then almost immediately blocks. - // Instead of stealing runnext in this window, back off - // to give _p_ a chance to schedule runnext. 
This will avoid - // thrashing gs between different Ps. - // A sync chan send/recv takes ~50ns as of time of writing, - // so 3us gives ~50x overshoot. - if GOOS != "windows" { - usleep(3) - } else { - // On windows system timer granularity is 1-15ms, - // which is way too much for this optimization. - // So just yield. - osyield() + if _p_.status == _Prunning { + // Sleep to ensure that _p_ isn't about to run the g + // we are about to steal. + // The important use case here is when the g running + // on _p_ ready()s another g and then almost + // immediately blocks. Instead of stealing runnext + // in this window, back off to give _p_ a chance to + // schedule runnext. This will avoid thrashing gs + // between different Ps. + // A sync chan send/recv takes ~50ns as of time of + // writing, so 3us gives ~50x overshoot. + if GOOS != "windows" { + usleep(3) + } else { + // On windows system timer granularity is + // 1-15ms, which is way too much for this + // optimization. So just yield. + osyield() + } } if !_p_.runnext.cas(next, 0) { continue diff --git a/src/runtime/proc_test.go b/src/runtime/proc_test.go index 90a6cab874..2ece829071 100644 --- a/src/runtime/proc_test.go +++ b/src/runtime/proc_test.go @@ -655,6 +655,116 @@ func BenchmarkClosureCall(b *testing.B) { _ = sum } +func benchmarkWakeupParallel(b *testing.B, spin func(time.Duration)) { + if runtime.GOMAXPROCS(0) == 1 { + b.Skip("skipping: GOMAXPROCS=1") + } + + wakeDelay := 5 * time.Microsecond + for _, delay := range []time.Duration{ + 0, + 1 * time.Microsecond, + 2 * time.Microsecond, + 5 * time.Microsecond, + 10 * time.Microsecond, + 20 * time.Microsecond, + 50 * time.Microsecond, + 100 * time.Microsecond, + } { + b.Run(delay.String(), func(b *testing.B) { + if b.N == 0 { + return + } + // Start two goroutines, which alternate between being + // sender and receiver in the following protocol: + // + // - The receiver spins for `delay` and then does a + // blocking receive on a channel. + // + // - The sender spins for `delay+wakeDelay` and then + // sends to the same channel. (The addition of + // `wakeDelay` improves the probability that the + // receiver will be blocking when the send occurs when + // the goroutines execute in parallel.) + // + // In each iteration of the benchmark, each goroutine + // acts once as sender and once as receiver, so each + // goroutine spins for delay twice. + // + // BenchmarkWakeupParallel is used to estimate how + // efficiently the scheduler parallelizes goroutines in + // the presence of blocking: + // + // - If both goroutines are executed on the same core, + // an increase in delay by N will increase the time per + // iteration by 4*N, because all 4 delays are + // serialized. + // + // - Otherwise, an increase in delay by N will increase + // the time per iteration by 2*N, and the time per + // iteration is 2 * (runtime overhead + chan + // send/receive pair + delay + wakeDelay). This allows + // the runtime overhead, including the time it takes + // for the unblocked goroutine to be scheduled, to be + // estimated. 
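To make the arithmetic in that comment concrete, here is a small, purely illustrative Go program that evaluates the two per-iteration cost formulas it describes. Runtime overhead and the channel operations themselves are ignored, and the numbers are not measurements; the benchmark body that implements the actual protocol follows immediately below.

package main

import "fmt"

// perIteration returns the spin cost of one benchmark iteration, in the
// same units as delay, under the two scenarios the comment describes.
func perIteration(delay, wakeDelay float64) (sameCore, parallel float64) {
	// Same core: all four spin segments are serialized:
	// two receiver spins (delay) plus two sender spins (delay+wakeDelay).
	sameCore = 2*delay + 2*(delay+wakeDelay)
	// Separate cores: sender and receiver overlap, so the critical path
	// per iteration is roughly 2 * (delay + wakeDelay).
	parallel = 2 * (delay + wakeDelay)
	return
}

func main() {
	const wakeDelay = 5.0 // microseconds, as in the benchmark
	for _, d := range []float64{10, 20} {
		s, p := perIteration(d, wakeDelay)
		fmt.Printf("delay=%gus: same core ~%gus, parallel ~%gus\n", d, s, p)
	}
}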
+ ping, pong := make(chan struct{}), make(chan struct{}) + start := make(chan struct{}) + done := make(chan struct{}) + go func() { + <-start + for i := 0; i < b.N; i++ { + // sender + spin(delay + wakeDelay) + ping <- struct{}{} + // receiver + spin(delay) + <-pong + } + done <- struct{}{} + }() + go func() { + for i := 0; i < b.N; i++ { + // receiver + spin(delay) + <-ping + // sender + spin(delay + wakeDelay) + pong <- struct{}{} + } + done <- struct{}{} + }() + b.ResetTimer() + start <- struct{}{} + <-done + <-done + }) + } +} + +func BenchmarkWakeupParallelSpinning(b *testing.B) { + benchmarkWakeupParallel(b, func(d time.Duration) { + end := time.Now().Add(d) + for time.Now().Before(end) { + // do nothing + } + }) +} + +// sysNanosleep is defined by OS-specific files (such as runtime_linux_test.go) +// to sleep for the given duration. If nil, dependent tests are skipped. +// The implementation should invoke a blocking system call and not +// call time.Sleep, which would deschedule the goroutine. +var sysNanosleep func(d time.Duration) + +func BenchmarkWakeupParallelSyscall(b *testing.B) { + if sysNanosleep == nil { + b.Skipf("skipping on %v; sysNanosleep not defined", runtime.GOOS) + } + benchmarkWakeupParallel(b, func(d time.Duration) { + sysNanosleep(d) + }) +} + type Matrix [][]float64 func BenchmarkMatmult(b *testing.B) { @@ -722,3 +832,44 @@ func matmult(done chan<- struct{}, A, B, C Matrix, i0, i1, j0, j1, k0, k1, thres func TestStealOrder(t *testing.T) { runtime.RunStealOrderTest() } + +func TestLockOSThreadNesting(t *testing.T) { + go func() { + e, i := runtime.LockOSCounts() + if e != 0 || i != 0 { + t.Errorf("want locked counts 0, 0; got %d, %d", e, i) + return + } + runtime.LockOSThread() + runtime.LockOSThread() + runtime.UnlockOSThread() + e, i = runtime.LockOSCounts() + if e != 1 || i != 0 { + t.Errorf("want locked counts 1, 0; got %d, %d", e, i) + return + } + runtime.UnlockOSThread() + e, i = runtime.LockOSCounts() + if e != 0 || i != 0 { + t.Errorf("want locked counts 0, 0; got %d, %d", e, i) + return + } + }() +} + +func TestLockOSThreadExit(t *testing.T) { + testLockOSThreadExit(t, "testprog") +} + +func testLockOSThreadExit(t *testing.T, prog string) { + output := runTestProg(t, prog, "LockOSThreadMain", "GOMAXPROCS=1") + want := "OK\n" + if output != want { + t.Errorf("want %s, got %s\n", want, output) + } + + output = runTestProg(t, prog, "LockOSThreadAlt") + if output != want { + t.Errorf("want %s, got %s\n", want, output) + } +} diff --git a/src/runtime/race.go b/src/runtime/race.go index 49495cc783..2f5713d30e 100644 --- a/src/runtime/race.go +++ b/src/runtime/race.go @@ -4,14 +4,14 @@ // +build race -// Public race detection API, present iff build with -race. - package runtime import ( "unsafe" ) +// Public race detection API, present iff build with -race. + func RaceRead(addr unsafe.Pointer) func RaceWrite(addr unsafe.Pointer) func RaceReadRange(addr unsafe.Pointer, len int) @@ -23,7 +23,69 @@ func RaceErrors() int { return int(n) } -// private interface for the runtime +//go:nosplit + +// RaceAcquire/RaceRelease/RaceReleaseMerge establish happens-before relations +// between goroutines. These inform the race detector about actual synchronization +// that it can't see for some reason (e.g. synchronization within RaceDisable/RaceEnable +// sections of code). +// RaceAcquire establishes a happens-before relation with the preceding +// RaceReleaseMerge on addr up to and including the last RaceRelease on addr. 
+// In terms of the C memory model (C11 §5.1.2.4, §7.17.3), +// RaceAcquire is equivalent to atomic_load(memory_order_acquire). +func RaceAcquire(addr unsafe.Pointer) { + raceacquire(addr) +} + +//go:nosplit + +// RaceRelease performs a release operation on addr that +// can synchronize with a later RaceAcquire on addr. +// +// In terms of the C memory model, RaceRelease is equivalent to +// atomic_store(memory_order_release). +func RaceRelease(addr unsafe.Pointer) { + racerelease(addr) +} + +//go:nosplit + +// RaceReleaseMerge is like RaceRelease, but also establishes a happens-before +// relation with the preceding RaceRelease or RaceReleaseMerge on addr. +// +// In terms of the C memory model, RaceReleaseMerge is equivalent to +// atomic_exchange(memory_order_release). +func RaceReleaseMerge(addr unsafe.Pointer) { + racereleasemerge(addr) +} + +//go:nosplit + +// RaceDisable disables handling of race synchronization events in the current goroutine. +// Handling is re-enabled with RaceEnable. RaceDisable/RaceEnable can be nested. +// Non-synchronization events (memory accesses, function entry/exit) still affect +// the race detector. +func RaceDisable() { + _g_ := getg() + if _g_.raceignore == 0 { + racecall(&__tsan_go_ignore_sync_begin, _g_.racectx, 0, 0, 0) + } + _g_.raceignore++ +} + +//go:nosplit + +// RaceEnable re-enables handling of race events in the current goroutine. +func RaceEnable() { + _g_ := getg() + _g_.raceignore-- + if _g_.raceignore == 0 { + racecall(&__tsan_go_ignore_sync_end, _g_.racectx, 0, 0, 0) + } +} + +// Private interface for the runtime. + const raceenabled = true // For all functions accepting callerpc and pc, @@ -433,43 +495,3 @@ func racereleasemergeg(gp *g, addr unsafe.Pointer) { func racefingo() { racecall(&__tsan_finalizer_goroutine, getg().racectx, 0, 0, 0) } - -//go:nosplit - -func RaceAcquire(addr unsafe.Pointer) { - raceacquire(addr) -} - -//go:nosplit - -func RaceRelease(addr unsafe.Pointer) { - racerelease(addr) -} - -//go:nosplit - -func RaceReleaseMerge(addr unsafe.Pointer) { - racereleasemerge(addr) -} - -//go:nosplit - -// RaceDisable disables handling of race events in the current goroutine. -func RaceDisable() { - _g_ := getg() - if _g_.raceignore == 0 { - racecall(&__tsan_go_ignore_sync_begin, _g_.racectx, 0, 0, 0) - } - _g_.raceignore++ -} - -//go:nosplit - -// RaceEnable re-enables handling of race events in the current goroutine. -func RaceEnable() { - _g_ := getg() - _g_.raceignore-- - if _g_.raceignore == 0 { - racecall(&__tsan_go_ignore_sync_end, _g_.racectx, 0, 0, 0) - } -} diff --git a/src/runtime/race/output_test.go b/src/runtime/race/output_test.go index 13dfc33b47..adf9ce8851 100644 --- a/src/runtime/race/output_test.go +++ b/src/runtime/race/output_test.go @@ -19,6 +19,16 @@ import ( ) func TestOutput(t *testing.T) { + pkgdir, err := ioutil.TempDir("", "go-build-race-output") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(pkgdir) + out, err := exec.Command(testenv.GoToolPath(t), "install", "-race", "-pkgdir="+pkgdir, "-gcflags=all=-l", "testing").CombinedOutput() + if err != nil { + t.Fatalf("go install -race: %v\n%s", err, out) + } + for _, test := range tests { if test.goos != "" && test.goos != runtime.GOOS { t.Logf("test %v runs only on %v, skipping: ", test.name, test.goos) @@ -47,7 +57,7 @@ func TestOutput(t *testing.T) { t.Fatalf("failed to close file: %v", err) } // Pass -l to the compiler to test stack traces. 
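The doc comments moved to the top of race.go above spell out when a program would call these annotations: real synchronization that the detector cannot observe on its own. The following is a contrived, hypothetical sketch of that usage, not a recommended pattern from the library. All names are illustrative, the file would need to be limited to -race builds (these functions only exist under the race build tag), and the channel merely stands in for whatever out-of-band mechanism a real program would synchronize through so that the sketch compiles. The output_test.go hunk resumes below.

// +build race

// Package racedemo is a contrived sketch of the runtime race annotations.
package racedemo

import (
	"runtime"
	"unsafe"
)

var (
	data int
	sig  = make(chan struct{}, 1) // stand-in for sync the detector can't see
)

// publish stores data and then signals a consumer. In real use the signal
// would travel through something the race detector cannot model (another
// process, a device, a RaceDisable'd section); the channel is only here to
// keep the sketch runnable.
func publish(v int) {
	data = v
	// Everything written before this point happens before a matching
	// RaceAcquire on &data — like atomic_store(memory_order_release).
	runtime.RaceRelease(unsafe.Pointer(&data))
	sig <- struct{}{}
}

// consume waits for the signal and reads data. RaceAcquire pairs with the
// RaceRelease in publish — like atomic_load(memory_order_acquire) — so the
// detector does not report the read of data as a race.
func consume() int {
	<-sig
	runtime.RaceAcquire(unsafe.Pointer(&data))
	return data
}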
- cmd := exec.Command(testenv.GoToolPath(t), test.run, "-race", "-gcflags=-l", src) + cmd := exec.Command(testenv.GoToolPath(t), test.run, "-race", "-pkgdir="+pkgdir, "-gcflags=all=-l", src) // GODEBUG spoils program output, GOMAXPROCS makes it flaky. for _, env := range os.Environ() { if strings.HasPrefix(env, "GODEBUG=") || @@ -175,6 +185,7 @@ import "testing" func TestFail(t *testing.T) { done := make(chan bool) x := 0 + _ = x go func() { x = 42 done <- true @@ -186,7 +197,7 @@ func TestFail(t *testing.T) { `, ` ================== --- FAIL: TestFail \(0...s\) -.*main_test.go:13: true +.*main_test.go:14: true .*testing.go:.*: race detected during execution of test FAIL`}, @@ -253,7 +264,7 @@ Previous write at 0x[0-9,a-f]+ by goroutine [0-9]: main\.goCallback\(\) .*/main\.go:27 \+0x[0-9,a-f]+ main._cgoexpwrap_[0-9a-z]+_goCallback\(\) - .*/_cgo_gotypes\.go:[0-9]+ \+0x[0-9,a-f]+ + .*_cgo_gotypes\.go:[0-9]+ \+0x[0-9,a-f]+ Goroutine [0-9] \(running\) created at: runtime\.newextram\(\) @@ -265,6 +276,7 @@ import "testing" func TestFail(t *testing.T) { done := make(chan bool) x := 0 + _ = x go func() { x = 42 done <- true diff --git a/src/runtime/race/testdata/atomic_test.go b/src/runtime/race/testdata/atomic_test.go index 232744b3dd..769c8d7398 100644 --- a/src/runtime/race/testdata/atomic_test.go +++ b/src/runtime/race/testdata/atomic_test.go @@ -14,6 +14,7 @@ import ( func TestNoRaceAtomicAddInt64(t *testing.T) { var x1, x2 int8 + _ = x1 + x2 var s int64 ch := make(chan bool, 2) go func() { @@ -36,6 +37,7 @@ func TestNoRaceAtomicAddInt64(t *testing.T) { func TestRaceAtomicAddInt64(t *testing.T) { var x1, x2 int8 + _ = x1 + x2 var s int64 ch := make(chan bool, 2) go func() { @@ -58,6 +60,7 @@ func TestRaceAtomicAddInt64(t *testing.T) { func TestNoRaceAtomicAddInt32(t *testing.T) { var x1, x2 int8 + _ = x1 + x2 var s int32 ch := make(chan bool, 2) go func() { @@ -80,6 +83,7 @@ func TestNoRaceAtomicAddInt32(t *testing.T) { func TestNoRaceAtomicLoadAddInt32(t *testing.T) { var x int64 + _ = x var s int32 go func() { x = 2 @@ -93,6 +97,7 @@ func TestNoRaceAtomicLoadAddInt32(t *testing.T) { func TestNoRaceAtomicLoadStoreInt32(t *testing.T) { var x int64 + _ = x var s int32 go func() { x = 2 @@ -106,6 +111,7 @@ func TestNoRaceAtomicLoadStoreInt32(t *testing.T) { func TestNoRaceAtomicStoreCASInt32(t *testing.T) { var x int64 + _ = x var s int32 go func() { x = 2 @@ -119,6 +125,7 @@ func TestNoRaceAtomicStoreCASInt32(t *testing.T) { func TestNoRaceAtomicCASLoadInt32(t *testing.T) { var x int64 + _ = x var s int32 go func() { x = 2 @@ -134,6 +141,7 @@ func TestNoRaceAtomicCASLoadInt32(t *testing.T) { func TestNoRaceAtomicCASCASInt32(t *testing.T) { var x int64 + _ = x var s int32 go func() { x = 2 @@ -149,6 +157,7 @@ func TestNoRaceAtomicCASCASInt32(t *testing.T) { func TestNoRaceAtomicCASCASInt32_2(t *testing.T) { var x1, x2 int8 + _ = x1 + x2 var s int32 ch := make(chan bool, 2) go func() { @@ -171,6 +180,7 @@ func TestNoRaceAtomicCASCASInt32_2(t *testing.T) { func TestNoRaceAtomicLoadInt64(t *testing.T) { var x int32 + _ = x var s int64 go func() { x = 2 @@ -184,6 +194,7 @@ func TestNoRaceAtomicLoadInt64(t *testing.T) { func TestNoRaceAtomicCASCASUInt64(t *testing.T) { var x int64 + _ = x var s uint64 go func() { x = 2 @@ -199,6 +210,7 @@ func TestNoRaceAtomicCASCASUInt64(t *testing.T) { func TestNoRaceAtomicLoadStorePointer(t *testing.T) { var x int64 + _ = x var s unsafe.Pointer var y int = 2 var p unsafe.Pointer = unsafe.Pointer(&y) @@ -214,6 +226,7 @@ func TestNoRaceAtomicLoadStorePointer(t 
*testing.T) { func TestNoRaceAtomicStoreCASUint64(t *testing.T) { var x int64 + _ = x var s uint64 go func() { x = 2 diff --git a/src/runtime/race/testdata/chan_test.go b/src/runtime/race/testdata/chan_test.go index 449191639e..7f349c42ed 100644 --- a/src/runtime/race/testdata/chan_test.go +++ b/src/runtime/race/testdata/chan_test.go @@ -12,6 +12,7 @@ import ( func TestNoRaceChanSync(t *testing.T) { v := 0 + _ = v c := make(chan int) go func() { v = 1 @@ -23,6 +24,7 @@ func TestNoRaceChanSync(t *testing.T) { func TestNoRaceChanSyncRev(t *testing.T) { v := 0 + _ = v c := make(chan int) go func() { c <- 0 @@ -34,6 +36,7 @@ func TestNoRaceChanSyncRev(t *testing.T) { func TestNoRaceChanAsync(t *testing.T) { v := 0 + _ = v c := make(chan int, 10) go func() { v = 1 @@ -45,6 +48,7 @@ func TestNoRaceChanAsync(t *testing.T) { func TestRaceChanAsyncRev(t *testing.T) { v := 0 + _ = v c := make(chan int, 10) go func() { c <- 0 @@ -56,6 +60,7 @@ func TestRaceChanAsyncRev(t *testing.T) { func TestNoRaceChanAsyncCloseRecv(t *testing.T) { v := 0 + _ = v c := make(chan int, 10) go func() { v = 1 @@ -72,6 +77,7 @@ func TestNoRaceChanAsyncCloseRecv(t *testing.T) { func TestNoRaceChanAsyncCloseRecv2(t *testing.T) { v := 0 + _ = v c := make(chan int, 10) go func() { v = 1 @@ -83,6 +89,7 @@ func TestNoRaceChanAsyncCloseRecv2(t *testing.T) { func TestNoRaceChanAsyncCloseRecv3(t *testing.T) { v := 0 + _ = v c := make(chan int, 10) go func() { v = 1 @@ -95,6 +102,7 @@ func TestNoRaceChanAsyncCloseRecv3(t *testing.T) { func TestNoRaceChanSyncCloseRecv(t *testing.T) { v := 0 + _ = v c := make(chan int) go func() { v = 1 @@ -111,6 +119,7 @@ func TestNoRaceChanSyncCloseRecv(t *testing.T) { func TestNoRaceChanSyncCloseRecv2(t *testing.T) { v := 0 + _ = v c := make(chan int) go func() { v = 1 @@ -122,6 +131,7 @@ func TestNoRaceChanSyncCloseRecv2(t *testing.T) { func TestNoRaceChanSyncCloseRecv3(t *testing.T) { v := 0 + _ = v c := make(chan int) go func() { v = 1 @@ -134,6 +144,7 @@ func TestNoRaceChanSyncCloseRecv3(t *testing.T) { func TestRaceChanSyncCloseSend(t *testing.T) { v := 0 + _ = v c := make(chan int) go func() { v = 1 @@ -150,6 +161,7 @@ func TestRaceChanSyncCloseSend(t *testing.T) { func TestRaceChanAsyncCloseSend(t *testing.T) { v := 0 + _ = v c := make(chan int, 10) go func() { v = 1 @@ -170,6 +182,7 @@ func TestRaceChanCloseClose(t *testing.T) { compl := make(chan bool, 2) v1 := 0 v2 := 0 + _ = v1 + v2 c := make(chan int) go func() { defer func() { @@ -197,6 +210,7 @@ func TestRaceChanCloseClose(t *testing.T) { func TestRaceChanSendLen(t *testing.T) { v := 0 + _ = v c := make(chan int, 10) go func() { v = 1 @@ -210,6 +224,7 @@ func TestRaceChanSendLen(t *testing.T) { func TestRaceChanRecvLen(t *testing.T) { v := 0 + _ = v c := make(chan int, 10) c <- 1 go func() { @@ -226,6 +241,7 @@ func TestRaceChanSendSend(t *testing.T) { compl := make(chan bool, 2) v1 := 0 v2 := 0 + _ = v1 + v2 c := make(chan int, 1) go func() { v1 = 1 @@ -264,6 +280,7 @@ func TestNoRaceChanPtr(t *testing.T) { func TestRaceChanWrongSend(t *testing.T) { v1 := 0 v2 := 0 + _ = v1 + v2 c := make(chan int, 2) go func() { v1 = 1 @@ -284,6 +301,7 @@ func TestRaceChanWrongSend(t *testing.T) { func TestRaceChanWrongClose(t *testing.T) { v1 := 0 v2 := 0 + _ = v1 + v2 c := make(chan int, 1) done := make(chan bool) go func() { @@ -561,6 +579,7 @@ func TestRaceChanItselfCap(t *testing.T) { func TestRaceChanCloseLen(t *testing.T) { v := 0 + _ = v c := make(chan int, 10) c <- 0 go func() { @@ -587,6 +606,7 @@ func TestNoRaceChanMutex(t *testing.T) { 
done := make(chan struct{}) mtx := make(chan struct{}, 1) data := 0 + _ = data go func() { mtx <- struct{}{} data = 42 @@ -604,6 +624,7 @@ func TestNoRaceSelectMutex(t *testing.T) { mtx := make(chan struct{}, 1) aux := make(chan bool) data := 0 + _ = data go func() { select { case mtx <- struct{}{}: @@ -632,6 +653,7 @@ func TestRaceChanSem(t *testing.T) { done := make(chan struct{}) mtx := make(chan bool, 2) data := 0 + _ = data go func() { mtx <- true data = 42 diff --git a/src/runtime/race/testdata/finalizer_test.go b/src/runtime/race/testdata/finalizer_test.go index 222cbf67a8..3ac33d2b59 100644 --- a/src/runtime/race/testdata/finalizer_test.go +++ b/src/runtime/race/testdata/finalizer_test.go @@ -53,6 +53,7 @@ func TestNoRaceFinGlobal(t *testing.T) { func TestRaceFin(t *testing.T) { c := make(chan bool) y := 0 + _ = y go func() { x := new(string) runtime.SetFinalizer(x, func(x *string) { diff --git a/src/runtime/race/testdata/map_test.go b/src/runtime/race/testdata/map_test.go index a8d8148d0e..88e735ecd3 100644 --- a/src/runtime/race/testdata/map_test.go +++ b/src/runtime/race/testdata/map_test.go @@ -130,6 +130,7 @@ func TestRaceMapLenDelete(t *testing.T) { func TestRaceMapVariable(t *testing.T) { ch := make(chan bool, 1) m := make(map[int]int) + _ = m go func() { m = make(map[int]int) ch <- true @@ -230,6 +231,7 @@ func TestRaceMapAssignMultipleReturn(t *testing.T) { conns[1] = []int{0} ch := make(chan bool, 1) var err error + _ = err go func() { conns[1][0], err = connect() ch <- true diff --git a/src/runtime/race/testdata/mop_test.go b/src/runtime/race/testdata/mop_test.go index 560a762315..b60cabfe86 100644 --- a/src/runtime/race/testdata/mop_test.go +++ b/src/runtime/race/testdata/mop_test.go @@ -60,6 +60,7 @@ func TestRaceIntRWGlobalFuncs(t *testing.T) { func TestRaceIntRWClosures(t *testing.T) { var x, y int + _ = y ch := make(chan int, 2) go func() { @@ -76,6 +77,7 @@ func TestRaceIntRWClosures(t *testing.T) { func TestNoRaceIntRWClosures(t *testing.T) { var x, y int + _ = y ch := make(chan int, 1) go func() { @@ -93,6 +95,7 @@ func TestNoRaceIntRWClosures(t *testing.T) { func TestRaceInt32RWClosures(t *testing.T) { var x, y int32 + _ = y ch := make(chan bool, 2) go func() { @@ -168,6 +171,7 @@ func TestRaceCaseCondition2(t *testing.T) { func TestRaceCaseBody(t *testing.T) { var x, y int + _ = y ch := make(chan int, 2) go func() { @@ -189,6 +193,7 @@ func TestRaceCaseBody(t *testing.T) { func TestNoRaceCaseFallthrough(t *testing.T) { var x, y, z int + _ = y ch := make(chan int, 2) z = 1 @@ -210,6 +215,7 @@ func TestNoRaceCaseFallthrough(t *testing.T) { func TestRaceCaseFallthrough(t *testing.T) { var x, y, z int + _ = y ch := make(chan int, 2) z = 1 @@ -323,6 +329,7 @@ func TestRaceRange(t *testing.T) { const N = 2 var a [N]int var x, y int + _ = x + y done := make(chan bool, N) for i, v := range a { go func(i int) { @@ -433,6 +440,7 @@ func TestNoRaceForIncr(t *testing.T) { func TestRacePlus(t *testing.T) { var x, y, z int + _ = y ch := make(chan int, 2) go func() { @@ -449,6 +457,7 @@ func TestRacePlus(t *testing.T) { func TestRacePlus2(t *testing.T) { var x, y, z int + _ = y ch := make(chan int, 2) go func() { @@ -465,6 +474,7 @@ func TestRacePlus2(t *testing.T) { func TestNoRacePlus(t *testing.T) { var x, y, z, f int + _ = x + y + f ch := make(chan int, 2) go func() { @@ -481,6 +491,7 @@ func TestNoRacePlus(t *testing.T) { func TestRaceComplement(t *testing.T) { var x, y, z int + _ = x ch := make(chan int, 2) go func() { @@ -497,6 +508,7 @@ func TestRaceComplement(t 
*testing.T) { func TestRaceDiv(t *testing.T) { var x, y, z int + _ = x ch := make(chan int, 2) go func() { @@ -513,6 +525,7 @@ func TestRaceDiv(t *testing.T) { func TestRaceDivConst(t *testing.T) { var x, y, z uint32 + _ = x ch := make(chan int, 2) go func() { @@ -529,6 +542,7 @@ func TestRaceDivConst(t *testing.T) { func TestRaceMod(t *testing.T) { var x, y, z int + _ = x ch := make(chan int, 2) go func() { @@ -545,6 +559,7 @@ func TestRaceMod(t *testing.T) { func TestRaceModConst(t *testing.T) { var x, y, z int + _ = x ch := make(chan int, 2) go func() { @@ -561,6 +576,7 @@ func TestRaceModConst(t *testing.T) { func TestRaceRotate(t *testing.T) { var x, y, z uint32 + _ = x ch := make(chan int, 2) go func() { @@ -932,6 +948,7 @@ func TestRaceFuncVariableRW(t *testing.T) { func TestRaceFuncVariableWW(t *testing.T) { var f func(x int) int + _ = f ch := make(chan bool, 1) go func() { f = func(x int) int { @@ -948,6 +965,7 @@ func TestRaceFuncVariableWW(t *testing.T) { // This one should not belong to mop_test func TestRacePanic(t *testing.T) { var x int + _ = x var zero int = 0 ch := make(chan bool, 2) go func() { @@ -1284,6 +1302,7 @@ func TestNoRaceFuncUnlock(t *testing.T) { ch := make(chan bool, 1) var mu sync.Mutex x := 0 + _ = x go func() { mu.Lock() x = 42 @@ -1812,6 +1831,7 @@ func TestNoRaceAsFunc4(t *testing.T) { c := make(chan bool, 1) var mu sync.Mutex x := 0 + _ = x go func() { x = func() int { // Write of x must be under the mutex. mu.Lock() @@ -2042,6 +2062,7 @@ func TestNoRaceTinyAlloc(t *testing.T) { const P = 4 const N = 1e6 var tinySink *byte + _ = tinySink done := make(chan bool) for p := 0; p < P; p++ { go func() { diff --git a/src/runtime/race/testdata/mutex_test.go b/src/runtime/race/testdata/mutex_test.go index 3cf03ae6b8..cbed2d370c 100644 --- a/src/runtime/race/testdata/mutex_test.go +++ b/src/runtime/race/testdata/mutex_test.go @@ -13,6 +13,7 @@ import ( func TestNoRaceMutex(t *testing.T) { var mu sync.Mutex var x int16 = 0 + _ = x ch := make(chan bool, 2) go func() { mu.Lock() @@ -33,6 +34,7 @@ func TestNoRaceMutex(t *testing.T) { func TestRaceMutex(t *testing.T) { var mu sync.Mutex var x int16 = 0 + _ = x ch := make(chan bool, 2) go func() { x = 1 @@ -54,6 +56,7 @@ func TestRaceMutex2(t *testing.T) { var mu1 sync.Mutex var mu2 sync.Mutex var x int8 = 0 + _ = x ch := make(chan bool, 2) go func() { mu1.Lock() @@ -74,6 +77,7 @@ func TestRaceMutex2(t *testing.T) { func TestNoRaceMutexPureHappensBefore(t *testing.T) { var mu sync.Mutex var x int16 = 0 + _ = x ch := make(chan bool, 2) go func() { x = 1 @@ -96,6 +100,7 @@ func TestNoRaceMutexSemaphore(t *testing.T) { var mu sync.Mutex ch := make(chan bool, 2) x := 0 + _ = x mu.Lock() go func() { x = 1 diff --git a/src/runtime/race/testdata/rwmutex_test.go b/src/runtime/race/testdata/rwmutex_test.go index 7ac829d759..39219e58ae 100644 --- a/src/runtime/race/testdata/rwmutex_test.go +++ b/src/runtime/race/testdata/rwmutex_test.go @@ -14,6 +14,7 @@ func TestRaceMutexRWMutex(t *testing.T) { var mu1 sync.Mutex var mu2 sync.RWMutex var x int16 = 0 + _ = x ch := make(chan bool, 2) go func() { mu1.Lock() @@ -34,6 +35,7 @@ func TestRaceMutexRWMutex(t *testing.T) { func TestNoRaceRWMutex(t *testing.T) { var mu sync.RWMutex var x, y int64 = 0, 1 + _ = y ch := make(chan bool, 2) go func() { mu.Lock() diff --git a/src/runtime/race/testdata/select_test.go b/src/runtime/race/testdata/select_test.go index 9969f47e8e..3827867687 100644 --- a/src/runtime/race/testdata/select_test.go +++ b/src/runtime/race/testdata/select_test.go @@ -11,6 
+11,7 @@ import ( func TestNoRaceSelect1(t *testing.T) { var x int + _ = x compl := make(chan bool) c := make(chan bool) c1 := make(chan bool) @@ -36,6 +37,7 @@ func TestNoRaceSelect1(t *testing.T) { func TestNoRaceSelect2(t *testing.T) { var x int + _ = x compl := make(chan bool) c := make(chan bool) c1 := make(chan bool) @@ -55,6 +57,7 @@ func TestNoRaceSelect2(t *testing.T) { func TestNoRaceSelect3(t *testing.T) { var x int + _ = x compl := make(chan bool) c := make(chan bool, 10) c1 := make(chan bool) @@ -112,6 +115,7 @@ func TestNoRaceSelect4(t *testing.T) { func TestNoRaceSelect5(t *testing.T) { test := func(sel, needSched bool) { var x int + _ = x ch := make(chan bool) c1 := make(chan bool) @@ -158,6 +162,7 @@ func TestNoRaceSelect5(t *testing.T) { func TestRaceSelect1(t *testing.T) { var x int + _ = x compl := make(chan bool, 2) c := make(chan bool) c1 := make(chan bool) @@ -182,6 +187,7 @@ func TestRaceSelect1(t *testing.T) { func TestRaceSelect2(t *testing.T) { var x int + _ = x compl := make(chan bool) c := make(chan bool) c1 := make(chan bool) @@ -200,6 +206,7 @@ func TestRaceSelect2(t *testing.T) { func TestRaceSelect3(t *testing.T) { var x int + _ = x compl := make(chan bool) c := make(chan bool) c1 := make(chan bool) diff --git a/src/runtime/race/testdata/sync_test.go b/src/runtime/race/testdata/sync_test.go index d48680d5e6..2b2d95d76b 100644 --- a/src/runtime/race/testdata/sync_test.go +++ b/src/runtime/race/testdata/sync_test.go @@ -12,6 +12,7 @@ import ( func TestNoRaceCond(t *testing.T) { x := 0 + _ = x condition := 0 var mu sync.Mutex cond := sync.NewCond(&mu) @@ -35,6 +36,7 @@ func TestRaceCond(t *testing.T) { var mu sync.Mutex cond := sync.NewCond(&mu) x := 0 + _ = x condition := 0 go func() { time.Sleep(10 * time.Millisecond) // Enter cond.Wait loop @@ -67,6 +69,7 @@ func TestRaceAnnounceThreads(t *testing.T) { allDone := make(chan bool, N) var x int + _ = x var f, g, h func() f = func() { @@ -133,6 +136,7 @@ func TestNoRaceAfterFunc2(t *testing.T) { func TestNoRaceAfterFunc3(t *testing.T) { c := make(chan bool, 1) x := 0 + _ = x time.AfterFunc(1e7, func() { x = 1 c <- true @@ -143,6 +147,7 @@ func TestNoRaceAfterFunc3(t *testing.T) { func TestRaceAfterFunc3(t *testing.T) { c := make(chan bool, 2) x := 0 + _ = x time.AfterFunc(1e7, func() { x = 1 c <- true @@ -161,6 +166,7 @@ func TestRaceAfterFunc3(t *testing.T) { // comprehensible. 
func TestRaceGoroutineCreationStack(t *testing.T) { var x int + _ = x var ch = make(chan bool, 1) f1 := func() { diff --git a/src/runtime/race/testdata/waitgroup_test.go b/src/runtime/race/testdata/waitgroup_test.go index ff152b0abe..169337315b 100644 --- a/src/runtime/race/testdata/waitgroup_test.go +++ b/src/runtime/race/testdata/waitgroup_test.go @@ -13,6 +13,7 @@ import ( func TestNoRaceWaitGroup(t *testing.T) { var x int + _ = x var wg sync.WaitGroup n := 1 for i := 0; i < n; i++ { @@ -28,6 +29,7 @@ func TestNoRaceWaitGroup(t *testing.T) { func TestRaceWaitGroup(t *testing.T) { var x int + _ = x var wg sync.WaitGroup n := 2 for i := 0; i < n; i++ { @@ -43,6 +45,7 @@ func TestRaceWaitGroup(t *testing.T) { func TestNoRaceWaitGroup2(t *testing.T) { var x int + _ = x var wg sync.WaitGroup wg.Add(1) go func() { @@ -56,6 +59,7 @@ func TestNoRaceWaitGroup2(t *testing.T) { // incrementing counter in Add and locking wg's mutex func TestRaceWaitGroupAsMutex(t *testing.T) { var x int + _ = x var wg sync.WaitGroup c := make(chan bool, 2) go func() { @@ -82,6 +86,7 @@ func TestRaceWaitGroupAsMutex(t *testing.T) { func TestRaceWaitGroupWrongWait(t *testing.T) { c := make(chan bool, 2) var x int + _ = x var wg sync.WaitGroup go func() { wg.Add(1) @@ -187,6 +192,7 @@ func TestNoRaceWaitGroupMultipleWait3(t *testing.T) { // Correct usage but still a race func TestRaceWaitGroup2(t *testing.T) { var x int + _ = x var wg sync.WaitGroup wg.Add(2) go func() { @@ -202,6 +208,7 @@ func TestRaceWaitGroup2(t *testing.T) { func TestNoRaceWaitGroupPanicRecover(t *testing.T) { var x int + _ = x var wg sync.WaitGroup defer func() { err := recover() @@ -219,6 +226,7 @@ func TestNoRaceWaitGroupPanicRecover(t *testing.T) { // Is it possible to get a race by synchronization via panic? 
func TestNoRaceWaitGroupPanicRecover2(t *testing.T) { var x int + _ = x var wg sync.WaitGroup ch := make(chan bool, 1) var f func() = func() { diff --git a/src/runtime/rt0_android_386.s b/src/runtime/rt0_android_386.s index 9d20fc8f89..eabdf81235 100644 --- a/src/runtime/rt0_android_386.s +++ b/src/runtime/rt0_android_386.s @@ -4,21 +4,13 @@ #include "textflag.h" -TEXT _rt0_386_android(SB),NOSPLIT,$8 - MOVL 8(SP), AX // argc - LEAL 12(SP), BX // argv - MOVL AX, 0(SP) - MOVL BX, 4(SP) - CALL main(SB) - INT $3 +TEXT _rt0_386_android(SB),NOSPLIT,$0 + JMP _rt0_386(SB) TEXT _rt0_386_android_lib(SB),NOSPLIT,$0 PUSHL $_rt0_386_android_argv(SB) // argv PUSHL $1 // argc - CALL _rt0_386_linux_lib(SB) - POPL AX - POPL AX - RET + JMP _rt0_386_lib(SB) DATA _rt0_386_android_argv+0x00(SB)/4,$_rt0_386_android_argv0(SB) DATA _rt0_386_android_argv+0x04(SB)/4,$0 // argv terminate diff --git a/src/runtime/rt0_android_amd64.s b/src/runtime/rt0_android_amd64.s index 6420c9f35d..6bda3bfcc1 100644 --- a/src/runtime/rt0_android_amd64.s +++ b/src/runtime/rt0_android_amd64.s @@ -5,16 +5,12 @@ #include "textflag.h" TEXT _rt0_amd64_android(SB),NOSPLIT,$-8 - MOVQ 0(SP), DI // argc - LEAQ 8(SP), SI // argv - MOVQ $main(SB), AX - JMP AX + JMP _rt0_amd64(SB) TEXT _rt0_amd64_android_lib(SB),NOSPLIT,$0 MOVQ $1, DI // argc MOVQ $_rt0_amd64_android_argv(SB), SI // argv - MOVQ $_rt0_amd64_linux_lib(SB), AX - JMP AX + JMP _rt0_amd64_lib(SB) DATA _rt0_amd64_android_argv+0x00(SB)/8,$_rt0_amd64_android_argv0(SB) DATA _rt0_amd64_android_argv+0x08(SB)/8,$0 // end argv diff --git a/src/runtime/rt0_android_arm.s b/src/runtime/rt0_android_arm.s index 189e290e35..1246238be0 100644 --- a/src/runtime/rt0_android_arm.s +++ b/src/runtime/rt0_android_arm.s @@ -10,13 +10,10 @@ TEXT _rt0_arm_android(SB),NOSPLIT,$-4 MOVW $_rt0_arm_linux1(SB), R4 B (R4) -// When building with -buildmode=c-shared, this symbol is called when the shared -// library is loaded. TEXT _rt0_arm_android_lib(SB),NOSPLIT,$0 MOVW $1, R0 // argc MOVW $_rt0_arm_android_argv(SB), R1 // **argv - BL _rt0_arm_linux_lib(SB) - RET + B _rt0_arm_lib(SB) DATA _rt0_arm_android_argv+0x00(SB)/4,$_rt0_arm_android_argv0(SB) DATA _rt0_arm_android_argv+0x04(SB)/4,$0 // end argv diff --git a/src/runtime/rt0_darwin_386.s b/src/runtime/rt0_darwin_386.s index 6b404db3a4..a8d3a796d4 100644 --- a/src/runtime/rt0_darwin_386.s +++ b/src/runtime/rt0_darwin_386.s @@ -4,72 +4,14 @@ #include "textflag.h" -TEXT _rt0_386_darwin(SB),NOSPLIT,$8 - MOVL 8(SP), AX - LEAL 12(SP), BX - MOVL AX, 0(SP) - MOVL BX, 4(SP) - CALL main(SB) - INT $3 +TEXT _rt0_386_darwin(SB),NOSPLIT,$0 + JMP _rt0_386(SB) -// With -buildmode=c-archive, this symbol is called from a global constructor. TEXT _rt0_386_darwin_lib(SB),NOSPLIT,$0 - PUSHL BP - MOVL SP, BP - PUSHL BX - PUSHL SI - PUSHL DI - - MOVL 8(BP), AX - MOVL AX, _rt0_386_darwin_lib_argc<>(SB) - MOVL 12(BP), AX - MOVL AX, _rt0_386_darwin_lib_argv<>(SB) - - // Synchronous initialization. - MOVL $runtime·libpreinit(SB), AX - CALL AX - - SUBL $12, SP - - // Create a new thread to do the runtime initialization and return. 
- MOVL _cgo_sys_thread_create(SB), AX - TESTL AX, AX - JZ nocgo - MOVL $_rt0_386_darwin_lib_go(SB), BX - MOVL BX, 0(SP) - MOVL $0, 4(SP) - CALL AX - JMP restore - -nocgo: - MOVL $0x800000, 0(SP) // stacksize = 8192KB - MOVL $_rt0_386_darwin_lib_go(SB), AX - MOVL AX, 4(SP) // fn - MOVL $0, 8(SP) // fnarg - MOVL $runtime·newosproc0(SB), AX - CALL AX - -restore: - ADDL $12, SP - POPL DI - POPL SI - POPL BX - POPL BP - RET - -TEXT _rt0_386_darwin_lib_go(SB),NOSPLIT,$12 - MOVL _rt0_386_darwin_lib_argc<>(SB), AX - MOVL AX, 0(SP) - MOVL _rt0_386_darwin_lib_argv<>(SB), AX - MOVL AX, 4(SP) - MOVL $runtime·rt0_go(SB), AX - CALL AX - RET - -DATA _rt0_386_darwin_lib_argc<>(SB)/4, $0 -GLOBL _rt0_386_darwin_lib_argc<>(SB),NOPTR, $4 -DATA _rt0_386_darwin_lib_argv<>(SB)/4, $0 -GLOBL _rt0_386_darwin_lib_argv<>(SB),NOPTR, $4 + JMP _rt0_386_lib(SB) TEXT main(SB),NOSPLIT,$0 + // Remove the return address from the stack. + // rt0_go doesn't expect it to be there. + ADDL $4, SP JMP runtime·rt0_go(SB) diff --git a/src/runtime/rt0_darwin_amd64.s b/src/runtime/rt0_darwin_amd64.s index 655e77a86b..ed804d47c5 100644 --- a/src/runtime/rt0_darwin_amd64.s +++ b/src/runtime/rt0_darwin_amd64.s @@ -5,75 +5,9 @@ #include "textflag.h" TEXT _rt0_amd64_darwin(SB),NOSPLIT,$-8 - LEAQ 8(SP), SI // argv - MOVQ 0(SP), DI // argc - MOVQ $main(SB), AX - JMP AX + JMP _rt0_amd64(SB) // When linking with -shared, this symbol is called when the shared library // is loaded. -TEXT _rt0_amd64_darwin_lib(SB),NOSPLIT,$0x58 - // Align stack. We don't know whether Go is adding a frame pointer here or not. - MOVQ SP, R8 - SUBQ $16, R8 - ANDQ $~15, R8 - XCHGQ SP, R8 - - MOVQ R8, 0x48(SP) - MOVQ BX, 0x18(SP) - MOVQ BP, 0x20(SP) - MOVQ R12, 0x28(SP) - MOVQ R13, 0x30(SP) - MOVQ R14, 0x38(SP) - MOVQ R15, 0x40(SP) - - MOVQ DI, _rt0_amd64_darwin_lib_argc<>(SB) - MOVQ SI, _rt0_amd64_darwin_lib_argv<>(SB) - - // Synchronous initialization. - MOVQ $runtime·libpreinit(SB), AX - CALL AX - - // Create a new thread to do the runtime initialization and return. 
- MOVQ _cgo_sys_thread_create(SB), AX - TESTQ AX, AX - JZ nocgo - MOVQ $_rt0_amd64_darwin_lib_go(SB), DI - MOVQ $0, SI - CALL AX - JMP restore - -nocgo: - MOVQ $8388608, 0(SP) // stacksize - MOVQ $_rt0_amd64_darwin_lib_go(SB), AX - MOVQ AX, 8(SP) // fn - MOVQ $0, 16(SP) // fnarg - MOVQ $runtime·newosproc0(SB), AX - CALL AX - -restore: - MOVQ 0x18(SP), BX - MOVQ 0x20(SP), BP - MOVQ 0x28(SP), R12 - MOVQ 0x30(SP), R13 - MOVQ 0x38(SP), R14 - MOVQ 0x40(SP), R15 - - MOVQ 0x48(SP), R8 - MOVQ R8, SP - RET - -TEXT _rt0_amd64_darwin_lib_go(SB),NOSPLIT,$0 - MOVQ _rt0_amd64_darwin_lib_argc<>(SB), DI - MOVQ _rt0_amd64_darwin_lib_argv<>(SB), SI - MOVQ $runtime·rt0_go(SB), AX - JMP AX - -DATA _rt0_amd64_darwin_lib_argc<>(SB)/8, $0 -GLOBL _rt0_amd64_darwin_lib_argc<>(SB),NOPTR, $8 -DATA _rt0_amd64_darwin_lib_argv<>(SB)/8, $0 -GLOBL _rt0_amd64_darwin_lib_argv<>(SB),NOPTR, $8 - -TEXT main(SB),NOSPLIT,$-8 - MOVQ $runtime·rt0_go(SB), AX - JMP AX +TEXT _rt0_amd64_darwin_lib(SB),NOSPLIT,$0 + JMP _rt0_amd64_lib(SB) diff --git a/src/runtime/rt0_darwin_arm.s b/src/runtime/rt0_darwin_arm.s index 526d88f13d..71fbe5f68a 100644 --- a/src/runtime/rt0_darwin_arm.s +++ b/src/runtime/rt0_darwin_arm.s @@ -4,94 +4,8 @@ #include "textflag.h" -TEXT _rt0_arm_darwin(SB),7,$-4 - // prepare arguments for main (_rt0_go) - MOVW (R13), R0 // argc - MOVW $4(R13), R1 // argv - MOVW $main(SB), R4 - B (R4) +TEXT _rt0_arm_darwin(SB),7,$0 + B _rt0_asm(SB) -// When linking with -buildmode=c-archive or -buildmode=c-shared, -// this symbol is called from a global initialization function. -// -// Note that all currently shipping darwin/arm platforms require -// cgo and do not support c-shared. -TEXT _rt0_arm_darwin_lib(SB),NOSPLIT,$104 - // Preserve callee-save registers. - MOVW R4, 12(R13) - MOVW R5, 16(R13) - MOVW R6, 20(R13) - MOVW R7, 24(R13) - MOVW R8, 28(R13) - MOVW R11, 32(R13) - - MOVD F8, (32+8*1)(R13) - MOVD F9, (32+8*2)(R13) - MOVD F10, (32+8*3)(R13) - MOVD F11, (32+8*4)(R13) - MOVD F12, (32+8*5)(R13) - MOVD F13, (32+8*6)(R13) - MOVD F14, (32+8*7)(R13) - MOVD F15, (32+8*8)(R13) - - MOVW R0, _rt0_arm_darwin_lib_argc<>(SB) - MOVW R1, _rt0_arm_darwin_lib_argv<>(SB) - - // Synchronous initialization. - MOVW $runtime·libpreinit(SB), R3 - CALL (R3) - - // Create a new thread to do the runtime initialization and return. - MOVW _cgo_sys_thread_create(SB), R3 - CMP $0, R3 - B.EQ nocgo - MOVW $_rt0_arm_darwin_lib_go(SB), R0 - MOVW $0, R1 - BL (R3) - B rr -nocgo: - MOVW $0x400000, R0 - MOVW R0, (R13) // stacksize - MOVW $_rt0_arm_darwin_lib_go(SB), R0 - MOVW R0, 4(R13) // fn - MOVW $0, R0 - MOVW R0, 8(R13) // fnarg - MOVW $runtime·newosproc0(SB), R3 - BL (R3) -rr: - // Restore callee-save registers and return. 
- MOVW 12(R13), R4 - MOVW 16(R13), R5 - MOVW 20(R13), R6 - MOVW 24(R13), R7 - MOVW 28(R13), R8 - MOVW 32(R13), R11 - MOVD (32+8*1)(R13), F8 - MOVD (32+8*2)(R13), F9 - MOVD (32+8*3)(R13), F10 - MOVD (32+8*4)(R13), F11 - MOVD (32+8*5)(R13), F12 - MOVD (32+8*6)(R13), F13 - MOVD (32+8*7)(R13), F14 - MOVD (32+8*8)(R13), F15 - RET - - -TEXT _rt0_arm_darwin_lib_go(SB),NOSPLIT,$0 - MOVW _rt0_arm_darwin_lib_argc<>(SB), R0 - MOVW _rt0_arm_darwin_lib_argv<>(SB), R1 - MOVW R0, (R13) - MOVW R1, 4(R13) - MOVW $runtime·rt0_go(SB), R4 - B (R4) - -DATA _rt0_arm_darwin_lib_argc<>(SB)/4, $0 -GLOBL _rt0_arm_darwin_lib_argc<>(SB),NOPTR, $4 -DATA _rt0_arm_darwin_lib_argv<>(SB)/4, $0 -GLOBL _rt0_arm_darwin_lib_argv<>(SB),NOPTR, $4 - -TEXT main(SB),NOSPLIT,$-8 - // save argc and argv onto stack - MOVM.DB.W [R0-R1], (R13) - MOVW $runtime·rt0_go(SB), R4 - B (R4) +TEXT _rt0_arm_darwin_lib(SB),NOSPLIT,$0 + B _rt0_arm_lib(SB) diff --git a/src/runtime/rt0_dragonfly_amd64.s b/src/runtime/rt0_dragonfly_amd64.s index fb56618d8f..e76f9b9b52 100644 --- a/src/runtime/rt0_dragonfly_amd64.s +++ b/src/runtime/rt0_dragonfly_amd64.s @@ -4,12 +4,11 @@ #include "textflag.h" +// On Dragonfly argc/argv are passed in DI, not SP, so we can't use _rt0_amd64. TEXT _rt0_amd64_dragonfly(SB),NOSPLIT,$-8 LEAQ 8(DI), SI // argv MOVQ 0(DI), DI // argc - MOVQ $main(SB), AX - JMP AX + JMP runtime·rt0_go(SB) -TEXT main(SB),NOSPLIT,$-8 - MOVQ $runtime·rt0_go(SB), AX - JMP AX +TEXT _rt0_amd64_dragonfly_lib(SB),NOSPLIT,$0 + JMP _rt0_amd64_lib(SB) diff --git a/src/runtime/rt0_freebsd_386.s b/src/runtime/rt0_freebsd_386.s index cd7a915f84..1808059838 100644 --- a/src/runtime/rt0_freebsd_386.s +++ b/src/runtime/rt0_freebsd_386.s @@ -4,13 +4,14 @@ #include "textflag.h" -TEXT _rt0_386_freebsd(SB),NOSPLIT,$8 - MOVL 8(SP), AX - LEAL 12(SP), BX - MOVL AX, 0(SP) - MOVL BX, 4(SP) - CALL main(SB) - INT $3 +TEXT _rt0_386_freebsd(SB),NOSPLIT,$0 + JMP _rt0_386(SB) + +TEXT _rt0_386_freebsd_lib(SB),NOSPLIT,$0 + JMP _rt0_386_lib(SB) TEXT main(SB),NOSPLIT,$0 + // Remove the return address from the stack. + // rt0_go doesn't expect it to be there. + ADDL $4, SP JMP runtime·rt0_go(SB) diff --git a/src/runtime/rt0_freebsd_amd64.s b/src/runtime/rt0_freebsd_amd64.s index 7989f7c3e9..ccc48f66b4 100644 --- a/src/runtime/rt0_freebsd_amd64.s +++ b/src/runtime/rt0_freebsd_amd64.s @@ -4,12 +4,11 @@ #include "textflag.h" +// On FreeBSD argc/argv are passed in DI, not SP, so we can't use _rt0_amd64. 
TEXT _rt0_amd64_freebsd(SB),NOSPLIT,$-8 LEAQ 8(DI), SI // argv MOVQ 0(DI), DI // argc - MOVQ $main(SB), AX - JMP AX + JMP runtime·rt0_go(SB) -TEXT main(SB),NOSPLIT,$-8 - MOVQ $runtime·rt0_go(SB), AX - JMP AX +TEXT _rt0_amd64_freebsd_lib(SB),NOSPLIT,$0 + JMP _rt0_amd64_lib(SB) diff --git a/src/runtime/rt0_freebsd_arm.s b/src/runtime/rt0_freebsd_arm.s index e1bb13d53a..62ecd9aeb5 100644 --- a/src/runtime/rt0_freebsd_arm.s +++ b/src/runtime/rt0_freebsd_arm.s @@ -4,13 +4,8 @@ #include "textflag.h" -TEXT _rt0_arm_freebsd(SB),NOSPLIT,$-4 - MOVW (R13), R0 // argc - MOVW $4(R13), R1 // argv - MOVM.DB.W [R0-R1], (R13) - B runtime·rt0_go(SB) +TEXT _rt0_arm_freebsd(SB),NOSPLIT,$0 + B _rt0_arm(SB) -TEXT main(SB),NOSPLIT,$-4 - MOVM.DB.W [R0-R1], (R13) - MOVW $runtime·rt0_go(SB), R4 - B (R4) +TEXT _rt0_arm_freebsd_lib(SB),NOSPLIT,$0 + B _rt0_arm_lib(SB) diff --git a/src/runtime/rt0_linux_386.s b/src/runtime/rt0_linux_386.s index 23bfc98b10..325066fc1d 100644 --- a/src/runtime/rt0_linux_386.s +++ b/src/runtime/rt0_linux_386.s @@ -4,72 +4,14 @@ #include "textflag.h" -TEXT _rt0_386_linux(SB),NOSPLIT,$8 - MOVL 8(SP), AX - LEAL 12(SP), BX - MOVL AX, 0(SP) - MOVL BX, 4(SP) - CALL main(SB) - INT $3 +TEXT _rt0_386_linux(SB),NOSPLIT,$0 + JMP _rt0_386(SB) -// When building with -buildmode=c-shared, this symbol is called when the shared -// library is loaded. TEXT _rt0_386_linux_lib(SB),NOSPLIT,$0 - PUSHL BP - MOVL SP, BP - PUSHL BX - PUSHL SI - PUSHL DI - - MOVL 8(BP), AX - MOVL AX, _rt0_386_linux_lib_argc<>(SB) - MOVL 12(BP), AX - MOVL AX, _rt0_386_linux_lib_argv<>(SB) - - // Synchronous initialization. - MOVL $runtime·libpreinit(SB), AX - CALL AX - - SUBL $8, SP - - // Create a new thread to do the runtime initialization. - MOVL _cgo_sys_thread_create(SB), AX - TESTL AX, AX - JZ nocgo - MOVL $_rt0_386_linux_lib_go(SB), BX - MOVL BX, 0(SP) - MOVL $0, 4(SP) - CALL AX - JMP restore - -nocgo: - MOVL $0x800000, 0(SP) // stacksize = 8192KB - MOVL $_rt0_386_linux_lib_go(SB), AX - MOVL AX, 4(SP) // fn - MOVL $runtime·newosproc0(SB), AX - CALL AX - -restore: - ADDL $8, SP - POPL DI - POPL SI - POPL BX - POPL BP - RET - -TEXT _rt0_386_linux_lib_go(SB),NOSPLIT,$12 - MOVL _rt0_386_linux_lib_argc<>(SB), AX - MOVL AX, 0(SP) - MOVL _rt0_386_linux_lib_argv<>(SB), AX - MOVL AX, 4(SP) - MOVL $runtime·rt0_go(SB), AX - CALL AX - RET - -DATA _rt0_386_linux_lib_argc<>(SB)/4, $0 -GLOBL _rt0_386_linux_lib_argc<>(SB),NOPTR, $4 -DATA _rt0_386_linux_lib_argv<>(SB)/4, $0 -GLOBL _rt0_386_linux_lib_argv<>(SB),NOPTR, $4 + JMP _rt0_386_lib(SB) TEXT main(SB),NOSPLIT,$0 + // Remove the return address from the stack. + // rt0_go doesn't expect it to be there. + ADDL $4, SP JMP runtime·rt0_go(SB) diff --git a/src/runtime/rt0_linux_amd64.s b/src/runtime/rt0_linux_amd64.s index ced471f5cb..94ff7094d6 100644 --- a/src/runtime/rt0_linux_amd64.s +++ b/src/runtime/rt0_linux_amd64.s @@ -5,70 +5,7 @@ #include "textflag.h" TEXT _rt0_amd64_linux(SB),NOSPLIT,$-8 - LEAQ 8(SP), SI // argv - MOVQ 0(SP), DI // argc - MOVQ $main(SB), AX - JMP AX + JMP _rt0_amd64(SB) -// When building with -buildmode=c-shared, this symbol is called when the shared -// library is loaded. -// Note: This function calls external C code, which might required 16-byte stack -// alignment after cmd/internal/obj applies its transformations. 
-TEXT _rt0_amd64_linux_lib(SB),NOSPLIT,$0x50 - MOVQ SP, AX - ANDQ $-16, SP - MOVQ BX, 0x10(SP) - MOVQ BP, 0x18(SP) - MOVQ R12, 0x20(SP) - MOVQ R13, 0x28(SP) - MOVQ R14, 0x30(SP) - MOVQ R15, 0x38(SP) - MOVQ AX, 0x40(SP) - - MOVQ DI, _rt0_amd64_linux_lib_argc<>(SB) - MOVQ SI, _rt0_amd64_linux_lib_argv<>(SB) - - // Synchronous initialization. - MOVQ $runtime·libpreinit(SB), AX - CALL AX - - // Create a new thread to do the runtime initialization and return. - MOVQ _cgo_sys_thread_create(SB), AX - TESTQ AX, AX - JZ nocgo - MOVQ $_rt0_amd64_linux_lib_go(SB), DI - MOVQ $0, SI - CALL AX - JMP restore - -nocgo: - MOVQ $8388608, 0(SP) // stacksize - MOVQ $_rt0_amd64_linux_lib_go(SB), AX - MOVQ AX, 8(SP) // fn - MOVQ $runtime·newosproc0(SB), AX - CALL AX - -restore: - MOVQ 0x10(SP), BX - MOVQ 0x18(SP), BP - MOVQ 0x20(SP), R12 - MOVQ 0x28(SP), R13 - MOVQ 0x30(SP), R14 - MOVQ 0x38(SP), R15 - MOVQ 0x40(SP), SP - RET - -TEXT _rt0_amd64_linux_lib_go(SB),NOSPLIT,$0 - MOVQ _rt0_amd64_linux_lib_argc<>(SB), DI - MOVQ _rt0_amd64_linux_lib_argv<>(SB), SI - MOVQ $runtime·rt0_go(SB), AX - JMP AX - -DATA _rt0_amd64_linux_lib_argc<>(SB)/8, $0 -GLOBL _rt0_amd64_linux_lib_argc<>(SB),NOPTR, $8 -DATA _rt0_amd64_linux_lib_argv<>(SB)/8, $0 -GLOBL _rt0_amd64_linux_lib_argv<>(SB),NOPTR, $8 - -TEXT main(SB),NOSPLIT,$-8 - MOVQ $runtime·rt0_go(SB), AX - JMP AX +TEXT _rt0_amd64_linux_lib(SB),NOSPLIT,$0 + JMP _rt0_amd64_lib(SB) diff --git a/src/runtime/rt0_linux_arm.s b/src/runtime/rt0_linux_arm.s index 597e642adb..ba4ca2b10a 100644 --- a/src/runtime/rt0_linux_arm.s +++ b/src/runtime/rt0_linux_arm.s @@ -12,158 +12,22 @@ TEXT _rt0_arm_linux(SB),NOSPLIT,$-4 // When building with -buildmode=c-shared, this symbol is called when the shared // library is loaded. -TEXT _rt0_arm_linux_lib(SB),NOSPLIT,$104 - // Preserve callee-save registers. Raspberry Pi's dlopen(), for example, - // actually cares that R11 is preserved. - MOVW R4, 12(R13) - MOVW R5, 16(R13) - MOVW R6, 20(R13) - MOVW R7, 24(R13) - MOVW R8, 28(R13) - MOVW R11, 32(R13) - - // Skip floating point registers on GOARM < 6. - MOVB runtime·goarm(SB), R11 - CMP $6, R11 - BLT skipfpsave - MOVD F8, (32+8*1)(R13) - MOVD F9, (32+8*2)(R13) - MOVD F10, (32+8*3)(R13) - MOVD F11, (32+8*4)(R13) - MOVD F12, (32+8*5)(R13) - MOVD F13, (32+8*6)(R13) - MOVD F14, (32+8*7)(R13) - MOVD F15, (32+8*8)(R13) -skipfpsave: - // Save argc/argv. - MOVW R0, _rt0_arm_linux_lib_argc<>(SB) - MOVW R1, _rt0_arm_linux_lib_argv<>(SB) - - // Synchronous initialization. - MOVW $runtime·libpreinit(SB), R2 - CALL (R2) - - // Create a new thread to do the runtime initialization. - MOVW _cgo_sys_thread_create(SB), R2 - CMP $0, R2 - BEQ nocgo - MOVW $_rt0_arm_linux_lib_go<>(SB), R0 - MOVW $0, R1 - BL (R2) - B rr -nocgo: - MOVW $0x800000, R0 // stacksize = 8192KB - MOVW $_rt0_arm_linux_lib_go<>(SB), R1 // fn - MOVW R0, 4(R13) - MOVW R1, 8(R13) - BL runtime·newosproc0(SB) -rr: - // Restore callee-save registers and return. 
- MOVB runtime·goarm(SB), R11 - CMP $6, R11 - BLT skipfprest - MOVD (32+8*1)(R13), F8 - MOVD (32+8*2)(R13), F9 - MOVD (32+8*3)(R13), F10 - MOVD (32+8*4)(R13), F11 - MOVD (32+8*5)(R13), F12 - MOVD (32+8*6)(R13), F13 - MOVD (32+8*7)(R13), F14 - MOVD (32+8*8)(R13), F15 -skipfprest: - MOVW 12(R13), R4 - MOVW 16(R13), R5 - MOVW 20(R13), R6 - MOVW 24(R13), R7 - MOVW 28(R13), R8 - MOVW 32(R13), R11 - RET - -TEXT _rt0_arm_linux_lib_go<>(SB),NOSPLIT,$8 - MOVW _rt0_arm_linux_lib_argc<>(SB), R0 - MOVW _rt0_arm_linux_lib_argv<>(SB), R1 - MOVW R0, 0(R13) - MOVW R1, 4(R13) - B runtime·rt0_go(SB) - -DATA _rt0_arm_linux_lib_argc<>(SB)/4,$0 -GLOBL _rt0_arm_linux_lib_argc<>(SB),NOPTR,$4 -DATA _rt0_arm_linux_lib_argv<>(SB)/4,$0 -GLOBL _rt0_arm_linux_lib_argv<>(SB),NOPTR,$4 +TEXT _rt0_arm_linux_lib(SB),NOSPLIT,$0 + B _rt0_arm_lib(SB) TEXT _rt0_arm_linux1(SB),NOSPLIT,$-4 // We first need to detect the kernel ABI, and warn the user - // if the system only supports OABI + // if the system only supports OABI. // The strategy here is to call some EABI syscall to see if // SIGILL is received. - // To catch SIGILL, we have to first setup sigaction, this is - // a chicken-and-egg problem, because we can't do syscall if - // we don't know the kernel ABI... Oh, not really, we can do - // syscall in Thumb mode. + // If you get a SIGILL here, you have the wrong kernel. - // Save argc and argv + // Save argc and argv (syscall will clobber at least R0). MOVM.DB.W [R0-R1], (R13) - // Thumb mode OABI check disabled because there are some - // EABI systems that do not support Thumb execution. - // We can run on them except for this check! - - // // set up sa_handler - // MOVW $bad_abi<>(SB), R0 // sa_handler - // MOVW $0, R1 // sa_flags - // MOVW $0, R2 // sa_restorer - // MOVW $0, R3 // sa_mask - // MOVM.DB.W [R0-R3], (R13) - // MOVW $4, R0 // SIGILL - // MOVW R13, R1 // sa - // SUB $16, R13 - // MOVW R13, R2 // old_sa - // MOVW $8, R3 // c - // MOVW $174, R7 // sys_sigaction - // BL oabi_syscall<>(SB) - // do an EABI syscall MOVW $20, R7 // sys_getpid SWI $0 // this will trigger SIGILL on OABI systems - - // MOVW $4, R0 // SIGILL - // MOVW R13, R1 // sa - // MOVW $0, R2 // old_sa - // MOVW $8, R3 // c - // MOVW $174, R7 // sys_sigaction - // SWI $0 // restore signal handler - // ADD $32, R13 + MOVM.IA.W (R13), [R0-R1] B runtime·rt0_go(SB) - -TEXT bad_abi<>(SB),NOSPLIT,$-4 - // give diagnosis and exit - MOVW $2, R0 // stderr - MOVW $bad_abi_msg(SB), R1 // data - MOVW $45, R2 // len - MOVW $4, R7 // sys_write - BL oabi_syscall<>(SB) - MOVW $1, R0 - MOVW $1, R7 // sys_exit - BL oabi_syscall<>(SB) - B 0(PC) - -DATA bad_abi_msg+0x00(SB)/8, $"This pro" -DATA bad_abi_msg+0x08(SB)/8, $"gram can" -DATA bad_abi_msg+0x10(SB)/8, $" only be" -DATA bad_abi_msg+0x18(SB)/8, $" run on " -DATA bad_abi_msg+0x20(SB)/8, $"EABI ker" -DATA bad_abi_msg+0x28(SB)/4, $"nels" -DATA bad_abi_msg+0x2c(SB)/1, $0xa -GLOBL bad_abi_msg(SB), RODATA, $45 - -TEXT oabi_syscall<>(SB),NOSPLIT,$-4 - ADD $1, R15, R4 // R15 is hardware PC - WORD $0xe12fff14 //BX (R4) // enter thumb mode - // TODO(minux): only supports little-endian CPUs - WORD $0x4770df01 // swi $1; bx lr - -TEXT main(SB),NOSPLIT,$-4 - MOVW $_rt0_arm_linux1(SB), R4 - B (R4) - diff --git a/src/runtime/rt0_linux_ppc64le.s b/src/runtime/rt0_linux_ppc64le.s index 81b991349a..73b9ae392d 100644 --- a/src/runtime/rt0_linux_ppc64le.s +++ b/src/runtime/rt0_linux_ppc64le.s @@ -2,6 +2,7 @@ #include "textflag.h" TEXT _rt0_ppc64le_linux(SB),NOSPLIT,$0 + XOR R0, R0 // Make sure R0 is zero before _main BR 
_main<>(SB) TEXT _rt0_ppc64le_linux_lib(SB),NOSPLIT,$-8 @@ -10,7 +11,6 @@ TEXT _rt0_ppc64le_linux_lib(SB),NOSPLIT,$-8 MOVD R0, 16(R1) // Save LR in caller's frame. MOVW CR, R0 // Save CR in caller's frame MOVD R0, 8(R1) - MOVD R2, 24(R1) // Save TOC in caller's frame. MOVDU R1, -320(R1) // Allocate frame. // Preserve callee-save registers. @@ -121,7 +121,6 @@ done: FMOVD 304(R1), F31 ADD $320, R1 - MOVD 24(R1), R2 MOVD 8(R1), R0 MOVFL R0, $0xff MOVD 16(R1), R0 diff --git a/src/runtime/rt0_linux_s390x.s b/src/runtime/rt0_linux_s390x.s index aedd6c7ef2..4b62c5a65a 100644 --- a/src/runtime/rt0_linux_s390x.s +++ b/src/runtime/rt0_linux_s390x.s @@ -4,17 +4,20 @@ #include "textflag.h" -TEXT _rt0_s390x_linux(SB),NOSPLIT|NOFRAME,$0 +TEXT _rt0_s390x_linux(SB), NOSPLIT|NOFRAME, $0 // In a statically linked binary, the stack contains argc, // argv as argc string pointers followed by a NULL, envv as a // sequence of string pointers followed by a NULL, and auxv. // There is no TLS base pointer. - // - // TODO: Support dynamic linking entry point - MOVD 0(R15), R2 // argc - ADD $8, R15, R3 // argv - BR main(SB) -TEXT main(SB),NOSPLIT|NOFRAME,$0 - MOVD $runtime·rt0_go(SB), R11 - BR R11 + MOVD 0(R15), R2 // argc + ADD $8, R15, R3 // argv + BR main(SB) + +TEXT _rt0_s390x_linux_lib(SB), NOSPLIT, $0 + MOVD $_rt0_s390x_lib(SB), R1 + BR R1 + +TEXT main(SB), NOSPLIT|NOFRAME, $0 + MOVD $runtime·rt0_go(SB), R1 + BR R1 diff --git a/src/runtime/rt0_nacl_386.s b/src/runtime/rt0_nacl_386.s index d4ba06306a..4c990022f1 100644 --- a/src/runtime/rt0_nacl_386.s +++ b/src/runtime/rt0_nacl_386.s @@ -15,8 +15,10 @@ TEXT _rt0_386_nacl(SB),NOSPLIT,$8 LEAL argv+16(FP), BX MOVL AX, 0(SP) MOVL BX, 4(SP) - CALL main(SB) - INT $3 + JMP runtime·rt0_go(SB) TEXT main(SB),NOSPLIT,$0 + // Remove the return address from the stack. + // rt0_go doesn't expect it to be there. + ADDL $4, SP JMP runtime·rt0_go(SB) diff --git a/src/runtime/rt0_nacl_arm.s b/src/runtime/rt0_nacl_arm.s index eadb4782dd..2be8a0730f 100644 --- a/src/runtime/rt0_nacl_arm.s +++ b/src/runtime/rt0_nacl_arm.s @@ -13,8 +13,4 @@ TEXT _rt0_arm_nacl(SB),NOSPLIT,$-4 MOVW 8(R13), R0 MOVW $12(R13), R1 - MOVM.DB.W [R0-R1], (R13) - B main(SB) - -TEXT main(SB),NOSPLIT,$0 B runtime·rt0_go(SB) diff --git a/src/runtime/rt0_netbsd_386.s b/src/runtime/rt0_netbsd_386.s index 70b8532538..cefc04a815 100644 --- a/src/runtime/rt0_netbsd_386.s +++ b/src/runtime/rt0_netbsd_386.s @@ -4,13 +4,14 @@ #include "textflag.h" -TEXT _rt0_386_netbsd(SB),NOSPLIT,$8 - MOVL 8(SP), AX - LEAL 12(SP), BX - MOVL AX, 0(SP) - MOVL BX, 4(SP) - CALL main(SB) - INT $3 +TEXT _rt0_386_netbsd(SB),NOSPLIT,$0 + JMP _rt0_386(SB) + +TEXT _rt0_386_netbsd_lib(SB),NOSPLIT,$0 + JMP _rt0_386_lib(SB) TEXT main(SB),NOSPLIT,$0 + // Remove the return address from the stack. + // rt0_go doesn't expect it to be there. 
+ ADDL $4, SP JMP runtime·rt0_go(SB) diff --git a/src/runtime/rt0_netbsd_amd64.s b/src/runtime/rt0_netbsd_amd64.s index fad56614e5..77c7187bba 100644 --- a/src/runtime/rt0_netbsd_amd64.s +++ b/src/runtime/rt0_netbsd_amd64.s @@ -5,11 +5,7 @@ #include "textflag.h" TEXT _rt0_amd64_netbsd(SB),NOSPLIT,$-8 - LEAQ 8(SP), SI // argv - MOVQ 0(SP), DI // argc - MOVQ $main(SB), AX - JMP AX + JMP _rt0_amd64(SB) -TEXT main(SB),NOSPLIT,$-8 - MOVQ $runtime·rt0_go(SB), AX - JMP AX +TEXT _rt0_amd64_netbsd_lib(SB),NOSPLIT,$0 + JMP _rt0_amd64_lib(SB) diff --git a/src/runtime/rt0_netbsd_arm.s b/src/runtime/rt0_netbsd_arm.s index 2cb1182c06..503c32adac 100644 --- a/src/runtime/rt0_netbsd_arm.s +++ b/src/runtime/rt0_netbsd_arm.s @@ -4,8 +4,8 @@ #include "textflag.h" -TEXT _rt0_arm_netbsd(SB),NOSPLIT,$-4 - MOVW (R13), R0 // argc - MOVW $4(R13), R1 // argv - MOVM.DB.W [R0-R1], (R13) - B runtime·rt0_go(SB) +TEXT _rt0_arm_netbsd(SB),NOSPLIT,$0 + B _rt0_arm(SB) + +TEXT _rt0_arm_netbsd_lib(SB),NOSPLIT,$0 + B _rt0_arm_lib(SB) diff --git a/src/runtime/rt0_openbsd_386.s b/src/runtime/rt0_openbsd_386.s index f25d2e1cf0..959f4d655a 100644 --- a/src/runtime/rt0_openbsd_386.s +++ b/src/runtime/rt0_openbsd_386.s @@ -4,13 +4,14 @@ #include "textflag.h" -TEXT _rt0_386_openbsd(SB),NOSPLIT,$8 - MOVL 8(SP), AX - LEAL 12(SP), BX - MOVL AX, 0(SP) - MOVL BX, 4(SP) - CALL main(SB) - INT $3 +TEXT _rt0_386_openbsd(SB),NOSPLIT,$0 + JMP _rt0_386(SB) + +TEXT _rt0_386_openbsd_lib(SB),NOSPLIT,$0 + JMP _rt0_386_lib(SB) TEXT main(SB),NOSPLIT,$0 + // Remove the return address from the stack. + // rt0_go doesn't expect it to be there. + ADDL $4, SP JMP runtime·rt0_go(SB) diff --git a/src/runtime/rt0_openbsd_amd64.s b/src/runtime/rt0_openbsd_amd64.s index 58fe666391..c2f3f23f37 100644 --- a/src/runtime/rt0_openbsd_amd64.s +++ b/src/runtime/rt0_openbsd_amd64.s @@ -5,11 +5,7 @@ #include "textflag.h" TEXT _rt0_amd64_openbsd(SB),NOSPLIT,$-8 - LEAQ 8(SP), SI // argv - MOVQ 0(SP), DI // argc - MOVQ $main(SB), AX - JMP AX + JMP _rt0_amd64(SB) -TEXT main(SB),NOSPLIT,$-8 - MOVQ $runtime·rt0_go(SB), AX - JMP AX +TEXT _rt0_amd64_openbsd_lib(SB),NOSPLIT,$0 + JMP _rt0_amd64_lib(SB) diff --git a/src/runtime/rt0_openbsd_arm.s b/src/runtime/rt0_openbsd_arm.s index 6207e55982..3511c96abc 100644 --- a/src/runtime/rt0_openbsd_arm.s +++ b/src/runtime/rt0_openbsd_arm.s @@ -4,8 +4,8 @@ #include "textflag.h" -TEXT _rt0_arm_openbsd(SB),NOSPLIT,$-4 - MOVW (R13), R0 // argc - MOVW $4(R13), R1 // argv - MOVM.DB.W [R0-R1], (R13) - B runtime·rt0_go(SB) +TEXT _rt0_arm_openbsd(SB),NOSPLIT,$0 + B _rt0_arm(SB) + +TEXT _rt0_arm_openbsd_lib(SB),NOSPLIT,$0 + B _rt0_arm_lib(SB) diff --git a/src/runtime/rt0_plan9_386.s b/src/runtime/rt0_plan9_386.s index cbbf245632..64716158f1 100644 --- a/src/runtime/rt0_plan9_386.s +++ b/src/runtime/rt0_plan9_386.s @@ -14,7 +14,7 @@ TEXT _rt0_386_plan9(SB),NOSPLIT,$12 MOVL AX, 0(SP) LEAL inargv+0(FP), AX MOVL AX, 4(SP) - CALL runtime·rt0_go(SB) + JMP runtime·rt0_go(SB) GLOBL _tos(SB), NOPTR, $4 GLOBL _privates(SB), NOPTR, $4 diff --git a/src/runtime/rt0_plan9_arm.s b/src/runtime/rt0_plan9_arm.s index 2a35e4ef66..d6174a4df5 100644 --- a/src/runtime/rt0_plan9_arm.s +++ b/src/runtime/rt0_plan9_arm.s @@ -10,8 +10,6 @@ TEXT _rt0_arm_plan9(SB),NOSPLIT,$-4 MOVW R0, _tos(SB) MOVW 0(R13), R0 MOVW $4(R13), R1 - MOVW.W R1, -4(R13) - MOVW.W R0, -4(R13) B runtime·rt0_go(SB) GLOBL _tos(SB), NOPTR, $4 diff --git a/src/runtime/rt0_solaris_amd64.s b/src/runtime/rt0_solaris_amd64.s index e2d1e71bb4..5c46ded3ae 100644 --- a/src/runtime/rt0_solaris_amd64.s +++ 
b/src/runtime/rt0_solaris_amd64.s @@ -5,11 +5,7 @@ #include "textflag.h" TEXT _rt0_amd64_solaris(SB),NOSPLIT,$-8 - LEAQ 8(SP), SI // argv - MOVQ 0(SP), DI // argc - MOVQ $main(SB), AX - JMP AX + JMP _rt0_amd64(SB) -TEXT main(SB),NOSPLIT,$-8 - MOVQ $runtime·rt0_go(SB), AX - JMP AX +TEXT _rt0_amd64_solaris_lib(SB),NOSPLIT,$0 + JMP _rt0_amd64_lib(SB) diff --git a/src/runtime/rt0_windows_386.s b/src/runtime/rt0_windows_386.s index b9407a9879..fa39edd787 100644 --- a/src/runtime/rt0_windows_386.s +++ b/src/runtime/rt0_windows_386.s @@ -4,13 +4,8 @@ #include "textflag.h" -TEXT _rt0_386_windows(SB),NOSPLIT,$12 - MOVL 12(SP), AX - LEAL 16(SP), BX - MOVL AX, 4(SP) - MOVL BX, 8(SP) - MOVL $-1, 0(SP) // return PC for main - JMP _main(SB) +TEXT _rt0_386_windows(SB),NOSPLIT,$0 + JMP _rt0_386(SB) // When building with -buildmode=(c-shared or c-archive), this // symbol is called. For dynamic libraries it is called when the @@ -41,10 +36,12 @@ TEXT _rt0_386_windows_lib(SB),NOSPLIT,$0x1C RET TEXT _rt0_386_windows_lib_go(SB),NOSPLIT,$0 - MOVL $0, DI - MOVL $0, SI - MOVL $runtime·rt0_go(SB), AX - JMP AX + PUSHL $0 + PUSHL $0 + JMP runtime·rt0_go(SB) TEXT _main(SB),NOSPLIT,$0 + // Remove the return address from the stack. + // rt0_go doesn't expect it to be there. + ADDL $4, SP JMP runtime·rt0_go(SB) diff --git a/src/runtime/rt0_windows_amd64.s b/src/runtime/rt0_windows_amd64.s index 2f73b37f31..1604711cdb 100644 --- a/src/runtime/rt0_windows_amd64.s +++ b/src/runtime/rt0_windows_amd64.s @@ -7,10 +7,7 @@ #include "textflag.h" TEXT _rt0_amd64_windows(SB),NOSPLIT,$-8 - LEAQ 8(SP), SI // argv - MOVQ 0(SP), DI // argc - MOVQ $main(SB), AX - JMP AX + JMP _rt0_amd64(SB) // When building with -buildmode=(c-shared or c-archive), this // symbol is called. For dynamic libraries it is called when the @@ -42,7 +39,3 @@ TEXT _rt0_amd64_windows_lib_go(SB),NOSPLIT,$0 MOVQ $0, SI MOVQ $runtime·rt0_go(SB), AX JMP AX - -TEXT main(SB),NOSPLIT,$-8 - MOVQ $runtime·rt0_go(SB), AX - JMP AX diff --git a/src/runtime/runtime-gdb_test.go b/src/runtime/runtime-gdb_test.go index 1318babdea..5e0508631f 100644 --- a/src/runtime/runtime-gdb_test.go +++ b/src/runtime/runtime-gdb_test.go @@ -22,11 +22,15 @@ import ( func checkGdbEnvironment(t *testing.T) { testenv.MustHaveGoBuild(t) - if runtime.GOOS == "darwin" { + switch runtime.GOOS { + case "darwin": t.Skip("gdb does not work on darwin") - } - if runtime.GOOS == "linux" && runtime.GOARCH == "ppc64" { - t.Skip("skipping gdb tests on linux/ppc64; see golang.org/issue/17366") + case "netbsd": + t.Skip("gdb does not work with threads on NetBSD; see golang.org/issue/22893 and gnats.netbsd.org/52548") + case "linux": + if runtime.GOARCH == "ppc64" { + t.Skip("skipping gdb tests on linux/ppc64; see golang.org/issue/17366") + } } if final := os.Getenv("GOROOT_FINAL"); final != "" && runtime.GOROOT() != final { t.Skip("gdb test can fail with GOROOT_FINAL pending") @@ -76,7 +80,7 @@ import "fmt" import "runtime" var gslice []string func main() { - mapvar := make(map[string]string,5) + mapvar := make(map[string]string, 13) mapvar["abc"] = "def" mapvar["ghi"] = "jkl" strvar := "abc" @@ -84,7 +88,7 @@ func main() { slicevar := make([]string, 0, 16) slicevar = append(slicevar, mapvar["abc"]) fmt.Println("hi") // line 13 - _ = ptrvar + runtime.KeepAlive(ptrvar) gslice = slicevar runtime.KeepAlive(mapvar) } @@ -106,8 +110,8 @@ func testGdbPython(t *testing.T, cgo bool) { t.Skip("skipping because cgo is not enabled") } - t.Parallel() checkGdbEnvironment(t) + t.Parallel() checkGdbVersion(t) 
checkGdbPython(t) @@ -132,7 +136,7 @@ func testGdbPython(t *testing.T, cgo bool) { cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", "a.exe") cmd.Dir = dir - out, err := testEnv(cmd).CombinedOutput() + out, err := testenv.CleanCmdEnv(cmd).CombinedOutput() if err != nil { t.Fatalf("building source %v\n%s", err, out) } @@ -198,8 +202,10 @@ func testGdbPython(t *testing.T, cgo bool) { t.Fatalf("info goroutines failed: %s", bl) } - printMapvarRe := regexp.MustCompile(`\Q = map[string]string = {["abc"] = "def", ["ghi"] = "jkl"}\E$`) - if bl := blocks["print mapvar"]; !printMapvarRe.MatchString(bl) { + printMapvarRe1 := regexp.MustCompile(`\Q = map[string]string = {["abc"] = "def", ["ghi"] = "jkl"}\E$`) + printMapvarRe2 := regexp.MustCompile(`\Q = map[string]string = {["ghi"] = "jkl", ["abc"] = "def"}\E$`) + if bl := blocks["print mapvar"]; !printMapvarRe1.MatchString(bl) && + !printMapvarRe2.MatchString(bl) { t.Fatalf("print mapvar failed: %s", bl) } @@ -212,7 +218,7 @@ func testGdbPython(t *testing.T, cgo bool) { // a collection of scalar vars holding the fields. In such cases // the DWARF variable location expression should be of the // form "var.field" and not just "field". - infoLocalsRe := regexp.MustCompile(`^slicevar.len = `) + infoLocalsRe := regexp.MustCompile(`.*\sslicevar.cap = `) if bl := blocks["info locals"]; !infoLocalsRe.MatchString(bl) { t.Fatalf("info locals failed: %s", bl) } @@ -260,8 +266,8 @@ func TestGdbBacktrace(t *testing.T) { testenv.SkipFlaky(t, 15603) } - t.Parallel() checkGdbEnvironment(t) + t.Parallel() checkGdbVersion(t) dir, err := ioutil.TempDir("", "go-build") @@ -278,7 +284,7 @@ func TestGdbBacktrace(t *testing.T) { } cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", "a.exe") cmd.Dir = dir - out, err := testEnv(cmd).CombinedOutput() + out, err := testenv.CleanCmdEnv(cmd).CombinedOutput() if err != nil { t.Fatalf("building source %v\n%s", err, out) } @@ -330,8 +336,8 @@ func main() { // TestGdbAutotmpTypes ensures that types of autotmp variables appear in .debug_info // See bug #17830. func TestGdbAutotmpTypes(t *testing.T) { - t.Parallel() checkGdbEnvironment(t) + t.Parallel() checkGdbVersion(t) dir, err := ioutil.TempDir("", "go-build") @@ -346,9 +352,9 @@ func TestGdbAutotmpTypes(t *testing.T) { if err != nil { t.Fatalf("failed to create file: %v", err) } - cmd := exec.Command(testenv.GoToolPath(t), "build", "-gcflags=-N -l", "-o", "a.exe") + cmd := exec.Command(testenv.GoToolPath(t), "build", "-gcflags=all=-N -l", "-o", "a.exe") cmd.Dir = dir - out, err := testEnv(cmd).CombinedOutput() + out, err := testenv.CleanCmdEnv(cmd).CombinedOutput() if err != nil { t.Fatalf("building source %v\n%s", err, out) } @@ -381,3 +387,62 @@ func TestGdbAutotmpTypes(t *testing.T) { } } } + +const constsSource = ` +package main + +const aConstant int = 42 +const largeConstant uint64 = ^uint64(0) +const minusOne int64 = -1 + +func main() { + println("hello world") +} +` + +func TestGdbConst(t *testing.T) { + checkGdbEnvironment(t) + t.Parallel() + checkGdbVersion(t) + + dir, err := ioutil.TempDir("", "go-build") + if err != nil { + t.Fatalf("failed to create temp directory: %v", err) + } + defer os.RemoveAll(dir) + + // Build the source code. 
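Two details of the gdb test hunks above are easy to miss: the build flag changes from -gcflags "-N -l" to -gcflags=all=-N -l, where the all= pattern applies the flags to every package in the build rather than only the package named on the command line, and the tests drive gdb non-interactively with -nx -batch plus a series of -ex commands, comparing the combined output against expected strings. The standalone sketch below shows the same way of scripting gdb from Go; it assumes go and gdb are on PATH, and the binary path and session commands are placeholders rather than the tests' actual sessions.

package main

import (
	"fmt"
	"log"
	"os/exec"
)

func main() {
	// Build a throwaway binary with optimizations and inlining disabled
	// in every package, mirroring the flag used by the tests above.
	build := exec.Command("go", "build", "-gcflags=all=-N -l", "-o", "a.exe", ".")
	if out, err := build.CombinedOutput(); err != nil {
		log.Fatalf("build failed: %v\n%s", err, out)
	}

	// Drive gdb in batch mode: each -ex runs one command, then gdb exits.
	args := []string{
		"-nx", "-batch",
		"-ex", "break main.main",
		"-ex", "run",
		"-ex", "info locals",
		"./a.exe",
	}
	out, _ := exec.Command("gdb", args...).CombinedOutput()
	fmt.Printf("gdb output:\n%s", out)
}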
+ src := filepath.Join(dir, "main.go") + err = ioutil.WriteFile(src, []byte(constsSource), 0644) + if err != nil { + t.Fatalf("failed to create file: %v", err) + } + cmd := exec.Command(testenv.GoToolPath(t), "build", "-gcflags=all=-N -l", "-o", "a.exe") + cmd.Dir = dir + out, err := testenv.CleanCmdEnv(cmd).CombinedOutput() + if err != nil { + t.Fatalf("building source %v\n%s", err, out) + } + + // Execute gdb commands. + args := []string{"-nx", "-batch", + "-ex", "set startup-with-shell off", + "-ex", "break main.main", + "-ex", "run", + "-ex", "print main.aConstant", + "-ex", "print main.largeConstant", + "-ex", "print main.minusOne", + "-ex", "print 'runtime._MSpanInUse'", + "-ex", "print 'runtime._PageSize'", + filepath.Join(dir, "a.exe"), + } + got, _ := exec.Command("gdb", args...).CombinedOutput() + + sgot := strings.Replace(string(got), "\r\n", "\n", -1) + + t.Logf("output %q", sgot) + + if !strings.Contains(sgot, "\n$1 = 42\n$2 = 18446744073709551615\n$3 = -1\n$4 = 1 '\\001'\n$5 = 8192") { + t.Fatalf("output mismatch") + } +} diff --git a/src/runtime/runtime-lldb_test.go b/src/runtime/runtime-lldb_test.go index 98bc906666..9a287052ea 100644 --- a/src/runtime/runtime-lldb_test.go +++ b/src/runtime/runtime-lldb_test.go @@ -5,11 +5,7 @@ package runtime_test import ( - "debug/elf" - "debug/macho" - "encoding/binary" "internal/testenv" - "io" "io/ioutil" "os" "os/exec" @@ -158,7 +154,7 @@ func TestLldbPython(t *testing.T) { t.Fatalf("failed to create file: %v", err) } - cmd := exec.Command(testenv.GoToolPath(t), "build", "-gcflags", "-N -l", "-o", "a.exe") + cmd := exec.Command(testenv.GoToolPath(t), "build", "-gcflags=all=-N -l", "-o", "a.exe") cmd.Dir = dir out, err := cmd.CombinedOutput() if err != nil { @@ -182,81 +178,3 @@ func TestLldbPython(t *testing.T) { t.Fatalf("Unexpected lldb output:\n%s", got) } } - -// Check that aranges are valid even when lldb isn't installed. 
-func TestDwarfAranges(t *testing.T) { - testenv.MustHaveGoBuild(t) - dir, err := ioutil.TempDir("", "go-build") - if err != nil { - t.Fatalf("failed to create temp directory: %v", err) - } - defer os.RemoveAll(dir) - - src := filepath.Join(dir, "main.go") - err = ioutil.WriteFile(src, []byte(lldbHelloSource), 0644) - if err != nil { - t.Fatalf("failed to create file: %v", err) - } - - cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", "a.exe") - cmd.Dir = dir - out, err := cmd.CombinedOutput() - if err != nil { - t.Fatalf("building source %v\n%s", err, out) - } - - filename := filepath.Join(dir, "a.exe") - if f, err := elf.Open(filename); err == nil { - sect := f.Section(".debug_aranges") - if sect == nil { - t.Fatal("Missing aranges section") - } - verifyAranges(t, f.ByteOrder, sect.Open()) - } else if f, err := macho.Open(filename); err == nil { - sect := f.Section("__debug_aranges") - if sect == nil { - t.Fatal("Missing aranges section") - } - verifyAranges(t, f.ByteOrder, sect.Open()) - } else { - t.Skip("Not an elf or macho binary.") - } -} - -func verifyAranges(t *testing.T, byteorder binary.ByteOrder, data io.ReadSeeker) { - var header struct { - UnitLength uint32 // does not include the UnitLength field - Version uint16 - Offset uint32 - AddressSize uint8 - SegmentSize uint8 - } - for { - offset, err := data.Seek(0, io.SeekCurrent) - if err != nil { - t.Fatalf("Seek error: %v", err) - } - if err = binary.Read(data, byteorder, &header); err == io.EOF { - return - } else if err != nil { - t.Fatalf("Error reading arange header: %v", err) - } - tupleSize := int64(header.SegmentSize) + 2*int64(header.AddressSize) - lastTupleOffset := offset + int64(header.UnitLength) + 4 - tupleSize - if lastTupleOffset%tupleSize != 0 { - t.Fatalf("Invalid arange length %d, (addr %d, seg %d)", header.UnitLength, header.AddressSize, header.SegmentSize) - } - if _, err = data.Seek(lastTupleOffset, io.SeekStart); err != nil { - t.Fatalf("Seek error: %v", err) - } - buf := make([]byte, tupleSize) - if n, err := data.Read(buf); err != nil || int64(n) < tupleSize { - t.Fatalf("Read error: %v", err) - } - for _, val := range buf { - if val != 0 { - t.Fatalf("Invalid terminator") - } - } - } -} diff --git a/src/runtime/runtime.go b/src/runtime/runtime.go index 33ca75dfed..33ecc260dd 100644 --- a/src/runtime/runtime.go +++ b/src/runtime/runtime.go @@ -58,14 +58,8 @@ func syscall_Getpagesize() int { return int(physPageSize) } //go:linkname os_runtime_args os.runtime_args func os_runtime_args() []string { return append([]string{}, argslice...) } -//go:linkname boring_runtime_arg0 crypto/internal/boring.runtime_arg0 -func boring_runtime_arg0() string { - // On Windows, argslice is not set, and it's too much work to find argv0. 
- if len(argslice) == 0 { - return "" - } - return argslice[0] +//go:linkname syscall_Exit syscall.Exit +//go:nosplit +func syscall_Exit(code int) { + exit(int32(code)) } - -//go:linkname fipstls_runtime_arg0 crypto/internal/boring/fipstls.runtime_arg0 -func fipstls_runtime_arg0() string { return boring_runtime_arg0() } diff --git a/src/runtime/runtime1.go b/src/runtime/runtime1.go index c0733481a8..0971e0cb37 100644 --- a/src/runtime/runtime1.go +++ b/src/runtime/runtime1.go @@ -99,10 +99,6 @@ var test_z64, test_x64 uint64 func testAtomic64() { test_z64 = 42 test_x64 = 0 - prefetcht0(uintptr(unsafe.Pointer(&test_z64))) - prefetcht1(uintptr(unsafe.Pointer(&test_z64))) - prefetcht2(uintptr(unsafe.Pointer(&test_z64))) - prefetchnta(uintptr(unsafe.Pointer(&test_z64))) if atomic.Cas64(&test_z64, test_x64, 1) { throw("cas64 failed") } @@ -390,13 +386,6 @@ func parsedebugvars() { setTraceback(gogetenv("GOTRACEBACK")) traceback_env = traceback_cache - - // For cgocheck > 1, we turn on the write barrier at all times - // and check all pointer writes. - if debug.cgocheck > 1 { - writeBarrier.cgo = true - writeBarrier.enabled = true - } } //go:linkname setTraceback runtime/debug.SetTraceback diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 6871d9c68c..556f13d1c1 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -169,9 +169,13 @@ func efaceOf(ep *interface{}) *eface { // a word that is completely ignored by the GC than to have one for which // only a few updates are ignored. // -// Gs, Ms, and Ps are always reachable via true pointers in the -// allgs, allm, and allp lists or (during allocation before they reach those lists) +// Gs and Ps are always reachable via true pointers in the +// allgs and allp lists or (during allocation before they reach those lists) // from stack variables. +// +// Ms are always reachable via true pointers either from allm or +// freem. Unlike Gs and Ps we do free Ms, so it's important that +// nothing ever hold an muintptr across a safe point. // A guintptr holds a goroutine pointer, but typed as a uintptr // to bypass write barriers. It is used in the Gobuf goroutine state @@ -221,6 +225,15 @@ func (pp puintptr) ptr() *p { return (*p)(unsafe.Pointer(pp)) } //go:nosplit func (pp *puintptr) set(p *p) { *pp = puintptr(unsafe.Pointer(p)) } +// muintptr is a *m that is not tracked by the garbage collector. +// +// Because we do free Ms, there are some additional constrains on +// muintptrs: +// +// 1. Never hold an muintptr locally across a safe point. +// +// 2. Any muintptr in the heap must be owned by the M itself so it can +// ensure it is not in use when the last true *m is released. type muintptr uintptr //go:nosplit @@ -241,17 +254,19 @@ type gobuf struct { // The offsets of sp, pc, and g are known to (hard-coded in) libmach. // // ctxt is unusual with respect to GC: it may be a - // heap-allocated funcval so write require a write barrier, - // but gobuf needs to be cleared from assembly. We take - // advantage of the fact that the only path that uses a - // non-nil ctxt is morestack. As a result, gogo is the only - // place where it may not already be nil, so gogo uses an - // explicit write barrier. Everywhere else that resets the - // gobuf asserts that ctxt is already nil. + // heap-allocated funcval, so GC needs to track it, but it + // needs to be set and cleared from assembly, where it's + // difficult to have write barriers. 
However, ctxt is really a + // saved, live register, and we only ever exchange it between + // the real register and the gobuf. Hence, we treat it as a + // root during stack scanning, which means assembly that saves + // and restores it doesn't need write barriers. It's still + // typed as a pointer so that any other writes from Go get + // write barriers. sp uintptr pc uintptr g guintptr - ctxt unsafe.Pointer // this has to be a pointer so that gc scans it + ctxt unsafe.Pointer ret sys.Uintreg lr uintptr bp uintptr // for GOEXPERIMENT=framepointer @@ -272,11 +287,14 @@ type sudog struct { // channel this sudog is blocking on. shrinkstack depends on // this for sudogs involved in channel ops. - g *g - selectdone *uint32 // CAS to 1 to win select race (may point to stack) - next *sudog - prev *sudog - elem unsafe.Pointer // data element (may point to stack) + g *g + + // isSelect indicates g is participating in a select, so + // g.selectDone must be CAS'd to win the wake-up race. + isSelect bool + next *sudog + prev *sudog + elem unsafe.Pointer // data element (may point to stack) // The following fields are never accessed concurrently. // For channels, waitlink is only accessed by g. @@ -354,7 +372,7 @@ type g struct { sysexitticks int64 // cputicks when syscall has returned (for tracing) traceseq uint64 // trace event sequencer tracelastp puintptr // last P emitted an event for this goroutine - lockedm *m + lockedm muintptr sig uint32 writebuf []byte sigcode0 uintptr @@ -367,6 +385,7 @@ type g struct { cgoCtxt []uintptr // cgo traceback context labels unsafe.Pointer // profiler labels timer *timer // cached timer for time.Sleep + selectDone uint32 // are we participating in a select and did someone win the race? // Per-G GC state @@ -386,16 +405,17 @@ type m struct { divmod uint32 // div/mod denominator for arm - known to liblink // Fields not known to debuggers. - procid uint64 // for debuggers, but offset not hard-coded - gsignal *g // signal-handling g - sigmask sigset // storage for saved signal mask - tls [6]uintptr // thread-local storage (for x86 extern register) + procid uint64 // for debuggers, but offset not hard-coded + gsignal *g // signal-handling g + goSigStack gsignalStack // Go-allocated signal handling stack + sigmask sigset // storage for saved signal mask + tls [6]uintptr // thread-local storage (for x86 extern register) mstartfn func() curg *g // current running goroutine caughtsig guintptr // goroutine running during fatal signal p puintptr // attached p for executing go code (nil if not executing go code) nextp puintptr - id int32 + id int64 mallocing int32 throwing int32 preemptoff string // if != "", keep curg running on this m @@ -409,8 +429,11 @@ type m struct { inwb bool // m is executing a write barrier newSigstack bool // minit on C thread called sigaltstack printlock int8 - incgo bool // m is executing a cgo call - fastrand uint32 + incgo bool // m is executing a cgo call + freeWait uint32 // if == 0, safe to free g0 and delete m (atomic) + fastrand [2]uint32 + needextram bool + traceback uint8 ncgocall uint64 // number of cgo calls in total ncgo int32 // number of cgo calls currently in progress cgoCallersUse uint32 // if non-zero, cgoCallers in use temporarily @@ -419,15 +442,14 @@ type m struct { alllink *m // on allm schedlink muintptr mcache *mcache - lockedg *g - createstack [32]uintptr // stack that created this thread. 
- freglo [16]uint32 // d[i] lsb and f[i] - freghi [16]uint32 // d[i] msb and f[i+16] - fflag uint32 // floating point compare flags - locked uint32 // tracking for lockosthread - nextwaitm uintptr // next m waiting for lock - needextram bool - traceback uint8 + lockedg guintptr + createstack [32]uintptr // stack that created this thread. + freglo [16]uint32 // d[i] lsb and f[i] + freghi [16]uint32 // d[i] msb and f[i+16] + fflag uint32 // floating point compare flags + lockedExt uint32 // tracking for external LockOSThread + lockedInt uint32 // tracking for internal lockOSThread + nextwaitm muintptr // next m waiting for lock waitunlockf unsafe.Pointer // todo go func(*g, unsafe.pointer) bool waitlock unsafe.Pointer waittraceev byte @@ -435,6 +457,7 @@ type m struct { startingtrace bool syscalltick uint32 thread uintptr // thread handle + freelink *m // on sched.freem // these are here because they are too large to be on the stack // of low-level NOSPLIT functions. @@ -502,26 +525,30 @@ type p struct { palloc persistentAlloc // per-P to avoid mutex // Per-P GC state - gcAssistTime int64 // Nanoseconds in assistAlloc - gcBgMarkWorker guintptr - gcMarkWorkerMode gcMarkWorkerMode + gcAssistTime int64 // Nanoseconds in assistAlloc + gcFractionalMarkTime int64 // Nanoseconds in fractional mark worker + gcBgMarkWorker guintptr + gcMarkWorkerMode gcMarkWorkerMode + + // gcMarkWorkerStartTime is the nanotime() at which this mark + // worker started. + gcMarkWorkerStartTime int64 // gcw is this P's GC work buffer cache. The work buffer is // filled by write barriers, drained by mutator assists, and // disposed on certain GC state transitions. gcw gcWork + // wbBuf is this P's GC write barrier buffer. + // + // TODO: Consider caching this in the running G. + wbBuf wbBuf + runSafePointFn uint32 // if 1, run sched.safePointFn at next safe point pad [sys.CacheLineSize]byte } -const ( - // The max value of GOMAXPROCS. - // There are no fundamental restrictions on the value. - _MaxGomaxprocs = 1 << 10 -) - type schedt struct { // accessed atomically. keep at top to ensure alignment on 32-bit systems. goidgen uint64 @@ -529,11 +556,16 @@ type schedt struct { lock mutex + // When increasing nmidle, nmidlelocked, nmsys, or nmfreed, be + // sure to call checkdead(). + midle muintptr // idle m's waiting for work nmidle int32 // number of idle m's waiting for work nmidlelocked int32 // number of locked m's waiting for work - mcount int32 // number of m's that have been created + mnext int64 // number of m's that have been created and next M ID maxmcount int32 // maximum number of m's allowed (or die) + nmsys int32 // number of system m's not counted for deadlock + nmfreed int64 // cumulative number of freed m's ngsys uint32 // number of system goroutines; updated atomically @@ -560,6 +592,10 @@ type schedt struct { deferlock mutex deferpool [5]*_defer + // freem is the list of m's waiting to be freed when their + // m.exited is set. Linked through m.freelink. + freem *m + gcwaiting uint32 // gc is waiting to run stopwait int32 stopnote note @@ -578,18 +614,7 @@ type schedt struct { totaltime int64 // ∫gomaxprocs dt up to procresizetime } -// The m.locked word holds two pieces of state counting active calls to LockOSThread/lockOSThread. -// The low bit (LockExternal) is a boolean reporting whether any LockOSThread call is active. -// External locks are not recursive; a second lock is silently ignored. 
-// The upper bits of m.locked record the nesting depth of calls to lockOSThread -// (counting up by LockInternal), popped by unlockOSThread (counting down by LockInternal). -// Internal locks can be recursive. For instance, a lock for cgo can occur while the main -// goroutine is holding the lock during the initialization phase. -const ( - _LockExternal = 1 - _LockInternal = 2 -) - +// Values for the flags field of a sigTabT. const ( _SigNotify = 1 << iota // let signal.Notify have signal, even if from kernel _SigKill // if signal.Notify doesn't take it, exit quietly @@ -598,7 +623,8 @@ const ( _SigDefault // if the signal isn't explicitly requested, don't monitor it _SigGoExit // cause all runtime procs to exit (only used on Plan 9). _SigSetStack // add SA_ONSTACK to libc handler - _SigUnblock // unblocked in minit + _SigUnblock // always unblock; see blockableSig + _SigIgn // _SIG_DFL action is to ignore the signal ) // Layout of in-memory per-function information prepared by linker @@ -624,14 +650,11 @@ type _func struct { // Needs to be in sync with // ../cmd/compile/internal/gc/reflect.go:/^func.dumptypestructs. type itab struct { - inter *interfacetype - _type *_type - link *itab - hash uint32 // copy of _type.hash. Used for type switches. - bad bool // type does not implement interface - inhash bool // has this itab been added to hash? - unused [2]byte - fun [1]uintptr // variable sized + inter *interfacetype + _type *_type + hash uint32 // copy of _type.hash. Used for type switches. + _ [4]byte + fun [1]uintptr // variable sized. fun[0]==0 means _type does not implement inter. } // Lock-free stack node. @@ -672,7 +695,8 @@ func extendRandom(r []byte, n int) { } } -// deferred subroutine calls +// A _defer holds an entry on the list of deferred calls. +// If you add a field here, add code to clear it in freedefer. type _defer struct { siz int32 started bool @@ -716,15 +740,15 @@ const ( const _TracebackMaxFrames = 100 var ( - emptystring string - allglen uintptr - allm *m - allp [_MaxGomaxprocs + 1]*p - gomaxprocs int32 - ncpu int32 - forcegc forcegcstate - sched schedt - newprocs int32 + allglen uintptr + allm *m + allp []*p // len(allp) == gomaxprocs; may change at safe points, otherwise immutable + allpLock mutex // Protects P-less reads of allp and all writes + gomaxprocs int32 + ncpu int32 + forcegc forcegcstate + sched schedt + newprocs int32 // Information about what cpu features are available. // Set on startup in asm_{386,amd64,amd64p32}.s. diff --git a/src/runtime/runtime_boring.go b/src/runtime/runtime_boring.go new file mode 100644 index 0000000000..5a98b20253 --- /dev/null +++ b/src/runtime/runtime_boring.go @@ -0,0 +1,19 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import _ "unsafe" // for go:linkname + +//go:linkname boring_runtime_arg0 crypto/internal/boring.runtime_arg0 +func boring_runtime_arg0() string { + // On Windows, argslice is not set, and it's too much work to find argv0. + if len(argslice) == 0 { + return "" + } + return argslice[0] +} + +//go:linkname fipstls_runtime_arg0 crypto/internal/boring/fipstls.runtime_arg0 +func fipstls_runtime_arg0() string { return boring_runtime_arg0() } diff --git a/src/runtime/runtime_linux_test.go b/src/runtime/runtime_linux_test.go index 2b6daecbfc..612397293f 100644 --- a/src/runtime/runtime_linux_test.go +++ b/src/runtime/runtime_linux_test.go @@ -8,6 +8,7 @@ import ( . 
"runtime" "syscall" "testing" + "time" "unsafe" ) @@ -21,6 +22,17 @@ func init() { // for how it is used in init (must be on main thread). pid, tid = syscall.Getpid(), syscall.Gettid() LockOSThread() + + sysNanosleep = func(d time.Duration) { + // Invoke a blocking syscall directly; calling time.Sleep() + // would deschedule the goroutine instead. + ts := syscall.NsecToTimespec(d.Nanoseconds()) + for { + if err := syscall.Nanosleep(&ts, &ts); err != syscall.EINTR { + return + } + } + } } func TestLockOSThread(t *testing.T) { diff --git a/src/runtime/runtime_mmap_test.go b/src/runtime/runtime_mmap_test.go index 2eca6b9e88..57c38bc5dc 100644 --- a/src/runtime/runtime_mmap_test.go +++ b/src/runtime/runtime_mmap_test.go @@ -16,16 +16,10 @@ import ( // what the code in mem_bsd.go, mem_darwin.go, and mem_linux.go expects. // See the uses of ENOMEM in sysMap in those files. func TestMmapErrorSign(t *testing.T) { - p := runtime.Mmap(nil, ^uintptr(0)&^(runtime.GetPhysPageSize()-1), 0, runtime.MAP_ANON|runtime.MAP_PRIVATE, -1, 0) + p, err := runtime.Mmap(nil, ^uintptr(0)&^(runtime.GetPhysPageSize()-1), 0, runtime.MAP_ANON|runtime.MAP_PRIVATE, -1, 0) - // The runtime.mmap function is nosplit, but t.Errorf is not. - // Reset the pointer so that we don't get an "invalid stack - // pointer" error from t.Errorf if we call it. - v := uintptr(p) - p = nil - - if v != runtime.ENOMEM { - t.Errorf("mmap = %v, want %v", v, runtime.ENOMEM) + if p != nil || err != runtime.ENOMEM { + t.Errorf("mmap = %v, %v, want nil, %v", p, err, runtime.ENOMEM) } } @@ -35,20 +29,20 @@ func TestPhysPageSize(t *testing.T) { ps := runtime.GetPhysPageSize() // Get a region of memory to play with. This should be page-aligned. - b := uintptr(runtime.Mmap(nil, 2*ps, 0, runtime.MAP_ANON|runtime.MAP_PRIVATE, -1, 0)) - if b < 4096 { - t.Fatalf("Mmap: %v", b) + b, err := runtime.Mmap(nil, 2*ps, 0, runtime.MAP_ANON|runtime.MAP_PRIVATE, -1, 0) + if err != 0 { + t.Fatalf("Mmap: %v", err) } // Mmap should fail at a half page into the buffer. - err := uintptr(runtime.Mmap(unsafe.Pointer(uintptr(b)+ps/2), ps, 0, runtime.MAP_ANON|runtime.MAP_PRIVATE|runtime.MAP_FIXED, -1, 0)) - if err >= 4096 { + _, err = runtime.Mmap(unsafe.Pointer(uintptr(b)+ps/2), ps, 0, runtime.MAP_ANON|runtime.MAP_PRIVATE|runtime.MAP_FIXED, -1, 0) + if err == 0 { t.Errorf("Mmap should have failed with half-page alignment %d, but succeeded: %v", ps/2, err) } // Mmap should succeed at a full page into the buffer. - err = uintptr(runtime.Mmap(unsafe.Pointer(uintptr(b)+ps), ps, 0, runtime.MAP_ANON|runtime.MAP_PRIVATE|runtime.MAP_FIXED, -1, 0)) - if err < 4096 { + _, err = runtime.Mmap(unsafe.Pointer(uintptr(b)+ps), ps, 0, runtime.MAP_ANON|runtime.MAP_PRIVATE|runtime.MAP_FIXED, -1, 0) + if err != 0 { t.Errorf("Mmap at full-page alignment %d failed: %v", ps, err) } } diff --git a/src/runtime/runtime_test.go b/src/runtime/runtime_test.go index e9bc256712..d5b6b3ac3c 100644 --- a/src/runtime/runtime_test.go +++ b/src/runtime/runtime_test.go @@ -5,6 +5,7 @@ package runtime_test import ( + "flag" "io" . "runtime" "runtime/debug" @@ -13,6 +14,8 @@ import ( "unsafe" ) +var flagQuick = flag.Bool("quick", false, "skip slow tests, for second run in all.bash") + func init() { // We're testing the runtime, so make tracebacks show things // in the runtime. 
This only raises the level, so it won't @@ -196,9 +199,9 @@ func eqstring_generic(s1, s2 string) bool { } func TestEqString(t *testing.T) { - // This isn't really an exhaustive test of eqstring, it's + // This isn't really an exhaustive test of == on strings, it's // just a convenient way of documenting (via eqstring_generic) - // what eqstring does. + // what == does. s := []string{ "", "a", @@ -213,7 +216,7 @@ func TestEqString(t *testing.T) { x := s1 == s2 y := eqstring_generic(s1, s2) if x != y { - t.Errorf(`eqstring("%s","%s") = %t, want %t`, s1, s2, x, y) + t.Errorf(`("%s" == "%s") = %t, want %t`, s1, s2, x, y) } } } diff --git a/src/runtime/rwmutex_test.go b/src/runtime/rwmutex_test.go index a69eca1511..872b3b098e 100644 --- a/src/runtime/rwmutex_test.go +++ b/src/runtime/rwmutex_test.go @@ -12,6 +12,7 @@ package runtime_test import ( "fmt" . "runtime" + "runtime/debug" "sync/atomic" "testing" ) @@ -47,6 +48,10 @@ func doTestParallelReaders(numReaders int) { func TestParallelRWMutexReaders(t *testing.T) { defer GOMAXPROCS(GOMAXPROCS(-1)) + // If runtime triggers a forced GC during this test then it will deadlock, + // since the goroutines can't be stopped/preempted. + // Disable GC for this test (see issue #10958). + defer debug.SetGCPercent(debug.SetGCPercent(-1)) doTestParallelReaders(1) doTestParallelReaders(3) doTestParallelReaders(4) diff --git a/src/runtime/select.go b/src/runtime/select.go index 715cee8750..b59c096928 100644 --- a/src/runtime/select.go +++ b/src/runtime/select.go @@ -73,7 +73,7 @@ func newselect(sel *hselect, selsize int64, size int32) { } func selectsend(sel *hselect, c *hchan, elem unsafe.Pointer) { - pc := getcallerpc(unsafe.Pointer(&sel)) + pc := getcallerpc() i := sel.ncase if i >= sel.tcase { throw("selectsend: too many cases") @@ -94,7 +94,7 @@ func selectsend(sel *hselect, c *hchan, elem unsafe.Pointer) { } func selectrecv(sel *hselect, c *hchan, elem unsafe.Pointer, received *bool) { - pc := getcallerpc(unsafe.Pointer(&sel)) + pc := getcallerpc() i := sel.ncase if i >= sel.tcase { throw("selectrecv: too many cases") @@ -116,7 +116,7 @@ func selectrecv(sel *hselect, c *hchan, elem unsafe.Pointer, received *bool) { } func selectdefault(sel *hselect) { - pc := getcallerpc(unsafe.Pointer(&sel)) + pc := getcallerpc() i := sel.ncase if i >= sel.tcase { throw("selectdefault: too many cases") @@ -286,7 +286,6 @@ func selectgo(sel *hselect) int { var ( gp *g - done uint32 sg *sudog c *hchan k *scase @@ -353,7 +352,6 @@ loop: // pass 2 - enqueue on all chans gp = getg() - done = 0 if gp.waiting != nil { throw("gp.waiting != nil") } @@ -367,8 +365,7 @@ loop: c = cas.c sg := acquireSudog() sg.g = gp - // Note: selectdone is adjusted for stack copies in stack1.go:adjustsudogs - sg.selectdone = (*uint32)(noescape(unsafe.Pointer(&done))) + sg.isSelect = true // No stack splits between assigning elem and enqueuing // sg on gp.waiting where copystack can find it. sg.elem = cas.elem @@ -394,62 +391,9 @@ loop: gp.param = nil gopark(selparkcommit, nil, "select", traceEvGoBlockSelect, 1) - // While we were asleep, some goroutine came along and completed - // one of the cases in the select and woke us up (called ready). - // As part of that process, the goroutine did a cas on done above - // (aka *sg.selectdone for all queued sg) to win the right to - // complete the select. Now done = 1. - // - // If we copy (grow) our own stack, we will update the - // selectdone pointers inside the gp.waiting sudog list to point - // at the new stack. 
Another goroutine attempting to - // complete one of our (still linked in) select cases might - // see the new selectdone pointer (pointing at the new stack) - // before the new stack has real data; if the new stack has done = 0 - // (before the old values are copied over), the goroutine might - // do a cas via sg.selectdone and incorrectly believe that it has - // won the right to complete the select, executing a second - // communication and attempting to wake us (call ready) again. - // - // Then things break. - // - // The best break is that the goroutine doing ready sees the - // _Gcopystack status and throws, as in #17007. - // A worse break would be for us to continue on, start running real code, - // block in a semaphore acquisition (sema.go), and have the other - // goroutine wake us up without having really acquired the semaphore. - // That would result in the goroutine spuriously running and then - // queue up another spurious wakeup when the semaphore really is ready. - // In general the situation can cascade until something notices the - // problem and causes a crash. - // - // A stack shrink does not have this problem, because it locks - // all the channels that are involved first, blocking out the - // possibility of a cas on selectdone. - // - // A stack growth before gopark above does not have this - // problem, because we hold those channel locks (released by - // selparkcommit). - // - // A stack growth after sellock below does not have this - // problem, because again we hold those channel locks. - // - // The only problem is a stack growth during sellock. - // To keep that from happening, run sellock on the system stack. - // - // It might be that we could avoid this if copystack copied the - // stack before calling adjustsudogs. In that case, - // syncadjustsudogs would need to recopy the tiny part that - // it copies today, resulting in a little bit of extra copying. - // - // An even better fix, not for the week before a release candidate, - // would be to put space in every sudog and make selectdone - // point at (say) the space in the first sudog. - - systemstack(func() { - sellock(scases, lockorder) - }) + sellock(scases, lockorder) + gp.selectDone = 0 sg = (*sudog)(gp.param) gp.param = nil @@ -462,7 +406,7 @@ loop: sglist = gp.waiting // Clear all elem before unlinking from gp.waiting. for sg1 := gp.waiting; sg1 != nil; sg1 = sg1.waitlink { - sg1.selectdone = nil + sg1.isSelect = false sg1.elem = nil sg1.c = nil } @@ -513,10 +457,8 @@ loop: print("wait-return: sel=", sel, " c=", c, " cas=", cas, " kind=", cas.kind, "\n") } - if cas.kind == caseRecv { - if cas.receivedp != nil { - *cas.receivedp = true - } + if cas.kind == caseRecv && cas.receivedp != nil { + *cas.receivedp = true } if raceenabled { diff --git a/src/runtime/sema.go b/src/runtime/sema.go index 8715e07d7a..d5ea14d46d 100644 --- a/src/runtime/sema.go +++ b/src/runtime/sema.go @@ -275,7 +275,10 @@ func (root *semaRoot) queue(addr *uint32, s *sudog, lifo bool) { // on the ticket: s.ticket <= both s.prev.ticket and s.next.ticket. // https://en.wikipedia.org/wiki/Treap // http://faculty.washington.edu/aragon/pubs/rst89.pdf - s.ticket = fastrand() + // + // s.ticket compared with zero in couple of places, therefore set lowest bit. + // It will not affect treap's quality noticeably. 
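The select hunk above replaces the old stack-allocated selectdone word, whose address had to be handled specially when a stack was copied (the hazard the long removed comment describes), with an isSelect flag on each sudog and a selectDone word stored on the g itself, so the wake-up race can be decided by a plain CAS that stack growth cannot disturb. (The sema hunk that follows is a separate change: s.ticket is ORed with 1 so it is never zero, since tickets are compared against zero in a couple of places, at negligible cost to the treap's balance.) Below is a minimal, self-contained sketch of the wake-up race with simplified names; the real runtime performs the CAS on the blocked goroutine's g and under the channel locks, not on a local variable.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

func main() {
	// selectDone plays the role of g.selectDone above: a word that each
	// candidate waker tries to CAS from 0 to 1. Only the winner is allowed
	// to complete the communication and wake the selecting goroutine.
	var selectDone uint32
	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		i := i
		wg.Add(1)
		go func() {
			defer wg.Done()
			if atomic.CompareAndSwapUint32(&selectDone, 0, 1) {
				fmt.Printf("case %d won the wake-up race\n", i)
			}
		}()
	}
	wg.Wait()
}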
+ s.ticket = fastrand() | 1 s.parent = last *pt = s diff --git a/src/runtime/signal_darwin.go b/src/runtime/signal_darwin.go index 0c5481a2ef..8090fb22a5 100644 --- a/src/runtime/signal_darwin.go +++ b/src/runtime/signal_darwin.go @@ -4,11 +4,6 @@ package runtime -type sigTabT struct { - flags int32 - name string -} - var sigtable = [...]sigTabT{ /* 0 */ {0, "SIGNONE: no trap"}, /* 1 */ {_SigNotify + _SigKill, "SIGHUP: terminal line hangup"}, @@ -26,20 +21,20 @@ var sigtable = [...]sigTabT{ /* 13 */ {_SigNotify, "SIGPIPE: write to broken pipe"}, /* 14 */ {_SigNotify, "SIGALRM: alarm clock"}, /* 15 */ {_SigNotify + _SigKill, "SIGTERM: termination"}, - /* 16 */ {_SigNotify, "SIGURG: urgent condition on socket"}, + /* 16 */ {_SigNotify + _SigIgn, "SIGURG: urgent condition on socket"}, /* 17 */ {0, "SIGSTOP: stop"}, - /* 18 */ {_SigNotify + _SigDefault, "SIGTSTP: keyboard stop"}, - /* 19 */ {_SigNotify + _SigDefault, "SIGCONT: continue after stop"}, - /* 20 */ {_SigNotify + _SigUnblock, "SIGCHLD: child status has changed"}, - /* 21 */ {_SigNotify + _SigDefault, "SIGTTIN: background read from tty"}, - /* 22 */ {_SigNotify + _SigDefault, "SIGTTOU: background write to tty"}, - /* 23 */ {_SigNotify, "SIGIO: i/o now possible"}, + /* 18 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTSTP: keyboard stop"}, + /* 19 */ {_SigNotify + _SigDefault + _SigIgn, "SIGCONT: continue after stop"}, + /* 20 */ {_SigNotify + _SigUnblock + _SigIgn, "SIGCHLD: child status has changed"}, + /* 21 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTTIN: background read from tty"}, + /* 22 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTTOU: background write to tty"}, + /* 23 */ {_SigNotify + _SigIgn, "SIGIO: i/o now possible"}, /* 24 */ {_SigNotify, "SIGXCPU: cpu limit exceeded"}, /* 25 */ {_SigNotify, "SIGXFSZ: file size limit exceeded"}, /* 26 */ {_SigNotify, "SIGVTALRM: virtual alarm clock"}, /* 27 */ {_SigNotify + _SigUnblock, "SIGPROF: profiling alarm clock"}, - /* 28 */ {_SigNotify, "SIGWINCH: window size change"}, - /* 29 */ {_SigNotify, "SIGINFO: status request from keyboard"}, + /* 28 */ {_SigNotify + _SigIgn, "SIGWINCH: window size change"}, + /* 29 */ {_SigNotify + _SigIgn, "SIGINFO: status request from keyboard"}, /* 30 */ {_SigNotify, "SIGUSR1: user-defined signal 1"}, /* 31 */ {_SigNotify, "SIGUSR2: user-defined signal 2"}, } diff --git a/src/runtime/signal_darwin_arm.go b/src/runtime/signal_darwin_arm.go index c88b90c5e2..9a5d3ac5bb 100644 --- a/src/runtime/signal_darwin_arm.go +++ b/src/runtime/signal_darwin_arm.go @@ -36,7 +36,7 @@ func (c *sigctxt) lr() uint32 { return c.regs().lr } func (c *sigctxt) pc() uint32 { return c.regs().pc } func (c *sigctxt) cpsr() uint32 { return c.regs().cpsr } -func (c *sigctxt) fault() uint32 { return c.info.si_addr } +func (c *sigctxt) fault() uintptr { return uintptr(c.info.si_addr) } func (c *sigctxt) sigcode() uint32 { return uint32(c.info.si_code) } func (c *sigctxt) trap() uint32 { return 0 } func (c *sigctxt) error() uint32 { return 0 } diff --git a/src/runtime/signal_darwin_arm64.go b/src/runtime/signal_darwin_arm64.go index b14b9f1e50..41b8fcaab9 100644 --- a/src/runtime/signal_darwin_arm64.go +++ b/src/runtime/signal_darwin_arm64.go @@ -52,7 +52,7 @@ func (c *sigctxt) sp() uint64 { return c.regs().sp } //go:nowritebarrierrec func (c *sigctxt) pc() uint64 { return c.regs().pc } -func (c *sigctxt) fault() uint64 { return uint64(uintptr(unsafe.Pointer(c.info.si_addr))) } +func (c *sigctxt) fault() uintptr { return uintptr(unsafe.Pointer(c.info.si_addr)) } func (c *sigctxt) 
sigcode() uint64 { return uint64(c.info.si_code) } func (c *sigctxt) sigaddr() uint64 { return uint64(uintptr(unsafe.Pointer(c.info.si_addr))) } diff --git a/src/runtime/signal_dragonfly.go b/src/runtime/signal_dragonfly.go index 8e9ce17c86..f2b26e7179 100644 --- a/src/runtime/signal_dragonfly.go +++ b/src/runtime/signal_dragonfly.go @@ -4,11 +4,6 @@ package runtime -type sigTabT struct { - flags int32 - name string -} - var sigtable = [...]sigTabT{ /* 0 */ {0, "SIGNONE: no trap"}, /* 1 */ {_SigNotify + _SigKill, "SIGHUP: terminal line hangup"}, @@ -26,20 +21,20 @@ var sigtable = [...]sigTabT{ /* 13 */ {_SigNotify, "SIGPIPE: write to broken pipe"}, /* 14 */ {_SigNotify, "SIGALRM: alarm clock"}, /* 15 */ {_SigNotify + _SigKill, "SIGTERM: termination"}, - /* 16 */ {_SigNotify, "SIGURG: urgent condition on socket"}, + /* 16 */ {_SigNotify + _SigIgn, "SIGURG: urgent condition on socket"}, /* 17 */ {0, "SIGSTOP: stop"}, - /* 18 */ {_SigNotify + _SigDefault, "SIGTSTP: keyboard stop"}, - /* 19 */ {_SigNotify + _SigDefault, "SIGCONT: continue after stop"}, - /* 20 */ {_SigNotify + _SigUnblock, "SIGCHLD: child status has changed"}, - /* 21 */ {_SigNotify + _SigDefault, "SIGTTIN: background read from tty"}, - /* 22 */ {_SigNotify + _SigDefault, "SIGTTOU: background write to tty"}, - /* 23 */ {_SigNotify, "SIGIO: i/o now possible"}, + /* 18 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTSTP: keyboard stop"}, + /* 19 */ {_SigNotify + _SigDefault + _SigIgn, "SIGCONT: continue after stop"}, + /* 20 */ {_SigNotify + _SigUnblock + _SigIgn, "SIGCHLD: child status has changed"}, + /* 21 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTTIN: background read from tty"}, + /* 22 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTTOU: background write to tty"}, + /* 23 */ {_SigNotify + _SigIgn, "SIGIO: i/o now possible"}, /* 24 */ {_SigNotify, "SIGXCPU: cpu limit exceeded"}, /* 25 */ {_SigNotify, "SIGXFSZ: file size limit exceeded"}, /* 26 */ {_SigNotify, "SIGVTALRM: virtual alarm clock"}, /* 27 */ {_SigNotify + _SigUnblock, "SIGPROF: profiling alarm clock"}, - /* 28 */ {_SigNotify, "SIGWINCH: window size change"}, - /* 29 */ {_SigNotify, "SIGINFO: status request from keyboard"}, + /* 28 */ {_SigNotify + _SigIgn, "SIGWINCH: window size change"}, + /* 29 */ {_SigNotify + _SigIgn, "SIGINFO: status request from keyboard"}, /* 30 */ {_SigNotify, "SIGUSR1: user-defined signal 1"}, /* 31 */ {_SigNotify, "SIGUSR2: user-defined signal 2"}, /* 32 */ {_SigNotify, "SIGTHR: reserved"}, diff --git a/src/runtime/signal_freebsd.go b/src/runtime/signal_freebsd.go index 7ce7217e07..2812c69989 100644 --- a/src/runtime/signal_freebsd.go +++ b/src/runtime/signal_freebsd.go @@ -4,11 +4,6 @@ package runtime -type sigTabT struct { - flags int32 - name string -} - var sigtable = [...]sigTabT{ /* 0 */ {0, "SIGNONE: no trap"}, /* 1 */ {_SigNotify + _SigKill, "SIGHUP: terminal line hangup"}, @@ -26,20 +21,20 @@ var sigtable = [...]sigTabT{ /* 13 */ {_SigNotify, "SIGPIPE: write to broken pipe"}, /* 14 */ {_SigNotify, "SIGALRM: alarm clock"}, /* 15 */ {_SigNotify + _SigKill, "SIGTERM: termination"}, - /* 16 */ {_SigNotify, "SIGURG: urgent condition on socket"}, + /* 16 */ {_SigNotify + _SigIgn, "SIGURG: urgent condition on socket"}, /* 17 */ {0, "SIGSTOP: stop"}, - /* 18 */ {_SigNotify + _SigDefault, "SIGTSTP: keyboard stop"}, - /* 19 */ {_SigNotify + _SigDefault, "SIGCONT: continue after stop"}, - /* 20 */ {_SigNotify + _SigUnblock, "SIGCHLD: child status has changed"}, - /* 21 */ {_SigNotify + _SigDefault, "SIGTTIN: background read from tty"}, - /* 
22 */ {_SigNotify + _SigDefault, "SIGTTOU: background write to tty"}, - /* 23 */ {_SigNotify, "SIGIO: i/o now possible"}, + /* 18 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTSTP: keyboard stop"}, + /* 19 */ {_SigNotify + _SigDefault + _SigIgn, "SIGCONT: continue after stop"}, + /* 20 */ {_SigNotify + _SigUnblock + _SigIgn, "SIGCHLD: child status has changed"}, + /* 21 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTTIN: background read from tty"}, + /* 22 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTTOU: background write to tty"}, + /* 23 */ {_SigNotify + _SigIgn, "SIGIO: i/o now possible"}, /* 24 */ {_SigNotify, "SIGXCPU: cpu limit exceeded"}, /* 25 */ {_SigNotify, "SIGXFSZ: file size limit exceeded"}, /* 26 */ {_SigNotify, "SIGVTALRM: virtual alarm clock"}, /* 27 */ {_SigNotify + _SigUnblock, "SIGPROF: profiling alarm clock"}, - /* 28 */ {_SigNotify, "SIGWINCH: window size change"}, - /* 29 */ {_SigNotify, "SIGINFO: status request from keyboard"}, + /* 28 */ {_SigNotify + _SigIgn, "SIGWINCH: window size change"}, + /* 29 */ {_SigNotify + _SigIgn, "SIGINFO: status request from keyboard"}, /* 30 */ {_SigNotify, "SIGUSR1: user-defined signal 1"}, /* 31 */ {_SigNotify, "SIGUSR2: user-defined signal 2"}, /* 32 */ {_SigNotify, "SIGTHR: reserved"}, diff --git a/src/runtime/signal_freebsd_arm.go b/src/runtime/signal_freebsd_arm.go index 9601370112..2135c1e752 100644 --- a/src/runtime/signal_freebsd_arm.go +++ b/src/runtime/signal_freebsd_arm.go @@ -36,7 +36,7 @@ func (c *sigctxt) lr() uint32 { return c.regs().__gregs[14] } func (c *sigctxt) pc() uint32 { return c.regs().__gregs[15] } func (c *sigctxt) cpsr() uint32 { return c.regs().__gregs[16] } -func (c *sigctxt) fault() uint32 { return uint32(c.info.si_addr) } +func (c *sigctxt) fault() uintptr { return uintptr(c.info.si_addr) } func (c *sigctxt) trap() uint32 { return 0 } func (c *sigctxt) error() uint32 { return 0 } func (c *sigctxt) oldmask() uint32 { return 0 } diff --git a/src/runtime/signal_linux_arm.go b/src/runtime/signal_linux_arm.go index 06a57b83b9..876b505917 100644 --- a/src/runtime/signal_linux_arm.go +++ b/src/runtime/signal_linux_arm.go @@ -39,7 +39,7 @@ func (c *sigctxt) lr() uint32 { return c.regs().lr } func (c *sigctxt) pc() uint32 { return c.regs().pc } func (c *sigctxt) cpsr() uint32 { return c.regs().cpsr } -func (c *sigctxt) fault() uint32 { return c.regs().fault_address } +func (c *sigctxt) fault() uintptr { return uintptr(c.regs().fault_address) } func (c *sigctxt) trap() uint32 { return c.regs().trap_no } func (c *sigctxt) error() uint32 { return c.regs().error_code } func (c *sigctxt) oldmask() uint32 { return c.regs().oldmask } diff --git a/src/runtime/signal_linux_arm64.go b/src/runtime/signal_linux_arm64.go index f3d4d384e5..2075f253d7 100644 --- a/src/runtime/signal_linux_arm64.go +++ b/src/runtime/signal_linux_arm64.go @@ -56,7 +56,7 @@ func (c *sigctxt) sp() uint64 { return c.regs().sp } func (c *sigctxt) pc() uint64 { return c.regs().pc } func (c *sigctxt) pstate() uint64 { return c.regs().pstate } -func (c *sigctxt) fault() uint64 { return c.regs().fault_address } +func (c *sigctxt) fault() uintptr { return uintptr(c.regs().fault_address) } func (c *sigctxt) sigcode() uint64 { return uint64(c.info.si_code) } func (c *sigctxt) sigaddr() uint64 { return c.info.si_addr } diff --git a/src/runtime/signal_linux_ppc64x.go b/src/runtime/signal_linux_ppc64x.go index b6831bc22d..97cb26d587 100644 --- a/src/runtime/signal_linux_ppc64x.go +++ b/src/runtime/signal_linux_ppc64x.go @@ -67,7 +67,7 @@ func (c *sigctxt) ccr() 
uint64 { return c.regs().ccr } func (c *sigctxt) sigcode() uint32 { return uint32(c.info.si_code) } func (c *sigctxt) sigaddr() uint64 { return c.info.si_addr } -func (c *sigctxt) fault() uint64 { return c.regs().dar } +func (c *sigctxt) fault() uintptr { return uintptr(c.regs().dar) } func (c *sigctxt) set_r0(x uint64) { c.regs().gpr[0] = x } func (c *sigctxt) set_r12(x uint64) { c.regs().gpr[12] = x } diff --git a/src/runtime/signal_nacl.go b/src/runtime/signal_nacl.go index 47930757da..ad321d8b75 100644 --- a/src/runtime/signal_nacl.go +++ b/src/runtime/signal_nacl.go @@ -26,13 +26,13 @@ var sigtable = [...]sigTabT{ /* 13 */ {_SigNotify, "SIGPIPE: write to broken pipe"}, /* 14 */ {_SigNotify, "SIGALRM: alarm clock"}, /* 15 */ {_SigNotify + _SigKill, "SIGTERM: termination"}, - /* 16 */ {_SigNotify, "SIGURG: urgent condition on socket"}, + /* 16 */ {_SigNotify + _SigIgn, "SIGURG: urgent condition on socket"}, /* 17 */ {0, "SIGSTOP: stop"}, - /* 18 */ {_SigNotify + _SigDefault, "SIGTSTP: keyboard stop"}, - /* 19 */ {_SigNotify + _SigDefault, "SIGCONT: continue after stop"}, - /* 20 */ {_SigNotify, "SIGCHLD: child status has changed"}, - /* 21 */ {_SigNotify + _SigDefault, "SIGTTIN: background read from tty"}, - /* 22 */ {_SigNotify + _SigDefault, "SIGTTOU: background write to tty"}, + /* 18 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTSTP: keyboard stop"}, + /* 19 */ {_SigNotify + _SigDefault + _SigIgn, "SIGCONT: continue after stop"}, + /* 20 */ {_SigNotify + _SigIgn, "SIGCHLD: child status has changed"}, + /* 21 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTTIN: background read from tty"}, + /* 22 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTTOU: background write to tty"}, /* 23 */ {_SigNotify, "SIGIO: i/o now possible"}, /* 24 */ {_SigNotify, "SIGXCPU: cpu limit exceeded"}, /* 25 */ {_SigNotify, "SIGXFSZ: file size limit exceeded"}, diff --git a/src/runtime/signal_nacl_arm.go b/src/runtime/signal_nacl_arm.go index 959dbfbab1..b8312324ac 100644 --- a/src/runtime/signal_nacl_arm.go +++ b/src/runtime/signal_nacl_arm.go @@ -36,7 +36,7 @@ func (c *sigctxt) lr() uint32 { return c.regs().lr } func (c *sigctxt) pc() uint32 { return c.regs().pc } func (c *sigctxt) cpsr() uint32 { return c.regs().cpsr } -func (c *sigctxt) fault() uint32 { return ^uint32(0) } +func (c *sigctxt) fault() uintptr { return ^uintptr(0) } func (c *sigctxt) trap() uint32 { return ^uint32(0) } func (c *sigctxt) error() uint32 { return ^uint32(0) } func (c *sigctxt) oldmask() uint32 { return ^uint32(0) } diff --git a/src/runtime/signal_netbsd.go b/src/runtime/signal_netbsd.go index 30a3b8e1a9..ca510842e1 100644 --- a/src/runtime/signal_netbsd.go +++ b/src/runtime/signal_netbsd.go @@ -4,11 +4,6 @@ package runtime -type sigTabT struct { - flags int32 - name string -} - var sigtable = [...]sigTabT{ /* 0 */ {0, "SIGNONE: no trap"}, /* 1 */ {_SigNotify + _SigKill, "SIGHUP: terminal line hangup"}, @@ -26,20 +21,20 @@ var sigtable = [...]sigTabT{ /* 13 */ {_SigNotify, "SIGPIPE: write to broken pipe"}, /* 14 */ {_SigNotify, "SIGALRM: alarm clock"}, /* 15 */ {_SigNotify + _SigKill, "SIGTERM: termination"}, - /* 16 */ {_SigNotify, "SIGURG: urgent condition on socket"}, + /* 16 */ {_SigNotify + _SigIgn, "SIGURG: urgent condition on socket"}, /* 17 */ {0, "SIGSTOP: stop"}, - /* 18 */ {_SigNotify + _SigDefault, "SIGTSTP: keyboard stop"}, - /* 19 */ {_SigNotify + _SigDefault, "SIGCONT: continue after stop"}, - /* 20 */ {_SigNotify + _SigUnblock, "SIGCHLD: child status has changed"}, - /* 21 */ {_SigNotify + _SigDefault, "SIGTTIN: 
background read from tty"}, - /* 22 */ {_SigNotify + _SigDefault, "SIGTTOU: background write to tty"}, - /* 23 */ {_SigNotify, "SIGIO: i/o now possible"}, + /* 18 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTSTP: keyboard stop"}, + /* 19 */ {_SigNotify + _SigDefault + _SigIgn, "SIGCONT: continue after stop"}, + /* 20 */ {_SigNotify + _SigUnblock + _SigIgn, "SIGCHLD: child status has changed"}, + /* 21 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTTIN: background read from tty"}, + /* 22 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTTOU: background write to tty"}, + /* 23 */ {_SigNotify + _SigIgn, "SIGIO: i/o now possible"}, /* 24 */ {_SigNotify, "SIGXCPU: cpu limit exceeded"}, /* 25 */ {_SigNotify, "SIGXFSZ: file size limit exceeded"}, /* 26 */ {_SigNotify, "SIGVTALRM: virtual alarm clock"}, /* 27 */ {_SigNotify + _SigUnblock, "SIGPROF: profiling alarm clock"}, - /* 28 */ {_SigNotify, "SIGWINCH: window size change"}, - /* 29 */ {_SigNotify, "SIGINFO: status request from keyboard"}, + /* 28 */ {_SigNotify + _SigIgn, "SIGWINCH: window size change"}, + /* 29 */ {_SigNotify + _SigIgn, "SIGINFO: status request from keyboard"}, /* 30 */ {_SigNotify, "SIGUSR1: user-defined signal 1"}, /* 31 */ {_SigNotify, "SIGUSR2: user-defined signal 2"}, /* 32 */ {_SigNotify, "SIGTHR: reserved"}, diff --git a/src/runtime/signal_netbsd_arm.go b/src/runtime/signal_netbsd_arm.go index 64cfffae5a..fdb30785d9 100644 --- a/src/runtime/signal_netbsd_arm.go +++ b/src/runtime/signal_netbsd_arm.go @@ -36,7 +36,7 @@ func (c *sigctxt) lr() uint32 { return c.regs().__gregs[_REG_R14] } func (c *sigctxt) pc() uint32 { return c.regs().__gregs[_REG_R15] } func (c *sigctxt) cpsr() uint32 { return c.regs().__gregs[_REG_CPSR] } -func (c *sigctxt) fault() uint32 { return uint32(c.info._reason) } +func (c *sigctxt) fault() uintptr { return uintptr(c.info._reason) } func (c *sigctxt) trap() uint32 { return 0 } func (c *sigctxt) error() uint32 { return 0 } func (c *sigctxt) oldmask() uint32 { return 0 } diff --git a/src/runtime/signal_openbsd.go b/src/runtime/signal_openbsd.go index 30a3b8e1a9..99c601ce58 100644 --- a/src/runtime/signal_openbsd.go +++ b/src/runtime/signal_openbsd.go @@ -4,11 +4,6 @@ package runtime -type sigTabT struct { - flags int32 - name string -} - var sigtable = [...]sigTabT{ /* 0 */ {0, "SIGNONE: no trap"}, /* 1 */ {_SigNotify + _SigKill, "SIGHUP: terminal line hangup"}, @@ -26,13 +21,13 @@ var sigtable = [...]sigTabT{ /* 13 */ {_SigNotify, "SIGPIPE: write to broken pipe"}, /* 14 */ {_SigNotify, "SIGALRM: alarm clock"}, /* 15 */ {_SigNotify + _SigKill, "SIGTERM: termination"}, - /* 16 */ {_SigNotify, "SIGURG: urgent condition on socket"}, + /* 16 */ {_SigNotify + _SigIgn, "SIGURG: urgent condition on socket"}, /* 17 */ {0, "SIGSTOP: stop"}, - /* 18 */ {_SigNotify + _SigDefault, "SIGTSTP: keyboard stop"}, - /* 19 */ {_SigNotify + _SigDefault, "SIGCONT: continue after stop"}, - /* 20 */ {_SigNotify + _SigUnblock, "SIGCHLD: child status has changed"}, - /* 21 */ {_SigNotify + _SigDefault, "SIGTTIN: background read from tty"}, - /* 22 */ {_SigNotify + _SigDefault, "SIGTTOU: background write to tty"}, + /* 18 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTSTP: keyboard stop"}, + /* 19 */ {_SigNotify + _SigDefault + _SigIgn, "SIGCONT: continue after stop"}, + /* 20 */ {_SigNotify + _SigUnblock + _SigIgn, "SIGCHLD: child status has changed"}, + /* 21 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTTIN: background read from tty"}, + /* 22 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTTOU: background write to tty"}, /* 23 */ 
{_SigNotify, "SIGIO: i/o now possible"}, /* 24 */ {_SigNotify, "SIGXCPU: cpu limit exceeded"}, /* 25 */ {_SigNotify, "SIGXFSZ: file size limit exceeded"}, diff --git a/src/runtime/signal_openbsd_arm.go b/src/runtime/signal_openbsd_arm.go index 66aea934f1..97bb13b4f3 100644 --- a/src/runtime/signal_openbsd_arm.go +++ b/src/runtime/signal_openbsd_arm.go @@ -38,7 +38,7 @@ func (c *sigctxt) lr() uint32 { return c.regs().sc_usr_lr } func (c *sigctxt) pc() uint32 { return c.regs().sc_pc } func (c *sigctxt) cpsr() uint32 { return c.regs().sc_spsr } -func (c *sigctxt) fault() uint32 { return c.sigaddr() } +func (c *sigctxt) fault() uintptr { return uintptr(c.sigaddr()) } func (c *sigctxt) trap() uint32 { return 0 } func (c *sigctxt) error() uint32 { return 0 } func (c *sigctxt) oldmask() uint32 { return 0 } diff --git a/src/runtime/signal_sighandler.go b/src/runtime/signal_sighandler.go index b2e15a6539..f24a117fcd 100644 --- a/src/runtime/signal_sighandler.go +++ b/src/runtime/signal_sighandler.go @@ -88,9 +88,9 @@ func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) { } print("PC=", hex(c.sigpc()), " m=", _g_.m.id, " sigcode=", c.sigcode(), "\n") - if _g_.m.lockedg != nil && _g_.m.ncgo > 0 && gp == _g_.m.g0 { + if _g_.m.lockedg != 0 && _g_.m.ncgo > 0 && gp == _g_.m.g0 { print("signal arrived during cgo execution\n") - gp = _g_.m.lockedg + gp = _g_.m.lockedg.ptr() } print("\n") @@ -111,7 +111,7 @@ func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) { if docrash { crashing++ - if crashing < sched.mcount-int32(extraMCount) { + if crashing < mcount()-int32(extraMCount) { // There are other m's that need to dump their stacks. // Relay SIGQUIT to the next m by sending it to the current process. // All m's that have already received SIGQUIT have signal masks blocking diff --git a/src/runtime/signal_solaris.go b/src/runtime/signal_solaris.go index c931c222d6..a8eeeee129 100644 --- a/src/runtime/signal_solaris.go +++ b/src/runtime/signal_solaris.go @@ -4,11 +4,6 @@ package runtime -type sigTabT struct { - flags int32 - name string -} - var sigtable = [...]sigTabT{ /* 0 */ {0, "SIGNONE: no trap"}, /* 1 */ {_SigNotify + _SigKill, "SIGHUP: hangup"}, @@ -28,16 +23,16 @@ var sigtable = [...]sigTabT{ /* 15 */ {_SigNotify + _SigKill, "SIGTERM: software termination signal from kill"}, /* 16 */ {_SigNotify, "SIGUSR1: user defined signal 1"}, /* 17 */ {_SigNotify, "SIGUSR2: user defined signal 2"}, - /* 18 */ {_SigNotify + _SigUnblock, "SIGCHLD: child status change alias (POSIX)"}, + /* 18 */ {_SigNotify + _SigUnblock + _SigIgn, "SIGCHLD: child status change alias (POSIX)"}, /* 19 */ {_SigNotify, "SIGPWR: power-fail restart"}, - /* 20 */ {_SigNotify, "SIGWINCH: window size change"}, - /* 21 */ {_SigNotify, "SIGURG: urgent socket condition"}, + /* 20 */ {_SigNotify + _SigIgn, "SIGWINCH: window size change"}, + /* 21 */ {_SigNotify + _SigIgn, "SIGURG: urgent socket condition"}, /* 22 */ {_SigNotify, "SIGPOLL: pollable event occurred"}, /* 23 */ {0, "SIGSTOP: stop (cannot be caught or ignored)"}, - /* 24 */ {_SigNotify + _SigDefault, "SIGTSTP: user stop requested from tty"}, - /* 25 */ {_SigNotify + _SigDefault, "SIGCONT: stopped process has been continued"}, - /* 26 */ {_SigNotify + _SigDefault, "SIGTTIN: background tty read attempted"}, - /* 27 */ {_SigNotify + _SigDefault, "SIGTTOU: background tty write attempted"}, + /* 24 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTSTP: user stop requested from tty"}, + /* 25 */ {_SigNotify + _SigDefault + _SigIgn, "SIGCONT: stopped process 
has been continued"}, + /* 26 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTTIN: background tty read attempted"}, + /* 27 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTTOU: background tty write attempted"}, /* 28 */ {_SigNotify, "SIGVTALRM: virtual timer expired"}, /* 29 */ {_SigNotify + _SigUnblock, "SIGPROF: profiling timer expired"}, /* 30 */ {_SigNotify, "SIGXCPU: exceeded cpu limit"}, diff --git a/src/runtime/signal_unix.go b/src/runtime/signal_unix.go index 539b165ba1..2cd3d71800 100644 --- a/src/runtime/signal_unix.go +++ b/src/runtime/signal_unix.go @@ -8,10 +8,19 @@ package runtime import ( "runtime/internal/atomic" - "runtime/internal/sys" "unsafe" ) +// sigTabT is the type of an entry in the global sigtable array. +// sigtable is inherently system dependent, and appears in OS-specific files, +// but sigTabT is the same for all Unixy systems. +// The sigtable array is indexed by a system signal number to get the flags +// and printable name of each signal. +type sigTabT struct { + flags int32 + name string +} + //go:linkname os_sigpipe os.sigpipe func os_sigpipe() { systemstack(sigpipe) @@ -266,6 +275,12 @@ func sigpipe() { // sigtrampgo is called from the signal handler function, sigtramp, // written in assembly code. // This is called by the signal handler, and the world may be stopped. +// +// It must be nosplit because getg() is still the G that was running +// (if any) when the signal was delivered, but it's (usually) called +// on the gsignal stack. Until this switches the G to gsignal, the +// stack bounds check won't work. +// //go:nosplit //go:nowritebarrierrec func sigtrampgo(sig uint32, info *siginfo, ctx unsafe.Pointer) { @@ -395,8 +410,9 @@ func sigpanic() { //go:nosplit //go:nowritebarrierrec func dieFromSignal(sig uint32) { - setsig(sig, _SIG_DFL) unblocksig(sig) + // Mark the signal as unhandled to ensure it is forwarded. + atomic.Store(&handlingSig[sig], 0) raise(sig) // That should have killed us. On some systems, though, raise @@ -408,6 +424,14 @@ func dieFromSignal(sig uint32) { osyield() osyield() + // If that didn't work, try _SIG_DFL. + setsig(sig, _SIG_DFL) + raise(sig) + + osyield() + osyield() + osyield() + // If we are still somehow running, just exit with the wrong status. exit(2) } @@ -474,7 +498,7 @@ func crash() { // this means the OS X core file will be >128 GB and even on a zippy // workstation can take OS X well over an hour to write (uninterruptible). // Save users from making that mistake. - if sys.PtrSize == 8 { + if GOARCH == "amd64" { return } } @@ -502,7 +526,7 @@ func ensureSigM() { // mask accordingly. sigBlocked := sigset_all for i := range sigtable { - if sigtable[i].flags&_SigUnblock != 0 { + if !blockableSig(uint32(i)) { sigdelset(&sigBlocked, i) } } @@ -514,7 +538,7 @@ func ensureSigM() { sigdelset(&sigBlocked, int(sig)) } case sig := <-disableSigChan: - if sig > 0 { + if sig > 0 && blockableSig(sig) { sigaddset(&sigBlocked, int(sig)) } } @@ -578,17 +602,23 @@ func sigfwdgo(sig uint32, info *siginfo, ctx unsafe.Pointer) bool { return false } fwdFn := atomic.Loaduintptr(&fwdSig[sig]) + flags := sigtable[sig].flags - if !signalsOK { - // The only way we can get here is if we are in a - // library or archive, we installed a signal handler - // at program startup, but the Go runtime has not yet - // been initialized. + // If we aren't handling the signal, forward it. + if atomic.Load(&handlingSig[sig]) == 0 || !signalsOK { + // If the signal is ignored, doing nothing is the same as forwarding. 
+ if fwdFn == _SIG_IGN || (fwdFn == _SIG_DFL && flags&_SigIgn != 0) { + return true + } + // We are not handling the signal and there is no other handler to forward to. + // Crash with the default behavior. if fwdFn == _SIG_DFL { + setsig(sig, _SIG_DFL) dieFromSignal(sig) - } else { - sigfwd(fwdFn, sig, info, ctx) + return false } + + sigfwd(fwdFn, sig, info, ctx) return true } @@ -597,18 +627,6 @@ func sigfwdgo(sig uint32, info *siginfo, ctx unsafe.Pointer) bool { return false } - // If we aren't handling the signal, forward it. - // Really if we aren't handling the signal, we shouldn't get here, - // but on Darwin setsigstack can lead us here because it sets - // the sa_tramp field. The sa_tramp field is not returned by - // sigaction, so the fix for that is non-obvious. - if atomic.Load(&handlingSig[sig]) == 0 { - sigfwd(fwdFn, sig, info, ctx) - return true - } - - flags := sigtable[sig].flags - c := &sigctxt{info, ctx} // Only forward synchronous signals and SIGPIPE. // Unfortunately, user generated SIGPIPEs will also be forwarded, because si_code @@ -702,7 +720,7 @@ func minitSignalStack() { signalstack(&_g_.m.gsignal.stack) _g_.m.newSigstack = true } else { - setGsignalStack(&st, nil) + setGsignalStack(&st, &_g_.m.goSigStack) _g_.m.newSigstack = false } } @@ -718,7 +736,7 @@ func minitSignalStack() { func minitSignalMask() { nmask := getg().m.sigmask for i := range sigtable { - if sigtable[i].flags&_SigUnblock != 0 { + if !blockableSig(uint32(i)) { sigdelset(&nmask, i) } } @@ -732,7 +750,34 @@ func unminitSignals() { if getg().m.newSigstack { st := stackt{ss_flags: _SS_DISABLE} sigaltstack(&st, nil) + } else { + // We got the signal stack from someone else. Restore + // the Go-allocated stack in case this M gets reused + // for another thread (e.g., it's an extram). Also, on + // Android, libc allocates a signal stack for all + // threads, so it's important to restore the Go stack + // even on Go-created threads so we can free it. + restoreGsignalStack(&getg().m.goSigStack) + } +} + +// blockableSig returns whether sig may be blocked by the signal mask. +// We never want to block the signals marked _SigUnblock; +// these are the synchronous signals that turn into a Go panic. +// In a Go program--not a c-archive/c-shared--we never want to block +// the signals marked _SigKill or _SigThrow, as otherwise it's possible +// for all running threads to block them and delay their delivery until +// we start a new thread. When linked into a C program we let the C code +// decide on the disposition of those signals. 
+func blockableSig(sig uint32) bool { + flags := sigtable[sig].flags + if flags&_SigUnblock != 0 { + return false + } + if isarchive || islibrary { + return true } + return flags&(_SigKill|_SigThrow) == 0 } // gsignalStack saves the fields of the gsignal stack changed by diff --git a/src/runtime/signal_windows.go b/src/runtime/signal_windows.go index 73bd5b5cfc..7d230517f6 100644 --- a/src/runtime/signal_windows.go +++ b/src/runtime/signal_windows.go @@ -126,11 +126,11 @@ func lastcontinuehandler(info *exceptionrecord, r *context, gp *g) int32 { print("Exception ", hex(info.exceptioncode), " ", hex(info.exceptioninformation[0]), " ", hex(info.exceptioninformation[1]), " ", hex(r.ip()), "\n") print("PC=", hex(r.ip()), "\n") - if _g_.m.lockedg != nil && _g_.m.ncgo > 0 && gp == _g_.m.g0 { + if _g_.m.lockedg != 0 && _g_.m.ncgo > 0 && gp == _g_.m.g0 { if iscgo { print("signal arrived during external code execution\n") } - gp = _g_.m.lockedg + gp = _g_.m.lockedg.ptr() } print("\n") @@ -223,3 +223,6 @@ func crash() { // It's okay to leave this empty for now: if crash returns // the ordinary exit-after-panic happens. } + +// gsignalStack is unused on Windows. +type gsignalStack struct{} diff --git a/src/runtime/sigqueue.go b/src/runtime/sigqueue.go index 236bb29929..98331627eb 100644 --- a/src/runtime/sigqueue.go +++ b/src/runtime/sigqueue.go @@ -45,13 +45,14 @@ import ( // as there is no connection between handling a signal and receiving one, // but atomic instructions should minimize it. var sig struct { - note note - mask [(_NSIG + 31) / 32]uint32 - wanted [(_NSIG + 31) / 32]uint32 - ignored [(_NSIG + 31) / 32]uint32 - recv [(_NSIG + 31) / 32]uint32 - state uint32 - inuse bool + note note + mask [(_NSIG + 31) / 32]uint32 + wanted [(_NSIG + 31) / 32]uint32 + ignored [(_NSIG + 31) / 32]uint32 + recv [(_NSIG + 31) / 32]uint32 + state uint32 + delivering uint32 + inuse bool } const ( @@ -60,15 +61,20 @@ const ( sigSending ) -// Called from sighandler to send a signal back out of the signal handling thread. -// Reports whether the signal was sent. If not, the caller typically crashes the program. +// sigsend delivers a signal from sighandler to the internal signal delivery queue. +// It reports whether the signal was sent. If not, the caller typically crashes the program. +// It runs from the signal handler, so it's limited in what it can do. func sigsend(s uint32) bool { bit := uint32(1) << uint(s&31) if !sig.inuse || s >= uint32(32*len(sig.wanted)) { return false } + atomic.Xadd(&sig.delivering, 1) + // We are running in the signal handler; defer is not available. + if w := atomic.Load(&sig.wanted[s/32]); w&bit == 0 { + atomic.Xadd(&sig.delivering, -1) return false } @@ -76,6 +82,7 @@ func sigsend(s uint32) bool { for { mask := sig.mask[s/32] if mask&bit != 0 { + atomic.Xadd(&sig.delivering, -1) return true // signal already in queue } if atomic.Cas(&sig.mask[s/32], mask, mask|bit) { @@ -104,6 +111,7 @@ Send: } } + atomic.Xadd(&sig.delivering, -1) return true } @@ -155,6 +163,15 @@ func signal_recv() uint32 { // by the os/signal package. //go:linkname signalWaitUntilIdle os/signal.signalWaitUntilIdle func signalWaitUntilIdle() { + // Although the signals we care about have been removed from + // sig.wanted, it is possible that another thread has received + // a signal, has read from sig.wanted, is now updating sig.mask, + // and has not yet woken up the processor thread. We need to wait + // until all current signal deliveries have completed. 
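As an aside, the in-flight counter protocol added here can be sketched outside the runtime roughly as follows; delivering, deliver, and waitIdle are illustrative names for this sketch, and the real sigsend decrements explicitly rather than with defer because it runs inside the signal handler:

package main

import (
	"runtime"
	"sync/atomic"
)

// delivering counts signal deliveries that have started but not yet finished.
var delivering uint32

// deliver models sigsend: bump the counter for the duration of one delivery.
func deliver(enqueue func()) {
	atomic.AddUint32(&delivering, 1)
	enqueue()
	atomic.AddUint32(&delivering, ^uint32(0)) // decrement; no defer in a signal handler
}

// waitIdle models the wait added to signalWaitUntilIdle: yield until every
// in-flight delivery has completed.
func waitIdle() {
	for atomic.LoadUint32(&delivering) != 0 {
		runtime.Gosched()
	}
}

func main() {
	deliver(func() {})
	waitIdle()
}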
+ for atomic.Load(&sig.delivering) != 0 { + Gosched() + } + // Although WaitUntilIdle seems like the right name for this // function, the state we are looking for is sigReceiving, not // sigIdle. The sigIdle state is really more like sigProcessing. diff --git a/src/runtime/sigtab_linux_generic.go b/src/runtime/sigtab_linux_generic.go index 874148e1d2..b26040b803 100644 --- a/src/runtime/sigtab_linux_generic.go +++ b/src/runtime/sigtab_linux_generic.go @@ -10,11 +10,6 @@ package runtime -type sigTabT struct { - flags int32 - name string -} - var sigtable = [...]sigTabT{ /* 0 */ {0, "SIGNONE: no trap"}, /* 1 */ {_SigNotify + _SigKill, "SIGHUP: terminal line hangup"}, @@ -33,18 +28,18 @@ var sigtable = [...]sigTabT{ /* 14 */ {_SigNotify, "SIGALRM: alarm clock"}, /* 15 */ {_SigNotify + _SigKill, "SIGTERM: termination"}, /* 16 */ {_SigThrow + _SigUnblock, "SIGSTKFLT: stack fault"}, - /* 17 */ {_SigNotify + _SigUnblock, "SIGCHLD: child status has changed"}, - /* 18 */ {_SigNotify + _SigDefault, "SIGCONT: continue"}, + /* 17 */ {_SigNotify + _SigUnblock + _SigIgn, "SIGCHLD: child status has changed"}, + /* 18 */ {_SigNotify + _SigDefault + _SigIgn, "SIGCONT: continue"}, /* 19 */ {0, "SIGSTOP: stop, unblockable"}, - /* 20 */ {_SigNotify + _SigDefault, "SIGTSTP: keyboard stop"}, - /* 21 */ {_SigNotify + _SigDefault, "SIGTTIN: background read from tty"}, - /* 22 */ {_SigNotify + _SigDefault, "SIGTTOU: background write to tty"}, - /* 23 */ {_SigNotify, "SIGURG: urgent condition on socket"}, + /* 20 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTSTP: keyboard stop"}, + /* 21 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTTIN: background read from tty"}, + /* 22 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTTOU: background write to tty"}, + /* 23 */ {_SigNotify + _SigIgn, "SIGURG: urgent condition on socket"}, /* 24 */ {_SigNotify, "SIGXCPU: cpu limit exceeded"}, /* 25 */ {_SigNotify, "SIGXFSZ: file size limit exceeded"}, /* 26 */ {_SigNotify, "SIGVTALRM: virtual alarm clock"}, /* 27 */ {_SigNotify + _SigUnblock, "SIGPROF: profiling alarm clock"}, - /* 28 */ {_SigNotify, "SIGWINCH: window size change"}, + /* 28 */ {_SigNotify + _SigIgn, "SIGWINCH: window size change"}, /* 29 */ {_SigNotify, "SIGIO: i/o now possible"}, /* 30 */ {_SigNotify, "SIGPWR: power failure restart"}, /* 31 */ {_SigThrow, "SIGSYS: bad system call"}, diff --git a/src/runtime/sigtab_linux_mipsx.go b/src/runtime/sigtab_linux_mipsx.go index 8d9fb06704..81dd2314c5 100644 --- a/src/runtime/sigtab_linux_mipsx.go +++ b/src/runtime/sigtab_linux_mipsx.go @@ -7,11 +7,6 @@ package runtime -type sigTabT struct { - flags int32 - name string -} - var sigtable = [...]sigTabT{ /* 0 */ {0, "SIGNONE: no trap"}, /* 1 */ {_SigNotify + _SigKill, "SIGHUP: terminal line hangup"}, @@ -31,16 +26,16 @@ var sigtable = [...]sigTabT{ /* 15 */ {_SigNotify + _SigKill, "SIGTERM: termination"}, /* 16 */ {_SigNotify, "SIGUSR1: user-defined signal 1"}, /* 17 */ {_SigNotify, "SIGUSR2: user-defined signal 2"}, - /* 18 */ {_SigNotify + _SigUnblock, "SIGCHLD: child status has changed"}, + /* 18 */ {_SigNotify + _SigUnblock + _SigIgn, "SIGCHLD: child status has changed"}, /* 19 */ {_SigNotify, "SIGPWR: power failure restart"}, - /* 20 */ {_SigNotify, "SIGWINCH: window size change"}, - /* 21 */ {_SigNotify, "SIGURG: urgent condition on socket"}, + /* 20 */ {_SigNotify + _SigIgn, "SIGWINCH: window size change"}, + /* 21 */ {_SigNotify + _SigIgn, "SIGURG: urgent condition on socket"}, /* 22 */ {_SigNotify, "SIGIO: i/o now possible"}, /* 23 */ {0, "SIGSTOP: stop, unblockable"}, - 
/* 24 */ {_SigNotify + _SigDefault, "SIGTSTP: keyboard stop"}, - /* 25 */ {_SigNotify + _SigDefault, "SIGCONT: continue"}, - /* 26 */ {_SigNotify + _SigDefault, "SIGTTIN: background read from tty"}, - /* 27 */ {_SigNotify + _SigDefault, "SIGTTOU: background write to tty"}, + /* 24 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTSTP: keyboard stop"}, + /* 25 */ {_SigNotify + _SigDefault + _SigIgn, "SIGCONT: continue"}, + /* 26 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTTIN: background read from tty"}, + /* 27 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTTOU: background write to tty"}, /* 28 */ {_SigNotify, "SIGVTALRM: virtual alarm clock"}, /* 29 */ {_SigNotify + _SigUnblock, "SIGPROF: profiling alarm clock"}, /* 30 */ {_SigNotify, "SIGXCPU: cpu limit exceeded"}, diff --git a/src/runtime/sizeclasses.go b/src/runtime/sizeclasses.go index 5366564afd..9e17b001d3 100644 --- a/src/runtime/sizeclasses.go +++ b/src/runtime/sizeclasses.go @@ -3,73 +3,73 @@ package runtime -// class bytes/obj bytes/span objects waste bytes -// 1 8 8192 1024 0 -// 2 16 8192 512 0 -// 3 32 8192 256 0 -// 4 48 8192 170 32 -// 5 64 8192 128 0 -// 6 80 8192 102 32 -// 7 96 8192 85 32 -// 8 112 8192 73 16 -// 9 128 8192 64 0 -// 10 144 8192 56 128 -// 11 160 8192 51 32 -// 12 176 8192 46 96 -// 13 192 8192 42 128 -// 14 208 8192 39 80 -// 15 224 8192 36 128 -// 16 240 8192 34 32 -// 17 256 8192 32 0 -// 18 288 8192 28 128 -// 19 320 8192 25 192 -// 20 352 8192 23 96 -// 21 384 8192 21 128 -// 22 416 8192 19 288 -// 23 448 8192 18 128 -// 24 480 8192 17 32 -// 25 512 8192 16 0 -// 26 576 8192 14 128 -// 27 640 8192 12 512 -// 28 704 8192 11 448 -// 29 768 8192 10 512 -// 30 896 8192 9 128 -// 31 1024 8192 8 0 -// 32 1152 8192 7 128 -// 33 1280 8192 6 512 -// 34 1408 16384 11 896 -// 35 1536 8192 5 512 -// 36 1792 16384 9 256 -// 37 2048 8192 4 0 -// 38 2304 16384 7 256 -// 39 2688 8192 3 128 -// 40 3072 24576 8 0 -// 41 3200 16384 5 384 -// 42 3456 24576 7 384 -// 43 4096 8192 2 0 -// 44 4864 24576 5 256 -// 45 5376 16384 3 256 -// 46 6144 24576 4 0 -// 47 6528 32768 5 128 -// 48 6784 40960 6 256 -// 49 6912 49152 7 768 -// 50 8192 8192 1 0 -// 51 9472 57344 6 512 -// 52 9728 49152 5 512 -// 53 10240 40960 4 0 -// 54 10880 32768 3 128 -// 55 12288 24576 2 0 -// 56 13568 40960 3 256 -// 57 14336 57344 4 0 -// 58 16384 16384 1 0 -// 59 18432 73728 4 0 -// 60 19072 57344 3 128 -// 61 20480 40960 2 0 -// 62 21760 65536 3 256 -// 63 24576 24576 1 0 -// 64 27264 81920 3 128 -// 65 28672 57344 2 0 -// 66 32768 32768 1 0 +// class bytes/obj bytes/span objects tail waste max waste +// 1 8 8192 1024 0 87.50% +// 2 16 8192 512 0 43.75% +// 3 32 8192 256 0 46.88% +// 4 48 8192 170 32 31.52% +// 5 64 8192 128 0 23.44% +// 6 80 8192 102 32 19.07% +// 7 96 8192 85 32 15.95% +// 8 112 8192 73 16 13.56% +// 9 128 8192 64 0 11.72% +// 10 144 8192 56 128 11.82% +// 11 160 8192 51 32 9.73% +// 12 176 8192 46 96 9.59% +// 13 192 8192 42 128 9.25% +// 14 208 8192 39 80 8.12% +// 15 224 8192 36 128 8.15% +// 16 240 8192 34 32 6.62% +// 17 256 8192 32 0 5.86% +// 18 288 8192 28 128 12.16% +// 19 320 8192 25 192 11.80% +// 20 352 8192 23 96 9.88% +// 21 384 8192 21 128 9.51% +// 22 416 8192 19 288 10.71% +// 23 448 8192 18 128 8.37% +// 24 480 8192 17 32 6.82% +// 25 512 8192 16 0 6.05% +// 26 576 8192 14 128 12.33% +// 27 640 8192 12 512 15.48% +// 28 704 8192 11 448 13.93% +// 29 768 8192 10 512 13.94% +// 30 896 8192 9 128 15.52% +// 31 1024 8192 8 0 12.40% +// 32 1152 8192 7 128 12.41% +// 33 1280 8192 6 512 15.55% +// 34 1408 16384 11 896 
14.00% +// 35 1536 8192 5 512 14.00% +// 36 1792 16384 9 256 15.57% +// 37 2048 8192 4 0 12.45% +// 38 2304 16384 7 256 12.46% +// 39 2688 8192 3 128 15.59% +// 40 3072 24576 8 0 12.47% +// 41 3200 16384 5 384 6.22% +// 42 3456 24576 7 384 8.83% +// 43 4096 8192 2 0 15.60% +// 44 4864 24576 5 256 16.65% +// 45 5376 16384 3 256 10.92% +// 46 6144 24576 4 0 12.48% +// 47 6528 32768 5 128 6.23% +// 48 6784 40960 6 256 4.36% +// 49 6912 49152 7 768 3.37% +// 50 8192 8192 1 0 15.61% +// 51 9472 57344 6 512 14.28% +// 52 9728 49152 5 512 3.64% +// 53 10240 40960 4 0 4.99% +// 54 10880 32768 3 128 6.24% +// 55 12288 24576 2 0 11.45% +// 56 13568 40960 3 256 9.99% +// 57 14336 57344 4 0 5.35% +// 58 16384 16384 1 0 12.49% +// 59 18432 73728 4 0 11.11% +// 60 19072 57344 3 128 3.57% +// 61 20480 40960 2 0 6.87% +// 62 21760 65536 3 256 6.25% +// 63 24576 24576 1 0 11.45% +// 64 27264 81920 3 128 10.00% +// 65 28672 57344 2 0 4.91% +// 66 32768 32768 1 0 12.50% const ( _MaxSmallSize = 32768 diff --git a/src/runtime/slice.go b/src/runtime/slice.go index 0f49df1647..351fec067d 100644 --- a/src/runtime/slice.go +++ b/src/runtime/slice.go @@ -14,6 +14,13 @@ type slice struct { cap int } +// An notInHeapSlice is a slice backed by go:notinheap memory. +type notInHeapSlice struct { + array *notInHeap + len int + cap int +} + // maxElems is a lookup table containing the maximum capacity for a slice. // The index is the size of the slice element. var maxElems = [...]uintptr{ @@ -81,7 +88,7 @@ func makeslice64(et *_type, len64, cap64 int64) slice { // The SSA backend might prefer the new length or to return only ptr/cap and save stack space. func growslice(et *_type, old slice, cap int) slice { if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&et)) + callerpc := getcallerpc() racereadrangepc(old.array, uintptr(old.len*int(et.size)), callerpc, funcPC(growslice)) } if msanenabled { @@ -105,12 +112,20 @@ func growslice(et *_type, old slice, cap int) slice { if old.len < 1024 { newcap = doublecap } else { - for newcap < cap { + // Check 0 < newcap to detect overflow + // and prevent an infinite loop. + for 0 < newcap && newcap < cap { newcap += newcap / 4 } + // Set newcap to the requested cap when + // the newcap calculation overflowed. 
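For reference, a minimal sketch of the capacity policy this hunk implements; growCap is a hypothetical stand-in, and the real growslice additionally rounds the result up to a size class (roundupsize) and rejects capacities beyond _MaxMem:

package main

import "fmt"

// growCap mirrors the append growth rule above: double small slices,
// grow large ones by about 1.25x, and fall back to the requested cap
// if the multiply-by-1.25 loop overflows int.
func growCap(oldLen, oldCap, want int) int {
	newcap := oldCap
	doublecap := newcap + newcap
	if want > doublecap {
		return want
	}
	if oldLen < 1024 {
		return doublecap
	}
	// 0 < newcap detects overflow and stops the loop from spinning forever.
	for 0 < newcap && newcap < want {
		newcap += newcap / 4
	}
	if newcap <= 0 {
		return want
	}
	return newcap
}

func main() {
	fmt.Println(growCap(9, 9, 10))         // 18: below 1024, capacity doubles
	fmt.Println(growCap(2000, 2000, 2001)) // 2500: grows by about 1.25x
}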
+ if newcap <= 0 { + newcap = cap + } } } + var overflow bool var lenmem, newlenmem, capmem uintptr const ptrSize = unsafe.Sizeof((*byte)(nil)) switch et.size { @@ -118,20 +133,37 @@ func growslice(et *_type, old slice, cap int) slice { lenmem = uintptr(old.len) newlenmem = uintptr(cap) capmem = roundupsize(uintptr(newcap)) + overflow = uintptr(newcap) > _MaxMem newcap = int(capmem) case ptrSize: lenmem = uintptr(old.len) * ptrSize newlenmem = uintptr(cap) * ptrSize capmem = roundupsize(uintptr(newcap) * ptrSize) + overflow = uintptr(newcap) > _MaxMem/ptrSize newcap = int(capmem / ptrSize) default: lenmem = uintptr(old.len) * et.size newlenmem = uintptr(cap) * et.size capmem = roundupsize(uintptr(newcap) * et.size) + overflow = uintptr(newcap) > maxSliceCap(et.size) newcap = int(capmem / et.size) } - if cap < old.cap || uintptr(newcap) > maxSliceCap(et.size) { + // The check of overflow (uintptr(newcap) > maxSliceCap(et.size)) + // in addition to capmem > _MaxMem is needed to prevent an overflow + // which can be used to trigger a segfault on 32bit architectures + // with this example program: + // + // type T [1<<27 + 1]int64 + // + // var d T + // var s []T + // + // func main() { + // s = append(s, d, d, d, d) + // print(len(s), "\n") + // } + if cap < old.cap || overflow || capmem > _MaxMem { panic(errorString("growslice: cap out of range")) } @@ -172,7 +204,7 @@ func slicecopy(to, fm slice, width uintptr) int { } if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&to)) + callerpc := getcallerpc() pc := funcPC(slicecopy) racewriterangepc(to.array, uintptr(n*int(width)), callerpc, pc) racereadrangepc(fm.array, uintptr(n*int(width)), callerpc, pc) @@ -203,7 +235,7 @@ func slicestringcopy(to []byte, fm string) int { } if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&to)) + callerpc := getcallerpc() pc := funcPC(slicestringcopy) racewriterangepc(unsafe.Pointer(&to[0]), uintptr(n), callerpc, pc) } diff --git a/src/runtime/softfloat64.go b/src/runtime/softfloat64.go index 1678e8f9f1..8fde0feddc 100644 --- a/src/runtime/softfloat64.go +++ b/src/runtime/softfloat64.go @@ -483,3 +483,115 @@ again2: return q1*b + q0, (un21*b + un0 - q0*v) >> s } + +func fadd32(x, y uint32) uint32 { + return f64to32(fadd64(f32to64(x), f32to64(y))) +} + +func fmul32(x, y uint32) uint32 { + return f64to32(fmul64(f32to64(x), f32to64(y))) +} + +func fdiv32(x, y uint32) uint32 { + return f64to32(fdiv64(f32to64(x), f32to64(y))) +} + +func feq32(x, y uint32) bool { + cmp, nan := fcmp64(f32to64(x), f32to64(y)) + return cmp == 0 && !nan +} + +func fgt32(x, y uint32) bool { + cmp, nan := fcmp64(f32to64(x), f32to64(y)) + return cmp >= 1 && !nan +} + +func fge32(x, y uint32) bool { + cmp, nan := fcmp64(f32to64(x), f32to64(y)) + return cmp >= 0 && !nan +} + +func feq64(x, y uint64) bool { + cmp, nan := fcmp64(x, y) + return cmp == 0 && !nan +} + +func fgt64(x, y uint64) bool { + cmp, nan := fcmp64(x, y) + return cmp >= 1 && !nan +} + +func fge64(x, y uint64) bool { + cmp, nan := fcmp64(x, y) + return cmp >= 0 && !nan +} + +func fint32to32(x int32) uint32 { + return f64to32(fintto64(int64(x))) +} + +func fint32to64(x int32) uint64 { + return fintto64(int64(x)) +} + +func fint64to32(x int64) uint32 { + return f64to32(fintto64(x)) +} + +func fint64to64(x int64) uint64 { + return fintto64(x) +} + +func f32toint32(x uint32) int32 { + val, _ := f64toint(f32to64(x)) + return int32(val) +} + +func f32toint64(x uint32) int64 { + val, _ := f64toint(f32to64(x)) + return val +} + +func f64toint32(x uint64) int32 { + val, 
_ := f64toint(x) + return int32(val) +} + +func f64toint64(x uint64) int64 { + val, _ := f64toint(x) + return val +} + +func f64touint64(x float64) uint64 { + if x < float64(1<<63) { + return uint64(int64(x)) + } + y := x - float64(1<<63) + z := uint64(int64(y)) + return z | (1 << 63) +} + +func f32touint64(x float32) uint64 { + if x < float32(1<<63) { + return uint64(int64(x)) + } + y := x - float32(1<<63) + z := uint64(int64(y)) + return z | (1 << 63) +} + +func fuint64to64(x uint64) float64 { + if int64(x) >= 0 { + return float64(int64(x)) + } + // See ../cmd/compile/internal/gc/ssa.go:uint64Tofloat + y := x & 1 + z := x >> 1 + z = z | y + r := float64(int64(z)) + return r + r +} + +func fuint64to32(x uint64) float32 { + return float32(fuint64to64(x)) +} diff --git a/src/runtime/stack.go b/src/runtime/stack.go index 525d0b14c1..eb0716c18d 100644 --- a/src/runtime/stack.go +++ b/src/runtime/stack.go @@ -578,29 +578,30 @@ func adjustpointers(scanp unsafe.Pointer, cbv *bitvector, adjinfo *adjustinfo, f if stackDebug >= 4 { print(" ", add(scanp, i*sys.PtrSize), ":", ptrnames[ptrbit(&bv, i)], ":", hex(*(*uintptr)(add(scanp, i*sys.PtrSize))), " # ", i, " ", bv.bytedata[i/8], "\n") } - if ptrbit(&bv, i) == 1 { - pp := (*uintptr)(add(scanp, i*sys.PtrSize)) - retry: - p := *pp - if f.valid() && 0 < p && p < minLegalPointer && debug.invalidptr != 0 { - // Looks like a junk value in a pointer slot. - // Live analysis wrong? - getg().m.traceback = 2 - print("runtime: bad pointer in frame ", funcname(f), " at ", pp, ": ", hex(p), "\n") - throw("invalid pointer found on stack") + if ptrbit(&bv, i) != 1 { + continue + } + pp := (*uintptr)(add(scanp, i*sys.PtrSize)) + retry: + p := *pp + if f.valid() && 0 < p && p < minLegalPointer && debug.invalidptr != 0 { + // Looks like a junk value in a pointer slot. + // Live analysis wrong? + getg().m.traceback = 2 + print("runtime: bad pointer in frame ", funcname(f), " at ", pp, ": ", hex(p), "\n") + throw("invalid pointer found on stack") + } + if minp <= p && p < maxp { + if stackDebug >= 3 { + print("adjust ptr ", hex(p), " ", funcname(f), "\n") } - if minp <= p && p < maxp { - if stackDebug >= 3 { - print("adjust ptr ", hex(p), " ", funcname(f), "\n") - } - if useCAS { - ppu := (*unsafe.Pointer)(unsafe.Pointer(pp)) - if !atomic.Casp1(ppu, unsafe.Pointer(p), unsafe.Pointer(p+delta)) { - goto retry - } - } else { - *pp = p + delta + if useCAS { + ppu := (*unsafe.Pointer)(unsafe.Pointer(pp)) + if !atomic.Casp1(ppu, unsafe.Pointer(p), unsafe.Pointer(p+delta)) { + goto retry } + } else { + *pp = p + delta } } } @@ -751,7 +752,6 @@ func adjustsudogs(gp *g, adjinfo *adjustinfo) { // might be in the stack. for s := gp.waiting; s != nil; s = s.waitlink { adjustpointer(adjinfo, unsafe.Pointer(&s.elem)) - adjustpointer(adjinfo, unsafe.Pointer(&s.selectdone)) } } @@ -768,10 +768,6 @@ func findsghi(gp *g, stk stack) uintptr { if stk.lo <= p && p < stk.hi && p > sghi { sghi = p } - p = uintptr(unsafe.Pointer(sg.selectdone)) + unsafe.Sizeof(sg.selectdone) - if stk.lo <= p && p < stk.hi && p > sghi { - sghi = p - } } return sghi } @@ -917,9 +913,12 @@ func round2(x int32) int32 { // g->atomicstatus will be Grunning or Gscanrunning upon entry. // If the GC is trying to stop this g then it will set preemptscan to true. // -// ctxt is the value of the context register on morestack. newstack -// will write it to g.sched.ctxt. 
-func newstack(ctxt unsafe.Pointer) { +// This must be nowritebarrierrec because it can be called as part of +// stack growth from other nowritebarrierrec functions, but the +// compiler doesn't check this. +// +//go:nowritebarrierrec +func newstack() { thisg := getg() // TODO: double check all gp. shouldn't be getg(). if thisg.m.morebuf.g.ptr().stackguard0 == stackFork { @@ -933,9 +932,6 @@ func newstack(ctxt unsafe.Pointer) { } gp := thisg.m.curg - // Write ctxt to gp.sched. We do this here instead of in - // morestack so it has the necessary write barrier. - gp.sched.ctxt = ctxt if thisg.m.curg.throwsplit { // Update syscallsp, syscallpc in case traceback uses them. @@ -946,6 +942,7 @@ func newstack(ctxt unsafe.Pointer) { "\tmorebuf={pc:", hex(morebuf.pc), " sp:", hex(morebuf.sp), " lr:", hex(morebuf.lr), "}\n", "\tsched={pc:", hex(gp.sched.pc), " sp:", hex(gp.sched.sp), " lr:", hex(gp.sched.lr), " ctxt:", gp.sched.ctxt, "}\n") + thisg.m.traceback = 2 // Include runtime frames traceback(morebuf.pc, morebuf.sp, morebuf.lr, gp) throw("runtime: stack split at bad time") } diff --git a/src/runtime/stack_test.go b/src/runtime/stack_test.go index 25e8f77da4..0fed241704 100644 --- a/src/runtime/stack_test.go +++ b/src/runtime/stack_test.go @@ -5,6 +5,9 @@ package runtime_test import ( + "bytes" + "fmt" + "reflect" . "runtime" "strings" "sync" @@ -78,10 +81,13 @@ func TestStackGrowth(t *testing.T) { var wg sync.WaitGroup // in a normal goroutine + var growDuration time.Duration // For debugging failures wg.Add(1) go func() { defer wg.Done() - growStack() + start := time.Now() + growStack(nil) + growDuration = time.Since(start) }() wg.Wait() @@ -90,7 +96,7 @@ func TestStackGrowth(t *testing.T) { go func() { defer wg.Done() LockOSThread() - growStack() + growStack(nil) UnlockOSThread() }() wg.Wait() @@ -100,12 +106,14 @@ func TestStackGrowth(t *testing.T) { go func() { defer wg.Done() done := make(chan bool) - var started uint32 + var startTime time.Time + var started, progress uint32 go func() { s := new(string) SetFinalizer(s, func(ss *string) { + startTime = time.Now() atomic.StoreUint32(&started, 1) - growStack() + growStack(&progress) done <- true }) s = nil @@ -118,7 +126,10 @@ func TestStackGrowth(t *testing.T) { case <-time.After(20 * time.Second): if atomic.LoadUint32(&started) == 0 { t.Log("finalizer did not start") + } else { + t.Logf("finalizer started %s ago and finished %d iterations", time.Since(startTime), atomic.LoadUint32(&progress)) } + t.Log("first growStack took", growDuration) t.Error("finalizer did not run") return } @@ -131,7 +142,7 @@ func TestStackGrowth(t *testing.T) { // growStack() //} -func growStack() { +func growStack(progress *uint32) { n := 1 << 10 if testing.Short() { n = 1 << 8 @@ -142,6 +153,9 @@ func growStack() { if x != i+1 { panic("stack is corrupted") } + if progress != nil { + atomic.StoreUint32(progress, uint32(i)) + } } GC() } @@ -231,7 +245,7 @@ func TestDeferPtrs(t *testing.T) { } }() defer set(&y, 42) - growStack() + growStack(nil) } type bigBuf [4 * 1024]byte @@ -627,3 +641,169 @@ func count23(n int) int { } return 1 + count1(n-1) } + +type structWithMethod struct{} + +func (s structWithMethod) caller() string { + _, file, line, ok := Caller(1) + if !ok { + panic("Caller failed") + } + return fmt.Sprintf("%s:%d", file, line) +} + +func (s structWithMethod) callers() []uintptr { + pc := make([]uintptr, 16) + return pc[:Callers(0, pc)] +} + +func (s structWithMethod) stack() string { + buf := make([]byte, 4<<10) + return string(buf[:Stack(buf, 
false)]) +} + +func (s structWithMethod) nop() {} + +func TestStackWrapperCaller(t *testing.T) { + var d structWithMethod + // Force the compiler to construct a wrapper method. + wrapper := (*structWithMethod).caller + // Check that the wrapper doesn't affect the stack trace. + if dc, ic := d.caller(), wrapper(&d); dc != ic { + t.Fatalf("direct caller %q != indirect caller %q", dc, ic) + } +} + +func TestStackWrapperCallers(t *testing.T) { + var d structWithMethod + wrapper := (*structWithMethod).callers + // Check that <autogenerated> doesn't appear in the stack trace. + pcs := wrapper(&d) + frames := CallersFrames(pcs) + for { + fr, more := frames.Next() + if fr.File == "<autogenerated>" { + t.Fatalf("<autogenerated> appears in stack trace: %+v", fr) + } + if !more { + break + } + } +} + +func TestStackWrapperStack(t *testing.T) { + var d structWithMethod + wrapper := (*structWithMethod).stack + // Check that <autogenerated> doesn't appear in the stack trace. + stk := wrapper(&d) + if strings.Contains(stk, "<autogenerated>") { + t.Fatalf("<autogenerated> appears in stack trace:\n%s", stk) + } +} + +type I interface { + M() +} + +func TestStackWrapperStackPanic(t *testing.T) { + t.Run("sigpanic", func(t *testing.T) { + // nil calls to interface methods cause a sigpanic. + testStackWrapperPanic(t, func() { I.M(nil) }, "runtime_test.I.M") + }) + t.Run("panicwrap", func(t *testing.T) { + // Nil calls to value method wrappers call panicwrap. + wrapper := (*structWithMethod).nop + testStackWrapperPanic(t, func() { wrapper(nil) }, "runtime_test.(*structWithMethod).nop") + }) +} + +func testStackWrapperPanic(t *testing.T, cb func(), expect string) { + // Test that the stack trace from a panicking wrapper includes + // the wrapper, even though elide these when they don't panic. + t.Run("CallersFrames", func(t *testing.T) { + defer func() { + err := recover() + if err == nil { + t.Fatalf("expected panic") + } + pcs := make([]uintptr, 10) + n := Callers(0, pcs) + frames := CallersFrames(pcs[:n]) + for { + frame, more := frames.Next() + t.Log(frame.Function) + if frame.Function == expect { + return + } + if !more { + break + } + } + t.Fatalf("panicking wrapper %s missing from stack trace", expect) + }() + cb() + }) + t.Run("Stack", func(t *testing.T) { + defer func() { + err := recover() + if err == nil { + t.Fatalf("expected panic") + } + buf := make([]byte, 4<<10) + stk := string(buf[:Stack(buf, false)]) + if !strings.Contains(stk, "\n"+expect) { + t.Fatalf("panicking wrapper %s missing from stack trace:\n%s", expect, stk) + } + }() + cb() + }) +} + +func TestCallersFromWrapper(t *testing.T) { + // Test that invoking CallersFrames on a stack where the first + // PC is an autogenerated wrapper keeps the wrapper in the + // trace. Normally we elide these, assuming that the wrapper + // calls the thing you actually wanted to see, but in this + // case we need to keep it. + pc := reflect.ValueOf(I.M).Pointer() + frames := CallersFrames([]uintptr{pc}) + frame, more := frames.Next() + if frame.Function != "runtime_test.I.M" { + t.Fatalf("want function %s, got %s", "runtime_test.I.M", frame.Function) + } + if more { + t.Fatalf("want 1 frame, got > 1") + } +} + +func TestTracebackSystemstack(t *testing.T) { + if GOARCH == "ppc64" || GOARCH == "ppc64le" { + t.Skip("systemstack tail call not implemented on ppc64x") + } + + // Test that profiles correctly jump over systemstack, + // including nested systemstack calls. 
+ pcs := make([]uintptr, 20) + pcs = pcs[:TracebackSystemstack(pcs, 5)] + // Check that runtime.TracebackSystemstack appears five times + // and that we see TestTracebackSystemstack. + countIn, countOut := 0, 0 + frames := CallersFrames(pcs) + var tb bytes.Buffer + for { + frame, more := frames.Next() + fmt.Fprintf(&tb, "\n%s+0x%x %s:%d", frame.Function, frame.PC-frame.Entry, frame.File, frame.Line) + switch frame.Function { + case "runtime.TracebackSystemstack": + countIn++ + case "runtime_test.TestTracebackSystemstack": + countOut++ + } + if !more { + break + } + } + if countIn != 5 || countOut != 1 { + t.Fatalf("expected 5 calls to TracebackSystemstack and 1 call to TestTracebackSystemstack, got:%s", tb.String()) + } +} diff --git a/src/runtime/string.go b/src/runtime/string.go index 0ccc81ee58..22be091375 100644 --- a/src/runtime/string.go +++ b/src/runtime/string.go @@ -80,7 +80,7 @@ func slicebytetostring(buf *tmpBuf, b []byte) (str string) { if raceenabled { racereadrangepc(unsafe.Pointer(&b[0]), uintptr(l), - getcallerpc(unsafe.Pointer(&buf)), + getcallerpc(), funcPC(slicebytetostring)) } if msanenabled { @@ -134,7 +134,7 @@ func slicebytetostringtmp(b []byte) string { if raceenabled && len(b) > 0 { racereadrangepc(unsafe.Pointer(&b[0]), uintptr(len(b)), - getcallerpc(unsafe.Pointer(&b)), + getcallerpc(), funcPC(slicebytetostringtmp)) } if msanenabled && len(b) > 0 { @@ -183,7 +183,7 @@ func slicerunetostring(buf *tmpBuf, a []rune) string { if raceenabled && len(a) > 0 { racereadrangepc(unsafe.Pointer(&a[0]), uintptr(len(a))*unsafe.Sizeof(a[0]), - getcallerpc(unsafe.Pointer(&buf)), + getcallerpc(), funcPC(slicerunetostring)) } if msanenabled && len(a) > 0 { diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go index 72d21187ec..e83064166a 100644 --- a/src/runtime/stubs.go +++ b/src/runtime/stubs.go @@ -4,10 +4,7 @@ package runtime -import ( - "runtime/internal/sys" - "unsafe" -) +import "unsafe" // Should be a built-in for unsafe.Pointer? //go:nosplit @@ -91,25 +88,28 @@ func reflect_memmove(to, from unsafe.Pointer, n uintptr) { } // exported value for testing -var hashLoad = loadFactor +var hashLoad = float32(loadFactorNum) / float32(loadFactorDen) //go:nosplit func fastrand() uint32 { mp := getg().m - fr := mp.fastrand - mx := uint32(int32(fr)>>31) & 0xa8888eef - fr = fr<<1 ^ mx - mp.fastrand = fr - return fr + // Implement xorshift64+: 2 32-bit xorshift sequences added together. + // Shift triplet [17,7,16] was calculated as indicated in Marsaglia's + // Xorshift paper: https://www.jstatsoft.org/article/view/v008i14/xorshift.pdf + // This generator passes the SmallCrush suite, part of TestU01 framework: + // http://simul.iro.umontreal.ca/testu01/tu01.html + s1, s0 := mp.fastrand[0], mp.fastrand[1] + s1 ^= s1 << 17 + s1 = s1 ^ s0 ^ s1>>7 ^ s0>>16 + mp.fastrand[0], mp.fastrand[1] = s0, s1 + return s0 + s1 } //go:nosplit func fastrandn(n uint32) uint32 { - // Don't be clever. - // fastrand is not good enough for cleverness. - // Just use mod. - // See golang.org/issue/21806. - return fastrand() % n + // This is similar to fastrand() % n, but faster. 
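A self-contained sketch of the two changes above, outside the runtime: the per-m xorshift64+ state kept as two uint32 words with the [17,7,16] shift triplet, and the multiply-shift bounded reduction used by fastrandn. rngState and randn are illustrative names; the only requirement on the seed is that it be nonzero.

package main

import "fmt"

// rngState holds the two 32-bit xorshift words, like m.fastrand.
type rngState [2]uint32

// next advances the xorshift64+ generator with shift triplet [17,7,16]
// and returns the sum of the two words, as fastrand does.
func (r *rngState) next() uint32 {
	s1, s0 := r[0], r[1]
	s1 ^= s1 << 17
	s1 = s1 ^ s0 ^ s1>>7 ^ s0>>16
	r[0], r[1] = s0, s1
	return s0 + s1
}

// randn maps a random 32-bit value into [0, n) without a modulo:
// the high 32 bits of a 32x32->64 multiply, as in fastrandn.
func (r *rngState) randn(n uint32) uint32 {
	return uint32(uint64(r.next()) * uint64(n) >> 32)
}

func main() {
	r := rngState{0x1d2c3b4a, 0x5e6f7081} // any nonzero seed works
	fmt.Println(r.next(), r.randn(10))
}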
+ // See http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ + return uint32(uint64(fastrand()) * uint64(n) >> 32) } //go:linkname sync_fastrand sync.fastrand @@ -133,11 +133,9 @@ func noescape(p unsafe.Pointer) unsafe.Pointer { func cgocallback(fn, frame unsafe.Pointer, framesize, ctxt uintptr) func gogo(buf *gobuf) func gosave(buf *gobuf) -func mincore(addr unsafe.Pointer, n uintptr, dst *byte) int32 //go:noescape func jmpdefer(fv *funcval, argp uintptr) -func exit1(code int32) func asminit() func setg(gg *g) func breakpoint() @@ -196,14 +194,16 @@ func publicationBarrier() // getcallerpc returns the program counter (PC) of its caller's caller. // getcallersp returns the stack pointer (SP) of its caller's caller. -// For both, the argp must be a pointer to the caller's first function argument. +// argp must be a pointer to the caller's first function argument. // The implementation may or may not use argp, depending on -// the architecture. +// the architecture. The implementation may be a compiler +// intrinsic; there is not necessarily code implementing this +// on every platform. // // For example: // // func f(arg1, arg2, arg3 int) { -// pc := getcallerpc(unsafe.Pointer(&arg1)) +// pc := getcallerpc() // sp := getcallersp(unsafe.Pointer(&arg1)) // } // @@ -223,12 +223,26 @@ func publicationBarrier() // immediately and can only be passed to nosplit functions. //go:noescape -func getcallerpc(argp unsafe.Pointer) uintptr +func getcallerpc() uintptr -//go:nosplit -func getcallersp(argp unsafe.Pointer) uintptr { - return uintptr(argp) - sys.MinFrameSize -} +//go:noescape +func getcallersp(argp unsafe.Pointer) uintptr // implemented as an intrinsic on all platforms + +// getclosureptr returns the pointer to the current closure. +// getclosureptr can only be used in an assignment statement +// at the entry of a function. Moreover, go:nosplit directive +// must be specified at the declaration of caller function, +// so that the function prolog does not clobber the closure register. +// for example: +// +// //go:nosplit +// func f(arg1, arg2, arg3 int) { +// dx := getclosureptr() +// } +// +// The compiler rewrites calls to this function into instructions that fetch the +// pointer from a well-known register (DX on x86 architecture, etc.) directly. +func getclosureptr() uintptr //go:noescape func asmcgocall(fn, arg unsafe.Pointer) int32 @@ -278,11 +292,6 @@ func call1073741824(typ, fn, arg unsafe.Pointer, n, retoffset uint32) func systemstack_switch() -func prefetcht0(addr uintptr) -func prefetcht1(addr uintptr) -func prefetcht2(addr uintptr) -func prefetchnta(addr uintptr) - // round n up to a multiple of a. a must be a power of 2. func round(n, a uintptr) uintptr { return (n + a - 1) &^ (a - 1) @@ -292,7 +301,6 @@ func round(n, a uintptr) uintptr { func checkASM() bool func memequal_varlen(a, b unsafe.Pointer) bool -func eqstring(s1, s2 string) bool // bool2int returns 0 if x is false or 1 if x is true. func bool2int(x bool) int { diff --git a/src/runtime/stubs2.go b/src/runtime/stubs2.go index 8390d8fca9..ae5ccd3fee 100644 --- a/src/runtime/stubs2.go +++ b/src/runtime/stubs2.go @@ -25,3 +25,9 @@ func write(fd uintptr, p unsafe.Pointer, n int32) int32 func open(name *byte, mode, perm int32) int32 func madvise(addr unsafe.Pointer, n uintptr, flags int32) + +// exitThread terminates the current thread, writing *wait = 0 when +// the stack is safe to reclaim. 
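A rough sketch of the handshake this contract enables, based only on the stated semantics; freeWait, exitAndClear, and reclaimStack are illustrative names, and the real exitThread is assembly that must not touch its own stack after the store:

package main

import "sync/atomic"

// mStack pairs a thread's stack with the wait word: freeWait is 1 while
// the exiting thread may still be using the stack.
type mStack struct {
	freeWait uint32
	stack    []byte
}

// exitAndClear models the exiting side: the last store the thread makes
// before entering the kernel is *wait = 0, signalling the stack is reusable.
func exitAndClear(m *mStack) {
	// ...final work on m.stack...
	atomic.StoreUint32(&m.freeWait, 0)
	// the real exitThread now performs the thread-exit syscall without
	// touching the stack again
}

// reclaimStack models the other side: spin until the word drops to zero,
// after which the stack may be freed or handed to a new thread.
func reclaimStack(m *mStack) {
	for atomic.LoadUint32(&m.freeWait) != 0 {
		// busy-wait; the runtime would yield here
	}
	m.stack = nil
}

func main() {
	m := &mStack{freeWait: 1, stack: make([]byte, 8192)}
	exitAndClear(m)
	reclaimStack(m)
}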
+// +//go:noescape +func exitThread(wait *uint32) diff --git a/src/runtime/symtab.go b/src/runtime/symtab.go index e1b41ca4ff..bdf98b9e9d 100644 --- a/src/runtime/symtab.go +++ b/src/runtime/symtab.go @@ -19,6 +19,11 @@ type Frames struct { // stackExpander expands callers into a sequence of Frames, // tracking the necessary state across PCs. stackExpander stackExpander + + // elideWrapper indicates that, if the next frame is an + // autogenerated wrapper function, it should be elided from + // the stack. + elideWrapper bool } // Frame is the information returned by Frames for each call frame. @@ -112,12 +117,14 @@ func (se *stackExpander) init(callers []uintptr) []uintptr { // Next returns frame information for the next caller. // If more is false, there are no more callers (the Frame value is valid). func (ci *Frames) Next() (frame Frame, more bool) { - ci.callers, frame, more = ci.stackExpander.next(ci.callers) + ci.callers, frame, more = ci.stackExpander.next(ci.callers, ci.elideWrapper) + ci.elideWrapper = elideWrapperCalling(frame.Function) return } -func (se *stackExpander) next(callers []uintptr) (ncallers []uintptr, frame Frame, more bool) { +func (se *stackExpander) next(callers []uintptr, elideWrapper bool) (ncallers []uintptr, frame Frame, more bool) { ncallers = callers +again: if !se.pcExpander.more { // Expand the next PC. if len(ncallers) == 0 { @@ -144,6 +151,13 @@ func (se *stackExpander) next(callers []uintptr) (ncallers []uintptr, frame Fram } frame = se.pcExpander.next() + if elideWrapper && frame.File == "<autogenerated>" { + // Ignore autogenerated functions such as pointer + // method forwarding functions. These are an + // implementation detail that doesn't reflect the + // source code. + goto again + } return ncallers, frame, se.pcExpander.more || len(ncallers) > 0 } @@ -338,8 +352,8 @@ const ( // moduledata records information about the layout of the executable // image. It is written by the linker. Any changes here must be // matched changes to the code in cmd/internal/ld/symtab.go:symtab. -// moduledata is stored in read-only memory; none of the pointers here -// are visible to the garbage collector. +// moduledata is stored in statically allocated non-pointer memory; +// none of the pointers here are visible to the garbage collector. type moduledata struct { pclntable []byte ftab []functab @@ -367,10 +381,14 @@ type moduledata struct { modulename string modulehashes []modulehash + hasmain uint8 // 1 if module contains the main function, 0 otherwise + gcdatamask, gcbssmask bitvector typemap map[typeOff]*_type // offset to *_rtype in previous module + bad bool // module failed to load and should be ignored + next *moduledata } @@ -403,7 +421,7 @@ var pinnedTypemaps []map[typeOff]*_type var firstmoduledata moduledata // linker symbol var lastmoduledatap *moduledata // linker symbol -var modulesSlice unsafe.Pointer // see activeModules +var modulesSlice *[]*moduledata // see activeModules // activeModules returns a slice of active modules. // @@ -443,6 +461,9 @@ func activeModules() []*moduledata { func modulesinit() { modules := new([]*moduledata) for md := &firstmoduledata; md != nil; md = md.next { + if md.bad { + continue + } *modules = append(*modules, md) if md.gcdatamask == (bitvector{}) { md.gcdatamask = progToPointerMask((*byte)(unsafe.Pointer(md.gcdata)), md.edata-md.data) @@ -459,9 +480,8 @@ func modulesinit() { // contains the main function. // // See Issue #18729. 
- mainText := funcPC(main_main) for i, md := range *modules { - if md.text <= mainText && mainText <= md.etext { + if md.hasmain != 0 { (*modules)[0] = md (*modules)[i] = &firstmoduledata break @@ -521,7 +541,6 @@ func moduledataverify1(datap *moduledata) { // ftab is lookup table for function by program counter. nftab := len(datap.ftab) - 1 - var pcCache pcvalueCache for i := 0; i < nftab; i++ { // NOTE: ftab[nftab].entry is legal; it is the address beyond the final function. if datap.ftab[i].entry > datap.ftab[i+1].entry { @@ -537,30 +556,6 @@ func moduledataverify1(datap *moduledata) { } throw("invalid runtime symbol table") } - - if debugPcln || nftab-i < 5 { - // Check a PC near but not at the very end. - // The very end might be just padding that is not covered by the tables. - // No architecture rounds function entries to more than 16 bytes, - // but if one came along we'd need to subtract more here. - // But don't use the next PC if it corresponds to a foreign object chunk - // (no pcln table, f2.pcln == 0). That chunk might have an alignment - // more than 16 bytes. - f := funcInfo{(*_func)(unsafe.Pointer(&datap.pclntable[datap.ftab[i].funcoff])), datap} - end := f.entry - if i+1 < nftab { - f2 := funcInfo{(*_func)(unsafe.Pointer(&datap.pclntable[datap.ftab[i+1].funcoff])), datap} - if f2.pcln != 0 { - end = f2.entry - 16 - if end < f.entry { - end = f.entry - } - } - } - pcvalue(f, f.pcfile, end, &pcCache, true) - pcvalue(f, f.pcln, end, &pcCache, true) - pcvalue(f, f.pcsp, end, &pcCache, true) - } } if datap.minpc != datap.ftab[0].entry || diff --git a/src/runtime/sys_darwin_386.s b/src/runtime/sys_darwin_386.s index 5c62bfd20e..ccd901ada5 100644 --- a/src/runtime/sys_darwin_386.s +++ b/src/runtime/sys_darwin_386.s @@ -19,13 +19,39 @@ TEXT runtime·exit(SB),NOSPLIT,$0 // Exit this OS thread (like pthread_exit, which eventually // calls __bsdthread_terminate). -TEXT runtime·exit1(SB),NOSPLIT,$0 +TEXT exit1<>(SB),NOSPLIT,$16-0 + // __bsdthread_terminate takes 4 word-size arguments. + // Set them all to 0. (None are an exit status.) + MOVL $0, 0(SP) + MOVL $0, 4(SP) + MOVL $0, 8(SP) + MOVL $0, 12(SP) MOVL $361, AX INT $0x80 JAE 2(PC) MOVL $0xf1, 0xf1 // crash RET +GLOBL exitStack<>(SB),RODATA,$(4*4) +DATA exitStack<>+0x00(SB)/4, $0 +DATA exitStack<>+0x04(SB)/4, $0 +DATA exitStack<>+0x08(SB)/4, $0 +DATA exitStack<>+0x0c(SB)/4, $0 + +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$0-4 + MOVL wait+0(FP), AX + // We're done using the stack. + MOVL $0, (AX) + // __bsdthread_terminate takes 4 arguments, which it expects + // on the stack. They should all be 0, so switch over to a + // fake stack of 0s. It won't write to the stack. 
+ MOVL $exitStack<>(SB), SP + MOVL $361, AX // __bsdthread_terminate + INT $0x80 + MOVL $0xf1, 0xf1 // crash + JMP 0(PC) + TEXT runtime·open(SB),NOSPLIT,$0 MOVL $5, AX INT $0x80 @@ -77,7 +103,13 @@ TEXT runtime·raiseproc(SB),NOSPLIT,$16 TEXT runtime·mmap(SB),NOSPLIT,$0 MOVL $197, AX INT $0x80 - MOVL AX, ret+24(FP) + JAE ok + MOVL $0, p+24(FP) + MOVL AX, err+28(FP) + RET +ok: + MOVL AX, p+24(FP) + MOVL $0, err+28(FP) RET TEXT runtime·madvise(SB),NOSPLIT,$0 @@ -394,7 +426,7 @@ TEXT runtime·bsdthread_start(SB),NOSPLIT,$0 MOVL BX, m_procid(DX) // m->procid = thread port (for debuggers) CALL runtime·stackcheck(SB) // smashes AX CALL CX // fn() - CALL runtime·exit1(SB) + CALL exit1<>(SB) RET // func bsdthread_register() int32 diff --git a/src/runtime/sys_darwin_amd64.s b/src/runtime/sys_darwin_amd64.s index a8dc700d60..f549efdbf6 100644 --- a/src/runtime/sys_darwin_amd64.s +++ b/src/runtime/sys_darwin_amd64.s @@ -25,13 +25,26 @@ TEXT runtime·exit(SB),NOSPLIT,$0 // Exit this OS thread (like pthread_exit, which eventually // calls __bsdthread_terminate). -TEXT runtime·exit1(SB),NOSPLIT,$0 - MOVL code+0(FP), DI // arg 1 exit status +TEXT exit1<>(SB),NOSPLIT,$0 + // Because of exitThread below, this must not use the stack. + // __bsdthread_terminate takes 4 word-size arguments. + // Set them all to 0. (None are an exit status.) + MOVL $0, DI + MOVL $0, SI + MOVL $0, DX + MOVL $0, R10 MOVL $(0x2000000+361), AX // syscall entry SYSCALL MOVL $0xf1, 0xf1 // crash RET +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$0-8 + MOVQ wait+0(FP), AX + // We're done using the stack. + MOVL $0, (AX) + JMP exit1<>(SB) + TEXT runtime·open(SB),NOSPLIT,$0 MOVQ name+0(FP), DI // arg 1 pathname MOVL mode+8(FP), SI // arg 2 flags @@ -107,29 +120,38 @@ TEXT runtime·madvise(SB), NOSPLIT, $0 RET // OS X comm page time offsets -// http://www.opensource.apple.com/source/xnu/xnu-1699.26.8/osfmk/i386/cpu_capabilities.h -#define nt_tsc_base 0x50 -#define nt_scale 0x58 -#define nt_shift 0x5c -#define nt_ns_base 0x60 -#define nt_generation 0x68 -#define gtod_generation 0x6c -#define gtod_ns_base 0x70 -#define gtod_sec_base 0x78 +// https://opensource.apple.com/source/xnu/xnu-4570.1.46/osfmk/i386/cpu_capabilities.h + +#define commpage_version 0x1e + +#define v12_nt_tsc_base 0x50 +#define v12_nt_scale 0x58 +#define v12_nt_shift 0x5c +#define v12_nt_ns_base 0x60 +#define v12_nt_generation 0x68 +#define v12_gtod_generation 0x6c // obsolete since High Sierra (v13) +#define v12_gtod_ns_base 0x70 // obsolete since High Sierra (v13) +#define v12_gtod_sec_base 0x78 // obsolete since High Sierra (v13) + +#define v13_gtod_ns_base 0xd0 +#define v13_gtod_sec_ofs 0xd8 +#define v13_gtod_frac_ofs 0xe0 +#define v13_gtod_scale 0xe8 +#define v13_gtod_tkspersec 0xf0 TEXT runtime·nanotime(SB),NOSPLIT,$0-8 MOVQ $0x7fffffe00000, BP /* comm page base */ // Loop trying to take a consistent snapshot // of the time parameters. timeloop: - MOVL nt_generation(BP), R9 + MOVL v12_nt_generation(BP), R9 TESTL R9, R9 JZ timeloop RDTSC - MOVQ nt_tsc_base(BP), R10 - MOVL nt_scale(BP), R11 - MOVQ nt_ns_base(BP), R12 - CMPL nt_generation(BP), R9 + MOVQ v12_nt_tsc_base(BP), R10 + MOVL v12_nt_scale(BP), R11 + MOVQ v12_nt_ns_base(BP), R12 + CMPL v12_nt_generation(BP), R9 JNE timeloop // Gathered all the data we need. Compute monotonic time: @@ -151,22 +173,91 @@ TEXT time·now(SB), NOSPLIT, $32-24 // are used in the systime fallback, as the timeval address // filled in by the system call. 
MOVQ $0x7fffffe00000, BP /* comm page base */ + CMPW commpage_version(BP), $13 + JB v12 /* sierra and older */ + + // This is the new code, for macOS High Sierra (v13) and newer. +v13: + // Loop trying to take a consistent snapshot + // of the time parameters. +timeloop13: + MOVQ v13_gtod_ns_base(BP), R12 + + MOVL v12_nt_generation(BP), CX + TESTL CX, CX + JZ timeloop13 + RDTSC + MOVQ v12_nt_tsc_base(BP), SI + MOVL v12_nt_scale(BP), DI + MOVQ v12_nt_ns_base(BP), BX + CMPL v12_nt_generation(BP), CX + JNE timeloop13 + + MOVQ v13_gtod_sec_ofs(BP), R8 + MOVQ v13_gtod_frac_ofs(BP), R9 + MOVQ v13_gtod_scale(BP), R10 + MOVQ v13_gtod_tkspersec(BP), R11 + CMPQ v13_gtod_ns_base(BP), R12 + JNE timeloop13 + + // Compute monotonic time + // mono = ((tsc - nt_tsc_base) * nt_scale) >> 32 + nt_ns_base + // The multiply and shift extracts the top 64 bits of the 96-bit product. + SHLQ $32, DX + ADDQ DX, AX + SUBQ SI, AX + MULQ DI + SHRQ $32, AX:DX + ADDQ BX, AX + + // Subtract startNano base to return the monotonic runtime timer + // which is an offset from process boot. + MOVQ AX, BX + MOVQ runtime·startNano(SB), CX + SUBQ CX, BX + MOVQ BX, monotonic+16(FP) + + // Now compute the 128-bit wall time: + // wall = ((mono - gtod_ns_base) * gtod_scale) + gtod_offs + // The parameters are updated every second, so if we found them + // outdated (that is, more than one second is passed from the ns base), + // fallback to the syscall. + TESTQ R12, R12 + JZ systime + SUBQ R12, AX + CMPQ R11, AX + JB systime + MULQ R10 + ADDQ R9, AX + ADCQ R8, DX + + // Convert the 128-bit wall time into (sec,nsec). + // High part (seconds) is already good to go, while low part + // (fraction of seconds) must be converted to nanoseconds. + MOVQ DX, sec+0(FP) + MOVQ $1000000000, CX + MULQ CX + MOVQ DX, nsec+8(FP) + RET + + // This is the legacy code needed for macOS Sierra (v12) and older. +v12: // Loop trying to take a consistent snapshot // of the time parameters. timeloop: - MOVL gtod_generation(BP), R8 - MOVL nt_generation(BP), R9 + MOVL v12_gtod_generation(BP), R8 + MOVL v12_nt_generation(BP), R9 TESTL R9, R9 JZ timeloop RDTSC - MOVQ nt_tsc_base(BP), R10 - MOVL nt_scale(BP), R11 - MOVQ nt_ns_base(BP), R12 - CMPL nt_generation(BP), R9 + MOVQ v12_nt_tsc_base(BP), R10 + MOVL v12_nt_scale(BP), R11 + MOVQ v12_nt_ns_base(BP), R12 + CMPL v12_nt_generation(BP), R9 JNE timeloop - MOVQ gtod_ns_base(BP), R13 - MOVQ gtod_sec_base(BP), R14 - CMPL gtod_generation(BP), R8 + MOVQ v12_gtod_ns_base(BP), R13 + MOVQ v12_gtod_sec_base(BP), R14 + CMPL v12_gtod_generation(BP), R8 JNE timeloop // Gathered all the data we need. 
Compute: @@ -283,7 +374,13 @@ TEXT runtime·mmap(SB),NOSPLIT,$0 MOVL off+28(FP), R9 // arg 6 offset MOVL $(0x2000000+197), AX // syscall entry SYSCALL - MOVQ AX, ret+32(FP) + JCC ok + MOVQ $0, p+32(FP) + MOVQ AX, err+40(FP) + RET +ok: + MOVQ AX, p+32(FP) + MOVQ $0, err+40(FP) RET TEXT runtime·munmap(SB),NOSPLIT,$0 @@ -375,7 +472,7 @@ TEXT runtime·bsdthread_start(SB),NOSPLIT,$0 MOVQ CX, g_m(AX) CALL runtime·stackcheck(SB) // smashes AX, CX CALL DX // fn - CALL runtime·exit1(SB) + CALL exit1<>(SB) RET // func bsdthread_register() int32 diff --git a/src/runtime/sys_darwin_arm.s b/src/runtime/sys_darwin_arm.s index ea559b5f3e..1ad904f833 100644 --- a/src/runtime/sys_darwin_arm.s +++ b/src/runtime/sys_darwin_arm.s @@ -19,7 +19,6 @@ #define SYS_mmap 197 #define SYS_munmap 73 #define SYS_madvise 75 -#define SYS_mincore 78 #define SYS_gettimeofday 116 #define SYS_kill 37 #define SYS_getpid 20 @@ -90,13 +89,32 @@ TEXT runtime·exit(SB),NOSPLIT,$-4 // Exit this OS thread (like pthread_exit, which eventually // calls __bsdthread_terminate). -TEXT runtime·exit1(SB),NOSPLIT,$0 +TEXT exit1<>(SB),NOSPLIT,$0 + // Because of exitThread below, this must not use the stack. + // __bsdthread_terminate takes 4 word-size arguments. + // Set them all to 0. (None are an exit status.) + MOVW $0, R0 + MOVW $0, R1 + MOVW $0, R2 + MOVW $0, R3 MOVW $SYS_bsdthread_terminate, R12 SWI $0x80 MOVW $1234, R0 MOVW $1003, R1 MOVW R0, (R1) // fail hard +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$0-4 + MOVW wait+0(FP), R0 + // We're done using the stack. + MOVW $0, R2 +storeloop: + LDREX (R0), R4 // loads R4 + STREX R2, (R0), R1 // stores R2 + CMP $0, R1 + BNE storeloop + JMP exit1<>(SB) + TEXT runtime·raise(SB),NOSPLIT,$0 // Ideally we'd send the signal to the current thread, // not the whole process, but that's too hard on OS X. @@ -122,7 +140,14 @@ TEXT runtime·mmap(SB),NOSPLIT,$0 MOVW $0, R6 // off_t is uint64_t MOVW $SYS_mmap, R12 SWI $0x80 - MOVW R0, ret+24(FP) + MOVW $0, R1 + BCC ok + MOVW R1, p+24(FP) + MOVW R0, err+28(FP) + RET +ok: + MOVW R0, p+24(FP) + MOVW R1, err+28(FP) RET TEXT runtime·munmap(SB),NOSPLIT,$0 @@ -150,15 +175,6 @@ TEXT runtime·setitimer(SB),NOSPLIT,$0 SWI $0x80 RET -TEXT runtime·mincore(SB),NOSPLIT,$0 - MOVW addr+0(FP), R0 - MOVW n+4(FP), R1 - MOVW dst+8(FP), R2 - MOVW $SYS_mincore, R12 - SWI $0x80 - MOVW R0, ret+12(FP) - RET - TEXT runtime·walltime(SB), 7, $32 MOVW $8(R13), R0 // timeval MOVW $0, R1 // zone @@ -380,7 +396,7 @@ TEXT runtime·bsdthread_start(SB),NOSPLIT,$0 EOR R12, R12 WORD $0xeee1ca10 // fmxr fpscr, ip BL (R2) // fn - BL runtime·exit1(SB) + BL exit1<>(SB) RET // int32 bsdthread_register(void) diff --git a/src/runtime/sys_darwin_arm64.s b/src/runtime/sys_darwin_arm64.s index 0e91d5bd10..5663af512d 100644 --- a/src/runtime/sys_darwin_arm64.s +++ b/src/runtime/sys_darwin_arm64.s @@ -19,7 +19,6 @@ #define SYS_mmap 197 #define SYS_munmap 73 #define SYS_madvise 75 -#define SYS_mincore 78 #define SYS_gettimeofday 116 #define SYS_kill 37 #define SYS_getpid 20 @@ -90,13 +89,28 @@ TEXT runtime·exit(SB),NOSPLIT,$-8 // Exit this OS thread (like pthread_exit, which eventually // calls __bsdthread_terminate). -TEXT runtime·exit1(SB),NOSPLIT,$0 +TEXT exit1<>(SB),NOSPLIT,$0 + // Because of exitThread below, this must not use the stack. + // __bsdthread_terminate takes 4 word-size arguments. + // Set them all to 0. (None are an exit status.) 
+ MOVW $0, R0 + MOVW $0, R1 + MOVW $0, R2 + MOVW $0, R3 MOVW $SYS_bsdthread_terminate, R16 SVC $0x80 MOVD $1234, R0 MOVD $1003, R1 MOVD R0, (R1) // fail hard +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$0-8 + MOVD wait+0(FP), R0 + // We're done using the stack. + MOVW $0, R1 + STLRW R1, (R0) + JMP exit1<>(SB) + TEXT runtime·raise(SB),NOSPLIT,$0 // Ideally we'd send the signal to the current thread, // not the whole process, but that's too hard on OS X. @@ -121,7 +135,13 @@ TEXT runtime·mmap(SB),NOSPLIT,$0 MOVW off+28(FP), R5 MOVW $SYS_mmap, R16 SVC $0x80 - MOVD R0, ret+32(FP) + BCC ok + MOVD $0, p+32(FP) + MOVD R0, err+40(FP) + RET +ok: + MOVD R0, p+32(FP) + MOVD $0, err+40(FP) RET TEXT runtime·munmap(SB),NOSPLIT,$0 diff --git a/src/runtime/sys_dragonfly_amd64.s b/src/runtime/sys_dragonfly_amd64.s index f355268b99..813f1f4b69 100644 --- a/src/runtime/sys_dragonfly_amd64.s +++ b/src/runtime/sys_dragonfly_amd64.s @@ -64,12 +64,18 @@ TEXT runtime·exit(SB),NOSPLIT,$-8 MOVL $0xf1, 0xf1 // crash RET -TEXT runtime·exit1(SB),NOSPLIT,$-8 - MOVL code+0(FP), DI // arg 1 exit status - MOVL $431, AX +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$0-8 + MOVQ wait+0(FP), AX + // We're done using the stack. + MOVL $0, (AX) + MOVL $0x10000, DI // arg 1 how - EXTEXIT_LWP + MOVL $0, SI // arg 2 status + MOVL $0, DX // arg 3 addr + MOVL $494, AX // extexit SYSCALL MOVL $0xf1, 0xf1 // crash - RET + JMP 0(PC) TEXT runtime·open(SB),NOSPLIT,$-8 MOVQ name+0(FP), DI // arg 1 pathname @@ -236,8 +242,15 @@ TEXT runtime·mmap(SB),NOSPLIT,$0 MOVQ $0, R9 // arg 6 - pad MOVL $197, AX SYSCALL + JCC ok + ADDQ $16, SP + MOVQ $0, p+32(FP) + MOVQ AX, err+40(FP) + RET +ok: ADDQ $16, SP - MOVQ AX, ret+32(FP) + MOVQ AX, p+32(FP) + MOVQ $0, err+40(FP) RET TEXT runtime·munmap(SB),NOSPLIT,$0 diff --git a/src/runtime/sys_freebsd_386.s b/src/runtime/sys_freebsd_386.s index 0f5df21e40..bef8e3257a 100644 --- a/src/runtime/sys_freebsd_386.s +++ b/src/runtime/sys_freebsd_386.s @@ -52,12 +52,23 @@ TEXT runtime·exit(SB),NOSPLIT,$-4 MOVL $0xf1, 0xf1 // crash RET -TEXT runtime·exit1(SB),NOSPLIT,$-4 - MOVL $431, AX +GLOBL exitStack<>(SB),RODATA,$8 +DATA exitStack<>+0x00(SB)/4, $0 +DATA exitStack<>+0x04(SB)/4, $0 + +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$0-4 + MOVL wait+0(FP), AX + // We're done using the stack. + MOVL $0, (AX) + // thr_exit takes a single pointer argument, which it expects + // on the stack. We want to pass 0, so switch over to a fake + // stack of 0s. It won't write to the stack. + MOVL $exitStack<>(SB), SP + MOVL $431, AX // thr_exit INT $0x80 - JAE 2(PC) MOVL $0xf1, 0xf1 // crash - RET + JMP 0(PC) TEXT runtime·open(SB),NOSPLIT,$-4 MOVL $5, AX @@ -138,7 +149,13 @@ TEXT runtime·mmap(SB),NOSPLIT,$32 STOSL MOVL $477, AX INT $0x80 - MOVL AX, ret+24(FP) + JAE ok + MOVL $0, p+24(FP) + MOVL AX, err+28(FP) + RET +ok: + MOVL AX, p+24(FP) + MOVL $0, err+28(FP) RET TEXT runtime·munmap(SB),NOSPLIT,$-4 diff --git a/src/runtime/sys_freebsd_amd64.s b/src/runtime/sys_freebsd_amd64.s index 5d072a9957..7499931ca1 100644 --- a/src/runtime/sys_freebsd_amd64.s +++ b/src/runtime/sys_freebsd_amd64.s @@ -54,12 +54,16 @@ TEXT runtime·exit(SB),NOSPLIT,$-8 MOVL $0xf1, 0xf1 // crash RET -TEXT runtime·exit1(SB),NOSPLIT,$-8 - MOVL code+0(FP), DI // arg 1 exit status - MOVL $431, AX +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$0-8 + MOVQ wait+0(FP), AX + // We're done using the stack. 
+ MOVL $0, (AX) + MOVL $0, DI // arg 1 long *state + MOVL $431, AX // thr_exit SYSCALL MOVL $0xf1, 0xf1 // crash - RET + JMP 0(PC) TEXT runtime·open(SB),NOSPLIT,$-8 MOVQ name+0(FP), DI // arg 1 pathname @@ -229,7 +233,13 @@ TEXT runtime·mmap(SB),NOSPLIT,$0 MOVL off+28(FP), R9 // arg 6 offset MOVL $477, AX SYSCALL - MOVQ AX, ret+32(FP) + JCC ok + MOVQ $0, p+32(FP) + MOVQ AX, err+40(FP) + RET +ok: + MOVQ AX, p+32(FP) + MOVQ $0, err+40(FP) RET TEXT runtime·munmap(SB),NOSPLIT,$0 diff --git a/src/runtime/sys_freebsd_arm.s b/src/runtime/sys_freebsd_arm.s index 2851587b0d..3f52864305 100644 --- a/src/runtime/sys_freebsd_arm.s +++ b/src/runtime/sys_freebsd_arm.s @@ -82,13 +82,22 @@ TEXT runtime·exit(SB),NOSPLIT,$-8 MOVW.CS R8, (R8) RET -TEXT runtime·exit1(SB),NOSPLIT,$-8 - MOVW code+0(FP), R0 // arg 1 exit status - MOVW $SYS_thr_exit, R7 - SWI $0 - MOVW.CS $0, R8 // crash on syscall failure - MOVW.CS R8, (R8) - RET +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$0-4 + MOVW wait+0(FP), R0 + // We're done using the stack. + MOVW $0, R2 +storeloop: + LDREX (R0), R4 // loads R4 + STREX R2, (R0), R1 // stores R2 + CMP $0, R1 + BNE storeloop + MOVW $0, R0 // arg 1 long *state + MOVW $SYS_thr_exit, R7 + SWI $0 + MOVW.CS $0, R8 // crash on syscall failure + MOVW.CS R8, (R8) + JMP 0(PC) TEXT runtime·open(SB),NOSPLIT,$-8 MOVW name+0(FP), R0 // arg 1 name @@ -249,8 +258,11 @@ TEXT runtime·mmap(SB),NOSPLIT,$16 MOVW $SYS_mmap, R7 SWI $0 SUB $4, R13 - // TODO(dfc) error checking ? - MOVW R0, ret+24(FP) + MOVW $0, R1 + MOVW.CS R0, R1 // if failed, put in R1 + MOVW.CS $0, R0 + MOVW R0, p+24(FP) + MOVW R1, err+28(FP) RET TEXT runtime·munmap(SB),NOSPLIT,$0 diff --git a/src/runtime/sys_linux_386.s b/src/runtime/sys_linux_386.s index a3baebae4a..bc3b8dbb1c 100644 --- a/src/runtime/sys_linux_386.s +++ b/src/runtime/sys_linux_386.s @@ -24,22 +24,70 @@ //#define INVOKE_SYSCALL CALL 0x10(GS) // non-portable #define INVOKE_SYSCALL INT $0x80 +#define SYS_exit 1 +#define SYS_read 3 +#define SYS_write 4 +#define SYS_open 5 +#define SYS_close 6 +#define SYS_getpid 20 +#define SYS_access 33 +#define SYS_kill 37 +#define SYS_brk 45 +#define SYS_fcntl 55 +#define SYS_munmap 91 +#define SYS_socketcall 102 +#define SYS_setittimer 104 +#define SYS_clone 120 +#define SYS_sched_yield 158 +#define SYS_rt_sigreturn 173 +#define SYS_rt_sigaction 174 +#define SYS_rt_sigprocmask 175 +#define SYS_sigaltstack 186 +#define SYS_ugetrlimit 191 +#define SYS_mmap2 192 +#define SYS_mincore 218 +#define SYS_madvise 219 +#define SYS_gettid 224 +#define SYS_tkill 238 +#define SYS_futex 240 +#define SYS_sched_getaffinity 242 +#define SYS_set_thread_area 243 +#define SYS_exit_group 252 +#define SYS_epoll_create 254 +#define SYS_epoll_ctl 255 +#define SYS_epoll_wait 256 +#define SYS_clock_gettime 265 +#define SYS_pselect6 308 +#define SYS_epoll_create1 329 + TEXT runtime·exit(SB),NOSPLIT,$0 - MOVL $252, AX // syscall number + MOVL $SYS_exit_group, AX MOVL code+0(FP), BX INVOKE_SYSCALL INT $3 // not reached RET -TEXT runtime·exit1(SB),NOSPLIT,$0 - MOVL $1, AX // exit - exit the current os thread +TEXT exit1<>(SB),NOSPLIT,$0 + MOVL $SYS_exit, AX MOVL code+0(FP), BX INVOKE_SYSCALL INT $3 // not reached RET +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$0-4 + MOVL wait+0(FP), AX + // We're done using the stack. + MOVL $0, (AX) + MOVL $1, AX // exit (just this thread) + MOVL $0, BX // exit code + INT $0x80 // no stack; must not use CALL + // We may not even have a stack any more. 
+ INT $3 + JMP 0(PC) + TEXT runtime·open(SB),NOSPLIT,$0 - MOVL $5, AX // syscall - open + MOVL $SYS_open, AX MOVL name+0(FP), BX MOVL mode+4(FP), CX MOVL perm+8(FP), DX @@ -51,7 +99,7 @@ TEXT runtime·open(SB),NOSPLIT,$0 RET TEXT runtime·closefd(SB),NOSPLIT,$0 - MOVL $6, AX // syscall - close + MOVL $SYS_close, AX MOVL fd+0(FP), BX INVOKE_SYSCALL CMPL AX, $0xfffff001 @@ -61,7 +109,7 @@ TEXT runtime·closefd(SB),NOSPLIT,$0 RET TEXT runtime·write(SB),NOSPLIT,$0 - MOVL $4, AX // syscall - write + MOVL $SYS_write, AX MOVL fd+0(FP), BX MOVL p+4(FP), CX MOVL n+8(FP), DX @@ -73,7 +121,7 @@ TEXT runtime·write(SB),NOSPLIT,$0 RET TEXT runtime·read(SB),NOSPLIT,$0 - MOVL $3, AX // syscall - read + MOVL $SYS_read, AX MOVL fd+0(FP), BX MOVL p+4(FP), CX MOVL n+8(FP), DX @@ -85,7 +133,7 @@ TEXT runtime·read(SB),NOSPLIT,$0 RET TEXT runtime·getrlimit(SB),NOSPLIT,$0 - MOVL $191, AX // syscall - ugetrlimit + MOVL $SYS_ugetrlimit, AX MOVL kind+0(FP), BX MOVL limit+4(FP), CX INVOKE_SYSCALL @@ -103,7 +151,7 @@ TEXT runtime·usleep(SB),NOSPLIT,$8 MOVL AX, 4(SP) // pselect6(0, 0, 0, 0, &ts, 0) - MOVL $308, AX + MOVL $SYS_pselect6, AX MOVL $0, BX MOVL $0, CX MOVL $0, DX @@ -114,31 +162,31 @@ TEXT runtime·usleep(SB),NOSPLIT,$8 RET TEXT runtime·gettid(SB),NOSPLIT,$0-4 - MOVL $224, AX // syscall - gettid + MOVL $SYS_gettid, AX INVOKE_SYSCALL MOVL AX, ret+0(FP) RET TEXT runtime·raise(SB),NOSPLIT,$12 - MOVL $224, AX // syscall - gettid + MOVL $SYS_gettid, AX INVOKE_SYSCALL MOVL AX, BX // arg 1 tid MOVL sig+0(FP), CX // arg 2 signal - MOVL $238, AX // syscall - tkill + MOVL $SYS_tkill, AX INVOKE_SYSCALL RET TEXT runtime·raiseproc(SB),NOSPLIT,$12 - MOVL $20, AX // syscall - getpid + MOVL $SYS_getpid, AX INVOKE_SYSCALL MOVL AX, BX // arg 1 pid MOVL sig+0(FP), CX // arg 2 signal - MOVL $37, AX // syscall - kill + MOVL $SYS_kill, AX INVOKE_SYSCALL RET TEXT runtime·setitimer(SB),NOSPLIT,$0-12 - MOVL $104, AX // syscall - setitimer + MOVL $SYS_setittimer, AX MOVL mode+0(FP), BX MOVL new+4(FP), CX MOVL old+8(FP), DX @@ -146,7 +194,7 @@ TEXT runtime·setitimer(SB),NOSPLIT,$0-12 RET TEXT runtime·mincore(SB),NOSPLIT,$0-16 - MOVL $218, AX // syscall - mincore + MOVL $SYS_mincore, AX MOVL addr+0(FP), BX MOVL n+4(FP), CX MOVL dst+8(FP), DX @@ -155,15 +203,56 @@ TEXT runtime·mincore(SB),NOSPLIT,$0-16 RET // func walltime() (sec int64, nsec int32) -TEXT runtime·walltime(SB), NOSPLIT, $32 - MOVL $265, AX // syscall - clock_gettime +TEXT runtime·walltime(SB), NOSPLIT, $0-12 + // We don't know how much stack space the VDSO code will need, + // so switch to g0. + + MOVL SP, BP // Save old SP; BP unchanged by C code. + + get_tls(CX) + MOVL g(CX), AX + MOVL g_m(AX), CX + MOVL m_curg(CX), DX + + CMPL AX, DX // Only switch if on curg. 
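The rewritten walltime and nanotime bodies all make the same decision: if the address of __vdso_clock_gettime was resolved at startup, call it on the g0 stack; otherwise fall back to the plain clock_gettime syscall. Stripped of the stack switching, that is a nil check on a function pointer. A hedged Go-level sketch with invented names (vdsoClockGettime, rawClockGettime, nanotimeSketch):

package main

import (
	"fmt"
	"time"
)

// vdsoClockGettime stands in for the resolved vDSO symbol; it is nil when the
// symbol was not found, just as runtime·__vdso_clock_gettime_sym is 0 then.
var vdsoClockGettime func() int64

// rawClockGettime is the slow path, standing in for the clock_gettime syscall.
func rawClockGettime() int64 { return time.Now().UnixNano() }

func nanotimeSketch() int64 {
	if vdsoClockGettime != nil {
		return vdsoClockGettime() // fast path: user-space vDSO call, no kernel entry
	}
	return rawClockGettime() // fallback: ordinary syscall
}

func main() { fmt.Println(nanotimeSketch()) }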
+ JNE noswitch + + MOVL m_g0(CX), DX + MOVL (g_sched+gobuf_sp)(DX), SP // Set SP to g0 stack + +noswitch: + SUBL $16, SP // Space for results + ANDL $~15, SP // Align for C code + + // Stack layout, depending on call path: + // x(SP) vDSO INVOKE_SYSCALL + // 12 ts.tv_nsec ts.tv_nsec + // 8 ts.tv_sec ts.tv_sec + // 4 &ts - + // 0 CLOCK_<id> - + + MOVL runtime·__vdso_clock_gettime_sym(SB), AX + CMPL AX, $0 + JEQ fallback + + LEAL 8(SP), BX // &ts (struct timespec) + MOVL BX, 4(SP) + MOVL $0, 0(SP) // CLOCK_REALTIME + CALL AX + JMP finish + +fallback: + MOVL $SYS_clock_gettime, AX MOVL $0, BX // CLOCK_REALTIME LEAL 8(SP), CX - MOVL $0, DX INVOKE_SYSCALL + +finish: MOVL 8(SP), AX // sec MOVL 12(SP), BX // nsec + MOVL BP, SP // Restore real SP + // sec is in AX, nsec in BX MOVL AX, sec_lo+0(FP) MOVL $0, sec_hi+4(FP) @@ -172,15 +261,48 @@ TEXT runtime·walltime(SB), NOSPLIT, $32 // int64 nanotime(void) so really // void nanotime(int64 *nsec) -TEXT runtime·nanotime(SB), NOSPLIT, $32 - MOVL $265, AX // syscall - clock_gettime +TEXT runtime·nanotime(SB), NOSPLIT, $0-8 + // Switch to g0 stack. See comment above in runtime·walltime. + + MOVL SP, BP // Save old SP; BP unchanged by C code. + + get_tls(CX) + MOVL g(CX), AX + MOVL g_m(AX), CX + MOVL m_curg(CX), DX + + CMPL AX, DX // Only switch if on curg. + JNE noswitch + + MOVL m_g0(CX), DX + MOVL (g_sched+gobuf_sp)(DX), SP // Set SP to g0 stack + +noswitch: + SUBL $16, SP // Space for results + ANDL $~15, SP // Align for C code + + MOVL runtime·__vdso_clock_gettime_sym(SB), AX + CMPL AX, $0 + JEQ fallback + + LEAL 8(SP), BX // &ts (struct timespec) + MOVL BX, 4(SP) + MOVL $1, 0(SP) // CLOCK_MONOTONIC + CALL AX + JMP finish + +fallback: + MOVL $SYS_clock_gettime, AX MOVL $1, BX // CLOCK_MONOTONIC LEAL 8(SP), CX - MOVL $0, DX INVOKE_SYSCALL + +finish: MOVL 8(SP), AX // sec MOVL 12(SP), BX // nsec + MOVL BP, SP // Restore real SP + // sec is in AX, nsec in BX // convert to DX:AX nsec MOVL $1000000000, CX @@ -193,7 +315,7 @@ TEXT runtime·nanotime(SB), NOSPLIT, $32 RET TEXT runtime·rtsigprocmask(SB),NOSPLIT,$0 - MOVL $175, AX // syscall entry + MOVL $SYS_rt_sigprocmask, AX MOVL how+0(FP), BX MOVL new+4(FP), CX MOVL old+8(FP), DX @@ -205,7 +327,7 @@ TEXT runtime·rtsigprocmask(SB),NOSPLIT,$0 RET TEXT runtime·rt_sigaction(SB),NOSPLIT,$0 - MOVL $174, AX // syscall - rt_sigaction + MOVL $SYS_rt_sigaction, AX MOVL sig+0(FP), BX MOVL new+4(FP), CX MOVL old+8(FP), DX @@ -258,7 +380,7 @@ TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0 JMP runtime·sigtramp(SB) TEXT runtime·sigreturn(SB),NOSPLIT,$0 - MOVL $173, AX // rt_sigreturn + MOVL $SYS_rt_sigreturn, AX // Sigreturn expects same SP as signal handler, // so cannot CALL 0x10(GS) here. 
INT $0x80 @@ -266,7 +388,7 @@ TEXT runtime·sigreturn(SB),NOSPLIT,$0 RET TEXT runtime·mmap(SB),NOSPLIT,$0 - MOVL $192, AX // mmap2 + MOVL $SYS_mmap2, AX MOVL addr+0(FP), BX MOVL n+4(FP), CX MOVL prot+8(FP), DX @@ -276,14 +398,19 @@ TEXT runtime·mmap(SB),NOSPLIT,$0 SHRL $12, BP INVOKE_SYSCALL CMPL AX, $0xfffff001 - JLS 3(PC) + JLS ok NOTL AX INCL AX - MOVL AX, ret+24(FP) + MOVL $0, p+24(FP) + MOVL AX, err+28(FP) + RET +ok: + MOVL AX, p+24(FP) + MOVL $0, err+28(FP) RET TEXT runtime·munmap(SB),NOSPLIT,$0 - MOVL $91, AX // munmap + MOVL $SYS_munmap, AX MOVL addr+0(FP), BX MOVL n+4(FP), CX INVOKE_SYSCALL @@ -293,7 +420,7 @@ TEXT runtime·munmap(SB),NOSPLIT,$0 RET TEXT runtime·madvise(SB),NOSPLIT,$0 - MOVL $219, AX // madvise + MOVL $SYS_madvise, AX MOVL addr+0(FP), BX MOVL n+4(FP), CX MOVL flags+8(FP), DX @@ -304,7 +431,7 @@ TEXT runtime·madvise(SB),NOSPLIT,$0 // int32 futex(int32 *uaddr, int32 op, int32 val, // struct timespec *timeout, int32 *uaddr2, int32 val2); TEXT runtime·futex(SB),NOSPLIT,$0 - MOVL $240, AX // futex + MOVL $SYS_futex, AX MOVL addr+0(FP), BX MOVL op+4(FP), CX MOVL val+8(FP), DX @@ -317,7 +444,7 @@ TEXT runtime·futex(SB),NOSPLIT,$0 // int32 clone(int32 flags, void *stack, M *mp, G *gp, void (*fn)(void)); TEXT runtime·clone(SB),NOSPLIT,$0 - MOVL $120, AX // clone + MOVL $SYS_clone, AX MOVL flags+0(FP), BX MOVL stk+4(FP), CX MOVL $0, DX // parent tid ptr @@ -351,7 +478,7 @@ TEXT runtime·clone(SB),NOSPLIT,$0 INT $3 // Initialize AX to Linux tid - MOVL $224, AX + MOVL $SYS_gettid, AX INVOKE_SYSCALL MOVL 0(SP), BX // m @@ -396,11 +523,11 @@ TEXT runtime·clone(SB),NOSPLIT,$0 nog: CALL SI // fn() - CALL runtime·exit1(SB) + CALL exit1<>(SB) MOVL $0x1234, 0x1005 TEXT runtime·sigaltstack(SB),NOSPLIT,$-8 - MOVL $186, AX // sigaltstack + MOVL $SYS_sigaltstack, AX MOVL new+0(FP), BX MOVL old+4(FP), CX INVOKE_SYSCALL @@ -483,7 +610,7 @@ TEXT runtime·setldt(SB),NOSPLIT,$32 // call set_thread_area MOVL AX, BX // user_desc - MOVL $243, AX // syscall - set_thread_area + MOVL $SYS_set_thread_area, AX // We can't call this via 0x10(GS) because this is called from setldt0 to set that up. 
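The mmap stubs in this file, and in every other port touched by this patch, now fill a (p, err) result pair instead of encoding failure in the returned pointer, so callers test err directly rather than decoding error values out of the address. A hedged sketch of the caller side, with mmapStub standing in for the assembly routine:

package main

import (
	"fmt"
	"unsafe"
)

// mmapStub models the new two-result convention of the assembly stubs; a real
// build would use the per-OS implementation, not this placeholder.
func mmapStub(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) (p unsafe.Pointer, err int) {
	return nil, 12 // pretend the kernel reported ENOMEM
}

func main() {
	p, err := mmapStub(nil, 1<<20, 0, 0, -1, 0)
	if err != 0 {
		fmt.Println("mmap failed, errno", err) // the error is a plain errno now
		return
	}
	fmt.Println("mapped at", p)
}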
INT $0x80 @@ -509,12 +636,12 @@ TEXT runtime·setldt(SB),NOSPLIT,$32 RET TEXT runtime·osyield(SB),NOSPLIT,$0 - MOVL $158, AX + MOVL $SYS_sched_yield, AX INVOKE_SYSCALL RET TEXT runtime·sched_getaffinity(SB),NOSPLIT,$0 - MOVL $242, AX // syscall - sched_getaffinity + MOVL $SYS_sched_getaffinity, AX MOVL pid+0(FP), BX MOVL len+4(FP), CX MOVL buf+8(FP), DX @@ -524,7 +651,7 @@ TEXT runtime·sched_getaffinity(SB),NOSPLIT,$0 // int32 runtime·epollcreate(int32 size); TEXT runtime·epollcreate(SB),NOSPLIT,$0 - MOVL $254, AX + MOVL $SYS_epoll_create, AX MOVL size+0(FP), BX INVOKE_SYSCALL MOVL AX, ret+4(FP) @@ -532,7 +659,7 @@ TEXT runtime·epollcreate(SB),NOSPLIT,$0 // int32 runtime·epollcreate1(int32 flags); TEXT runtime·epollcreate1(SB),NOSPLIT,$0 - MOVL $329, AX + MOVL $SYS_epoll_create1, AX MOVL flags+0(FP), BX INVOKE_SYSCALL MOVL AX, ret+4(FP) @@ -540,7 +667,7 @@ TEXT runtime·epollcreate1(SB),NOSPLIT,$0 // func epollctl(epfd, op, fd int32, ev *epollEvent) int TEXT runtime·epollctl(SB),NOSPLIT,$0 - MOVL $255, AX + MOVL $SYS_epoll_ctl, AX MOVL epfd+0(FP), BX MOVL op+4(FP), CX MOVL fd+8(FP), DX @@ -551,7 +678,7 @@ TEXT runtime·epollctl(SB),NOSPLIT,$0 // int32 runtime·epollwait(int32 epfd, EpollEvent *ev, int32 nev, int32 timeout); TEXT runtime·epollwait(SB),NOSPLIT,$0 - MOVL $256, AX + MOVL $SYS_epoll_wait, AX MOVL epfd+0(FP), BX MOVL ev+4(FP), CX MOVL nev+8(FP), DX @@ -562,7 +689,7 @@ TEXT runtime·epollwait(SB),NOSPLIT,$0 // void runtime·closeonexec(int32 fd); TEXT runtime·closeonexec(SB),NOSPLIT,$0 - MOVL $55, AX // fcntl + MOVL $SYS_fcntl, AX MOVL fd+0(FP), BX // fd MOVL $2, CX // F_SETFD MOVL $1, DX // FD_CLOEXEC @@ -571,7 +698,7 @@ TEXT runtime·closeonexec(SB),NOSPLIT,$0 // int access(const char *name, int mode) TEXT runtime·access(SB),NOSPLIT,$0 - MOVL $33, AX // syscall - access + MOVL $SYS_access, AX MOVL name+0(FP), BX MOVL mode+4(FP), CX INVOKE_SYSCALL @@ -582,7 +709,7 @@ TEXT runtime·access(SB),NOSPLIT,$0 TEXT runtime·connect(SB),NOSPLIT,$0-16 // connect is implemented as socketcall(NR_socket, 3, *(rest of args)) // stack already should have fd, addr, addrlen. - MOVL $102, AX // syscall - socketcall + MOVL $SYS_socketcall, AX MOVL $3, BX // connect LEAL fd+0(FP), CX INVOKE_SYSCALL @@ -593,7 +720,7 @@ TEXT runtime·connect(SB),NOSPLIT,$0-16 TEXT runtime·socket(SB),NOSPLIT,$0-16 // socket is implemented as socketcall(NR_socket, 1, *(rest of args)) // stack already should have domain, type, protocol. - MOVL $102, AX // syscall - socketcall + MOVL $SYS_socketcall, AX MOVL $1, BX // socket LEAL domain+0(FP), CX INVOKE_SYSCALL @@ -603,7 +730,7 @@ TEXT runtime·socket(SB),NOSPLIT,$0-16 // func sbrk0() uintptr TEXT runtime·sbrk0(SB),NOSPLIT,$0-4 // Implemented as brk(NULL). 
- MOVL $45, AX // syscall - brk + MOVL $SYS_brk, AX MOVL $0, BX // NULL INVOKE_SYSCALL MOVL AX, ret+0(FP) diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s index e0dc3e1264..5a94bda7c2 100644 --- a/src/runtime/sys_linux_amd64.s +++ b/src/runtime/sys_linux_amd64.s @@ -10,23 +10,65 @@ #include "go_tls.h" #include "textflag.h" +#define SYS_read 0 +#define SYS_write 1 +#define SYS_open 2 +#define SYS_close 3 +#define SYS_mmap 9 +#define SYS_munmap 11 +#define SYS_brk 12 +#define SYS_rt_sigaction 13 +#define SYS_rt_sigprocmask 14 +#define SYS_rt_sigreturn 15 +#define SYS_access 21 +#define SYS_sched_yield 24 +#define SYS_mincore 27 +#define SYS_madvise 28 +#define SYS_setittimer 38 +#define SYS_getpid 39 +#define SYS_socket 41 +#define SYS_connect 42 +#define SYS_clone 56 +#define SYS_exit 60 +#define SYS_kill 62 +#define SYS_fcntl 72 +#define SYS_getrlimit 97 +#define SYS_sigaltstack 131 +#define SYS_arch_prctl 158 +#define SYS_gettid 186 +#define SYS_tkill 200 +#define SYS_futex 202 +#define SYS_sched_getaffinity 204 +#define SYS_epoll_create 213 +#define SYS_exit_group 231 +#define SYS_epoll_wait 232 +#define SYS_epoll_ctl 233 +#define SYS_pselect6 270 +#define SYS_epoll_create1 291 + TEXT runtime·exit(SB),NOSPLIT,$0-4 MOVL code+0(FP), DI - MOVL $231, AX // exitgroup - force all os threads to exit + MOVL $SYS_exit_group, AX SYSCALL RET -TEXT runtime·exit1(SB),NOSPLIT,$0-4 - MOVL code+0(FP), DI - MOVL $60, AX // exit - exit the current os thread +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$0-8 + MOVQ wait+0(FP), AX + // We're done using the stack. + MOVL $0, (AX) + MOVL $0, DI // exit code + MOVL $SYS_exit, AX SYSCALL - RET + // We may not even have a stack any more. + INT $3 + JMP 0(PC) TEXT runtime·open(SB),NOSPLIT,$0-20 MOVQ name+0(FP), DI MOVL mode+8(FP), SI MOVL perm+12(FP), DX - MOVL $2, AX // syscall entry + MOVL $SYS_open, AX SYSCALL CMPQ AX, $0xfffffffffffff001 JLS 2(PC) @@ -36,7 +78,7 @@ TEXT runtime·open(SB),NOSPLIT,$0-20 TEXT runtime·closefd(SB),NOSPLIT,$0-12 MOVL fd+0(FP), DI - MOVL $3, AX // syscall entry + MOVL $SYS_close, AX SYSCALL CMPQ AX, $0xfffffffffffff001 JLS 2(PC) @@ -48,7 +90,7 @@ TEXT runtime·write(SB),NOSPLIT,$0-28 MOVQ fd+0(FP), DI MOVQ p+8(FP), SI MOVL n+16(FP), DX - MOVL $1, AX // syscall entry + MOVL $SYS_write, AX SYSCALL CMPQ AX, $0xfffffffffffff001 JLS 2(PC) @@ -60,7 +102,7 @@ TEXT runtime·read(SB),NOSPLIT,$0-28 MOVL fd+0(FP), DI MOVQ p+8(FP), SI MOVL n+16(FP), DX - MOVL $0, AX // syscall entry + MOVL $SYS_read, AX SYSCALL CMPQ AX, $0xfffffffffffff001 JLS 2(PC) @@ -71,7 +113,7 @@ TEXT runtime·read(SB),NOSPLIT,$0-28 TEXT runtime·getrlimit(SB),NOSPLIT,$0-20 MOVL kind+0(FP), DI MOVQ limit+8(FP), SI - MOVL $97, AX // syscall entry + MOVL $SYS_getrlimit, AX SYSCALL MOVL AX, ret+16(FP) RET @@ -93,31 +135,31 @@ TEXT runtime·usleep(SB),NOSPLIT,$16 MOVL $0, R10 MOVQ SP, R8 MOVL $0, R9 - MOVL $270, AX + MOVL $SYS_pselect6, AX SYSCALL RET TEXT runtime·gettid(SB),NOSPLIT,$0-4 - MOVL $186, AX // syscall - gettid + MOVL $SYS_gettid, AX SYSCALL MOVL AX, ret+0(FP) RET TEXT runtime·raise(SB),NOSPLIT,$0 - MOVL $186, AX // syscall - gettid + MOVL $SYS_gettid, AX SYSCALL MOVL AX, DI // arg 1 tid MOVL sig+0(FP), SI // arg 2 - MOVL $200, AX // syscall - tkill + MOVL $SYS_tkill, AX SYSCALL RET TEXT runtime·raiseproc(SB),NOSPLIT,$0 - MOVL $39, AX // syscall - getpid + MOVL $SYS_getpid, AX SYSCALL MOVL AX, DI // arg 1 pid MOVL sig+0(FP), SI // arg 2 - MOVL $62, AX // syscall - kill + MOVL $SYS_kill, AX SYSCALL RET @@ -125,7 +167,7 @@ 
TEXT runtime·setitimer(SB),NOSPLIT,$0-24 MOVL mode+0(FP), DI MOVQ new+8(FP), SI MOVQ old+16(FP), DX - MOVL $38, AX // syscall entry + MOVL $SYS_setittimer, AX SYSCALL RET @@ -133,17 +175,37 @@ TEXT runtime·mincore(SB),NOSPLIT,$0-28 MOVQ addr+0(FP), DI MOVQ n+8(FP), SI MOVQ dst+16(FP), DX - MOVL $27, AX // syscall entry + MOVL $SYS_mincore, AX SYSCALL MOVL AX, ret+24(FP) RET // func walltime() (sec int64, nsec int32) -TEXT runtime·walltime(SB),NOSPLIT,$16 - // Be careful. We're calling a function with gcc calling convention here. - // We're guaranteed 128 bytes on entry, and we've taken 16, and the - // call uses another 8. - // That leaves 104 for the gettime code to use. Hope that's enough! +TEXT runtime·walltime(SB),NOSPLIT,$0-12 + // We don't know how much stack space the VDSO code will need, + // so switch to g0. + // In particular, a kernel configured with CONFIG_OPTIMIZE_INLINING=n + // and hardening can use a full page of stack space in gettime_sym + // due to stack probes inserted to avoid stack/heap collisions. + // See issue #20427. + + MOVQ SP, BP // Save old SP; BP unchanged by C code. + + get_tls(CX) + MOVQ g(CX), AX + MOVQ g_m(AX), CX + MOVQ m_curg(CX), DX + + CMPQ AX, DX // Only switch if on curg. + JNE noswitch + + MOVQ m_g0(CX), DX + MOVQ (g_sched+gobuf_sp)(DX), SP // Set SP to g0 stack + +noswitch: + SUBQ $16, SP // Space for results + ANDQ $~15, SP // Align for C code + MOVQ runtime·__vdso_clock_gettime_sym(SB), AX CMPQ AX, $0 JEQ fallback @@ -152,6 +214,7 @@ TEXT runtime·walltime(SB),NOSPLIT,$16 CALL AX MOVQ 0(SP), AX // sec MOVQ 8(SP), DX // nsec + MOVQ BP, SP // Restore real SP MOVQ AX, sec+0(FP) MOVL DX, nsec+8(FP) RET @@ -163,13 +226,31 @@ fallback: MOVQ 0(SP), AX // sec MOVL 8(SP), DX // usec IMULQ $1000, DX + MOVQ BP, SP // Restore real SP MOVQ AX, sec+0(FP) MOVL DX, nsec+8(FP) RET -TEXT runtime·nanotime(SB),NOSPLIT,$16 - // Duplicate time.now here to avoid using up precious stack space. - // See comment above in time.now. +TEXT runtime·nanotime(SB),NOSPLIT,$0-8 + // Switch to g0 stack. See comment above in runtime·walltime. + + MOVQ SP, BP // Save old SP; BX unchanged by C code. + + get_tls(CX) + MOVQ g(CX), AX + MOVQ g_m(AX), CX + MOVQ m_curg(CX), DX + + CMPQ AX, DX // Only switch if on curg. 
+ JNE noswitch + + MOVQ m_g0(CX), DX + MOVQ (g_sched+gobuf_sp)(DX), SP // Set SP to g0 stack + +noswitch: + SUBQ $16, SP // Space for results + ANDQ $~15, SP // Align for C code + MOVQ runtime·__vdso_clock_gettime_sym(SB), AX CMPQ AX, $0 JEQ fallback @@ -178,6 +259,7 @@ TEXT runtime·nanotime(SB),NOSPLIT,$16 CALL AX MOVQ 0(SP), AX // sec MOVQ 8(SP), DX // nsec + MOVQ BP, SP // Restore real SP // sec is in AX, nsec in DX // return nsec in AX IMULQ $1000000000, AX @@ -191,6 +273,7 @@ fallback: CALL AX MOVQ 0(SP), AX // sec MOVL 8(SP), DX // usec + MOVQ BP, SP // Restore real SP IMULQ $1000, DX // sec is in AX, nsec in DX // return nsec in AX @@ -204,7 +287,7 @@ TEXT runtime·rtsigprocmask(SB),NOSPLIT,$0-28 MOVQ new+8(FP), SI MOVQ old+16(FP), DX MOVL size+24(FP), R10 - MOVL $14, AX // syscall entry + MOVL $SYS_rt_sigprocmask, AX SYSCALL CMPQ AX, $0xfffffffffffff001 JLS 2(PC) @@ -216,7 +299,7 @@ TEXT runtime·sysSigaction(SB),NOSPLIT,$0-36 MOVQ new+8(FP), SI MOVQ old+16(FP), DX MOVQ size+24(FP), R10 - MOVL $13, AX // syscall entry + MOVL $SYS_rt_sigaction, AX SYSCALL MOVL AX, ret+32(FP) RET @@ -354,7 +437,7 @@ sigtrampnog: // The code that cares about the precise instructions used is: // https://gcc.gnu.org/viewcvs/gcc/trunk/libgcc/config/i386/linux-unwind.h?revision=219188&view=markup TEXT runtime·sigreturn(SB),NOSPLIT,$0 - MOVQ $15, AX // rt_sigreturn + MOVQ $SYS_rt_sigreturn, AX SYSCALL INT $3 // not reached @@ -366,13 +449,18 @@ TEXT runtime·sysMmap(SB),NOSPLIT,$0 MOVL fd+24(FP), R8 MOVL off+28(FP), R9 - MOVL $9, AX // mmap + MOVL $SYS_mmap, AX SYSCALL CMPQ AX, $0xfffffffffffff001 - JLS 3(PC) + JLS ok NOTQ AX INCQ AX - MOVQ AX, ret+32(FP) + MOVQ $0, p+32(FP) + MOVQ AX, err+40(FP) + RET +ok: + MOVQ AX, p+32(FP) + MOVQ $0, err+40(FP) RET // Call the function stored in _cgo_mmap using the GCC calling convention. @@ -396,7 +484,7 @@ TEXT runtime·callCgoMmap(SB),NOSPLIT,$16 TEXT runtime·sysMunmap(SB),NOSPLIT,$0 MOVQ addr+0(FP), DI MOVQ n+8(FP), SI - MOVQ $11, AX // munmap + MOVQ $SYS_munmap, AX SYSCALL CMPQ AX, $0xfffffffffffff001 JLS 2(PC) @@ -420,7 +508,7 @@ TEXT runtime·madvise(SB),NOSPLIT,$0 MOVQ addr+0(FP), DI MOVQ n+8(FP), SI MOVL flags+16(FP), DX - MOVQ $28, AX // madvise + MOVQ $SYS_madvise, AX SYSCALL // ignore failure - maybe pages are locked RET @@ -434,7 +522,7 @@ TEXT runtime·futex(SB),NOSPLIT,$0 MOVQ ts+16(FP), R10 MOVQ addr2+24(FP), R8 MOVL val3+32(FP), R9 - MOVL $202, AX + MOVL $SYS_futex, AX SYSCALL MOVL AX, ret+40(FP) RET @@ -452,7 +540,7 @@ TEXT runtime·clone(SB),NOSPLIT,$0 MOVQ gp+24(FP), R9 MOVQ fn+32(FP), R12 - MOVL $56, AX + MOVL $SYS_clone, AX SYSCALL // In parent, return. @@ -471,7 +559,7 @@ TEXT runtime·clone(SB),NOSPLIT,$0 JEQ nog // Initialize m->procid to Linux tid - MOVL $186, AX // gettid + MOVL $SYS_gettid, AX SYSCALL MOVQ AX, m_procid(R8) @@ -491,14 +579,14 @@ nog: // It shouldn't return. If it does, exit that thread. 
MOVL $111, DI - MOVL $60, AX + MOVL $SYS_exit, AX SYSCALL JMP -3(PC) // keep exiting TEXT runtime·sigaltstack(SB),NOSPLIT,$-8 MOVQ new+0(FP), DI MOVQ old+8(FP), SI - MOVQ $131, AX + MOVQ $SYS_sigaltstack, AX SYSCALL CMPQ AX, $0xfffffffffffff001 JLS 2(PC) @@ -517,7 +605,7 @@ TEXT runtime·settls(SB),NOSPLIT,$32 #endif MOVQ DI, SI MOVQ $0x1002, DI // ARCH_SET_FS - MOVQ $158, AX // arch_prctl + MOVQ $SYS_arch_prctl, AX SYSCALL CMPQ AX, $0xfffffffffffff001 JLS 2(PC) @@ -525,7 +613,7 @@ TEXT runtime·settls(SB),NOSPLIT,$32 RET TEXT runtime·osyield(SB),NOSPLIT,$0 - MOVL $24, AX + MOVL $SYS_sched_yield, AX SYSCALL RET @@ -533,7 +621,7 @@ TEXT runtime·sched_getaffinity(SB),NOSPLIT,$0 MOVQ pid+0(FP), DI MOVQ len+8(FP), SI MOVQ buf+16(FP), DX - MOVL $204, AX // syscall entry + MOVL $SYS_sched_getaffinity, AX SYSCALL MOVL AX, ret+24(FP) RET @@ -541,7 +629,7 @@ TEXT runtime·sched_getaffinity(SB),NOSPLIT,$0 // int32 runtime·epollcreate(int32 size); TEXT runtime·epollcreate(SB),NOSPLIT,$0 MOVL size+0(FP), DI - MOVL $213, AX // syscall entry + MOVL $SYS_epoll_create, AX SYSCALL MOVL AX, ret+8(FP) RET @@ -549,7 +637,7 @@ TEXT runtime·epollcreate(SB),NOSPLIT,$0 // int32 runtime·epollcreate1(int32 flags); TEXT runtime·epollcreate1(SB),NOSPLIT,$0 MOVL flags+0(FP), DI - MOVL $291, AX // syscall entry + MOVL $SYS_epoll_create1, AX SYSCALL MOVL AX, ret+8(FP) RET @@ -560,7 +648,7 @@ TEXT runtime·epollctl(SB),NOSPLIT,$0 MOVL op+4(FP), SI MOVL fd+8(FP), DX MOVQ ev+16(FP), R10 - MOVL $233, AX // syscall entry + MOVL $SYS_epoll_ctl, AX SYSCALL MOVL AX, ret+24(FP) RET @@ -571,7 +659,7 @@ TEXT runtime·epollwait(SB),NOSPLIT,$0 MOVQ ev+8(FP), SI MOVL nev+16(FP), DX MOVL timeout+20(FP), R10 - MOVL $232, AX // syscall entry + MOVL $SYS_epoll_wait, AX SYSCALL MOVL AX, ret+24(FP) RET @@ -581,7 +669,7 @@ TEXT runtime·closeonexec(SB),NOSPLIT,$0 MOVL fd+0(FP), DI // fd MOVQ $2, SI // F_SETFD MOVQ $1, DX // FD_CLOEXEC - MOVL $72, AX // fcntl + MOVL $SYS_fcntl, AX SYSCALL RET @@ -590,7 +678,7 @@ TEXT runtime·closeonexec(SB),NOSPLIT,$0 TEXT runtime·access(SB),NOSPLIT,$0 MOVQ name+0(FP), DI MOVL mode+8(FP), SI - MOVL $21, AX // syscall entry + MOVL $SYS_access, AX SYSCALL MOVL AX, ret+16(FP) RET @@ -600,7 +688,7 @@ TEXT runtime·connect(SB),NOSPLIT,$0-28 MOVL fd+0(FP), DI MOVQ addr+8(FP), SI MOVL len+16(FP), DX - MOVL $42, AX // syscall entry + MOVL $SYS_connect, AX SYSCALL MOVL AX, ret+24(FP) RET @@ -610,7 +698,7 @@ TEXT runtime·socket(SB),NOSPLIT,$0-20 MOVL domain+0(FP), DI MOVL typ+4(FP), SI MOVL prot+8(FP), DX - MOVL $41, AX // syscall entry + MOVL $SYS_socket, AX SYSCALL MOVL AX, ret+16(FP) RET @@ -619,7 +707,7 @@ TEXT runtime·socket(SB),NOSPLIT,$0-20 TEXT runtime·sbrk0(SB),NOSPLIT,$0-8 // Implemented as brk(NULL). MOVQ $0, DI - MOVL $12, AX // syscall entry + MOVL $SYS_brk, AX SYSCALL MOVQ AX, ret+0(FP) RET diff --git a/src/runtime/sys_linux_arm.s b/src/runtime/sys_linux_arm.s index 64beed8f68..794f9b39a6 100644 --- a/src/runtime/sys_linux_arm.s +++ b/src/runtime/sys_linux_arm.s @@ -114,7 +114,7 @@ TEXT runtime·exit(SB),NOSPLIT,$-4 MOVW $1002, R1 MOVW R0, (R1) // fail hard -TEXT runtime·exit1(SB),NOSPLIT,$-4 +TEXT exit1<>(SB),NOSPLIT,$-4 MOVW code+0(FP), R0 MOVW $SYS_exit, R7 SWI $0 @@ -122,6 +122,22 @@ TEXT runtime·exit1(SB),NOSPLIT,$-4 MOVW $1003, R1 MOVW R0, (R1) // fail hard +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$-4-4 + MOVW wait+0(FP), R0 + // We're done using the stack. + // Alas, there's no reliable way to make this write atomic + // without potentially using the stack. So it goes. 
+ MOVW $0, R1 + MOVW R1, (R0) + MOVW $0, R0 // exit code + MOVW $SYS_exit, R7 + SWI $0 + MOVW $1234, R0 + MOVW $1004, R1 + MOVW R0, (R1) // fail hard + JMP 0(PC) + TEXT runtime·gettid(SB),NOSPLIT,$0-4 MOVW $SYS_gettid, R7 SWI $0 @@ -157,8 +173,12 @@ TEXT runtime·mmap(SB),NOSPLIT,$0 SWI $0 MOVW $0xfffff001, R6 CMP R6, R0 + MOVW $0, R1 RSB.HI $0, R0 - MOVW R0, ret+24(FP) + MOVW.HI R0, R1 // if error, put in R1 + MOVW.HI $0, R0 + MOVW R0, p+24(FP) + MOVW R1, err+28(FP) RET TEXT runtime·munmap(SB),NOSPLIT,$0 @@ -317,7 +337,7 @@ nog: SUB $16, R13 // restore the stack pointer to avoid memory corruption MOVW $0, R0 MOVW R0, 4(R13) - BL runtime·exit1(SB) + BL exit1<>(SB) MOVW $1234, R0 MOVW $1005, R1 diff --git a/src/runtime/sys_linux_arm64.s b/src/runtime/sys_linux_arm64.s index e921f9906c..758e68575b 100644 --- a/src/runtime/sys_linux_arm64.s +++ b/src/runtime/sys_linux_arm64.s @@ -54,11 +54,16 @@ TEXT runtime·exit(SB),NOSPLIT,$-8-4 SVC RET -TEXT runtime·exit1(SB),NOSPLIT,$-8-4 - MOVW code+0(FP), R0 +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$-8-8 + MOVD wait+0(FP), R0 + // We're done using the stack. + MOVW $0, R1 + STLRW R1, (R0) + MOVW $0, R0 // exit code MOVD $SYS_exit, R8 SVC - RET + JMP 0(PC) TEXT runtime·open(SB),NOSPLIT,$-8-20 MOVD $AT_FDCWD, R0 @@ -273,9 +278,14 @@ TEXT runtime·mmap(SB),NOSPLIT,$-8 MOVD $SYS_mmap, R8 SVC CMN $4095, R0 - BCC 2(PC) + BCC ok NEG R0,R0 - MOVD R0, ret+32(FP) + MOVD $0, p+32(FP) + MOVD R0, err+40(FP) + RET +ok: + MOVD R0, p+32(FP) + MOVD $0, err+40(FP) RET TEXT runtime·munmap(SB),NOSPLIT,$-8 diff --git a/src/runtime/sys_linux_mips64x.s b/src/runtime/sys_linux_mips64x.s index 27de7b0901..7402ae21d4 100644 --- a/src/runtime/sys_linux_mips64x.s +++ b/src/runtime/sys_linux_mips64x.s @@ -53,11 +53,18 @@ TEXT runtime·exit(SB),NOSPLIT,$-8-4 SYSCALL RET -TEXT runtime·exit1(SB),NOSPLIT,$-8-4 - MOVW code+0(FP), R4 +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$-8-8 + MOVV wait+0(FP), R1 + // We're done using the stack. + MOVW $0, R2 + SYNC + MOVW R2, (R1) + SYNC + MOVW $0, R4 // exit code MOVV $SYS_exit, R2 SYSCALL - RET + JMP 0(PC) TEXT runtime·open(SB),NOSPLIT,$-8-20 MOVV name+0(FP), R4 @@ -262,7 +269,13 @@ TEXT runtime·mmap(SB),NOSPLIT,$-8 MOVV $SYS_mmap, R2 SYSCALL - MOVV R2, ret+32(FP) + BEQ R7, ok + MOVV $0, p+32(FP) + MOVV R2, err+40(FP) + RET +ok: + MOVV R2, p+32(FP) + MOVV $0, err+40(FP) RET TEXT runtime·munmap(SB),NOSPLIT,$-8 diff --git a/src/runtime/sys_linux_mipsx.s b/src/runtime/sys_linux_mipsx.s index 39bd731a4d..6bd0267ea2 100644 --- a/src/runtime/sys_linux_mipsx.s +++ b/src/runtime/sys_linux_mipsx.s @@ -54,12 +54,19 @@ TEXT runtime·exit(SB),NOSPLIT,$0-4 UNDEF RET -TEXT runtime·exit1(SB),NOSPLIT,$0-4 - MOVW code+0(FP), R4 +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$0-4 + MOVW wait+0(FP), R1 + // We're done using the stack. 
+ MOVW $0, R2 + SYNC + MOVW R2, (R1) + SYNC + MOVW $0, R4 // exit code MOVW $SYS_exit, R2 SYSCALL UNDEF - RET + JMP 0(PC) TEXT runtime·open(SB),NOSPLIT,$0-16 MOVW name+0(FP), R4 @@ -272,7 +279,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$12 TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0 JMP runtime·sigtramp(SB) -TEXT runtime·mmap(SB),NOSPLIT,$20-28 +TEXT runtime·mmap(SB),NOSPLIT,$20-32 MOVW addr+0(FP), R4 MOVW n+4(FP), R5 MOVW prot+8(FP), R6 @@ -284,7 +291,13 @@ TEXT runtime·mmap(SB),NOSPLIT,$20-28 MOVW $SYS_mmap, R2 SYSCALL - MOVW R2, ret+24(FP) + BEQ R7, ok + MOVW $0, p+24(FP) + MOVW R2, err+28(FP) + RET +ok: + MOVW R2, p+24(FP) + MOVW $0, err+28(FP) RET TEXT runtime·munmap(SB),NOSPLIT,$0-8 diff --git a/src/runtime/sys_linux_ppc64x.s b/src/runtime/sys_linux_ppc64x.s index 2b2aa61d06..9b45f94e65 100644 --- a/src/runtime/sys_linux_ppc64x.s +++ b/src/runtime/sys_linux_ppc64x.s @@ -54,10 +54,16 @@ TEXT runtime·exit(SB),NOSPLIT|NOFRAME,$0-4 SYSCALL $SYS_exit_group RET -TEXT runtime·exit1(SB),NOSPLIT|NOFRAME,$0-4 - MOVW code+0(FP), R3 +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT|NOFRAME,$0-8 + MOVD wait+0(FP), R1 + // We're done using the stack. + MOVW $0, R2 + SYNC + MOVW R2, (R1) + MOVW $0, R3 // exit code SYSCALL $SYS_exit - RET + JMP 0(PC) TEXT runtime·open(SB),NOSPLIT|NOFRAME,$0-20 MOVD name+0(FP), R3 @@ -244,7 +250,96 @@ TEXT runtime·_sigtramp(SB),NOSPLIT,$64 #ifdef GOARCH_ppc64le // ppc64le doesn't need function descriptors -TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0 +TEXT runtime·cgoSigtramp(SB),NOSPLIT|NOFRAME,$0 + // The stack unwinder, presumably written in C, may not be able to + // handle Go frame correctly. So, this function is NOFRAME, and we + // we save/restore LR manually. + MOVD LR, R10 + + // We're coming from C code, initialize essential registers. + CALL runtime·reginit(SB) + + // If no traceback function, do usual sigtramp. + MOVD runtime·cgoTraceback(SB), R6 + CMP $0, R6 + BEQ sigtramp + + // If no traceback support function, which means that + // runtime/cgo was not linked in, do usual sigtramp. + MOVD _cgo_callers(SB), R6 + CMP $0, R6 + BEQ sigtramp + + // Set up g register. + CALL runtime·load_g(SB) + + // Figure out if we are currently in a cgo call. + // If not, just do usual sigtramp. + CMP $0, g + BEQ sigtrampnog // g == nil + MOVD g_m(g), R6 + CMP $0, R6 + BEQ sigtramp // g.m == nil + MOVW m_ncgo(R6), R7 + CMPW $0, R7 + BEQ sigtramp // g.m.ncgo = 0 + MOVD m_curg(R6), R7 + CMP $0, R7 + BEQ sigtramp // g.m.curg == nil + MOVD g_syscallsp(R7), R7 + CMP $0, R7 + BEQ sigtramp // g.m.curg.syscallsp == 0 + MOVD m_cgoCallers(R6), R7 // R7 is the fifth arg in C calling convention. + CMP $0, R7 + BEQ sigtramp // g.m.cgoCallers == nil + MOVW m_cgoCallersUse(R6), R8 + CMPW $0, R8 + BNE sigtramp // g.m.cgoCallersUse != 0 + + // Jump to a function in runtime/cgo. + // That function, written in C, will call the user's traceback + // function with proper unwind info, and will then call back here. + // The first three arguments, and the fifth, are already in registers. + // Set the two remaining arguments now. + MOVD runtime·cgoTraceback(SB), R6 + MOVD $runtime·sigtramp(SB), R8 + MOVD _cgo_callers(SB), R12 + MOVD R12, CTR + MOVD R10, LR // restore LR + JMP (CTR) + +sigtramp: + MOVD R10, LR // restore LR + JMP runtime·sigtramp(SB) + +sigtrampnog: + // Signal arrived on a non-Go thread. If this is SIGPROF, get a + // stack trace. + CMPW R3, $27 // 27 == SIGPROF + BNE sigtramp + + // Lock sigprofCallersUse (cas from 0 to 1). 
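That lock is a single compare-and-swap: the LWAR/STWCCC sequence below flips sigprofCallersUse from 0 to 1, and if the buffer is already claimed the handler simply takes the ordinary sigtramp path instead of waiting. The Go-level idiom is a try-lock built from CompareAndSwap; a minimal sketch with illustrative names (inUse, tryWithBuffer):

package main

import (
	"fmt"
	"sync/atomic"
)

var inUse uint32 // 0 = free, 1 = held, playing the role of sigprofCallersUse

// tryWithBuffer runs f only if it can claim the buffer; otherwise the caller
// falls back, just as the signal handler falls back to the ordinary sigtramp.
func tryWithBuffer(f func()) bool {
	if !atomic.CompareAndSwapUint32(&inUse, 0, 1) {
		return false // someone else is using the buffer
	}
	f()
	atomic.StoreUint32(&inUse, 0) // release the claim
	return true
}

func main() {
	if tryWithBuffer(func() { fmt.Println("buffer held exclusively") }) {
		fmt.Println("claimed and released")
	}
}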
+ MOVW $1, R7 + MOVD $runtime·sigprofCallersUse(SB), R8 + SYNC + LWAR (R8), R6 + CMPW $0, R6 + BNE sigtramp + STWCCC R7, (R8) + BNE -4(PC) + ISYNC + + // Jump to the traceback function in runtime/cgo. + // It will call back to sigprofNonGo, which will ignore the + // arguments passed in registers. + // First three arguments to traceback function are in registers already. + MOVD runtime·cgoTraceback(SB), R6 + MOVD $runtime·sigprofCallers(SB), R7 + MOVD $runtime·sigprofNonGoWrapper<>(SB), R8 + MOVD _cgo_callers(SB), R12 + MOVD R12, CTR + MOVD R10, LR // restore LR + JMP (CTR) #else // function descriptor for the real sigtramp TEXT runtime·cgoSigtramp(SB),NOSPLIT|NOFRAME,$0 @@ -252,10 +347,14 @@ TEXT runtime·cgoSigtramp(SB),NOSPLIT|NOFRAME,$0 DWORD $0 DWORD $0 TEXT runtime·_cgoSigtramp(SB),NOSPLIT,$0 + JMP runtime·sigtramp(SB) #endif - MOVD $runtime·sigtramp(SB), R12 - MOVD R12, CTR - JMP (CTR) + +TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT,$0 + // We're coming from C code, set up essential register, then call sigprofNonGo. + CALL runtime·reginit(SB) + CALL runtime·sigprofNonGo(SB) + RET TEXT runtime·mmap(SB),NOSPLIT|NOFRAME,$0 MOVD addr+0(FP), R3 @@ -266,7 +365,13 @@ TEXT runtime·mmap(SB),NOSPLIT|NOFRAME,$0 MOVW off+28(FP), R8 SYSCALL $SYS_mmap - MOVD R3, ret+32(FP) + BVC ok + MOVD $0, p+32(FP) + MOVD R3, err+40(FP) + RET +ok: + MOVD R3, p+32(FP) + MOVD $0, err+40(FP) RET TEXT runtime·munmap(SB),NOSPLIT|NOFRAME,$0 diff --git a/src/runtime/sys_linux_s390x.s b/src/runtime/sys_linux_s390x.s index b8099e2553..72b024434f 100644 --- a/src/runtime/sys_linux_s390x.s +++ b/src/runtime/sys_linux_s390x.s @@ -49,11 +49,16 @@ TEXT runtime·exit(SB),NOSPLIT|NOFRAME,$0-4 SYSCALL RET -TEXT runtime·exit1(SB),NOSPLIT|NOFRAME,$0-4 - MOVW code+0(FP), R2 +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT|NOFRAME,$0-8 + MOVD wait+0(FP), R1 + // We're done using the stack. + MOVW $0, R2 + MOVW R2, (R1) + MOVW $0, R2 // exit code MOVW $SYS_exit, R1 SYSCALL - RET + JMP 0(PC) TEXT runtime·open(SB),NOSPLIT|NOFRAME,$0-20 MOVD name+0(FP), R2 @@ -246,7 +251,7 @@ TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0 BR runtime·sigtramp(SB) // func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) unsafe.Pointer -TEXT runtime·mmap(SB),NOSPLIT,$48-40 +TEXT runtime·mmap(SB),NOSPLIT,$48-48 MOVD addr+0(FP), R2 MOVD n+8(FP), R3 MOVW prot+16(FP), R4 @@ -267,9 +272,14 @@ TEXT runtime·mmap(SB),NOSPLIT,$48-40 MOVW $SYS_mmap, R1 SYSCALL MOVD $-4095, R3 - CMPUBLT R2, R3, 2(PC) + CMPUBLT R2, R3, ok NEG R2 - MOVD R2, ret+32(FP) + MOVD $0, p+32(FP) + MOVD R2, err+40(FP) + RET +ok: + MOVD R2, p+32(FP) + MOVD $0, err+40(FP) RET TEXT runtime·munmap(SB),NOSPLIT|NOFRAME,$0 diff --git a/src/runtime/sys_nacl_386.s b/src/runtime/sys_nacl_386.s index d945453970..cdc8ff1a02 100644 --- a/src/runtime/sys_nacl_386.s +++ b/src/runtime/sys_nacl_386.s @@ -16,11 +16,13 @@ TEXT runtime·exit(SB),NOSPLIT,$4 NACL_SYSCALL(SYS_exit) JMP 0(PC) -TEXT runtime·exit1(SB),NOSPLIT,$4 - MOVL code+0(FP), AX +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$4-4 + MOVL wait+0(FP), AX + // SYS_thread_exit will clear *wait when the stack is free. 
MOVL AX, 0(SP) NACL_SYSCALL(SYS_thread_exit) - RET + JMP 0(PC) TEXT runtime·open(SB),NOSPLIT,$12 MOVL name+0(FP), AX @@ -228,9 +230,14 @@ TEXT runtime·mmap(SB),NOSPLIT,$32 MOVL AX, 20(SP) NACL_SYSCALL(SYS_mmap) CMPL AX, $-4095 - JNA 2(PC) + JNA ok NEGL AX - MOVL AX, ret+24(FP) + MOVL $0, p+24(FP) + MOVL AX, err+28(FP) + RET +ok: + MOVL AX, p+24(FP) + MOVL $0, err+28(FP) RET TEXT runtime·walltime(SB),NOSPLIT,$20 diff --git a/src/runtime/sys_nacl_amd64p32.s b/src/runtime/sys_nacl_amd64p32.s index 2a3998391c..ff4c2e7bb5 100644 --- a/src/runtime/sys_nacl_amd64p32.s +++ b/src/runtime/sys_nacl_amd64p32.s @@ -19,10 +19,12 @@ TEXT runtime·exit(SB),NOSPLIT,$0 NACL_SYSCALL(SYS_exit) RET -TEXT runtime·exit1(SB),NOSPLIT,$0 - MOVL code+0(FP), DI +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$0-4 + MOVL wait+0(FP), DI + // SYS_thread_exit will clear *wait when the stack is free. NACL_SYSCALL(SYS_thread_exit) - RET + JMP 0(PC) TEXT runtime·open(SB),NOSPLIT,$0 MOVL name+0(FP), DI @@ -237,9 +239,14 @@ TEXT runtime·mmap(SB),NOSPLIT,$8 MOVL SP, R9 NACL_SYSCALL(SYS_mmap) CMPL AX, $-4095 - JNA 2(PC) + JNA ok NEGL AX - MOVL AX, ret+24(FP) + MOVL $0, p+24(FP) + MOVL AX, err+28(FP) + RET +ok: + MOVL AX, p+24(FP) + MOVL $0, err+28(FP) RET TEXT runtime·walltime(SB),NOSPLIT,$16 diff --git a/src/runtime/sys_nacl_arm.s b/src/runtime/sys_nacl_arm.s index 6a6ef4e6b2..6e01fe42e8 100644 --- a/src/runtime/sys_nacl_arm.s +++ b/src/runtime/sys_nacl_arm.s @@ -15,10 +15,12 @@ TEXT runtime·exit(SB),NOSPLIT,$0 NACL_SYSCALL(SYS_exit) RET -TEXT runtime·exit1(SB),NOSPLIT,$0 - MOVW code+0(FP), R0 +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$4-4 + MOVW wait+0(FP), R0 + // SYS_thread_exit will clear *wait when the stack is free. NACL_SYSCALL(SYS_thread_exit) - RET + JMP 0(PC) TEXT runtime·open(SB),NOSPLIT,$0 MOVW name+0(FP), R0 @@ -192,8 +194,12 @@ TEXT runtime·mmap(SB),NOSPLIT,$8 NACL_SYSCALL(SYS_mmap) MOVM.IA.W (R13), [R4, R5] CMP $-4095, R0 + MOVW $0, R1 RSB.HI $0, R0 - MOVW R0, ret+24(FP) + MOVW.HI R0, R1 // if error, put in R1 + MOVW.HI $0, R0 + MOVW R0, p+24(FP) + MOVW R1, err+28(FP) RET TEXT runtime·walltime(SB),NOSPLIT,$16 diff --git a/src/runtime/sys_netbsd_386.s b/src/runtime/sys_netbsd_386.s index 742193cf28..4042ab4f8a 100644 --- a/src/runtime/sys_netbsd_386.s +++ b/src/runtime/sys_netbsd_386.s @@ -17,12 +17,15 @@ TEXT runtime·exit(SB),NOSPLIT,$-4 MOVL $0xf1, 0xf1 // crash RET -TEXT runtime·exit1(SB),NOSPLIT,$-4 +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$0-4 + MOVL wait+0(FP), AX + // We're done using the stack. 
+ MOVL $0, (AX) MOVL $310, AX // sys__lwp_exit INT $0x80 - JAE 2(PC) MOVL $0xf1, 0xf1 // crash - RET + JMP 0(PC) TEXT runtime·open(SB),NOSPLIT,$-4 MOVL $5, AX @@ -113,7 +116,13 @@ TEXT runtime·mmap(SB),NOSPLIT,$36 STOSL MOVL $197, AX // sys_mmap INT $0x80 - MOVL AX, ret+24(FP) + JAE ok + MOVL $0, p+24(FP) + MOVL AX, err+28(FP) + RET +ok: + MOVL AX, p+24(FP) + MOVL $0, err+28(FP) RET TEXT runtime·munmap(SB),NOSPLIT,$-4 @@ -155,7 +164,7 @@ TEXT runtime·walltime(SB), NOSPLIT, $32 // void nanotime(int64 *nsec) TEXT runtime·nanotime(SB),NOSPLIT,$32 LEAL 12(SP), BX - MOVL $0, 4(SP) // arg 1 - clock_id + MOVL $3, 4(SP) // arg 1 - clock_id CLOCK_MONOTONIC MOVL BX, 8(SP) // arg 2 - tp MOVL $427, AX // sys_clock_gettime INT $0x80 @@ -298,7 +307,7 @@ TEXT runtime·lwp_tramp(SB),NOSPLIT,$0 // Call fn CALL SI - CALL runtime·exit1(SB) + // fn should never return MOVL $0x1234, 0x1005 RET @@ -337,9 +346,9 @@ TEXT runtime·osyield(SB),NOSPLIT,$-4 RET TEXT runtime·lwp_park(SB),NOSPLIT,$-4 - MOVL $434, AX // sys__lwp_park + MOVL $478, AX // sys__lwp_park INT $0x80 - MOVL AX, ret+16(FP) + MOVL AX, ret+24(FP) RET TEXT runtime·lwp_unpark(SB),NOSPLIT,$-4 @@ -366,10 +375,12 @@ TEXT runtime·sysctl(SB),NOSPLIT,$28 MOVSL // arg 6 - newlen MOVL $202, AX // sys___sysctl INT $0x80 - JCC 3(PC) + JAE 4(PC) NEGL AX + MOVL AX, ret+24(FP) RET MOVL $0, AX + MOVL AX, ret+24(FP) RET GLOBL runtime·tlsoffset(SB),NOPTR,$4 diff --git a/src/runtime/sys_netbsd_amd64.s b/src/runtime/sys_netbsd_amd64.s index c632a0b969..11b9c1b417 100644 --- a/src/runtime/sys_netbsd_amd64.s +++ b/src/runtime/sys_netbsd_amd64.s @@ -48,13 +48,15 @@ TEXT runtime·osyield(SB),NOSPLIT,$0 RET TEXT runtime·lwp_park(SB),NOSPLIT,$0 - MOVQ abstime+0(FP), DI // arg 1 - abstime - MOVL unpark+8(FP), SI // arg 2 - unpark - MOVQ hint+16(FP), DX // arg 3 - hint - MOVQ unparkhint+24(FP), R10 // arg 4 - unparkhint - MOVL $434, AX // sys__lwp_park + MOVL clockid+0(FP), DI // arg 1 - clockid + MOVL flags+4(FP), SI // arg 2 - flags + MOVQ ts+8(FP), DX // arg 3 - ts + MOVL unpark+16(FP), R10 // arg 4 - unpark + MOVQ hint+24(FP), R8 // arg 5 - hint + MOVQ unparkhint+32(FP), R9 // arg 6 - unparkhint + MOVL $478, AX // sys__lwp_park SYSCALL - MOVL AX, ret+32(FP) + MOVL AX, ret+40(FP) RET TEXT runtime·lwp_unpark(SB),NOSPLIT,$0 @@ -79,11 +81,15 @@ TEXT runtime·exit(SB),NOSPLIT,$-8 MOVL $0xf1, 0xf1 // crash RET -TEXT runtime·exit1(SB),NOSPLIT,$-8 +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$0-8 + MOVQ wait+0(FP), AX + // We're done using the stack. 
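Both NetBSD nanotime stubs in this patch also change clock_id from 0 (CLOCK_REALTIME) to 3 (CLOCK_MONOTONIC); nanotime feeds Go's monotonic readings, so it must not jump when the wall clock is stepped. A user-level illustration of the wall/monotonic split this preserves:

package main

import (
	"fmt"
	"time"
)

func main() {
	start := time.Now() // carries both a wall reading and a monotonic reading
	time.Sleep(10 * time.Millisecond)

	fmt.Println(time.Since(start))              // computed from the monotonic reading
	fmt.Println(time.Now().Sub(start.Round(0))) // Round(0) strips it: wall-clock difference only
}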
+ MOVL $0, (AX) MOVL $310, AX // sys__lwp_exit SYSCALL MOVL $0xf1, 0xf1 // crash - RET + JMP 0(PC) TEXT runtime·open(SB),NOSPLIT,$-8 MOVQ name+0(FP), DI // arg 1 pathname @@ -184,7 +190,7 @@ TEXT runtime·walltime(SB), NOSPLIT, $32 RET TEXT runtime·nanotime(SB),NOSPLIT,$32 - MOVQ $0, DI // arg 1 - clock_id + MOVQ $3, DI // arg 1 - clock_id CLOCK_MONOTONIC LEAQ 8(SP), SI // arg 2 - tp MOVL $427, AX // sys_clock_gettime SYSCALL @@ -286,8 +292,15 @@ TEXT runtime·mmap(SB),NOSPLIT,$0 MOVQ $0, R9 // arg 6 - pad MOVL $197, AX // sys_mmap SYSCALL + JCC ok + ADDQ $16, SP + MOVQ $0, p+32(FP) + MOVQ AX, err+40(FP) + RET +ok: ADDQ $16, SP - MOVQ AX, ret+32(FP) + MOVQ AX, p+32(FP) + MOVQ $0, err+40(FP) RET TEXT runtime·munmap(SB),NOSPLIT,$0 diff --git a/src/runtime/sys_netbsd_arm.s b/src/runtime/sys_netbsd_arm.s index 789b12ef5b..7d2e290dd9 100644 --- a/src/runtime/sys_netbsd_arm.s +++ b/src/runtime/sys_netbsd_arm.s @@ -18,12 +18,21 @@ TEXT runtime·exit(SB),NOSPLIT,$-4 MOVW.CS R8, (R8) RET -TEXT runtime·exit1(SB),NOSPLIT,$-4 +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$0-4 + MOVW wait+0(FP), R0 + // We're done using the stack. + MOVW $0, R2 +storeloop: + LDREX (R0), R4 // loads R4 + STREX R2, (R0), R1 // stores R2 + CMP $0, R1 + BNE storeloop SWI $0xa00136 // sys__lwp_exit MOVW $1, R8 // crash MOVW R8, (R8) - RET - + JMP 0(PC) + TEXT runtime·open(SB),NOSPLIT,$-8 MOVW name+0(FP), R0 MOVW mode+4(FP), R1 @@ -71,13 +80,17 @@ TEXT runtime·osyield(SB),NOSPLIT,$0 SWI $0xa0015e // sys_sched_yield RET -TEXT runtime·lwp_park(SB),NOSPLIT,$0 - MOVW abstime+0(FP), R0 // arg 1 - abstime - MOVW unpark+4(FP), R1 // arg 2 - unpark - MOVW hint+8(FP), R2 // arg 3 - hint - MOVW unparkhint+12(FP), R3 // arg 4 - unparkhint - SWI $0xa001b2 // sys__lwp_park - MOVW R0, ret+16(FP) +TEXT runtime·lwp_park(SB),NOSPLIT,$8 + MOVW clockid+0(FP), R0 // arg 1 - clock_id + MOVW flags+4(FP), R1 // arg 2 - flags + MOVW ts+8(FP), R2 // arg 3 - ts + MOVW unpark+12(FP), R3 // arg 4 - unpark + MOVW hint+16(FP), R4 // arg 5 - hint + MOVW R4, 4(R13) + MOVW unparkhint+20(FP), R5 // arg 6 - unparkhint + MOVW R5, 8(R13) + SWI $0xa001de // sys__lwp_park + MOVW R0, ret+24(FP) RET TEXT runtime·lwp_unpark(SB),NOSPLIT,$0 @@ -155,7 +168,7 @@ TEXT runtime·walltime(SB), NOSPLIT, $32 // int64 nanotime(void) so really // void nanotime(int64 *nsec) TEXT runtime·nanotime(SB), NOSPLIT, $32 - MOVW $0, R0 // CLOCK_REALTIME + MOVW $3, R0 // CLOCK_MONOTONIC MOVW $8(R13), R1 SWI $0xa001ab // clock_gettime @@ -255,7 +268,11 @@ TEXT runtime·mmap(SB),NOSPLIT,$12 ADD $4, R13 // pass arg 5 and arg 6 on stack SWI $0xa000c5 // sys_mmap SUB $4, R13 - MOVW R0, ret+24(FP) + MOVW $0, R1 + MOVW.CS R0, R1 // if error, move to R1 + MOVW.CS $0, R0 + MOVW R0, p+24(FP) + MOVW R1, err+28(FP) RET TEXT runtime·munmap(SB),NOSPLIT,$0 diff --git a/src/runtime/sys_openbsd_386.s b/src/runtime/sys_openbsd_386.s index fb2a6883df..475a937798 100644 --- a/src/runtime/sys_openbsd_386.s +++ b/src/runtime/sys_openbsd_386.s @@ -19,14 +19,21 @@ TEXT runtime·exit(SB),NOSPLIT,$-4 MOVL $0xf1, 0xf1 // crash RET -TEXT runtime·exit1(SB),NOSPLIT,$8 - MOVL $0, 0(SP) - MOVL $0, 4(SP) // arg 1 - notdead +GLOBL exitStack<>(SB),RODATA,$8 +DATA exitStack<>+0x00(SB)/4, $0 +DATA exitStack<>+0x04(SB)/4, $0 + +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$0-4 + MOVL wait+0(FP), AX + // We're done using the stack. + MOVL $0, (AX) + // sys__lwp_exit takes 1 argument, which it expects on the stack. 
+ MOVL $exitStack<>(SB), SP MOVL $302, AX // sys___threxit INT $0x80 - JAE 2(PC) MOVL $0xf1, 0xf1 // crash - RET + JMP 0(PC) TEXT runtime·open(SB),NOSPLIT,$-4 MOVL $5, AX @@ -118,7 +125,13 @@ TEXT runtime·mmap(SB),NOSPLIT,$36 STOSL MOVL $197, AX // sys_mmap INT $0x80 - MOVL AX, ret+24(FP) + JAE ok + MOVL $0, p+24(FP) + MOVL AX, err+28(FP) + RET +ok: + MOVL AX, p+24(FP) + MOVL $0, err+28(FP) RET TEXT runtime·munmap(SB),NOSPLIT,$-4 @@ -308,7 +321,7 @@ TEXT runtime·tfork(SB),NOSPLIT,$12 // Call fn. CALL SI - CALL runtime·exit1(SB) + // fn should never return. MOVL $0x1234, 0x1005 RET diff --git a/src/runtime/sys_openbsd_amd64.s b/src/runtime/sys_openbsd_amd64.s index 9a52e5d9ef..658f2c49dc 100644 --- a/src/runtime/sys_openbsd_amd64.s +++ b/src/runtime/sys_openbsd_amd64.s @@ -88,12 +88,16 @@ TEXT runtime·exit(SB),NOSPLIT,$-8 MOVL $0xf1, 0xf1 // crash RET -TEXT runtime·exit1(SB),NOSPLIT,$-8 +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$0-8 + MOVQ wait+0(FP), AX + // We're done using the stack. + MOVL $0, (AX) MOVQ $0, DI // arg 1 - notdead MOVL $302, AX // sys___threxit SYSCALL MOVL $0xf1, 0xf1 // crash - RET + JMP 0(PC) TEXT runtime·open(SB),NOSPLIT,$-8 MOVQ name+0(FP), DI // arg 1 pathname @@ -278,8 +282,15 @@ TEXT runtime·mmap(SB),NOSPLIT,$0 MOVQ $0, R9 // arg 6 - pad MOVL $197, AX SYSCALL + JCC ok + ADDQ $16, SP + MOVQ $0, p+32(FP) + MOVQ AX, err+40(FP) + RET +ok: ADDQ $16, SP - MOVQ AX, ret+32(FP) + MOVQ AX, p+32(FP) + MOVQ $0, err+40(FP) RET TEXT runtime·munmap(SB),NOSPLIT,$0 diff --git a/src/runtime/sys_openbsd_arm.s b/src/runtime/sys_openbsd_arm.s index 93a5d5b7f6..ea7538630d 100644 --- a/src/runtime/sys_openbsd_arm.s +++ b/src/runtime/sys_openbsd_arm.s @@ -22,13 +22,22 @@ TEXT runtime·exit(SB),NOSPLIT,$-4 MOVW.CS R8, (R8) RET -TEXT runtime·exit1(SB),NOSPLIT,$-4 +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$0-4 + MOVW wait+0(FP), R0 + // We're done using the stack. + MOVW $0, R2 +storeloop: + LDREX (R0), R4 // loads R4 + STREX R2, (R0), R1 // stores R2 + CMP $0, R1 + BNE storeloop MOVW $0, R0 // arg 1 - notdead MOVW $302, R12 // sys___threxit SWI $0 MOVW.CS $1, R8 // crash on syscall failure MOVW.CS R8, (R8) - RET + JMP 0(PC) TEXT runtime·open(SB),NOSPLIT,$-4 MOVW name+0(FP), R0 // arg 1 - path @@ -120,7 +129,11 @@ TEXT runtime·mmap(SB),NOSPLIT,$16 MOVW $197, R12 // sys_mmap SWI $0 SUB $4, R13 - MOVW R0, ret+24(FP) + MOVW $0, R1 + MOVW.CS R0, R1 // if error, move to R1 + MOVW.CS $0, R0 + MOVW R0, p+24(FP) + MOVW R1, err+28(FP) RET TEXT runtime·munmap(SB),NOSPLIT,$0 @@ -269,7 +282,7 @@ TEXT runtime·tfork(SB),NOSPLIT,$0 // Call fn. BL (R6) - BL runtime·exit1(SB) + // fn should never return. MOVW $2, R8 // crash if reached MOVW R8, (R8) RET diff --git a/src/runtime/sys_plan9_386.s b/src/runtime/sys_plan9_386.s index 688bd2371a..47dcb8db04 100644 --- a/src/runtime/sys_plan9_386.s +++ b/src/runtime/sys_plan9_386.s @@ -139,7 +139,7 @@ TEXT runtime·rfork(SB),NOSPLIT,$0 MOVL AX, ret+4(FP) RET -TEXT runtime·tstart_plan9(SB),NOSPLIT,$0 +TEXT runtime·tstart_plan9(SB),NOSPLIT,$4 MOVL newm+0(FP), CX MOVL m_g0(CX), DX @@ -163,8 +163,10 @@ TEXT runtime·tstart_plan9(SB),NOSPLIT,$0 CALL runtime·stackcheck(SB) // smashes AX, CX CALL runtime·mstart(SB) - MOVL $0x1234, 0x1234 // not reached - RET + // Exit the thread. 
+ MOVL $0, 0(SP) + CALL runtime·exits(SB) + JMP 0(PC) // void sigtramp(void *ureg, int8 *note) TEXT runtime·sigtramp(SB),NOSPLIT,$0 diff --git a/src/runtime/sys_plan9_amd64.s b/src/runtime/sys_plan9_amd64.s index d7bd92c1b4..8077d6d324 100644 --- a/src/runtime/sys_plan9_amd64.s +++ b/src/runtime/sys_plan9_amd64.s @@ -136,7 +136,7 @@ TEXT runtime·rfork(SB),NOSPLIT,$0 MOVL AX, ret+8(FP) RET -TEXT runtime·tstart_plan9(SB),NOSPLIT,$0 +TEXT runtime·tstart_plan9(SB),NOSPLIT,$8 MOVQ newm+0(FP), CX MOVQ m_g0(CX), DX @@ -160,8 +160,10 @@ TEXT runtime·tstart_plan9(SB),NOSPLIT,$0 CALL runtime·stackcheck(SB) // smashes AX, CX CALL runtime·mstart(SB) - MOVQ $0x1234, 0x1234 // not reached - RET + // Exit the thread. + MOVQ $0, 0(SP) + CALL runtime·exits(SB) + JMP 0(PC) // This is needed by asm_amd64.s TEXT runtime·settls(SB),NOSPLIT,$0 diff --git a/src/runtime/sys_plan9_arm.s b/src/runtime/sys_plan9_arm.s index 94a6f63974..efaf60337b 100644 --- a/src/runtime/sys_plan9_arm.s +++ b/src/runtime/sys_plan9_arm.s @@ -207,7 +207,7 @@ TEXT runtime·rfork(SB),NOSPLIT,$0-8 RET //func tstart_plan9(newm *m) -TEXT runtime·tstart_plan9(SB),NOSPLIT,$0-4 +TEXT runtime·tstart_plan9(SB),NOSPLIT,$4-4 MOVW newm+0(FP), R1 MOVW m_g0(R1), g @@ -226,9 +226,11 @@ TEXT runtime·tstart_plan9(SB),NOSPLIT,$0-4 BL runtime·mstart(SB) - MOVW $0x1234, R0 - MOVW R0, 0(R0) // not reached - RET + // Exit the thread. + MOVW $0, R0 + MOVW R0, 4(R13) + CALL runtime·exits(SB) + JMP 0(PC) //func sigtramp(ureg, note unsafe.Pointer) TEXT runtime·sigtramp(SB),NOSPLIT,$0-8 diff --git a/src/runtime/sys_solaris_amd64.s b/src/runtime/sys_solaris_amd64.s index aeb2e2c897..2b6dabab99 100644 --- a/src/runtime/sys_solaris_amd64.s +++ b/src/runtime/sys_solaris_amd64.s @@ -183,9 +183,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$0 JMP exit allgood: - // save g - MOVQ R10, 80(SP) - // Save m->libcall and m->scratch. We need to do this because we // might get interrupted by a signal in runtime·asmcgocall. @@ -223,19 +220,11 @@ allgood: MOVL 0(R10), R10 MOVQ R10, 160(SP) - MOVQ g(BX), R10 - // g = m->gsignal - MOVQ m_gsignal(BP), BP - MOVQ BP, g(BX) - - // TODO: If current SP is not in gsignal.stack, then adjust. - // prepare call MOVQ DI, 0(SP) MOVQ SI, 8(SP) MOVQ DX, 16(SP) - MOVQ R10, 24(SP) - CALL runtime·sighandler(SB) + CALL runtime·sigtrampgo(SB) get_tls(BX) MOVQ g(BX), BP @@ -273,10 +262,6 @@ allgood: MOVQ 160(SP), R10 MOVL R10, 0(R11) - // restore g - MOVQ 80(SP), R10 - MOVQ R10, g(BX) - exit: // restore registers MOVQ 32(SP), BX diff --git a/src/runtime/sys_x86.go b/src/runtime/sys_x86.go index 7e4e27354e..2b4ed8bdf5 100644 --- a/src/runtime/sys_x86.go +++ b/src/runtime/sys_x86.go @@ -11,7 +11,7 @@ import ( "unsafe" ) -// adjust Gobuf as it if executed a call to fn with context ctxt +// adjust Gobuf as if it executed a call to fn with context ctxt // and then did an immediate gosave. 
func gostartcall(buf *gobuf, fn, ctxt unsafe.Pointer) { sp := buf.sp diff --git a/src/runtime/syscall_windows.go b/src/runtime/syscall_windows.go index ca8ea8b04f..134d4dbd99 100644 --- a/src/runtime/syscall_windows.go +++ b/src/runtime/syscall_windows.go @@ -93,6 +93,8 @@ const _LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 //go:linkname syscall_loadsystemlibrary syscall.loadsystemlibrary //go:nosplit func syscall_loadsystemlibrary(filename *uint16) (handle, err uintptr) { + lockOSThread() + defer unlockOSThread() c := &getg().m.syscall if useLoadLibraryEx { @@ -126,6 +128,8 @@ func syscall_loadsystemlibrary(filename *uint16) (handle, err uintptr) { //go:linkname syscall_loadlibrary syscall.loadlibrary //go:nosplit func syscall_loadlibrary(filename *uint16) (handle, err uintptr) { + lockOSThread() + defer unlockOSThread() c := &getg().m.syscall c.fn = getLoadLibrary() c.n = 1 @@ -141,6 +145,8 @@ func syscall_loadlibrary(filename *uint16) (handle, err uintptr) { //go:linkname syscall_getprocaddress syscall.getprocaddress //go:nosplit func syscall_getprocaddress(handle uintptr, procname *byte) (outhandle, err uintptr) { + lockOSThread() + defer unlockOSThread() c := &getg().m.syscall c.fn = getGetProcAddress() c.n = 2 @@ -156,6 +162,8 @@ func syscall_getprocaddress(handle uintptr, procname *byte) (outhandle, err uint //go:linkname syscall_Syscall syscall.Syscall //go:nosplit func syscall_Syscall(fn, nargs, a1, a2, a3 uintptr) (r1, r2, err uintptr) { + lockOSThread() + defer unlockOSThread() c := &getg().m.syscall c.fn = fn c.n = nargs @@ -167,6 +175,8 @@ func syscall_Syscall(fn, nargs, a1, a2, a3 uintptr) (r1, r2, err uintptr) { //go:linkname syscall_Syscall6 syscall.Syscall6 //go:nosplit func syscall_Syscall6(fn, nargs, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr) { + lockOSThread() + defer unlockOSThread() c := &getg().m.syscall c.fn = fn c.n = nargs @@ -178,6 +188,8 @@ func syscall_Syscall6(fn, nargs, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err ui //go:linkname syscall_Syscall9 syscall.Syscall9 //go:nosplit func syscall_Syscall9(fn, nargs, a1, a2, a3, a4, a5, a6, a7, a8, a9 uintptr) (r1, r2, err uintptr) { + lockOSThread() + defer unlockOSThread() c := &getg().m.syscall c.fn = fn c.n = nargs @@ -189,6 +201,8 @@ func syscall_Syscall9(fn, nargs, a1, a2, a3, a4, a5, a6, a7, a8, a9 uintptr) (r1 //go:linkname syscall_Syscall12 syscall.Syscall12 //go:nosplit func syscall_Syscall12(fn, nargs, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12 uintptr) (r1, r2, err uintptr) { + lockOSThread() + defer unlockOSThread() c := &getg().m.syscall c.fn = fn c.n = nargs @@ -200,6 +214,8 @@ func syscall_Syscall12(fn, nargs, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, //go:linkname syscall_Syscall15 syscall.Syscall15 //go:nosplit func syscall_Syscall15(fn, nargs, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15 uintptr) (r1, r2, err uintptr) { + lockOSThread() + defer unlockOSThread() c := &getg().m.syscall c.fn = fn c.n = nargs @@ -207,9 +223,3 @@ func syscall_Syscall15(fn, nargs, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, cgocall(asmstdcallAddr, unsafe.Pointer(c)) return c.r1, c.r2, c.err } - -//go:linkname syscall_exit syscall.Exit -//go:nosplit -func syscall_exit(code int) { - exit(int32(code)) -} diff --git a/src/runtime/syscall_windows_test.go b/src/runtime/syscall_windows_test.go index 3da154dfa8..dfde12a211 100644 --- a/src/runtime/syscall_windows_test.go +++ b/src/runtime/syscall_windows_test.go @@ -15,6 +15,7 @@ import ( "os/exec" "path/filepath" "runtime" + "strconv" 
"strings" "syscall" "testing" @@ -537,6 +538,17 @@ func TestWERDialogue(t *testing.T) { cmd.CombinedOutput() } +func TestWindowsStackMemory(t *testing.T) { + o := runTestProg(t, "testprog", "StackMemory") + stackUsage, err := strconv.Atoi(o) + if err != nil { + t.Fatalf("Failed to read stack usage: %v", err) + } + if expected, got := 100<<10, stackUsage; got > expected { + t.Fatalf("expected < %d bytes of memory per thread, got %d", expected, got) + } +} + var used byte func use(buf []byte) { @@ -1043,7 +1055,7 @@ func BenchmarkRunningGoProgram(b *testing.B) { } exe := filepath.Join(tmpdir, "main.exe") - cmd := exec.Command("go", "build", "-o", exe, src) + cmd := exec.Command(testenv.GoToolPath(b), "build", "-o", exe, src) cmd.Dir = tmpdir out, err := cmd.CombinedOutput() if err != nil { diff --git a/src/runtime/testdata/testprog/gettid.go b/src/runtime/testdata/testprog/gettid.go new file mode 100644 index 0000000000..1b3e29ab08 --- /dev/null +++ b/src/runtime/testdata/testprog/gettid.go @@ -0,0 +1,29 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build linux + +package main + +import ( + "bytes" + "fmt" + "io/ioutil" + "os" + "syscall" +) + +func gettid() int { + return syscall.Gettid() +} + +func tidExists(tid int) (exists, supported bool) { + stat, err := ioutil.ReadFile(fmt.Sprintf("/proc/self/task/%d/stat", tid)) + if os.IsNotExist(err) { + return false, true + } + // Check if it's a zombie thread. + state := bytes.Fields(stat)[2] + return !(len(state) == 1 && state[0] == 'Z'), true +} diff --git a/src/runtime/testdata/testprog/gettid_none.go b/src/runtime/testdata/testprog/gettid_none.go new file mode 100644 index 0000000000..036db87e10 --- /dev/null +++ b/src/runtime/testdata/testprog/gettid_none.go @@ -0,0 +1,15 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !linux + +package main + +func gettid() int { + return 0 +} + +func tidExists(tid int) (exists, supported bool) { + return false, false +} diff --git a/src/runtime/testdata/testprog/lockosthread.go b/src/runtime/testdata/testprog/lockosthread.go new file mode 100644 index 0000000000..88c0d12e4c --- /dev/null +++ b/src/runtime/testdata/testprog/lockosthread.go @@ -0,0 +1,94 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "os" + "runtime" + "time" +) + +var mainTID int + +func init() { + registerInit("LockOSThreadMain", func() { + // init is guaranteed to run on the main thread. + mainTID = gettid() + }) + register("LockOSThreadMain", LockOSThreadMain) + + registerInit("LockOSThreadAlt", func() { + // Lock the OS thread now so main runs on the main thread. + runtime.LockOSThread() + }) + register("LockOSThreadAlt", LockOSThreadAlt) +} + +func LockOSThreadMain() { + // gettid only works on Linux, so on other platforms this just + // checks that the runtime doesn't do anything terrible. + + // This requires GOMAXPROCS=1 from the beginning to reliably + // start a goroutine on the main thread. + if runtime.GOMAXPROCS(-1) != 1 { + println("requires GOMAXPROCS=1") + os.Exit(1) + } + + ready := make(chan bool, 1) + go func() { + // Because GOMAXPROCS=1, this *should* be on the main + // thread. Stay there. 
+ runtime.LockOSThread() + if mainTID != 0 && gettid() != mainTID { + println("failed to start goroutine on main thread") + os.Exit(1) + } + // Exit with the thread locked, which should exit the + // main thread. + ready <- true + }() + <-ready + time.Sleep(1 * time.Millisecond) + // Check that this goroutine is still running on a different + // thread. + if mainTID != 0 && gettid() == mainTID { + println("goroutine migrated to locked thread") + os.Exit(1) + } + println("OK") +} + +func LockOSThreadAlt() { + // This is running locked to the main OS thread. + + var subTID int + ready := make(chan bool, 1) + go func() { + // This goroutine must be running on a new thread. + runtime.LockOSThread() + subTID = gettid() + ready <- true + // Exit with the thread locked. + }() + <-ready + runtime.UnlockOSThread() + for i := 0; i < 100; i++ { + time.Sleep(1 * time.Millisecond) + // Check that this goroutine is running on a different thread. + if subTID != 0 && gettid() == subTID { + println("locked thread reused") + os.Exit(1) + } + exists, supported := tidExists(subTID) + if !supported || !exists { + goto ok + } + } + println("sub thread", subTID, "still running") + return +ok: + println("OK") +} diff --git a/src/runtime/testdata/testprog/syscall_windows.go b/src/runtime/testdata/testprog/syscall_windows.go index 6e6782e987..b4b66441b8 100644 --- a/src/runtime/testdata/testprog/syscall_windows.go +++ b/src/runtime/testdata/testprog/syscall_windows.go @@ -4,11 +4,18 @@ package main -import "syscall" +import ( + "internal/syscall/windows" + "runtime" + "sync" + "syscall" + "unsafe" +) func init() { register("RaiseException", RaiseException) register("ZeroDivisionException", ZeroDivisionException) + register("StackMemory", StackMemory) } func RaiseException() { @@ -25,3 +32,39 @@ func ZeroDivisionException() { z := x / y println(z) } + +func getPagefileUsage() (uintptr, error) { + p, err := syscall.GetCurrentProcess() + if err != nil { + return 0, err + } + var m windows.PROCESS_MEMORY_COUNTERS + err = windows.GetProcessMemoryInfo(p, &m, uint32(unsafe.Sizeof(m))) + if err != nil { + return 0, err + } + return m.PagefileUsage, nil +} + +func StackMemory() { + mem1, err := getPagefileUsage() + if err != nil { + panic(err) + } + const threadCount = 100 + var wg sync.WaitGroup + for i := 0; i < threadCount; i++ { + wg.Add(1) + go func() { + runtime.LockOSThread() + wg.Done() + select {} + }() + } + wg.Wait() + mem2, err := getPagefileUsage() + if err != nil { + panic(err) + } + print((mem2 - mem1) / threadCount) +} diff --git a/src/runtime/testdata/testprogcgo/callback.go b/src/runtime/testdata/testprogcgo/callback.go index 7d9d68ddd1..be0409f39d 100644 --- a/src/runtime/testdata/testprogcgo/callback.go +++ b/src/runtime/testdata/testprogcgo/callback.go @@ -29,6 +29,7 @@ import "C" import ( "fmt" + "os" "runtime" ) @@ -63,7 +64,10 @@ func grow1(x, sum *int) int { } func CgoCallbackGC() { - const P = 100 + P := 100 + if os.Getenv("RUNTIME_TESTING_SHORT") != "" { + P = 10 + } done := make(chan bool) // allocate a bunch of stack frames and spray them with pointers for i := 0; i < P; i++ { diff --git a/src/runtime/testdata/testprogcgo/catchpanic.go b/src/runtime/testdata/testprogcgo/catchpanic.go new file mode 100644 index 0000000000..55a606d1bc --- /dev/null +++ b/src/runtime/testdata/testprogcgo/catchpanic.go @@ -0,0 +1,46 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build !plan9,!windows + +package main + +/* +#include <signal.h> +#include <stdlib.h> +#include <string.h> + +static void abrthandler(int signum) { + if (signum == SIGABRT) { + exit(0); // success + } +} + +void registerAbortHandler() { + struct sigaction act; + memset(&act, 0, sizeof act); + act.sa_handler = abrthandler; + sigaction(SIGABRT, &act, NULL); +} + +static void __attribute__ ((constructor)) sigsetup(void) { + if (getenv("CGOCATCHPANIC_EARLY_HANDLER") == NULL) + return; + registerAbortHandler(); +} +*/ +import "C" +import "os" + +func init() { + register("CgoCatchPanic", CgoCatchPanic) +} + +// Test that the SIGABRT raised by panic can be caught by an early signal handler. +func CgoCatchPanic() { + if _, ok := os.LookupEnv("CGOCATCHPANIC_EARLY_HANDLER"); !ok { + C.registerAbortHandler() + } + panic("catch me") +} diff --git a/src/runtime/testdata/testprogcgo/cgo.go b/src/runtime/testdata/testprogcgo/cgo.go index 209524a24d..a587db385b 100644 --- a/src/runtime/testdata/testprogcgo/cgo.go +++ b/src/runtime/testdata/testprogcgo/cgo.go @@ -52,7 +52,11 @@ func CgoSignalDeadlock() { time.Sleep(time.Millisecond) start := time.Now() var times []time.Duration - for i := 0; i < 64; i++ { + n := 64 + if os.Getenv("RUNTIME_TEST_SHORT") != "" { + n = 16 + } + for i := 0; i < n; i++ { go func() { runtime.LockOSThread() select {} diff --git a/src/runtime/testdata/testprogcgo/lockosthread.c b/src/runtime/testdata/testprogcgo/lockosthread.c new file mode 100644 index 0000000000..b10cc4f3b9 --- /dev/null +++ b/src/runtime/testdata/testprogcgo/lockosthread.c @@ -0,0 +1,13 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !plan9,!windows + +#include <stdint.h> + +uint32_t threadExited; + +void setExited(void *x) { + __sync_fetch_and_add(&threadExited, 1); +} diff --git a/src/runtime/testdata/testprogcgo/lockosthread.go b/src/runtime/testdata/testprogcgo/lockosthread.go new file mode 100644 index 0000000000..36423d9eb0 --- /dev/null +++ b/src/runtime/testdata/testprogcgo/lockosthread.go @@ -0,0 +1,111 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !plan9,!windows + +package main + +import ( + "os" + "runtime" + "sync/atomic" + "time" + "unsafe" +) + +/* +#include <pthread.h> +#include <stdint.h> + +extern uint32_t threadExited; + +void setExited(void *x); +*/ +import "C" + +var mainThread C.pthread_t + +func init() { + registerInit("LockOSThreadMain", func() { + // init is guaranteed to run on the main thread. + mainThread = C.pthread_self() + }) + register("LockOSThreadMain", LockOSThreadMain) + + registerInit("LockOSThreadAlt", func() { + // Lock the OS thread now so main runs on the main thread. + runtime.LockOSThread() + }) + register("LockOSThreadAlt", LockOSThreadAlt) +} + +func LockOSThreadMain() { + // This requires GOMAXPROCS=1 from the beginning to reliably + // start a goroutine on the main thread. + if runtime.GOMAXPROCS(-1) != 1 { + println("requires GOMAXPROCS=1") + os.Exit(1) + } + + ready := make(chan bool, 1) + go func() { + // Because GOMAXPROCS=1, this *should* be on the main + // thread. Stay there. 
+ runtime.LockOSThread() + self := C.pthread_self() + if C.pthread_equal(mainThread, self) == 0 { + println("failed to start goroutine on main thread") + os.Exit(1) + } + // Exit with the thread locked, which should exit the + // main thread. + ready <- true + }() + <-ready + time.Sleep(1 * time.Millisecond) + // Check that this goroutine is still running on a different + // thread. + self := C.pthread_self() + if C.pthread_equal(mainThread, self) != 0 { + println("goroutine migrated to locked thread") + os.Exit(1) + } + println("OK") +} + +func LockOSThreadAlt() { + // This is running locked to the main OS thread. + + var subThread C.pthread_t + ready := make(chan bool, 1) + C.threadExited = 0 + go func() { + // This goroutine must be running on a new thread. + runtime.LockOSThread() + subThread = C.pthread_self() + // Register a pthread destructor so we can tell this + // thread has exited. + var key C.pthread_key_t + C.pthread_key_create(&key, (*[0]byte)(unsafe.Pointer(C.setExited))) + C.pthread_setspecific(key, unsafe.Pointer(new(int))) + ready <- true + // Exit with the thread locked. + }() + <-ready + for i := 0; i < 100; i++ { + time.Sleep(1 * time.Millisecond) + // Check that this goroutine is running on a different thread. + self := C.pthread_self() + if C.pthread_equal(subThread, self) != 0 { + println("locked thread reused") + os.Exit(1) + } + if atomic.LoadUint32((*uint32)(&C.threadExited)) != 0 { + println("OK") + return + } + } + println("sub thread still running") + os.Exit(1) +} diff --git a/src/runtime/testdata/testprogcgo/sigstack.go b/src/runtime/testdata/testprogcgo/sigstack.go new file mode 100644 index 0000000000..e30a5592dc --- /dev/null +++ b/src/runtime/testdata/testprogcgo/sigstack.go @@ -0,0 +1,95 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !plan9,!windows + +// Test handling of Go-allocated signal stacks when calling from +// C-created threads with and without signal stacks. (See issue +// #22930.) + +package main + +/* +#include <pthread.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/mman.h> + +#ifndef MAP_STACK +#define MAP_STACK 0 +#endif + +extern void SigStackCallback(); + +static void* WithSigStack(void* arg __attribute__((unused))) { + // Set up an alternate system stack. + void* base = mmap(0, SIGSTKSZ, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON|MAP_STACK, -1, 0); + if (base == MAP_FAILED) { + perror("mmap failed"); + abort(); + } + stack_t st = {}, ost = {}; + st.ss_sp = (char*)base; + st.ss_flags = 0; + st.ss_size = SIGSTKSZ; + if (sigaltstack(&st, &ost) < 0) { + perror("sigaltstack failed"); + abort(); + } + + // Call Go. + SigStackCallback(); + + // Disable signal stack and protect it so we can detect reuse. + if (ost.ss_flags & SS_DISABLE) { + // Darwin libsystem has a bug where it checks ss_size + // even if SS_DISABLE is set. (The kernel gets it right.) 
+ ost.ss_size = SIGSTKSZ; + } + if (sigaltstack(&ost, NULL) < 0) { + perror("sigaltstack restore failed"); + abort(); + } + mprotect(base, SIGSTKSZ, PROT_NONE); + return NULL; +} + +static void* WithoutSigStack(void* arg __attribute__((unused))) { + SigStackCallback(); + return NULL; +} + +static void DoThread(int sigstack) { + pthread_t tid; + if (sigstack) { + pthread_create(&tid, NULL, WithSigStack, NULL); + } else { + pthread_create(&tid, NULL, WithoutSigStack, NULL); + } + pthread_join(tid, NULL); +} +*/ +import "C" + +func init() { + register("SigStack", SigStack) +} + +func SigStack() { + C.DoThread(0) + C.DoThread(1) + C.DoThread(0) + C.DoThread(1) + println("OK") +} + +var BadPtr *int + +//export SigStackCallback +func SigStackCallback() { + // Cause the Go signal handler to run. + defer func() { recover() }() + *BadPtr = 42 +} diff --git a/src/runtime/testdata/testprogcgo/stack_windows.go b/src/runtime/testdata/testprogcgo/stack_windows.go new file mode 100644 index 0000000000..846297a960 --- /dev/null +++ b/src/runtime/testdata/testprogcgo/stack_windows.go @@ -0,0 +1,54 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import "C" +import ( + "internal/syscall/windows" + "runtime" + "sync" + "syscall" + "unsafe" +) + +func init() { + register("StackMemory", StackMemory) +} + +func getPagefileUsage() (uintptr, error) { + p, err := syscall.GetCurrentProcess() + if err != nil { + return 0, err + } + var m windows.PROCESS_MEMORY_COUNTERS + err = windows.GetProcessMemoryInfo(p, &m, uint32(unsafe.Sizeof(m))) + if err != nil { + return 0, err + } + return m.PagefileUsage, nil +} + +func StackMemory() { + mem1, err := getPagefileUsage() + if err != nil { + panic(err) + } + const threadCount = 100 + var wg sync.WaitGroup + for i := 0; i < threadCount; i++ { + wg.Add(1) + go func() { + runtime.LockOSThread() + wg.Done() + select {} + }() + } + wg.Wait() + mem2, err := getPagefileUsage() + if err != nil { + panic(err) + } + print((mem2 - mem1) / threadCount) +} diff --git a/src/runtime/time.go b/src/runtime/time.go index 23f61d62d0..6c349c8461 100644 --- a/src/runtime/time.go +++ b/src/runtime/time.go @@ -6,14 +6,18 @@ package runtime -import "unsafe" +import ( + "runtime/internal/sys" + "unsafe" +) // Package time knows the layout of this structure. // If this struct changes, adjust ../time/sleep.go:/runtimeTimer. // For GOOS=nacl, package syscall knows the layout of this structure. // If this struct changes, adjust ../syscall/net_nacl.go:/runtimeTimer. type timer struct { - i int // heap index + tb *timersBucket // the bucket the timer lives in + i int // heap index // Timer wakes up at when, and then at when+period, ... (period > 0 only) // each time calling f(arg, now) in the timer goroutine, so f must be @@ -25,7 +29,37 @@ type timer struct { seq uintptr } -var timers struct { +// timersLen is the length of timers array. +// +// Ideally, this would be set to GOMAXPROCS, but that would require +// dynamic reallocation +// +// The current value is a compromise between memory usage and performance +// that should cover the majority of GOMAXPROCS values used in the wild. +const timersLen = 64 + +// timers contains "per-P" timer heaps. +// +// Timers are queued into timersBucket associated with the current P, +// so each P may work with its own timers independently of other P instances. 
+// +// Each timersBucket may be associated with multiple P +// if GOMAXPROCS > timersLen. +var timers [timersLen]struct { + timersBucket + + // The padding should eliminate false sharing + // between timersBucket values. + pad [sys.CacheLineSize - unsafe.Sizeof(timersBucket{})%sys.CacheLineSize]byte +} + +func (t *timer) assignBucket() *timersBucket { + id := uint8(getg().m.p.ptr().id) % timersLen + t.tb = &timers[id].timersBucket + return t.tb +} + +type timersBucket struct { lock mutex gp *g created bool @@ -51,18 +85,20 @@ func timeSleep(ns int64) { return } - t := getg().timer + gp := getg() + t := gp.timer if t == nil { t = new(timer) - getg().timer = t + gp.timer = t } *t = timer{} t.when = nanotime() + ns t.f = goroutineReady - t.arg = getg() - lock(&timers.lock) - addtimerLocked(t) - goparkunlock(&timers.lock, "sleep", traceEvGoSleep, 2) + t.arg = gp + tb := t.assignBucket() + lock(&tb.lock) + tb.addtimerLocked(t) + goparkunlock(&tb.lock, "sleep", traceEvGoSleep, 2) } // startTimer adds t to the timer heap. @@ -89,87 +125,95 @@ func goroutineReady(arg interface{}, seq uintptr) { } func addtimer(t *timer) { - lock(&timers.lock) - addtimerLocked(t) - unlock(&timers.lock) + tb := t.assignBucket() + lock(&tb.lock) + tb.addtimerLocked(t) + unlock(&tb.lock) } // Add a timer to the heap and start or kick timerproc if the new timer is // earlier than any of the others. // Timers are locked. -func addtimerLocked(t *timer) { +func (tb *timersBucket) addtimerLocked(t *timer) { // when must never be negative; otherwise timerproc will overflow // during its delta calculation and never expire other runtime timers. if t.when < 0 { t.when = 1<<63 - 1 } - t.i = len(timers.t) - timers.t = append(timers.t, t) - siftupTimer(t.i) + t.i = len(tb.t) + tb.t = append(tb.t, t) + siftupTimer(tb.t, t.i) if t.i == 0 { // siftup moved to top: new earliest deadline. - if timers.sleeping { - timers.sleeping = false - notewakeup(&timers.waitnote) + if tb.sleeping { + tb.sleeping = false + notewakeup(&tb.waitnote) } - if timers.rescheduling { - timers.rescheduling = false - goready(timers.gp, 0) + if tb.rescheduling { + tb.rescheduling = false + goready(tb.gp, 0) } } - if !timers.created { - timers.created = true - go timerproc() + if !tb.created { + tb.created = true + go timerproc(tb) } } // Delete timer t from the heap. // Do not need to update the timerproc: if it wakes up early, no big deal. func deltimer(t *timer) bool { - // Dereference t so that any panic happens before the lock is held. - // Discard result, because t might be moving in the heap. - _ = t.i + if t.tb == nil { + // t.tb can be nil if the user created a timer + // directly, without invoking startTimer e.g + // time.Ticker{C: c} + // In this case, return early without any deletion. + // See Issue 21874. + return false + } - lock(&timers.lock) + tb := t.tb + + lock(&tb.lock) // t may not be registered anymore and may have // a bogus i (typically 0, if generated by Go). // Verify it before proceeding. 
i := t.i - last := len(timers.t) - 1 - if i < 0 || i > last || timers.t[i] != t { - unlock(&timers.lock) + last := len(tb.t) - 1 + if i < 0 || i > last || tb.t[i] != t { + unlock(&tb.lock) return false } if i != last { - timers.t[i] = timers.t[last] - timers.t[i].i = i + tb.t[i] = tb.t[last] + tb.t[i].i = i } - timers.t[last] = nil - timers.t = timers.t[:last] + tb.t[last] = nil + tb.t = tb.t[:last] if i != last { - siftupTimer(i) - siftdownTimer(i) + siftupTimer(tb.t, i) + siftdownTimer(tb.t, i) } - unlock(&timers.lock) + unlock(&tb.lock) return true } // Timerproc runs the time-driven events. -// It sleeps until the next event in the timers heap. +// It sleeps until the next event in the tb heap. // If addtimer inserts a new earlier event, it wakes timerproc early. -func timerproc() { - timers.gp = getg() +func timerproc(tb *timersBucket) { + tb.gp = getg() for { - lock(&timers.lock) - timers.sleeping = false + lock(&tb.lock) + tb.sleeping = false now := nanotime() delta := int64(-1) for { - if len(timers.t) == 0 { + if len(tb.t) == 0 { delta = -1 break } - t := timers.t[0] + t := tb.t[0] delta = t.when - now if delta > 0 { break @@ -177,43 +221,43 @@ func timerproc() { if t.period > 0 { // leave in heap but adjust next time to fire t.when += t.period * (1 + -delta/t.period) - siftdownTimer(0) + siftdownTimer(tb.t, 0) } else { // remove from heap - last := len(timers.t) - 1 + last := len(tb.t) - 1 if last > 0 { - timers.t[0] = timers.t[last] - timers.t[0].i = 0 + tb.t[0] = tb.t[last] + tb.t[0].i = 0 } - timers.t[last] = nil - timers.t = timers.t[:last] + tb.t[last] = nil + tb.t = tb.t[:last] if last > 0 { - siftdownTimer(0) + siftdownTimer(tb.t, 0) } t.i = -1 // mark as removed } f := t.f arg := t.arg seq := t.seq - unlock(&timers.lock) + unlock(&tb.lock) if raceenabled { raceacquire(unsafe.Pointer(t)) } f(arg, seq) - lock(&timers.lock) + lock(&tb.lock) } if delta < 0 || faketime > 0 { // No timers left - put goroutine to sleep. - timers.rescheduling = true - goparkunlock(&timers.lock, "timer goroutine (idle)", traceEvGoBlock, 1) + tb.rescheduling = true + goparkunlock(&tb.lock, "timer goroutine (idle)", traceEvGoBlock, 1) continue } // At least one timer pending. Sleep until then. - timers.sleeping = true - timers.sleepUntil = now + delta - noteclear(&timers.waitnote) - unlock(&timers.lock) - notetsleepg(&timers.waitnote, delta) + tb.sleeping = true + tb.sleepUntil = now + delta + noteclear(&tb.waitnote) + unlock(&tb.lock) + notetsleepg(&tb.waitnote, delta) } } @@ -222,28 +266,67 @@ func timejump() *g { return nil } - lock(&timers.lock) - if !timers.created || len(timers.t) == 0 { - unlock(&timers.lock) + for i := range timers { + lock(&timers[i].lock) + } + gp := timejumpLocked() + for i := range timers { + unlock(&timers[i].lock) + } + + return gp +} + +func timejumpLocked() *g { + // Determine a timer bucket with minimum when. + var minT *timer + for i := range timers { + tb := &timers[i] + if !tb.created || len(tb.t) == 0 { + continue + } + t := tb.t[0] + if minT == nil || t.when < minT.when { + minT = t + } + } + if minT == nil || minT.when <= faketime { + return nil + } + + faketime = minT.when + tb := minT.tb + if !tb.rescheduling { return nil } + tb.rescheduling = false + return tb.gp +} + +func timeSleepUntil() int64 { + next := int64(1<<63 - 1) - var gp *g - if faketime < timers.t[0].when { - faketime = timers.t[0].when - if timers.rescheduling { - timers.rescheduling = false - gp = timers.gp + // Determine minimum sleepUntil across all the timer buckets. 
+ // + // The function can not return a precise answer, + // as another timer may pop in as soon as timers have been unlocked. + // So lock the timers one by one instead of all at once. + for i := range timers { + tb := &timers[i] + + lock(&tb.lock) + if tb.sleeping && tb.sleepUntil < next { + next = tb.sleepUntil } + unlock(&tb.lock) } - unlock(&timers.lock) - return gp + + return next } // Heap maintenance algorithms. -func siftupTimer(i int) { - t := timers.t +func siftupTimer(t []*timer, i int) { when := t[i].when tmp := t[i] for i > 0 { @@ -253,14 +336,15 @@ func siftupTimer(i int) { } t[i] = t[p] t[i].i = i - t[p] = tmp - t[p].i = p i = p } + if tmp != t[i] { + t[i] = tmp + t[i].i = i + } } -func siftdownTimer(i int) { - t := timers.t +func siftdownTimer(t []*timer, i int) { n := len(t) when := t[i].when tmp := t[i] @@ -291,10 +375,12 @@ func siftdownTimer(i int) { } t[i] = t[c] t[i].i = i - t[c] = tmp - t[c].i = c i = c } + if tmp != t[i] { + t[i] = tmp + t[i].i = i + } } // Entry points for net, time to call nanotime. diff --git a/src/runtime/trace.go b/src/runtime/trace.go index 826dc9a999..fab797601b 100644 --- a/src/runtime/trace.go +++ b/src/runtime/trace.go @@ -28,8 +28,8 @@ const ( traceEvProcStop = 6 // stop of P [timestamp] traceEvGCStart = 7 // GC start [timestamp, seq, stack id] traceEvGCDone = 8 // GC done [timestamp] - traceEvGCScanStart = 9 // GC mark termination start [timestamp] - traceEvGCScanDone = 10 // GC mark termination done [timestamp] + traceEvGCSTWStart = 9 // GC STW start [timestamp, kind] + traceEvGCSTWDone = 10 // GC STW done [timestamp] traceEvGCSweepStart = 11 // GC sweep start [timestamp, stack id] traceEvGCSweepDone = 12 // GC sweep done [timestamp, swept, reclaimed] traceEvGoCreate = 13 // goroutine creation [timestamp, new goroutine id, new stack id, stack id] @@ -235,21 +235,21 @@ func StartTrace() error { trace.timeStart = nanotime() trace.headerWritten = false trace.footerWritten = false - trace.strings = make(map[string]uint64) + + // string to id mapping + // 0 : reserved for an empty string + // remaining: other strings registered by traceString trace.stringSeq = 0 + trace.strings = make(map[string]uint64) + trace.seqGC = 0 _g_.m.startingtrace = false trace.enabled = true // Register runtime goroutine labels. _, pid, bufp := traceAcquireBuffer() - buf := (*bufp).ptr() - if buf == nil { - buf = traceFlush(0).ptr() - (*bufp).set(buf) - } for i, label := range gcMarkWorkerModeStrings[:] { - trace.markWorkerLabels[i], buf = traceString(buf, label) + trace.markWorkerLabels[i], bufp = traceString(bufp, pid, label) } traceReleaseBuffer(pid) @@ -277,10 +277,9 @@ func StopTrace() { traceGoSched() - for _, p := range &allp { - if p == nil { - break - } + // Loop over all allocated Ps because dead Ps may still have + // trace buffers. + for _, p := range allp[:cap(allp)] { buf := p.tracebuf if buf != 0 { traceFullQueue(buf) @@ -320,10 +319,7 @@ func StopTrace() { // The lock protects us from races with StartTrace/StopTrace because they do stop-the-world. lock(&trace.lock) - for _, p := range &allp { - if p == nil { - break - } + for _, p := range allp[:cap(allp)] { if p.tracebuf != 0 { throw("trace: non-empty trace buffer in proc") } @@ -382,7 +378,7 @@ func ReadTrace() []byte { trace.headerWritten = true trace.lockOwner = nil unlock(&trace.lock) - return []byte("go 1.9 trace\x00\x00\x00\x00") + return []byte("go 1.10 trace\x00\x00\x00") } // Wait for new data. 
if trace.fullHead == 0 && !trace.shutdown { @@ -408,9 +404,12 @@ func ReadTrace() []byte { var data []byte data = append(data, traceEvFrequency|0<<traceArgCountShift) data = traceAppend(data, uint64(freq)) - if timers.gp != nil { - data = append(data, traceEvTimerGoroutine|0<<traceArgCountShift) - data = traceAppend(data, uint64(timers.gp.goid)) + for i := range timers { + tb := &timers[i] + if tb.gp != nil { + data = append(data, traceEvTimerGoroutine|0<<traceArgCountShift) + data = traceAppend(data, uint64(tb.gp.goid)) + } } // This will emit a bunch of full buffers, we will pick them up // on the next iteration. @@ -514,18 +513,12 @@ func traceEvent(ev byte, skip int, args ...uint64) { buf := (*bufp).ptr() const maxSize = 2 + 5*traceBytesPerNumber // event type, length, sequence, timestamp, stack id and two add params if buf == nil || len(buf.arr)-buf.pos < maxSize { - buf = traceFlush(traceBufPtrOf(buf)).ptr() + buf = traceFlush(traceBufPtrOf(buf), pid).ptr() (*bufp).set(buf) } ticks := uint64(cputicks()) / traceTickDiv tickDiff := ticks - buf.lastTicks - if buf.pos == 0 { - buf.byte(traceEvBatch | 1<<traceArgCountShift) - buf.varint(uint64(pid)) - buf.varint(ticks) - tickDiff = 0 - } buf.lastTicks = ticks narg := byte(len(args)) if skip >= 0 { @@ -603,7 +596,7 @@ func traceReleaseBuffer(pid int32) { } // traceFlush puts buf onto stack of full buffers and returns an empty buffer. -func traceFlush(buf traceBufPtr) traceBufPtr { +func traceFlush(buf traceBufPtr, pid int32) traceBufPtr { owner := trace.lockOwner dolock := owner == nil || owner != getg().m.curg if dolock { @@ -624,34 +617,51 @@ func traceFlush(buf traceBufPtr) traceBufPtr { bufp := buf.ptr() bufp.link.set(nil) bufp.pos = 0 - bufp.lastTicks = 0 + + // initialize the buffer for a new batch + ticks := uint64(cputicks()) / traceTickDiv + bufp.lastTicks = ticks + bufp.byte(traceEvBatch | 1<<traceArgCountShift) + bufp.varint(uint64(pid)) + bufp.varint(ticks) + if dolock { unlock(&trace.lock) } return buf } -func traceString(buf *traceBuf, s string) (uint64, *traceBuf) { +// traceString adds a string to the trace.strings and returns the id. +func traceString(bufp *traceBufPtr, pid int32, s string) (uint64, *traceBufPtr) { if s == "" { - return 0, buf + return 0, bufp } if id, ok := trace.strings[s]; ok { - return id, buf + return id, bufp } trace.stringSeq++ id := trace.stringSeq trace.strings[s] = id + // memory allocation in above may trigger tracing and + // cause *bufp changes. Following code now works with *bufp, + // so there must be no memory allocation or any activities + // that causes tracing after this point. + + buf := (*bufp).ptr() size := 1 + 2*traceBytesPerNumber + len(s) - if len(buf.arr)-buf.pos < size { - buf = traceFlush(traceBufPtrOf(buf)).ptr() + if buf == nil || len(buf.arr)-buf.pos < size { + buf = traceFlush(traceBufPtrOf(buf), pid).ptr() + (*bufp).set(buf) } buf.byte(traceEvString) buf.varint(id) buf.varint(uint64(len(s))) buf.pos += copy(buf.arr[buf.pos:], s) - return id, buf + + (*bufp).set(buf) + return id, bufp } // traceAppend appends v to buf in little-endian-base-128 encoding. @@ -781,7 +791,7 @@ func allFrames(pcs []uintptr) []Frame { // releases all memory and resets state. 
func (tab *traceStackTable) dump() { var tmp [(2 + 4*traceStackSize) * traceBytesPerNumber]byte - buf := traceFlush(0).ptr() + bufp := traceFlush(0, 0) for _, stk := range tab.tab { stk := stk.ptr() for ; stk != nil; stk = stk.link.ptr() { @@ -791,7 +801,7 @@ func (tab *traceStackTable) dump() { tmpbuf = traceAppend(tmpbuf, uint64(len(frames))) for _, f := range frames { var frame traceFrame - frame, buf = traceFrameForPC(buf, f) + frame, bufp = traceFrameForPC(bufp, 0, f) tmpbuf = traceAppend(tmpbuf, uint64(f.PC)) tmpbuf = traceAppend(tmpbuf, uint64(frame.funcID)) tmpbuf = traceAppend(tmpbuf, uint64(frame.fileID)) @@ -799,9 +809,10 @@ func (tab *traceStackTable) dump() { } // Now copy to the buffer. size := 1 + traceBytesPerNumber + len(tmpbuf) - if len(buf.arr)-buf.pos < size { - buf = traceFlush(traceBufPtrOf(buf)).ptr() + if buf := bufp.ptr(); len(buf.arr)-buf.pos < size { + bufp = traceFlush(bufp, 0) } + buf := bufp.ptr() buf.byte(traceEvStack | 3<<traceArgCountShift) buf.varint(uint64(len(tmpbuf))) buf.pos += copy(buf.arr[buf.pos:], tmpbuf) @@ -809,7 +820,7 @@ func (tab *traceStackTable) dump() { } lock(&trace.lock) - traceFullQueue(traceBufPtrOf(buf)) + traceFullQueue(bufp) unlock(&trace.lock) tab.mem.drop() @@ -822,7 +833,10 @@ type traceFrame struct { line uint64 } -func traceFrameForPC(buf *traceBuf, f Frame) (traceFrame, *traceBuf) { +// traceFrameForPC records the frame information. +// It may allocate memory. +func traceFrameForPC(buf traceBufPtr, pid int32, f Frame) (traceFrame, traceBufPtr) { + bufp := &buf var frame traceFrame fn := f.Function @@ -830,14 +844,14 @@ func traceFrameForPC(buf *traceBuf, f Frame) (traceFrame, *traceBuf) { if len(fn) > maxLen { fn = fn[len(fn)-maxLen:] } - frame.funcID, buf = traceString(buf, fn) + frame.funcID, bufp = traceString(bufp, pid, fn) frame.line = uint64(f.Line) file := f.File if len(file) > maxLen { file = file[len(file)-maxLen:] } - frame.fileID, buf = traceString(buf, file) - return frame, buf + frame.fileID, bufp = traceString(bufp, pid, file) + return frame, (*bufp) } // traceAlloc is a non-thread-safe region allocator. @@ -924,12 +938,12 @@ func traceGCDone() { traceEvent(traceEvGCDone, -1) } -func traceGCScanStart() { - traceEvent(traceEvGCScanStart, -1) +func traceGCSTWStart(kind int) { + traceEvent(traceEvGCSTWStart, -1, uint64(kind)) } -func traceGCScanDone() { - traceEvent(traceEvGCScanDone, -1) +func traceGCSTWDone() { + traceEvent(traceEvGCSTWDone, -1) } // traceGCSweepStart prepares to trace a sweep loop. This does not diff --git a/src/runtime/trace/example_test.go b/src/runtime/trace/example_test.go new file mode 100644 index 0000000000..ba96a829a3 --- /dev/null +++ b/src/runtime/trace/example_test.go @@ -0,0 +1,39 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package trace_test + +import ( + "fmt" + "log" + "os" + "runtime/trace" +) + +// Example demonstrates the use of the trace package to trace +// the execution of a Go program. 
The trace output will be +// written to the file trace.out +func Example() { + f, err := os.Create("trace.out") + if err != nil { + log.Fatalf("failed to create trace output file: %v", err) + } + defer func() { + if err := f.Close(); err != nil { + log.Fatalf("failed to close trace file: %v", err) + } + }() + + if err := trace.Start(f); err != nil { + log.Fatalf("failed to start trace: %v", err) + } + defer trace.Stop() + + // your program here + RunMyProgram() +} + +func RunMyProgram() { + fmt.Printf("this function will be traced") +} diff --git a/src/runtime/trace/trace.go b/src/runtime/trace/trace.go index 7cbb8a6e82..439f998c03 100644 --- a/src/runtime/trace/trace.go +++ b/src/runtime/trace/trace.go @@ -2,13 +2,36 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// Go execution tracer. -// The tracer captures a wide range of execution events like goroutine -// creation/blocking/unblocking, syscall enter/exit/block, GC-related events, -// changes of heap size, processor start/stop, etc and writes them to an io.Writer -// in a compact form. A precise nanosecond-precision timestamp and a stack -// trace is captured for most events. A trace can be analyzed later with -// 'go tool trace' command. +// Package trace contains facilities for programs to generate trace +// for Go execution tracer. +// +// The execution trace captures a wide range of execution events such as +// goroutine creation/blocking/unblocking, syscall enter/exit/block, +// GC-related events, changes of heap size, processor start/stop, etc. +// A precise nanosecond-precision timestamp and a stack trace is +// captured for most events. The generated trace can be interpreted +// using `go tool trace`. +// +// Tracing a Go program +// +// Support for tracing tests and benchmarks built with the standard +// testing package is built into `go test`. For example, the following +// command runs the test in the current directory and writes the trace +// file (trace.out). +// +// go test -trace=test.out +// +// This runtime/trace package provides APIs to add equivalent tracing +// support to a standalone program. See the Example that demonstrates +// how to use this API to enable tracing. +// +// There is also a standard HTTP interface to profiling data. Adding the +// following line will install handlers under the /debug/pprof/trace URL +// to download live profiles: +// +// import _ "net/http/pprof" +// +// See the net/http/pprof package for more details. package trace import ( diff --git a/src/runtime/trace/trace_test.go b/src/runtime/trace/trace_test.go index c5f64fcf4c..5fa5b82f8e 100644 --- a/src/runtime/trace/trace_test.go +++ b/src/runtime/trace/trace_test.go @@ -7,6 +7,7 @@ package trace_test import ( "bytes" "flag" + "internal/race" "internal/trace" "io" "io/ioutil" @@ -14,6 +15,7 @@ import ( "os" "runtime" . "runtime/trace" + "strconv" "sync" "testing" "time" @@ -23,6 +25,61 @@ var ( saveTraces = flag.Bool("savetraces", false, "save traces collected by tests") ) +// TestEventBatch tests Flush calls that happen during Start +// don't produce corrupted traces. +func TestEventBatch(t *testing.T) { + if race.Enabled { + t.Skip("skipping in race mode") + } + if testing.Short() { + t.Skip("skipping in short mode") + } + // During Start, bunch of records are written to reflect the current + // snapshot of the program, including state of each goroutines. + // And some string constants are written to the trace to aid trace + // parsing. 
This test checks Flush of the buffer occurred during + // this process doesn't cause corrupted traces. + // When a Flush is called during Start is complicated + // so we test with a range of number of goroutines hoping that one + // of them triggers Flush. + // This range was chosen to fill up a ~64KB buffer with traceEvGoCreate + // and traceEvGoWaiting events (12~13bytes per goroutine). + for g := 4950; g < 5050; g++ { + n := g + t.Run("G="+strconv.Itoa(n), func(t *testing.T) { + var wg sync.WaitGroup + wg.Add(n) + + in := make(chan bool, 1000) + for i := 0; i < n; i++ { + go func() { + <-in + wg.Done() + }() + } + buf := new(bytes.Buffer) + if err := Start(buf); err != nil { + t.Fatalf("failed to start tracing: %v", err) + } + + for i := 0; i < n; i++ { + in <- true + } + wg.Wait() + Stop() + + _, err := trace.Parse(buf, "") + if err == trace.ErrTimeOrder { + t.Skipf("skipping trace: %v", err) + } + + if err != nil { + t.Fatalf("failed to parse trace: %v", err) + } + }) + } +} + func TestTraceStartStop(t *testing.T) { buf := new(bytes.Buffer) if err := Start(buf); err != nil { diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go index c74d438757..501ecb0411 100644 --- a/src/runtime/traceback.go +++ b/src/runtime/traceback.go @@ -184,6 +184,7 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in cgoCtxt := gp.cgoCtxt printing := pcbuf == nil && callback == nil _defer := gp._defer + elideWrapper := false for _defer != nil && _defer.sp == _NoArgs { _defer = _defer.link @@ -386,8 +387,15 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in } if printing { - // assume skip=0 for printing - if (flags&_TraceRuntimeFrames) != 0 || showframe(f, gp, nprint == 0) { + // assume skip=0 for printing. + // + // Never elide wrappers if we haven't printed + // any frames. And don't elide wrappers that + // called panic rather than the wrapped + // function. Otherwise, leave them out. + name := funcname(f) + nextElideWrapper := elideWrapperCalling(name) + if (flags&_TraceRuntimeFrames) != 0 || showframe(f, gp, nprint == 0, elideWrapper && nprint != 0) { // Print during crash. // main(0x1, 0x2, 0x3) // /home/rsc/go/src/runtime/x.go:23 +0xf @@ -411,7 +419,6 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in ix = inltree[ix].parent } } - name := funcname(f) if name == "runtime.gopanic" { name = "panic" } @@ -438,6 +445,7 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in print("\n") nprint++ } + elideWrapper = nextElideWrapper } n++ @@ -647,7 +655,7 @@ func printcreatedby(gp *g) { // Show what created goroutine, except main goroutine (goid 1). pc := gp.gopc f := findfunc(pc) - if f.valid() && showframe(f, gp, false) && gp.goid != 1 { + if f.valid() && showframe(f, gp, false, false) && gp.goid != 1 { print("created by ", funcname(f), "\n") tracepc := pc // back up to CALL instruction for funcline. 
if pc > f.entry { @@ -714,7 +722,7 @@ func traceback1(pc, sp, lr uintptr, gp *g, flags uint) { func callers(skip int, pcbuf []uintptr) int { sp := getcallersp(unsafe.Pointer(&skip)) - pc := getcallerpc(unsafe.Pointer(&skip)) + pc := getcallerpc() gp := getg() var n int systemstack(func() { @@ -727,12 +735,28 @@ func gcallers(gp *g, skip int, pcbuf []uintptr) int { return gentraceback(^uintptr(0), ^uintptr(0), 0, gp, skip, &pcbuf[0], len(pcbuf), nil, nil, 0) } -func showframe(f funcInfo, gp *g, firstFrame bool) bool { +func showframe(f funcInfo, gp *g, firstFrame, elideWrapper bool) bool { g := getg() if g.m.throwing > 0 && gp != nil && (gp == g.m.curg || gp == g.m.caughtsig.ptr()) { return true } level, _, _ := gotraceback() + if level > 1 { + // Show all frames. + return true + } + + if !f.valid() { + return false + } + + if elideWrapper { + file, _ := funcline(f, f.entry) + if file == "<autogenerated>" { + return false + } + } + name := funcname(f) // Special case: always show runtime.gopanic frame @@ -744,7 +768,7 @@ func showframe(f funcInfo, gp *g, firstFrame bool) bool { return true } - return level > 1 || f.valid() && contains(name, ".") && (!hasprefix(name, "runtime.") || isExportedRuntime(name)) + return contains(name, ".") && (!hasprefix(name, "runtime.") || isExportedRuntime(name)) } // isExportedRuntime reports whether name is an exported runtime function. @@ -754,6 +778,14 @@ func isExportedRuntime(name string) bool { return len(name) > n && name[:n] == "runtime." && 'A' <= name[n] && name[n] <= 'Z' } +// elideWrapperCalling returns whether a wrapper function that called +// function "name" should be elided from stack traces. +func elideWrapperCalling(name string) bool { + // If the wrapper called a panic function instead of the + // wrapped function, we want to include it in stacks. + return !(name == "runtime.gopanic" || name == "runtime.sigpanic" || name == "runtime.panicwrap") +} + var gStatusStrings = [...]string{ _Gidle: "idle", _Grunnable: "runnable", @@ -795,7 +827,7 @@ func goroutineheader(gp *g) { if waitfor >= 1 { print(", ", waitfor, " minutes") } - if gp.lockedm != nil { + if gp.lockedm != 0 { print(", locked to thread") } print("]:\n") diff --git a/src/runtime/type.go b/src/runtime/type.go index bf54d54eb4..b3df3353ce 100644 --- a/src/runtime/type.go +++ b/src/runtime/type.go @@ -655,15 +655,15 @@ func typesEqual(t, v *_type, seen map[_typePair]struct{}) bool { if len(st.fields) != len(sv.fields) { return false } + if st.pkgPath.name() != sv.pkgPath.name() { + return false + } for i := range st.fields { tf := &st.fields[i] vf := &sv.fields[i] if tf.name.name() != vf.name.name() { return false } - if tf.name.pkgPath() != vf.name.pkgPath() { - return false - } if !typesEqual(tf.typ, vf.typ, seen) { return false } diff --git a/src/runtime/vdso_linux.go b/src/runtime/vdso_linux.go new file mode 100644 index 0000000000..5a4e8e578d --- /dev/null +++ b/src/runtime/vdso_linux.go @@ -0,0 +1,281 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build linux +// +build 386 amd64 + +package runtime + +import "unsafe" + +// Look up symbols in the Linux vDSO. 
+ +// This code was originally based on the sample Linux vDSO parser at +// https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/vDSO/parse_vdso.c + +// This implements the ELF dynamic linking spec at +// http://sco.com/developers/gabi/latest/ch5.dynamic.html + +// The version section is documented at +// http://refspecs.linuxfoundation.org/LSB_3.2.0/LSB-Core-generic/LSB-Core-generic/symversion.html + +const ( + _AT_SYSINFO_EHDR = 33 + + _PT_LOAD = 1 /* Loadable program segment */ + _PT_DYNAMIC = 2 /* Dynamic linking information */ + + _DT_NULL = 0 /* Marks end of dynamic section */ + _DT_HASH = 4 /* Dynamic symbol hash table */ + _DT_STRTAB = 5 /* Address of string table */ + _DT_SYMTAB = 6 /* Address of symbol table */ + _DT_GNU_HASH = 0x6ffffef5 /* GNU-style dynamic symbol hash table */ + _DT_VERSYM = 0x6ffffff0 + _DT_VERDEF = 0x6ffffffc + + _VER_FLG_BASE = 0x1 /* Version definition of file itself */ + + _SHN_UNDEF = 0 /* Undefined section */ + + _SHT_DYNSYM = 11 /* Dynamic linker symbol table */ + + _STT_FUNC = 2 /* Symbol is a code object */ + + _STB_GLOBAL = 1 /* Global symbol */ + _STB_WEAK = 2 /* Weak symbol */ + + _EI_NIDENT = 16 + + // Maximum indices for the array types used when traversing the vDSO ELF structures. + // Computed from architecture-specific max provided by vdso_linux_*.go + vdsoSymTabSize = vdsoArrayMax / unsafe.Sizeof(elfSym{}) + vdsoDynSize = vdsoArrayMax / unsafe.Sizeof(elfDyn{}) + vdsoSymStringsSize = vdsoArrayMax // byte + vdsoVerSymSize = vdsoArrayMax / 2 // uint16 + vdsoHashSize = vdsoArrayMax / 4 // uint32 + + // vdsoBloomSizeScale is a scaling factor for gnuhash tables which are uint32 indexed, + // but contain uintptrs + vdsoBloomSizeScale = unsafe.Sizeof(uintptr(0)) / 4 // uint32 +) + +/* How to extract and insert information held in the st_info field. */ +func _ELF_ST_BIND(val byte) byte { return val >> 4 } +func _ELF_ST_TYPE(val byte) byte { return val & 0xf } + +type symbol_key struct { + name string + sym_hash uint32 + gnu_hash uint32 + ptr *uintptr +} + +type version_key struct { + version string + ver_hash uint32 +} + +type vdso_info struct { + valid bool + + /* Load information */ + load_addr uintptr + load_offset uintptr /* load_addr - recorded vaddr */ + + /* Symbol table */ + symtab *[vdsoSymTabSize]elfSym + symstrings *[vdsoSymStringsSize]byte + chain []uint32 + bucket []uint32 + symOff uint32 + isGNUHash bool + + /* Version table */ + versym *[vdsoVerSymSize]uint16 + verdef *elfVerdef +} + +var linux26 = version_key{"LINUX_2.6", 0x3ae75f6} + +// see vdso_linux_*.go for sym_keys[] and __vdso_* vars + +func vdso_init_from_sysinfo_ehdr(info *vdso_info, hdr *elfEhdr) { + info.valid = false + info.load_addr = uintptr(unsafe.Pointer(hdr)) + + pt := unsafe.Pointer(info.load_addr + uintptr(hdr.e_phoff)) + + // We need two things from the segment table: the load offset + // and the dynamic table. + var found_vaddr bool + var dyn *[vdsoDynSize]elfDyn + for i := uint16(0); i < hdr.e_phnum; i++ { + pt := (*elfPhdr)(add(pt, uintptr(i)*unsafe.Sizeof(elfPhdr{}))) + switch pt.p_type { + case _PT_LOAD: + if !found_vaddr { + found_vaddr = true + info.load_offset = info.load_addr + uintptr(pt.p_offset-pt.p_vaddr) + } + + case _PT_DYNAMIC: + dyn = (*[vdsoDynSize]elfDyn)(unsafe.Pointer(info.load_addr + uintptr(pt.p_offset))) + } + } + + if !found_vaddr || dyn == nil { + return // Failed + } + + // Fish out the useful bits of the dynamic table. 
+ + var hash, gnuhash *[vdsoHashSize]uint32 + info.symstrings = nil + info.symtab = nil + info.versym = nil + info.verdef = nil + for i := 0; dyn[i].d_tag != _DT_NULL; i++ { + dt := &dyn[i] + p := info.load_offset + uintptr(dt.d_val) + switch dt.d_tag { + case _DT_STRTAB: + info.symstrings = (*[vdsoSymStringsSize]byte)(unsafe.Pointer(p)) + case _DT_SYMTAB: + info.symtab = (*[vdsoSymTabSize]elfSym)(unsafe.Pointer(p)) + case _DT_HASH: + hash = (*[vdsoHashSize]uint32)(unsafe.Pointer(p)) + case _DT_GNU_HASH: + gnuhash = (*[vdsoHashSize]uint32)(unsafe.Pointer(p)) + case _DT_VERSYM: + info.versym = (*[vdsoVerSymSize]uint16)(unsafe.Pointer(p)) + case _DT_VERDEF: + info.verdef = (*elfVerdef)(unsafe.Pointer(p)) + } + } + + if info.symstrings == nil || info.symtab == nil || (hash == nil && gnuhash == nil) { + return // Failed + } + + if info.verdef == nil { + info.versym = nil + } + + if gnuhash != nil { + // Parse the GNU hash table header. + nbucket := gnuhash[0] + info.symOff = gnuhash[1] + bloomSize := gnuhash[2] + info.bucket = gnuhash[4+bloomSize*uint32(vdsoBloomSizeScale):][:nbucket] + info.chain = gnuhash[4+bloomSize*uint32(vdsoBloomSizeScale)+nbucket:] + info.isGNUHash = true + } else { + // Parse the hash table header. + nbucket := hash[0] + nchain := hash[1] + info.bucket = hash[2 : 2+nbucket] + info.chain = hash[2+nbucket : 2+nbucket+nchain] + } + + // That's all we need. + info.valid = true +} + +func vdso_find_version(info *vdso_info, ver *version_key) int32 { + if !info.valid { + return 0 + } + + def := info.verdef + for { + if def.vd_flags&_VER_FLG_BASE == 0 { + aux := (*elfVerdaux)(add(unsafe.Pointer(def), uintptr(def.vd_aux))) + if def.vd_hash == ver.ver_hash && ver.version == gostringnocopy(&info.symstrings[aux.vda_name]) { + return int32(def.vd_ndx & 0x7fff) + } + } + + if def.vd_next == 0 { + break + } + def = (*elfVerdef)(add(unsafe.Pointer(def), uintptr(def.vd_next))) + } + + return -1 // cannot match any version +} + +func vdso_parse_symbols(info *vdso_info, version int32) { + if !info.valid { + return + } + + apply := func(symIndex uint32, k symbol_key) bool { + sym := &info.symtab[symIndex] + typ := _ELF_ST_TYPE(sym.st_info) + bind := _ELF_ST_BIND(sym.st_info) + if typ != _STT_FUNC || bind != _STB_GLOBAL && bind != _STB_WEAK || sym.st_shndx == _SHN_UNDEF { + return false + } + if k.name != gostringnocopy(&info.symstrings[sym.st_name]) { + return false + } + + // Check symbol version. + if info.versym != nil && version != 0 && int32(info.versym[symIndex]&0x7fff) != version { + return false + } + + *k.ptr = info.load_offset + uintptr(sym.st_value) + return true + } + + if !info.isGNUHash { + // Old-style DT_HASH table. + for _, k := range sym_keys { + for chain := info.bucket[k.sym_hash%uint32(len(info.bucket))]; chain != 0; chain = info.chain[chain] { + if apply(chain, k) { + break + } + } + } + return + } + + // New-style DT_GNU_HASH table. + for _, k := range sym_keys { + symIndex := info.bucket[k.gnu_hash%uint32(len(info.bucket))] + if symIndex < info.symOff { + continue + } + for ; ; symIndex++ { + hash := info.chain[symIndex-info.symOff] + if hash|1 == k.gnu_hash|1 { + // Found a hash match. + if apply(symIndex, k) { + break + } + } + if hash&1 != 0 { + // End of chain. + break + } + } + } +} + +func archauxv(tag, val uintptr) { + switch tag { + case _AT_SYSINFO_EHDR: + if val == 0 { + // Something went wrong + return + } + var info vdso_info + // TODO(rsc): I don't understand why the compiler thinks info escapes + // when passed to the three functions below. 
+ info1 := (*vdso_info)(noescape(unsafe.Pointer(&info))) + vdso_init_from_sysinfo_ehdr(info1, (*elfEhdr)(unsafe.Pointer(val))) + vdso_parse_symbols(info1, vdso_find_version(info1, &linux26)) + } +} diff --git a/src/runtime/vdso_linux_386.go b/src/runtime/vdso_linux_386.go new file mode 100644 index 0000000000..74ad953469 --- /dev/null +++ b/src/runtime/vdso_linux_386.go @@ -0,0 +1,93 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +// ELF32 structure definitions for use by the Linux vDSO loader + +type elfSym struct { + st_name uint32 + st_value uint32 + st_size uint32 + st_info byte + st_other byte + st_shndx uint16 +} + +type elfVerdef struct { + vd_version uint16 /* Version revision */ + vd_flags uint16 /* Version information */ + vd_ndx uint16 /* Version Index */ + vd_cnt uint16 /* Number of associated aux entries */ + vd_hash uint32 /* Version name hash value */ + vd_aux uint32 /* Offset in bytes to verdaux array */ + vd_next uint32 /* Offset in bytes to next verdef entry */ +} + +type elfEhdr struct { + e_ident [_EI_NIDENT]byte /* Magic number and other info */ + e_type uint16 /* Object file type */ + e_machine uint16 /* Architecture */ + e_version uint32 /* Object file version */ + e_entry uint32 /* Entry point virtual address */ + e_phoff uint32 /* Program header table file offset */ + e_shoff uint32 /* Section header table file offset */ + e_flags uint32 /* Processor-specific flags */ + e_ehsize uint16 /* ELF header size in bytes */ + e_phentsize uint16 /* Program header table entry size */ + e_phnum uint16 /* Program header table entry count */ + e_shentsize uint16 /* Section header table entry size */ + e_shnum uint16 /* Section header table entry count */ + e_shstrndx uint16 /* Section header string table index */ +} + +type elfPhdr struct { + p_type uint32 /* Segment type */ + p_offset uint32 /* Segment file offset */ + p_vaddr uint32 /* Segment virtual address */ + p_paddr uint32 /* Segment physical address */ + p_filesz uint32 /* Segment size in file */ + p_memsz uint32 /* Segment size in memory */ + p_flags uint32 /* Segment flags */ + p_align uint32 /* Segment alignment */ +} + +type elfShdr struct { + sh_name uint32 /* Section name (string tbl index) */ + sh_type uint32 /* Section type */ + sh_flags uint32 /* Section flags */ + sh_addr uint32 /* Section virtual addr at execution */ + sh_offset uint32 /* Section file offset */ + sh_size uint32 /* Section size in bytes */ + sh_link uint32 /* Link to another section */ + sh_info uint32 /* Additional section information */ + sh_addralign uint32 /* Section alignment */ + sh_entsize uint32 /* Entry size if section holds table */ +} + +type elfDyn struct { + d_tag int32 /* Dynamic entry type */ + d_val uint32 /* Integer value */ +} + +type elfVerdaux struct { + vda_name uint32 /* Version or dependency names */ + vda_next uint32 /* Offset in bytes to next verdaux entry */ +} + +const ( + // vdsoArrayMax is the byte-size of a maximally sized array on this architecture. + // See cmd/compile/internal/x86/galign.go arch.MAXWIDTH initialization, but must also + // be constrained to max +ve int. 
+ vdsoArrayMax = 1<<31 - 1 +) + +var sym_keys = []symbol_key{ + {"__vdso_clock_gettime", 0xd35ec75, 0x6e43a318, &__vdso_clock_gettime_sym}, +} + +// initialize to fall back to syscall +var ( + __vdso_clock_gettime_sym uintptr = 0 +) diff --git a/src/runtime/vdso_linux_amd64.go b/src/runtime/vdso_linux_amd64.go index 8a970dfbe6..0bbe5c2e8f 100644 --- a/src/runtime/vdso_linux_amd64.go +++ b/src/runtime/vdso_linux_amd64.go @@ -4,51 +4,9 @@ package runtime -import "unsafe" +// ELF64 structure definitions for use by the Linux vDSO loader -// Look up symbols in the Linux vDSO. - -// This code was originally based on the sample Linux vDSO parser at -// https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/vDSO/parse_vdso.c - -// This implements the ELF dynamic linking spec at -// http://sco.com/developers/gabi/latest/ch5.dynamic.html - -// The version section is documented at -// http://refspecs.linuxfoundation.org/LSB_3.2.0/LSB-Core-generic/LSB-Core-generic/symversion.html - -const ( - _AT_SYSINFO_EHDR = 33 - - _PT_LOAD = 1 /* Loadable program segment */ - _PT_DYNAMIC = 2 /* Dynamic linking information */ - - _DT_NULL = 0 /* Marks end of dynamic section */ - _DT_HASH = 4 /* Dynamic symbol hash table */ - _DT_STRTAB = 5 /* Address of string table */ - _DT_SYMTAB = 6 /* Address of symbol table */ - _DT_VERSYM = 0x6ffffff0 - _DT_VERDEF = 0x6ffffffc - - _VER_FLG_BASE = 0x1 /* Version definition of file itself */ - - _SHN_UNDEF = 0 /* Undefined section */ - - _SHT_DYNSYM = 11 /* Dynamic linker symbol table */ - - _STT_FUNC = 2 /* Symbol is a code object */ - - _STB_GLOBAL = 1 /* Global symbol */ - _STB_WEAK = 2 /* Weak symbol */ - - _EI_NIDENT = 16 -) - -/* How to extract and insert information held in the st_info field. */ -func _ELF64_ST_BIND(val byte) byte { return val >> 4 } -func _ELF64_ST_TYPE(val byte) byte { return val & 0xf } - -type elf64Sym struct { +type elfSym struct { st_name uint32 st_info byte st_other byte @@ -57,7 +15,7 @@ type elf64Sym struct { st_size uint64 } -type elf64Verdef struct { +type elfVerdef struct { vd_version uint16 /* Version revision */ vd_flags uint16 /* Version information */ vd_ndx uint16 /* Version Index */ @@ -67,7 +25,7 @@ type elf64Verdef struct { vd_next uint32 /* Offset in bytes to next verdef entry */ } -type elf64Ehdr struct { +type elfEhdr struct { e_ident [_EI_NIDENT]byte /* Magic number and other info */ e_type uint16 /* Object file type */ e_machine uint16 /* Architecture */ @@ -84,7 +42,7 @@ type elf64Ehdr struct { e_shstrndx uint16 /* Section header string table index */ } -type elf64Phdr struct { +type elfPhdr struct { p_type uint32 /* Segment type */ p_flags uint32 /* Segment flags */ p_offset uint64 /* Segment file offset */ @@ -95,7 +53,7 @@ type elf64Phdr struct { p_align uint64 /* Segment alignment */ } -type elf64Shdr struct { +type elfShdr struct { sh_name uint32 /* Section name (string tbl index) */ sh_type uint32 /* Section type */ sh_flags uint64 /* Section flags */ @@ -108,56 +66,26 @@ type elf64Shdr struct { sh_entsize uint64 /* Entry size if section holds table */ } -type elf64Dyn struct { +type elfDyn struct { d_tag int64 /* Dynamic entry type */ d_val uint64 /* Integer value */ } -type elf64Verdaux struct { +type elfVerdaux struct { vda_name uint32 /* Version or dependency names */ vda_next uint32 /* Offset in bytes to next verdaux entry */ } -type elf64Auxv struct { - a_type uint64 /* Entry type */ - a_val uint64 /* Integer value */ -} - -type symbol_key struct { - name string - sym_hash uint32 - ptr 
*uintptr -} - -type version_key struct { - version string - ver_hash uint32 -} - -type vdso_info struct { - valid bool - - /* Load information */ - load_addr uintptr - load_offset uintptr /* load_addr - recorded vaddr */ - - /* Symbol table */ - symtab *[1 << 32]elf64Sym - symstrings *[1 << 32]byte - chain []uint32 - bucket []uint32 - - /* Version table */ - versym *[1 << 32]uint16 - verdef *elf64Verdef -} - -var linux26 = version_key{"LINUX_2.6", 0x3ae75f6} +const ( + // vdsoArrayMax is the byte-size of a maximally sized array on this architecture. + // See cmd/compile/internal/amd64/galign.go arch.MAXWIDTH initialization. + vdsoArrayMax = 1<<50 - 1 +) var sym_keys = []symbol_key{ - {"__vdso_time", 0xa33c485, &__vdso_time_sym}, - {"__vdso_gettimeofday", 0x315ca59, &__vdso_gettimeofday_sym}, - {"__vdso_clock_gettime", 0xd35ec75, &__vdso_clock_gettime_sym}, + {"__vdso_time", 0xa33c485, 0x821e8e0d, &__vdso_time_sym}, + {"__vdso_gettimeofday", 0x315ca59, 0xb01bca00, &__vdso_gettimeofday_sym}, + {"__vdso_clock_gettime", 0xd35ec75, 0x6e43a318, &__vdso_clock_gettime_sym}, } // initialize with vsyscall fallbacks @@ -166,141 +94,3 @@ var ( __vdso_gettimeofday_sym uintptr = 0xffffffffff600000 __vdso_clock_gettime_sym uintptr = 0 ) - -func vdso_init_from_sysinfo_ehdr(info *vdso_info, hdr *elf64Ehdr) { - info.valid = false - info.load_addr = uintptr(unsafe.Pointer(hdr)) - - pt := unsafe.Pointer(info.load_addr + uintptr(hdr.e_phoff)) - - // We need two things from the segment table: the load offset - // and the dynamic table. - var found_vaddr bool - var dyn *[1 << 20]elf64Dyn - for i := uint16(0); i < hdr.e_phnum; i++ { - pt := (*elf64Phdr)(add(pt, uintptr(i)*unsafe.Sizeof(elf64Phdr{}))) - switch pt.p_type { - case _PT_LOAD: - if !found_vaddr { - found_vaddr = true - info.load_offset = info.load_addr + uintptr(pt.p_offset-pt.p_vaddr) - } - - case _PT_DYNAMIC: - dyn = (*[1 << 20]elf64Dyn)(unsafe.Pointer(info.load_addr + uintptr(pt.p_offset))) - } - } - - if !found_vaddr || dyn == nil { - return // Failed - } - - // Fish out the useful bits of the dynamic table. - - var hash *[1 << 30]uint32 - hash = nil - info.symstrings = nil - info.symtab = nil - info.versym = nil - info.verdef = nil - for i := 0; dyn[i].d_tag != _DT_NULL; i++ { - dt := &dyn[i] - p := info.load_offset + uintptr(dt.d_val) - switch dt.d_tag { - case _DT_STRTAB: - info.symstrings = (*[1 << 32]byte)(unsafe.Pointer(p)) - case _DT_SYMTAB: - info.symtab = (*[1 << 32]elf64Sym)(unsafe.Pointer(p)) - case _DT_HASH: - hash = (*[1 << 30]uint32)(unsafe.Pointer(p)) - case _DT_VERSYM: - info.versym = (*[1 << 32]uint16)(unsafe.Pointer(p)) - case _DT_VERDEF: - info.verdef = (*elf64Verdef)(unsafe.Pointer(p)) - } - } - - if info.symstrings == nil || info.symtab == nil || hash == nil { - return // Failed - } - - if info.verdef == nil { - info.versym = nil - } - - // Parse the hash table header. - nbucket := hash[0] - nchain := hash[1] - info.bucket = hash[2 : 2+nbucket] - info.chain = hash[2+nbucket : 2+nbucket+nchain] - - // That's all we need. 
- info.valid = true -} - -func vdso_find_version(info *vdso_info, ver *version_key) int32 { - if !info.valid { - return 0 - } - - def := info.verdef - for { - if def.vd_flags&_VER_FLG_BASE == 0 { - aux := (*elf64Verdaux)(add(unsafe.Pointer(def), uintptr(def.vd_aux))) - if def.vd_hash == ver.ver_hash && ver.version == gostringnocopy(&info.symstrings[aux.vda_name]) { - return int32(def.vd_ndx & 0x7fff) - } - } - - if def.vd_next == 0 { - break - } - def = (*elf64Verdef)(add(unsafe.Pointer(def), uintptr(def.vd_next))) - } - - return -1 // cannot match any version -} - -func vdso_parse_symbols(info *vdso_info, version int32) { - if !info.valid { - return - } - - for _, k := range sym_keys { - for chain := info.bucket[k.sym_hash%uint32(len(info.bucket))]; chain != 0; chain = info.chain[chain] { - sym := &info.symtab[chain] - typ := _ELF64_ST_TYPE(sym.st_info) - bind := _ELF64_ST_BIND(sym.st_info) - if typ != _STT_FUNC || bind != _STB_GLOBAL && bind != _STB_WEAK || sym.st_shndx == _SHN_UNDEF { - continue - } - if k.name != gostringnocopy(&info.symstrings[sym.st_name]) { - continue - } - - // Check symbol version. - if info.versym != nil && version != 0 && int32(info.versym[chain]&0x7fff) != version { - continue - } - - *k.ptr = info.load_offset + uintptr(sym.st_value) - break - } - } -} - -func archauxv(tag, val uintptr) { - switch tag { - case _AT_SYSINFO_EHDR: - if val == 0 { - // Something went wrong - return - } - var info vdso_info - // TODO(rsc): I don't understand why the compiler thinks info escapes - // when passed to the three functions below. - info1 := (*vdso_info)(noescape(unsafe.Pointer(&info))) - vdso_init_from_sysinfo_ehdr(info1, (*elf64Ehdr)(unsafe.Pointer(val))) - vdso_parse_symbols(info1, vdso_find_version(info1, &linux26)) - } -} diff --git a/src/runtime/vdso_linux_test.go b/src/runtime/vdso_linux_test.go new file mode 100644 index 0000000000..f507ee98ee --- /dev/null +++ b/src/runtime/vdso_linux_test.go @@ -0,0 +1,63 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build linux +// +build 386 amd64 + +package runtime_test + +import ( + "testing" + "time" + _ "unsafe" +) + +// These tests are a little risky because they overwrite the __vdso_clock_gettime_sym value. +// It's normally initialized at startup and remains unchanged after that. + +//go:linkname __vdso_clock_gettime_sym runtime.__vdso_clock_gettime_sym +var __vdso_clock_gettime_sym uintptr + +func TestClockVDSOAndFallbackPaths(t *testing.T) { + // Check that we can call walltime() and nanotime() with and without their (1st) fast-paths. + // This just checks that fast and fallback paths can be called, rather than testing their + // results. + // + // Call them indirectly via time.Now(), so we don't need auxiliary .s files to allow us to + // use go:linkname to refer to the functions directly. + + save := __vdso_clock_gettime_sym + if save == 0 { + t.Log("__vdso_clock_gettime symbol not found; fallback path will be used by default") + } + + // Call with fast-path enabled (if vDSO symbol found at startup) + time.Now() + + // Call with fast-path disabled + __vdso_clock_gettime_sym = 0 + time.Now() + __vdso_clock_gettime_sym = save +} + +func BenchmarkClockVDSOAndFallbackPaths(b *testing.B) { + run := func(b *testing.B) { + for i := 0; i < b.N; i++ { + // Call via time.Now() - see comment in test above. 
+ time.Now() + } + } + + save := __vdso_clock_gettime_sym + b.Run("vDSO", run) + __vdso_clock_gettime_sym = 0 + b.Run("Fallback", run) + __vdso_clock_gettime_sym = save +} + +func BenchmarkTimeNow(b *testing.B) { + for i := 0; i < b.N; i++ { + time.Now() + } +} |
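
A note on what the new lockosthread test programs above are exercising: both the plain and the cgo variants start a goroutine, call runtime.LockOSThread inside it, and then let the goroutine return while still locked, afterwards checking (via /proc/self/task on Linux, or a pthread destructor under cgo) that the locked thread has gone away rather than being reused. A minimal standalone sketch of that behavior, assuming the Go 1.10 semantics that a locked goroutine exiting terminates its thread:

package main

import (
	"fmt"
	"runtime"
	"time"
)

func main() {
	done := make(chan struct{})
	go func() {
		// Lock this goroutine to its OS thread, then return without
		// unlocking. Under the semantics the lockosthread tests check,
		// the runtime terminates the wedged thread instead of putting
		// it back into the scheduler's thread pool.
		runtime.LockOSThread()
		close(done)
	}()
	<-done
	// The tests poll for up to ~100ms for the thread to disappear; a
	// short sleep is enough for a demonstration.
	time.Sleep(time.Millisecond)
	fmt.Println("locked goroutine exited")
}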
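
The reworked src/runtime/time.go above replaces the single global timers struct with a fixed array of 64 cache-line-padded timersBucket values, indexed by the current P's id modulo timersLen, so timer operations on different Ps usually take different locks. The same sharding pattern can be sketched outside the runtime roughly as follows; bucket, padded, forID, and the 64-byte cache line are assumptions made for illustration (the runtime uses sys.CacheLineSize and getg().m.p.ptr().id):

package main

import (
	"fmt"
	"sync"
	"unsafe"
)

// bucket stands in for timersBucket: an independently locked heap,
// so shards do not contend on one global mutex.
type bucket struct {
	mu   sync.Mutex
	heap []int64
}

const (
	cacheLineSize = 64 // assumed; the runtime uses sys.CacheLineSize
	nBuckets      = 64 // mirrors timersLen, a memory/performance compromise
)

// padded keeps adjacent buckets on separate cache lines, like the
// anonymous pad array in the runtime's timers declaration.
type padded struct {
	bucket
	_ [cacheLineSize - unsafe.Sizeof(bucket{})%cacheLineSize]byte
}

var buckets [nBuckets]padded

// forID picks a bucket the way assignBucket does: shard id modulo the
// array length, so several shards can share a bucket when there are
// more of them than buckets.
func forID(id int) *bucket {
	return &buckets[uint8(id)%nBuckets].bucket
}

func main() {
	b := forID(3)
	b.mu.Lock()
	b.heap = append(b.heap, 42)
	b.mu.Unlock()
	fmt.Println(len(b.heap), unsafe.Sizeof(padded{}))
}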
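
The new nil check at the top of deltimer exists because a timer can now reach the heap code without ever having been registered: the comment cites a time.Ticker built directly as a composite literal, which never goes through startTimer and so never gets a bucket (Issue 21874). A hedged illustration of the construction in question, not taken from the test suite:

package main

import "time"

func main() {
	// Building the Ticker by hand skips time.NewTicker and therefore
	// startTimer, so its embedded runtime timer has no bucket assigned.
	c := make(chan time.Time)
	t := time.Ticker{C: c}

	// With the guard in deltimer this Stop simply finds nothing to
	// delete and returns; without it, the nil bucket would be
	// dereferenced when taking its lock.
	t.Stop()
}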
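
Finally, the 4950..5050 goroutine sweep in TestEventBatch is sized from the numbers in its own comment: a roughly 64 KiB trace buffer and 12-13 bytes of traceEvGoCreate/traceEvGoWaiting data per goroutine put the buffer boundary near 5000 goroutines, so some count in that window should force a Flush while Start is still writing its snapshot. The back-of-the-envelope check:

package main

import "fmt"

func main() {
	// 64 KiB of buffer divided by ~13 bytes per goroutine record.
	fmt.Println(64 * 1024 / 13) // 5041, inside the 4950..5050 sweep
}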
