diff options
| author | Katie Hockman <katie@golang.org> | 2020-12-14 10:03:05 -0500 |
|---|---|---|
| committer | Katie Hockman <katie@golang.org> | 2020-12-14 10:06:13 -0500 |
| commit | 0345ede87ee12698988973884cfc0fd3d499dffd (patch) | |
| tree | 7123cff141ee5661208d2f5f437b8f5252ac7f6a /src/runtime | |
| parent | 4651d6b267818b0e0d128a5443289717c4bb8cbc (diff) | |
| parent | 0a02371b0576964e81c3b40d328db9a3ef3b031b (diff) | |
| download | go-0345ede87ee12698988973884cfc0fd3d499dffd.tar.xz | |
[dev.fuzz] all: merge master into dev.fuzz
Change-Id: I5d8c8329ccc9d747bd81ade6b1cb7cb8ae2e94b2
Diffstat (limited to 'src/runtime')
287 files changed, 11184 insertions, 3779 deletions
diff --git a/src/runtime/HACKING.md b/src/runtime/HACKING.md index 993edc67d8..fbf22eeb44 100644 --- a/src/runtime/HACKING.md +++ b/src/runtime/HACKING.md @@ -281,11 +281,12 @@ go:notinheap ------------ `go:notinheap` applies to type declarations. It indicates that a type -must never be allocated from the GC'd heap. Specifically, pointers to -this type must always fail the `runtime.inheap` check. The type may be -used for global variables, for stack variables, or for objects in -unmanaged memory (e.g., allocated with `sysAlloc`, `persistentalloc`, -`fixalloc`, or from a manually-managed span). Specifically: +must never be allocated from the GC'd heap or on the stack. +Specifically, pointers to this type must always fail the +`runtime.inheap` check. The type may be used for global variables, or +for objects in unmanaged memory (e.g., allocated with `sysAlloc`, +`persistentalloc`, `fixalloc`, or from a manually-managed span). +Specifically: 1. `new(T)`, `make([]T)`, `append([]T, ...)` and implicit heap allocation of T are disallowed. 
(Though implicit allocations are diff --git a/src/runtime/alg.go b/src/runtime/alg.go index 0af48ab25c..1b3bf1180d 100644 --- a/src/runtime/alg.go +++ b/src/runtime/alg.go @@ -15,25 +15,6 @@ const ( c1 = uintptr((8-sys.PtrSize)/4*3267000013 + (sys.PtrSize-4)/4*23344194077549503) ) -// type algorithms - known to compiler -const ( - alg_NOEQ = iota - alg_MEM0 - alg_MEM8 - alg_MEM16 - alg_MEM32 - alg_MEM64 - alg_MEM128 - alg_STRING - alg_INTER - alg_NILINTER - alg_FLOAT32 - alg_FLOAT64 - alg_CPLX64 - alg_CPLX128 - alg_max -) - func memhash0(p unsafe.Pointer, h uintptr) uintptr { return h } diff --git a/src/runtime/asm_386.s b/src/runtime/asm_386.s index 11863fba39..fa3b1be339 100644 --- a/src/runtime/asm_386.s +++ b/src/runtime/asm_386.s @@ -702,25 +702,9 @@ nosave: MOVL AX, ret+8(FP) RET -// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt) -// Turn the fn into a Go func (by taking its address) and call -// cgocallback_gofunc. -TEXT runtime·cgocallback(SB),NOSPLIT,$16-16 - LEAL fn+0(FP), AX - MOVL AX, 0(SP) - MOVL frame+4(FP), AX - MOVL AX, 4(SP) - MOVL framesize+8(FP), AX - MOVL AX, 8(SP) - MOVL ctxt+12(FP), AX - MOVL AX, 12(SP) - MOVL $runtime·cgocallback_gofunc(SB), AX - CALL AX - RET - -// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt) +// cgocallback(fn, frame unsafe.Pointer, ctxt uintptr) // See cgocall.go for more details. -TEXT ·cgocallback_gofunc(SB),NOSPLIT,$12-16 +TEXT ·cgocallback(SB),NOSPLIT,$12-12 // Frame size must match commented places below NO_LOCAL_POINTERS // If g is nil, Go did not create the current thread. 
@@ -738,13 +722,12 @@ TEXT ·cgocallback_gofunc(SB),NOSPLIT,$12-16 CMPL BP, $0 JEQ needm MOVL g_m(BP), BP - MOVL BP, DX // saved copy of oldm + MOVL BP, savedm-4(SP) // saved copy of oldm JMP havem needm: - MOVL $0, 0(SP) MOVL $runtime·needm(SB), AX CALL AX - MOVL 0(SP), DX + MOVL $0, savedm-4(SP) // dropm on return get_tls(CX) MOVL g(CX), BP MOVL g_m(BP), BP @@ -780,34 +763,32 @@ havem: // save that information (m->curg->sched) so we can restore it. // We can restore m->curg->sched.sp easily, because calling // runtime.cgocallbackg leaves SP unchanged upon return. - // To save m->curg->sched.pc, we push it onto the stack. - // This has the added benefit that it looks to the traceback - // routine like cgocallbackg is going to return to that - // PC (because the frame we allocate below has the same - // size as cgocallback_gofunc's frame declared above) - // so that the traceback will seamlessly trace back into - // the earlier calls. - // - // In the new goroutine, 4(SP) holds the saved oldm (DX) register. - // 8(SP) is unused. + // To save m->curg->sched.pc, we push it onto the curg stack and + // open a frame the same size as cgocallback's g0 frame. + // Once we switch to the curg stack, the pushed PC will appear + // to be the return PC of cgocallback, so that the traceback + // will seamlessly trace back into the earlier calls. MOVL m_curg(BP), SI MOVL SI, g(CX) MOVL (g_sched+gobuf_sp)(SI), DI // prepare stack as DI MOVL (g_sched+gobuf_pc)(SI), BP - MOVL BP, -4(DI) - MOVL ctxt+12(FP), CX - LEAL -(4+12)(DI), SP - MOVL DX, 4(SP) - MOVL CX, 0(SP) + MOVL BP, -4(DI) // "push" return PC on the g stack + // Gather our arguments into registers. + MOVL fn+0(FP), AX + MOVL frame+4(FP), BX + MOVL ctxt+8(FP), CX + LEAL -(4+12)(DI), SP // Must match declared frame size + MOVL AX, 0(SP) + MOVL BX, 4(SP) + MOVL CX, 8(SP) CALL runtime·cgocallbackg(SB) - MOVL 4(SP), DX // Restore g->sched (== m->curg->sched) from saved values. 
get_tls(CX) MOVL g(CX), SI - MOVL 12(SP), BP + MOVL 12(SP), BP // Must match declared frame size MOVL BP, (g_sched+gobuf_pc)(SI) - LEAL (12+4)(SP), DI + LEAL (12+4)(SP), DI // Must match declared frame size MOVL DI, (g_sched+gobuf_sp)(SI) // Switch back to m->g0's stack and restore m->g0->sched.sp. @@ -823,6 +804,7 @@ havem: // If the m on entry was nil, we called needm above to borrow an m // for the duration of the call. Since the call is over, return it with dropm. + MOVL savedm-4(SP), DX CMPL DX, $0 JNE 3(PC) MOVL $runtime·dropm(SB), AX diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index fa25c55b96..196252e1dd 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -84,7 +84,9 @@ GLOBL _rt0_amd64_lib_argc<>(SB),NOPTR, $8 DATA _rt0_amd64_lib_argv<>(SB)/8, $0 GLOBL _rt0_amd64_lib_argv<>(SB),NOPTR, $8 -TEXT runtime·rt0_go(SB),NOSPLIT,$0 +// Defined as ABIInternal since it does not use the stack-based Go ABI (and +// in addition there are no calls to this entry point from Go code). +TEXT runtime·rt0_go<ABIInternal>(SB),NOSPLIT,$0 // copy arguments forward on an even stack MOVQ DI, AX // argc MOVQ SI, BX // argv @@ -229,10 +231,13 @@ ok: // Prevent dead-code elimination of debugCallV1, which is // intended to be called by debuggers. - MOVQ $runtime·debugCallV1(SB), AX + MOVQ $runtime·debugCallV1<ABIInternal>(SB), AX RET -DATA runtime·mainPC+0(SB)/8,$runtime·main(SB) +// mainPC is a function value for runtime.main, to be passed to newproc. +// The reference to runtime.main is made via ABIInternal, since the +// actual function (not the ABI0 wrapper) is needed by newproc. +DATA runtime·mainPC+0(SB)/8,$runtime·main<ABIInternal>(SB) GLOBL runtime·mainPC(SB),RODATA,$8 TEXT runtime·breakpoint(SB),NOSPLIT,$0-0 @@ -468,8 +473,9 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0 JMP AX // Note: can't just "JMP NAME(SB)" - bad inlining results. 
-TEXT ·reflectcall(SB), NOSPLIT, $0-32 +TEXT ·reflectcall<ABIInternal>(SB), NOSPLIT, $0-32 MOVLQZX argsize+24(FP), CX + DISPATCH(runtime·call16, 16) DISPATCH(runtime·call32, 32) DISPATCH(runtime·call64, 64) DISPATCH(runtime·call128, 128) @@ -537,6 +543,7 @@ TEXT callRet<>(SB), NOSPLIT, $32-0 CALL runtime·reflectcallmove(SB) RET +CALLFN(·call16, 16) CALLFN(·call32, 32) CALLFN(·call64, 64) CALLFN(·call128, 128) @@ -689,25 +696,9 @@ nosave: MOVL AX, ret+16(FP) RET -// func cgocallback(fn, frame unsafe.Pointer, framesize, ctxt uintptr) -// Turn the fn into a Go func (by taking its address) and call -// cgocallback_gofunc. -TEXT runtime·cgocallback(SB),NOSPLIT,$32-32 - LEAQ fn+0(FP), AX - MOVQ AX, 0(SP) - MOVQ frame+8(FP), AX - MOVQ AX, 8(SP) - MOVQ framesize+16(FP), AX - MOVQ AX, 16(SP) - MOVQ ctxt+24(FP), AX - MOVQ AX, 24(SP) - MOVQ $runtime·cgocallback_gofunc(SB), AX - CALL AX - RET - -// func cgocallback_gofunc(fn, frame, framesize, ctxt uintptr) +// func cgocallback(fn, frame unsafe.Pointer, ctxt uintptr) // See cgocall.go for more details. -TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32 +TEXT ·cgocallback(SB),NOSPLIT,$24-24 NO_LOCAL_POINTERS // If g is nil, Go did not create the current thread. @@ -725,13 +716,12 @@ TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32 CMPQ BX, $0 JEQ needm MOVQ g_m(BX), BX - MOVQ BX, R8 // holds oldm until end of function + MOVQ BX, savedm-8(SP) // saved copy of oldm JMP havem needm: - MOVQ $0, 0(SP) - MOVQ $runtime·needm(SB), AX + MOVQ $runtime·needm(SB), AX CALL AX - MOVQ 0(SP), R8 + MOVQ $0, savedm-8(SP) // dropm on return get_tls(CX) MOVQ g(CX), BX MOVQ g_m(BX), BX @@ -767,37 +757,36 @@ havem: // save that information (m->curg->sched) so we can restore it. // We can restore m->curg->sched.sp easily, because calling // runtime.cgocallbackg leaves SP unchanged upon return. - // To save m->curg->sched.pc, we push it onto the stack. 
- // This has the added benefit that it looks to the traceback - // routine like cgocallbackg is going to return to that - // PC (because the frame we allocate below has the same - // size as cgocallback_gofunc's frame declared above) - // so that the traceback will seamlessly trace back into - // the earlier calls. - // - // In the new goroutine, 8(SP) holds the saved R8. + // To save m->curg->sched.pc, we push it onto the curg stack and + // open a frame the same size as cgocallback's g0 frame. + // Once we switch to the curg stack, the pushed PC will appear + // to be the return PC of cgocallback, so that the traceback + // will seamlessly trace back into the earlier calls. MOVQ m_curg(BX), SI MOVQ SI, g(CX) MOVQ (g_sched+gobuf_sp)(SI), DI // prepare stack as DI MOVQ (g_sched+gobuf_pc)(SI), BX - MOVQ BX, -8(DI) + MOVQ BX, -8(DI) // "push" return PC on the g stack + // Gather our arguments into registers. + MOVQ fn+0(FP), BX + MOVQ frame+8(FP), CX + MOVQ ctxt+16(FP), DX // Compute the size of the frame, including return PC and, if // GOEXPERIMENT=framepointer, the saved base pointer - MOVQ ctxt+24(FP), BX - LEAQ fv+0(FP), AX - SUBQ SP, AX - SUBQ AX, DI + LEAQ fn+0(FP), AX + SUBQ SP, AX // AX is our actual frame size + SUBQ AX, DI // Allocate the same frame size on the g stack MOVQ DI, SP - MOVQ R8, 8(SP) MOVQ BX, 0(SP) + MOVQ CX, 8(SP) + MOVQ DX, 16(SP) CALL runtime·cgocallbackg(SB) - MOVQ 8(SP), R8 // Compute the size of the frame again. FP and SP have // completely different values here than they did above, // but only their difference matters. - LEAQ fv+0(FP), AX + LEAQ fn+0(FP), AX SUBQ SP, AX // Restore g->sched (== m->curg->sched) from saved values. @@ -822,7 +811,8 @@ havem: // If the m on entry was nil, we called needm above to borrow an m // for the duration of the call. Since the call is over, return it with dropm. 
- CMPQ R8, $0 + MOVQ savedm-8(SP), BX + CMPQ BX, $0 JNE 3(PC) MOVQ $runtime·dropm(SB), AX CALL AX @@ -1369,8 +1359,11 @@ TEXT _cgo_topofstack(SB),NOSPLIT,$0 RET // The top-most function running on a goroutine -// returns to goexit+PCQuantum. -TEXT runtime·goexit(SB),NOSPLIT,$0-0 +// returns to goexit+PCQuantum. Defined as ABIInternal +// so as to make it identifiable to traceback (this +// function it used as a sentinel; traceback wants to +// see the func PC, not a wrapper PC). +TEXT runtime·goexit<ABIInternal>(SB),NOSPLIT,$0-0 BYTE $0x90 // NOP CALL runtime·goexit1(SB) // does not return // traceback from goexit1 must hit code range of goexit @@ -1392,7 +1385,8 @@ TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0 // - AX is the value being written at DI // It clobbers FLAGS. It does not clobber any general-purpose registers, // but may clobber others (e.g., SSE registers). -TEXT runtime·gcWriteBarrier(SB),NOSPLIT,$120 +// Defined as ABIInternal since it does not use the stack-based Go ABI. +TEXT runtime·gcWriteBarrier<ABIInternal>(SB),NOSPLIT,$120 // Save the registers clobbered by the fast path. This is slightly // faster than having the caller spill these. MOVQ R14, 104(SP) @@ -1476,51 +1470,58 @@ flush: JMP ret // gcWriteBarrierCX is gcWriteBarrier, but with args in DI and CX. -TEXT runtime·gcWriteBarrierCX(SB),NOSPLIT,$0 +// Defined as ABIInternal since it does not use the stable Go ABI. +TEXT runtime·gcWriteBarrierCX<ABIInternal>(SB),NOSPLIT,$0 XCHGQ CX, AX - CALL runtime·gcWriteBarrier(SB) + CALL runtime·gcWriteBarrier<ABIInternal>(SB) XCHGQ CX, AX RET // gcWriteBarrierDX is gcWriteBarrier, but with args in DI and DX. -TEXT runtime·gcWriteBarrierDX(SB),NOSPLIT,$0 +// Defined as ABIInternal since it does not use the stable Go ABI. +TEXT runtime·gcWriteBarrierDX<ABIInternal>(SB),NOSPLIT,$0 XCHGQ DX, AX - CALL runtime·gcWriteBarrier(SB) + CALL runtime·gcWriteBarrier<ABIInternal>(SB) XCHGQ DX, AX RET // gcWriteBarrierBX is gcWriteBarrier, but with args in DI and BX. 
-TEXT runtime·gcWriteBarrierBX(SB),NOSPLIT,$0 +// Defined as ABIInternal since it does not use the stable Go ABI. +TEXT runtime·gcWriteBarrierBX<ABIInternal>(SB),NOSPLIT,$0 XCHGQ BX, AX - CALL runtime·gcWriteBarrier(SB) + CALL runtime·gcWriteBarrier<ABIInternal>(SB) XCHGQ BX, AX RET // gcWriteBarrierBP is gcWriteBarrier, but with args in DI and BP. -TEXT runtime·gcWriteBarrierBP(SB),NOSPLIT,$0 +// Defined as ABIInternal since it does not use the stable Go ABI. +TEXT runtime·gcWriteBarrierBP<ABIInternal>(SB),NOSPLIT,$0 XCHGQ BP, AX - CALL runtime·gcWriteBarrier(SB) + CALL runtime·gcWriteBarrier<ABIInternal>(SB) XCHGQ BP, AX RET // gcWriteBarrierSI is gcWriteBarrier, but with args in DI and SI. -TEXT runtime·gcWriteBarrierSI(SB),NOSPLIT,$0 +// Defined as ABIInternal since it does not use the stable Go ABI. +TEXT runtime·gcWriteBarrierSI<ABIInternal>(SB),NOSPLIT,$0 XCHGQ SI, AX - CALL runtime·gcWriteBarrier(SB) + CALL runtime·gcWriteBarrier<ABIInternal>(SB) XCHGQ SI, AX RET // gcWriteBarrierR8 is gcWriteBarrier, but with args in DI and R8. -TEXT runtime·gcWriteBarrierR8(SB),NOSPLIT,$0 +// Defined as ABIInternal since it does not use the stable Go ABI. +TEXT runtime·gcWriteBarrierR8<ABIInternal>(SB),NOSPLIT,$0 XCHGQ R8, AX - CALL runtime·gcWriteBarrier(SB) + CALL runtime·gcWriteBarrier<ABIInternal>(SB) XCHGQ R8, AX RET // gcWriteBarrierR9 is gcWriteBarrier, but with args in DI and R9. -TEXT runtime·gcWriteBarrierR9(SB),NOSPLIT,$0 +// Defined as ABIInternal since it does not use the stable Go ABI. +TEXT runtime·gcWriteBarrierR9<ABIInternal>(SB),NOSPLIT,$0 XCHGQ R9, AX - CALL runtime·gcWriteBarrier(SB) + CALL runtime·gcWriteBarrier<ABIInternal>(SB) XCHGQ R9, AX RET @@ -1559,7 +1560,10 @@ GLOBL debugCallFrameTooLarge<>(SB), RODATA, $20 // Size duplicated below // obey escape analysis requirements. Specifically, it must not pass // a stack pointer to an escaping argument. debugCallV1 cannot check // this invariant. 
-TEXT runtime·debugCallV1(SB),NOSPLIT,$152-0 +// +// This is ABIInternal because Go code injects its PC directly into new +// goroutine stacks. +TEXT runtime·debugCallV1<ABIInternal>(SB),NOSPLIT,$152-0 // Save all registers that may contain pointers so they can be // conservatively scanned. // @@ -1720,67 +1724,68 @@ TEXT runtime·debugCallPanicked(SB),NOSPLIT,$16-16 // in the caller's stack frame. These stubs write the args into that stack space and // then tail call to the corresponding runtime handler. // The tail call makes these stubs disappear in backtraces. -TEXT runtime·panicIndex(SB),NOSPLIT,$0-16 +// Defined as ABIInternal since they do not use the stack-based Go ABI. +TEXT runtime·panicIndex<ABIInternal>(SB),NOSPLIT,$0-16 MOVQ AX, x+0(FP) MOVQ CX, y+8(FP) JMP runtime·goPanicIndex(SB) -TEXT runtime·panicIndexU(SB),NOSPLIT,$0-16 +TEXT runtime·panicIndexU<ABIInternal>(SB),NOSPLIT,$0-16 MOVQ AX, x+0(FP) MOVQ CX, y+8(FP) JMP runtime·goPanicIndexU(SB) -TEXT runtime·panicSliceAlen(SB),NOSPLIT,$0-16 +TEXT runtime·panicSliceAlen<ABIInternal>(SB),NOSPLIT,$0-16 MOVQ CX, x+0(FP) MOVQ DX, y+8(FP) JMP runtime·goPanicSliceAlen(SB) -TEXT runtime·panicSliceAlenU(SB),NOSPLIT,$0-16 +TEXT runtime·panicSliceAlenU<ABIInternal>(SB),NOSPLIT,$0-16 MOVQ CX, x+0(FP) MOVQ DX, y+8(FP) JMP runtime·goPanicSliceAlenU(SB) -TEXT runtime·panicSliceAcap(SB),NOSPLIT,$0-16 +TEXT runtime·panicSliceAcap<ABIInternal>(SB),NOSPLIT,$0-16 MOVQ CX, x+0(FP) MOVQ DX, y+8(FP) JMP runtime·goPanicSliceAcap(SB) -TEXT runtime·panicSliceAcapU(SB),NOSPLIT,$0-16 +TEXT runtime·panicSliceAcapU<ABIInternal>(SB),NOSPLIT,$0-16 MOVQ CX, x+0(FP) MOVQ DX, y+8(FP) JMP runtime·goPanicSliceAcapU(SB) -TEXT runtime·panicSliceB(SB),NOSPLIT,$0-16 +TEXT runtime·panicSliceB<ABIInternal>(SB),NOSPLIT,$0-16 MOVQ AX, x+0(FP) MOVQ CX, y+8(FP) JMP runtime·goPanicSliceB(SB) -TEXT runtime·panicSliceBU(SB),NOSPLIT,$0-16 +TEXT runtime·panicSliceBU<ABIInternal>(SB),NOSPLIT,$0-16 MOVQ AX, x+0(FP) MOVQ CX, y+8(FP) JMP 
runtime·goPanicSliceBU(SB) -TEXT runtime·panicSlice3Alen(SB),NOSPLIT,$0-16 +TEXT runtime·panicSlice3Alen<ABIInternal>(SB),NOSPLIT,$0-16 MOVQ DX, x+0(FP) MOVQ BX, y+8(FP) JMP runtime·goPanicSlice3Alen(SB) -TEXT runtime·panicSlice3AlenU(SB),NOSPLIT,$0-16 +TEXT runtime·panicSlice3AlenU<ABIInternal>(SB),NOSPLIT,$0-16 MOVQ DX, x+0(FP) MOVQ BX, y+8(FP) JMP runtime·goPanicSlice3AlenU(SB) -TEXT runtime·panicSlice3Acap(SB),NOSPLIT,$0-16 +TEXT runtime·panicSlice3Acap<ABIInternal>(SB),NOSPLIT,$0-16 MOVQ DX, x+0(FP) MOVQ BX, y+8(FP) JMP runtime·goPanicSlice3Acap(SB) -TEXT runtime·panicSlice3AcapU(SB),NOSPLIT,$0-16 +TEXT runtime·panicSlice3AcapU<ABIInternal>(SB),NOSPLIT,$0-16 MOVQ DX, x+0(FP) MOVQ BX, y+8(FP) JMP runtime·goPanicSlice3AcapU(SB) -TEXT runtime·panicSlice3B(SB),NOSPLIT,$0-16 +TEXT runtime·panicSlice3B<ABIInternal>(SB),NOSPLIT,$0-16 MOVQ CX, x+0(FP) MOVQ DX, y+8(FP) JMP runtime·goPanicSlice3B(SB) -TEXT runtime·panicSlice3BU(SB),NOSPLIT,$0-16 +TEXT runtime·panicSlice3BU<ABIInternal>(SB),NOSPLIT,$0-16 MOVQ CX, x+0(FP) MOVQ DX, y+8(FP) JMP runtime·goPanicSlice3BU(SB) -TEXT runtime·panicSlice3C(SB),NOSPLIT,$0-16 +TEXT runtime·panicSlice3C<ABIInternal>(SB),NOSPLIT,$0-16 MOVQ AX, x+0(FP) MOVQ CX, y+8(FP) JMP runtime·goPanicSlice3C(SB) -TEXT runtime·panicSlice3CU(SB),NOSPLIT,$0-16 +TEXT runtime·panicSlice3CU<ABIInternal>(SB),NOSPLIT,$0-16 MOVQ AX, x+0(FP) MOVQ CX, y+8(FP) JMP runtime·goPanicSlice3CU(SB) diff --git a/src/runtime/asm_arm.s b/src/runtime/asm_arm.s index 51a50c604c..c54b4eb006 100644 --- a/src/runtime/asm_arm.s +++ b/src/runtime/asm_arm.s @@ -643,25 +643,9 @@ nosave: MOVW R0, ret+8(FP) RET -// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt) -// Turn the fn into a Go func (by taking its address) and call -// cgocallback_gofunc. 
-TEXT runtime·cgocallback(SB),NOSPLIT,$16-16 - MOVW $fn+0(FP), R0 - MOVW R0, 4(R13) - MOVW frame+4(FP), R0 - MOVW R0, 8(R13) - MOVW framesize+8(FP), R0 - MOVW R0, 12(R13) - MOVW ctxt+12(FP), R0 - MOVW R0, 16(R13) - MOVW $runtime·cgocallback_gofunc(SB), R0 - BL (R0) - RET - -// cgocallback_gofunc(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt) +// cgocallback(fn, frame unsafe.Pointer, ctxt uintptr) // See cgocall.go for more details. -TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-16 +TEXT ·cgocallback(SB),NOSPLIT,$12-12 NO_LOCAL_POINTERS // Load m and g from thread-local storage. @@ -686,7 +670,7 @@ needm: MOVW $runtime·needm(SB), R0 BL (R0) - // Set m->sched.sp = SP, so that if a panic happens + // Set m->g0->sched.sp = SP, so that if a panic happens // during the function we are about to execute, it will // have a valid SP to run on the g0 stack. // The next few lines (after the havem label) @@ -706,10 +690,10 @@ havem: // Save current m->g0->sched.sp on stack and then set it to SP. // Save current sp in m->g0->sched.sp in preparation for // switch back to m->curg stack. - // NOTE: unwindm knows that the saved g->sched.sp is at 4(R13) aka savedsp-8(SP). + // NOTE: unwindm knows that the saved g->sched.sp is at 4(R13) aka savedsp-12(SP). MOVW m_g0(R8), R3 MOVW (g_sched+gobuf_sp)(R3), R4 - MOVW R4, savedsp-8(SP) + MOVW R4, savedsp-12(SP) // must match frame size MOVW R13, (g_sched+gobuf_sp)(R3) // Switch to m->curg stack and call runtime.cgocallbackg. @@ -718,30 +702,30 @@ havem: // save that information (m->curg->sched) so we can restore it. // We can restore m->curg->sched.sp easily, because calling // runtime.cgocallbackg leaves SP unchanged upon return. - // To save m->curg->sched.pc, we push it onto the stack. 
- // This has the added benefit that it looks to the traceback - // routine like cgocallbackg is going to return to that - // PC (because the frame we allocate below has the same - // size as cgocallback_gofunc's frame declared above) - // so that the traceback will seamlessly trace back into - // the earlier calls. - // - // In the new goroutine, -4(SP) is unused (where SP refers to - // m->curg's SP while we're setting it up, before we've adjusted it). + // To save m->curg->sched.pc, we push it onto the curg stack and + // open a frame the same size as cgocallback's g0 frame. + // Once we switch to the curg stack, the pushed PC will appear + // to be the return PC of cgocallback, so that the traceback + // will seamlessly trace back into the earlier calls. MOVW m_curg(R8), R0 BL setg<>(SB) MOVW (g_sched+gobuf_sp)(g), R4 // prepare stack as R4 MOVW (g_sched+gobuf_pc)(g), R5 - MOVW R5, -12(R4) - MOVW ctxt+12(FP), R0 - MOVW R0, -8(R4) - MOVW $-12(R4), R13 + MOVW R5, -(12+4)(R4) // "saved LR"; must match frame size + // Gather our arguments into registers. + MOVW fn+0(FP), R1 + MOVW frame+4(FP), R2 + MOVW ctxt+8(FP), R3 + MOVW $-(12+4)(R4), R13 // switch stack; must match frame size + MOVW R1, 4(R13) + MOVW R2, 8(R13) + MOVW R3, 12(R13) BL runtime·cgocallbackg(SB) // Restore g->sched (== m->curg->sched) from saved values. MOVW 0(R13), R5 MOVW R5, (g_sched+gobuf_pc)(g) - MOVW $12(R13), R4 + MOVW $(12+4)(R13), R4 // must match frame size MOVW R4, (g_sched+gobuf_sp)(g) // Switch back to m->g0's stack and restore m->g0->sched.sp. 
@@ -751,7 +735,7 @@ havem: MOVW m_g0(R8), R0 BL setg<>(SB) MOVW (g_sched+gobuf_sp)(g), R13 - MOVW savedsp-8(SP), R4 + MOVW savedsp-12(SP), R4 // must match frame size MOVW R4, (g_sched+gobuf_sp)(g) // If the m on entry was nil, we called needm above to borrow an m diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s index 6b3d1e779e..a09172f0c9 100644 --- a/src/runtime/asm_arm64.s +++ b/src/runtime/asm_arm64.s @@ -15,6 +15,19 @@ TEXT runtime·rt0_go(SB),NOSPLIT,$0 MOVW R0, 8(RSP) // argc MOVD R1, 16(RSP) // argv +#ifdef TLS_darwin + // Initialize TLS. + MOVD ZR, g // clear g, make sure it's not junk. + SUB $32, RSP + MRS_TPIDR_R0 + AND $~7, R0 + MOVD R0, 16(RSP) // arg2: TLS base + MOVD $runtime·tls_g(SB), R2 + MOVD R2, 8(RSP) // arg1: &tlsg + BL ·tlsinit(SB) + ADD $32, RSP +#endif + // create istack out of the given (operating system) stack. // _cgo_init may update stackguard. MOVD $runtime·g0(SB), g @@ -29,9 +42,9 @@ TEXT runtime·rt0_go(SB),NOSPLIT,$0 MOVD _cgo_init(SB), R12 CBZ R12, nocgo +#ifdef GOOS_android MRS_TPIDR_R0 // load TLS base pointer MOVD R0, R3 // arg 3: TLS base pointer -#ifdef TLSG_IS_VARIABLE MOVD $runtime·tls_g(SB), R2 // arg 2: &tls_g #else MOVD $0, R2 // arg 2: not used when using platform's TLS @@ -331,6 +344,7 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT|NOFRAME,$0-0 TEXT ·reflectcall(SB), NOSPLIT|NOFRAME, $0-32 MOVWU argsize+24(FP), R16 + DISPATCH(runtime·call16, 16) DISPATCH(runtime·call32, 32) DISPATCH(runtime·call64, 64) DISPATCH(runtime·call128, 128) @@ -414,34 +428,33 @@ TEXT callRet<>(SB), NOSPLIT, $40-0 BL runtime·reflectcallmove(SB) RET -// These have 8 added to make the overall frame size a multiple of 16, -// as required by the ABI. (There is another +8 for the saved LR.) 
-CALLFN(·call32, 40 ) -CALLFN(·call64, 72 ) -CALLFN(·call128, 136 ) -CALLFN(·call256, 264 ) -CALLFN(·call512, 520 ) -CALLFN(·call1024, 1032 ) -CALLFN(·call2048, 2056 ) -CALLFN(·call4096, 4104 ) -CALLFN(·call8192, 8200 ) -CALLFN(·call16384, 16392 ) -CALLFN(·call32768, 32776 ) -CALLFN(·call65536, 65544 ) -CALLFN(·call131072, 131080 ) -CALLFN(·call262144, 262152 ) -CALLFN(·call524288, 524296 ) -CALLFN(·call1048576, 1048584 ) -CALLFN(·call2097152, 2097160 ) -CALLFN(·call4194304, 4194312 ) -CALLFN(·call8388608, 8388616 ) -CALLFN(·call16777216, 16777224 ) -CALLFN(·call33554432, 33554440 ) -CALLFN(·call67108864, 67108872 ) -CALLFN(·call134217728, 134217736 ) -CALLFN(·call268435456, 268435464 ) -CALLFN(·call536870912, 536870920 ) -CALLFN(·call1073741824, 1073741832 ) +CALLFN(·call16, 16) +CALLFN(·call32, 32) +CALLFN(·call64, 64) +CALLFN(·call128, 128) +CALLFN(·call256, 256) +CALLFN(·call512, 512) +CALLFN(·call1024, 1024) +CALLFN(·call2048, 2048) +CALLFN(·call4096, 4096) +CALLFN(·call8192, 8192) +CALLFN(·call16384, 16384) +CALLFN(·call32768, 32768) +CALLFN(·call65536, 65536) +CALLFN(·call131072, 131072) +CALLFN(·call262144, 262144) +CALLFN(·call524288, 524288) +CALLFN(·call1048576, 1048576) +CALLFN(·call2097152, 2097152) +CALLFN(·call4194304, 4194304) +CALLFN(·call8388608, 8388608) +CALLFN(·call16777216, 16777216) +CALLFN(·call33554432, 33554432) +CALLFN(·call67108864, 67108864) +CALLFN(·call134217728, 134217728) +CALLFN(·call268435456, 268435456) +CALLFN(·call536870912, 536870912) +CALLFN(·call1073741824, 1073741824) // func memhash32(p unsafe.Pointer, h uintptr) uintptr TEXT runtime·memhash32(SB),NOSPLIT|NOFRAME,$0-24 @@ -958,32 +971,13 @@ nosave: MOVD R0, ret+16(FP) RET -// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt) -// Turn the fn into a Go func (by taking its address) and call -// cgocallback_gofunc. 
-TEXT runtime·cgocallback(SB),NOSPLIT,$40-32 - MOVD $fn+0(FP), R0 - MOVD R0, 8(RSP) - MOVD frame+8(FP), R0 - MOVD R0, 16(RSP) - MOVD framesize+16(FP), R0 - MOVD R0, 24(RSP) - MOVD ctxt+24(FP), R0 - MOVD R0, 32(RSP) - MOVD $runtime·cgocallback_gofunc(SB), R0 - BL (R0) - RET - -// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt) +// cgocallback(fn, frame unsafe.Pointer, ctxt uintptr) // See cgocall.go for more details. -TEXT ·cgocallback_gofunc(SB),NOSPLIT,$24-32 +TEXT ·cgocallback(SB),NOSPLIT,$24-24 NO_LOCAL_POINTERS // Load g from thread-local storage. - MOVB runtime·iscgo(SB), R3 - CBZ R3, nocgo BL runtime·load_g(SB) -nocgo: // If g is nil, Go did not create the current thread. // Call needm to obtain one for temporary use. @@ -1001,7 +995,7 @@ needm: MOVD $runtime·needm(SB), R0 BL (R0) - // Set m->sched.sp = SP, so that if a panic happens + // Set m->g0->sched.sp = SP, so that if a panic happens // during the function we are about to execute, it will // have a valid SP to run on the g0 stack. // The next few lines (after the havem label) @@ -1037,16 +1031,11 @@ havem: // save that information (m->curg->sched) so we can restore it. // We can restore m->curg->sched.sp easily, because calling // runtime.cgocallbackg leaves SP unchanged upon return. - // To save m->curg->sched.pc, we push it onto the stack. - // This has the added benefit that it looks to the traceback - // routine like cgocallbackg is going to return to that - // PC (because the frame we allocate below has the same - // size as cgocallback_gofunc's frame declared above) - // so that the traceback will seamlessly trace back into - // the earlier calls. - // - // In the new goroutine, -8(SP) is unused (where SP refers to - // m->curg's SP while we're setting it up, before we've adjusted it). + // To save m->curg->sched.pc, we push it onto the curg stack and + // open a frame the same size as cgocallback's g0 frame. 
+ // Once we switch to the curg stack, the pushed PC will appear + // to be the return PC of cgocallback, so that the traceback + // will seamlessly trace back into the earlier calls. MOVD m_curg(R8), g BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R4 // prepare stack as R4 @@ -1054,10 +1043,15 @@ havem: MOVD R5, -48(R4) MOVD (g_sched+gobuf_bp)(g), R5 MOVD R5, -56(R4) - MOVD ctxt+24(FP), R0 - MOVD R0, -40(R4) + // Gather our arguments into registers. + MOVD fn+0(FP), R1 + MOVD frame+8(FP), R2 + MOVD ctxt+16(FP), R3 MOVD $-48(R4), R0 // maintain 16-byte SP alignment - MOVD R0, RSP + MOVD R0, RSP // switch stack + MOVD R1, 8(RSP) + MOVD R2, 16(RSP) + MOVD R3, 24(RSP) BL runtime·cgocallbackg(SB) // Restore g->sched (== m->curg->sched) from saved values. diff --git a/src/runtime/asm_mips64x.s b/src/runtime/asm_mips64x.s index 7330f40e85..19781f7885 100644 --- a/src/runtime/asm_mips64x.s +++ b/src/runtime/asm_mips64x.s @@ -294,6 +294,7 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT|NOFRAME,$0-0 TEXT ·reflectcall(SB), NOSPLIT|NOFRAME, $0-32 MOVWU argsize+24(FP), R1 + DISPATCH(runtime·call16, 16) DISPATCH(runtime·call32, 32) DISPATCH(runtime·call64, 64) DISPATCH(runtime·call128, 128) @@ -470,25 +471,9 @@ g0: MOVW R2, ret+16(FP) RET -// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt) -// Turn the fn into a Go func (by taking its address) and call -// cgocallback_gofunc. -TEXT runtime·cgocallback(SB),NOSPLIT,$32-32 - MOVV $fn+0(FP), R1 - MOVV R1, 8(R29) - MOVV frame+8(FP), R1 - MOVV R1, 16(R29) - MOVV framesize+16(FP), R1 - MOVV R1, 24(R29) - MOVV ctxt+24(FP), R1 - MOVV R1, 32(R29) - MOVV $runtime·cgocallback_gofunc(SB), R1 - JAL (R1) - RET - -// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt) +// func cgocallback(fn, frame unsafe.Pointer, ctxt uintptr) // See cgocall.go for more details. 
-TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32 +TEXT ·cgocallback(SB),NOSPLIT,$24-24 NO_LOCAL_POINTERS // Load m and g from thread-local storage. @@ -536,7 +521,7 @@ havem: // NOTE: unwindm knows that the saved g->sched.sp is at 8(R29) aka savedsp-16(SP). MOVV m_g0(R3), R1 MOVV (g_sched+gobuf_sp)(R1), R2 - MOVV R2, savedsp-16(SP) + MOVV R2, savedsp-24(SP) // must match frame size MOVV R29, (g_sched+gobuf_sp)(R1) // Switch to m->curg stack and call runtime.cgocallbackg. @@ -545,30 +530,30 @@ havem: // save that information (m->curg->sched) so we can restore it. // We can restore m->curg->sched.sp easily, because calling // runtime.cgocallbackg leaves SP unchanged upon return. - // To save m->curg->sched.pc, we push it onto the stack. - // This has the added benefit that it looks to the traceback - // routine like cgocallbackg is going to return to that - // PC (because the frame we allocate below has the same - // size as cgocallback_gofunc's frame declared above) - // so that the traceback will seamlessly trace back into - // the earlier calls. - // - // In the new goroutine, -8(SP) is unused (where SP refers to - // m->curg's SP while we're setting it up, before we've adjusted it). + // To save m->curg->sched.pc, we push it onto the curg stack and + // open a frame the same size as cgocallback's g0 frame. + // Once we switch to the curg stack, the pushed PC will appear + // to be the return PC of cgocallback, so that the traceback + // will seamlessly trace back into the earlier calls. MOVV m_curg(R3), g JAL runtime·save_g(SB) MOVV (g_sched+gobuf_sp)(g), R2 // prepare stack as R2 MOVV (g_sched+gobuf_pc)(g), R4 - MOVV R4, -24(R2) - MOVV ctxt+24(FP), R1 - MOVV R1, -16(R2) - MOVV $-24(R2), R29 + MOVV R4, -(24+8)(R2) // "saved LR"; must match frame size + // Gather our arguments into registers. 
+ MOVV fn+0(FP), R5 + MOVV frame+8(FP), R6 + MOVV ctxt+16(FP), R7 + MOVV $-(24+8)(R2), R29 // switch stack; must match frame size + MOVV R5, 8(R29) + MOVV R6, 16(R29) + MOVV R7, 24(R29) JAL runtime·cgocallbackg(SB) // Restore g->sched (== m->curg->sched) from saved values. MOVV 0(R29), R4 MOVV R4, (g_sched+gobuf_pc)(g) - MOVV $24(R29), R2 + MOVV $(24+8)(R29), R2 // must match frame size MOVV R2, (g_sched+gobuf_sp)(g) // Switch back to m->g0's stack and restore m->g0->sched.sp. @@ -578,7 +563,7 @@ havem: MOVV m_g0(R3), g JAL runtime·save_g(SB) MOVV (g_sched+gobuf_sp)(g), R29 - MOVV savedsp-16(SP), R2 + MOVV savedsp-24(SP), R2 // must match frame size MOVV R2, (g_sched+gobuf_sp)(g) // If the m on entry was nil, we called needm above to borrow an m diff --git a/src/runtime/asm_mipsx.s b/src/runtime/asm_mipsx.s index aca0510b69..ee87d81436 100644 --- a/src/runtime/asm_mipsx.s +++ b/src/runtime/asm_mipsx.s @@ -472,25 +472,9 @@ g0: MOVW R2, ret+8(FP) RET -// cgocallback(void (*fn)(void*), void *frame, uintptr framesize) -// Turn the fn into a Go func (by taking its address) and call -// cgocallback_gofunc. -TEXT runtime·cgocallback(SB),NOSPLIT,$16-16 - MOVW $fn+0(FP), R1 - MOVW R1, 4(R29) - MOVW frame+4(FP), R1 - MOVW R1, 8(R29) - MOVW framesize+8(FP), R1 - MOVW R1, 12(R29) - MOVW ctxt+12(FP), R1 - MOVW R1, 16(R29) - MOVW $runtime·cgocallback_gofunc(SB), R1 - JAL (R1) - RET - -// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt) +// cgocallback(fn, frame unsafe.Pointer, ctxt uintptr) // See cgocall.go for more details. -TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-16 +TEXT ·cgocallback(SB),NOSPLIT,$12-12 NO_LOCAL_POINTERS // Load m and g from thread-local storage. @@ -538,7 +522,7 @@ havem: // NOTE: unwindm knows that the saved g->sched.sp is at 4(R29) aka savedsp-8(SP). 
MOVW m_g0(R3), R1 MOVW (g_sched+gobuf_sp)(R1), R2 - MOVW R2, savedsp-8(SP) + MOVW R2, savedsp-12(SP) // must match frame size MOVW R29, (g_sched+gobuf_sp)(R1) // Switch to m->curg stack and call runtime.cgocallbackg. @@ -547,30 +531,30 @@ havem: // save that information (m->curg->sched) so we can restore it. // We can restore m->curg->sched.sp easily, because calling // runtime.cgocallbackg leaves SP unchanged upon return. - // To save m->curg->sched.pc, we push it onto the stack. - // This has the added benefit that it looks to the traceback - // routine like cgocallbackg is going to return to that - // PC (because the frame we allocate below has the same - // size as cgocallback_gofunc's frame declared above) - // so that the traceback will seamlessly trace back into - // the earlier calls. - // - // In the new goroutine, -4(SP) is unused (where SP refers to - // m->curg's SP while we're setting it up, before we've adjusted it). + // To save m->curg->sched.pc, we push it onto the curg stack and + // open a frame the same size as cgocallback's g0 frame. + // Once we switch to the curg stack, the pushed PC will appear + // to be the return PC of cgocallback, so that the traceback + // will seamlessly trace back into the earlier calls. MOVW m_curg(R3), g JAL runtime·save_g(SB) MOVW (g_sched+gobuf_sp)(g), R2 // prepare stack as R2 MOVW (g_sched+gobuf_pc)(g), R4 - MOVW R4, -12(R2) - MOVW ctxt+12(FP), R1 - MOVW R1, -8(R2) - MOVW $-12(R2), R29 + MOVW R4, -(12+4)(R2) // "saved LR"; must match frame size + // Gather our arguments into registers. + MOVW fn+0(FP), R5 + MOVW frame+4(FP), R6 + MOVW ctxt+8(FP), R7 + MOVW $-(12+4)(R2), R29 // switch stack; must match frame size + MOVW R5, 4(R29) + MOVW R6, 8(R29) + MOVW R7, 12(R29) JAL runtime·cgocallbackg(SB) // Restore g->sched (== m->curg->sched) from saved values. 
MOVW 0(R29), R4 MOVW R4, (g_sched+gobuf_pc)(g) - MOVW $12(R29), R2 + MOVW $(12+4)(R29), R2 // must match frame size MOVW R2, (g_sched+gobuf_sp)(g) // Switch back to m->g0's stack and restore m->g0->sched.sp. @@ -580,7 +564,7 @@ havem: MOVW m_g0(R3), g JAL runtime·save_g(SB) MOVW (g_sched+gobuf_sp)(g), R29 - MOVW savedsp-8(SP), R2 + MOVW savedsp-12(SP), R2 // must match frame size MOVW R2, (g_sched+gobuf_sp)(g) // If the m on entry was nil, we called needm above to borrow an m diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s index 23387a2165..dc34c0e4c8 100644 --- a/src/runtime/asm_ppc64x.s +++ b/src/runtime/asm_ppc64x.s @@ -372,6 +372,7 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT|NOFRAME,$0-0 TEXT ·reflectcall(SB), NOSPLIT|NOFRAME, $0-32 MOVWZ argsize+24(FP), R3 + DISPATCH(runtime·call16, 16) DISPATCH(runtime·call32, 32) DISPATCH(runtime·call64, 64) DISPATCH(runtime·call128, 128) @@ -478,6 +479,7 @@ TEXT callRet<>(SB), NOSPLIT, $32-0 BL runtime·reflectcallmove(SB) RET +CALLFN(·call16, 16) CALLFN(·call32, 32) CALLFN(·call64, 64) CALLFN(·call128, 128) @@ -649,26 +651,9 @@ g0: MOVW R3, ret+16(FP) RET -// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt) -// Turn the fn into a Go func (by taking its address) and call -// cgocallback_gofunc. -TEXT runtime·cgocallback(SB),NOSPLIT,$32-32 - MOVD $fn+0(FP), R3 - MOVD R3, FIXED_FRAME+0(R1) - MOVD frame+8(FP), R3 - MOVD R3, FIXED_FRAME+8(R1) - MOVD framesize+16(FP), R3 - MOVD R3, FIXED_FRAME+16(R1) - MOVD ctxt+24(FP), R3 - MOVD R3, FIXED_FRAME+24(R1) - MOVD $runtime·cgocallback_gofunc(SB), R12 - MOVD R12, CTR - BL (CTR) - RET - -// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt) +// func cgocallback(fn, frame unsafe.Pointer, ctxt uintptr) // See cgocall.go for more details. -TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32 +TEXT ·cgocallback(SB),NOSPLIT,$24-24 NO_LOCAL_POINTERS // Load m and g from thread-local storage. 
@@ -719,7 +704,7 @@ havem: // NOTE: unwindm knows that the saved g->sched.sp is at 8(R1) aka savedsp-16(SP). MOVD m_g0(R8), R3 MOVD (g_sched+gobuf_sp)(R3), R4 - MOVD R4, savedsp-16(SP) + MOVD R4, savedsp-24(SP) // must match frame size MOVD R1, (g_sched+gobuf_sp)(R3) // Switch to m->curg stack and call runtime.cgocallbackg. @@ -728,30 +713,30 @@ havem: // save that information (m->curg->sched) so we can restore it. // We can restore m->curg->sched.sp easily, because calling // runtime.cgocallbackg leaves SP unchanged upon return. - // To save m->curg->sched.pc, we push it onto the stack. - // This has the added benefit that it looks to the traceback - // routine like cgocallbackg is going to return to that - // PC (because the frame we allocate below has the same - // size as cgocallback_gofunc's frame declared above) - // so that the traceback will seamlessly trace back into - // the earlier calls. - // - // In the new goroutine, -8(SP) is unused (where SP refers to - // m->curg's SP while we're setting it up, before we've adjusted it). + // To save m->curg->sched.pc, we push it onto the curg stack and + // open a frame the same size as cgocallback's g0 frame. + // Once we switch to the curg stack, the pushed PC will appear + // to be the return PC of cgocallback, so that the traceback + // will seamlessly trace back into the earlier calls. MOVD m_curg(R8), g BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R4 // prepare stack as R4 MOVD (g_sched+gobuf_pc)(g), R5 - MOVD R5, -(FIXED_FRAME+16)(R4) - MOVD ctxt+24(FP), R3 - MOVD R3, -16(R4) - MOVD $-(FIXED_FRAME+16)(R4), R1 + MOVD R5, -(24+FIXED_FRAME)(R4) // "saved LR"; must match frame size + // Gather our arguments into registers. 
+ MOVD fn+0(FP), R5 + MOVD frame+8(FP), R6 + MOVD ctxt+16(FP), R7 + MOVD $-(24+FIXED_FRAME)(R4), R1 // switch stack; must match frame size + MOVD R5, FIXED_FRAME+0(R1) + MOVD R6, FIXED_FRAME+8(R1) + MOVD R7, FIXED_FRAME+16(R1) BL runtime·cgocallbackg(SB) // Restore g->sched (== m->curg->sched) from saved values. MOVD 0(R1), R5 MOVD R5, (g_sched+gobuf_pc)(g) - MOVD $(FIXED_FRAME+16)(R1), R4 + MOVD $(24+FIXED_FRAME)(R1), R4 // must match frame size MOVD R4, (g_sched+gobuf_sp)(g) // Switch back to m->g0's stack and restore m->g0->sched.sp. @@ -761,7 +746,7 @@ havem: MOVD m_g0(R8), g BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R1 - MOVD savedsp-16(SP), R4 + MOVD savedsp-24(SP), R4 // must match frame size MOVD R4, (g_sched+gobuf_sp)(g) // If the m on entry was nil, we called needm above to borrow an m diff --git a/src/runtime/asm_riscv64.s b/src/runtime/asm_riscv64.s index 8f6c8773eb..01b42dc3de 100644 --- a/src/runtime/asm_riscv64.s +++ b/src/runtime/asm_riscv64.s @@ -9,10 +9,9 @@ // func rt0_go() TEXT runtime·rt0_go(SB),NOSPLIT,$0 // X2 = stack; A0 = argc; A1 = argv - ADD $-24, X2 - MOV A0, 8(X2) // argc - MOV A1, 16(X2) // argv + MOV A0, 8(X2) // argc + MOV A1, 16(X2) // argv // create istack out of the given (operating system) stack. // _cgo_init may update stackguard. @@ -28,10 +27,10 @@ TEXT runtime·rt0_go(SB),NOSPLIT,$0 MOV _cgo_init(SB), T0 BEQ T0, ZERO, nocgo - MOV ZERO, A3 // arg 3: not used - MOV ZERO, A2 // arg 2: not used + MOV ZERO, A3 // arg 3: not used + MOV ZERO, A2 // arg 2: not used MOV $setg_gcc<>(SB), A1 // arg 1: setg - MOV g, A0 // arg 0: G + MOV g, A0 // arg 0: G JALR RA, T0 nocgo: @@ -313,10 +312,62 @@ TEXT runtime·gosave(SB), NOSPLIT|NOFRAME, $0-8 CALL runtime·badctxt(SB) RET +// Save state of caller into g->sched. Smashes X31. +TEXT gosave<>(SB),NOSPLIT|NOFRAME,$0 + MOV X1, (g_sched+gobuf_pc)(g) + MOV X2, (g_sched+gobuf_sp)(g) + MOV ZERO, (g_sched+gobuf_lr)(g) + MOV ZERO, (g_sched+gobuf_ret)(g) + // Assert ctxt is zero. See func save. 
+ MOV (g_sched+gobuf_ctxt)(g), X31 + BEQ ZERO, X31, 2(PC) + CALL runtime·badctxt(SB) + RET + // func asmcgocall(fn, arg unsafe.Pointer) int32 +// Call fn(arg) on the scheduler stack, +// aligned appropriately for the gcc ABI. +// See cgocall.go for more details. TEXT ·asmcgocall(SB),NOSPLIT,$0-20 - // TODO(jsing): Add support for cgo - issue #36641. - WORD $0 // crash + MOV fn+0(FP), X5 + MOV arg+8(FP), X10 + + MOV X2, X8 // save original stack pointer + MOV g, X9 + + // Figure out if we need to switch to m->g0 stack. + // We get called to create new OS threads too, and those + // come in on the m->g0 stack already. + MOV g_m(g), X6 + MOV m_g0(X6), X7 + BEQ X7, g, g0 + + CALL gosave<>(SB) + MOV X7, g + CALL runtime·save_g(SB) + MOV (g_sched+gobuf_sp)(g), X2 + + // Now on a scheduling stack (a pthread-created stack). +g0: + // Save room for two of our pointers. + ADD $-16, X2 + MOV X9, 0(X2) // save old g on stack + MOV (g_stack+stack_hi)(X9), X9 + SUB X8, X9, X8 + MOV X8, 8(X2) // save depth in old g stack (can't just save SP, as stack might be copied during a callback) + + JALR RA, (X5) + + // Restore g, stack pointer. X10 is return value. + MOV 0(X2), g + CALL runtime·save_g(SB) + MOV (g_stack+stack_hi)(g), X5 + MOV 8(X2), X6 + SUB X6, X5, X6 + MOV X6, X2 + + MOVW X10, ret+16(FP) + RET // func asminit() TEXT runtime·asminit(SB),NOSPLIT|NOFRAME,$0-0 @@ -342,6 +393,7 @@ TEXT reflect·call(SB), NOSPLIT, $0-0 // func reflectcall(argtype *_type, fn, arg unsafe.Pointer, argsize uint32, retoffset uint32) TEXT ·reflectcall(SB), NOSPLIT|NOFRAME, $0-32 MOVWU argsize+24(FP), T0 + DISPATCH(runtime·call16, 16) DISPATCH(runtime·call32, 32) DISPATCH(runtime·call64, 64) DISPATCH(runtime·call128, 128) @@ -443,6 +495,21 @@ CALLFN(·call268435456, 268435456) CALLFN(·call536870912, 536870912) CALLFN(·call1073741824, 1073741824) +// Called from cgo wrappers, this function returns g->m->curg.stack.hi. +// Must obey the gcc calling convention. 
+TEXT _cgo_topofstack(SB),NOSPLIT,$8 + // g (X27) and REG_TMP (X31) might be clobbered by load_g. + // X27 is callee-save in the gcc calling convention, so save it. + MOV g, savedX27-8(SP) + + CALL runtime·load_g(SB) + MOV g_m(g), X5 + MOV m_curg(X5), X5 + MOV (g_stack+stack_hi)(X5), X10 // return value in X10 + + MOV savedX27-8(SP), g + RET + // func goexit(neverCallThisFunction) // The top-most function running on a goroutine // returns to goexit+PCQuantum. @@ -452,10 +519,111 @@ TEXT runtime·goexit(SB),NOSPLIT|NOFRAME|TOPFRAME,$0-0 // traceback from goexit1 must hit code range of goexit MOV ZERO, ZERO // NOP -// func cgocallback_gofunc(fv uintptr, frame uintptr, framesize, ctxt uintptr) -TEXT ·cgocallback_gofunc(SB),NOSPLIT,$24-32 - // TODO(jsing): Add support for cgo - issue #36641. - WORD $0 // crash +// func cgocallback(fn, frame unsafe.Pointer, ctxt uintptr) +// See cgocall.go for more details. +TEXT ·cgocallback(SB),NOSPLIT,$24-24 + NO_LOCAL_POINTERS + + // Load m and g from thread-local storage. + MOVBU runtime·iscgo(SB), X5 + BEQ ZERO, X5, nocgo + CALL runtime·load_g(SB) +nocgo: + + // If g is nil, Go did not create the current thread. + // Call needm to obtain one for temporary use. + // In this case, we're running on the thread stack, so there's + // lots of space, but the linker doesn't know. Hide the call from + // the linker analysis by using an indirect call. + BEQ ZERO, g, needm + + MOV g_m(g), X5 + MOV X5, savedm-8(SP) + JMP havem + +needm: + MOV g, savedm-8(SP) // g is zero, so is m. + MOV $runtime·needm(SB), X6 + JALR RA, X6 + + // Set m->sched.sp = SP, so that if a panic happens + // during the function we are about to execute, it will + // have a valid SP to run on the g0 stack. + // The next few lines (after the havem label) + // will save this SP onto the stack and then write + // the same SP back to m->sched.sp. 
That seems redundant, + // but if an unrecovered panic happens, unwindm will + // restore the g->sched.sp from the stack location + // and then systemstack will try to use it. If we don't set it here, + // that restored SP will be uninitialized (typically 0) and + // will not be usable. + MOV g_m(g), X5 + MOV m_g0(X5), X6 + MOV X2, (g_sched+gobuf_sp)(X6) + +havem: + // Now there's a valid m, and we're running on its m->g0. + // Save current m->g0->sched.sp on stack and then set it to SP. + // Save current sp in m->g0->sched.sp in preparation for + // switch back to m->curg stack. + // NOTE: unwindm knows that the saved g->sched.sp is at 8(X2) aka savedsp-24(SP). + MOV m_g0(X5), X6 + MOV (g_sched+gobuf_sp)(X6), X7 + MOV X7, savedsp-24(SP) // must match frame size + MOV X2, (g_sched+gobuf_sp)(X6) + + // Switch to m->curg stack and call runtime.cgocallbackg. + // Because we are taking over the execution of m->curg + // but *not* resuming what had been running, we need to + // save that information (m->curg->sched) so we can restore it. + // We can restore m->curg->sched.sp easily, because calling + // runtime.cgocallbackg leaves SP unchanged upon return. + // To save m->curg->sched.pc, we push it onto the curg stack and + // open a frame the same size as cgocallback's g0 frame. + // Once we switch to the curg stack, the pushed PC will appear + // to be the return PC of cgocallback, so that the traceback + // will seamlessly trace back into the earlier calls. + MOV m_curg(X5), g + CALL runtime·save_g(SB) + MOV (g_sched+gobuf_sp)(g), X6 // prepare stack as X6 + MOV (g_sched+gobuf_pc)(g), X7 + MOV X7, -(24+8)(X6) // "saved LR"; must match frame size + // Gather our arguments into registers. + MOV fn+0(FP), X7 + MOV frame+8(FP), X8 + MOV ctxt+16(FP), X9 + MOV $-(24+8)(X6), X2 // switch stack; must match frame size + MOV X7, 8(X2) + MOV X8, 16(X2) + MOV X9, 24(X2) + CALL runtime·cgocallbackg(SB) + + // Restore g->sched (== m->curg->sched) from saved values. 
+ MOV 0(X2), X7 + MOV X7, (g_sched+gobuf_pc)(g) + MOV $(24+8)(X2), X6 // must match frame size + MOV X6, (g_sched+gobuf_sp)(g) + + // Switch back to m->g0's stack and restore m->g0->sched.sp. + // (Unlike m->curg, the g0 goroutine never uses sched.pc, + // so we do not have to restore it.) + MOV g_m(g), X5 + MOV m_g0(X5), g + CALL runtime·save_g(SB) + MOV (g_sched+gobuf_sp)(g), X2 + MOV savedsp-24(SP), X6 // must match frame size + MOV X6, (g_sched+gobuf_sp)(g) + + // If the m on entry was nil, we called needm above to borrow an m + // for the duration of the call. Since the call is over, return it with dropm. + MOV savedm-8(SP), X5 + BNE ZERO, X5, droppedm + MOV $runtime·dropm(SB), X6 + JALR RA, X6 +droppedm: + + // Done! + RET TEXT runtime·breakpoint(SB),NOSPLIT|NOFRAME,$0-0 EBREAK @@ -486,10 +654,10 @@ TEXT ·checkASM(SB),NOSPLIT,$0-1 // The act of CALLing gcWriteBarrier will clobber RA (LR). // It does not clobber any other general-purpose registers, // but may clobber others (e.g., floating point registers). -TEXT runtime·gcWriteBarrier(SB),NOSPLIT,$296 +TEXT runtime·gcWriteBarrier(SB),NOSPLIT,$216 // Save the registers clobbered by the fast path. - MOV A0, 280(X2) - MOV A1, 288(X2) + MOV A0, 25*8(X2) + MOV A1, 26*8(X2) MOV g_m(g), A0 MOV m_p(A0), A0 MOV (p_wbBuf+wbBuf_next)(A0), A1 @@ -505,8 +673,8 @@ TEXT runtime·gcWriteBarrier(SB),NOSPLIT,$296 // Is the buffer full? BEQ A1, T6, flush ret: - MOV 280(X2), A0 - MOV 288(X2), A1 + MOV 25*8(X2), A0 + MOV 26*8(X2), A1 // Do the write. MOV T1, (T0) RET @@ -514,84 +682,68 @@ ret: flush: // Save all general purpose registers since these could be // clobbered by wbBufFlush and were not saved by the caller. - MOV T0, 8(X2) // Also first argument to wbBufFlush - MOV T1, 16(X2) // Also second argument to wbBufFlush - - // TODO: Optimise - // R3 is g. - // R4 already saved (T0) - // R5 already saved (T1) - // R9 already saved (A0) - // R10 already saved (A1) - // R30 is tmp register. 
- MOV X0, 24(X2) - MOV X1, 32(X2) - MOV X2, 40(X2) - MOV X3, 48(X2) - MOV X4, 56(X2) - MOV X5, 64(X2) - MOV X6, 72(X2) - MOV X7, 80(X2) - MOV X8, 88(X2) - MOV X9, 96(X2) - MOV X10, 104(X2) - MOV X11, 112(X2) - MOV X12, 120(X2) - MOV X13, 128(X2) - MOV X14, 136(X2) - MOV X15, 144(X2) - MOV X16, 152(X2) - MOV X17, 160(X2) - MOV X18, 168(X2) - MOV X19, 176(X2) - MOV X20, 184(X2) - MOV X21, 192(X2) - MOV X22, 200(X2) - MOV X23, 208(X2) - MOV X24, 216(X2) - MOV X25, 224(X2) - MOV X26, 232(X2) - MOV X27, 240(X2) - MOV X28, 248(X2) - MOV X29, 256(X2) - MOV X30, 264(X2) - MOV X31, 272(X2) + MOV T0, 1*8(X2) // Also first argument to wbBufFlush + MOV T1, 2*8(X2) // Also second argument to wbBufFlush + // X0 is zero register + // X1 is LR, saved by prologue + // X2 is SP + MOV X3, 3*8(X2) + // X4 is TP + // X5 is first arg to wbBufFlush (T0) + // X6 is second arg to wbBufFlush (T1) + MOV X7, 4*8(X2) + MOV X8, 5*8(X2) + MOV X9, 6*8(X2) + // X10 already saved (A0) + // X11 already saved (A1) + MOV X12, 7*8(X2) + MOV X13, 8*8(X2) + MOV X14, 9*8(X2) + MOV X15, 10*8(X2) + MOV X16, 11*8(X2) + MOV X17, 12*8(X2) + MOV X18, 13*8(X2) + MOV X19, 14*8(X2) + MOV X20, 15*8(X2) + MOV X21, 16*8(X2) + MOV X22, 17*8(X2) + MOV X23, 18*8(X2) + MOV X24, 19*8(X2) + MOV X25, 20*8(X2) + MOV X26, 21*8(X2) + // X27 is g. + MOV X28, 22*8(X2) + MOV X29, 23*8(X2) + MOV X30, 24*8(X2) + // X31 is tmp register. // This takes arguments T0 and T1. 
CALL runtime·wbBufFlush(SB) - MOV 24(X2), X0 - MOV 32(X2), X1 - MOV 40(X2), X2 - MOV 48(X2), X3 - MOV 56(X2), X4 - MOV 64(X2), X5 - MOV 72(X2), X6 - MOV 80(X2), X7 - MOV 88(X2), X8 - MOV 96(X2), X9 - MOV 104(X2), X10 - MOV 112(X2), X11 - MOV 120(X2), X12 - MOV 128(X2), X13 - MOV 136(X2), X14 - MOV 144(X2), X15 - MOV 152(X2), X16 - MOV 160(X2), X17 - MOV 168(X2), X18 - MOV 176(X2), X19 - MOV 184(X2), X20 - MOV 192(X2), X21 - MOV 200(X2), X22 - MOV 208(X2), X23 - MOV 216(X2), X24 - MOV 224(X2), X25 - MOV 232(X2), X26 - MOV 240(X2), X27 - MOV 248(X2), X28 - MOV 256(X2), X29 - MOV 264(X2), X30 - MOV 272(X2), X31 + MOV 1*8(X2), T0 + MOV 2*8(X2), T1 + MOV 3*8(X2), X3 + MOV 4*8(X2), X7 + MOV 5*8(X2), X8 + MOV 6*8(X2), X9 + MOV 7*8(X2), X12 + MOV 8*8(X2), X13 + MOV 9*8(X2), X14 + MOV 10*8(X2), X15 + MOV 11*8(X2), X16 + MOV 12*8(X2), X17 + MOV 13*8(X2), X18 + MOV 14*8(X2), X19 + MOV 15*8(X2), X20 + MOV 16*8(X2), X21 + MOV 17*8(X2), X22 + MOV 18*8(X2), X23 + MOV 19*8(X2), X24 + MOV 20*8(X2), X25 + MOV 21*8(X2), X26 + MOV 22*8(X2), X28 + MOV 23*8(X2), X29 + MOV 24*8(X2), X30 JMP ret diff --git a/src/runtime/asm_s390x.s b/src/runtime/asm_s390x.s index cb39451faa..7baef37324 100644 --- a/src/runtime/asm_s390x.s +++ b/src/runtime/asm_s390x.s @@ -383,6 +383,7 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT|NOFRAME,$0-0 TEXT ·reflectcall(SB), NOSPLIT, $-8-32 MOVWZ argsize+24(FP), R3 + DISPATCH(runtime·call16, 16) DISPATCH(runtime·call32, 32) DISPATCH(runtime·call64, 64) DISPATCH(runtime·call128, 128) @@ -461,6 +462,7 @@ TEXT callRet<>(SB), NOSPLIT, $32-0 BL runtime·reflectcallmove(SB) RET +CALLFN(·call16, 16) CALLFN(·call32, 32) CALLFN(·call64, 64) CALLFN(·call128, 128) @@ -573,25 +575,9 @@ g0: MOVW R2, ret+16(FP) RET -// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt) -// Turn the fn into a Go func (by taking its address) and call -// cgocallback_gofunc. 
-TEXT runtime·cgocallback(SB),NOSPLIT,$32-32 - MOVD $fn+0(FP), R3 - MOVD R3, 8(R15) - MOVD frame+8(FP), R3 - MOVD R3, 16(R15) - MOVD framesize+16(FP), R3 - MOVD R3, 24(R15) - MOVD ctxt+24(FP), R3 - MOVD R3, 32(R15) - MOVD $runtime·cgocallback_gofunc(SB), R3 - BL (R3) - RET - -// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt) +// cgocallback(fn, frame unsafe.Pointer, ctxt uintptr) // See cgocall.go for more details. -TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32 +TEXT ·cgocallback(SB),NOSPLIT,$24-24 NO_LOCAL_POINTERS // Load m and g from thread-local storage. @@ -639,7 +625,7 @@ havem: // NOTE: unwindm knows that the saved g->sched.sp is at 8(R1) aka savedsp-16(SP). MOVD m_g0(R8), R3 MOVD (g_sched+gobuf_sp)(R3), R4 - MOVD R4, savedsp-16(SP) + MOVD R4, savedsp-24(SP) // must match frame size MOVD R15, (g_sched+gobuf_sp)(R3) // Switch to m->curg stack and call runtime.cgocallbackg. @@ -648,30 +634,30 @@ havem: // save that information (m->curg->sched) so we can restore it. // We can restore m->curg->sched.sp easily, because calling // runtime.cgocallbackg leaves SP unchanged upon return. - // To save m->curg->sched.pc, we push it onto the stack. - // This has the added benefit that it looks to the traceback - // routine like cgocallbackg is going to return to that - // PC (because the frame we allocate below has the same - // size as cgocallback_gofunc's frame declared above) - // so that the traceback will seamlessly trace back into - // the earlier calls. - // - // In the new goroutine, -8(SP) is unused (where SP refers to - // m->curg's SP while we're setting it up, before we've adjusted it). + // To save m->curg->sched.pc, we push it onto the curg stack and + // open a frame the same size as cgocallback's g0 frame. + // Once we switch to the curg stack, the pushed PC will appear + // to be the return PC of cgocallback, so that the traceback + // will seamlessly trace back into the earlier calls. 
MOVD m_curg(R8), g BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R4 // prepare stack as R4 MOVD (g_sched+gobuf_pc)(g), R5 - MOVD R5, -24(R4) - MOVD ctxt+24(FP), R5 - MOVD R5, -16(R4) - MOVD $-24(R4), R15 + MOVD R5, -(24+8)(R4) // "saved LR"; must match frame size + // Gather our arguments into registers. + MOVD fn+0(FP), R1 + MOVD frame+8(FP), R2 + MOVD ctxt+16(FP), R3 + MOVD $-(24+8)(R4), R15 // switch stack; must match frame size + MOVD R1, 8(R15) + MOVD R2, 16(R15) + MOVD R3, 24(R15) BL runtime·cgocallbackg(SB) // Restore g->sched (== m->curg->sched) from saved values. MOVD 0(R15), R5 MOVD R5, (g_sched+gobuf_pc)(g) - MOVD $24(R15), R4 + MOVD $(24+8)(R15), R4 // must match frame size MOVD R4, (g_sched+gobuf_sp)(g) // Switch back to m->g0's stack and restore m->g0->sched.sp. @@ -681,7 +667,7 @@ havem: MOVD m_g0(R8), g BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R15 - MOVD savedsp-16(SP), R4 + MOVD savedsp-24(SP), R4 // must match frame size MOVD R4, (g_sched+gobuf_sp)(g) // If the m on entry was nil, we called needm above to borrow an m diff --git a/src/runtime/asm_wasm.s b/src/runtime/asm_wasm.s index 7d88beb537..fcb780f1dc 100644 --- a/src/runtime/asm_wasm.s +++ b/src/runtime/asm_wasm.s @@ -196,7 +196,7 @@ TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16 Get CTXT I64Eqz If - CALLNORESUME runtime·sigpanic(SB) + CALLNORESUME runtime·sigpanic<ABIInternal>(SB) End // caller sp after CALL @@ -288,9 +288,6 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0 TEXT ·asmcgocall(SB), NOSPLIT, $0-0 UNDEF -TEXT ·cgocallback_gofunc(SB), NOSPLIT, $16-32 - UNDEF - #define DISPATCH(NAME, MAXSIZE) \ Get R0; \ I64Const $MAXSIZE; \ @@ -303,11 +300,12 @@ TEXT ·reflectcall(SB), NOSPLIT, $0-32 I64Load fn+8(FP) I64Eqz If - CALLNORESUME runtime·sigpanic(SB) + CALLNORESUME runtime·sigpanic<ABIInternal>(SB) End MOVW argsize+24(FP), R0 + DISPATCH(runtime·call16, 16) DISPATCH(runtime·call32, 32) DISPATCH(runtime·call64, 64) DISPATCH(runtime·call128, 128) @@ -398,6 +396,7 @@ TEXT 
callRet<>(SB), NOSPLIT, $32-0 CALL runtime·reflectcallmove(SB) RET +CALLFN(·call16, 16) CALLFN(·call32, 32) CALLFN(·call64, 64) CALLFN(·call128, 128) @@ -430,7 +429,7 @@ TEXT runtime·goexit(SB), NOSPLIT, $0-0 CALL runtime·goexit1(SB) // does not return UNDEF -TEXT runtime·cgocallback(SB), NOSPLIT, $32-32 +TEXT runtime·cgocallback(SB), NOSPLIT, $0-24 UNDEF // gcWriteBarrier performs a heap pointer write and informs the GC. diff --git a/src/runtime/auxv_none.go b/src/runtime/auxv_none.go index 3a560a1793..3ca617b21e 100644 --- a/src/runtime/auxv_none.go +++ b/src/runtime/auxv_none.go @@ -7,7 +7,6 @@ // +build !dragonfly // +build !freebsd // +build !netbsd -// +build !openbsd !arm64 // +build !solaris package runtime diff --git a/src/runtime/cgo/asm_386.s b/src/runtime/cgo/asm_386.s index 7293c20bf8..2e7e9512e2 100644 --- a/src/runtime/cgo/asm_386.s +++ b/src/runtime/cgo/asm_386.s @@ -5,8 +5,9 @@ #include "textflag.h" // Called by C code generated by cmd/cgo. -// func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr) -// Saves C callee-saved registers and calls fn with three arguments. +// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr) +// Saves C callee-saved registers and calls cgocallback with three arguments. +// fn is the PC of a func(a unsafe.Pointer) function. TEXT crosscall2(SB),NOSPLIT,$28-16 MOVL BP, 24(SP) MOVL BX, 20(SP) @@ -15,12 +16,11 @@ TEXT crosscall2(SB),NOSPLIT,$28-16 MOVL ctxt+12(FP), AX MOVL AX, 8(SP) - MOVL n+8(FP), AX - MOVL AX, 4(SP) MOVL a+4(FP), AX - MOVL AX, 0(SP) + MOVL AX, 4(SP) MOVL fn+0(FP), AX - CALL AX + MOVL AX, 0(SP) + CALL runtime·cgocallback(SB) MOVL 12(SP), DI MOVL 16(SP), SI diff --git a/src/runtime/cgo/asm_amd64.s b/src/runtime/cgo/asm_amd64.s index 06c538b9bc..5dc8e2d235 100644 --- a/src/runtime/cgo/asm_amd64.s +++ b/src/runtime/cgo/asm_amd64.s @@ -5,8 +5,10 @@ #include "textflag.h" // Called by C code generated by cmd/cgo. 
-// func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr) -// Saves C callee-saved registers and calls fn with three arguments. +// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr) +// Saves C callee-saved registers and calls cgocallback with three arguments. +// fn is the PC of a func(a unsafe.Pointer) function. +// This signature is known to SWIG, so we can't change it. #ifndef GOOS_windows TEXT crosscall2(SB),NOSPLIT,$0x50-0 /* keeps stack pointer 32-byte aligned */ #else @@ -33,11 +35,12 @@ TEXT crosscall2(SB),NOSPLIT,$0x110-0 /* also need to save xmm6 - xmm15 */ MOVUPS X14, 0xe0(SP) MOVUPS X15, 0xf0(SP) - MOVQ DX, 0x0(SP) /* arg */ - MOVQ R8, 0x8(SP) /* argsize (includes padding) */ + MOVQ CX, 0x0(SP) /* fn */ + MOVQ DX, 0x8(SP) /* arg */ + // Skip n in R8. MOVQ R9, 0x10(SP) /* ctxt */ - CALL CX /* fn */ + CALL runtime·cgocallback(SB) MOVQ 0x48(SP), DI MOVQ 0x50(SP), SI @@ -52,11 +55,12 @@ TEXT crosscall2(SB),NOSPLIT,$0x110-0 /* also need to save xmm6 - xmm15 */ MOVUPS 0xe0(SP), X14 MOVUPS 0xf0(SP), X15 #else - MOVQ SI, 0x0(SP) /* arg */ - MOVQ DX, 0x8(SP) /* argsize (includes padding) */ + MOVQ DI, 0x0(SP) /* fn */ + MOVQ SI, 0x8(SP) /* arg */ + // Skip n in DX. MOVQ CX, 0x10(SP) /* ctxt */ - CALL DI /* fn */ + CALL runtime·cgocallback(SB) #endif MOVQ 0x18(SP), BX diff --git a/src/runtime/cgo/asm_arm.s b/src/runtime/cgo/asm_arm.s index 60132c14a8..ea55e173c1 100644 --- a/src/runtime/cgo/asm_arm.s +++ b/src/runtime/cgo/asm_arm.s @@ -5,51 +5,52 @@ #include "textflag.h" // Called by C code generated by cmd/cgo. -// func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr) -// Saves C callee-saved registers and calls fn with three arguments. +// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr) +// Saves C callee-saved registers and calls cgocallback with three arguments. +// fn is the PC of a func(a unsafe.Pointer) function. 
TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 - /* - * We still need to save all callee save register as before, and then - * push 3 args for fn (R1, R2, R3). - * Also note that at procedure entry in gc world, 4(R13) will be the - * first arg, so we must push another dummy reg (R0) for 0(R13). - * Additionally, runtime·load_g will clobber R0, so we need to save R0 - * nevertheless. - */ SUB $(8*9), R13 // Reserve space for the floating point registers. - MOVM.WP [R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, g, R11, R12, R14], (R13) + // The C arguments arrive in R0, R1, R2, and R3. We want to + // pass R0, R1, and R3 to Go, so we push those on the stack. + // Also, save C callee-save registers R4-R12. + MOVM.WP [R0, R1, R3, R4, R5, R6, R7, R8, R9, g, R11, R12], (R13) + // Finally, save the link register R14. This also puts the + // arguments we pushed for cgocallback where they need to be, + // starting at 4(R13). + MOVW.W R14, -4(R13) // Skip floating point registers on GOARM < 6. MOVB runtime·goarm(SB), R11 CMP $6, R11 BLT skipfpsave - MOVD F8, (14*4+8*1)(R13) - MOVD F9, (14*4+8*2)(R13) - MOVD F10, (14*4+8*3)(R13) - MOVD F11, (14*4+8*4)(R13) - MOVD F12, (14*4+8*5)(R13) - MOVD F13, (14*4+8*6)(R13) - MOVD F14, (14*4+8*7)(R13) - MOVD F15, (14*4+8*8)(R13) + MOVD F8, (13*4+8*1)(R13) + MOVD F9, (13*4+8*2)(R13) + MOVD F10, (13*4+8*3)(R13) + MOVD F11, (13*4+8*4)(R13) + MOVD F12, (13*4+8*5)(R13) + MOVD F13, (13*4+8*6)(R13) + MOVD F14, (13*4+8*7)(R13) + MOVD F15, (13*4+8*8)(R13) skipfpsave: BL runtime·load_g(SB) - MOVW R15, R14 // R15 is PC. - MOVW 0(R13), R15 + // We set up the arguments to cgocallback when saving registers above. 
+ BL runtime·cgocallback(SB) MOVB runtime·goarm(SB), R11 CMP $6, R11 BLT skipfprest - MOVD (14*4+8*1)(R13), F8 - MOVD (14*4+8*2)(R13), F9 - MOVD (14*4+8*3)(R13), F10 - MOVD (14*4+8*4)(R13), F11 - MOVD (14*4+8*5)(R13), F12 - MOVD (14*4+8*6)(R13), F13 - MOVD (14*4+8*7)(R13), F14 - MOVD (14*4+8*8)(R13), F15 + MOVD (13*4+8*1)(R13), F8 + MOVD (13*4+8*2)(R13), F9 + MOVD (13*4+8*3)(R13), F10 + MOVD (13*4+8*4)(R13), F11 + MOVD (13*4+8*5)(R13), F12 + MOVD (13*4+8*6)(R13), F13 + MOVD (13*4+8*7)(R13), F14 + MOVD (13*4+8*8)(R13), F15 skipfprest: - MOVM.IAW (R13), [R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, g, R11, R12, R14] + MOVW.P 4(R13), R14 + MOVM.IAW (R13), [R0, R1, R3, R4, R5, R6, R7, R8, R9, g, R11, R12] ADD $(8*9), R13 MOVW R14, R15 diff --git a/src/runtime/cgo/asm_arm64.s b/src/runtime/cgo/asm_arm64.s index ce56f9b1c7..1cb25cf89e 100644 --- a/src/runtime/cgo/asm_arm64.s +++ b/src/runtime/cgo/asm_arm64.s @@ -5,19 +5,20 @@ #include "textflag.h" // Called by C code generated by cmd/cgo. -// func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr) -// Saves C callee-saved registers and calls fn with three arguments. +// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr) +// Saves C callee-saved registers and calls cgocallback with three arguments. +// fn is the PC of a func(a unsafe.Pointer) function. TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 /* * We still need to save all callee save register as before, and then - * push 3 args for fn (R1, R2, R3). + * push 3 args for fn (R0, R1, R3), skipping R2. * Also note that at procedure entry in gc world, 8(RSP) will be the * first arg. * TODO(minux): use LDP/STP here if it matters. 
*/ SUB $(8*24), RSP - MOVD R1, (8*1)(RSP) - MOVD R2, (8*2)(RSP) + MOVD R0, (8*1)(RSP) + MOVD R1, (8*2)(RSP) MOVD R3, (8*3)(RSP) MOVD R19, (8*4)(RSP) MOVD R20, (8*5)(RSP) @@ -40,15 +41,11 @@ TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 FMOVD F14, (8*22)(RSP) FMOVD F15, (8*23)(RSP) - MOVD R0, R19 - // Initialize Go ABI environment BL runtime·load_g(SB) - BL (R19) - MOVD (8*1)(RSP), R1 - MOVD (8*2)(RSP), R2 - MOVD (8*3)(RSP), R3 + BL runtime·cgocallback(SB) + MOVD (8*4)(RSP), R19 MOVD (8*5)(RSP), R20 MOVD (8*6)(RSP), R21 diff --git a/src/runtime/cgo/asm_mips64x.s b/src/runtime/cgo/asm_mips64x.s index 1235852dbe..e51cdf3d12 100644 --- a/src/runtime/cgo/asm_mips64x.s +++ b/src/runtime/cgo/asm_mips64x.s @@ -6,14 +6,14 @@ #include "textflag.h" -/* - * void crosscall2(void (*fn)(void*, int32, uintptr), void*, int32, uintptr) - * Save registers and call fn with two arguments. - */ +// Called by C code generated by cmd/cgo. +// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr) +// Saves C callee-saved registers and calls cgocallback with three arguments. +// fn is the PC of a func(a unsafe.Pointer) function. TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 /* * We still need to save all callee save register as before, and then - * push 3 args for fn (R5, R6, R7). + * push 3 args for fn (R4, R5, R7), skipping R6. * Also note that at procedure entry in gc world, 8(R29) will be the * first arg. 
*/ @@ -22,9 +22,9 @@ TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 #else ADDV $(-8*15), R29 #endif - MOVV R5, (8*1)(R29) // void* - MOVW R6, (8*2)(R29) // int32 - MOVV R7, (8*3)(R29) // uintptr + MOVV R4, (8*1)(R29) // fn unsafe.Pointer + MOVV R5, (8*2)(R29) // a unsafe.Pointer + MOVV R7, (8*3)(R29) // ctxt uintptr MOVV R16, (8*4)(R29) MOVV R17, (8*5)(R29) MOVV R18, (8*6)(R29) @@ -52,7 +52,8 @@ TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 SRLV $32, R31, RSB SLLV $32, RSB JAL runtime·load_g(SB) - JAL (R4) + + JAL runtime·cgocallback(SB) MOVV (8*4)(R29), R16 MOVV (8*5)(R29), R17 diff --git a/src/runtime/cgo/asm_mipsx.s b/src/runtime/cgo/asm_mipsx.s index e3090da223..1127c8beb4 100644 --- a/src/runtime/cgo/asm_mipsx.s +++ b/src/runtime/cgo/asm_mipsx.s @@ -6,14 +6,14 @@ #include "textflag.h" -/* - * void crosscall2(void (*fn)(void*, int32, uintptr), void*, int32, uintptr) - * Save registers and call fn with two arguments. - */ +// Called by C code generated by cmd/cgo. +// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr) +// Saves C callee-saved registers and calls cgocallback with three arguments. +// fn is the PC of a func(a unsafe.Pointer) function. TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 /* * We still need to save all callee save register as before, and then - * push 3 args for fn (R5, R6, R7). + * push 3 args for fn (R4, R5, R7), skipping R6. * Also note that at procedure entry in gc world, 4(R29) will be the * first arg. */ @@ -25,9 +25,9 @@ TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 #else SUBU $(4*14-16), R29 // For soft-float, no FPR. 
#endif - MOVW R5, (4*1)(R29) - MOVW R6, (4*2)(R29) - MOVW R7, (4*3)(R29) + MOVW R4, (4*1)(R29) // fn unsafe.Pointer + MOVW R5, (4*2)(R29) // a unsafe.Pointer + MOVW R7, (4*3)(R29) // ctxt uintptr MOVW R16, (4*4)(R29) MOVW R17, (4*5)(R29) MOVW R18, (4*6)(R29) @@ -47,7 +47,8 @@ TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 MOVD F30, (4*14+8*5)(R29) #endif JAL runtime·load_g(SB) - JAL (R4) + + JAL runtime·cgocallback(SB) MOVW (4*4)(R29), R16 MOVW (4*5)(R29), R17 diff --git a/src/runtime/cgo/asm_ppc64x.s b/src/runtime/cgo/asm_ppc64x.s index 3876f9389c..f4efc1e67d 100644 --- a/src/runtime/cgo/asm_ppc64x.s +++ b/src/runtime/cgo/asm_ppc64x.s @@ -8,8 +8,9 @@ #include "asm_ppc64x.h" // Called by C code generated by cmd/cgo. -// func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr) -// Saves C callee-saved registers and calls fn with three arguments. +// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr) +// Saves C callee-saved registers and calls cgocallback with three arguments. +// fn is the PC of a func(a unsafe.Pointer) function. TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 // Start with standard C stack frame layout and linkage MOVD LR, R0 @@ -26,19 +27,18 @@ TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 BL runtime·reginit(SB) BL runtime·load_g(SB) - MOVD R3, R12 #ifdef GOARCH_ppc64 // ppc64 use elf ABI v1. we must get the real entry address from // first slot of the function descriptor before call. // Same for AIX. 
- MOVD 8(R12), R2 - MOVD (R12), R12 + MOVD 8(R3), R2 + MOVD (R3), R3 #endif - MOVD R12, CTR - MOVD R4, FIXED_FRAME+0(R1) - MOVW R5, FIXED_FRAME+8(R1) - MOVD R6, FIXED_FRAME+16(R1) - BL (CTR) + MOVD R3, FIXED_FRAME+0(R1) // fn unsafe.Pointer + MOVD R4, FIXED_FRAME+8(R1) // a unsafe.Pointer + // Skip R5 = n uint32 + MOVD R6, FIXED_FRAME+16(R1) // ctxt uintptr + BL runtime·cgocallback(SB) ADD $(288+3*8+FIXED_FRAME), R1 diff --git a/src/runtime/cgo/asm_riscv64.s b/src/runtime/cgo/asm_riscv64.s new file mode 100644 index 0000000000..b4ddbb020f --- /dev/null +++ b/src/runtime/cgo/asm_riscv64.s @@ -0,0 +1,84 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build riscv64 + +#include "textflag.h" + +// Called by C code generated by cmd/cgo. +// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr) +// Saves C callee-saved registers and calls cgocallback with three arguments. +// fn is the PC of a func(a unsafe.Pointer) function. +TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 + /* + * Push arguments for fn (X10, X11, X13), along with all callee-save + * registers. Note that at procedure entry the first argument is at + * 8(X2). 
+ */ + ADD $(-8*31), X2 + MOV X10, (8*1)(X2) // fn unsafe.Pointer + MOV X11, (8*2)(X2) // a unsafe.Pointer + MOV X13, (8*3)(X2) // ctxt uintptr + MOV X8, (8*4)(X2) + MOV X9, (8*5)(X2) + MOV X18, (8*6)(X2) + MOV X19, (8*7)(X2) + MOV X20, (8*8)(X2) + MOV X21, (8*9)(X2) + MOV X22, (8*10)(X2) + MOV X23, (8*11)(X2) + MOV X24, (8*12)(X2) + MOV X25, (8*13)(X2) + MOV X26, (8*14)(X2) + MOV g, (8*15)(X2) + MOV X3, (8*16)(X2) + MOV X4, (8*17)(X2) + MOV X1, (8*18)(X2) + MOVD F8, (8*19)(X2) + MOVD F9, (8*20)(X2) + MOVD F18, (8*21)(X2) + MOVD F19, (8*22)(X2) + MOVD F20, (8*23)(X2) + MOVD F21, (8*24)(X2) + MOVD F22, (8*25)(X2) + MOVD F23, (8*26)(X2) + MOVD F24, (8*27)(X2) + MOVD F25, (8*28)(X2) + MOVD F26, (8*29)(X2) + MOVD F27, (8*30)(X2) + + // Initialize Go ABI environment + CALL runtime·load_g(SB) + CALL runtime·cgocallback(SB) + + MOV (8*4)(X2), X8 + MOV (8*5)(X2), X9 + MOV (8*6)(X2), X18 + MOV (8*7)(X2), X19 + MOV (8*8)(X2), X20 + MOV (8*9)(X2), X21 + MOV (8*10)(X2), X22 + MOV (8*11)(X2), X23 + MOV (8*12)(X2), X24 + MOV (8*13)(X2), X25 + MOV (8*14)(X2), X26 + MOV (8*15)(X2), g + MOV (8*16)(X2), X3 + MOV (8*17)(X2), X4 + MOV (8*18)(X2), X1 + MOVD (8*19)(X2), F8 + MOVD (8*20)(X2), F9 + MOVD (8*21)(X2), F18 + MOVD (8*22)(X2), F19 + MOVD (8*23)(X2), F20 + MOVD (8*24)(X2), F21 + MOVD (8*25)(X2), F22 + MOVD (8*26)(X2), F23 + MOVD (8*27)(X2), F24 + MOVD (8*28)(X2), F25 + MOVD (8*29)(X2), F26 + MOVD (8*30)(X2), F27 + ADD $(8*31), X2 + + RET diff --git a/src/runtime/cgo/asm_s390x.s b/src/runtime/cgo/asm_s390x.s index 7eab8f652a..8bf16e75e2 100644 --- a/src/runtime/cgo/asm_s390x.s +++ b/src/runtime/cgo/asm_s390x.s @@ -5,8 +5,9 @@ #include "textflag.h" // Called by C code generated by cmd/cgo. -// func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr) -// Saves C callee-saved registers and calls fn with three arguments. 
+// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr) +// Saves C callee-saved registers and calls cgocallback with three arguments. +// fn is the PC of a func(a unsafe.Pointer) function. TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 // Start with standard C stack frame layout and linkage. @@ -29,10 +30,11 @@ TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0 // Initialize Go ABI environment. BL runtime·load_g(SB) - MOVD R3, 8(R15) // arg1 - MOVW R4, 16(R15) // arg2 - MOVD R5, 24(R15) // arg3 - BL (R2) // fn(arg1, arg2, arg3) + MOVD R2, 8(R15) // fn unsafe.Pointer + MOVD R3, 16(R15) // a unsafe.Pointer + // Skip R4 = n uint32 + MOVD R5, 24(R15) // ctxt uintptr + BL runtime·cgocallback(SB) FMOVD 32(R15), F8 FMOVD 40(R15), F9 diff --git a/src/runtime/cgo/callbacks.go b/src/runtime/cgo/callbacks.go index 14a218ec92..cd8b795387 100644 --- a/src/runtime/cgo/callbacks.go +++ b/src/runtime/cgo/callbacks.go @@ -9,20 +9,18 @@ import "unsafe" // These utility functions are available to be called from code // compiled with gcc via crosscall2. -// cgocallback is defined in runtime -//go:linkname _runtime_cgocallback runtime.cgocallback -func _runtime_cgocallback(unsafe.Pointer, unsafe.Pointer, uintptr, uintptr) - // The declaration of crosscall2 is: -// void crosscall2(void (*fn)(void *, int), void *, int); +// void crosscall2(void (*fn)(void *), void *, int); // // We need to export the symbol crosscall2 in order to support // callbacks from shared libraries. This applies regardless of // linking mode. // -// Compatibility note: crosscall2 actually takes four arguments, but -// it works to call it with three arguments when calling _cgo_panic. -// That is supported for backward compatibility. +// Compatibility note: SWIG uses crosscall2 in exactly one situation: +// to call _cgo_panic using the pattern shown below. We need to keep +// that pattern working. 
In particular, crosscall2 actually takes four +// arguments, but it works to call it with three arguments when +// calling _cgo_panic. //go:cgo_export_static crosscall2 //go:cgo_export_dynamic crosscall2 @@ -34,21 +32,18 @@ func _runtime_cgocallback(unsafe.Pointer, unsafe.Pointer, uintptr, uintptr) // crosscall2(_cgo_panic, &a, sizeof a); // /* The function call will not return. */ +// TODO: We should export a regular C function to panic, change SWIG +// to use that instead of the above pattern, and then we can drop +// backwards-compatibility from crosscall2 and stop exporting it. + //go:linkname _runtime_cgo_panic_internal runtime._cgo_panic_internal func _runtime_cgo_panic_internal(p *byte) //go:linkname _cgo_panic _cgo_panic //go:cgo_export_static _cgo_panic //go:cgo_export_dynamic _cgo_panic -//go:nosplit -//go:norace -func _cgo_panic(a unsafe.Pointer, n int32) { - f := _runtime_cgo_panic_internal - type funcval struct { - pc unsafe.Pointer - } - fv := *(**funcval)(unsafe.Pointer(&f)) - _runtime_cgocallback(fv.pc, a, uintptr(n), 0) +func _cgo_panic(a *struct{ cstr *byte }) { + _runtime_cgo_panic_internal(a.cstr) } //go:cgo_import_static x_cgo_init diff --git a/src/runtime/cgo/cgo.go b/src/runtime/cgo/cgo.go index c02b837978..4d2caf6c4f 100644 --- a/src/runtime/cgo/cgo.go +++ b/src/runtime/cgo/cgo.go @@ -21,6 +21,7 @@ package cgo #cgo openbsd LDFLAGS: -lpthread #cgo aix LDFLAGS: -Wl,-berok #cgo solaris LDFLAGS: -lxnet +#cgo illumos LDFLAGS: -lsocket // Issue 35247. 
#cgo darwin CFLAGS: -Wno-nullability-completeness diff --git a/src/runtime/cgo/gcc_darwin_arm64.c b/src/runtime/cgo/gcc_darwin_arm64.c index fd7d4084c9..a5f07f1f1b 100644 --- a/src/runtime/cgo/gcc_darwin_arm64.c +++ b/src/runtime/cgo/gcc_darwin_arm64.c @@ -10,42 +10,15 @@ #include <unistd.h> #include <stdlib.h> -#include <CoreFoundation/CFBundle.h> -#include <CoreFoundation/CFString.h> - #include "libcgo.h" #include "libcgo_unix.h" -#define magic (0xc476c475c47957UL) +#include <TargetConditionals.h> -// inittls allocates a thread-local storage slot for g. -// -// It finds the first available slot using pthread_key_create and uses -// it as the offset value for runtime.tlsg. -static void -inittls(void **tlsg, void **tlsbase) -{ - pthread_key_t k; - int i, err; - - err = pthread_key_create(&k, nil); - if(err != 0) { - fprintf(stderr, "runtime/cgo: pthread_key_create failed: %d\n", err); - abort(); - } - //fprintf(stderr, "runtime/cgo: k = %d, tlsbase = %p\n", (int)k, tlsbase); // debug - pthread_setspecific(k, (void*)magic); - // The first key should be at 257. - for (i=0; i<PTHREAD_KEYS_MAX; i++) { - if (*(tlsbase+i) == (void*)magic) { - *tlsg = (void*)(i*sizeof(void *)); - pthread_setspecific(k, 0); - return; - } - } - fprintf(stderr, "runtime/cgo: could not find pthread key.\n"); - abort(); -} +#if TARGET_OS_IPHONE +#include <CoreFoundation/CFBundle.h> +#include <CoreFoundation/CFString.h> +#endif static void *threadentry(void*); static void (*setg_gcc)(void*); @@ -87,14 +60,18 @@ threadentry(void *v) ts = *(ThreadStart*)v; free(v); +#if TARGET_OS_IPHONE darwin_arm_init_thread_exception_port(); +#endif crosscall1(ts.fn, setg_gcc, (void*)ts.g); return nil; } +#if TARGET_OS_IPHONE + // init_working_dir sets the current working directory to the app root. -// By default darwin/arm64 processes start in "/". +// By default ios/arm64 processes start in "/". 
static void init_working_dir() { @@ -131,7 +108,7 @@ init_working_dir() fprintf(stderr, "runtime/cgo: chdir(%s) failed\n", dir); } - // The test harness in go_darwin_arm_exec passes the relative working directory + // The test harness in go_ios_exec passes the relative working directory // in the GoExecWrapperWorkingDirectory property of the app bundle. CFStringRef wd_ref = CFBundleGetValueForInfoDictionaryKey(bundle, CFSTR("GoExecWrapperWorkingDirectory")); if (wd_ref != NULL) { @@ -145,8 +122,10 @@ init_working_dir() } } +#endif // TARGET_OS_IPHONE + void -x_cgo_init(G *g, void (*setg)(void*), void **tlsg, void **tlsbase) +x_cgo_init(G *g, void (*setg)(void*)) { pthread_attr_t attr; size_t size; @@ -158,11 +137,9 @@ x_cgo_init(G *g, void (*setg)(void*), void **tlsg, void **tlsbase) g->stacklo = (uintptr)&attr - size + 4096; pthread_attr_destroy(&attr); - // yes, tlsbase from mrs might not be correctly aligned. - inittls(tlsg, (void**)((uintptr)tlsbase & ~7)); - +#if TARGET_OS_IPHONE darwin_arm_init_mach_exception_handler(); darwin_arm_init_thread_exception_port(); - init_working_dir(); +#endif } diff --git a/src/runtime/cgo/gcc_libinit_windows.c b/src/runtime/cgo/gcc_libinit_windows.c index 9fd7d36bfb..2732248bdc 100644 --- a/src/runtime/cgo/gcc_libinit_windows.c +++ b/src/runtime/cgo/gcc_libinit_windows.c @@ -3,6 +3,7 @@ // license that can be found in the LICENSE file. // +build cgo + #define WIN64_LEAN_AND_MEAN #include <windows.h> #include <process.h> diff --git a/src/runtime/cgo/gcc_linux_riscv64.c b/src/runtime/cgo/gcc_linux_riscv64.c new file mode 100644 index 0000000000..22b76c2fcc --- /dev/null +++ b/src/runtime/cgo/gcc_linux_riscv64.c @@ -0,0 +1,74 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include <pthread.h> +#include <string.h> +#include <signal.h> +#include "libcgo.h" +#include "libcgo_unix.h" + +static void *threadentry(void*); + +void (*x_cgo_inittls)(void **tlsg, void **tlsbase); +static void (*setg_gcc)(void*); + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + sigset_t ign, oset; + pthread_t p; + size_t size; + int err; + + sigfillset(&ign); + pthread_sigmask(SIG_SETMASK, &ign, &oset); + + // Not sure why the memset is necessary here, + // but without it, we get a bogus stack size + // out of pthread_attr_getstacksize. C'est la Linux. + memset(&attr, 0, sizeof attr); + pthread_attr_init(&attr); + size = 0; + pthread_attr_getstacksize(&attr, &size); + // Leave stacklo=0 and set stackhi=size; mstart will do the rest. + ts->g->stackhi = size; + err = _cgo_try_pthread_create(&p, &attr, threadentry, ts); + + pthread_sigmask(SIG_SETMASK, &oset, nil); + + if (err != 0) { + fatalf("pthread_create failed: %s", strerror(err)); + } +} + +extern void crosscall1(void (*fn)(void), void (*setg_gcc)(void*), void *g); +static void* +threadentry(void *v) +{ + ThreadStart ts; + + ts = *(ThreadStart*)v; + free(v); + + crosscall1(ts.fn, setg_gcc, (void*)ts.g); + return nil; +} + +void +x_cgo_init(G *g, void (*setg)(void*), void **tlsg, void **tlsbase) +{ + pthread_attr_t attr; + size_t size; + + setg_gcc = setg; + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + g->stacklo = (uintptr)&attr - size + 4096; + pthread_attr_destroy(&attr); + + if (x_cgo_inittls) { + x_cgo_inittls(tlsg, tlsbase); + } +} diff --git a/src/runtime/cgo/gcc_netbsd_arm64.c b/src/runtime/cgo/gcc_netbsd_arm64.c index b29fab0f8c..694116ce70 100644 --- a/src/runtime/cgo/gcc_netbsd_arm64.c +++ b/src/runtime/cgo/gcc_netbsd_arm64.c @@ -53,6 +53,8 @@ _cgo_sys_thread_start(ThreadStart *ts) } } +extern void crosscall1(void (*fn)(void), void (*setg_gcc)(void*), void *g); + static void* threadentry(void *v) { diff --git a/src/runtime/cgo/gcc_riscv64.S 
b/src/runtime/cgo/gcc_riscv64.S new file mode 100644 index 0000000000..f429dc64ee --- /dev/null +++ b/src/runtime/cgo/gcc_riscv64.S @@ -0,0 +1,80 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* + * void crosscall1(void (*fn)(void), void (*setg_gcc)(void *g), void *g) + * + * Calling into the gc tool chain, where all registers are caller save. + * Called from standard RISCV ELF psABI, where x8-x9, x18-x27, f8-f9 and + * f18-f27 are callee-save, so they must be saved explicitly, along with + * x1 (LR). + */ +.globl crosscall1 +crosscall1: + sd x1, -200(sp) + addi sp, sp, -200 + sd x8, 8(sp) + sd x9, 16(sp) + sd x18, 24(sp) + sd x19, 32(sp) + sd x20, 40(sp) + sd x21, 48(sp) + sd x22, 56(sp) + sd x23, 64(sp) + sd x24, 72(sp) + sd x25, 80(sp) + sd x26, 88(sp) + sd x27, 96(sp) + fsd f8, 104(sp) + fsd f9, 112(sp) + fsd f18, 120(sp) + fsd f19, 128(sp) + fsd f20, 136(sp) + fsd f21, 144(sp) + fsd f22, 152(sp) + fsd f23, 160(sp) + fsd f24, 168(sp) + fsd f25, 176(sp) + fsd f26, 184(sp) + fsd f27, 192(sp) + + // a0 = *fn, a1 = *setg_gcc, a2 = *g + mv s1, a0 + mv s0, a1 + mv a0, a2 + jalr ra, s0 // call setg_gcc (clobbers x30 aka g) + jalr ra, s1 // call fn + + ld x1, 0(sp) + ld x8, 8(sp) + ld x9, 16(sp) + ld x18, 24(sp) + ld x19, 32(sp) + ld x20, 40(sp) + ld x21, 48(sp) + ld x22, 56(sp) + ld x23, 64(sp) + ld x24, 72(sp) + ld x25, 80(sp) + ld x26, 88(sp) + ld x27, 96(sp) + fld f8, 104(sp) + fld f9, 112(sp) + fld f18, 120(sp) + fld f19, 128(sp) + fld f20, 136(sp) + fld f21, 144(sp) + fld f22, 152(sp) + fld f23, 160(sp) + fld f24, 168(sp) + fld f25, 176(sp) + fld f26, 184(sp) + fld f27, 192(sp) + addi sp, sp, 200 + + jr ra + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/src/runtime/cgo/gcc_signal2_darwin_arm64.c b/src/runtime/cgo/gcc_signal2_ios_arm64.c index 5b8a18ffd6..5b8a18ffd6 100644 --- 
a/src/runtime/cgo/gcc_signal2_darwin_arm64.c +++ b/src/runtime/cgo/gcc_signal2_ios_arm64.c diff --git a/src/runtime/cgo/gcc_signal_darwin_arm64.c b/src/runtime/cgo/gcc_signal_ios_arm64.c index 6519edd4cc..6519edd4cc 100644 --- a/src/runtime/cgo/gcc_signal_darwin_arm64.c +++ b/src/runtime/cgo/gcc_signal_ios_arm64.c diff --git a/src/runtime/cgo/gcc_signal_darwin_lldb.c b/src/runtime/cgo/gcc_signal_ios_nolldb.c index 0ccdae324e..cfa4025414 100644 --- a/src/runtime/cgo/gcc_signal_darwin_lldb.c +++ b/src/runtime/cgo/gcc_signal_ios_nolldb.c @@ -3,7 +3,7 @@ // license that can be found in the LICENSE file. // +build !lldb -// +build darwin +// +build ios // +build arm64 #include <stdint.h> diff --git a/src/runtime/cgo/gcc_windows_386.c b/src/runtime/cgo/gcc_windows_386.c index 9184b91393..60cb011bf2 100644 --- a/src/runtime/cgo/gcc_windows_386.c +++ b/src/runtime/cgo/gcc_windows_386.c @@ -9,6 +9,7 @@ #include <stdio.h> #include <errno.h> #include "libcgo.h" +#include "libcgo_windows.h" static void threadentry(void*); diff --git a/src/runtime/cgo/gcc_windows_amd64.c b/src/runtime/cgo/gcc_windows_amd64.c index 7192a24631..0f8c817f0e 100644 --- a/src/runtime/cgo/gcc_windows_amd64.c +++ b/src/runtime/cgo/gcc_windows_amd64.c @@ -9,6 +9,7 @@ #include <stdio.h> #include <errno.h> #include "libcgo.h" +#include "libcgo_windows.h" static void threadentry(void*); diff --git a/src/runtime/cgo/libcgo_windows.h b/src/runtime/cgo/libcgo_windows.h new file mode 100644 index 0000000000..0013f06bae --- /dev/null +++ b/src/runtime/cgo/libcgo_windows.h @@ -0,0 +1,12 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Ensure there's one symbol marked __declspec(dllexport). +// If there are no exported symbols, the unfortunate behavior of +// the binutils linker is to also strip the relocations table, +// resulting in non-PIE binary. 
The other option is the +// --export-all-symbols flag, but we don't need to export all symbols +// and this may overflow the export table (#40795). +// See https://sourceware.org/bugzilla/show_bug.cgi?id=19011 +__declspec(dllexport) int _cgo_dummy_export; diff --git a/src/runtime/cgo/linux.go b/src/runtime/cgo/linux.go new file mode 100644 index 0000000000..76c0192c20 --- /dev/null +++ b/src/runtime/cgo/linux.go @@ -0,0 +1,74 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Linux system call wrappers that provide POSIX semantics through the +// corresponding cgo->libc (nptl) wrappers for various system calls. + +// +build linux + +package cgo + +import "unsafe" + +// Each of the following entries is needed to ensure that the +// syscall.syscall_linux code can conditionally call these +// function pointers: +// +// 1. find the C-defined function start +// 2. force the local byte alias to be mapped to that location +// 3. 
map the Go pointer to the function to the syscall package + +//go:cgo_import_static _cgo_libc_setegid +//go:linkname _cgo_libc_setegid _cgo_libc_setegid +//go:linkname cgo_libc_setegid syscall.cgo_libc_setegid +var _cgo_libc_setegid byte +var cgo_libc_setegid = unsafe.Pointer(&_cgo_libc_setegid) + +//go:cgo_import_static _cgo_libc_seteuid +//go:linkname _cgo_libc_seteuid _cgo_libc_seteuid +//go:linkname cgo_libc_seteuid syscall.cgo_libc_seteuid +var _cgo_libc_seteuid byte +var cgo_libc_seteuid = unsafe.Pointer(&_cgo_libc_seteuid) + +//go:cgo_import_static _cgo_libc_setregid +//go:linkname _cgo_libc_setregid _cgo_libc_setregid +//go:linkname cgo_libc_setregid syscall.cgo_libc_setregid +var _cgo_libc_setregid byte +var cgo_libc_setregid = unsafe.Pointer(&_cgo_libc_setregid) + +//go:cgo_import_static _cgo_libc_setresgid +//go:linkname _cgo_libc_setresgid _cgo_libc_setresgid +//go:linkname cgo_libc_setresgid syscall.cgo_libc_setresgid +var _cgo_libc_setresgid byte +var cgo_libc_setresgid = unsafe.Pointer(&_cgo_libc_setresgid) + +//go:cgo_import_static _cgo_libc_setresuid +//go:linkname _cgo_libc_setresuid _cgo_libc_setresuid +//go:linkname cgo_libc_setresuid syscall.cgo_libc_setresuid +var _cgo_libc_setresuid byte +var cgo_libc_setresuid = unsafe.Pointer(&_cgo_libc_setresuid) + +//go:cgo_import_static _cgo_libc_setreuid +//go:linkname _cgo_libc_setreuid _cgo_libc_setreuid +//go:linkname cgo_libc_setreuid syscall.cgo_libc_setreuid +var _cgo_libc_setreuid byte +var cgo_libc_setreuid = unsafe.Pointer(&_cgo_libc_setreuid) + +//go:cgo_import_static _cgo_libc_setgroups +//go:linkname _cgo_libc_setgroups _cgo_libc_setgroups +//go:linkname cgo_libc_setgroups syscall.cgo_libc_setgroups +var _cgo_libc_setgroups byte +var cgo_libc_setgroups = unsafe.Pointer(&_cgo_libc_setgroups) + +//go:cgo_import_static _cgo_libc_setgid +//go:linkname _cgo_libc_setgid _cgo_libc_setgid +//go:linkname cgo_libc_setgid syscall.cgo_libc_setgid +var _cgo_libc_setgid byte +var cgo_libc_setgid = 
unsafe.Pointer(&_cgo_libc_setgid) + +//go:cgo_import_static _cgo_libc_setuid +//go:linkname _cgo_libc_setuid _cgo_libc_setuid +//go:linkname cgo_libc_setuid syscall.cgo_libc_setuid +var _cgo_libc_setuid byte +var cgo_libc_setuid = unsafe.Pointer(&_cgo_libc_setuid) diff --git a/src/runtime/cgo/linux_syscall.c b/src/runtime/cgo/linux_syscall.c new file mode 100644 index 0000000000..56f3d67d8b --- /dev/null +++ b/src/runtime/cgo/linux_syscall.c @@ -0,0 +1,85 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build linux + +#ifndef _GNU_SOURCE // setres[ug]id() API. +#define _GNU_SOURCE +#endif + +#include <grp.h> +#include <sys/types.h> +#include <unistd.h> +#include <errno.h> +#include "libcgo.h" + +/* + * Assumed POSIX compliant libc system call wrappers. For linux, the + * glibc/nptl/setxid mechanism ensures that POSIX semantics are + * honored for all pthreads (by default), and this in turn with cgo + * ensures that all Go threads launched with cgo are kept in sync for + * these function calls. + */ + +// argset_t matches runtime/cgocall.go:argset. +typedef struct { + uintptr_t* args; + uintptr_t retval; +} argset_t; + +// libc backed posix-compliant syscalls. 
+ +#define SET_RETVAL(fn) \ + uintptr_t ret = (uintptr_t) fn ; \ + if (ret == -1) { \ + x->retval = (uintptr_t) errno; \ + } else \ + x->retval = ret + +void +_cgo_libc_setegid(argset_t* x) { + SET_RETVAL(setegid((gid_t) x->args[0])); +} + +void +_cgo_libc_seteuid(argset_t* x) { + SET_RETVAL(seteuid((uid_t) x->args[0])); +} + +void +_cgo_libc_setgid(argset_t* x) { + SET_RETVAL(setgid((gid_t) x->args[0])); +} + +void +_cgo_libc_setgroups(argset_t* x) { + SET_RETVAL(setgroups((size_t) x->args[0], (const gid_t *) x->args[1])); +} + +void +_cgo_libc_setregid(argset_t* x) { + SET_RETVAL(setregid((gid_t) x->args[0], (gid_t) x->args[1])); +} + +void +_cgo_libc_setresgid(argset_t* x) { + SET_RETVAL(setresgid((gid_t) x->args[0], (gid_t) x->args[1], + (gid_t) x->args[2])); +} + +void +_cgo_libc_setresuid(argset_t* x) { + SET_RETVAL(setresuid((uid_t) x->args[0], (uid_t) x->args[1], + (uid_t) x->args[2])); +} + +void +_cgo_libc_setreuid(argset_t* x) { + SET_RETVAL(setreuid((uid_t) x->args[0], (uid_t) x->args[1])); +} + +void +_cgo_libc_setuid(argset_t* x) { + SET_RETVAL(setuid((uid_t) x->args[0])); +} diff --git a/src/runtime/cgo/signal_darwin_arm64.go b/src/runtime/cgo/signal_ios_arm64.go index 3425c448c4..3425c448c4 100644 --- a/src/runtime/cgo/signal_darwin_arm64.go +++ b/src/runtime/cgo/signal_ios_arm64.go diff --git a/src/runtime/cgo/signal_darwin_arm64.s b/src/runtime/cgo/signal_ios_arm64.s index 1ae00d13f3..1ae00d13f3 100644 --- a/src/runtime/cgo/signal_darwin_arm64.s +++ b/src/runtime/cgo/signal_ios_arm64.s diff --git a/src/runtime/cgocall.go b/src/runtime/cgocall.go index 427ed0ffb9..20cacd6043 100644 --- a/src/runtime/cgocall.go +++ b/src/runtime/cgocall.go @@ -35,47 +35,52 @@ // cgo writes a gcc-compiled function named GoF (not p.GoF, since gcc doesn't // know about packages). The gcc-compiled C function f calls GoF. // -// GoF calls crosscall2(_cgoexp_GoF, frame, framesize). 
Crosscall2 -// (in cgo/gcc_$GOARCH.S, a gcc-compiled assembly file) is a two-argument -// adapter from the gcc function call ABI to the 6c function call ABI. -// It is called from gcc to call 6c functions. In this case it calls -// _cgoexp_GoF(frame, framesize), still running on m->g0's stack -// and outside the $GOMAXPROCS limit. Thus, this code cannot yet -// call arbitrary Go code directly and must be careful not to allocate -// memory or use up m->g0's stack. +// GoF initializes "frame", a structure containing all of its +// arguments and slots for p.GoF's results. It calls +// crosscall2(_cgoexp_GoF, frame, framesize, ctxt) using the gcc ABI. // -// _cgoexp_GoF calls runtime.cgocallback(p.GoF, frame, framesize, ctxt). -// (The reason for having _cgoexp_GoF instead of writing a crosscall3 -// to make this call directly is that _cgoexp_GoF, because it is compiled -// with 6c instead of gcc, can refer to dotted names like -// runtime.cgocallback and p.GoF.) +// crosscall2 (in cgo/asm_$GOARCH.s) is a four-argument adapter from +// the gcc function call ABI to the gc function call ABI. At this +// point we're in the Go runtime, but we're still running on m.g0's +// stack and outside the $GOMAXPROCS limit. crosscall2 calls +// runtime.cgocallback(_cgoexp_GoF, frame, ctxt) using the gc ABI. +// (crosscall2's framesize argument is no longer used, but there's one +// case where SWIG calls crosscall2 directly and expects to pass this +// argument. See _cgo_panic.) // -// runtime.cgocallback (in asm_$GOARCH.s) switches from m->g0's -// stack to the original g (m->curg)'s stack, on which it calls -// runtime.cgocallbackg(p.GoF, frame, framesize). -// As part of the stack switch, runtime.cgocallback saves the current -// SP as m->g0->sched.sp, so that any use of m->g0's stack during the -// execution of the callback will be done below the existing stack frames. 
-// Before overwriting m->g0->sched.sp, it pushes the old value on the -// m->g0 stack, so that it can be restored later. +// runtime.cgocallback (in asm_$GOARCH.s) switches from m.g0's stack +// to the original g (m.curg)'s stack, on which it calls +// runtime.cgocallbackg(_cgoexp_GoF, frame, ctxt). As part of the +// stack switch, runtime.cgocallback saves the current SP as +// m.g0.sched.sp, so that any use of m.g0's stack during the execution +// of the callback will be done below the existing stack frames. +// Before overwriting m.g0.sched.sp, it pushes the old value on the +// m.g0 stack, so that it can be restored later. // // runtime.cgocallbackg (below) is now running on a real goroutine -// stack (not an m->g0 stack). First it calls runtime.exitsyscall, which will +// stack (not an m.g0 stack). First it calls runtime.exitsyscall, which will // block until the $GOMAXPROCS limit allows running this goroutine. // Once exitsyscall has returned, it is safe to do things like call the memory -// allocator or invoke the Go callback function p.GoF. runtime.cgocallbackg -// first defers a function to unwind m->g0.sched.sp, so that if p.GoF -// panics, m->g0.sched.sp will be restored to its old value: the m->g0 stack -// and the m->curg stack will be unwound in lock step. -// Then it calls p.GoF. Finally it pops but does not execute the deferred -// function, calls runtime.entersyscall, and returns to runtime.cgocallback. +// allocator or invoke the Go callback function. runtime.cgocallbackg +// first defers a function to unwind m.g0.sched.sp, so that if p.GoF +// panics, m.g0.sched.sp will be restored to its old value: the m.g0 stack +// and the m.curg stack will be unwound in lock step. +// Then it calls _cgoexp_GoF(frame). +// +// _cgoexp_GoF, which was generated by cmd/cgo, unpacks the arguments +// from frame, calls p.GoF, writes the results back to frame, and +// returns. Now we start unwinding this whole process. 
+// +// runtime.cgocallbackg pops but does not execute the deferred +// function to unwind m.g0.sched.sp, calls runtime.entersyscall, and +// returns to runtime.cgocallback. // // After it regains control, runtime.cgocallback switches back to -// m->g0's stack (the pointer is still in m->g0.sched.sp), restores the old -// m->g0.sched.sp value from the stack, and returns to _cgoexp_GoF. +// m.g0's stack (the pointer is still in m.g0.sched.sp), restores the old +// m.g0.sched.sp value from the stack, and returns to crosscall2. // -// _cgoexp_GoF immediately returns to crosscall2, which restores the -// callee-save registers for gcc and returns to GoF, which returns to f. +// crosscall2 restores the callee-save registers for gcc and returns +// to GoF, which unpacks any result values and returns to f. package runtime @@ -89,6 +94,22 @@ import ( // Length must match arg.Max in x_cgo_callers in runtime/cgo/gcc_traceback.c. type cgoCallers [32]uintptr +// argset matches runtime/cgo/linux_syscall.c:argset_t +type argset struct { + args unsafe.Pointer + retval uintptr +} + +// wrapper for syscall package to call cgocall for libc (cgo) calls. +//go:linkname syscall_cgocaller syscall.cgocaller +//go:nosplit +//go:uintptrescapes +func syscall_cgocaller(fn unsafe.Pointer, args ...uintptr) uintptr { + as := argset{args: unsafe.Pointer(&args[0])} + cgocall(fn, unsafe.Pointer(&as)) + return as.retval +} + // Call from Go to C. // // This must be nosplit because it's used for syscalls on some @@ -176,7 +197,7 @@ func cgocall(fn, arg unsafe.Pointer) int32 { // Call from C back to Go. //go:nosplit -func cgocallbackg(ctxt uintptr) { +func cgocallbackg(fn, frame unsafe.Pointer, ctxt uintptr) { gp := getg() if gp != gp.m.curg { println("runtime: bad g in cgocallback") @@ -204,7 +225,7 @@ func cgocallbackg(ctxt uintptr) { osPreemptExtExit(gp.m) - cgocallbackg1(ctxt) + cgocallbackg1(fn, frame, ctxt) // At this point unlockOSThread has been called. 
// The following code must not change to a different m. @@ -219,7 +240,7 @@ func cgocallbackg(ctxt uintptr) { gp.m.syscall = syscall } -func cgocallbackg1(ctxt uintptr) { +func cgocallbackg1(fn, frame unsafe.Pointer, ctxt uintptr) { gp := getg() if gp.m.needextram || atomic.Load(&extraMWaiters) > 0 { gp.m.needextram = false @@ -263,79 +284,16 @@ func cgocallbackg1(ctxt uintptr) { raceacquire(unsafe.Pointer(&racecgosync)) } - type args struct { - fn *funcval - arg unsafe.Pointer - argsize uintptr - } - var cb *args - - // Location of callback arguments depends on stack frame layout - // and size of stack frame of cgocallback_gofunc. - sp := gp.m.g0.sched.sp - switch GOARCH { - default: - throw("cgocallbackg is unimplemented on arch") - case "arm": - // On arm, stack frame is two words and there's a saved LR between - // SP and the stack frame and between the stack frame and the arguments. - cb = (*args)(unsafe.Pointer(sp + 4*sys.PtrSize)) - case "arm64": - // On arm64, stack frame is four words and there's a saved LR between - // SP and the stack frame and between the stack frame and the arguments. - // Additional two words (16-byte alignment) are for saving FP. - cb = (*args)(unsafe.Pointer(sp + 7*sys.PtrSize)) - case "amd64": - // On amd64, stack frame is two words, plus caller PC and BP. - cb = (*args)(unsafe.Pointer(sp + 4*sys.PtrSize)) - case "386": - // On 386, stack frame is three words, plus caller PC. - cb = (*args)(unsafe.Pointer(sp + 4*sys.PtrSize)) - case "ppc64", "ppc64le", "s390x": - // On ppc64 and s390x, the callback arguments are in the arguments area of - // cgocallback's stack frame. The stack looks like this: - // +--------------------+------------------------------+ - // | | ... 
| - // | cgoexp_$fn +------------------------------+ - // | | fixed frame area | - // +--------------------+------------------------------+ - // | | arguments area | - // | cgocallback +------------------------------+ <- sp + 2*minFrameSize + 2*ptrSize - // | | fixed frame area | - // +--------------------+------------------------------+ <- sp + minFrameSize + 2*ptrSize - // | | local variables (2 pointers) | - // | cgocallback_gofunc +------------------------------+ <- sp + minFrameSize - // | | fixed frame area | - // +--------------------+------------------------------+ <- sp - cb = (*args)(unsafe.Pointer(sp + 2*sys.MinFrameSize + 2*sys.PtrSize)) - case "mips64", "mips64le": - // On mips64x, stack frame is two words and there's a saved LR between - // SP and the stack frame and between the stack frame and the arguments. - cb = (*args)(unsafe.Pointer(sp + 4*sys.PtrSize)) - case "mips", "mipsle": - // On mipsx, stack frame is two words and there's a saved LR between - // SP and the stack frame and between the stack frame and the arguments. - cb = (*args)(unsafe.Pointer(sp + 4*sys.PtrSize)) - } - - // Invoke callback. - // NOTE(rsc): passing nil for argtype means that the copying of the - // results back into cb.arg happens without any corresponding write barriers. - // For cgo, cb.arg points into a C stack frame and therefore doesn't - // hold any pointers that the GC can find anyway - the write barrier - // would be a no-op. - reflectcall(nil, unsafe.Pointer(cb.fn), cb.arg, uint32(cb.argsize), 0) + // Invoke callback. This function is generated by cmd/cgo and + // will unpack the argument frame and call the Go function. + var cb func(frame unsafe.Pointer) + cbFV := funcval{uintptr(fn)} + *(*unsafe.Pointer)(unsafe.Pointer(&cb)) = noescape(unsafe.Pointer(&cbFV)) + cb(frame) if raceenabled { racereleasemerge(unsafe.Pointer(&racecgosync)) } - if msanenabled { - // Tell msan that we wrote to the entire argument block. - // This tells msan that we set the results. 
- // Since we have already called the function it doesn't - // matter that we are writing to the non-result parameters. - msanwrite(cb.arg, cb.argsize) - } // Do not unwind m->g0->sched.sp. // Our caller, cgocallback, will do that. @@ -351,7 +309,7 @@ func unwindm(restore *bool) { switch GOARCH { default: throw("unwindm not implemented") - case "386", "amd64", "arm", "ppc64", "ppc64le", "mips64", "mips64le", "s390x", "mips", "mipsle": + case "386", "amd64", "arm", "ppc64", "ppc64le", "mips64", "mips64le", "s390x", "mips", "mipsle", "riscv64": sched.sp = *(*uintptr)(unsafe.Pointer(sched.sp + sys.MinFrameSize)) case "arm64": sched.sp = *(*uintptr)(unsafe.Pointer(sched.sp + 16)) diff --git a/src/runtime/chan.go b/src/runtime/chan.go index 0afe5d962b..ba56e2cc40 100644 --- a/src/runtime/chan.go +++ b/src/runtime/chan.go @@ -215,8 +215,7 @@ func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool { // Space is available in the channel buffer. Enqueue the element to send. qp := chanbuf(c, c.sendx) if raceenabled { - raceacquire(qp) - racerelease(qp) + racenotify(c, c.sendx, nil) } typedmemmove(c.elemtype, qp, ep) c.sendx++ @@ -250,6 +249,11 @@ func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool { gp.waiting = mysg gp.param = nil c.sendq.enqueue(mysg) + // Signal to anyone trying to shrink our stack that we're about + // to park on a channel. The window between when this G's status + // changes and when we set gp.activeStackChans is not safe for + // stack shrinking. + atomic.Store8(&gp.parkingOnChan, 1) gopark(chanparkcommit, unsafe.Pointer(&c.lock), waitReasonChanSend, traceEvGoBlockSend, 2) // Ensure the value being sent is kept alive until the // receiver copies it out. The sudog has a pointer to the @@ -293,11 +297,8 @@ func send(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) { // Pretend we go through the buffer, even though // we copy directly. 
Note that we need to increment // the head/tail locations only when raceenabled. - qp := chanbuf(c, c.recvx) - raceacquire(qp) - racerelease(qp) - raceacquireg(sg.g, qp) - racereleaseg(sg.g, qp) + racenotify(c, c.recvx, nil) + racenotify(c, c.recvx, sg) c.recvx++ if c.recvx == c.dataqsiz { c.recvx = 0 @@ -530,8 +531,7 @@ func chanrecv(c *hchan, ep unsafe.Pointer, block bool) (selected, received bool) // Receive directly from queue qp := chanbuf(c, c.recvx) if raceenabled { - raceacquire(qp) - racerelease(qp) + racenotify(c, c.recvx, nil) } if ep != nil { typedmemmove(c.elemtype, ep, qp) @@ -568,6 +568,11 @@ func chanrecv(c *hchan, ep unsafe.Pointer, block bool) (selected, received bool) mysg.c = c gp.param = nil c.recvq.enqueue(mysg) + // Signal to anyone trying to shrink our stack that we're about + // to park on a channel. The window between when this G's status + // changes and when we set gp.activeStackChans is not safe for + // stack shrinking. + atomic.Store8(&gp.parkingOnChan, 1) gopark(chanparkcommit, unsafe.Pointer(&c.lock), waitReasonChanReceive, traceEvGoBlockRecv, 2) // someone woke us up @@ -615,10 +620,8 @@ func recv(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) { // queue is full, those are both the same slot. qp := chanbuf(c, c.recvx) if raceenabled { - raceacquire(qp) - racerelease(qp) - raceacquireg(sg.g, qp) - racereleaseg(sg.g, qp) + racenotify(c, c.recvx, nil) + racenotify(c, c.recvx, sg) } // copy data from queue to receiver if ep != nil { @@ -646,7 +649,19 @@ func recv(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) { func chanparkcommit(gp *g, chanLock unsafe.Pointer) bool { // There are unlocked sudogs that point into gp's stack. Stack // copying must lock the channels of those sudogs. + // Set activeStackChans here instead of before we try parking + // because we could self-deadlock in stack growth on the + // channel lock. 
gp.activeStackChans = true + // Mark that it's safe for stack shrinking to occur now, + // because any thread acquiring this G's stack for shrinking + // is guaranteed to observe activeStackChans after this store. + atomic.Store8(&gp.parkingOnChan, 0) + // Make sure we unlock after setting activeStackChans and + // unsetting parkingOnChan. The moment we unlock chanLock + // we risk gp getting readied by a channel operation and + // so gp could continue running before everything before + // the unlock is visible (even to gp itself). unlock((*mutex)(chanLock)) return true } @@ -817,3 +832,38 @@ func racesync(c *hchan, sg *sudog) { racereleaseg(sg.g, chanbuf(c, 0)) raceacquire(chanbuf(c, 0)) } + +// Notify the race detector of a send or receive involving buffer entry idx +// and a channel c or its communicating partner sg. +// This function handles the special case of c.elemsize==0. +func racenotify(c *hchan, idx uint, sg *sudog) { + // We could have passed the unsafe.Pointer corresponding to entry idx + // instead of idx itself. However, in a future version of this function, + // we can use idx to better handle the case of elemsize==0. + // A future improvement to the detector is to call TSan with c and idx: + // this way, Go will continue to not allocating buffer entries for channels + // of elemsize==0, yet the race detector can be made to handle multiple + // sync objects underneath the hood (one sync object per idx) + qp := chanbuf(c, idx) + // When elemsize==0, we don't allocate a full buffer for the channel. + // Instead of individual buffer entries, the race detector uses the + // c.buf as the only buffer entry. This simplification prevents us from + // following the memory model's happens-before rules (rules that are + // implemented in racereleaseacquire). Instead, we accumulate happens-before + // information in the synchronization object associated with c.buf. 
+ if c.elemsize == 0 { + if sg == nil { + raceacquire(qp) + racerelease(qp) + } else { + raceacquireg(sg.g, qp) + racereleaseg(sg.g, qp) + } + } else { + if sg == nil { + racereleaseacquire(qp) + } else { + racereleaseacquireg(sg.g, qp) + } + } +} diff --git a/src/runtime/chan_test.go b/src/runtime/chan_test.go index 039a086e9b..756bbbeccf 100644 --- a/src/runtime/chan_test.go +++ b/src/runtime/chan_test.go @@ -623,6 +623,62 @@ func TestShrinkStackDuringBlockedSend(t *testing.T) { <-done } +func TestNoShrinkStackWhileParking(t *testing.T) { + // The goal of this test is to trigger a "racy sudog adjustment" + // throw. Basically, there's a window between when a goroutine + // becomes available for preemption for stack scanning (and thus, + // stack shrinking) but before the goroutine has fully parked on a + // channel. See issue 40641 for more details on the problem. + // + // The way we try to induce this failure is to set up two + // goroutines: a sender and a reciever that communicate across + // a channel. We try to set up a situation where the sender + // grows its stack temporarily then *fully* blocks on a channel + // often. Meanwhile a GC is triggered so that we try to get a + // mark worker to shrink the sender's stack and race with the + // sender parking. + // + // Unfortunately the race window here is so small that we + // either need a ridiculous number of iterations, or we add + // "usleep(1000)" to park_m, just before the unlockf call. + const n = 10 + send := func(c chan<- int, done chan struct{}) { + for i := 0; i < n; i++ { + c <- i + // Use lots of stack briefly so that + // the GC is going to want to shrink us + // when it scans us. Make sure not to + // do any function calls otherwise + // in order to avoid us shrinking ourselves + // when we're preempted. 
+ stackGrowthRecursive(20) + } + done <- struct{}{} + } + recv := func(c <-chan int, done chan struct{}) { + for i := 0; i < n; i++ { + // Sleep here so that the sender always + // fully blocks. + time.Sleep(10 * time.Microsecond) + <-c + } + done <- struct{}{} + } + for i := 0; i < n*20; i++ { + c := make(chan int) + done := make(chan struct{}) + go recv(c, done) + go send(c, done) + // Wait a little bit before triggering + // the GC to make sure the sender and + // reciever have gotten into their groove. + time.Sleep(50 * time.Microsecond) + runtime.GC() + <-done + <-done + } +} + func TestSelectDuplicateChannel(t *testing.T) { // This test makes sure we can queue a G on // the same channel multiple times. diff --git a/src/runtime/cpuflags_arm64.go b/src/runtime/cpuflags_arm64.go new file mode 100644 index 0000000000..7576bef4a7 --- /dev/null +++ b/src/runtime/cpuflags_arm64.go @@ -0,0 +1,17 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package runtime + +import ( + "internal/cpu" +) + +var arm64UseAlignedLoads bool + +func init() { + if cpu.ARM64.IsNeoverseN1 || cpu.ARM64.IsZeus { + arm64UseAlignedLoads = true + } +} diff --git a/src/runtime/crash_cgo_test.go b/src/runtime/crash_cgo_test.go index 4872189f16..140c170ddc 100644 --- a/src/runtime/crash_cgo_test.go +++ b/src/runtime/crash_cgo_test.go @@ -154,7 +154,7 @@ func TestCgoExecSignalMask(t *testing.T) { case "windows", "plan9": t.Skipf("skipping signal mask test on %s", runtime.GOOS) } - got := runTestProg(t, "testprogcgo", "CgoExecSignalMask") + got := runTestProg(t, "testprogcgo", "CgoExecSignalMask", "GOTRACEBACK=system") want := "OK\n" if got != want { t.Errorf("expected %q, got %v", want, got) @@ -254,6 +254,24 @@ func TestCgoCrashTraceback(t *testing.T) { } } +func TestCgoCrashTracebackGo(t *testing.T) { + t.Parallel() + switch platform := runtime.GOOS + "/" + runtime.GOARCH; platform { + case "darwin/amd64": + case "linux/amd64": + case "linux/ppc64le": + default: + t.Skipf("not yet supported on %s", platform) + } + got := runTestProg(t, "testprogcgo", "CrashTracebackGo") + for i := 1; i <= 3; i++ { + want := fmt.Sprintf("main.h%d", i) + if !strings.Contains(got, want) { + t.Errorf("missing %s", want) + } + } +} + func TestCgoTracebackContext(t *testing.T) { t.Parallel() got := runTestProg(t, "testprogcgo", "TracebackContext") @@ -600,3 +618,16 @@ func TestEINTR(t *testing.T) { t.Fatalf("want %s, got %s\n", want, output) } } + +// Issue #42207. 
+func TestNeedmDeadlock(t *testing.T) { + switch runtime.GOOS { + case "plan9", "windows": + t.Skipf("no signals on %s", runtime.GOOS) + } + output := runTestProg(t, "testprogcgo", "NeedmDeadlock") + want := "OK\n" + if output != want { + t.Fatalf("want %s, got %s\n", want, output) + } +} diff --git a/src/runtime/crash_test.go b/src/runtime/crash_test.go index 34f30c9a37..58ad4f3eba 100644 --- a/src/runtime/crash_test.go +++ b/src/runtime/crash_test.go @@ -9,7 +9,6 @@ import ( "flag" "fmt" "internal/testenv" - "io/ioutil" "os" "os/exec" "path/filepath" @@ -117,7 +116,7 @@ func buildTestProg(t *testing.T, binary string, flags ...string) (string, error) testprog.Lock() defer testprog.Unlock() if testprog.dir == "" { - dir, err := ioutil.TempDir("", "go-build") + dir, err := os.MkdirTemp("", "go-build") if err != nil { t.Fatalf("failed to create temp directory: %v", err) } @@ -181,6 +180,9 @@ func TestCrashHandler(t *testing.T) { } func testDeadlock(t *testing.T, name string) { + // External linking brings in cgo, causing deadlock detection not working. + testenv.MustInternalLink(t) + output := runTestProg(t, "testprog", name) want := "fatal error: all goroutines are asleep - deadlock!\n" if !strings.HasPrefix(output, want) { @@ -205,6 +207,9 @@ func TestLockedDeadlock2(t *testing.T) { } func TestGoexitDeadlock(t *testing.T) { + // External linking brings in cgo, causing deadlock detection not working. + testenv.MustInternalLink(t) + output := runTestProg(t, "testprog", "GoexitDeadlock") want := "no goroutines (main called runtime.Goexit) - deadlock!" if !strings.Contains(output, want) { @@ -290,6 +295,9 @@ func TestRecursivePanic4(t *testing.T) { } func TestGoexitCrash(t *testing.T) { + // External linking brings in cgo, causing deadlock detection not working. + testenv.MustInternalLink(t) + output := runTestProg(t, "testprog", "GoexitExit") want := "no goroutines (main called runtime.Goexit) - deadlock!" 
if !strings.Contains(output, want) { @@ -348,6 +356,9 @@ func TestBreakpoint(t *testing.T) { } func TestGoexitInPanic(t *testing.T) { + // External linking brings in cgo, causing deadlock detection not working. + testenv.MustInternalLink(t) + // see issue 8774: this code used to trigger an infinite recursion output := runTestProg(t, "testprog", "GoexitInPanic") want := "fatal error: no goroutines (main called runtime.Goexit) - deadlock!" @@ -412,6 +423,9 @@ func TestPanicAfterGoexit(t *testing.T) { } func TestRecoveredPanicAfterGoexit(t *testing.T) { + // External linking brings in cgo, causing deadlock detection not working. + testenv.MustInternalLink(t) + output := runTestProg(t, "testprog", "RecoveredPanicAfterGoexit") want := "fatal error: no goroutines (main called runtime.Goexit) - deadlock!" if !strings.HasPrefix(output, want) { @@ -420,6 +434,9 @@ func TestRecoveredPanicAfterGoexit(t *testing.T) { } func TestRecoverBeforePanicAfterGoexit(t *testing.T) { + // External linking brings in cgo, causing deadlock detection not working. + testenv.MustInternalLink(t) + t.Parallel() output := runTestProg(t, "testprog", "RecoverBeforePanicAfterGoexit") want := "fatal error: no goroutines (main called runtime.Goexit) - deadlock!" @@ -429,6 +446,9 @@ func TestRecoverBeforePanicAfterGoexit(t *testing.T) { } func TestRecoverBeforePanicAfterGoexit2(t *testing.T) { + // External linking brings in cgo, causing deadlock detection not working. + testenv.MustInternalLink(t) + t.Parallel() output := runTestProg(t, "testprog", "RecoverBeforePanicAfterGoexit2") want := "fatal error: no goroutines (main called runtime.Goexit) - deadlock!" @@ -667,7 +687,9 @@ func TestBadTraceback(t *testing.T) { } func TestTimePprof(t *testing.T) { - fn := runTestProg(t, "testprog", "TimeProf") + // Pass GOTRACEBACK for issue #41120 to try to get more + // information on timeout. 
+ fn := runTestProg(t, "testprog", "TimeProf", "GOTRACEBACK=crash") fn = strings.TrimSpace(fn) defer os.Remove(fn) diff --git a/src/runtime/crash_unix_test.go b/src/runtime/crash_unix_test.go index 8ef52aba48..803b031873 100644 --- a/src/runtime/crash_unix_test.go +++ b/src/runtime/crash_unix_test.go @@ -10,7 +10,6 @@ import ( "bytes" "internal/testenv" "io" - "io/ioutil" "os" "os/exec" "path/filepath" @@ -70,6 +69,10 @@ func TestCrashDumpsAllThreads(t *testing.T) { t.Skipf("skipping; not supported on %v", runtime.GOOS) } + if runtime.GOOS == "openbsd" && runtime.GOARCH == "mips64" { + t.Skipf("skipping; test fails on %s/%s - see issue #42464", runtime.GOOS, runtime.GOARCH) + } + if runtime.Sigisblocked(int(syscall.SIGQUIT)) { t.Skip("skipping; SIGQUIT is blocked, see golang.org/issue/19196") } @@ -81,13 +84,13 @@ func TestCrashDumpsAllThreads(t *testing.T) { t.Parallel() - dir, err := ioutil.TempDir("", "go-build") + dir, err := os.MkdirTemp("", "go-build") if err != nil { t.Fatalf("failed to create temp directory: %v", err) } defer os.RemoveAll(dir) - if err := ioutil.WriteFile(filepath.Join(dir, "main.go"), []byte(crashDumpsAllThreadsSource), 0666); err != nil { + if err := os.WriteFile(filepath.Join(dir, "main.go"), []byte(crashDumpsAllThreadsSource), 0666); err != nil { t.Fatalf("failed to create Go file: %v", err) } @@ -228,20 +231,27 @@ func TestPanicSystemstack(t *testing.T) { } defer pr.Close() - // Wait for "x\nx\n" to indicate readiness. + // Wait for "x\nx\n" to indicate almost-readiness. buf := make([]byte, 4) _, err = io.ReadFull(pr, buf) if err != nil || string(buf) != "x\nx\n" { t.Fatal("subprocess failed; output:\n", string(buf)) } + // The child blockers print "x\n" and then block on a lock. Receiving + // those bytes only indicates that the child is _about to block_. Since + // we don't have a way to know when it is fully blocked, sleep a bit to + // make us less likely to lose the race and signal before the child + // blocks. 
+ time.Sleep(100 * time.Millisecond) + // Send SIGQUIT. if err := cmd.Process.Signal(syscall.SIGQUIT); err != nil { t.Fatal("signaling subprocess: ", err) } // Get traceback. - tb, err := ioutil.ReadAll(pr) + tb, err := io.ReadAll(pr) if err != nil { t.Fatal("reading traceback from pipe: ", err) } diff --git a/src/runtime/debug.go b/src/runtime/debug.go index 76eeb2e41a..f411b22676 100644 --- a/src/runtime/debug.go +++ b/src/runtime/debug.go @@ -10,9 +10,8 @@ import ( ) // GOMAXPROCS sets the maximum number of CPUs that can be executing -// simultaneously and returns the previous setting. If n < 1, it does not -// change the current setting. -// The number of logical CPUs on the local machine can be queried with NumCPU. +// simultaneously and returns the previous setting. It defaults to +// the value of runtime.NumCPU. If n < 1, it does not change the current setting. // This call will go away when the scheduler improves. func GOMAXPROCS(n int) int { if GOARCH == "wasm" && n > 1 { diff --git a/src/runtime/debug/garbage.go b/src/runtime/debug/garbage.go index 785e9d4598..00f92c3ddf 100644 --- a/src/runtime/debug/garbage.go +++ b/src/runtime/debug/garbage.go @@ -106,6 +106,8 @@ func FreeOSMemory() { // the program crashes. // SetMaxStack returns the previous setting. // The initial setting is 1 GB on 64-bit systems, 250 MB on 32-bit systems. +// There may be a system-imposed maximum stack limit regardless +// of the value provided to SetMaxStack. // // SetMaxStack is useful mainly for limiting the damage done by // goroutines that enter an infinite recursion. It only limits future @@ -139,6 +141,11 @@ func SetMaxThreads(threads int) int { // manipulation of memory may cause faults at non-nil addresses in less // dramatic situations; SetPanicOnFault allows such programs to request // that the runtime trigger only a panic, not a crash. 
+// The runtime.Error that the runtime panics with may have an additional method: +// Addr() uintptr +// If that method exists, it returns the memory address which triggered the fault. +// The results of Addr are best-effort and the veracity of the result +// may depend on the platform. // SetPanicOnFault applies only to the current goroutine. // It returns the previous setting. func SetPanicOnFault(enabled bool) bool { diff --git a/src/runtime/debug/heapdump_test.go b/src/runtime/debug/heapdump_test.go index de1ec27d21..768934d05d 100644 --- a/src/runtime/debug/heapdump_test.go +++ b/src/runtime/debug/heapdump_test.go @@ -5,7 +5,6 @@ package debug_test import ( - "io/ioutil" "os" "runtime" . "runtime/debug" @@ -16,7 +15,7 @@ func TestWriteHeapDumpNonempty(t *testing.T) { if runtime.GOOS == "js" { t.Skipf("WriteHeapDump is not available on %s.", runtime.GOOS) } - f, err := ioutil.TempFile("", "heapdumptest") + f, err := os.CreateTemp("", "heapdumptest") if err != nil { t.Fatalf("TempFile failed: %v", err) } @@ -45,7 +44,7 @@ func TestWriteHeapDumpFinalizers(t *testing.T) { if runtime.GOOS == "js" { t.Skipf("WriteHeapDump is not available on %s.", runtime.GOOS) } - f, err := ioutil.TempFile("", "heapdumptest") + f, err := os.CreateTemp("", "heapdumptest") if err != nil { t.Fatalf("TempFile failed: %v", err) } diff --git a/src/runtime/debug/panic_test.go b/src/runtime/debug/panic_test.go new file mode 100644 index 0000000000..b67a3de4f9 --- /dev/null +++ b/src/runtime/debug/panic_test.go @@ -0,0 +1,53 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build aix darwin dragonfly freebsd linux netbsd openbsd + +// TODO: test on Windows? 
+ +package debug_test + +import ( + "runtime" + "runtime/debug" + "syscall" + "testing" + "unsafe" +) + +func TestPanicOnFault(t *testing.T) { + if runtime.GOARCH == "s390x" { + t.Skip("s390x fault addresses are missing the low order bits") + } + if runtime.GOOS == "ios" { + t.Skip("iOS doesn't provide fault addresses") + } + m, err := syscall.Mmap(-1, 0, 0x1000, syscall.PROT_READ /* Note: no PROT_WRITE */, syscall.MAP_SHARED|syscall.MAP_ANON) + if err != nil { + t.Fatalf("can't map anonymous memory: %s", err) + } + defer syscall.Munmap(m) + old := debug.SetPanicOnFault(true) + defer debug.SetPanicOnFault(old) + const lowBits = 0x3e7 + defer func() { + r := recover() + if r == nil { + t.Fatalf("write did not fault") + } + type addressable interface { + Addr() uintptr + } + a, ok := r.(addressable) + if !ok { + t.Fatalf("fault does not contain address") + } + want := uintptr(unsafe.Pointer(&m[lowBits])) + got := a.Addr() + if got != want { + t.Fatalf("fault address %x, want %x", got, want) + } + }() + m[lowBits] = 1 // will fault +} diff --git a/src/runtime/debug_test.go b/src/runtime/debug_test.go index 722e81121f..a0b3f84382 100644 --- a/src/runtime/debug_test.go +++ b/src/runtime/debug_test.go @@ -17,7 +17,7 @@ package runtime_test import ( "fmt" - "io/ioutil" + "os" "regexp" "runtime" "runtime/debug" @@ -95,7 +95,7 @@ func debugCallTKill(tid int) error { // Linux-specific. func skipUnderDebugger(t *testing.T) { pid := syscall.Getpid() - status, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/status", pid)) + status, err := os.ReadFile(fmt.Sprintf("/proc/%d/status", pid)) if err != nil { t.Logf("couldn't get proc tracer: %s", err) return diff --git a/src/runtime/debugcall.go b/src/runtime/debugcall.go index b5480c73ae..efc68a767d 100644 --- a/src/runtime/debugcall.go +++ b/src/runtime/debugcall.go @@ -76,32 +76,14 @@ func debugCallCheck(pc uintptr) string { return } - if !go115ReduceLiveness { - // Look up PC's register map. 
- pcdata := int32(-1) - if pc != f.entry { - pc-- - pcdata = pcdatavalue(f, _PCDATA_RegMapIndex, pc, nil) - } - if pcdata == -1 { - pcdata = 0 // in prologue - } - stkmap := (*stackmap)(funcdata(f, _FUNCDATA_RegPointerMaps)) - if pcdata == _PCDATA_RegMapUnsafe || stkmap == nil { - // Not at a safe point. - ret = debugCallUnsafePoint - return - } - } else { - // Check that this isn't an unsafe-point. - if pc != f.entry { - pc-- - } - up := pcdatavalue(f, _PCDATA_UnsafePoint, pc, nil) - if up != _PCDATA_UnsafePointSafe { - // Not at a safe point. - ret = debugCallUnsafePoint - } + // Check that this isn't an unsafe-point. + if pc != f.entry { + pc-- + } + up := pcdatavalue(f, _PCDATA_UnsafePoint, pc, nil) + if up != _PCDATA_UnsafePointSafe { + // Not at a safe point. + ret = debugCallUnsafePoint } }) return ret diff --git a/src/runtime/defs_darwin_arm64.go b/src/runtime/defs_darwin_arm64.go index 2f466045d4..9076e8bd54 100644 --- a/src/runtime/defs_darwin_arm64.go +++ b/src/runtime/defs_darwin_arm64.go @@ -94,6 +94,8 @@ const ( _PTHREAD_CREATE_DETACHED = 0x2 + _PTHREAD_KEYS_MAX = 512 + _F_SETFD = 0x2 _F_GETFL = 0x3 _F_SETFL = 0x4 @@ -233,3 +235,5 @@ type machTimebaseInfo struct { numer uint32 denom uint32 } + +type pthreadkey uint64 diff --git a/src/runtime/defs_openbsd.go b/src/runtime/defs_openbsd.go index 4774e36c92..53e9d59a3c 100644 --- a/src/runtime/defs_openbsd.go +++ b/src/runtime/defs_openbsd.go @@ -7,8 +7,11 @@ /* Input to cgo. 
-GOARCH=amd64 go tool cgo -cdefs defs_openbsd.go >defs_openbsd_amd64.h -GOARCH=386 go tool cgo -cdefs defs_openbsd.go >defs_openbsd_386.h +GOARCH=amd64 go tool cgo -godefs defs_openbsd.go +GOARCH=386 go tool cgo -godefs defs_openbsd.go +GOARCH=arm go tool cgo -godefs defs_openbsd.go +GOARCH=arm64 go tool cgo -godefs defs_openbsd.go +GOARCH=mips64 go tool cgo -godefs defs_openbsd.go */ package runtime @@ -21,6 +24,7 @@ package runtime #include <sys/unistd.h> #include <sys/signal.h> #include <errno.h> +#include <fcntl.h> #include <signal.h> */ import "C" diff --git a/src/runtime/defs_openbsd_mips64.go b/src/runtime/defs_openbsd_mips64.go new file mode 100644 index 0000000000..28d70b7a01 --- /dev/null +++ b/src/runtime/defs_openbsd_mips64.go @@ -0,0 +1,167 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Generated from: +// +// GOARCH=mips64 go tool cgo -godefs defs_openbsd.go +// +// Then converted to the form used by the runtime. 
+ +package runtime + +import "unsafe" + +const ( + _EINTR = 0x4 + _EFAULT = 0xe + _EAGAIN = 0x23 + _ENOSYS = 0x4e + + _O_NONBLOCK = 0x4 + _O_CLOEXEC = 0x10000 + + _PROT_NONE = 0x0 + _PROT_READ = 0x1 + _PROT_WRITE = 0x2 + _PROT_EXEC = 0x4 + + _MAP_ANON = 0x1000 + _MAP_PRIVATE = 0x2 + _MAP_FIXED = 0x10 + _MAP_STACK = 0x4000 + + _MADV_FREE = 0x6 + + _SA_SIGINFO = 0x40 + _SA_RESTART = 0x2 + _SA_ONSTACK = 0x1 + + _SIGHUP = 0x1 + _SIGINT = 0x2 + _SIGQUIT = 0x3 + _SIGILL = 0x4 + _SIGTRAP = 0x5 + _SIGABRT = 0x6 + _SIGEMT = 0x7 + _SIGFPE = 0x8 + _SIGKILL = 0x9 + _SIGBUS = 0xa + _SIGSEGV = 0xb + _SIGSYS = 0xc + _SIGPIPE = 0xd + _SIGALRM = 0xe + _SIGTERM = 0xf + _SIGURG = 0x10 + _SIGSTOP = 0x11 + _SIGTSTP = 0x12 + _SIGCONT = 0x13 + _SIGCHLD = 0x14 + _SIGTTIN = 0x15 + _SIGTTOU = 0x16 + _SIGIO = 0x17 + _SIGXCPU = 0x18 + _SIGXFSZ = 0x19 + _SIGVTALRM = 0x1a + _SIGPROF = 0x1b + _SIGWINCH = 0x1c + _SIGINFO = 0x1d + _SIGUSR1 = 0x1e + _SIGUSR2 = 0x1f + + _FPE_INTDIV = 0x1 + _FPE_INTOVF = 0x2 + _FPE_FLTDIV = 0x3 + _FPE_FLTOVF = 0x4 + _FPE_FLTUND = 0x5 + _FPE_FLTRES = 0x6 + _FPE_FLTINV = 0x7 + _FPE_FLTSUB = 0x8 + + _BUS_ADRALN = 0x1 + _BUS_ADRERR = 0x2 + _BUS_OBJERR = 0x3 + + _SEGV_MAPERR = 0x1 + _SEGV_ACCERR = 0x2 + + _ITIMER_REAL = 0x0 + _ITIMER_VIRTUAL = 0x1 + _ITIMER_PROF = 0x2 + + _EV_ADD = 0x1 + _EV_DELETE = 0x2 + _EV_CLEAR = 0x20 + _EV_ERROR = 0x4000 + _EV_EOF = 0x8000 + _EVFILT_READ = -0x1 + _EVFILT_WRITE = -0x2 +) + +type tforkt struct { + tf_tcb unsafe.Pointer + tf_tid *int32 + tf_stack uintptr +} + +type sigcontext struct { + sc_cookie uint64 + sc_mask uint64 + sc_pc uint64 + sc_regs [32]uint64 + mullo uint64 + mulhi uint64 + sc_fpregs [33]uint64 + sc_fpused uint64 + sc_fpc_eir uint64 + _xxx [8]int64 +} + +type siginfo struct { + si_signo int32 + si_code int32 + si_errno int32 + pad_cgo_0 [4]byte + _data [120]byte +} + +type stackt struct { + ss_sp uintptr + ss_size uintptr + ss_flags int32 + pad_cgo_0 [4]byte +} + +type timespec struct { + tv_sec int64 + tv_nsec int64 +} + 
+//go:nosplit +func (ts *timespec) setNsec(ns int64) { + ts.tv_sec = ns / 1e9 + ts.tv_nsec = ns % 1e9 +} + +type timeval struct { + tv_sec int64 + tv_usec int64 +} + +func (tv *timeval) set_usec(x int32) { + tv.tv_usec = int64(x) +} + +type itimerval struct { + it_interval timeval + it_value timeval +} + +type keventt struct { + ident uint64 + filter int16 + flags uint16 + fflags uint32 + data int64 + udata *byte +} diff --git a/src/runtime/duff_amd64.s b/src/runtime/duff_amd64.s index 44dc75d297..2ff5bf6dbc 100644 --- a/src/runtime/duff_amd64.s +++ b/src/runtime/duff_amd64.s @@ -4,7 +4,7 @@ #include "textflag.h" -TEXT runtime·duffzero(SB), NOSPLIT, $0-0 +TEXT runtime·duffzero<ABIInternal>(SB), NOSPLIT, $0-0 MOVUPS X0,(DI) MOVUPS X0,16(DI) MOVUPS X0,32(DI) @@ -103,7 +103,7 @@ TEXT runtime·duffzero(SB), NOSPLIT, $0-0 RET -TEXT runtime·duffcopy(SB), NOSPLIT, $0-0 +TEXT runtime·duffcopy<ABIInternal>(SB), NOSPLIT, $0-0 MOVUPS (SI), X0 ADDQ $16, SI MOVUPS X0, (DI) diff --git a/src/runtime/duff_riscv64.s b/src/runtime/duff_riscv64.s new file mode 100644 index 0000000000..f7bd3f326e --- /dev/null +++ b/src/runtime/duff_riscv64.s @@ -0,0 +1,907 @@ +// Code generated by mkduff.go; DO NOT EDIT. +// Run go generate from src/runtime to update. +// See mkduff.go for comments. 
+ +#include "textflag.h" + +TEXT runtime·duffzero(SB), NOSPLIT|NOFRAME, $0-0 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + 
MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) 
+ ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + MOV ZERO, (X10) + ADD $8, X10 + RET + +TEXT runtime·duffcopy(SB), NOSPLIT|NOFRAME, $0-0 + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV 
X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + 
ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + 
MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 
+ ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + MOV (X10), X31 + ADD $8, X10 + MOV X31, (X11) + ADD $8, X11 + + RET diff --git a/src/runtime/env_plan9.go b/src/runtime/env_plan9.go index b7ea863735..f1ac4760a7 100644 --- a/src/runtime/env_plan9.go +++ b/src/runtime/env_plan9.go @@ -23,8 +23,8 @@ const ( // to the (possibly shared) Plan 9 environment, so that Setenv and Getenv // conform to the same Posix semantics as on other operating systems. // For Plan 9 shared environment semantics, instead of Getenv(key) and -// Setenv(key, value), one can use ioutil.ReadFile("/env/" + key) and -// ioutil.WriteFile("/env/" + key, value, 0666) respectively. +// Setenv(key, value), one can use os.ReadFile("/env/" + key) and +// os.WriteFile("/env/" + key, value, 0666) respectively. //go:nosplit func goenvs() { buf := make([]byte, envBufSize) diff --git a/src/runtime/error.go b/src/runtime/error.go index 386569bead..9e6cdf35dd 100644 --- a/src/runtime/error.go +++ b/src/runtime/error.go @@ -77,6 +77,26 @@ func (e errorString) Error() string { return "runtime error: " + string(e) } +type errorAddressString struct { + msg string // error message + addr uintptr // memory address where the error occurred +} + +func (e errorAddressString) RuntimeError() {} + +func (e errorAddressString) Error() string { + return "runtime error: " + e.msg +} + +// Addr returns the memory address where a fault occurred. +// The address provided is best-effort. +// The veracity of the result may depend on the platform. +// Errors providing this method will only be returned as +// a result of using runtime/debug.SetPanicOnFault. 
+func (e errorAddressString) Addr() uintptr { + return e.addr +} + // plainError represents a runtime error described a string without // the prefix "runtime error: " after invoking errorString.Error(). // See Issue #14965. diff --git a/src/runtime/export_pipe2_test.go b/src/runtime/export_pipe2_test.go new file mode 100644 index 0000000000..9d580d3313 --- /dev/null +++ b/src/runtime/export_pipe2_test.go @@ -0,0 +1,15 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build freebsd linux netbsd openbsd solaris + +package runtime + +func Pipe() (r, w int32, errno int32) { + r, w, errno = pipe2(0) + if errno == _ENOSYS { + return pipe() + } + return r, w, errno +} diff --git a/src/runtime/export_pipe_test.go b/src/runtime/export_pipe_test.go new file mode 100644 index 0000000000..8f66770fb9 --- /dev/null +++ b/src/runtime/export_pipe_test.go @@ -0,0 +1,9 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build aix darwin dragonfly + +package runtime + +var Pipe = pipe diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index 3307000c51..44551dcaf1 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -298,6 +298,32 @@ func (p *ProfBuf) Close() { (*profBuf)(p).close() } +func ReadMetricsSlow(memStats *MemStats, samplesp unsafe.Pointer, len, cap int) { + stopTheWorld("ReadMetricsSlow") + + // Initialize the metrics beforehand because this could + // allocate and skew the stats. + semacquire(&metricsSema) + initMetrics() + semrelease(&metricsSema) + + systemstack(func() { + // Read memstats first. It's going to flush + // the mcaches which readMetrics does not do, so + // going the other way around may result in + // inconsistent statistics. 
+ readmemstats_m(memStats) + }) + + // Read metrics off the system stack. + // + // The only part of readMetrics that could allocate + // and skew the stats is initMetrics. + readMetrics(samplesp, len, cap) + + startTheWorld() +} + // ReadMemStatsSlow returns both the runtime-computed MemStats and // MemStats accumulated by scanning the heap. func ReadMemStatsSlow() (base, slow MemStats) { @@ -337,20 +363,22 @@ func ReadMemStatsSlow() (base, slow MemStats) { } } - // Add in frees. readmemstats_m flushed the cached stats, so - // these are up-to-date. + // Add in frees by just reading the stats for those directly. + var m heapStatsDelta + memstats.heapStats.unsafeRead(&m) + + // Collect per-sizeclass free stats. var smallFree uint64 - slow.Frees = mheap_.nlargefree - for i := range mheap_.nsmallfree { - slow.Frees += mheap_.nsmallfree[i] - bySize[i].Frees = mheap_.nsmallfree[i] - bySize[i].Mallocs += mheap_.nsmallfree[i] - smallFree += mheap_.nsmallfree[i] * uint64(class_to_size[i]) + for i := 0; i < _NumSizeClasses; i++ { + slow.Frees += uint64(m.smallFreeCount[i]) + bySize[i].Frees += uint64(m.smallFreeCount[i]) + bySize[i].Mallocs += uint64(m.smallFreeCount[i]) + smallFree += uint64(m.smallFreeCount[i]) * uint64(class_to_size[i]) } - slow.Frees += memstats.tinyallocs + slow.Frees += memstats.tinyallocs + uint64(m.largeFreeCount) slow.Mallocs += slow.Frees - slow.TotalAlloc = slow.Alloc + mheap_.largefree + smallFree + slow.TotalAlloc = slow.Alloc + uint64(m.largeFree) + smallFree for i := range slow.BySize { slow.BySize[i].Mallocs = bySize[i].Mallocs @@ -358,7 +386,11 @@ func ReadMemStatsSlow() (base, slow MemStats) { } for i := mheap_.pages.start; i < mheap_.pages.end; i++ { - pg := mheap_.pages.chunkOf(i).scavenged.popcntRange(0, pallocChunkPages) + chunk := mheap_.pages.tryChunkOf(i) + if chunk == nil { + continue + } + pg := chunk.scavenged.popcntRange(0, pallocChunkPages) slow.HeapReleased += uint64(pg) * pageSize } for _, p := range allp { @@ -711,7 +743,16 
@@ func (c *PageCache) Alloc(npages uintptr) (uintptr, uintptr) { return (*pageCache)(c).alloc(npages) } func (c *PageCache) Flush(s *PageAlloc) { - (*pageCache)(c).flush((*pageAlloc)(s)) + cp := (*pageCache)(c) + sp := (*pageAlloc)(s) + + systemstack(func() { + // None of the tests need any higher-level locking, so we just + // take the lock internally. + lock(sp.mheapLock) + cp.flush(sp) + unlock(sp.mheapLock) + }) } // Expose chunk index type. @@ -722,13 +763,41 @@ type ChunkIdx chunkIdx type PageAlloc pageAlloc func (p *PageAlloc) Alloc(npages uintptr) (uintptr, uintptr) { - return (*pageAlloc)(p).alloc(npages) + pp := (*pageAlloc)(p) + + var addr, scav uintptr + systemstack(func() { + // None of the tests need any higher-level locking, so we just + // take the lock internally. + lock(pp.mheapLock) + addr, scav = pp.alloc(npages) + unlock(pp.mheapLock) + }) + return addr, scav } func (p *PageAlloc) AllocToCache() PageCache { - return PageCache((*pageAlloc)(p).allocToCache()) + pp := (*pageAlloc)(p) + + var c PageCache + systemstack(func() { + // None of the tests need any higher-level locking, so we just + // take the lock internally. + lock(pp.mheapLock) + c = PageCache(pp.allocToCache()) + unlock(pp.mheapLock) + }) + return c } func (p *PageAlloc) Free(base, npages uintptr) { - (*pageAlloc)(p).free(base, npages) + pp := (*pageAlloc)(p) + + systemstack(func() { + // None of the tests need any higher-level locking, so we just + // take the lock internally. + lock(pp.mheapLock) + pp.free(base, npages) + unlock(pp.mheapLock) + }) } func (p *PageAlloc) Bounds() (ChunkIdx, ChunkIdx) { return ChunkIdx((*pageAlloc)(p).start), ChunkIdx((*pageAlloc)(p).end) @@ -736,6 +805,8 @@ func (p *PageAlloc) Bounds() (ChunkIdx, ChunkIdx) { func (p *PageAlloc) Scavenge(nbytes uintptr, mayUnlock bool) (r uintptr) { pp := (*pageAlloc)(p) systemstack(func() { + // None of the tests need any higher-level locking, so we just + // take the lock internally. 
lock(pp.mheapLock) r = pp.scavenge(nbytes, mayUnlock) unlock(pp.mheapLock) @@ -745,10 +816,7 @@ func (p *PageAlloc) Scavenge(nbytes uintptr, mayUnlock bool) (r uintptr) { func (p *PageAlloc) InUse() []AddrRange { ranges := make([]AddrRange, 0, len(p.inUse.ranges)) for _, r := range p.inUse.ranges { - ranges = append(ranges, AddrRange{ - Base: r.base.addr(), - Limit: r.limit.addr(), - }) + ranges = append(ranges, AddrRange{r}) } return ranges } @@ -756,17 +824,114 @@ func (p *PageAlloc) InUse() []AddrRange { // Returns nil if the PallocData's L2 is missing. func (p *PageAlloc) PallocData(i ChunkIdx) *PallocData { ci := chunkIdx(i) - l2 := (*pageAlloc)(p).chunks[ci.l1()] - if l2 == nil { - return nil - } - return (*PallocData)(&l2[ci.l2()]) + return (*PallocData)((*pageAlloc)(p).tryChunkOf(ci)) } -// AddrRange represents a range over addresses. -// Specifically, it represents the range [Base, Limit). +// AddrRange is a wrapper around addrRange for testing. type AddrRange struct { - Base, Limit uintptr + addrRange +} + +// MakeAddrRange creates a new address range. +func MakeAddrRange(base, limit uintptr) AddrRange { + return AddrRange{makeAddrRange(base, limit)} +} + +// Base returns the virtual base address of the address range. +func (a AddrRange) Base() uintptr { + return a.addrRange.base.addr() +} + +// Base returns the virtual address of the limit of the address range. +func (a AddrRange) Limit() uintptr { + return a.addrRange.limit.addr() +} + +// Equals returns true if the two address ranges are exactly equal. +func (a AddrRange) Equals(b AddrRange) bool { + return a == b +} + +// Size returns the size in bytes of the address range. +func (a AddrRange) Size() uintptr { + return a.addrRange.size() +} + +// AddrRanges is a wrapper around addrRanges for testing. +type AddrRanges struct { + addrRanges + mutable bool +} + +// NewAddrRanges creates a new empty addrRanges. 
+// +// Note that this initializes addrRanges just like in the +// runtime, so its memory is persistentalloc'd. Call this +// function sparingly since the memory it allocates is +// leaked. +// +// This AddrRanges is mutable, so we can test methods like +// Add. +func NewAddrRanges() AddrRanges { + r := addrRanges{} + r.init(new(sysMemStat)) + return AddrRanges{r, true} +} + +// MakeAddrRanges creates a new addrRanges populated with +// the ranges in a. +// +// The returned AddrRanges is immutable, so methods like +// Add will fail. +func MakeAddrRanges(a ...AddrRange) AddrRanges { + // Methods that manipulate the backing store of addrRanges.ranges should + // not be used on the result from this function (e.g. add) since they may + // trigger reallocation. That would normally be fine, except the new + // backing store won't come from the heap, but from persistentalloc, so + // we'll leak some memory implicitly. + ranges := make([]addrRange, 0, len(a)) + total := uintptr(0) + for _, r := range a { + ranges = append(ranges, r.addrRange) + total += r.Size() + } + return AddrRanges{addrRanges{ + ranges: ranges, + totalBytes: total, + sysStat: new(sysMemStat), + }, false} +} + +// Ranges returns a copy of the ranges described by the +// addrRanges. +func (a *AddrRanges) Ranges() []AddrRange { + result := make([]AddrRange, 0, len(a.addrRanges.ranges)) + for _, r := range a.addrRanges.ranges { + result = append(result, AddrRange{r}) + } + return result +} + +// FindSucc returns the successor to base. See addrRanges.findSucc +// for more details. +func (a *AddrRanges) FindSucc(base uintptr) int { + return a.findSucc(base) +} + +// Add adds a new AddrRange to the AddrRanges. +// +// The AddrRange must be mutable (i.e. created by NewAddrRanges), +// otherwise this method will throw. 
+func (a *AddrRanges) Add(r AddrRange) { + if !a.mutable { + throw("attempt to mutate immutable AddrRanges") + } + a.add(r.addrRange) +} + +// TotalBytes returns the totalBytes field of the addrRanges. +func (a *AddrRanges) TotalBytes() uintptr { + return a.addrRanges.totalBytes } // BitRange represents a range over a bitmap. @@ -800,7 +965,11 @@ func NewPageAlloc(chunks, scav map[ChunkIdx][]BitRange) *PageAlloc { addr := chunkBase(chunkIdx(i)) // Mark the chunk's existence in the pageAlloc. - p.grow(addr, pallocChunkBytes) + systemstack(func() { + lock(p.mheapLock) + p.grow(addr, pallocChunkBytes) + unlock(p.mheapLock) + }) // Initialize the bitmap and update pageAlloc metadata. chunk := p.chunkOf(chunkIndex(addr)) @@ -831,13 +1000,19 @@ func NewPageAlloc(chunks, scav map[ChunkIdx][]BitRange) *PageAlloc { } // Update heap metadata for the allocRange calls above. - p.update(addr, pallocChunkPages, false, false) + systemstack(func() { + lock(p.mheapLock) + p.update(addr, pallocChunkPages, false, false) + unlock(p.mheapLock) + }) } + systemstack(func() { lock(p.mheapLock) p.scavengeStartGen() unlock(p.mheapLock) }) + return (*PageAlloc)(p) } @@ -900,7 +1075,10 @@ func CheckScavengedBitsCleared(mismatches []BitsMismatch) (n int, ok bool) { lock(&mheap_.lock) chunkLoop: for i := mheap_.pages.start; i < mheap_.pages.end; i++ { - chunk := mheap_.pages.chunkOf(i) + chunk := mheap_.pages.tryChunkOf(i) + if chunk == nil { + continue + } for j := 0; j < pallocChunkPages/64; j++ { // Run over each 64-bit bitmap section and ensure // scavenged is being cleared properly on allocation. @@ -980,9 +1158,59 @@ func MapHashCheck(m interface{}, k interface{}) (uintptr, uintptr) { return x, y } -func MSpanCountAlloc(bits []byte) int { - s := (*mspan)(mheap_.spanalloc.alloc()) +// mspan wrapper for testing. +//go:notinheap +type MSpan mspan + +// Allocate an mspan for testing. 
+func AllocMSpan() *MSpan { + var s *mspan + systemstack(func() { + lock(&mheap_.lock) + s = (*mspan)(mheap_.spanalloc.alloc()) + unlock(&mheap_.lock) + }) + return (*MSpan)(s) +} + +// Free an allocated mspan. +func FreeMSpan(s *MSpan) { + systemstack(func() { + lock(&mheap_.lock) + mheap_.spanalloc.free(unsafe.Pointer(s)) + unlock(&mheap_.lock) + }) +} + +func MSpanCountAlloc(ms *MSpan, bits []byte) int { + s := (*mspan)(ms) s.nelems = uintptr(len(bits) * 8) s.gcmarkBits = (*gcBits)(unsafe.Pointer(&bits[0])) - return s.countAlloc() + result := s.countAlloc() + s.gcmarkBits = nil + return result +} + +const ( + TimeHistSubBucketBits = timeHistSubBucketBits + TimeHistNumSubBuckets = timeHistNumSubBuckets + TimeHistNumSuperBuckets = timeHistNumSuperBuckets +) + +type TimeHistogram timeHistogram + +// Counts returns the counts for the given bucket, subBucket indices. +// Returns true if the bucket was valid, otherwise returns the counts +// for the overflow bucket and false. +func (th *TimeHistogram) Count(bucket, subBucket uint) (uint64, bool) { + t := (*timeHistogram)(th) + i := bucket*TimeHistNumSubBuckets + subBucket + if i >= uint(len(t.counts)) { + return t.overflow, false + } + return t.counts[i], true +} + +func (th *TimeHistogram) Record(duration int64) { + (*timeHistogram)(th).record(duration) } diff --git a/src/runtime/export_unix_test.go b/src/runtime/export_unix_test.go index 621488eaba..307c63fd68 100644 --- a/src/runtime/export_unix_test.go +++ b/src/runtime/export_unix_test.go @@ -9,7 +9,6 @@ package runtime import "unsafe" var NonblockingPipe = nonblockingPipe -var Pipe = pipe var SetNonblock = setNonblock var Closeonexec = closeonexec diff --git a/src/runtime/extern.go b/src/runtime/extern.go index 7316503ed2..dacdf4f383 100644 --- a/src/runtime/extern.go +++ b/src/runtime/extern.go @@ -78,10 +78,23 @@ It is a comma-separated list of name=val pairs setting these named variables: If the line ends with "(forced)", this GC was forced by a runtime.GC() 
call. - madvdontneed: setting madvdontneed=1 will use MADV_DONTNEED - instead of MADV_FREE on Linux when returning memory to the - kernel. This is less efficient, but causes RSS numbers to drop - more quickly. + inittrace: setting inittrace=1 causes the runtime to emit a single line to standard + error for each package with init work, summarizing the execution time and memory + allocation. No information is printed for inits executed as part of plugin loading + and for packages without both user defined and compiler generated init work. + The format of this line is subject to change. Currently, it is: + init # @#ms, # ms clock, # bytes, # allocs + where the fields are as follows: + init # the package name + @# ms time in milliseconds when the init started since program start + # clock wall-clock time for package initialization work + # bytes memory allocated on the heap + # allocs number of heap allocations + + madvdontneed: setting madvdontneed=0 will use MADV_FREE + instead of MADV_DONTNEED on Linux when returning memory to the + kernel. This is more efficient, but means RSS numbers will + drop only when the OS is under memory pressure. memprofilerate: setting memprofilerate=X will update the value of runtime.MemProfileRate. When set to 0 memory profiling is disabled. Refer to the description of diff --git a/src/runtime/funcdata.h b/src/runtime/funcdata.h index 0fb50ddfba..798dbaceab 100644 --- a/src/runtime/funcdata.h +++ b/src/runtime/funcdata.h @@ -8,16 +8,15 @@ // // These must agree with symtab.go and ../cmd/internal/objabi/funcdata.go. 
-#define PCDATA_RegMapIndex 0 +#define PCDATA_UnsafePoint 0 #define PCDATA_StackMapIndex 1 #define PCDATA_InlTreeIndex 2 #define FUNCDATA_ArgsPointerMaps 0 /* garbage collector blocks */ #define FUNCDATA_LocalsPointerMaps 1 -#define FUNCDATA_RegPointerMaps 2 -#define FUNCDATA_StackObjects 3 -#define FUNCDATA_InlTree 4 -#define FUNCDATA_OpenCodedDeferInfo 5 /* info for func with open-coded defers */ +#define FUNCDATA_StackObjects 2 +#define FUNCDATA_InlTree 3 +#define FUNCDATA_OpenCodedDeferInfo 4 /* info for func with open-coded defers */ // Pseudo-assembly statements. @@ -32,9 +31,9 @@ // defines the pointer map for the function's arguments. // GO_ARGS should be the first instruction in a function that uses it. // It can be omitted if there are no arguments at all. -// GO_ARGS is inserted implicitly by the linker for any function -// that also has a Go prototype and therefore is usually not necessary -// to write explicitly. +// GO_ARGS is inserted implicitly by the linker for any function whose +// name starts with a middle-dot and that also has a Go prototype; it +// is therefore usually not necessary to write explicitly. #define GO_ARGS FUNCDATA $FUNCDATA_ArgsPointerMaps, go_args_stackmap(SB) // GO_RESULTS_INITIALIZED indicates that the assembly function diff --git a/src/runtime/gc_test.go b/src/runtime/gc_test.go index c5c8a4cecf..7870f31ae9 100644 --- a/src/runtime/gc_test.go +++ b/src/runtime/gc_test.go @@ -518,7 +518,7 @@ func BenchmarkReadMemStats(b *testing.B) { hugeSink = nil } -func BenchmarkReadMemStatsLatency(b *testing.B) { +func applyGCLoad(b *testing.B) func() { // We’ll apply load to the runtime with maxProcs-1 goroutines // and use one more to actually benchmark. 
It doesn't make sense // to try to run this test with only 1 P (that's what @@ -563,6 +563,14 @@ func BenchmarkReadMemStatsLatency(b *testing.B) { runtime.KeepAlive(hold) }() } + return func() { + close(done) + wg.Wait() + } +} + +func BenchmarkReadMemStatsLatency(b *testing.B) { + stop := applyGCLoad(b) // Spend this much time measuring latencies. latencies := make([]time.Duration, 0, 1024) @@ -579,12 +587,11 @@ func BenchmarkReadMemStatsLatency(b *testing.B) { runtime.ReadMemStats(&ms) latencies = append(latencies, time.Now().Sub(start)) } - close(done) - // Make sure to stop the timer before we wait! The goroutines above - // are very heavy-weight and not easy to stop, so we could end up + // Make sure to stop the timer before we wait! The load created above + // is very heavy-weight and not easy to stop, so we could end up // confusing the benchmarking framework for small b.N. b.StopTimer() - wg.Wait() + stop() // Disable the default */op metrics. // ns/op doesn't mean anything because it's an average, but we @@ -763,6 +770,10 @@ func BenchmarkScanStackNoLocals(b *testing.B) { } func BenchmarkMSpanCountAlloc(b *testing.B) { + // Allocate one dummy mspan for the whole benchmark. + s := runtime.AllocMSpan() + defer runtime.FreeMSpan(s) + // n is the number of bytes to benchmark against. // n must always be a multiple of 8, since gcBits is // always rounded up 8 bytes. @@ -774,7 +785,7 @@ func BenchmarkMSpanCountAlloc(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - runtime.MSpanCountAlloc(bits) + runtime.MSpanCountAlloc(s, bits) } }) } diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go index 4c35309211..2d531571aa 100644 --- a/src/runtime/heapdump.go +++ b/src/runtime/heapdump.go @@ -20,8 +20,19 @@ import ( func runtime_debug_WriteHeapDump(fd uintptr) { stopTheWorld("write heap dump") + // Keep m on this G's stack instead of the system stack. 
+ // Both readmemstats_m and writeheapdump_m have pretty large + // peak stack depths and we risk blowing the system stack. + // This is safe because the world is stopped, so we don't + // need to worry about anyone shrinking and therefore moving + // our stack. + var m MemStats systemstack(func() { - writeheapdump_m(fd) + // Call readmemstats_m here instead of deeper in + // writeheapdump_m because we might blow the system stack + // otherwise. + readmemstats_m(&m) + writeheapdump_m(fd, &m) }) startTheWorld() @@ -420,6 +431,9 @@ func finq_callback(fn *funcval, obj unsafe.Pointer, nret uintptr, fint *_type, o } func dumproots() { + // To protect mheap_.allspans. + assertWorldStopped() + // TODO(mwhudson): dump datamask etc from all objects // data segment dumpint(tagData) @@ -457,6 +471,9 @@ func dumproots() { var freemark [_PageSize / 8]bool func dumpobjs() { + // To protect mheap_.allspans. + assertWorldStopped() + for _, s := range mheap_.allspans { if s.state.get() != mSpanInUse { continue @@ -539,36 +556,42 @@ func dumpms() { } } -func dumpmemstats() { +//go:systemstack +func dumpmemstats(m *MemStats) { + assertWorldStopped() + + // These ints should be identical to the exported + // MemStats structure and should be ordered the same + // way too. 
dumpint(tagMemStats) - dumpint(memstats.alloc) - dumpint(memstats.total_alloc) - dumpint(memstats.sys) - dumpint(memstats.nlookup) - dumpint(memstats.nmalloc) - dumpint(memstats.nfree) - dumpint(memstats.heap_alloc) - dumpint(memstats.heap_sys) - dumpint(memstats.heap_idle) - dumpint(memstats.heap_inuse) - dumpint(memstats.heap_released) - dumpint(memstats.heap_objects) - dumpint(memstats.stacks_inuse) - dumpint(memstats.stacks_sys) - dumpint(memstats.mspan_inuse) - dumpint(memstats.mspan_sys) - dumpint(memstats.mcache_inuse) - dumpint(memstats.mcache_sys) - dumpint(memstats.buckhash_sys) - dumpint(memstats.gc_sys) - dumpint(memstats.other_sys) - dumpint(memstats.next_gc) - dumpint(memstats.last_gc_unix) - dumpint(memstats.pause_total_ns) + dumpint(m.Alloc) + dumpint(m.TotalAlloc) + dumpint(m.Sys) + dumpint(m.Lookups) + dumpint(m.Mallocs) + dumpint(m.Frees) + dumpint(m.HeapAlloc) + dumpint(m.HeapSys) + dumpint(m.HeapIdle) + dumpint(m.HeapInuse) + dumpint(m.HeapReleased) + dumpint(m.HeapObjects) + dumpint(m.StackInuse) + dumpint(m.StackSys) + dumpint(m.MSpanInuse) + dumpint(m.MSpanSys) + dumpint(m.MCacheInuse) + dumpint(m.MCacheSys) + dumpint(m.BuckHashSys) + dumpint(m.GCSys) + dumpint(m.OtherSys) + dumpint(m.NextGC) + dumpint(m.LastGC) + dumpint(m.PauseTotalNs) for i := 0; i < 256; i++ { - dumpint(memstats.pause_ns[i]) + dumpint(m.PauseNs[i]) } - dumpint(uint64(memstats.numgc)) + dumpint(uint64(m.NumGC)) } func dumpmemprof_callback(b *bucket, nstk uintptr, pstk *uintptr, size, allocs, frees uintptr) { @@ -619,6 +642,9 @@ func dumpmemprof_callback(b *bucket, nstk uintptr, pstk *uintptr, size, allocs, } func dumpmemprof() { + // To protect mheap_.allspans. 
+ assertWorldStopped() + iterate_memprof(dumpmemprof_callback) for _, s := range mheap_.allspans { if s.state.get() != mSpanInUse { @@ -639,7 +665,9 @@ func dumpmemprof() { var dumphdr = []byte("go1.7 heap dump\n") -func mdump() { +func mdump(m *MemStats) { + assertWorldStopped() + // make sure we're done sweeping for _, s := range mheap_.allspans { if s.state.get() == mSpanInUse { @@ -654,13 +682,15 @@ func mdump() { dumpgs() dumpms() dumproots() - dumpmemstats() + dumpmemstats(m) dumpmemprof() dumpint(tagEOF) flush() } -func writeheapdump_m(fd uintptr) { +func writeheapdump_m(fd uintptr, m *MemStats) { + assertWorldStopped() + _g_ := getg() casgstatus(_g_.m.curg, _Grunning, _Gwaiting) _g_.waitreason = waitReasonDumpingHeap @@ -674,7 +704,7 @@ func writeheapdump_m(fd uintptr) { dumpfd = fd // Call dump routine. - mdump() + mdump(m) // Reset dump file. dumpfd = 0 diff --git a/src/runtime/histogram.go b/src/runtime/histogram.go new file mode 100644 index 0000000000..4020969eb9 --- /dev/null +++ b/src/runtime/histogram.go @@ -0,0 +1,148 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "runtime/internal/atomic" + "runtime/internal/sys" +) + +const ( + // For the time histogram type, we use an HDR histogram. + // Values are placed in super-buckets based solely on the most + // significant set bit. Thus, super-buckets are power-of-2 sized. + // Values are then placed into sub-buckets based on the value of + // the next timeHistSubBucketBits most significant bits. Thus, + // sub-buckets are linear within a super-bucket. + // + // Therefore, the number of sub-buckets (timeHistNumSubBuckets) + // defines the error. This error may be computed as + // 1/timeHistNumSubBuckets*100%. For example, for 16 sub-buckets + // per super-bucket the error is approximately 6%. 
+ // + // The number of super-buckets (timeHistNumSuperBuckets), on the + // other hand, defines the range. To reserve room for sub-buckets, + // bit timeHistSubBucketBits is the first bit considered for + // super-buckets, so super-bucket indicies are adjusted accordingly. + // + // As an example, consider 45 super-buckets with 16 sub-buckets. + // + // 00110 + // ^---- + // │ ^ + // │ └---- Lowest 4 bits -> sub-bucket 6 + // └------- Bit 4 unset -> super-bucket 0 + // + // 10110 + // ^---- + // │ ^ + // │ └---- Next 4 bits -> sub-bucket 6 + // └------- Bit 4 set -> super-bucket 1 + // 100010 + // ^----^ + // │ ^ └-- Lower bits ignored + // │ └---- Next 4 bits -> sub-bucket 1 + // └------- Bit 5 set -> super-bucket 2 + // + // Following this pattern, bucket 45 will have the bit 48 set. We don't + // have any buckets for higher values, so the highest sub-bucket will + // contain values of 2^48-1 nanoseconds or approx. 3 days. This range is + // more than enough to handle durations produced by the runtime. + timeHistSubBucketBits = 4 + timeHistNumSubBuckets = 1 << timeHistSubBucketBits + timeHistNumSuperBuckets = 45 + timeHistTotalBuckets = timeHistNumSuperBuckets*timeHistNumSubBuckets + 1 +) + +// timeHistogram represents a distribution of durations in +// nanoseconds. +// +// The accuracy and range of the histogram is defined by the +// timeHistSubBucketBits and timeHistNumSuperBuckets constants. +// +// It is an HDR histogram with exponentially-distributed +// buckets and linearly distributed sub-buckets. +// +// Counts in the histogram are updated atomically, so it is safe +// for concurrent use. It is also safe to read all the values +// atomically. +type timeHistogram struct { + counts [timeHistNumSuperBuckets * timeHistNumSubBuckets]uint64 + overflow uint64 +} + +// record adds the given duration to the distribution. +// +// Although the duration is an int64 to facilitate ease-of-use +// with e.g. nanotime, the duration must be non-negative. 
+func (h *timeHistogram) record(duration int64) { + if duration < 0 { + throw("timeHistogram encountered negative duration") + } + // The index of the exponential bucket is just the index + // of the highest set bit adjusted for how many bits we + // use for the subbucket. Note that it's timeHistSubBucketsBits-1 + // because we use the 0th bucket to hold values < timeHistNumSubBuckets. + var superBucket, subBucket uint + if duration >= timeHistNumSubBuckets { + // At this point, we know the duration value will always be + // at least timeHistSubBucketsBits long. + superBucket = uint(sys.Len64(uint64(duration))) - timeHistSubBucketBits + if superBucket*timeHistNumSubBuckets >= uint(len(h.counts)) { + // The bucket index we got is larger than what we support, so + // add into the special overflow bucket. + atomic.Xadd64(&h.overflow, 1) + return + } + // The linear subbucket index is just the timeHistSubBucketsBits + // bits after the top bit. To extract that value, shift down + // the duration such that we leave the top bit and the next bits + // intact, then extract the index. + subBucket = uint((duration >> (superBucket - 1)) % timeHistNumSubBuckets) + } else { + subBucket = uint(duration) + } + atomic.Xadd64(&h.counts[superBucket*timeHistNumSubBuckets+subBucket], 1) +} + +// timeHistogramMetricsBuckets generates a slice of boundaries for +// the timeHistogram. These boundaries are represented in seconds, +// not nanoseconds like the timeHistogram represents durations. +func timeHistogramMetricsBuckets() []float64 { + b := make([]float64, timeHistTotalBuckets-1) + for i := 0; i < timeHistNumSuperBuckets; i++ { + superBucketMin := uint64(0) + // The (inclusive) minimum for the first bucket is 0. + if i > 0 { + // The minimum for the second bucket will be + // 1 << timeHistSubBucketBits, indicating that all + // sub-buckets are represented by the next timeHistSubBucketBits + // bits. 
+ // Thereafter, we shift up by 1 each time, so we can represent + // this pattern as (i-1)+timeHistSubBucketBits. + superBucketMin = uint64(1) << uint(i-1+timeHistSubBucketBits) + } + // subBucketShift is the amount that we need to shift the sub-bucket + // index to combine it with the bucketMin. + subBucketShift := uint(0) + if i > 1 { + // The first two buckets are exact with respect to integers, + // so we'll never have to shift the sub-bucket index. Thereafter, + // we shift up by 1 with each subsequent bucket. + subBucketShift = uint(i - 2) + } + for j := 0; j < timeHistNumSubBuckets; j++ { + // j is the sub-bucket index. By shifting the index into position to + // combine with the bucket minimum, we obtain the minimum value for that + // sub-bucket. + subBucketMin := superBucketMin + (uint64(j) << subBucketShift) + + // Convert the subBucketMin which is in nanoseconds to a float64 seconds value. + // These values will all be exactly representable by a float64. + b[i*timeHistNumSubBuckets+j] = float64(subBucketMin) / 1e9 + } + } + return b +} diff --git a/src/runtime/histogram_test.go b/src/runtime/histogram_test.go new file mode 100644 index 0000000000..5f5b28f784 --- /dev/null +++ b/src/runtime/histogram_test.go @@ -0,0 +1,58 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + . "runtime" + "testing" +) + +var dummyTimeHistogram TimeHistogram + +func TestTimeHistogram(t *testing.T) { + // We need to use a global dummy because this + // could get stack-allocated with a non-8-byte alignment. + // The result of this bad alignment is a segfault on + // 32-bit platforms when calling Record. + h := &dummyTimeHistogram + + // Record exactly one sample in each bucket. 
+ for i := 0; i < TimeHistNumSuperBuckets; i++ { + var base int64 + if i > 0 { + base = int64(1) << (i + TimeHistSubBucketBits - 1) + } + for j := 0; j < TimeHistNumSubBuckets; j++ { + v := int64(j) + if i > 0 { + v <<= i - 1 + } + h.Record(base + v) + } + } + // Hit the overflow bucket. + h.Record(int64(^uint64(0) >> 1)) + + // Check to make sure there's exactly one count in each + // bucket. + for i := uint(0); i < TimeHistNumSuperBuckets; i++ { + for j := uint(0); j < TimeHistNumSubBuckets; j++ { + c, ok := h.Count(i, j) + if !ok { + t.Errorf("hit overflow bucket unexpectedly: (%d, %d)", i, j) + } else if c != 1 { + t.Errorf("bucket (%d, %d) has count that is not 1: %d", i, j, c) + } + } + } + c, ok := h.Count(TimeHistNumSuperBuckets, 0) + if ok { + t.Errorf("expected to hit overflow bucket: (%d, %d)", TimeHistNumSuperBuckets, 0) + } + if c != 1 { + t.Errorf("overflow bucket has count that is not 1: %d", c) + } + dummyTimeHistogram = TimeHistogram{} +} diff --git a/src/runtime/internal/atomic/asm_386.s b/src/runtime/internal/atomic/asm_386.s index 9b9dc14a60..d82faef1f0 100644 --- a/src/runtime/internal/atomic/asm_386.s +++ b/src/runtime/internal/atomic/asm_386.s @@ -3,6 +3,7 @@ // license that can be found in the LICENSE file. 
#include "textflag.h" +#include "funcdata.h" // bool Cas(int32 *val, int32 old, int32 new) // Atomically: @@ -11,7 +12,7 @@ // return 1; // }else // return 0; -TEXT runtime∕internal∕atomic·Cas(SB), NOSPLIT, $0-13 +TEXT ·Cas(SB), NOSPLIT, $0-13 MOVL ptr+0(FP), BX MOVL old+4(FP), AX MOVL new+8(FP), CX @@ -20,32 +21,31 @@ TEXT runtime∕internal∕atomic·Cas(SB), NOSPLIT, $0-13 SETEQ ret+12(FP) RET -TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-13 - JMP runtime∕internal∕atomic·Cas(SB) +TEXT ·Casuintptr(SB), NOSPLIT, $0-13 + JMP ·Cas(SB) -TEXT runtime∕internal∕atomic·CasRel(SB), NOSPLIT, $0-13 - JMP runtime∕internal∕atomic·Cas(SB) +TEXT ·CasRel(SB), NOSPLIT, $0-13 + JMP ·Cas(SB) -TEXT runtime∕internal∕atomic·Loaduintptr(SB), NOSPLIT, $0-8 - JMP runtime∕internal∕atomic·Load(SB) +TEXT ·Loaduintptr(SB), NOSPLIT, $0-8 + JMP ·Load(SB) -TEXT runtime∕internal∕atomic·Loaduint(SB), NOSPLIT, $0-8 - JMP runtime∕internal∕atomic·Load(SB) +TEXT ·Loaduint(SB), NOSPLIT, $0-8 + JMP ·Load(SB) -TEXT runtime∕internal∕atomic·Storeuintptr(SB), NOSPLIT, $0-8 - JMP runtime∕internal∕atomic·Store(SB) - -TEXT runtime∕internal∕atomic·Xadduintptr(SB), NOSPLIT, $0-12 - JMP runtime∕internal∕atomic·Xadd(SB) +TEXT ·Storeuintptr(SB), NOSPLIT, $0-8 + JMP ·Store(SB) -TEXT runtime∕internal∕atomic·Loadint64(SB), NOSPLIT, $0-12 - JMP runtime∕internal∕atomic·Load64(SB) +TEXT ·Xadduintptr(SB), NOSPLIT, $0-12 + JMP ·Xadd(SB) -TEXT runtime∕internal∕atomic·Xaddint64(SB), NOSPLIT, $0-20 - JMP runtime∕internal∕atomic·Xadd64(SB) +TEXT ·Loadint64(SB), NOSPLIT, $0-12 + JMP ·Load64(SB) +TEXT ·Xaddint64(SB), NOSPLIT, $0-20 + JMP ·Xadd64(SB) -// bool runtime∕internal∕atomic·Cas64(uint64 *val, uint64 old, uint64 new) +// bool ·Cas64(uint64 *val, uint64 old, uint64 new) // Atomically: // if(*val == *old){ // *val = new; @@ -53,11 +53,12 @@ TEXT runtime∕internal∕atomic·Xaddint64(SB), NOSPLIT, $0-20 // } else { // return 0; // } -TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-21 +TEXT ·Cas64(SB), NOSPLIT, $0-21 
+ NO_LOCAL_POINTERS MOVL ptr+0(FP), BP TESTL $7, BP JZ 2(PC) - MOVL 0, BP // crash with nil ptr deref + CALL ·panicUnaligned(SB) MOVL old_lo+4(FP), AX MOVL old_hi+8(FP), DX MOVL new_lo+12(FP), BX @@ -74,7 +75,7 @@ TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-21 // return 1; // }else // return 0; -TEXT runtime∕internal∕atomic·Casp1(SB), NOSPLIT, $0-13 +TEXT ·Casp1(SB), NOSPLIT, $0-13 MOVL ptr+0(FP), BX MOVL old+4(FP), AX MOVL new+8(FP), CX @@ -87,7 +88,7 @@ TEXT runtime∕internal∕atomic·Casp1(SB), NOSPLIT, $0-13 // Atomically: // *val += delta; // return *val; -TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-12 +TEXT ·Xadd(SB), NOSPLIT, $0-12 MOVL ptr+0(FP), BX MOVL delta+4(FP), AX MOVL AX, CX @@ -97,12 +98,13 @@ TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-12 MOVL AX, ret+8(FP) RET -TEXT runtime∕internal∕atomic·Xadd64(SB), NOSPLIT, $0-20 +TEXT ·Xadd64(SB), NOSPLIT, $0-20 + NO_LOCAL_POINTERS // no XADDQ so use CMPXCHG8B loop MOVL ptr+0(FP), BP TESTL $7, BP JZ 2(PC) - MOVL 0, AX // crash when unaligned + CALL ·panicUnaligned(SB) // DI:SI = delta MOVL delta_lo+4(FP), SI MOVL delta_hi+8(FP), DI @@ -133,22 +135,23 @@ addloop: MOVL CX, ret_hi+16(FP) RET -TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-12 +TEXT ·Xchg(SB), NOSPLIT, $0-12 MOVL ptr+0(FP), BX MOVL new+4(FP), AX XCHGL AX, 0(BX) MOVL AX, ret+8(FP) RET -TEXT runtime∕internal∕atomic·Xchguintptr(SB), NOSPLIT, $0-12 - JMP runtime∕internal∕atomic·Xchg(SB) +TEXT ·Xchguintptr(SB), NOSPLIT, $0-12 + JMP ·Xchg(SB) -TEXT runtime∕internal∕atomic·Xchg64(SB),NOSPLIT,$0-20 +TEXT ·Xchg64(SB),NOSPLIT,$0-20 + NO_LOCAL_POINTERS // no XCHGQ so use CMPXCHG8B loop MOVL ptr+0(FP), BP TESTL $7, BP JZ 2(PC) - MOVL 0, AX // crash when unaligned + CALL ·panicUnaligned(SB) // CX:BX = new MOVL new_lo+4(FP), BX MOVL new_hi+8(FP), CX @@ -171,38 +174,43 @@ swaploop: MOVL DX, ret_hi+16(FP) RET -TEXT runtime∕internal∕atomic·StorepNoWB(SB), NOSPLIT, $0-8 +TEXT ·StorepNoWB(SB), NOSPLIT, $0-8 MOVL ptr+0(FP), BX MOVL val+4(FP), AX 
XCHGL AX, 0(BX) RET -TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-8 +TEXT ·Store(SB), NOSPLIT, $0-8 MOVL ptr+0(FP), BX MOVL val+4(FP), AX XCHGL AX, 0(BX) RET -TEXT runtime∕internal∕atomic·StoreRel(SB), NOSPLIT, $0-8 +TEXT ·StoreRel(SB), NOSPLIT, $0-8 + JMP ·Store(SB) + +TEXT runtime∕internal∕atomic·StoreReluintptr(SB), NOSPLIT, $0-8 JMP runtime∕internal∕atomic·Store(SB) // uint64 atomicload64(uint64 volatile* addr); -TEXT runtime∕internal∕atomic·Load64(SB), NOSPLIT, $0-12 +TEXT ·Load64(SB), NOSPLIT, $0-12 + NO_LOCAL_POINTERS MOVL ptr+0(FP), AX TESTL $7, AX JZ 2(PC) - MOVL 0, AX // crash with nil ptr deref + CALL ·panicUnaligned(SB) MOVQ (AX), M0 MOVQ M0, ret+4(FP) EMMS RET -// void runtime∕internal∕atomic·Store64(uint64 volatile* addr, uint64 v); -TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-12 +// void ·Store64(uint64 volatile* addr, uint64 v); +TEXT ·Store64(SB), NOSPLIT, $0-12 + NO_LOCAL_POINTERS MOVL ptr+0(FP), AX TESTL $7, AX JZ 2(PC) - MOVL 0, AX // crash with nil ptr deref + CALL ·panicUnaligned(SB) // MOVQ and EMMS were introduced on the Pentium MMX. 
MOVQ val+4(FP), M0 MOVQ M0, (AX) @@ -214,24 +222,40 @@ TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-12 XADDL AX, (SP) RET -// void runtime∕internal∕atomic·Or8(byte volatile*, byte); -TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-5 +// void ·Or8(byte volatile*, byte); +TEXT ·Or8(SB), NOSPLIT, $0-5 MOVL ptr+0(FP), AX MOVB val+4(FP), BX LOCK ORB BX, (AX) RET -// void runtime∕internal∕atomic·And8(byte volatile*, byte); -TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-5 +// void ·And8(byte volatile*, byte); +TEXT ·And8(SB), NOSPLIT, $0-5 MOVL ptr+0(FP), AX MOVB val+4(FP), BX LOCK ANDB BX, (AX) RET -TEXT runtime∕internal∕atomic·Store8(SB), NOSPLIT, $0-5 +TEXT ·Store8(SB), NOSPLIT, $0-5 MOVL ptr+0(FP), BX MOVB val+4(FP), AX XCHGB AX, 0(BX) RET + +// func Or(addr *uint32, v uint32) +TEXT ·Or(SB), NOSPLIT, $0-8 + MOVL ptr+0(FP), AX + MOVL val+4(FP), BX + LOCK + ORL BX, (AX) + RET + +// func And(addr *uint32, v uint32) +TEXT ·And(SB), NOSPLIT, $0-8 + MOVL ptr+0(FP), AX + MOVL val+4(FP), BX + LOCK + ANDL BX, (AX) + RET diff --git a/src/runtime/internal/atomic/asm_amd64.s b/src/runtime/internal/atomic/asm_amd64.s index 90c56424c9..2cf7c55870 100644 --- a/src/runtime/internal/atomic/asm_amd64.s +++ b/src/runtime/internal/atomic/asm_amd64.s @@ -136,6 +136,12 @@ TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-12 TEXT runtime∕internal∕atomic·StoreRel(SB), NOSPLIT, $0-12 JMP runtime∕internal∕atomic·Store(SB) +TEXT runtime∕internal∕atomic·StoreRel64(SB), NOSPLIT, $0-16 + JMP runtime∕internal∕atomic·Store64(SB) + +TEXT runtime∕internal∕atomic·StoreReluintptr(SB), NOSPLIT, $0-16 + JMP runtime∕internal∕atomic·Store64(SB) + TEXT runtime∕internal∕atomic·Store8(SB), NOSPLIT, $0-9 MOVQ ptr+0(FP), BX MOVB val+8(FP), AX @@ -163,3 +169,19 @@ TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-9 LOCK ANDB BX, (AX) RET + +// func Or(addr *uint32, v uint32) +TEXT runtime∕internal∕atomic·Or(SB), NOSPLIT, $0-12 + MOVQ ptr+0(FP), AX + MOVL val+8(FP), BX + LOCK + ORL BX, (AX) + 
RET + +// func And(addr *uint32, v uint32) +TEXT runtime∕internal∕atomic·And(SB), NOSPLIT, $0-12 + MOVQ ptr+0(FP), AX + MOVL val+8(FP), BX + LOCK + ANDL BX, (AX) + RET diff --git a/src/runtime/internal/atomic/asm_arm.s b/src/runtime/internal/atomic/asm_arm.s index d4ef11560e..274925ed60 100644 --- a/src/runtime/internal/atomic/asm_arm.s +++ b/src/runtime/internal/atomic/asm_arm.s @@ -3,6 +3,7 @@ // license that can be found in the LICENSE file. #include "textflag.h" +#include "funcdata.h" // bool armcas(int32 *val, int32 old, int32 new) // Atomically: @@ -12,13 +13,13 @@ // }else // return 0; // -// To implement runtime∕internal∕atomic·cas in sys_$GOOS_arm.s +// To implement ·cas in sys_$GOOS_arm.s // using the native instructions, use: // -// TEXT runtime∕internal∕atomic·cas(SB),NOSPLIT,$0 -// B runtime∕internal∕atomic·armcas(SB) +// TEXT ·cas(SB),NOSPLIT,$0 +// B ·armcas(SB) // -TEXT runtime∕internal∕atomic·armcas(SB),NOSPLIT,$0-13 +TEXT ·armcas(SB),NOSPLIT,$0-13 MOVW ptr+0(FP), R1 MOVW old+4(FP), R2 MOVW new+8(FP), R3 @@ -50,44 +51,50 @@ casfail: // stubs -TEXT runtime∕internal∕atomic·Loadp(SB),NOSPLIT|NOFRAME,$0-8 - B runtime∕internal∕atomic·Load(SB) +TEXT ·Loadp(SB),NOSPLIT|NOFRAME,$0-8 + B ·Load(SB) -TEXT runtime∕internal∕atomic·LoadAcq(SB),NOSPLIT|NOFRAME,$0-8 - B runtime∕internal∕atomic·Load(SB) +TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$0-8 + B ·Load(SB) -TEXT runtime∕internal∕atomic·Casuintptr(SB),NOSPLIT,$0-13 - B runtime∕internal∕atomic·Cas(SB) +TEXT ·LoadAcquintptr(SB),NOSPLIT|NOFRAME,$0-8 + B ·Load(SB) -TEXT runtime∕internal∕atomic·Casp1(SB),NOSPLIT,$0-13 - B runtime∕internal∕atomic·Cas(SB) +TEXT ·Casuintptr(SB),NOSPLIT,$0-13 + B ·Cas(SB) -TEXT runtime∕internal∕atomic·CasRel(SB),NOSPLIT,$0-13 - B runtime∕internal∕atomic·Cas(SB) +TEXT ·Casp1(SB),NOSPLIT,$0-13 + B ·Cas(SB) -TEXT runtime∕internal∕atomic·Loaduintptr(SB),NOSPLIT,$0-8 - B runtime∕internal∕atomic·Load(SB) +TEXT ·CasRel(SB),NOSPLIT,$0-13 + B ·Cas(SB) -TEXT 
runtime∕internal∕atomic·Loaduint(SB),NOSPLIT,$0-8 - B runtime∕internal∕atomic·Load(SB) +TEXT ·Loaduintptr(SB),NOSPLIT,$0-8 + B ·Load(SB) -TEXT runtime∕internal∕atomic·Storeuintptr(SB),NOSPLIT,$0-8 - B runtime∕internal∕atomic·Store(SB) +TEXT ·Loaduint(SB),NOSPLIT,$0-8 + B ·Load(SB) -TEXT runtime∕internal∕atomic·StorepNoWB(SB),NOSPLIT,$0-8 - B runtime∕internal∕atomic·Store(SB) +TEXT ·Storeuintptr(SB),NOSPLIT,$0-8 + B ·Store(SB) -TEXT runtime∕internal∕atomic·StoreRel(SB),NOSPLIT,$0-8 - B runtime∕internal∕atomic·Store(SB) +TEXT ·StorepNoWB(SB),NOSPLIT,$0-8 + B ·Store(SB) -TEXT runtime∕internal∕atomic·Xadduintptr(SB),NOSPLIT,$0-12 - B runtime∕internal∕atomic·Xadd(SB) +TEXT ·StoreRel(SB),NOSPLIT,$0-8 + B ·Store(SB) -TEXT runtime∕internal∕atomic·Loadint64(SB),NOSPLIT,$0-12 - B runtime∕internal∕atomic·Load64(SB) +TEXT ·StoreReluintptr(SB),NOSPLIT,$0-8 + B ·Store(SB) -TEXT runtime∕internal∕atomic·Xaddint64(SB),NOSPLIT,$0-20 - B runtime∕internal∕atomic·Xadd64(SB) +TEXT ·Xadduintptr(SB),NOSPLIT,$0-12 + B ·Xadd(SB) + +TEXT ·Loadint64(SB),NOSPLIT,$0-12 + B ·Load64(SB) + +TEXT ·Xaddint64(SB),NOSPLIT,$0-20 + B ·Xadd64(SB) // 64-bit atomics // The native ARM implementations use LDREXD/STREXD, which are @@ -95,12 +102,8 @@ TEXT runtime∕internal∕atomic·Xaddint64(SB),NOSPLIT,$0-20 // On older ARM, we use Go implementations which simulate 64-bit // atomics with locks. -TEXT armCas64<>(SB),NOSPLIT,$0-21 - MOVW addr+0(FP), R1 - // make unaligned atomic access panic - AND.S $7, R1, R2 - BEQ 2(PC) - MOVW R2, (R2) // crash. AND.S above left only low 3 bits in R2. +TEXT armCas64<>(SB),NOSPLIT,$0-21 + // addr is already in R1 MOVW old_lo+4(FP), R2 MOVW old_hi+8(FP), R3 MOVW new_lo+12(FP), R4 @@ -128,12 +131,8 @@ cas64fail: MOVBU R0, swapped+20(FP) RET -TEXT armXadd64<>(SB),NOSPLIT,$0-20 - MOVW addr+0(FP), R1 - // make unaligned atomic access panic - AND.S $7, R1, R2 - BEQ 2(PC) - MOVW R2, (R2) // crash. AND.S above left only low 3 bits in R2. 
+TEXT armXadd64<>(SB),NOSPLIT,$0-20 + // addr is already in R1 MOVW delta_lo+4(FP), R2 MOVW delta_hi+8(FP), R3 @@ -154,12 +153,8 @@ add64loop: MOVW R5, new_hi+16(FP) RET -TEXT armXchg64<>(SB),NOSPLIT,$0-20 - MOVW addr+0(FP), R1 - // make unaligned atomic access panic - AND.S $7, R1, R2 - BEQ 2(PC) - MOVW R2, (R2) // crash. AND.S above left only low 3 bits in R2. +TEXT armXchg64<>(SB),NOSPLIT,$0-20 + // addr is already in R1 MOVW new_lo+4(FP), R2 MOVW new_hi+8(FP), R3 @@ -178,12 +173,8 @@ swap64loop: MOVW R5, old_hi+16(FP) RET -TEXT armLoad64<>(SB),NOSPLIT,$0-12 - MOVW addr+0(FP), R1 - // make unaligned atomic access panic - AND.S $7, R1, R2 - BEQ 2(PC) - MOVW R2, (R2) // crash. AND.S above left only low 3 bits in R2. +TEXT armLoad64<>(SB),NOSPLIT,$0-12 + // addr is already in R1 LDREXD (R1), R2 // loads R2 and R3 DMB MB_ISH @@ -192,12 +183,8 @@ TEXT armLoad64<>(SB),NOSPLIT,$0-12 MOVW R3, val_hi+8(FP) RET -TEXT armStore64<>(SB),NOSPLIT,$0-12 - MOVW addr+0(FP), R1 - // make unaligned atomic access panic - AND.S $7, R1, R2 - BEQ 2(PC) - MOVW R2, (R2) // crash. AND.S above left only low 3 bits in R2. +TEXT armStore64<>(SB),NOSPLIT,$0-12 + // addr is already in R1 MOVW val_lo+4(FP), R2 MOVW val_hi+8(FP), R3 @@ -213,35 +200,83 @@ store64loop: DMB MB_ISH RET -TEXT ·Cas64(SB),NOSPLIT,$0-21 +// The following functions all panic if their address argument isn't +// 8-byte aligned. Since we're calling back into Go code to do this, +// we have to cooperate with stack unwinding. In the normal case, the +// functions tail-call into the appropriate implementation, which +// means they must not open a frame. Hence, when they go down the +// panic path, at that point they push the LR to create a real frame +// (they don't need to pop it because panic won't return). 
+ +TEXT ·Cas64(SB),NOSPLIT,$-4-21 + NO_LOCAL_POINTERS + MOVW addr+0(FP), R1 + // make unaligned atomic access panic + AND.S $7, R1, R2 + BEQ 3(PC) + MOVW.W R14, -4(R13) // prepare a real frame + BL ·panicUnaligned(SB) + MOVB runtime·goarm(SB), R11 CMP $7, R11 BLT 2(PC) JMP armCas64<>(SB) JMP ·goCas64(SB) -TEXT ·Xadd64(SB),NOSPLIT,$0-20 +TEXT ·Xadd64(SB),NOSPLIT,$-4-20 + NO_LOCAL_POINTERS + MOVW addr+0(FP), R1 + // make unaligned atomic access panic + AND.S $7, R1, R2 + BEQ 3(PC) + MOVW.W R14, -4(R13) // prepare a real frame + BL ·panicUnaligned(SB) + MOVB runtime·goarm(SB), R11 CMP $7, R11 BLT 2(PC) JMP armXadd64<>(SB) JMP ·goXadd64(SB) -TEXT ·Xchg64(SB),NOSPLIT,$0-20 +TEXT ·Xchg64(SB),NOSPLIT,$-4-20 + NO_LOCAL_POINTERS + MOVW addr+0(FP), R1 + // make unaligned atomic access panic + AND.S $7, R1, R2 + BEQ 3(PC) + MOVW.W R14, -4(R13) // prepare a real frame + BL ·panicUnaligned(SB) + MOVB runtime·goarm(SB), R11 CMP $7, R11 BLT 2(PC) JMP armXchg64<>(SB) JMP ·goXchg64(SB) -TEXT ·Load64(SB),NOSPLIT,$0-12 +TEXT ·Load64(SB),NOSPLIT,$-4-12 + NO_LOCAL_POINTERS + MOVW addr+0(FP), R1 + // make unaligned atomic access panic + AND.S $7, R1, R2 + BEQ 3(PC) + MOVW.W R14, -4(R13) // prepare a real frame + BL ·panicUnaligned(SB) + MOVB runtime·goarm(SB), R11 CMP $7, R11 BLT 2(PC) JMP armLoad64<>(SB) JMP ·goLoad64(SB) -TEXT ·Store64(SB),NOSPLIT,$0-12 +TEXT ·Store64(SB),NOSPLIT,$-4-12 + NO_LOCAL_POINTERS + MOVW addr+0(FP), R1 + // make unaligned atomic access panic + AND.S $7, R1, R2 + BEQ 3(PC) + MOVW.W R14, -4(R13) // prepare a real frame + BL ·panicUnaligned(SB) + MOVB runtime·goarm(SB), R11 CMP $7, R11 BLT 2(PC) diff --git a/src/runtime/internal/atomic/asm_mips64x.s b/src/runtime/internal/atomic/asm_mips64x.s index 3290fb726a..a515683ebb 100644 --- a/src/runtime/internal/atomic/asm_mips64x.s +++ b/src/runtime/internal/atomic/asm_mips64x.s @@ -158,6 +158,12 @@ TEXT ·StorepNoWB(SB), NOSPLIT, $0-16 TEXT ·StoreRel(SB), NOSPLIT, $0-12 JMP ·Store(SB) +TEXT ·StoreRel64(SB), NOSPLIT, 
$0-16 + JMP ·Store64(SB) + +TEXT ·StoreReluintptr(SB), NOSPLIT, $0-16 + JMP ·Store64(SB) + TEXT ·Store(SB), NOSPLIT, $0-12 MOVV ptr+0(FP), R1 MOVW val+8(FP), R2 @@ -237,3 +243,29 @@ TEXT ·And8(SB), NOSPLIT, $0-9 BEQ R4, -4(PC) SYNC RET + +// func Or(addr *uint32, v uint32) +TEXT ·Or(SB), NOSPLIT, $0-12 + MOVV ptr+0(FP), R1 + MOVW val+8(FP), R2 + + SYNC + LL (R1), R3 + OR R2, R3 + SC R3, (R1) + BEQ R3, -4(PC) + SYNC + RET + +// func And(addr *uint32, v uint32) +TEXT ·And(SB), NOSPLIT, $0-12 + MOVV ptr+0(FP), R1 + MOVW val+8(FP), R2 + + SYNC + LL (R1), R3 + AND R2, R3 + SC R3, (R1) + BEQ R3, -4(PC) + SYNC + RET diff --git a/src/runtime/internal/atomic/asm_mipsx.s b/src/runtime/internal/atomic/asm_mipsx.s index 62811a6599..2b2cfabe08 100644 --- a/src/runtime/internal/atomic/asm_mipsx.s +++ b/src/runtime/internal/atomic/asm_mipsx.s @@ -122,6 +122,9 @@ TEXT ·StorepNoWB(SB),NOSPLIT,$0-8 TEXT ·StoreRel(SB),NOSPLIT,$0-8 JMP ·Store(SB) +TEXT ·StoreReluintptr(SB),NOSPLIT,$0-8 + JMP ·Store(SB) + // void Or8(byte volatile*, byte); TEXT ·Or8(SB),NOSPLIT,$0-5 MOVW ptr+0(FP), R1 @@ -169,3 +172,29 @@ try_and8: BEQ R4, try_and8 SYNC RET + +// func Or(addr *uint32, v uint32) +TEXT ·Or(SB), NOSPLIT, $0-8 + MOVW ptr+0(FP), R1 + MOVW val+4(FP), R2 + + SYNC + LL (R1), R3 + OR R2, R3 + SC R3, (R1) + BEQ R3, -4(PC) + SYNC + RET + +// func And(addr *uint32, v uint32) +TEXT ·And(SB), NOSPLIT, $0-8 + MOVW ptr+0(FP), R1 + MOVW val+4(FP), R2 + + SYNC + LL (R1), R3 + AND R2, R3 + SC R3, (R1) + BEQ R3, -4(PC) + SYNC + RET diff --git a/src/runtime/internal/atomic/asm_ppc64x.s b/src/runtime/internal/atomic/asm_ppc64x.s index 06dc931bf4..bb009ab34d 100644 --- a/src/runtime/internal/atomic/asm_ppc64x.s +++ b/src/runtime/internal/atomic/asm_ppc64x.s @@ -83,12 +83,18 @@ TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-25 TEXT runtime∕internal∕atomic·Loaduintptr(SB), NOSPLIT|NOFRAME, $0-16 BR runtime∕internal∕atomic·Load64(SB) +TEXT runtime∕internal∕atomic·LoadAcquintptr(SB), NOSPLIT|NOFRAME, 
$0-16 + BR runtime∕internal∕atomic·LoadAcq64(SB) + TEXT runtime∕internal∕atomic·Loaduint(SB), NOSPLIT|NOFRAME, $0-16 BR runtime∕internal∕atomic·Load64(SB) TEXT runtime∕internal∕atomic·Storeuintptr(SB), NOSPLIT, $0-16 BR runtime∕internal∕atomic·Store64(SB) +TEXT runtime∕internal∕atomic·StoreReluintptr(SB), NOSPLIT, $0-16 + BR runtime∕internal∕atomic·StoreRel64(SB) + TEXT runtime∕internal∕atomic·Xadduintptr(SB), NOSPLIT, $0-24 BR runtime∕internal∕atomic·Xadd64(SB) @@ -191,6 +197,13 @@ TEXT runtime∕internal∕atomic·StoreRel(SB), NOSPLIT, $0-12 MOVW R4, 0(R3) RET +TEXT runtime∕internal∕atomic·StoreRel64(SB), NOSPLIT, $0-16 + MOVD ptr+0(FP), R3 + MOVD val+8(FP), R4 + LWSYNC + MOVD R4, 0(R3) + RET + // void runtime∕internal∕atomic·Or8(byte volatile*, byte); TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-9 MOVD ptr+0(FP), R3 @@ -209,8 +222,32 @@ TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-9 MOVBZ val+8(FP), R4 LWSYNC again: - LBAR (R3),R6 - AND R4,R6 - STBCCC R6,(R3) + LBAR (R3), R6 + AND R4, R6 + STBCCC R6, (R3) + BNE again + RET + +// func Or(addr *uint32, v uint32) +TEXT runtime∕internal∕atomic·Or(SB), NOSPLIT, $0-12 + MOVD ptr+0(FP), R3 + MOVW val+8(FP), R4 + LWSYNC +again: + LWAR (R3), R6 + OR R4, R6 + STWCCC R6, (R3) + BNE again + RET + +// func And(addr *uint32, v uint32) +TEXT runtime∕internal∕atomic·And(SB), NOSPLIT, $0-12 + MOVD ptr+0(FP), R3 + MOVW val+8(FP), R4 + LWSYNC +again: + LWAR (R3),R6 + AND R4, R6 + STWCCC R6, (R3) BNE again RET diff --git a/src/runtime/internal/atomic/asm_s390x.s b/src/runtime/internal/atomic/asm_s390x.s index 9a19bc0ece..daf1f3cc9f 100644 --- a/src/runtime/internal/atomic/asm_s390x.s +++ b/src/runtime/internal/atomic/asm_s390x.s @@ -174,8 +174,8 @@ TEXT ·Xchguintptr(SB), NOSPLIT, $0-24 // func Or8(addr *uint8, v uint8) TEXT ·Or8(SB), NOSPLIT, $0-9 - MOVD ptr+0(FP), R3 - MOVBZ val+8(FP), R4 + MOVD ptr+0(FP), R3 + MOVBZ val+8(FP), R4 // We don't have atomic operations that work on individual bytes so we // need to align addr 
down to a word boundary and create a mask // containing v to OR with the entire word atomically. @@ -188,8 +188,8 @@ TEXT ·Or8(SB), NOSPLIT, $0-9 // func And8(addr *uint8, v uint8) TEXT ·And8(SB), NOSPLIT, $0-9 - MOVD ptr+0(FP), R3 - MOVBZ val+8(FP), R4 + MOVD ptr+0(FP), R3 + MOVBZ val+8(FP), R4 // We don't have atomic operations that work on individual bytes so we // need to align addr down to a word boundary and create a mask // containing v to AND with the entire word atomically. @@ -200,3 +200,17 @@ TEXT ·And8(SB), NOSPLIT, $0-9 RLL R5, R4, R4 // R4 = rotl(R4, R5) LAN R4, R6, 0(R3) // R6 = *R3; *R3 &= R4; (atomic) RET + +// func Or(addr *uint32, v uint32) +TEXT ·Or(SB), NOSPLIT, $0-12 + MOVD ptr+0(FP), R3 + MOVW val+8(FP), R4 + LAO R4, R6, 0(R3) // R6 = *R3; *R3 |= R4; (atomic) + RET + +// func And(addr *uint32, v uint32) +TEXT ·And(SB), NOSPLIT, $0-12 + MOVD ptr+0(FP), R3 + MOVW val+8(FP), R4 + LAN R4, R6, 0(R3) // R6 = *R3; *R3 &= R4; (atomic) + RET diff --git a/src/runtime/internal/atomic/atomic_386.go b/src/runtime/internal/atomic/atomic_386.go index 8d002ebfe3..1bfcb1143d 100644 --- a/src/runtime/internal/atomic/atomic_386.go +++ b/src/runtime/internal/atomic/atomic_386.go @@ -30,6 +30,12 @@ func LoadAcq(ptr *uint32) uint32 { return *ptr } +//go:nosplit +//go:noinline +func LoadAcquintptr(ptr *uintptr) uintptr { + return *ptr +} + //go:noescape func Xadd64(ptr *uint64, delta int64) uint64 @@ -63,6 +69,12 @@ func And8(ptr *uint8, val uint8) //go:noescape func Or8(ptr *uint8, val uint8) +//go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + // NOTE: Do not add atomicxor8 (XOR is not idempotent). //go:noescape @@ -83,5 +95,8 @@ func Store64(ptr *uint64, val uint64) //go:noescape func StoreRel(ptr *uint32, val uint32) +//go:noescape +func StoreReluintptr(ptr *uintptr, val uintptr) + // NO go:noescape annotation; see atomic_pointer.go. 
func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer) diff --git a/src/runtime/internal/atomic/atomic_amd64.go b/src/runtime/internal/atomic/atomic_amd64.go index 14b8101720..e36eb83a11 100644 --- a/src/runtime/internal/atomic/atomic_amd64.go +++ b/src/runtime/internal/atomic/atomic_amd64.go @@ -35,6 +35,18 @@ func LoadAcq(ptr *uint32) uint32 { return *ptr } +//go:nosplit +//go:noinline +func LoadAcq64(ptr *uint64) uint64 { + return *ptr +} + +//go:nosplit +//go:noinline +func LoadAcquintptr(ptr *uintptr) uintptr { + return *ptr +} + //go:noescape func Xadd(ptr *uint32, delta int32) uint32 @@ -65,6 +77,12 @@ func And8(ptr *uint8, val uint8) //go:noescape func Or8(ptr *uint8, val uint8) +//go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + // NOTE: Do not add atomicxor8 (XOR is not idempotent). //go:noescape @@ -85,6 +103,12 @@ func Store64(ptr *uint64, val uint64) //go:noescape func StoreRel(ptr *uint32, val uint32) +//go:noescape +func StoreRel64(ptr *uint64, val uint64) + +//go:noescape +func StoreReluintptr(ptr *uintptr, val uintptr) + // StorepNoWB performs *ptr = val atomically and without a write // barrier. 
// diff --git a/src/runtime/internal/atomic/atomic_arm.go b/src/runtime/internal/atomic/atomic_arm.go index 95713afcc1..546b3d6120 100644 --- a/src/runtime/internal/atomic/atomic_arm.go +++ b/src/runtime/internal/atomic/atomic_arm.go @@ -81,6 +81,9 @@ func Store(addr *uint32, v uint32) //go:noescape func StoreRel(addr *uint32, v uint32) +//go:noescape +func StoreReluintptr(addr *uintptr, v uintptr) + //go:nosplit func goCas64(addr *uint64, old, new uint64) bool { if uintptr(unsafe.Pointer(addr))&7 != 0 { @@ -180,6 +183,26 @@ func And8(addr *uint8, v uint8) { } //go:nosplit +func Or(addr *uint32, v uint32) { + for { + old := *addr + if Cas(addr, old, old|v) { + return + } + } +} + +//go:nosplit +func And(addr *uint32, v uint32) { + for { + old := *addr + if Cas(addr, old, old&v) { + return + } + } +} + +//go:nosplit func armcas(ptr *uint32, old, new uint32) bool //go:noescape @@ -195,6 +218,9 @@ func Load8(addr *uint8) uint8 func LoadAcq(addr *uint32) uint32 //go:noescape +func LoadAcquintptr(ptr *uintptr) uintptr + +//go:noescape func Cas64(addr *uint64, old, new uint64) bool //go:noescape diff --git a/src/runtime/internal/atomic/atomic_arm64.go b/src/runtime/internal/atomic/atomic_arm64.go index 26ca94d54c..d49bee8936 100644 --- a/src/runtime/internal/atomic/atomic_arm64.go +++ b/src/runtime/internal/atomic/atomic_arm64.go @@ -42,12 +42,24 @@ func Loadp(ptr unsafe.Pointer) unsafe.Pointer func LoadAcq(addr *uint32) uint32 //go:noescape +func LoadAcq64(ptr *uint64) uint64 + +//go:noescape +func LoadAcquintptr(ptr *uintptr) uintptr + +//go:noescape func Or8(ptr *uint8, val uint8) //go:noescape func And8(ptr *uint8, val uint8) //go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + +//go:noescape func Cas64(ptr *uint64, old, new uint64) bool //go:noescape @@ -67,3 +79,9 @@ func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer) //go:noescape func StoreRel(ptr *uint32, val uint32) + +//go:noescape +func StoreRel64(ptr 
*uint64, val uint64) + +//go:noescape +func StoreReluintptr(ptr *uintptr, val uintptr) diff --git a/src/runtime/internal/atomic/atomic_arm64.s b/src/runtime/internal/atomic/atomic_arm64.s index a2eb7568d2..0cf3c40223 100644 --- a/src/runtime/internal/atomic/atomic_arm64.s +++ b/src/runtime/internal/atomic/atomic_arm64.s @@ -36,12 +36,26 @@ TEXT ·Loadp(SB),NOSPLIT,$0-16 TEXT ·LoadAcq(SB),NOSPLIT,$0-12 B ·Load(SB) +// uint64 runtime∕internal∕atomic·LoadAcquintptr(uint64 volatile* addr) +TEXT ·LoadAcq64(SB),NOSPLIT,$0-16 + B ·Load64(SB) + +// uintptr runtime∕internal∕atomic·LoadAcq64(uintptr volatile* addr) +TEXT ·LoadAcquintptr(SB),NOSPLIT,$0-16 + B ·Load64(SB) + TEXT runtime∕internal∕atomic·StorepNoWB(SB), NOSPLIT, $0-16 B runtime∕internal∕atomic·Store64(SB) TEXT runtime∕internal∕atomic·StoreRel(SB), NOSPLIT, $0-12 B runtime∕internal∕atomic·Store(SB) +TEXT runtime∕internal∕atomic·StoreRel64(SB), NOSPLIT, $0-16 + B runtime∕internal∕atomic·Store64(SB) + +TEXT runtime∕internal∕atomic·StoreReluintptr(SB), NOSPLIT, $0-16 + B runtime∕internal∕atomic·Store64(SB) + TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-12 MOVD ptr+0(FP), R0 MOVW val+8(FP), R1 @@ -150,3 +164,22 @@ TEXT ·Or8(SB), NOSPLIT, $0-9 CBNZ R3, -3(PC) RET +// func And(addr *uint32, v uint32) +TEXT ·And(SB), NOSPLIT, $0-12 + MOVD ptr+0(FP), R0 + MOVW val+8(FP), R1 + LDAXRW (R0), R2 + AND R1, R2 + STLXRW R2, (R0), R3 + CBNZ R3, -3(PC) + RET + +// func Or(addr *uint32, v uint32) +TEXT ·Or(SB), NOSPLIT, $0-12 + MOVD ptr+0(FP), R0 + MOVW val+8(FP), R1 + LDAXRW (R0), R2 + ORR R1, R2 + STLXRW R2, (R0), R3 + CBNZ R3, -3(PC) + RET diff --git a/src/runtime/internal/atomic/atomic_mips64x.go b/src/runtime/internal/atomic/atomic_mips64x.go index 1d9977850b..b0109d72b0 100644 --- a/src/runtime/internal/atomic/atomic_mips64x.go +++ b/src/runtime/internal/atomic/atomic_mips64x.go @@ -42,6 +42,12 @@ func Loadp(ptr unsafe.Pointer) unsafe.Pointer func LoadAcq(ptr *uint32) uint32 //go:noescape +func LoadAcq64(ptr *uint64) 
uint64 + +//go:noescape +func LoadAcquintptr(ptr *uintptr) uintptr + +//go:noescape func And8(ptr *uint8, val uint8) //go:noescape @@ -50,6 +56,12 @@ func Or8(ptr *uint8, val uint8) // NOTE: Do not add atomicxor8 (XOR is not idempotent). //go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + +//go:noescape func Cas64(ptr *uint64, old, new uint64) bool //go:noescape @@ -69,3 +81,9 @@ func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer) //go:noescape func StoreRel(ptr *uint32, val uint32) + +//go:noescape +func StoreRel64(ptr *uint64, val uint64) + +//go:noescape +func StoreReluintptr(ptr *uintptr, val uintptr) diff --git a/src/runtime/internal/atomic/atomic_mips64x.s b/src/runtime/internal/atomic/atomic_mips64x.s index 1ed90937c9..125c0c221c 100644 --- a/src/runtime/internal/atomic/atomic_mips64x.s +++ b/src/runtime/internal/atomic/atomic_mips64x.s @@ -47,3 +47,11 @@ TEXT ·Loadp(SB),NOSPLIT|NOFRAME,$0-16 // uint32 runtime∕internal∕atomic·LoadAcq(uint32 volatile* ptr) TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$0-12 JMP atomic·Load(SB) + +// uint64 runtime∕internal∕atomic·LoadAcq64(uint64 volatile* ptr) +TEXT ·LoadAcq64(SB),NOSPLIT|NOFRAME,$0-16 + JMP atomic·Load64(SB) + +// uintptr runtime∕internal∕atomic·LoadAcquintptr(uintptr volatile* ptr) +TEXT ·LoadAcquintptr(SB),NOSPLIT|NOFRAME,$0-16 + JMP atomic·Load64(SB) diff --git a/src/runtime/internal/atomic/atomic_mipsx.go b/src/runtime/internal/atomic/atomic_mipsx.go index 0e2d77ade1..1336b50121 100644 --- a/src/runtime/internal/atomic/atomic_mipsx.go +++ b/src/runtime/internal/atomic/atomic_mipsx.go @@ -34,7 +34,7 @@ func spinUnlock(state *uint32) func lockAndCheck(addr *uint64) { // ensure 8-byte alignment if uintptr(unsafe.Pointer(addr))&7 != 0 { - addr = nil + panicUnaligned() } // force dereference before taking lock _ = *addr @@ -133,12 +133,21 @@ func Loadp(ptr unsafe.Pointer) unsafe.Pointer func LoadAcq(ptr *uint32) uint32 //go:noescape +func LoadAcquintptr(ptr *uintptr) 
uintptr + +//go:noescape func And8(ptr *uint8, val uint8) //go:noescape func Or8(ptr *uint8, val uint8) //go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + +//go:noescape func Store(ptr *uint32, val uint32) //go:noescape @@ -151,4 +160,7 @@ func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer) func StoreRel(ptr *uint32, val uint32) //go:noescape +func StoreReluintptr(ptr *uintptr, val uintptr) + +//go:noescape func CasRel(addr *uint32, old, new uint32) bool diff --git a/src/runtime/internal/atomic/atomic_ppc64x.go b/src/runtime/internal/atomic/atomic_ppc64x.go index a48ecf5ee8..e4b109f0ec 100644 --- a/src/runtime/internal/atomic/atomic_ppc64x.go +++ b/src/runtime/internal/atomic/atomic_ppc64x.go @@ -42,6 +42,12 @@ func Loadp(ptr unsafe.Pointer) unsafe.Pointer func LoadAcq(ptr *uint32) uint32 //go:noescape +func LoadAcq64(ptr *uint64) uint64 + +//go:noescape +func LoadAcquintptr(ptr *uintptr) uintptr + +//go:noescape func And8(ptr *uint8, val uint8) //go:noescape @@ -50,6 +56,12 @@ func Or8(ptr *uint8, val uint8) // NOTE: Do not add atomicxor8 (XOR is not idempotent). //go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + +//go:noescape func Cas64(ptr *uint64, old, new uint64) bool //go:noescape @@ -67,5 +79,11 @@ func Store64(ptr *uint64, val uint64) //go:noescape func StoreRel(ptr *uint32, val uint32) +//go:noescape +func StoreRel64(ptr *uint64, val uint64) + +//go:noescape +func StoreReluintptr(ptr *uintptr, val uintptr) + // NO go:noescape annotation; see atomic_pointer.go. 
func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer) diff --git a/src/runtime/internal/atomic/atomic_ppc64x.s b/src/runtime/internal/atomic/atomic_ppc64x.s index c2f696fb34..b79cdbca34 100644 --- a/src/runtime/internal/atomic/atomic_ppc64x.s +++ b/src/runtime/internal/atomic/atomic_ppc64x.s @@ -6,6 +6,15 @@ #include "textflag.h" + +// For more details about how various memory models are +// enforced on POWER, the following paper provides more +// details about how they enforce C/C++ like models. This +// gives context about why the strange looking code +// sequences below work. +// +// http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html + // uint32 runtime∕internal∕atomic·Load(uint32 volatile* ptr) TEXT ·Load(SB),NOSPLIT|NOFRAME,$-8-12 MOVD ptr+0(FP), R3 @@ -56,5 +65,16 @@ TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$-8-12 MOVWZ 0(R3), R3 CMPW R3, R3, CR7 BC 4, 30, 1(PC) // bne- cr7, 0x4 + ISYNC MOVW R3, ret+8(FP) RET + +// uint64 runtime∕internal∕atomic·LoadAcq64(uint64 volatile* ptr) +TEXT ·LoadAcq64(SB),NOSPLIT|NOFRAME,$-8-16 + MOVD ptr+0(FP), R3 + MOVD 0(R3), R3 + CMP R3, R3, CR7 + BC 4, 30, 1(PC) // bne- cr7, 0x4 + ISYNC + MOVD R3, ret+8(FP) + RET diff --git a/src/runtime/internal/atomic/atomic_riscv64.go b/src/runtime/internal/atomic/atomic_riscv64.go index d52512369e..8f24d61625 100644 --- a/src/runtime/internal/atomic/atomic_riscv64.go +++ b/src/runtime/internal/atomic/atomic_riscv64.go @@ -40,12 +40,24 @@ func Loadp(ptr unsafe.Pointer) unsafe.Pointer func LoadAcq(ptr *uint32) uint32 //go:noescape +func LoadAcq64(ptr *uint64) uint64 + +//go:noescape +func LoadAcquintptr(ptr *uintptr) uintptr + +//go:noescape func Or8(ptr *uint8, val uint8) //go:noescape func And8(ptr *uint8, val uint8) //go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + +//go:noescape func Cas64(ptr *uint64, old, new uint64) bool //go:noescape @@ -65,3 +77,9 @@ func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer) 
//go:noescape func StoreRel(ptr *uint32, val uint32) + +//go:noescape +func StoreRel64(ptr *uint64, val uint64) + +//go:noescape +func StoreReluintptr(ptr *uintptr, val uintptr) diff --git a/src/runtime/internal/atomic/atomic_riscv64.s b/src/runtime/internal/atomic/atomic_riscv64.s index d005325ca3..74c896cea6 100644 --- a/src/runtime/internal/atomic/atomic_riscv64.s +++ b/src/runtime/internal/atomic/atomic_riscv64.s @@ -150,6 +150,12 @@ TEXT ·Xaddint64(SB),NOSPLIT,$0-24 TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$0-12 JMP ·Load(SB) +TEXT ·LoadAcq64(SB),NOSPLIT|NOFRAME,$0-16 + JMP ·Load64(SB) + +TEXT ·LoadAcquintptr(SB),NOSPLIT|NOFRAME,$0-16 + JMP ·Load64(SB) + // func Loadp(ptr unsafe.Pointer) unsafe.Pointer TEXT ·Loadp(SB),NOSPLIT,$0-16 JMP ·Load64(SB) @@ -161,6 +167,12 @@ TEXT ·StorepNoWB(SB), NOSPLIT, $0-16 TEXT ·StoreRel(SB), NOSPLIT, $0-12 JMP ·Store(SB) +TEXT ·StoreRel64(SB), NOSPLIT, $0-16 + JMP ·Store64(SB) + +TEXT ·StoreReluintptr(SB), NOSPLIT, $0-16 + JMP ·Store64(SB) + // func Xchg(ptr *uint32, new uint32) uint32 TEXT ·Xchg(SB), NOSPLIT, $0-20 MOV ptr+0(FP), A0 @@ -230,3 +242,17 @@ TEXT ·Or8(SB), NOSPLIT, $0-9 SLL A2, A1 AMOORW A1, (A0), ZERO RET + +// func And(ptr *uint32, val uint32) +TEXT ·And(SB), NOSPLIT, $0-12 + MOV ptr+0(FP), A0 + MOVW val+8(FP), A1 + AMOANDW A1, (A0), ZERO + RET + +// func Or(ptr *uint32, val uint32) +TEXT ·Or(SB), NOSPLIT, $0-12 + MOV ptr+0(FP), A0 + MOVW val+8(FP), A1 + AMOORW A1, (A0), ZERO + RET diff --git a/src/runtime/internal/atomic/atomic_s390x.go b/src/runtime/internal/atomic/atomic_s390x.go index 4d73b39baf..a058d60102 100644 --- a/src/runtime/internal/atomic/atomic_s390x.go +++ b/src/runtime/internal/atomic/atomic_s390x.go @@ -41,6 +41,18 @@ func LoadAcq(ptr *uint32) uint32 { return *ptr } +//go:nosplit +//go:noinline +func LoadAcq64(ptr *uint64) uint64 { + return *ptr +} + +//go:nosplit +//go:noinline +func LoadAcquintptr(ptr *uintptr) uintptr { + return *ptr +} + //go:noescape func Store(ptr *uint32, val uint32) @@ -59,6 
+71,18 @@ func StoreRel(ptr *uint32, val uint32) { *ptr = val } +//go:nosplit +//go:noinline +func StoreRel64(ptr *uint64, val uint64) { + *ptr = val +} + +//go:nosplit +//go:noinline +func StoreReluintptr(ptr *uintptr, val uintptr) { + *ptr = val +} + //go:noescape func And8(ptr *uint8, val uint8) @@ -68,6 +92,12 @@ func Or8(ptr *uint8, val uint8) // NOTE: Do not add atomicxor8 (XOR is not idempotent). //go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + +//go:noescape func Xadd(ptr *uint32, delta int32) uint32 //go:noescape diff --git a/src/runtime/internal/atomic/atomic_test.go b/src/runtime/internal/atomic/atomic_test.go index b0a8fa0610..c9c2eba248 100644 --- a/src/runtime/internal/atomic/atomic_test.go +++ b/src/runtime/internal/atomic/atomic_test.go @@ -73,8 +73,15 @@ func TestXadduintptrOnUint64(t *testing.T) { func shouldPanic(t *testing.T, name string, f func()) { defer func() { - if recover() == nil { + // Check that all GC maps are sane. + runtime.GC() + + err := recover() + want := "unaligned 64-bit atomic operation" + if err == nil { t.Errorf("%s did not panic", name) + } else if s, _ := err.(string); s != want { + t.Errorf("%s: wanted panic %q, got %q", name, want, err) } }() f() @@ -143,6 +150,45 @@ func TestAnd8(t *testing.T) { } } +func TestAnd(t *testing.T) { + // Basic sanity check. + x := uint32(0xffffffff) + for i := uint32(0); i < 32; i++ { + atomic.And(&x, ^(1 << i)) + if r := uint32(0xffffffff) << (i + 1); x != r { + t.Fatalf("clearing bit %#x: want %#x, got %#x", uint32(1<<i), r, x) + } + } + + // Set every bit in array to 1. + a := make([]uint32, 1<<12) + for i := range a { + a[i] = 0xffffffff + } + + // Clear array bit-by-bit in different goroutines. 
+ done := make(chan bool) + for i := 0; i < 32; i++ { + m := ^uint32(1 << i) + go func() { + for i := range a { + atomic.And(&a[i], m) + } + done <- true + }() + } + for i := 0; i < 32; i++ { + <-done + } + + // Check that the array has been totally cleared. + for i, v := range a { + if v != 0 { + t.Fatalf("a[%v] not cleared: want %#x, got %#x", i, uint32(0), v) + } + } +} + func TestOr8(t *testing.T) { // Basic sanity check. x := uint8(0) @@ -179,7 +225,43 @@ func TestOr8(t *testing.T) { } } -func TestBitwiseContended(t *testing.T) { +func TestOr(t *testing.T) { + // Basic sanity check. + x := uint32(0) + for i := uint32(0); i < 32; i++ { + atomic.Or(&x, 1<<i) + if r := (uint32(1) << (i + 1)) - 1; x != r { + t.Fatalf("setting bit %#x: want %#x, got %#x", uint32(1)<<i, r, x) + } + } + + // Start with every bit in array set to 0. + a := make([]uint32, 1<<12) + + // Set every bit in array bit-by-bit in different goroutines. + done := make(chan bool) + for i := 0; i < 32; i++ { + m := uint32(1 << i) + go func() { + for i := range a { + atomic.Or(&a[i], m) + } + done <- true + }() + } + for i := 0; i < 32; i++ { + <-done + } + + // Check that the array has been totally set. + for i, v := range a { + if v != 0xffffffff { + t.Fatalf("a[%v] not fully set: want %#x, got %#x", i, uint32(0xffffffff), v) + } + } +} + +func TestBitwiseContended8(t *testing.T) { // Start with every bit in array set to 0. a := make([]uint8, 16) @@ -221,6 +303,48 @@ func TestBitwiseContended(t *testing.T) { } } +func TestBitwiseContended(t *testing.T) { + // Start with every bit in array set to 0. + a := make([]uint32, 16) + + // Iterations to try. + N := 1 << 16 + if testing.Short() { + N = 1 << 10 + } + + // Set and then clear every bit in the array bit-by-bit in different goroutines. 
+ done := make(chan bool) + for i := 0; i < 32; i++ { + m := uint32(1 << i) + go func() { + for n := 0; n < N; n++ { + for i := range a { + atomic.Or(&a[i], m) + if atomic.Load(&a[i])&m != m { + t.Errorf("a[%v] bit %#x not set", i, m) + } + atomic.And(&a[i], ^m) + if atomic.Load(&a[i])&m != 0 { + t.Errorf("a[%v] bit %#x not clear", i, m) + } + } + } + done <- true + }() + } + for i := 0; i < 32; i++ { + <-done + } + + // Check that the array has been totally cleared. + for i, v := range a { + if v != 0 { + t.Fatalf("a[%v] not cleared: want %#x, got %#x", i, uint32(0), v) + } + } +} + func TestStorepNoWB(t *testing.T) { var p [2]*int for i := range p { diff --git a/src/runtime/internal/atomic/atomic_wasm.go b/src/runtime/internal/atomic/atomic_wasm.go index 2c0c3a8174..b05d98ed51 100644 --- a/src/runtime/internal/atomic/atomic_wasm.go +++ b/src/runtime/internal/atomic/atomic_wasm.go @@ -47,6 +47,18 @@ func LoadAcq(ptr *uint32) uint32 { //go:nosplit //go:noinline +func LoadAcq64(ptr *uint64) uint64 { + return *ptr +} + +//go:nosplit +//go:noinline +func LoadAcquintptr(ptr *uintptr) uintptr { + return *ptr +} + +//go:nosplit +//go:noinline func Load8(ptr *uint8) uint8 { return *ptr } @@ -121,6 +133,18 @@ func Or8(ptr *uint8, val uint8) { //go:nosplit //go:noinline +func And(ptr *uint32, val uint32) { + *ptr = *ptr & val +} + +//go:nosplit +//go:noinline +func Or(ptr *uint32, val uint32) { + *ptr = *ptr | val +} + +//go:nosplit +//go:noinline func Cas64(ptr *uint64, old, new uint64) bool { if *ptr == old { *ptr = new @@ -143,6 +167,18 @@ func StoreRel(ptr *uint32, val uint32) { //go:nosplit //go:noinline +func StoreRel64(ptr *uint64, val uint64) { + *ptr = val +} + +//go:nosplit +//go:noinline +func StoreReluintptr(ptr *uintptr, val uintptr) { + *ptr = val +} + +//go:nosplit +//go:noinline func Store8(ptr *uint8, val uint8) { *ptr = val } diff --git a/src/runtime/internal/atomic/bench_test.go b/src/runtime/internal/atomic/bench_test.go index de71b0f2c7..2476c06c52 
100644 --- a/src/runtime/internal/atomic/bench_test.go +++ b/src/runtime/internal/atomic/bench_test.go @@ -51,6 +51,14 @@ func BenchmarkAnd8(b *testing.B) { } } +func BenchmarkAnd(b *testing.B) { + var x [128]uint32 // give x its own cache line + sink = &x + for i := 0; i < b.N; i++ { + atomic.And(&x[63], uint32(i)) + } +} + func BenchmarkAnd8Parallel(b *testing.B) { var x [512]uint8 // give byte its own cache line sink = &x @@ -63,6 +71,18 @@ func BenchmarkAnd8Parallel(b *testing.B) { }) } +func BenchmarkAndParallel(b *testing.B) { + var x [128]uint32 // give x its own cache line + sink = &x + b.RunParallel(func(pb *testing.PB) { + i := uint32(0) + for pb.Next() { + atomic.And(&x[63], i) + i++ + } + }) +} + func BenchmarkOr8(b *testing.B) { var x [512]uint8 // give byte its own cache line sink = &x @@ -71,6 +91,14 @@ func BenchmarkOr8(b *testing.B) { } } +func BenchmarkOr(b *testing.B) { + var x [128]uint32 // give x its own cache line + sink = &x + for i := 0; i < b.N; i++ { + atomic.Or(&x[63], uint32(i)) + } +} + func BenchmarkOr8Parallel(b *testing.B) { var x [512]uint8 // give byte its own cache line sink = &x @@ -83,6 +111,18 @@ func BenchmarkOr8Parallel(b *testing.B) { }) } +func BenchmarkOrParallel(b *testing.B) { + var x [128]uint32 // give x its own cache line + sink = &x + b.RunParallel(func(pb *testing.PB) { + i := uint32(0) + for pb.Next() { + atomic.Or(&x[63], i) + i++ + } + }) +} + func BenchmarkXadd(b *testing.B) { var x uint32 ptr := &x @@ -102,3 +142,54 @@ func BenchmarkXadd64(b *testing.B) { } }) } + +func BenchmarkCas(b *testing.B) { + var x uint32 + x = 1 + ptr := &x + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + atomic.Cas(ptr, 1, 0) + atomic.Cas(ptr, 0, 1) + } + }) +} + +func BenchmarkCas64(b *testing.B) { + var x uint64 + x = 1 + ptr := &x + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + atomic.Cas64(ptr, 1, 0) + atomic.Cas64(ptr, 0, 1) + } + }) +} +func BenchmarkXchg(b *testing.B) { + var x uint32 + x = 1 + ptr := &x + 
b.RunParallel(func(pb *testing.PB) { + var y uint32 + y = 1 + for pb.Next() { + y = atomic.Xchg(ptr, y) + y += 1 + } + }) +} + +func BenchmarkXchg64(b *testing.B) { + var x uint64 + x = 1 + ptr := &x + b.RunParallel(func(pb *testing.PB) { + var y uint64 + y = 1 + for pb.Next() { + y = atomic.Xchg64(ptr, y) + y += 1 + } + }) +} diff --git a/src/runtime/internal/atomic/unaligned.go b/src/runtime/internal/atomic/unaligned.go new file mode 100644 index 0000000000..a859de4144 --- /dev/null +++ b/src/runtime/internal/atomic/unaligned.go @@ -0,0 +1,9 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package atomic + +func panicUnaligned() { + panic("unaligned 64-bit atomic operation") +} diff --git a/src/runtime/internal/sys/gengoos.go b/src/runtime/internal/sys/gengoos.go index 952b13649d..9bbc48d94f 100644 --- a/src/runtime/internal/sys/gengoos.go +++ b/src/runtime/internal/sys/gengoos.go @@ -9,8 +9,8 @@ package main import ( "bytes" "fmt" - "io/ioutil" "log" + "os" "strconv" "strings" ) @@ -18,7 +18,7 @@ import ( var gooses, goarches []string func main() { - data, err := ioutil.ReadFile("../../../go/build/syslist.go") + data, err := os.ReadFile("../../../go/build/syslist.go") if err != nil { log.Fatal(err) } @@ -44,6 +44,9 @@ func main() { } for _, target := range gooses { + if target == "nacl" { + continue + } var buf bytes.Buffer fmt.Fprintf(&buf, "// Code generated by gengoos.go using 'go generate'. 
DO NOT EDIT.\n\n") if target == "linux" { @@ -52,6 +55,9 @@ func main() { if target == "solaris" { fmt.Fprintf(&buf, "// +build !illumos\n") // must explicitly exclude illumos for solaris } + if target == "darwin" { + fmt.Fprintf(&buf, "// +build !ios\n") // must explicitly exclude ios for darwin + } fmt.Fprintf(&buf, "// +build %s\n\n", target) // must explicitly include target for bootstrapping purposes fmt.Fprintf(&buf, "package sys\n\n") fmt.Fprintf(&buf, "const GOOS = `%s`\n\n", target) @@ -62,13 +68,16 @@ func main() { } fmt.Fprintf(&buf, "const Goos%s = %d\n", strings.Title(goos), value) } - err := ioutil.WriteFile("zgoos_"+target+".go", buf.Bytes(), 0666) + err := os.WriteFile("zgoos_"+target+".go", buf.Bytes(), 0666) if err != nil { log.Fatal(err) } } for _, target := range goarches { + if target == "amd64p32" { + continue + } var buf bytes.Buffer fmt.Fprintf(&buf, "// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.\n\n") fmt.Fprintf(&buf, "// +build %s\n\n", target) // must explicitly include target for bootstrapping purposes @@ -81,7 +90,7 @@ func main() { } fmt.Fprintf(&buf, "const Goarch%s = %d\n", strings.Title(goarch), value) } - err := ioutil.WriteFile("zgoarch_"+target+".go", buf.Bytes(), 0666) + err := os.WriteFile("zgoarch_"+target+".go", buf.Bytes(), 0666) if err != nil { log.Fatal(err) } diff --git a/src/runtime/internal/sys/zgoos_aix.go b/src/runtime/internal/sys/zgoos_aix.go index d97485c43c..0631d02aa5 100644 --- a/src/runtime/internal/sys/zgoos_aix.go +++ b/src/runtime/internal/sys/zgoos_aix.go @@ -13,6 +13,7 @@ const GoosDragonfly = 0 const GoosFreebsd = 0 const GoosHurd = 0 const GoosIllumos = 0 +const GoosIos = 0 const GoosJs = 0 const GoosLinux = 0 const GoosNacl = 0 diff --git a/src/runtime/internal/sys/zgoos_android.go b/src/runtime/internal/sys/zgoos_android.go index eec970b064..d356a40bec 100644 --- a/src/runtime/internal/sys/zgoos_android.go +++ b/src/runtime/internal/sys/zgoos_android.go @@ -13,6 +13,7 @@ const 
GoosDragonfly = 0 const GoosFreebsd = 0 const GoosHurd = 0 const GoosIllumos = 0 +const GoosIos = 0 const GoosJs = 0 const GoosLinux = 0 const GoosNacl = 0 diff --git a/src/runtime/internal/sys/zgoos_darwin.go b/src/runtime/internal/sys/zgoos_darwin.go index c40819ee55..6aa2db7e3a 100644 --- a/src/runtime/internal/sys/zgoos_darwin.go +++ b/src/runtime/internal/sys/zgoos_darwin.go @@ -1,5 +1,6 @@ // Code generated by gengoos.go using 'go generate'. DO NOT EDIT. +// +build !ios // +build darwin package sys @@ -13,6 +14,7 @@ const GoosDragonfly = 0 const GoosFreebsd = 0 const GoosHurd = 0 const GoosIllumos = 0 +const GoosIos = 0 const GoosJs = 0 const GoosLinux = 0 const GoosNacl = 0 diff --git a/src/runtime/internal/sys/zgoos_dragonfly.go b/src/runtime/internal/sys/zgoos_dragonfly.go index 3dc4edcc31..88ee1174f1 100644 --- a/src/runtime/internal/sys/zgoos_dragonfly.go +++ b/src/runtime/internal/sys/zgoos_dragonfly.go @@ -13,6 +13,7 @@ const GoosDragonfly = 1 const GoosFreebsd = 0 const GoosHurd = 0 const GoosIllumos = 0 +const GoosIos = 0 const GoosJs = 0 const GoosLinux = 0 const GoosNacl = 0 diff --git a/src/runtime/internal/sys/zgoos_freebsd.go b/src/runtime/internal/sys/zgoos_freebsd.go index 6c98b342f9..8de2ec0559 100644 --- a/src/runtime/internal/sys/zgoos_freebsd.go +++ b/src/runtime/internal/sys/zgoos_freebsd.go @@ -13,6 +13,7 @@ const GoosDragonfly = 0 const GoosFreebsd = 1 const GoosHurd = 0 const GoosIllumos = 0 +const GoosIos = 0 const GoosJs = 0 const GoosLinux = 0 const GoosNacl = 0 diff --git a/src/runtime/internal/sys/zgoos_hurd.go b/src/runtime/internal/sys/zgoos_hurd.go index d6dcc7bad4..183ccb02a1 100644 --- a/src/runtime/internal/sys/zgoos_hurd.go +++ b/src/runtime/internal/sys/zgoos_hurd.go @@ -13,6 +13,7 @@ const GoosDragonfly = 0 const GoosFreebsd = 0 const GoosHurd = 1 const GoosIllumos = 0 +const GoosIos = 0 const GoosJs = 0 const GoosLinux = 0 const GoosNacl = 0 diff --git a/src/runtime/internal/sys/zgoos_illumos.go 
b/src/runtime/internal/sys/zgoos_illumos.go index 17f4ecc34e..d04134e1df 100644 --- a/src/runtime/internal/sys/zgoos_illumos.go +++ b/src/runtime/internal/sys/zgoos_illumos.go @@ -13,6 +13,7 @@ const GoosDragonfly = 0 const GoosFreebsd = 0 const GoosHurd = 0 const GoosIllumos = 1 +const GoosIos = 0 const GoosJs = 0 const GoosLinux = 0 const GoosNacl = 0 diff --git a/src/runtime/internal/sys/zgoos_ios.go b/src/runtime/internal/sys/zgoos_ios.go new file mode 100644 index 0000000000..cf6e9d61f0 --- /dev/null +++ b/src/runtime/internal/sys/zgoos_ios.go @@ -0,0 +1,25 @@ +// Code generated by gengoos.go using 'go generate'. DO NOT EDIT. + +// +build ios + +package sys + +const GOOS = `ios` + +const GoosAix = 0 +const GoosAndroid = 0 +const GoosDarwin = 0 +const GoosDragonfly = 0 +const GoosFreebsd = 0 +const GoosHurd = 0 +const GoosIllumos = 0 +const GoosIos = 1 +const GoosJs = 0 +const GoosLinux = 0 +const GoosNacl = 0 +const GoosNetbsd = 0 +const GoosOpenbsd = 0 +const GoosPlan9 = 0 +const GoosSolaris = 0 +const GoosWindows = 0 +const GoosZos = 0 diff --git a/src/runtime/internal/sys/zgoos_js.go b/src/runtime/internal/sys/zgoos_js.go index 74c9943d9b..1d9279ab38 100644 --- a/src/runtime/internal/sys/zgoos_js.go +++ b/src/runtime/internal/sys/zgoos_js.go @@ -13,6 +13,7 @@ const GoosDragonfly = 0 const GoosFreebsd = 0 const GoosHurd = 0 const GoosIllumos = 0 +const GoosIos = 0 const GoosJs = 1 const GoosLinux = 0 const GoosNacl = 0 diff --git a/src/runtime/internal/sys/zgoos_linux.go b/src/runtime/internal/sys/zgoos_linux.go index 1d5fcb0685..0f718d704f 100644 --- a/src/runtime/internal/sys/zgoos_linux.go +++ b/src/runtime/internal/sys/zgoos_linux.go @@ -14,6 +14,7 @@ const GoosDragonfly = 0 const GoosFreebsd = 0 const GoosHurd = 0 const GoosIllumos = 0 +const GoosIos = 0 const GoosJs = 0 const GoosLinux = 1 const GoosNacl = 0 diff --git a/src/runtime/internal/sys/zgoos_netbsd.go b/src/runtime/internal/sys/zgoos_netbsd.go index 194fa6e432..2ae149ff13 100644 --- 
a/src/runtime/internal/sys/zgoos_netbsd.go +++ b/src/runtime/internal/sys/zgoos_netbsd.go @@ -13,6 +13,7 @@ const GoosDragonfly = 0 const GoosFreebsd = 0 const GoosHurd = 0 const GoosIllumos = 0 +const GoosIos = 0 const GoosJs = 0 const GoosLinux = 0 const GoosNacl = 0 diff --git a/src/runtime/internal/sys/zgoos_openbsd.go b/src/runtime/internal/sys/zgoos_openbsd.go index 2108691679..7d4d61e4ca 100644 --- a/src/runtime/internal/sys/zgoos_openbsd.go +++ b/src/runtime/internal/sys/zgoos_openbsd.go @@ -13,6 +13,7 @@ const GoosDragonfly = 0 const GoosFreebsd = 0 const GoosHurd = 0 const GoosIllumos = 0 +const GoosIos = 0 const GoosJs = 0 const GoosLinux = 0 const GoosNacl = 0 diff --git a/src/runtime/internal/sys/zgoos_plan9.go b/src/runtime/internal/sys/zgoos_plan9.go index e632a90b2e..30aec46df3 100644 --- a/src/runtime/internal/sys/zgoos_plan9.go +++ b/src/runtime/internal/sys/zgoos_plan9.go @@ -13,6 +13,7 @@ const GoosDragonfly = 0 const GoosFreebsd = 0 const GoosHurd = 0 const GoosIllumos = 0 +const GoosIos = 0 const GoosJs = 0 const GoosLinux = 0 const GoosNacl = 0 diff --git a/src/runtime/internal/sys/zgoos_solaris.go b/src/runtime/internal/sys/zgoos_solaris.go index 67b2ffbfcd..4bb8c99bce 100644 --- a/src/runtime/internal/sys/zgoos_solaris.go +++ b/src/runtime/internal/sys/zgoos_solaris.go @@ -14,6 +14,7 @@ const GoosDragonfly = 0 const GoosFreebsd = 0 const GoosHurd = 0 const GoosIllumos = 0 +const GoosIos = 0 const GoosJs = 0 const GoosLinux = 0 const GoosNacl = 0 diff --git a/src/runtime/internal/sys/zgoos_windows.go b/src/runtime/internal/sys/zgoos_windows.go index cf2d6f4fb0..d1f4290204 100644 --- a/src/runtime/internal/sys/zgoos_windows.go +++ b/src/runtime/internal/sys/zgoos_windows.go @@ -13,6 +13,7 @@ const GoosDragonfly = 0 const GoosFreebsd = 0 const GoosHurd = 0 const GoosIllumos = 0 +const GoosIos = 0 const GoosJs = 0 const GoosLinux = 0 const GoosNacl = 0 diff --git a/src/runtime/internal/sys/zgoos_zos.go b/src/runtime/internal/sys/zgoos_zos.go 
index e5d79accb4..d22be46fc0 100644 --- a/src/runtime/internal/sys/zgoos_zos.go +++ b/src/runtime/internal/sys/zgoos_zos.go @@ -13,6 +13,7 @@ const GoosDragonfly = 0 const GoosFreebsd = 0 const GoosHurd = 0 const GoosIllumos = 0 +const GoosIos = 0 const GoosJs = 0 const GoosLinux = 0 const GoosNacl = 0 diff --git a/src/runtime/lockrank.go b/src/runtime/lockrank.go index 042f10b1d3..b3c01ba104 100644 --- a/src/runtime/lockrank.go +++ b/src/runtime/lockrank.go @@ -41,12 +41,12 @@ const ( lockRankCpuprof lockRankSweep + lockRankPollDesc lockRankSched lockRankDeadlock lockRankPanic lockRankAllg lockRankAllp - lockRankPollDesc lockRankTimers // Multiple timers locked simultaneously in destroy() lockRankItab @@ -120,12 +120,12 @@ var lockNames = []string{ lockRankCpuprof: "cpuprof", lockRankSweep: "sweep", + lockRankPollDesc: "pollDesc", lockRankSched: "sched", lockRankDeadlock: "deadlock", lockRankPanic: "panic", lockRankAllg: "allg", lockRankAllp: "allp", - lockRankPollDesc: "pollDesc", lockRankTimers: "timers", lockRankItab: "itab", @@ -182,14 +182,14 @@ func (rank lockRank) String() string { return lockNames[rank] } -// lockPartialOrder is a partial order among the various lock types, listing the immediate -// ordering that has actually been observed in the runtime. Each entry (which -// corresponds to a particular lock rank) specifies the list of locks that can be -// already be held immediately "above" it. +// lockPartialOrder is a partial order among the various lock types, listing the +// immediate ordering that has actually been observed in the runtime. Each entry +// (which corresponds to a particular lock rank) specifies the list of locks +// that can already be held immediately "above" it. // -// So, for example, the lockRankSched entry shows that all the locks preceding it in -// rank can actually be held. The fin lock shows that only the sched, timers, or -// hchan lock can be held immediately above it when it is acquired. 
+// So, for example, the lockRankSched entry shows that all the locks preceding +// it in rank can actually be held. The allp lock shows that only the sysmon or +// sched lock can be held immediately above it when it is acquired. var lockPartialOrder [][]lockRank = [][]lockRank{ lockRankDummy: {}, lockRankSysmon: {}, @@ -199,12 +199,12 @@ var lockPartialOrder [][]lockRank = [][]lockRank{ lockRankAssistQueue: {}, lockRankCpuprof: {}, lockRankSweep: {}, - lockRankSched: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep}, + lockRankPollDesc: {}, + lockRankSched: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc}, lockRankDeadlock: {lockRankDeadlock}, lockRankPanic: {lockRankDeadlock}, lockRankAllg: {lockRankSysmon, lockRankSched, lockRankPanic}, lockRankAllp: {lockRankSysmon, lockRankSched}, - lockRankPollDesc: {}, lockRankTimers: {lockRankSysmon, lockRankScavenge, lockRankSched, lockRankAllp, lockRankPollDesc, lockRankTimers}, lockRankItab: {}, lockRankReflectOffs: {lockRankItab}, @@ -213,7 +213,7 @@ var lockPartialOrder [][]lockRank = [][]lockRank{ lockRankNotifyList: {}, lockRankTraceBuf: {lockRankSysmon, lockRankScavenge}, lockRankTraceStrings: {lockRankTraceBuf}, - lockRankMspanSpecial: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings}, + lockRankMspanSpecial: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings}, lockRankProf: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, 
lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan}, lockRankGcBitsArenas: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan}, lockRankRoot: {}, @@ -222,16 +222,16 @@ var lockPartialOrder [][]lockRank = [][]lockRank{ lockRankNetpollInit: {lockRankTimers}, lockRankRwmutexW: {}, - lockRankRwmutexR: {lockRankRwmutexW}, + lockRankRwmutexR: {lockRankSysmon, lockRankRwmutexW}, - lockRankSpanSetSpine: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan}, + lockRankSpanSetSpine: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan}, lockRankGscan: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot, lockRankNotifyList, lockRankProf, lockRankGcBitsArenas, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankSpanSetSpine}, lockRankStackpool: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, 
lockRankTraceBuf, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankRwmutexR, lockRankSpanSetSpine, lockRankGscan}, lockRankStackLarge: {lockRankSysmon, lockRankAssistQueue, lockRankSched, lockRankItab, lockRankHchan, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankSpanSetSpine, lockRankGscan}, lockRankDefer: {}, lockRankSudog: {lockRankNotifyList, lockRankHchan}, lockRankWbufSpans: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankAllg, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankRoot, lockRankGscan, lockRankDefer, lockRankSudog}, - lockRankMheap: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan, lockRankMspanSpecial, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans, lockRankSpanSetSpine}, + lockRankMheap: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankFin, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan, lockRankMspanSpecial, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans, lockRankSpanSetSpine}, lockRankMheapSpecial: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, 
lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan}, lockRankGlobalAlloc: {lockRankProf, lockRankSpanSetSpine, lockRankMheap, lockRankMheapSpecial}, diff --git a/src/runtime/lockrank_off.go b/src/runtime/lockrank_off.go index 32378a9627..7dcd8f5fe9 100644 --- a/src/runtime/lockrank_off.go +++ b/src/runtime/lockrank_off.go @@ -18,29 +18,47 @@ func getLockRank(l *mutex) lockRank { return 0 } -// The following functions may be called in nosplit context. -// Nosplit is not strictly required for lockWithRank, unlockWithRank -// and lockWithRankMayAcquire, but these nosplit annotations must -// be kept consistent with the equivalent functions in lockrank_on.go. - -//go:nosplit func lockWithRank(l *mutex, rank lockRank) { lock2(l) } +// This function may be called in nosplit context and thus must be nosplit. //go:nosplit func acquireLockRank(rank lockRank) { } -//go:nosplit func unlockWithRank(l *mutex) { unlock2(l) } +// This function may be called in nosplit context and thus must be nosplit. //go:nosplit func releaseLockRank(rank lockRank) { } -//go:nosplit func lockWithRankMayAcquire(l *mutex, rank lockRank) { } + +//go:nosplit +func assertLockHeld(l *mutex) { +} + +//go:nosplit +func assertRankHeld(r lockRank) { +} + +//go:nosplit +func worldStopped() { +} + +//go:nosplit +func worldStarted() { +} + +//go:nosplit +func assertWorldStopped() { +} + +//go:nosplit +func assertWorldStoppedOrLockHeld(l *mutex) { +} diff --git a/src/runtime/lockrank_on.go b/src/runtime/lockrank_on.go index fbc5ff58b7..88ac95a004 100644 --- a/src/runtime/lockrank_on.go +++ b/src/runtime/lockrank_on.go @@ -7,9 +7,14 @@ package runtime import ( + "runtime/internal/atomic" "unsafe" ) +// worldIsStopped is accessed atomically to track world-stops. 1 == world +// stopped. 
+var worldIsStopped uint32 + // lockRankStruct is embedded in mutex type lockRankStruct struct { // static lock ranking of the lock @@ -40,15 +45,19 @@ func getLockRank(l *mutex) lockRank { return l.rank } -// The following functions are the entry-points to record lock -// operations. -// All of these are nosplit and switch to the system stack immediately -// to avoid stack growths. Since a stack growth could itself have lock -// operations, this prevents re-entrant calls. - // lockWithRank is like lock(l), but allows the caller to specify a lock rank // when acquiring a non-static lock. -//go:nosplit +// +// Note that we need to be careful about stack splits: +// +// This function is not nosplit, thus it may split at function entry. This may +// introduce a new edge in the lock order, but it is no different from any +// other (nosplit) call before this call (including the call to lock() itself). +// +// However, we switch to the systemstack to record the lock held to ensure that +// we record an accurate lock ordering. e.g., without systemstack, a stack +// split on entry to lock2() would record stack split locks as taken after l, +// even though l is not actually locked yet. func lockWithRank(l *mutex, rank lockRank) { if l == &debuglock || l == &paniclk { // debuglock is only used for println/printlock(). Don't do lock @@ -86,11 +95,26 @@ func lockWithRank(l *mutex, rank lockRank) { }) } +// nosplit to ensure it can be called in as many contexts as possible. +//go:nosplit +func printHeldLocks(gp *g) { + if gp.m.locksHeldLen == 0 { + println("<none>") + return + } + + for j, held := range gp.m.locksHeld[:gp.m.locksHeldLen] { + println(j, ":", held.rank.String(), held.rank, unsafe.Pointer(gp.m.locksHeld[j].lockAddr)) + } +} + // acquireLockRank acquires a rank which is not associated with a mutex lock +// +// This function may be called in nosplit context and thus must be nosplit. 
//go:nosplit func acquireLockRank(rank lockRank) { gp := getg() - // Log the new class. + // Log the new class. See comment on lockWithRank. systemstack(func() { i := gp.m.locksHeldLen if i >= len(gp.m.locksHeld) { @@ -109,6 +133,8 @@ func acquireLockRank(rank lockRank) { // checkRanks checks if goroutine g, which has mostly recently acquired a lock // with rank 'prevRank', can now acquire a lock with rank 'rank'. +// +//go:systemstack func checkRanks(gp *g, prevRank, rank lockRank) { rankOK := false if rank < prevRank { @@ -135,14 +161,12 @@ func checkRanks(gp *g, prevRank, rank lockRank) { if !rankOK { printlock() println(gp.m.procid, " ======") - for j, held := range gp.m.locksHeld[:gp.m.locksHeldLen] { - println(j, ":", held.rank.String(), held.rank, unsafe.Pointer(gp.m.locksHeld[j].lockAddr)) - } + printHeldLocks(gp) throw("lock ordering problem") } } -//go:nosplit +// See comment on lockWithRank regarding stack splitting. func unlockWithRank(l *mutex) { if l == &debuglock || l == &paniclk { // See comment at beginning of lockWithRank. @@ -169,6 +193,8 @@ func unlockWithRank(l *mutex) { } // releaseLockRank releases a rank which is not associated with a mutex lock +// +// This function may be called in nosplit context and thus must be nosplit. //go:nosplit func releaseLockRank(rank lockRank) { gp := getg() @@ -189,7 +215,7 @@ func releaseLockRank(rank lockRank) { }) } -//go:nosplit +// See comment on lockWithRank regarding stack splitting. func lockWithRankMayAcquire(l *mutex, rank lockRank) { gp := getg() if gp.m.locksHeldLen == 0 { @@ -212,3 +238,146 @@ func lockWithRankMayAcquire(l *mutex, rank lockRank) { gp.m.locksHeldLen-- }) } + +// nosplit to ensure it can be called in as many contexts as possible. 
+//go:nosplit +func checkLockHeld(gp *g, l *mutex) bool { + for i := gp.m.locksHeldLen - 1; i >= 0; i-- { + if gp.m.locksHeld[i].lockAddr == uintptr(unsafe.Pointer(l)) { + return true + } + } + return false +} + +// assertLockHeld throws if l is not held by the caller. +// +// nosplit to ensure it can be called in as many contexts as possible. +//go:nosplit +func assertLockHeld(l *mutex) { + gp := getg() + + held := checkLockHeld(gp, l) + if held { + return + } + + // Crash from system stack to avoid splits that may cause + // additional issues. + systemstack(func() { + printlock() + print("caller requires lock ", l, " (rank ", l.rank.String(), "), holding:\n") + printHeldLocks(gp) + throw("not holding required lock!") + }) +} + +// assertRankHeld throws if a mutex with rank r is not held by the caller. +// +// This is less precise than assertLockHeld, but can be used in places where a +// pointer to the exact mutex is not available. +// +// nosplit to ensure it can be called in as many contexts as possible. +//go:nosplit +func assertRankHeld(r lockRank) { + gp := getg() + + for i := gp.m.locksHeldLen - 1; i >= 0; i-- { + if gp.m.locksHeld[i].rank == r { + return + } + } + + // Crash from system stack to avoid splits that may cause + // additional issues. + systemstack(func() { + printlock() + print("caller requires lock with rank ", r.String(), "), holding:\n") + printHeldLocks(gp) + throw("not holding required lock!") + }) +} + +// worldStopped notes that the world is stopped. +// +// Caller must hold worldsema. +// +// nosplit to ensure it can be called in as many contexts as possible. +//go:nosplit +func worldStopped() { + if stopped := atomic.Xadd(&worldIsStopped, 1); stopped != 1 { + systemstack(func() { + print("world stop count=", stopped, "\n") + throw("recursive world stop") + }) + } +} + +// worldStarted that the world is starting. +// +// Caller must hold worldsema. +// +// nosplit to ensure it can be called in as many contexts as possible. 
+//go:nosplit +func worldStarted() { + if stopped := atomic.Xadd(&worldIsStopped, -1); stopped != 0 { + systemstack(func() { + print("world stop count=", stopped, "\n") + throw("released non-stopped world stop") + }) + } +} + +// nosplit to ensure it can be called in as many contexts as possible. +//go:nosplit +func checkWorldStopped() bool { + stopped := atomic.Load(&worldIsStopped) + if stopped > 1 { + systemstack(func() { + print("inconsistent world stop count=", stopped, "\n") + throw("inconsistent world stop count") + }) + } + + return stopped == 1 +} + +// assertWorldStopped throws if the world is not stopped. It does not check +// which M stopped the world. +// +// nosplit to ensure it can be called in as many contexts as possible. +//go:nosplit +func assertWorldStopped() { + if checkWorldStopped() { + return + } + + throw("world not stopped") +} + +// assertWorldStoppedOrLockHeld throws if the world is not stopped and the +// passed lock is not held. +// +// nosplit to ensure it can be called in as many contexts as possible. +//go:nosplit +func assertWorldStoppedOrLockHeld(l *mutex) { + if checkWorldStopped() { + return + } + + gp := getg() + held := checkLockHeld(gp, l) + if held { + return + } + + // Crash from system stack to avoid splits that may cause + // additional issues. + systemstack(func() { + printlock() + print("caller requires world stop or lock ", l, " (rank ", l.rank.String(), "), holding:\n") + println("<no world stop>") + printHeldLocks(gp) + throw("no world stop or required lock!") + }) +} diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index e46327f9ce..f20ded5bf7 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -198,7 +198,7 @@ const ( // mips32 only has access to the low 2GB of virtual memory, so // we further limit it to 31 bits. // - // On darwin/arm64, although 64-bit pointers are presumably + // On ios/arm64, although 64-bit pointers are presumably // available, pointers are truncated to 33 bits. 
Furthermore, // only the top 4 GiB of the address space are actually available // to the application, but we allow the whole 33 bits anyway for @@ -207,7 +207,7 @@ const ( // arenaBaseOffset to offset into the top 4 GiB. // // WebAssembly currently has a limit of 4GB linear memory. - heapAddrBits = (_64bit*(1-sys.GoarchWasm)*(1-sys.GoosDarwin*sys.GoarchArm64))*48 + (1-_64bit+sys.GoarchWasm)*(32-(sys.GoarchMips+sys.GoarchMipsle)) + 33*sys.GoosDarwin*sys.GoarchArm64 + heapAddrBits = (_64bit*(1-sys.GoarchWasm)*(1-sys.GoosIos*sys.GoarchArm64))*48 + (1-_64bit+sys.GoarchWasm)*(32-(sys.GoarchMips+sys.GoarchMipsle)) + 33*sys.GoosIos*sys.GoarchArm64 // maxAlloc is the maximum size of an allocation. On 64-bit, // it's theoretically possible to allocate 1<<heapAddrBits bytes. On @@ -514,14 +514,22 @@ func mallocinit() { // However, on arm64, we ignore all this advice above and slam the // allocation at 0x40 << 32 because when using 4k pages with 3-level // translation buffers, the user address space is limited to 39 bits - // On darwin/arm64, the address space is even smaller. + // On ios/arm64, the address space is even smaller. // // On AIX, mmaps starts at 0x0A00000000000000 for 64-bit. // processes. for i := 0x7f; i >= 0; i-- { var p uintptr switch { - case GOARCH == "arm64" && GOOS == "darwin": + case raceenabled: + // The TSAN runtime requires the heap + // to be in the range [0x00c000000000, + // 0x00e000000000). + p = uintptr(i)<<32 | uintptrMask&(0x00c0<<32) + if p >= uintptrMask&0x00e000000000 { + continue + } + case GOARCH == "arm64" && GOOS == "ios": p = uintptr(i)<<40 | uintptrMask&(0x0013<<28) case GOARCH == "arm64": p = uintptr(i)<<40 | uintptrMask&(0x0040<<32) @@ -532,14 +540,6 @@ func mallocinit() { continue } p = uintptr(i)<<40 | uintptrMask&(0xa0<<52) - case raceenabled: - // The TSAN runtime requires the heap - // to be in the range [0x00c000000000, - // 0x00e000000000). 
- p = uintptr(i)<<32 | uintptrMask&(0x00c0<<32) - if p >= uintptrMask&0x00e000000000 { - continue - } default: p = uintptr(i)<<40 | uintptrMask&(0x00c0<<32) } @@ -627,6 +627,8 @@ func mallocinit() { // // h must be locked. func (h *mheap) sysAlloc(n uintptr) (v unsafe.Pointer, size uintptr) { + assertLockHeld(&h.lock) + n = alignUp(n, heapArenaBytes) // First, try the arena pre-reservation. @@ -743,9 +745,9 @@ mapped: throw("arena already initialized") } var r *heapArena - r = (*heapArena)(h.heapArenaAlloc.alloc(unsafe.Sizeof(*r), sys.PtrSize, &memstats.gc_sys)) + r = (*heapArena)(h.heapArenaAlloc.alloc(unsafe.Sizeof(*r), sys.PtrSize, &memstats.gcMiscSys)) if r == nil { - r = (*heapArena)(persistentalloc(unsafe.Sizeof(*r), sys.PtrSize, &memstats.gc_sys)) + r = (*heapArena)(persistentalloc(unsafe.Sizeof(*r), sys.PtrSize, &memstats.gcMiscSys)) if r == nil { throw("out of memory allocating heap arena metadata") } @@ -757,7 +759,7 @@ mapped: if size == 0 { size = physPageSize } - newArray := (*notInHeap)(persistentalloc(size, sys.PtrSize, &memstats.gc_sys)) + newArray := (*notInHeap)(persistentalloc(size, sys.PtrSize, &memstats.gcMiscSys)) if newArray == nil { throw("out of memory allocating allArenas") } @@ -909,27 +911,34 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { return unsafe.Pointer(&zerobase) } - if debug.sbrk != 0 { - align := uintptr(16) - if typ != nil { - // TODO(austin): This should be just - // align = uintptr(typ.align) - // but that's only 4 on 32-bit platforms, - // even if there's a uint64 field in typ (see #599). - // This causes 64-bit atomic accesses to panic. - // Hence, we use stricter alignment that matches - // the normal allocator better. 
- if size&7 == 0 { - align = 8 - } else if size&3 == 0 { - align = 4 - } else if size&1 == 0 { - align = 2 - } else { - align = 1 + if debug.malloc { + if debug.sbrk != 0 { + align := uintptr(16) + if typ != nil { + // TODO(austin): This should be just + // align = uintptr(typ.align) + // but that's only 4 on 32-bit platforms, + // even if there's a uint64 field in typ (see #599). + // This causes 64-bit atomic accesses to panic. + // Hence, we use stricter alignment that matches + // the normal allocator better. + if size&7 == 0 { + align = 8 + } else if size&3 == 0 { + align = 4 + } else if size&1 == 0 { + align = 2 + } else { + align = 1 + } } + return persistentalloc(size, align, &memstats.other_sys) + } + + if inittrace.active && inittrace.id == getg().goid { + // Init functions are executed sequentially in a single Go routine. + inittrace.allocs += 1 } - return persistentalloc(size, align, &memstats.other_sys) } // assistG is the G to charge for this allocation, or nil if @@ -965,18 +974,9 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { shouldhelpgc := false dataSize := size - var c *mcache - if mp.p != 0 { - c = mp.p.ptr().mcache - } else { - // We will be called without a P while bootstrapping, - // in which case we use mcache0, which is set in mallocinit. - // mcache0 is cleared when bootstrapping is complete, - // by procresize. - c = mcache0 - if c == nil { - throw("malloc called with no P") - } + c := getMCache() + if c == nil { + throw("mallocgc called without a P or outside bootstrapping") } var span *mspan var x unsafe.Pointer @@ -1016,6 +1016,14 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { // Align tiny pointer for required (conservative) alignment. 
if size&7 == 0 { off = alignUp(off, 8) + } else if sys.PtrSize == 4 && size == 12 { + // Conservatively align 12-byte objects to 8 bytes on 32-bit + // systems so that objects whose first field is a 64-bit + // value is aligned to 8 bytes and does not cause a fault on + // atomic access. See issue 37262. + // TODO(mknyszek): Remove this workaround if/when issue 36606 + // is resolved. + off = alignUp(off, 8) } else if size&3 == 0 { off = alignUp(off, 4) } else if size&1 == 0 { @@ -1025,7 +1033,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { // The object fits into existing tiny block. x = unsafe.Pointer(c.tiny + off) c.tinyoffset = off + size - c.local_tinyallocs++ + c.tinyAllocs++ mp.mallocing = 0 releasem(mp) return x @@ -1067,9 +1075,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { } } else { shouldhelpgc = true - systemstack(func() { - span = largeAlloc(size, needzero, noscan) - }) + span = c.allocLarge(size, needzero, noscan) span.freeindex = 1 span.allocCount = 1 x = unsafe.Pointer(span.base()) @@ -1098,7 +1104,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { } else { scanSize = typ.ptrdata } - c.local_scan += scanSize + c.scanAlloc += scanSize } // Ensure that the stores above that initialize x to @@ -1128,13 +1134,20 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { mp.mallocing = 0 releasem(mp) - if debug.allocfreetrace != 0 { - tracealloc(x, size, typ) + if debug.malloc { + if debug.allocfreetrace != 0 { + tracealloc(x, size, typ) + } + + if inittrace.active && inittrace.id == getg().goid { + // Init functions are executed sequentially in a single Go routine. 
+ inittrace.bytes += uint64(size) + } } if rate := MemProfileRate; rate > 0 { - if rate != 1 && size < c.next_sample { - c.next_sample -= size + if rate != 1 && size < c.nextSample { + c.nextSample -= size } else { mp := acquirem() profilealloc(mp, x, size) @@ -1157,35 +1170,6 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { return x } -func largeAlloc(size uintptr, needzero bool, noscan bool) *mspan { - // print("largeAlloc size=", size, "\n") - - if size+_PageSize < size { - throw("out of memory") - } - npages := size >> _PageShift - if size&_PageMask != 0 { - npages++ - } - - // Deduct credit for this span allocation and sweep if - // necessary. mHeap_Alloc will also sweep npages, so this only - // pays the debt down to npage pages. - deductSweepCredit(npages*_PageSize, npages) - - spc := makeSpanClass(0, noscan) - s := mheap_.alloc(npages, spc, needzero) - if s == nil { - throw("out of memory") - } - // Put the large span in the mcentral swept list so that it's - // visible to the background sweeper. 
- mheap_.central[spc].mcentral.fullSwept(mheap_.sweepgen).push(s) - s.limit = s.base() + size - heapBitsForAddr(s.base()).initSpan(s) - return s -} - // implementation of new builtin // compiler (both frontend and SSA backend) knows the signature // of this function @@ -1221,16 +1205,11 @@ func reflect_unsafe_NewArray(typ *_type, n int) unsafe.Pointer { } func profilealloc(mp *m, x unsafe.Pointer, size uintptr) { - var c *mcache - if mp.p != 0 { - c = mp.p.ptr().mcache - } else { - c = mcache0 - if c == nil { - throw("profilealloc called with no P") - } + c := getMCache() + if c == nil { + throw("profilealloc called without a P or outside bootstrapping") } - c.next_sample = nextSample() + c.nextSample = nextSample() mProf_Malloc(x, size) } @@ -1242,6 +1221,13 @@ func profilealloc(mp *m, x unsafe.Pointer, size uintptr) { // distribution (exp(MemProfileRate)), so the best return value is a random // number taken from an exponential distribution whose mean is MemProfileRate. func nextSample() uintptr { + if MemProfileRate == 1 { + // Callers assign our return value to + // mcache.next_sample, but next_sample is not used + // when the rate is 1. So avoid the math below and + // just return something. + return 0 + } if GOOS == "plan9" { // Plan 9 doesn't support floating point in note handler. if g := getg(); g == g.m.gsignal { @@ -1322,7 +1308,7 @@ var persistentChunks *notInHeap // The returned memory will be zeroed. // // Consider marking persistentalloc'd types go:notinheap. -func persistentalloc(size, align uintptr, sysStat *uint64) unsafe.Pointer { +func persistentalloc(size, align uintptr, sysStat *sysMemStat) unsafe.Pointer { var p *notInHeap systemstack(func() { p = persistentalloc1(size, align, sysStat) @@ -1333,7 +1319,7 @@ func persistentalloc(size, align uintptr, sysStat *uint64) unsafe.Pointer { // Must run on system stack because stack growth can (re)invoke it. // See issue 9174. 
//go:systemstack -func persistentalloc1(size, align uintptr, sysStat *uint64) *notInHeap { +func persistentalloc1(size, align uintptr, sysStat *sysMemStat) *notInHeap { const ( maxBlock = 64 << 10 // VM reservation granularity is 64K on windows ) @@ -1392,8 +1378,8 @@ func persistentalloc1(size, align uintptr, sysStat *uint64) *notInHeap { } if sysStat != &memstats.other_sys { - mSysStatInc(sysStat, size) - mSysStatDec(&memstats.other_sys, size) + sysStat.add(int64(size)) + memstats.other_sys.add(-int64(size)) } return p } @@ -1434,7 +1420,7 @@ func (l *linearAlloc) init(base, size uintptr) { l.end = base + size } -func (l *linearAlloc) alloc(size, align uintptr, sysStat *uint64) unsafe.Pointer { +func (l *linearAlloc) alloc(size, align uintptr, sysStat *sysMemStat) unsafe.Pointer { p := alignUp(l.next, align) if p+size > l.end { return nil diff --git a/src/runtime/malloc_test.go b/src/runtime/malloc_test.go index 5c97f548fd..4ba94d0494 100644 --- a/src/runtime/malloc_test.go +++ b/src/runtime/malloc_test.go @@ -12,8 +12,10 @@ import ( "os" "os/exec" "reflect" + "runtime" . "runtime" "strings" + "sync/atomic" "testing" "time" "unsafe" @@ -168,6 +170,61 @@ func TestTinyAlloc(t *testing.T) { } } +var ( + tinyByteSink *byte + tinyUint32Sink *uint32 + tinyObj12Sink *obj12 +) + +type obj12 struct { + a uint64 + b uint32 +} + +func TestTinyAllocIssue37262(t *testing.T) { + // Try to cause an alignment access fault + // by atomically accessing the first 64-bit + // value of a tiny-allocated object. + // See issue 37262 for details. + + // GC twice, once to reach a stable heap state + // and again to make sure we finish the sweep phase. + runtime.GC() + runtime.GC() + + // Make 1-byte allocations until we get a fresh tiny slot. 
+ aligned := false + for i := 0; i < 16; i++ { + tinyByteSink = new(byte) + if uintptr(unsafe.Pointer(tinyByteSink))&0xf == 0xf { + aligned = true + break + } + } + if !aligned { + t.Fatal("unable to get a fresh tiny slot") + } + + // Create a 4-byte object so that the current + // tiny slot is partially filled. + tinyUint32Sink = new(uint32) + + // Create a 12-byte object, which fits into the + // tiny slot. If it actually gets place there, + // then the field "a" will be improperly aligned + // for atomic access on 32-bit architectures. + // This won't be true if issue 36606 gets resolved. + tinyObj12Sink = new(obj12) + + // Try to atomically access "x.a". + atomic.StoreUint64(&tinyObj12Sink.a, 10) + + // Clear the sinks. + tinyByteSink = nil + tinyUint32Sink = nil + tinyObj12Sink = nil +} + func TestPageCacheLeak(t *testing.T) { defer GOMAXPROCS(GOMAXPROCS(1)) leaked := PageCachePagesLeaked() diff --git a/src/runtime/map.go b/src/runtime/map.go index 8be1d3991d..0beff57a1a 100644 --- a/src/runtime/map.go +++ b/src/runtime/map.go @@ -162,8 +162,8 @@ type bmap struct { // If you modify hiter, also change cmd/compile/internal/gc/reflect.go to indicate // the layout of this structure. type hiter struct { - key unsafe.Pointer // Must be in first position. Write nil to indicate iteration end (see cmd/internal/gc/range.go). - elem unsafe.Pointer // Must be in second position (see cmd/internal/gc/range.go). + key unsafe.Pointer // Must be in first position. Write nil to indicate iteration end (see cmd/compile/internal/gc/range.go). + elem unsafe.Pointer // Must be in second position (see cmd/compile/internal/gc/range.go). 
t *maptype h *hmap buckets unsafe.Pointer // bucket ptr at hash_iter initialization time @@ -599,7 +599,7 @@ again: if h.growing() { growWork(t, h, bucket) } - b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) top := tophash(hash) var inserti *uint8 @@ -650,7 +650,7 @@ bucketloop: } if inserti == nil { - // all current buckets are full, allocate a new one. + // The current bucket and all the overflow buckets connected to it are full, allocate a new one. newb := h.newoverflow(t, b) inserti = &newb.tophash[0] insertk = add(unsafe.Pointer(newb), dataOffset) @@ -1380,5 +1380,5 @@ func reflectlite_maplen(h *hmap) int { return h.count } -const maxZero = 1024 // must match value in cmd/compile/internal/gc/walk.go:zeroValSize +const maxZero = 1024 // must match value in reflect/value.go:maxZero cmd/compile/internal/gc/walk.go:zeroValSize var zeroVal [maxZero]byte diff --git a/src/runtime/map_fast32.go b/src/runtime/map_fast32.go index d80f5eac78..8d52dad217 100644 --- a/src/runtime/map_fast32.go +++ b/src/runtime/map_fast32.go @@ -114,7 +114,7 @@ again: if h.growing() { growWork_fast32(t, h, bucket) } - b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) var insertb *bmap var inserti uintptr @@ -158,7 +158,7 @@ bucketloop: } if insertb == nil { - // all current buckets are full, allocate a new one. + // The current bucket and all the overflow buckets connected to it are full, allocate a new one. 
insertb = h.newoverflow(t, b) inserti = 0 // not necessary, but avoids needlessly spilling inserti } @@ -204,7 +204,7 @@ again: if h.growing() { growWork_fast32(t, h, bucket) } - b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) var insertb *bmap var inserti uintptr @@ -248,7 +248,7 @@ bucketloop: } if insertb == nil { - // all current buckets are full, allocate a new one. + // The current bucket and all the overflow buckets connected to it are full, allocate a new one. insertb = h.newoverflow(t, b) inserti = 0 // not necessary, but avoids needlessly spilling inserti } diff --git a/src/runtime/map_fast64.go b/src/runtime/map_fast64.go index 3bc84bbdd3..f1368dc774 100644 --- a/src/runtime/map_fast64.go +++ b/src/runtime/map_fast64.go @@ -114,7 +114,7 @@ again: if h.growing() { growWork_fast64(t, h, bucket) } - b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) var insertb *bmap var inserti uintptr @@ -158,7 +158,7 @@ bucketloop: } if insertb == nil { - // all current buckets are full, allocate a new one. + // The current bucket and all the overflow buckets connected to it are full, allocate a new one. insertb = h.newoverflow(t, b) inserti = 0 // not necessary, but avoids needlessly spilling inserti } @@ -204,7 +204,7 @@ again: if h.growing() { growWork_fast64(t, h, bucket) } - b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) var insertb *bmap var inserti uintptr @@ -248,7 +248,7 @@ bucketloop: } if insertb == nil { - // all current buckets are full, allocate a new one. + // The current bucket and all the overflow buckets connected to it are full, allocate a new one. 
insertb = h.newoverflow(t, b) inserti = 0 // not necessary, but avoids needlessly spilling inserti } diff --git a/src/runtime/map_faststr.go b/src/runtime/map_faststr.go index 108c502394..2d1ac762a8 100644 --- a/src/runtime/map_faststr.go +++ b/src/runtime/map_faststr.go @@ -225,7 +225,7 @@ again: if h.growing() { growWork_faststr(t, h, bucket) } - b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) top := tophash(hash) var insertb *bmap @@ -274,7 +274,7 @@ bucketloop: } if insertb == nil { - // all current buckets are full, allocate a new one. + // The current bucket and all the overflow buckets connected to it are full, allocate a new one. insertb = h.newoverflow(t, b) inserti = 0 // not necessary, but avoids needlessly spilling inserti } diff --git a/src/runtime/mbarrier.go b/src/runtime/mbarrier.go index f7875d327a..2b5affce52 100644 --- a/src/runtime/mbarrier.go +++ b/src/runtime/mbarrier.go @@ -281,28 +281,7 @@ func typedslicecopy(typ *_type, dstPtr unsafe.Pointer, dstLen int, srcPtr unsafe //go:linkname reflect_typedslicecopy reflect.typedslicecopy func reflect_typedslicecopy(elemType *_type, dst, src slice) int { if elemType.ptrdata == 0 { - n := dst.len - if n > src.len { - n = src.len - } - if n == 0 { - return 0 - } - - size := uintptr(n) * elemType.size - if raceenabled { - callerpc := getcallerpc() - pc := funcPC(reflect_typedslicecopy) - racewriterangepc(dst.array, size, callerpc, pc) - racereadrangepc(src.array, size, callerpc, pc) - } - if msanenabled { - msanwrite(dst.array, size) - msanread(src.array, size) - } - - memmove(dst.array, src.array, size) - return n + return slicecopy(dst.array, dst.len, src.array, src.len, elemType.size) } return typedslicecopy(elemType, dst.array, dst.len, src.array, src.len) } diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go index 8de44c14b9..fbfaae0f93 100644 --- a/src/runtime/mbitmap.go +++ b/src/runtime/mbitmap.go 
@@ -30,10 +30,9 @@ // indicates scanning can ignore the rest of the allocation. // // The 2-bit entries are split when written into the byte, so that the top half -// of the byte contains 4 high bits and the bottom half contains 4 low (pointer) -// bits. -// This form allows a copy from the 1-bit to the 4-bit form to keep the -// pointer bits contiguous, instead of having to space them out. +// of the byte contains 4 high (scan) bits and the bottom half contains 4 low +// (pointer) bits. This form allows a copy from the 1-bit to the 4-bit form to +// keep the pointer bits contiguous, instead of having to space them out. // // The code makes use of the fact that the zero value for a heap // bitmap means scalar/dead. This property must be preserved when @@ -816,6 +815,12 @@ func (s *mspan) countAlloc() int { func heapBitsSetType(x, size, dataSize uintptr, typ *_type) { const doubleCheck = false // slow but helpful; enable to test modifications to this code + const ( + mask1 = bitPointer | bitScan // 00010001 + mask2 = bitPointer | bitScan | mask1<<heapBitsShift // 00110011 + mask3 = bitPointer | bitScan | mask2<<heapBitsShift // 01110111 + ) + // dataSize is always size rounded up to the next malloc size class, // except in the case of allocating a defer block, in which case // size is sizeof(_defer{}) (at least 6 words) and dataSize may be @@ -844,11 +849,12 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) { h := heapBitsForAddr(x) ptrmask := typ.gcdata // start of 1-bit pointer mask (or GC program, handled below) - // Heap bitmap bits for 2-word object are only 4 bits, - // so also shared with objects next to it. - // This is called out as a special case primarily for 32-bit systems, - // so that on 32-bit systems the code below can assume all objects - // are 4-word aligned (because they're all 16-byte aligned). + // 2-word objects only have 4 bitmap bits and 3-word objects only have 6 bitmap bits. 
+ // Therefore, these objects share a heap bitmap byte with the objects next to them. + // These are called out as a special case primarily so the code below can assume all + // objects are at least 4 words long and that their bitmaps start either at the beginning + // of a bitmap byte, or half-way in (h.shift of 0 and 2 respectively). + if size == 2*sys.PtrSize { if typ.size == sys.PtrSize { // We're allocating a block big enough to hold two pointers. @@ -865,7 +871,7 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) { *h.bitp &^= (bitPointer | bitScan | (bitPointer|bitScan)<<heapBitsShift) << h.shift *h.bitp |= (bitPointer | bitScan) << h.shift } else { - // 2-element slice of pointer. + // 2-element array of pointer. *h.bitp |= (bitPointer | bitScan | (bitPointer|bitScan)<<heapBitsShift) << h.shift } return @@ -886,6 +892,70 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) { *h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift *h.bitp |= uint8(hb << h.shift) return + } else if size == 3*sys.PtrSize { + b := uint8(*ptrmask) + if doubleCheck { + if b == 0 { + println("runtime: invalid type ", typ.string()) + throw("heapBitsSetType: called with non-pointer type") + } + if sys.PtrSize != 8 { + throw("heapBitsSetType: unexpected 3 pointer wide size class on 32 bit") + } + if typ.kind&kindGCProg != 0 { + throw("heapBitsSetType: unexpected GC prog for 3 pointer wide size class") + } + if typ.size == 2*sys.PtrSize { + print("runtime: heapBitsSetType size=", size, " but typ.size=", typ.size, "\n") + throw("heapBitsSetType: inconsistent object sizes") + } + } + if typ.size == sys.PtrSize { + // The type contains a pointer otherwise heapBitsSetType wouldn't have been called. + // Since the type is only 1 pointer wide and contains a pointer, its gcdata must be exactly 1. 
+ if doubleCheck && *typ.gcdata != 1 { + print("runtime: heapBitsSetType size=", size, " typ.size=", typ.size, "but *typ.gcdata", *typ.gcdata, "\n") + throw("heapBitsSetType: unexpected gcdata for 1 pointer wide type size in 3 pointer wide size class") + } + // 3 element array of pointers. Unrolling ptrmask 3 times into p yields 00000111. + b = 7 + } + + hb := b & 7 + // Set bitScan bits for all pointers. + hb |= hb << wordsPerBitmapByte + // First bitScan bit is always set since the type contains pointers. + hb |= bitScan + // Second bitScan bit needs to also be set if the third bitScan bit is set. + hb |= hb & (bitScan << (2 * heapBitsShift)) >> 1 + + // For h.shift > 1 heap bits cross a byte boundary and need to be written part + // to h.bitp and part to the next h.bitp. + switch h.shift { + case 0: + *h.bitp &^= mask3 << 0 + *h.bitp |= hb << 0 + case 1: + *h.bitp &^= mask3 << 1 + *h.bitp |= hb << 1 + case 2: + *h.bitp &^= mask2 << 2 + *h.bitp |= (hb & mask2) << 2 + // Two words written to the first byte. + // Advance two words to get to the next byte. + h = h.next().next() + *h.bitp &^= mask1 + *h.bitp |= (hb >> 2) & mask1 + case 3: + *h.bitp &^= mask1 << 3 + *h.bitp |= (hb & mask1) << 3 + // One word written to the first byte. + // Advance one word to get to the next byte. + h = h.next() + *h.bitp &^= mask2 + *h.bitp |= (hb >> 1) & mask2 + } + return } // Copy from 1-bit ptrmask into 2-bit bitmap. @@ -1079,7 +1149,7 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) { // word must be set to scan since there are pointers // somewhere in the object. // In all following words, we set the scan/dead - // appropriately to indicate that the object contains + // appropriately to indicate that the object continues // to the next 2-bit entry in the bitmap. 
// // We set four bits at a time here, but if the object @@ -1095,12 +1165,22 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) { b >>= 4 nb -= 4 - case sys.PtrSize == 8 && h.shift == 2: + case h.shift == 2: // Ptrmask and heap bitmap are misaligned. + // + // On 32 bit architectures only the 6-word object that corresponds + // to a 24 bytes size class can start with h.shift of 2 here since + // all other non 16 byte aligned size classes have been handled by + // special code paths at the beginning of heapBitsSetType on 32 bit. + // + // Many size classes are only 16 byte aligned. On 64 bit architectures + // this results in a heap bitmap position starting with a h.shift of 2. + // // The bits for the first two words are in a byte shared // with another object, so we must be careful with the bits // already there. - // We took care of 1-word and 2-word objects above, + // + // We took care of 1-word, 2-word, and 3-word objects above, // so this is at least a 6-word object. hb = (b & (bitPointer | bitPointer<<heapBitsShift)) << (2 * heapBitsShift) hb |= bitScan << (2 * heapBitsShift) @@ -1113,7 +1193,7 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) { *hbitp |= uint8(hb) hbitp = add1(hbitp) if w += 2; w >= nw { - // We know that there is more data, because we handled 2-word objects above. + // We know that there is more data, because we handled 2-word and 3-word objects above. // This must be at least a 6-word object. If we're out of pointer words, // mark no scan in next bitmap byte and finish. hb = 0 @@ -1248,12 +1328,12 @@ Phase4: // Handle the first byte specially if it's shared. See // Phase 1 for why this is the only special case we need. 
if doubleCheck { - if !(h.shift == 0 || (sys.PtrSize == 8 && h.shift == 2)) { + if !(h.shift == 0 || h.shift == 2) { print("x=", x, " size=", size, " cnw=", h.shift, "\n") throw("bad start shift") } } - if sys.PtrSize == 8 && h.shift == 2 { + if h.shift == 2 { *h.bitp = *h.bitp&^((bitPointer|bitScan|(bitPointer|bitScan)<<heapBitsShift)<<(2*heapBitsShift)) | *src h = h.next().next() cnw -= 2 @@ -1788,12 +1868,12 @@ func materializeGCProg(ptrdata uintptr, prog *byte) *mspan { bitmapBytes := divRoundUp(ptrdata, 8*sys.PtrSize) // Compute the number of pages needed for bitmapBytes. pages := divRoundUp(bitmapBytes, pageSize) - s := mheap_.allocManual(pages, &memstats.gc_sys) + s := mheap_.allocManual(pages, spanAllocPtrScalarBits) runGCProg(addb(prog, 4), nil, (*byte)(unsafe.Pointer(s.startAddr)), 1) return s } func dematerializeGCProg(s *mspan) { - mheap_.freeManual(s, &memstats.gc_sys) + mheap_.freeManual(s, spanAllocPtrScalarBits) } func dumpGCProg(p *byte) { diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go index 7a7d33ccae..bb7475b6f3 100644 --- a/src/runtime/mcache.go +++ b/src/runtime/mcache.go @@ -10,6 +10,7 @@ import ( ) // Per-thread (in Go, per-P) cache for small objects. +// This includes a small object cache and local allocation stats. // No locking needed because it is per-thread (per-P). // // mcaches are allocated from non-GC'd memory, so any heap pointers @@ -19,8 +20,8 @@ import ( type mcache struct { // The following members are accessed on every malloc, // so they are grouped here for better caching. - next_sample uintptr // trigger heap sample after allocating this many bytes - local_scan uintptr // bytes of scannable heap allocated + nextSample uintptr // trigger heap sample after allocating this many bytes + scanAlloc uintptr // bytes of scannable heap allocated // Allocator cache for tiny objects w/o pointers. // See "Tiny allocator" comment in malloc.go. @@ -31,9 +32,12 @@ type mcache struct { // tiny is a heap pointer. 
Since mcache is in non-GC'd memory, // we handle it by clearing it in releaseAll during mark // termination. - tiny uintptr - tinyoffset uintptr - local_tinyallocs uintptr // number of tiny allocs not counted in other stats + // + // tinyAllocs is the number of tiny allocations performed + // by the P that owns this mcache. + tiny uintptr + tinyoffset uintptr + tinyAllocs uintptr // The rest is not accessed on every malloc. @@ -41,11 +45,6 @@ type mcache struct { stackcache [_NumStackOrders]stackfreelist - // Local allocator stats, flushed during GC. - local_largefree uintptr // bytes freed for large objects (>maxsmallsize) - local_nlargefree uintptr // number of frees for large objects (>maxsmallsize) - local_nsmallfree [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize) - // flushGen indicates the sweepgen during which this mcache // was last flushed. If flushGen != mheap_.sweepgen, the spans // in this mcache are stale and need to the flushed so they @@ -93,10 +92,16 @@ func allocmcache() *mcache { for i := range c.alloc { c.alloc[i] = &emptymspan } - c.next_sample = nextSample() + c.nextSample = nextSample() return c } +// freemcache releases resources associated with this +// mcache and puts the object onto a free list. +// +// In some cases there is no way to simply release +// resources, such as statistics, so donate them to +// a different mcache (the recipient). func freemcache(c *mcache) { systemstack(func() { c.releaseAll() @@ -108,12 +113,31 @@ func freemcache(c *mcache) { // gcworkbuffree(c.gcworkbuf) lock(&mheap_.lock) - purgecachedstats(c) mheap_.cachealloc.free(unsafe.Pointer(c)) unlock(&mheap_.lock) }) } +// getMCache is a convenience function which tries to obtain an mcache. +// +// Returns nil if we're not bootstrapping or we don't have a P. The caller's +// P must not change, so we must be in a non-preemptible state. +func getMCache() *mcache { + // Grab the mcache, since that's where stats live. 
+ pp := getg().m.p.ptr() + var c *mcache + if pp == nil { + // We will be called without a P while bootstrapping, + // in which case we use mcache0, which is set in mallocinit. + // mcache0 is cleared when bootstrapping is complete, + // by procresize. + c = mcache0 + } else { + c = pp.mcache + } + return c +} + // refill acquires a new span of span class spc for c. This span will // have at least one free object. The current span in c must be full. // @@ -148,13 +172,107 @@ func (c *mcache) refill(spc spanClass) { // sweeping in the next sweep phase. s.sweepgen = mheap_.sweepgen + 3 + // Assume all objects from this span will be allocated in the + // mcache. If it gets uncached, we'll adjust this. + stats := memstats.heapStats.acquire() + atomic.Xadduintptr(&stats.smallAllocCount[spc.sizeclass()], uintptr(s.nelems)-uintptr(s.allocCount)) + memstats.heapStats.release() + + // Update heap_live with the same assumption. + usedBytes := uintptr(s.allocCount) * s.elemsize + atomic.Xadd64(&memstats.heap_live, int64(s.npages*pageSize)-int64(usedBytes)) + + // Flush tinyAllocs. + if spc == tinySpanClass { + atomic.Xadd64(&memstats.tinyallocs, int64(c.tinyAllocs)) + c.tinyAllocs = 0 + } + + // While we're here, flush scanAlloc, since we have to call + // revise anyway. + atomic.Xadd64(&memstats.heap_scan, int64(c.scanAlloc)) + c.scanAlloc = 0 + + if trace.enabled { + // heap_live changed. + traceHeapAlloc() + } + if gcBlackenEnabled != 0 { + // heap_live and heap_scan changed. + gcController.revise() + } + c.alloc[spc] = s } +// allocLarge allocates a span for a large object. +func (c *mcache) allocLarge(size uintptr, needzero bool, noscan bool) *mspan { + if size+_PageSize < size { + throw("out of memory") + } + npages := size >> _PageShift + if size&_PageMask != 0 { + npages++ + } + + // Deduct credit for this span allocation and sweep if + // necessary. mHeap_Alloc will also sweep npages, so this only + // pays the debt down to npage pages. 
+ deductSweepCredit(npages*_PageSize, npages) + + spc := makeSpanClass(0, noscan) + s := mheap_.alloc(npages, spc, needzero) + if s == nil { + throw("out of memory") + } + stats := memstats.heapStats.acquire() + atomic.Xadduintptr(&stats.largeAlloc, npages*pageSize) + atomic.Xadduintptr(&stats.largeAllocCount, 1) + memstats.heapStats.release() + + // Update heap_live and revise pacing if needed. + atomic.Xadd64(&memstats.heap_live, int64(npages*pageSize)) + if trace.enabled { + // Trace that a heap alloc occurred because heap_live changed. + traceHeapAlloc() + } + if gcBlackenEnabled != 0 { + gcController.revise() + } + + // Put the large span in the mcentral swept list so that it's + // visible to the background sweeper. + mheap_.central[spc].mcentral.fullSwept(mheap_.sweepgen).push(s) + s.limit = s.base() + size + heapBitsForAddr(s.base()).initSpan(s) + return s +} + func (c *mcache) releaseAll() { + // Take this opportunity to flush scanAlloc. + atomic.Xadd64(&memstats.heap_scan, int64(c.scanAlloc)) + c.scanAlloc = 0 + + sg := mheap_.sweepgen for i := range c.alloc { s := c.alloc[i] if s != &emptymspan { + // Adjust nsmallalloc in case the span wasn't fully allocated. + n := uintptr(s.nelems) - uintptr(s.allocCount) + stats := memstats.heapStats.acquire() + atomic.Xadduintptr(&stats.smallAllocCount[spanClass(i).sizeclass()], -n) + memstats.heapStats.release() + if s.sweepgen != sg+1 { + // refill conservatively counted unallocated slots in heap_live. + // Undo this. + // + // If this span was cached before sweep, then + // heap_live was totally recomputed since + // caching this span, so we don't do this for + // stale spans. + atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize)) + } + // Release the span to the mcentral. mheap_.central[i].mcentral.uncacheSpan(s) c.alloc[i] = &emptymspan } @@ -162,6 +280,13 @@ func (c *mcache) releaseAll() { // Clear tinyalloc pool. 
c.tiny = 0 c.tinyoffset = 0 + atomic.Xadd64(&memstats.tinyallocs, int64(c.tinyAllocs)) + c.tinyAllocs = 0 + + // Updated heap_scan and possible heap_live. + if gcBlackenEnabled != 0 { + gcController.revise() + } } // prepareForSweep flushes c if the system has entered a new sweep phase diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go index ed49e01677..cd20dec539 100644 --- a/src/runtime/mcentral.go +++ b/src/runtime/mcentral.go @@ -20,10 +20,6 @@ import "runtime/internal/atomic" type mcentral struct { spanclass spanClass - // For !go115NewMCentralImpl. - nonempty mSpanList // list of spans with a free object, ie a nonempty free list - empty mSpanList // list of spans with no free objects (or cached in an mcache) - // partial and full contain two mspan sets: one of swept in-use // spans, and one of unswept in-use spans. These two trade // roles on each GC cycle. The unswept set is drained either by @@ -44,11 +40,6 @@ type mcentral struct { // encounter swept spans, and these should be ignored. partial [2]spanSet // list of spans with a free object full [2]spanSet // list of spans with no free objects - - // nmalloc is the cumulative count of objects allocated from - // this mcentral, assuming all spans in mcaches are - // fully-allocated. Written atomically, read under STW. - nmalloc uint64 } // Initialize a single central free list. @@ -178,19 +169,6 @@ havespan: if n == 0 || s.freeindex == s.nelems || uintptr(s.allocCount) == s.nelems { throw("span has no free objects") } - // Assume all objects from this span will be allocated in the - // mcache. If it gets uncached, we'll adjust this. - atomic.Xadd64(&c.nmalloc, int64(n)) - usedBytes := uintptr(s.allocCount) * s.elemsize - atomic.Xadd64(&memstats.heap_live, int64(spanBytes)-int64(usedBytes)) - if trace.enabled { - // heap_live changed. - traceHeapAlloc() - } - if gcBlackenEnabled != 0 { - // heap_live changed. 
- gcController.revise() - } freeByteBase := s.freeindex &^ (64 - 1) whichByte := freeByteBase / 8 // Init alloc bits cache. @@ -228,27 +206,6 @@ func (c *mcentral) uncacheSpan(s *mspan) { // Indicate that s is no longer cached. atomic.Store(&s.sweepgen, sg) } - n := int(s.nelems) - int(s.allocCount) - - // Fix up statistics. - if n > 0 { - // cacheSpan updated alloc assuming all objects on s - // were going to be allocated. Adjust for any that - // weren't. We must do this before potentially - // sweeping the span. - atomic.Xadd64(&c.nmalloc, -int64(n)) - - if !stale { - // (*mcentral).cacheSpan conservatively counted - // unallocated slots in heap_live. Undo this. - // - // If this span was cached before sweep, then - // heap_live was totally recomputed since - // caching this span, so we don't do this for - // stale spans. - atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize)) - } - } // Put the span in the appropriate place. if stale { @@ -256,7 +213,7 @@ func (c *mcentral) uncacheSpan(s *mspan) { // the right list. s.sweep(false) } else { - if n > 0 { + if int(s.nelems)-int(s.allocCount) > 0 { // Put it back on the partial swept list. c.partialSwept(sg).push(s) } else { diff --git a/src/runtime/mcheckmark.go b/src/runtime/mcheckmark.go index 1fd8e4e78f..ba80ac1bdf 100644 --- a/src/runtime/mcheckmark.go +++ b/src/runtime/mcheckmark.go @@ -34,6 +34,8 @@ var useCheckmark = false // // The world must be stopped. func startCheckmarks() { + assertWorldStopped() + // Clear all checkmarks. for _, ai := range mheap_.allArenas { arena := mheap_.arenas[ai.l1()][ai.l2()] @@ -41,7 +43,7 @@ func startCheckmarks() { if bitmap == nil { // Allocate bitmap on first use. 
- bitmap = (*checkmarksMap)(persistentalloc(unsafe.Sizeof(*bitmap), 0, &memstats.gc_sys)) + bitmap = (*checkmarksMap)(persistentalloc(unsafe.Sizeof(*bitmap), 0, &memstats.gcMiscSys)) if bitmap == nil { throw("out of memory allocating checkmarks bitmap") } diff --git a/src/runtime/mem_aix.go b/src/runtime/mem_aix.go index 7e145b072a..957aa4dcc2 100644 --- a/src/runtime/mem_aix.go +++ b/src/runtime/mem_aix.go @@ -11,7 +11,7 @@ import ( // Don't split the stack as this method may be invoked without a valid G, which // prevents us from allocating more stack. //go:nosplit -func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { +func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer { p, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) if err != 0 { if err == _EACCES { @@ -24,7 +24,7 @@ func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { } return nil } - mSysStatInc(sysStat, n) + sysStat.add(int64(n)) return p } @@ -41,8 +41,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) { // Don't split the stack as this function may be invoked without a valid G, // which prevents us from allocating more stack. //go:nosplit -func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatDec(sysStat, n) +func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(-int64(n)) munmap(v, n) } @@ -59,8 +59,8 @@ func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer { return p } -func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatInc(sysStat, n) +func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(int64(n)) // AIX does not allow mapping a range that is already mapped. // So, call mprotect to change permissions. 
diff --git a/src/runtime/mem_bsd.go b/src/runtime/mem_bsd.go index 4d860e7bd3..bc672019fb 100644 --- a/src/runtime/mem_bsd.go +++ b/src/runtime/mem_bsd.go @@ -13,12 +13,12 @@ import ( // Don't split the stack as this function may be invoked without a valid G, // which prevents us from allocating more stack. //go:nosplit -func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { +func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer { v, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) if err != 0 { return nil } - mSysStatInc(sysStat, n) + sysStat.add(int64(n)) return v } @@ -35,8 +35,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) { // Don't split the stack as this function may be invoked without a valid G, // which prevents us from allocating more stack. //go:nosplit -func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatDec(sysStat, n) +func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(-int64(n)) munmap(v, n) } @@ -65,8 +65,8 @@ func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer { const _sunosEAGAIN = 11 const _ENOMEM = 12 -func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatInc(sysStat, n) +func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(int64(n)) p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0) if err == _ENOMEM || ((GOOS == "solaris" || GOOS == "illumos") && err == _sunosEAGAIN) { diff --git a/src/runtime/mem_darwin.go b/src/runtime/mem_darwin.go index 3b5d565b0f..7fccd2bb8e 100644 --- a/src/runtime/mem_darwin.go +++ b/src/runtime/mem_darwin.go @@ -11,12 +11,12 @@ import ( // Don't split the stack as this function may be invoked without a valid G, // which prevents us from allocating more stack. 
//go:nosplit -func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { +func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer { v, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) if err != 0 { return nil } - mSysStatInc(sysStat, n) + sysStat.add(int64(n)) return v } @@ -39,8 +39,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) { // Don't split the stack as this function may be invoked without a valid G, // which prevents us from allocating more stack. //go:nosplit -func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatDec(sysStat, n) +func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(-int64(n)) munmap(v, n) } @@ -58,8 +58,8 @@ func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer { const _ENOMEM = 12 -func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatInc(sysStat, n) +func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(int64(n)) p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0) if err == _ENOMEM { diff --git a/src/runtime/mem_js.go b/src/runtime/mem_js.go index 092b3d4fa2..957ed36ffa 100644 --- a/src/runtime/mem_js.go +++ b/src/runtime/mem_js.go @@ -13,7 +13,7 @@ import ( // Don't split the stack as this function may be invoked without a valid G, // which prevents us from allocating more stack. //go:nosplit -func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { +func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer { p := sysReserve(nil, n) sysMap(p, n, sysStat) return p @@ -31,8 +31,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) { // Don't split the stack as this function may be invoked without a valid G, // which prevents us from allocating more stack. 
//go:nosplit -func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatDec(sysStat, n) +func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(-int64(n)) } func sysFault(v unsafe.Pointer, n uintptr) { @@ -80,6 +80,6 @@ func growMemory(pages int32) int32 // This allows the front-end to replace the old DataView object with a new one. func resetMemoryDataView() -func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatInc(sysStat, n) +func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(int64(n)) } diff --git a/src/runtime/mem_linux.go b/src/runtime/mem_linux.go index 59b0bca970..3436851091 100644 --- a/src/runtime/mem_linux.go +++ b/src/runtime/mem_linux.go @@ -17,7 +17,7 @@ const ( // Don't split the stack as this method may be invoked without a valid G, which // prevents us from allocating more stack. //go:nosplit -func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { +func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer { p, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) if err != 0 { if err == _EACCES { @@ -30,7 +30,7 @@ func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { } return nil } - mSysStatInc(sysStat, n) + sysStat.add(int64(n)) return p } @@ -144,8 +144,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) { // Don't split the stack as this function may be invoked without a valid G, // which prevents us from allocating more stack. 
//go:nosplit -func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatDec(sysStat, n) +func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(-int64(n)) munmap(v, n) } @@ -161,8 +161,8 @@ func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer { return p } -func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatInc(sysStat, n) +func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(int64(n)) p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0) if err == _ENOMEM { diff --git a/src/runtime/mem_plan9.go b/src/runtime/mem_plan9.go index 4fea851cdd..53d8e6dffa 100644 --- a/src/runtime/mem_plan9.go +++ b/src/runtime/mem_plan9.go @@ -140,19 +140,19 @@ func sbrk(n uintptr) unsafe.Pointer { return unsafe.Pointer(bl) } -func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { +func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer { lock(&memlock) p := memAlloc(n) memCheck() unlock(&memlock) if p != nil { - mSysStatInc(sysStat, n) + sysStat.add(int64(n)) } return p } -func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatDec(sysStat, n) +func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(-int64(n)) lock(&memlock) if uintptr(v)+n == bloc { // Address range being freed is at the end of memory, @@ -176,10 +176,10 @@ func sysUsed(v unsafe.Pointer, n uintptr) { func sysHugePage(v unsafe.Pointer, n uintptr) { } -func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) { +func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { // sysReserve has already allocated all heap memory, // but has not adjusted stats. 
- mSysStatInc(sysStat, n) + sysStat.add(int64(n)) } func sysFault(v unsafe.Pointer, n uintptr) { diff --git a/src/runtime/mem_windows.go b/src/runtime/mem_windows.go index 165062ec27..3a805b9767 100644 --- a/src/runtime/mem_windows.go +++ b/src/runtime/mem_windows.go @@ -24,8 +24,8 @@ const ( // Don't split the stack as this function may be invoked without a valid G, // which prevents us from allocating more stack. //go:nosplit -func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { - mSysStatInc(sysStat, n) +func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer { + sysStat.add(int64(n)) return unsafe.Pointer(stdcall4(_VirtualAlloc, 0, n, _MEM_COMMIT|_MEM_RESERVE, _PAGE_READWRITE)) } @@ -97,8 +97,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) { // Don't split the stack as this function may be invoked without a valid G, // which prevents us from allocating more stack. //go:nosplit -func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatDec(sysStat, n) +func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(-int64(n)) r := stdcall3(_VirtualFree, uintptr(v), 0, _MEM_RELEASE) if r == 0 { print("runtime: VirtualFree of ", n, " bytes failed with errno=", getlasterror(), "\n") @@ -124,6 +124,6 @@ func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer { return unsafe.Pointer(stdcall4(_VirtualAlloc, 0, n, _MEM_RESERVE, _PAGE_READWRITE)) } -func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) { - mSysStatInc(sysStat, n) +func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(int64(n)) } diff --git a/src/runtime/memmove_arm64.s b/src/runtime/memmove_arm64.s index dbb7e9a28a..43d27629e5 100644 --- a/src/runtime/memmove_arm64.s +++ b/src/runtime/memmove_arm64.s @@ -6,152 +6,236 @@ // See memmove Go doc for important implementation constraints. 
+// Register map +// +// dstin R0 +// src R1 +// count R2 +// dst R3 (same as R0, but gets modified in unaligned cases) +// srcend R4 +// dstend R5 +// data R6-R17 +// tmp1 R14 + +// Copies are split into 3 main cases: small copies of up to 32 bytes, medium +// copies of up to 128 bytes, and large copies. The overhead of the overlap +// check is negligible since it is only required for large copies. +// +// Large copies use a software pipelined loop processing 64 bytes per iteration. +// The destination pointer is 16-byte aligned to minimize unaligned accesses. +// The loop tail is handled by always copying 64 bytes from the end. + // func memmove(to, from unsafe.Pointer, n uintptr) TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24 - MOVD to+0(FP), R3 - MOVD from+8(FP), R4 - MOVD n+16(FP), R5 - CBNZ R5, check - RET + MOVD to+0(FP), R0 + MOVD from+8(FP), R1 + MOVD n+16(FP), R2 + CBZ R2, copy0 -check: - CMP $16, R5 + // Small copies: 1..16 bytes + CMP $16, R2 BLE copy16 - AND $~31, R5, R7 // R7 is N&~31 - SUB R7, R5, R6 // R6 is N&31 - - CMP R3, R4 - BLT backward - - // Copying forward proceeds by copying R7/32 quadwords then R6 <= 31 tail bytes. - // R3 and R4 are advanced as we copy. - - // (There may be implementations of armv8 where copying by bytes until - // at least one of source or dest is word aligned is a worthwhile - // optimization, but the on the one tested so far (xgene) it did not - // make a significance difference.) + // Large copies + CMP $128, R2 + BHI copy_long + CMP $32, R2 + BHI copy32_128 - CBZ R7, noforwardlarge // Do we need to do any quadword copying? - - ADD R3, R7, R9 // R9 points just past where we copy by word - -forwardlargeloop: - // Copy 32 bytes at a time. - LDP.P 32(R4), (R8, R10) - STP.P (R8, R10), 32(R3) - LDP -16(R4), (R11, R12) - STP (R11, R12), -16(R3) - SUB $32, R7, R7 - CBNZ R7, forwardlargeloop - -noforwardlarge: - CBNZ R6, forwardtail // Do we need to copy any tail bytes? + // Small copies: 17..32 bytes. 
+ LDP (R1), (R6, R7) + ADD R1, R2, R4 // R4 points just past the last source byte + LDP -16(R4), (R12, R13) + STP (R6, R7), (R0) + ADD R0, R2, R5 // R5 points just past the last destination byte + STP (R12, R13), -16(R5) RET -forwardtail: - // There are R6 <= 31 bytes remaining to copy. - // This is large enough to still contain pointers, - // which must be copied atomically. - // Copy the next 16 bytes, then 8 bytes, then any remaining bytes. - TBZ $4, R6, 3(PC) // write 16 bytes if R6&16 != 0 - LDP.P 16(R4), (R8, R10) - STP.P (R8, R10), 16(R3) - - TBZ $3, R6, 3(PC) // write 8 bytes if R6&8 != 0 - MOVD.P 8(R4), R8 - MOVD.P R8, 8(R3) - - AND $7, R6 - CBNZ R6, 2(PC) - RET - - ADD R3, R6, R9 // R9 points just past the destination memory - -forwardtailloop: - MOVBU.P 1(R4), R8 - MOVBU.P R8, 1(R3) - CMP R3, R9 - BNE forwardtailloop - RET - - // Small copies: 1..16 bytes. +// Small copies: 1..16 bytes. copy16: - ADD R4, R5, R8 // R8 points just past the last source byte - ADD R3, R5, R9 // R9 points just past the last destination byte - CMP $8, R5 + ADD R1, R2, R4 // R4 points just past the last source byte + ADD R0, R2, R5 // R5 points just past the last destination byte + CMP $8, R2 BLT copy7 - MOVD (R4), R6 - MOVD -8(R8), R7 - MOVD R6, (R3) - MOVD R7, -8(R9) + MOVD (R1), R6 + MOVD -8(R4), R7 + MOVD R6, (R0) + MOVD R7, -8(R5) RET copy7: - TBZ $2, R5, copy3 - MOVWU (R4), R6 - MOVWU -4(R8), R7 - MOVW R6, (R3) - MOVW R7, -4(R9) + TBZ $2, R2, copy3 + MOVWU (R1), R6 + MOVWU -4(R4), R7 + MOVW R6, (R0) + MOVW R7, -4(R5) RET copy3: - TBZ $1, R5, copy1 - MOVHU (R4), R6 - MOVHU -2(R8), R7 - MOVH R6, (R3) - MOVH R7, -2(R9) + TBZ $1, R2, copy1 + MOVHU (R1), R6 + MOVHU -2(R4), R7 + MOVH R6, (R0) + MOVH R7, -2(R5) RET copy1: - MOVBU (R4), R6 - MOVB R6, (R3) + MOVBU (R1), R6 + MOVB R6, (R0) + +copy0: RET -backward: - // Copying backwards first copies R6 <= 31 tail bytes, then R7/32 quadwords. 
- // R3 and R4 are advanced to the end of the destination/source buffers - // respectively and moved back as we copy. + // Medium copies: 33..128 bytes. +copy32_128: + ADD R1, R2, R4 // R4 points just past the last source byte + ADD R0, R2, R5 // R5 points just past the last destination byte + LDP (R1), (R6, R7) + LDP 16(R1), (R8, R9) + LDP -32(R4), (R10, R11) + LDP -16(R4), (R12, R13) + CMP $64, R2 + BHI copy128 + STP (R6, R7), (R0) + STP (R8, R9), 16(R0) + STP (R10, R11), -32(R5) + STP (R12, R13), -16(R5) + RET - ADD R4, R5, R4 // R4 points just past the last source byte - ADD R3, R5, R3 // R3 points just past the last destination byte + // Copy 65..128 bytes. +copy128: + LDP 32(R1), (R14, R15) + LDP 48(R1), (R16, R17) + CMP $96, R2 + BLS copy96 + LDP -64(R4), (R2, R3) + LDP -48(R4), (R1, R4) + STP (R2, R3), -64(R5) + STP (R1, R4), -48(R5) - CBZ R6, nobackwardtail // Do we need to do any byte-by-byte copying? +copy96: + STP (R6, R7), (R0) + STP (R8, R9), 16(R0) + STP (R14, R15), 32(R0) + STP (R16, R17), 48(R0) + STP (R10, R11), -32(R5) + STP (R12, R13), -16(R5) + RET - AND $7, R6, R12 - CBZ R12, backwardtaillarge + // Copy more than 128 bytes. +copy_long: + ADD R1, R2, R4 // R4 points just past the last source byte + ADD R0, R2, R5 // R5 points just past the last destination byte + MOVD ZR, R7 + MOVD ZR, R8 - SUB R12, R3, R9 // R9 points at the lowest destination byte that should be copied by byte. -backwardtailloop: - // Copy sub-pointer-size tail. - MOVBU.W -1(R4), R8 - MOVBU.W R8, -1(R3) - CMP R9, R3 - BNE backwardtailloop + CMP $1024, R2 + BLT backward_check + // feature detect to decide how to align + MOVBU runtime·arm64UseAlignedLoads(SB), R6 + CBNZ R6, use_aligned_loads + MOVD R0, R7 + MOVD R5, R8 + B backward_check +use_aligned_loads: + MOVD R1, R7 + MOVD R4, R8 + // R7 and R8 are used here for the realignment calculation. In + // the use_aligned_loads case, R7 is the src pointer and R8 is + // srcend pointer, which is used in the backward copy case. 
+ // When doing aligned stores, R7 is the dst pointer and R8 is + // the dstend pointer. -backwardtaillarge: - // Do 8/16-byte write if possible. - // See comment at forwardtail. - TBZ $3, R6, 3(PC) - MOVD.W -8(R4), R8 - MOVD.W R8, -8(R3) +backward_check: + // Use backward copy if there is an overlap. + SUB R1, R0, R14 + CBZ R14, copy0 + CMP R2, R14 + BCC copy_long_backward - TBZ $4, R6, 3(PC) - LDP.W -16(R4), (R8, R10) - STP.W (R8, R10), -16(R3) + // Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment. + LDP (R1), (R12, R13) // Load A + AND $15, R7, R14 // Calculate the realignment offset + SUB R14, R1, R1 + SUB R14, R0, R3 // move dst back same amount as src + ADD R14, R2, R2 + LDP 16(R1), (R6, R7) // Load B + STP (R12, R13), (R0) // Store A + LDP 32(R1), (R8, R9) // Load C + LDP 48(R1), (R10, R11) // Load D + LDP.W 64(R1), (R12, R13) // Load E + // 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end + SUBS $144, R2, R2 + BLS copy64_from_end -nobackwardtail: - CBNZ R7, backwardlarge // Do we need to do any doubleword-by-doubleword copying? +loop64: + STP (R6, R7), 16(R3) // Store B + LDP 16(R1), (R6, R7) // Load B (next iteration) + STP (R8, R9), 32(R3) // Store C + LDP 32(R1), (R8, R9) // Load C + STP (R10, R11), 48(R3) // Store D + LDP 48(R1), (R10, R11) // Load D + STP.W (R12, R13), 64(R3) // Store E + LDP.W 64(R1), (R12, R13) // Load E + SUBS $64, R2, R2 + BHI loop64 + + // Write the last iteration and copy 64 bytes from the end. 
+copy64_from_end: + LDP -64(R4), (R14, R15) // Load F + STP (R6, R7), 16(R3) // Store B + LDP -48(R4), (R6, R7) // Load G + STP (R8, R9), 32(R3) // Store C + LDP -32(R4), (R8, R9) // Load H + STP (R10, R11), 48(R3) // Store D + LDP -16(R4), (R10, R11) // Load I + STP (R12, R13), 64(R3) // Store E + STP (R14, R15), -64(R5) // Store F + STP (R6, R7), -48(R5) // Store G + STP (R8, R9), -32(R5) // Store H + STP (R10, R11), -16(R5) // Store I RET -backwardlarge: - SUB R7, R3, R9 // R9 points at the lowest destination byte + // Large backward copy for overlapping copies. + // Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment. +copy_long_backward: + LDP -16(R4), (R12, R13) + AND $15, R8, R14 + SUB R14, R4, R4 + SUB R14, R2, R2 + LDP -16(R4), (R6, R7) + STP (R12, R13), -16(R5) + LDP -32(R4), (R8, R9) + LDP -48(R4), (R10, R11) + LDP.W -64(R4), (R12, R13) + SUB R14, R5, R5 + SUBS $128, R2, R2 + BLS copy64_from_start + +loop64_backward: + STP (R6, R7), -16(R5) + LDP -16(R4), (R6, R7) + STP (R8, R9), -32(R5) + LDP -32(R4), (R8, R9) + STP (R10, R11), -48(R5) + LDP -48(R4), (R10, R11) + STP.W (R12, R13), -64(R5) + LDP.W -64(R4), (R12, R13) + SUBS $64, R2, R2 + BHI loop64_backward -backwardlargeloop: - LDP -16(R4), (R8, R10) - STP (R8, R10), -16(R3) - LDP.W -32(R4), (R11, R12) - STP.W (R11, R12), -32(R3) - CMP R9, R3 - BNE backwardlargeloop + // Write the last iteration and copy 64 bytes from the start. 
+copy64_from_start: + LDP 48(R1), (R2, R3) + STP (R6, R7), -16(R5) + LDP 32(R1), (R6, R7) + STP (R8, R9), -32(R5) + LDP 16(R1), (R8, R9) + STP (R10, R11), -48(R5) + LDP (R1), (R10, R11) + STP (R12, R13), -64(R5) + STP (R2, R3), 48(R0) + STP (R6, R7), 32(R0) + STP (R8, R9), 16(R0) + STP (R10, R11), (R0) RET diff --git a/src/runtime/memmove_linux_amd64_test.go b/src/runtime/memmove_linux_amd64_test.go index d0e8b42a5a..b3ccd907b9 100644 --- a/src/runtime/memmove_linux_amd64_test.go +++ b/src/runtime/memmove_linux_amd64_test.go @@ -5,7 +5,6 @@ package runtime_test import ( - "io/ioutil" "os" "reflect" "syscall" @@ -18,7 +17,7 @@ import ( func TestMemmoveOverflow(t *testing.T) { t.Parallel() // Create a temporary file. - tmp, err := ioutil.TempFile("", "go-memmovetest") + tmp, err := os.CreateTemp("", "go-memmovetest") if err != nil { t.Fatal(err) } diff --git a/src/runtime/memmove_test.go b/src/runtime/memmove_test.go index 396c1304c5..7c9d2ada45 100644 --- a/src/runtime/memmove_test.go +++ b/src/runtime/memmove_test.go @@ -286,6 +286,9 @@ var bufSizes = []int{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, } +var bufSizesOverlap = []int{ + 32, 64, 128, 256, 512, 1024, 2048, 4096, +} func BenchmarkMemmove(b *testing.B) { benchmarkSizes(b, bufSizes, func(b *testing.B, n int) { @@ -297,6 +300,15 @@ func BenchmarkMemmove(b *testing.B) { }) } +func BenchmarkMemmoveOverlap(b *testing.B) { + benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) { + x := make([]byte, n+16) + for i := 0; i < b.N; i++ { + copy(x[16:n+16], x[:n]) + } + }) +} + func BenchmarkMemmoveUnalignedDst(b *testing.B) { benchmarkSizes(b, bufSizes, func(b *testing.B, n int) { x := make([]byte, n+1) @@ -307,6 +319,15 @@ func BenchmarkMemmoveUnalignedDst(b *testing.B) { }) } +func BenchmarkMemmoveUnalignedDstOverlap(b *testing.B) { + benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) { + x := make([]byte, n+16) + for i := 0; i < b.N; i++ { 
+ copy(x[16:n+16], x[1:n+1]) + } + }) +} + func BenchmarkMemmoveUnalignedSrc(b *testing.B) { benchmarkSizes(b, bufSizes, func(b *testing.B, n int) { x := make([]byte, n) @@ -317,6 +338,15 @@ func BenchmarkMemmoveUnalignedSrc(b *testing.B) { }) } +func BenchmarkMemmoveUnalignedSrcOverlap(b *testing.B) { + benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) { + x := make([]byte, n+1) + for i := 0; i < b.N; i++ { + copy(x[1:n+1], x[:n]) + } + }) +} + func TestMemclr(t *testing.T) { size := 512 if testing.Short() { @@ -538,21 +568,30 @@ func BenchmarkCopyFat1024(b *testing.B) { } } +// BenchmarkIssue18740 ensures that memmove uses 4 and 8 byte load/store to move 4 and 8 bytes. +// It used to do 2 2-byte load/stores, which leads to a pipeline stall +// when we try to read the result with one 4-byte load. func BenchmarkIssue18740(b *testing.B) { - // This tests that memmove uses one 4-byte load/store to move 4 bytes. - // It used to do 2 2-byte load/stores, which leads to a pipeline stall - // when we try to read the result with one 4-byte load. - var buf [4]byte - for j := 0; j < b.N; j++ { - s := uint32(0) - for i := 0; i < 4096; i += 4 { - copy(buf[:], g[i:]) - s += binary.LittleEndian.Uint32(buf[:]) - } - sink = uint64(s) + benchmarks := []struct { + name string + nbyte int + f func([]byte) uint64 + }{ + {"2byte", 2, func(buf []byte) uint64 { return uint64(binary.LittleEndian.Uint16(buf)) }}, + {"4byte", 4, func(buf []byte) uint64 { return uint64(binary.LittleEndian.Uint32(buf)) }}, + {"8byte", 8, func(buf []byte) uint64 { return binary.LittleEndian.Uint64(buf) }}, } -} -// TODO: 2 byte and 8 byte benchmarks also. 
- -var g [4096]byte + var g [4096]byte + for _, bm := range benchmarks { + buf := make([]byte, bm.nbyte) + b.Run(bm.name, func(b *testing.B) { + for j := 0; j < b.N; j++ { + for i := 0; i < 4096; i += bm.nbyte { + copy(buf[:], g[i:]) + sink += bm.f(buf[:]) + } + } + }) + } +} diff --git a/src/runtime/metrics.go b/src/runtime/metrics.go new file mode 100644 index 0000000000..d3c0341aee --- /dev/null +++ b/src/runtime/metrics.go @@ -0,0 +1,485 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +// Metrics implementation exported to runtime/metrics. + +import ( + "runtime/internal/atomic" + "unsafe" +) + +var ( + // metrics is a map of runtime/metrics keys to + // data used by the runtime to sample each metric's + // value. + metricsSema uint32 = 1 + metricsInit bool + metrics map[string]metricData + + sizeClassBuckets []float64 + timeHistBuckets []float64 +) + +type metricData struct { + // deps is the set of runtime statistics that this metric + // depends on. Before compute is called, the statAggregate + // which will be passed must ensure() these dependencies. + deps statDepSet + + // compute is a function that populates a metricValue + // given a populated statAggregate structure. + compute func(in *statAggregate, out *metricValue) +} + +// initMetrics initializes the metrics map if it hasn't been yet. +// +// metricsSema must be held. 
+func initMetrics() { + if metricsInit { + return + } + sizeClassBuckets = make([]float64, _NumSizeClasses) + for i := range sizeClassBuckets { + sizeClassBuckets[i] = float64(class_to_size[i]) + } + timeHistBuckets = timeHistogramMetricsBuckets() + metrics = map[string]metricData{ + "/gc/cycles/automatic:gc-cycles": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.gcCyclesDone - in.sysStats.gcCyclesForced + }, + }, + "/gc/cycles/forced:gc-cycles": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.gcCyclesForced + }, + }, + "/gc/cycles/total:gc-cycles": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.gcCyclesDone + }, + }, + "/gc/heap/allocs-by-size:objects": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + hist := out.float64HistOrInit(sizeClassBuckets) + hist.counts[len(hist.counts)-1] = uint64(in.heapStats.largeAllocCount) + for i := range hist.buckets { + hist.counts[i] = uint64(in.heapStats.smallAllocCount[i]) + } + }, + }, + "/gc/heap/frees-by-size:objects": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + hist := out.float64HistOrInit(sizeClassBuckets) + hist.counts[len(hist.counts)-1] = uint64(in.heapStats.largeFreeCount) + for i := range hist.buckets { + hist.counts[i] = uint64(in.heapStats.smallFreeCount[i]) + } + }, + }, + "/gc/heap/goal:bytes": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.heapGoal + }, + }, + "/gc/heap/objects:objects": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = 
metricKindUint64 + out.scalar = in.heapStats.numObjects + }, + }, + "/gc/pauses:seconds": { + compute: func(_ *statAggregate, out *metricValue) { + hist := out.float64HistOrInit(timeHistBuckets) + hist.counts[len(hist.counts)-1] = atomic.Load64(&memstats.gcPauseDist.overflow) + for i := range hist.buckets { + hist.counts[i] = atomic.Load64(&memstats.gcPauseDist.counts[i]) + } + }, + }, + "/memory/classes/heap/free:bytes": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = uint64(in.heapStats.committed - in.heapStats.inHeap - + in.heapStats.inStacks - in.heapStats.inWorkBufs - + in.heapStats.inPtrScalarBits) + }, + }, + "/memory/classes/heap/objects:bytes": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.heapStats.inObjects + }, + }, + "/memory/classes/heap/released:bytes": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = uint64(in.heapStats.released) + }, + }, + "/memory/classes/heap/stacks:bytes": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = uint64(in.heapStats.inStacks) + }, + }, + "/memory/classes/heap/unused:bytes": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = uint64(in.heapStats.inHeap) - in.heapStats.inObjects + }, + }, + "/memory/classes/metadata/mcache/free:bytes": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.mCacheSys - in.sysStats.mCacheInUse + }, + }, + "/memory/classes/metadata/mcache/inuse:bytes": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + 
out.kind = metricKindUint64 + out.scalar = in.sysStats.mCacheInUse + }, + }, + "/memory/classes/metadata/mspan/free:bytes": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.mSpanSys - in.sysStats.mSpanInUse + }, + }, + "/memory/classes/metadata/mspan/inuse:bytes": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.mSpanInUse + }, + }, + "/memory/classes/metadata/other:bytes": { + deps: makeStatDepSet(heapStatsDep, sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = uint64(in.heapStats.inWorkBufs+in.heapStats.inPtrScalarBits) + in.sysStats.gcMiscSys + }, + }, + "/memory/classes/os-stacks:bytes": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.stacksSys + }, + }, + "/memory/classes/other:bytes": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.otherSys + }, + }, + "/memory/classes/profiling/buckets:bytes": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.buckHashSys + }, + }, + "/memory/classes/total:bytes": { + deps: makeStatDepSet(heapStatsDep, sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = uint64(in.heapStats.committed+in.heapStats.released) + + in.sysStats.stacksSys + in.sysStats.mSpanSys + + in.sysStats.mCacheSys + in.sysStats.buckHashSys + + in.sysStats.gcMiscSys + in.sysStats.otherSys + }, + }, + "/sched/goroutines:goroutines": { + compute: func(_ *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = 
uint64(gcount()) + }, + }, + } + metricsInit = true +} + +// statDep is a dependency on a group of statistics +// that a metric might have. +type statDep uint + +const ( + heapStatsDep statDep = iota // corresponds to heapStatsAggregate + sysStatsDep // corresponds to sysStatsAggregate + numStatsDeps +) + +// statDepSet represents a set of statDeps. +// +// Under the hood, it's a bitmap. +type statDepSet [1]uint64 + +// makeStatDepSet creates a new statDepSet from a list of statDeps. +func makeStatDepSet(deps ...statDep) statDepSet { + var s statDepSet + for _, d := range deps { + s[d/64] |= 1 << (d % 64) + } + return s +} + +// differennce returns set difference of s from b as a new set. +func (s statDepSet) difference(b statDepSet) statDepSet { + var c statDepSet + for i := range s { + c[i] = s[i] &^ b[i] + } + return c +} + +// union returns the union of the two sets as a new set. +func (s statDepSet) union(b statDepSet) statDepSet { + var c statDepSet + for i := range s { + c[i] = s[i] | b[i] + } + return c +} + +// empty returns true if there are no dependencies in the set. +func (s *statDepSet) empty() bool { + for _, c := range s { + if c != 0 { + return false + } + } + return true +} + +// has returns true if the set contains a given statDep. +func (s *statDepSet) has(d statDep) bool { + return s[d/64]&(1<<(d%64)) != 0 +} + +// heapStatsAggregate represents memory stats obtained from the +// runtime. This set of stats is grouped together because they +// depend on each other in some way to make sense of the runtime's +// current heap memory use. They're also sharded across Ps, so it +// makes sense to grab them all at once. +type heapStatsAggregate struct { + heapStatsDelta + + // Derived from values in heapStatsDelta. + + // inObjects is the bytes of memory occupied by objects, + inObjects uint64 + + // numObjects is the number of live objects in the heap. + numObjects uint64 +} + +// compute populates the heapStatsAggregate with values from the runtime. 
+func (a *heapStatsAggregate) compute() { + memstats.heapStats.read(&a.heapStatsDelta) + + // Calculate derived stats. + a.inObjects = uint64(a.largeAlloc - a.largeFree) + a.numObjects = uint64(a.largeAllocCount - a.largeFreeCount) + for i := range a.smallAllocCount { + n := uint64(a.smallAllocCount[i] - a.smallFreeCount[i]) + a.inObjects += n * uint64(class_to_size[i]) + a.numObjects += n + } +} + +// sysStatsAggregate represents system memory stats obtained +// from the runtime. This set of stats is grouped together because +// they're all relatively cheap to acquire and generally independent +// of one another and other runtime memory stats. The fact that they +// may be acquired at different times, especially with respect to +// heapStatsAggregate, means there could be some skew, but because of +// these stats are independent, there's no real consistency issue here. +type sysStatsAggregate struct { + stacksSys uint64 + mSpanSys uint64 + mSpanInUse uint64 + mCacheSys uint64 + mCacheInUse uint64 + buckHashSys uint64 + gcMiscSys uint64 + otherSys uint64 + heapGoal uint64 + gcCyclesDone uint64 + gcCyclesForced uint64 +} + +// compute populates the sysStatsAggregate with values from the runtime. +func (a *sysStatsAggregate) compute() { + a.stacksSys = memstats.stacks_sys.load() + a.buckHashSys = memstats.buckhash_sys.load() + a.gcMiscSys = memstats.gcMiscSys.load() + a.otherSys = memstats.other_sys.load() + a.heapGoal = atomic.Load64(&memstats.next_gc) + a.gcCyclesDone = uint64(memstats.numgc) + a.gcCyclesForced = uint64(memstats.numforcedgc) + + systemstack(func() { + lock(&mheap_.lock) + a.mSpanSys = memstats.mspan_sys.load() + a.mSpanInUse = uint64(mheap_.spanalloc.inuse) + a.mCacheSys = memstats.mcache_sys.load() + a.mCacheInUse = uint64(mheap_.cachealloc.inuse) + unlock(&mheap_.lock) + }) +} + +// statAggregate is the main driver of the metrics implementation. 
+// +// It contains multiple aggregates of runtime statistics, as well +// as a set of these aggregates that it has populated. The aggergates +// are populated lazily by its ensure method. +type statAggregate struct { + ensured statDepSet + heapStats heapStatsAggregate + sysStats sysStatsAggregate +} + +// ensure populates statistics aggregates determined by deps if they +// haven't yet been populated. +func (a *statAggregate) ensure(deps *statDepSet) { + missing := deps.difference(a.ensured) + if missing.empty() { + return + } + for i := statDep(0); i < numStatsDeps; i++ { + if !missing.has(i) { + continue + } + switch i { + case heapStatsDep: + a.heapStats.compute() + case sysStatsDep: + a.sysStats.compute() + } + } + a.ensured = a.ensured.union(missing) +} + +// metricValidKind is a runtime copy of runtime/metrics.ValueKind and +// must be kept structurally identical to that type. +type metricKind int + +const ( + // These values must be kept identical to their corresponding Kind* values + // in the runtime/metrics package. + metricKindBad metricKind = iota + metricKindUint64 + metricKindFloat64 + metricKindFloat64Histogram +) + +// metricSample is a runtime copy of runtime/metrics.Sample and +// must be kept structurally identical to that type. +type metricSample struct { + name string + value metricValue +} + +// metricValue is a runtime copy of runtime/metrics.Sample and +// must be kept structurally identical to that type. +type metricValue struct { + kind metricKind + scalar uint64 // contains scalar values for scalar Kinds. + pointer unsafe.Pointer // contains non-scalar values. +} + +// float64HistOrInit tries to pull out an existing float64Histogram +// from the value, but if none exists, then it allocates one with +// the given buckets. 
+func (v *metricValue) float64HistOrInit(buckets []float64) *metricFloat64Histogram { + var hist *metricFloat64Histogram + if v.kind == metricKindFloat64Histogram && v.pointer != nil { + hist = (*metricFloat64Histogram)(v.pointer) + } else { + v.kind = metricKindFloat64Histogram + hist = new(metricFloat64Histogram) + v.pointer = unsafe.Pointer(hist) + } + hist.buckets = buckets + if len(hist.counts) != len(hist.buckets)+1 { + hist.counts = make([]uint64, len(buckets)+1) + } + return hist +} + +// metricFloat64Histogram is a runtime copy of runtime/metrics.Float64Histogram +// and must be kept structurally identical to that type. +type metricFloat64Histogram struct { + counts []uint64 + buckets []float64 +} + +// agg is used by readMetrics, and is protected by metricsSema. +// +// Managed as a global variable because its pointer will be +// an argument to a dynamically-defined function, and we'd +// like to avoid it escaping to the heap. +var agg statAggregate + +// readMetrics is the implementation of runtime/metrics.Read. +// +//go:linkname readMetrics runtime/metrics.runtime_readMetrics +func readMetrics(samplesp unsafe.Pointer, len int, cap int) { + // Construct a slice from the args. + sl := slice{samplesp, len, cap} + samples := *(*[]metricSample)(unsafe.Pointer(&sl)) + + // Acquire the metricsSema but with handoff. This operation + // is expensive enough that queueing up goroutines and handing + // off between them will be noticably better-behaved. + semacquire1(&metricsSema, true, 0, 0) + + // Ensure the map is initialized. + initMetrics() + + // Clear agg defensively. + agg = statAggregate{} + + // Sample. + for i := range samples { + sample := &samples[i] + data, ok := metrics[sample.name] + if !ok { + sample.value.kind = metricKindBad + continue + } + // Ensure we have all the stats we need. + // agg is populated lazily. + agg.ensure(&data.deps) + + // Compute the value based on the stats we have. 
+ data.compute(&agg, &sample.value) + } + + semrelease(&metricsSema) +} diff --git a/src/runtime/metrics/description.go b/src/runtime/metrics/description.go new file mode 100644 index 0000000000..32af5d1727 --- /dev/null +++ b/src/runtime/metrics/description.go @@ -0,0 +1,180 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package metrics + +// Description describes a runtime metric. +type Description struct { + // Name is the full name of the metric which includes the unit. + // + // The format of the metric may be described by the following regular expression. + // + // ^(?P<name>/[^:]+):(?P<unit>[^:*/]+(?:[*/][^:*/]+)*)$ + // + // The format splits the name into two components, separated by a colon: a path which always + // starts with a /, and a machine-parseable unit. The name may contain any valid Unicode + // codepoint in between / characters, but by convention will try to stick to lowercase + // characters and hyphens. An example of such a path might be "/memory/heap/free". + // + // The unit is by convention a series of lowercase English unit names (singular or plural) + // without prefixes delimited by '*' or '/'. The unit names may contain any valid Unicode + // codepoint that is not a delimiter. + // Examples of units might be "seconds", "bytes", "bytes/second", "cpu-seconds", + // "byte*cpu-seconds", and "bytes/second/second". + // + // A complete name might look like "/memory/heap/free:bytes". + Name string + + // Description is an English language sentence describing the metric. + Description string + + // Kind is the kind of value for this metric. + // + // The purpose of this field is to allow users to filter out metrics whose values are + // types which their application may not understand. + Kind ValueKind + + // Cumulative is whether or not the metric is cumulative. 
If a cumulative metric is just + // a single number, then it increases monotonically. If the metric is a distribution, + // then each bucket count increases monotonically. + // + // This flag thus indicates whether or not it's useful to compute a rate from this value. + Cumulative bool + + // StopTheWorld is whether or not the metric requires a stop-the-world + // event in order to collect it. + StopTheWorld bool +} + +// The English language descriptions below must be kept in sync with the +// descriptions of each metric in doc.go. +var allDesc = []Description{ + { + Name: "/gc/cycles/automatic:gc-cycles", + Description: "Count of completed GC cycles generated by the Go runtime.", + Kind: KindUint64, + Cumulative: true, + }, + { + Name: "/gc/cycles/forced:gc-cycles", + Description: "Count of completed GC cycles forced by the application.", + Kind: KindUint64, + Cumulative: true, + }, + { + Name: "/gc/cycles/total:gc-cycles", + Description: "Count of all completed GC cycles.", + Kind: KindUint64, + Cumulative: true, + }, + { + Name: "/gc/heap/allocs-by-size:objects", + Description: "Distribution of all objects allocated by approximate size.", + Kind: KindFloat64Histogram, + }, + { + Name: "/gc/heap/frees-by-size:objects", + Description: "Distribution of all objects freed by approximate size.", + Kind: KindFloat64Histogram, + }, + { + Name: "/gc/heap/goal:bytes", + Description: "Heap size target for the end of the GC cycle.", + Kind: KindUint64, + }, + { + Name: "/gc/heap/objects:objects", + Description: "Number of objects, live or unswept, occupying heap memory.", + Kind: KindUint64, + }, + { + Name: "/gc/pauses:seconds", + Description: "Distribution individual GC-related stop-the-world pause latencies.", + Kind: KindFloat64Histogram, + }, + { + Name: "/memory/classes/heap/free:bytes", + Description: "Memory that is completely free and eligible to be returned to the underlying system, " + + "but has not been. 
This metric is the runtime's estimate of free address space that is backed by " + + "physical memory.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/heap/objects:bytes", + Description: "Memory occupied by live objects and dead objects that have not yet been marked free by the garbage collector.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/heap/released:bytes", + Description: "Memory that is completely free and has been returned to the underlying system. This " + + "metric is the runtime's estimate of free address space that is still mapped into the process, " + + "but is not backed by physical memory.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/heap/stacks:bytes", + Description: "Memory allocated from the heap that is reserved for stack space, whether or not it is currently in-use.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/heap/unused:bytes", + Description: "Memory that is reserved for heap objects but is not currently used to hold heap objects.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/metadata/mcache/free:bytes", + Description: "Memory that is reserved for runtime mcache structures, but not in-use.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/metadata/mcache/inuse:bytes", + Description: "Memory that is occupied by runtime mcache structures that are currently being used.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/metadata/mspan/free:bytes", + Description: "Memory that is reserved for runtime mspan structures, but not in-use.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/metadata/mspan/inuse:bytes", + Description: "Memory that is occupied by runtime mspan structures that are currently being used.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/metadata/other:bytes", + Description: "Memory that is reserved for or used to hold runtime metadata.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/os-stacks:bytes", + Description: "Stack memory allocated by the 
underlying operating system.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/other:bytes", + Description: "Memory used by execution trace buffers, structures for debugging the runtime, finalizer and profiler specials, and more.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/profiling/buckets:bytes", + Description: "Memory that is used by the stack trace hash map used for profiling.", + Kind: KindUint64, + }, + { + Name: "/memory/classes/total:bytes", + Description: "All memory mapped by the Go runtime into the current process as read-write. Note that this does not include memory mapped by code called via cgo or via the syscall package. Sum of all metrics in /memory/classes.", + Kind: KindUint64, + }, + { + Name: "/sched/goroutines:goroutines", + Description: "Count of live goroutines.", + Kind: KindUint64, + }, +} + +// All returns a slice of containing metric descriptions for all supported metrics. +func All() []Description { + return allDesc +} diff --git a/src/runtime/metrics/description_test.go b/src/runtime/metrics/description_test.go new file mode 100644 index 0000000000..fd1fd46efc --- /dev/null +++ b/src/runtime/metrics/description_test.go @@ -0,0 +1,115 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package metrics_test + +import ( + "bufio" + "os" + "regexp" + "runtime/metrics" + "strings" + "testing" +) + +func TestDescriptionNameFormat(t *testing.T) { + r := regexp.MustCompile("^(?P<name>/[^:]+):(?P<unit>[^:*/]+(?:[*/][^:*/]+)*)$") + descriptions := metrics.All() + for _, desc := range descriptions { + if !r.MatchString(desc.Name) { + t.Errorf("metrics %q does not match regexp %s", desc.Name, r) + } + } +} + +func extractMetricDocs(t *testing.T) map[string]string { + f, err := os.Open("doc.go") + if err != nil { + t.Fatalf("failed to open doc.go in runtime/metrics package: %v", err) + } + const ( + stateSearch = iota // look for list of metrics + stateNextMetric // look for next metric + stateNextDescription // build description + ) + state := stateSearch + s := bufio.NewScanner(f) + result := make(map[string]string) + var metric string + var prevMetric string + var desc strings.Builder + for s.Scan() { + line := strings.TrimSpace(s.Text()) + switch state { + case stateSearch: + if line == "Below is the full list of supported metrics, ordered lexicographically." { + state = stateNextMetric + } + case stateNextMetric: + // Ignore empty lines until we find a non-empty + // one. This will be our metric name. + if len(line) != 0 { + prevMetric = metric + metric = line + if prevMetric > metric { + t.Errorf("metrics %s and %s are out of lexicographical order", prevMetric, metric) + } + state = stateNextDescription + } + case stateNextDescription: + if len(line) == 0 || line == `*/` { + // An empty line means we're done. + // Write down the description and look + // for a new metric. + result[metric] = desc.String() + desc.Reset() + state = stateNextMetric + } else { + // As long as we're seeing data, assume that's + // part of the description and append it. + if desc.Len() != 0 { + // Turn previous newlines into spaces. 
+ desc.WriteString(" ") + } + desc.WriteString(line) + } + } + if line == `*/` { + break + } + } + if state == stateSearch { + t.Fatalf("failed to find supported metrics docs in %s", f.Name()) + } + return result +} + +func TestDescriptionDocs(t *testing.T) { + docs := extractMetricDocs(t) + descriptions := metrics.All() + for _, d := range descriptions { + want := d.Description + got, ok := docs[d.Name] + if !ok { + t.Errorf("no docs found for metric %s", d.Name) + continue + } + if got != want { + t.Errorf("mismatched description and docs for metric %s", d.Name) + t.Errorf("want: %q, got %q", want, got) + continue + } + } + if len(docs) > len(descriptions) { + docsLoop: + for name, _ := range docs { + for _, d := range descriptions { + if name == d.Name { + continue docsLoop + } + } + t.Errorf("stale documentation for non-existent metric: %s", name) + } + } +} diff --git a/src/runtime/metrics/doc.go b/src/runtime/metrics/doc.go new file mode 100644 index 0000000000..a68184ee82 --- /dev/null +++ b/src/runtime/metrics/doc.go @@ -0,0 +1,144 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* +Package metrics provides a stable interface to access implementation-defined +metrics exported by the Go runtime. This package is similar to existing functions +like runtime.ReadMemStats and debug.ReadGCStats, but significantly more general. + +The set of metrics defined by this package may evolve as the runtime itself +evolves, and also enables variation across Go implementations, whose relevant +metric sets may not intersect. + +Interface + +Metrics are designated by a string key, rather than, for example, a field name in +a struct. The full list of supported metrics is always available in the slice of +Descriptions returned by All. Each Description also includes useful information +about the metric, such as how to display it (e.g. gauge vs. 
counter) and how difficult
+or disruptive it is to obtain it (e.g. do you need to stop the world?).
+
+Thus, users of this API are encouraged to sample supported metrics defined by the
+slice returned by All to remain compatible across Go versions. Of course, situations
+arise where reading specific metrics is critical. For these cases, users are
+encouraged to use build tags, and although metrics may be deprecated and removed,
+users should consider this to be an exceptional and rare event, coinciding with a
+very large change in a particular Go implementation.
+
+Each metric key also has a "kind" that describes the format of the metric's value.
+In the interest of not breaking users of this package, the "kind" for a given metric
+is guaranteed not to change. If it must change, then a new metric will be introduced
+with a new key and a new "kind."
+
+Metric key format
+
+As mentioned earlier, metric keys are strings. Their format is simple and well-defined,
+designed to be both human and machine readable. It is split into two components,
+separated by a colon: a rooted path and a unit. The choice to include the unit in
+the key is motivated by compatibility: if a metric's unit changes, its semantics likely
+did also, and a new key should be introduced.
+
+For more details on the precise definition of the metric key's path and unit formats, see
+the documentation of the Name field of the Description struct.
+
+A note about floats
+
+This package supports metrics whose values have a floating-point representation. In
+order to improve ease-of-use, this package promises to never produce the following
+classes of floating-point values: NaN, infinity.
+
+Supported metrics
+
+Below is the full list of supported metrics, ordered lexicographically.
+
+	/gc/cycles/automatic:gc-cycles
+		Count of completed GC cycles generated by the Go runtime.
+
+	/gc/cycles/forced:gc-cycles
+		Count of completed GC cycles forced by the application.
+ + /gc/cycles/total:gc-cycles + Count of all completed GC cycles. + + /gc/heap/allocs-by-size:objects + Distribution of all objects allocated by approximate size. + + /gc/heap/frees-by-size:objects + Distribution of all objects freed by approximate size. + + /gc/heap/goal:bytes + Heap size target for the end of the GC cycle. + + /gc/heap/objects:objects + Number of objects, live or unswept, occupying heap memory. + + /gc/pauses:seconds + Distribution individual GC-related stop-the-world pause latencies. + + /memory/classes/heap/free:bytes + Memory that is completely free and eligible to be returned to + the underlying system, but has not been. This metric is the + runtime's estimate of free address space that is backed by + physical memory. + + /memory/classes/heap/objects:bytes + Memory occupied by live objects and dead objects that have + not yet been marked free by the garbage collector. + + /memory/classes/heap/released:bytes + Memory that is completely free and has been returned to + the underlying system. This metric is the runtime's estimate of + free address space that is still mapped into the process, but + is not backed by physical memory. + + /memory/classes/heap/stacks:bytes + Memory allocated from the heap that is reserved for stack + space, whether or not it is currently in-use. + + /memory/classes/heap/unused:bytes + Memory that is reserved for heap objects but is not currently + used to hold heap objects. + + /memory/classes/metadata/mcache/free:bytes + Memory that is reserved for runtime mcache structures, but + not in-use. + + /memory/classes/metadata/mcache/inuse:bytes + Memory that is occupied by runtime mcache structures that + are currently being used. + + /memory/classes/metadata/mspan/free:bytes + Memory that is reserved for runtime mspan structures, but + not in-use. + + /memory/classes/metadata/mspan/inuse:bytes + Memory that is occupied by runtime mspan structures that are + currently being used. 
+ + /memory/classes/metadata/other:bytes + Memory that is reserved for or used to hold runtime + metadata. + + /memory/classes/os-stacks:bytes + Stack memory allocated by the underlying operating system. + + /memory/classes/other:bytes + Memory used by execution trace buffers, structures for + debugging the runtime, finalizer and profiler specials, and + more. + + /memory/classes/profiling/buckets:bytes + Memory that is used by the stack trace hash map used for + profiling. + + /memory/classes/total:bytes + All memory mapped by the Go runtime into the current process + as read-write. Note that this does not include memory mapped + by code called via cgo or via the syscall package. + Sum of all metrics in /memory/classes. + + /sched/goroutines:goroutines + Count of live goroutines. +*/ +package metrics diff --git a/src/runtime/metrics/histogram.go b/src/runtime/metrics/histogram.go new file mode 100644 index 0000000000..e1364e1e26 --- /dev/null +++ b/src/runtime/metrics/histogram.go @@ -0,0 +1,30 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package metrics + +// Float64Histogram represents a distribution of float64 values. +type Float64Histogram struct { + // Counts contains the weights for each histogram bucket. The length of + // Counts is equal to the length of Buckets (in the metric description) + // plus one to account for the implicit minimum bucket. + // + // Given N buckets, the following is the mathematical relationship between + // Counts and Buckets. + // count[0] is the weight of the range (-inf, bucket[0]) + // count[n] is the weight of the range [bucket[n], bucket[n+1]), for 0 < n < N-1 + // count[N-1] is the weight of the range [bucket[N-1], inf) + Counts []uint64 + + // Buckets contains the boundaries between histogram buckets, in increasing order. 
+ // + // Because this slice contains boundaries, there are len(Buckets)+1 counts: + // a count for all values less than the first boundary, a count covering each + // [slice[i], slice[i+1]) interval, and a count for all values greater than or + // equal to the last boundary. + // + // For a given metric name, the value of Buckets is guaranteed not to change + // between calls until program exit. + Buckets []float64 +} diff --git a/src/runtime/metrics/sample.go b/src/runtime/metrics/sample.go new file mode 100644 index 0000000000..35534dd70d --- /dev/null +++ b/src/runtime/metrics/sample.go @@ -0,0 +1,47 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package metrics + +import ( + _ "runtime" // depends on the runtime via a linkname'd function + "unsafe" +) + +// Sample captures a single metric sample. +type Sample struct { + // Name is the name of the metric sampled. + // + // It must correspond to a name in one of the metric descriptions + // returned by Descriptions. + Name string + + // Value is the value of the metric sample. + Value Value +} + +// Implemented in the runtime. +func runtime_readMetrics(unsafe.Pointer, int, int) + +// Read populates each Value field in the given slice of metric samples. +// +// Desired metrics should be present in the slice with the appropriate name. +// The user of this API is encouraged to re-use the same slice between calls for +// efficiency, but is not required to do so. +// +// Note that re-use has some caveats. Notably, Values should not be read or +// manipulated while a Read with that value is outstanding; that is a data race. +// This property includes pointer-typed Values (e.g. Float64Histogram) whose +// underlying storage will be reused by Read when possible. To safely use such +// values in a concurrent setting, all data must be deep-copied. 
+// It is safe to execute multiple Read calls concurrently, but their arguments
+// must share no underlying memory. When in doubt, create a new []Sample from
+// scratch, which is always safe, though may be inefficient.
+//
+// Sample values with names not appearing in All will have their Value populated
+// as KindBad to indicate that the name is unknown.
+func Read(m []Sample) {
+	runtime_readMetrics(unsafe.Pointer(&m[0]), len(m), cap(m))
+}
diff --git a/src/runtime/metrics/value.go b/src/runtime/metrics/value.go
new file mode 100644
index 0000000000..0b056b4ea8
--- /dev/null
+++ b/src/runtime/metrics/value.go
@@ -0,0 +1,69 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package metrics
+
+import (
+	"math"
+	"unsafe"
+)
+
+// ValueKind is a tag for a metric Value which indicates its type.
+type ValueKind int
+
+const (
+	// KindBad indicates that the Value has no type and should not be used.
+	KindBad ValueKind = iota
+
+	// KindUint64 indicates that the type of the Value is a uint64.
+	KindUint64
+
+	// KindFloat64 indicates that the type of the Value is a float64.
+	KindFloat64
+
+	// KindFloat64Histogram indicates that the type of the Value is a *Float64Histogram.
+	KindFloat64Histogram
+)
+
+// Value represents a metric value returned by the runtime.
+type Value struct {
+	kind    ValueKind
+	scalar  uint64         // contains scalar values for scalar Kinds.
+	pointer unsafe.Pointer // contains non-scalar values.
+}
+
+// Kind returns the tag representing the kind of value this is.
+func (v Value) Kind() ValueKind {
+	return v.kind
+}
+
+// Uint64 returns the internal uint64 value for the metric.
+//
+// If v.Kind() != KindUint64, this method panics.
+func (v Value) Uint64() uint64 {
+	if v.kind != KindUint64 {
+		panic("called Uint64 on non-uint64 metric value")
+	}
+	return v.scalar
+}
+
+// Float64 returns the internal float64 value for the metric.
+//
+// If v.Kind() != KindFloat64, this method panics.
+func (v Value) Float64() float64 {
+	if v.kind != KindFloat64 {
+		panic("called Float64 on non-float64 metric value")
+	}
+	return math.Float64frombits(v.scalar)
+}
+
+// Float64Histogram returns the internal *Float64Histogram value for the metric.
+//
+// If v.Kind() != KindFloat64Histogram, this method panics.
+func (v Value) Float64Histogram() *Float64Histogram {
+	if v.kind != KindFloat64Histogram {
+		panic("called Float64Histogram on non-Float64Histogram metric value")
+	}
+	return (*Float64Histogram)(v.pointer)
+}
diff --git a/src/runtime/metrics_test.go b/src/runtime/metrics_test.go
new file mode 100644
index 0000000000..167edd57fd
--- /dev/null
+++ b/src/runtime/metrics_test.go
@@ -0,0 +1,224 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"runtime"
+	"runtime/metrics"
+	"sort"
+	"strings"
+	"testing"
+	"time"
+	"unsafe"
+)
+
+func prepareAllMetricsSamples() (map[string]metrics.Description, []metrics.Sample) {
+	all := metrics.All()
+	samples := make([]metrics.Sample, len(all))
+	descs := make(map[string]metrics.Description)
+	for i := range all {
+		samples[i].Name = all[i].Name
+		descs[all[i].Name] = all[i]
+	}
+	return descs, samples
+}
+
+func TestReadMetrics(t *testing.T) {
+	// Tests whether readMetrics produces values aligning
+	// with ReadMemStats while the world is stopped.
+ var mstats runtime.MemStats + _, samples := prepareAllMetricsSamples() + runtime.ReadMetricsSlow(&mstats, unsafe.Pointer(&samples[0]), len(samples), cap(samples)) + + checkUint64 := func(t *testing.T, m string, got, want uint64) { + t.Helper() + if got != want { + t.Errorf("metric %q: got %d, want %d", m, got, want) + } + } + + // Check to make sure the values we read line up with other values we read. + for i := range samples { + switch name := samples[i].Name; name { + case "/memory/classes/heap/free:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapIdle-mstats.HeapReleased) + case "/memory/classes/heap/released:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapReleased) + case "/memory/classes/heap/objects:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapAlloc) + case "/memory/classes/heap/unused:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapInuse-mstats.HeapAlloc) + case "/memory/classes/heap/stacks:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.StackInuse) + case "/memory/classes/metadata/mcache/free:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.MCacheSys-mstats.MCacheInuse) + case "/memory/classes/metadata/mcache/inuse:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.MCacheInuse) + case "/memory/classes/metadata/mspan/free:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.MSpanSys-mstats.MSpanInuse) + case "/memory/classes/metadata/mspan/inuse:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.MSpanInuse) + case "/memory/classes/metadata/other:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.GCSys) + case "/memory/classes/os-stacks:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.StackSys-mstats.StackInuse) + case "/memory/classes/other:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.OtherSys) + case "/memory/classes/profiling/buckets:bytes": + 
checkUint64(t, name, samples[i].Value.Uint64(), mstats.BuckHashSys) + case "/memory/classes/total:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.Sys) + case "/gc/heap/objects:objects": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapObjects) + case "/gc/heap/goal:bytes": + checkUint64(t, name, samples[i].Value.Uint64(), mstats.NextGC) + case "/gc/cycles/automatic:gc-cycles": + checkUint64(t, name, samples[i].Value.Uint64(), uint64(mstats.NumGC-mstats.NumForcedGC)) + case "/gc/cycles/forced:gc-cycles": + checkUint64(t, name, samples[i].Value.Uint64(), uint64(mstats.NumForcedGC)) + case "/gc/cycles/total:gc-cycles": + checkUint64(t, name, samples[i].Value.Uint64(), uint64(mstats.NumGC)) + } + } +} + +func TestReadMetricsConsistency(t *testing.T) { + // Tests whether readMetrics produces consistent, sensible values. + // The values are read concurrently with the runtime doing other + // things (e.g. allocating) so what we read can't reasonably compared + // to runtime values. + + // Run a few GC cycles to get some of the stats to be non-zero. + runtime.GC() + runtime.GC() + runtime.GC() + + // Read all the supported metrics through the metrics package. + descs, samples := prepareAllMetricsSamples() + metrics.Read(samples) + + // Check to make sure the values we read make sense. + var totalVirtual struct { + got, want uint64 + } + var objects struct { + alloc, free *metrics.Float64Histogram + total uint64 + } + var gc struct { + numGC uint64 + pauses uint64 + } + for i := range samples { + kind := samples[i].Value.Kind() + if want := descs[samples[i].Name].Kind; kind != want { + t.Errorf("supported metric %q has unexpected kind: got %d, want %d", samples[i].Name, kind, want) + continue + } + if samples[i].Name != "/memory/classes/total:bytes" && strings.HasPrefix(samples[i].Name, "/memory/classes") { + v := samples[i].Value.Uint64() + totalVirtual.want += v + + // None of these stats should ever get this big. 
+			// If they do, there's probably overflow involved,
+			// usually due to bad accounting.
+			if int64(v) < 0 {
+				t.Errorf("%q has high/negative value: %d", samples[i].Name, v)
+			}
+		}
+		switch samples[i].Name {
+		case "/memory/classes/total:bytes":
+			totalVirtual.got = samples[i].Value.Uint64()
+		case "/gc/heap/objects:objects":
+			objects.total = samples[i].Value.Uint64()
+		case "/gc/heap/allocs-by-size:objects":
+			objects.alloc = samples[i].Value.Float64Histogram()
+		case "/gc/heap/frees-by-size:objects":
+			objects.free = samples[i].Value.Float64Histogram()
+		case "/gc/cycles/total:gc-cycles":
+			gc.numGC = samples[i].Value.Uint64()
+		case "/gc/pauses:seconds":
+			h := samples[i].Value.Float64Histogram()
+			gc.pauses = 0
+			for i := range h.Counts {
+				gc.pauses += h.Counts[i]
+			}
+		case "/sched/goroutines:goroutines":
+			if samples[i].Value.Uint64() < 1 {
+				t.Error("number of goroutines is less than one")
+			}
+		}
+	}
+	if totalVirtual.got != totalVirtual.want {
+		t.Errorf(`"/memory/classes/total:bytes" does not match sum of /memory/classes/**: got %d, want %d`, totalVirtual.got, totalVirtual.want)
+	}
+	if len(objects.alloc.Buckets) != len(objects.free.Buckets) {
+		t.Error("allocs-by-size and frees-by-size buckets don't match in length")
+	} else if len(objects.alloc.Counts) != len(objects.free.Counts) {
+		t.Error("allocs-by-size and frees-by-size counts don't match in length")
+	} else {
+		for i := range objects.alloc.Buckets {
+			ba := objects.alloc.Buckets[i]
+			bf := objects.free.Buckets[i]
+			if ba != bf {
+				t.Errorf("bucket %d is different for alloc and free hists: %f != %f", i, ba, bf)
+			}
+		}
+		if !t.Failed() {
+			got, want := uint64(0), objects.total
+			for i := range objects.alloc.Counts {
+				if objects.alloc.Counts[i] < objects.free.Counts[i] {
+					t.Errorf("found more allocs than frees in object dist bucket %d", i)
+					continue
+				}
+				got += objects.alloc.Counts[i] - objects.free.Counts[i]
+			}
+			if got != want {
+				t.Errorf("object distribution counts don't match count of live
objects: got %d, want %d", got, want) + } + } + } + // The current GC has at least 2 pauses per GC. + // Check to see if that value makes sense. + if gc.pauses < gc.numGC*2 { + t.Errorf("fewer pauses than expected: got %d, want at least %d", gc.pauses, gc.numGC*2) + } +} + +func BenchmarkReadMetricsLatency(b *testing.B) { + stop := applyGCLoad(b) + + // Spend this much time measuring latencies. + latencies := make([]time.Duration, 0, 1024) + _, samples := prepareAllMetricsSamples() + + // Hit metrics.Read continuously and measure. + b.ResetTimer() + for i := 0; i < b.N; i++ { + start := time.Now() + metrics.Read(samples) + latencies = append(latencies, time.Now().Sub(start)) + } + // Make sure to stop the timer before we wait! The load created above + // is very heavy-weight and not easy to stop, so we could end up + // confusing the benchmarking framework for small b.N. + b.StopTimer() + stop() + + // Disable the default */op metrics. + // ns/op doesn't mean anything because it's an average, but we + // have a sleep in our b.N loop above which skews this significantly. + b.ReportMetric(0, "ns/op") + b.ReportMetric(0, "B/op") + b.ReportMetric(0, "allocs/op") + + // Sort latencies then report percentiles. 
+ sort.Slice(latencies, func(i, j int) bool { + return latencies[i] < latencies[j] + }) + b.ReportMetric(float64(latencies[len(latencies)*50/100]), "p50-ns") + b.ReportMetric(float64(latencies[len(latencies)*90/100]), "p90-ns") + b.ReportMetric(float64(latencies[len(latencies)*99/100]), "p99-ns") +} diff --git a/src/runtime/mfinal.go b/src/runtime/mfinal.go index d6c85a8b93..f4dbd77252 100644 --- a/src/runtime/mfinal.go +++ b/src/runtime/mfinal.go @@ -88,7 +88,7 @@ func queuefinalizer(p unsafe.Pointer, fn *funcval, nret uintptr, fint *_type, ot lock(&finlock) if finq == nil || finq.cnt == uint32(len(finq.fin)) { if finc == nil { - finc = (*finblock)(persistentalloc(_FinBlockSize, 0, &memstats.gc_sys)) + finc = (*finblock)(persistentalloc(_FinBlockSize, 0, &memstats.gcMiscSys)) finc.alllink = allfin allfin = finc if finptrmask[0] == 0 { @@ -293,15 +293,15 @@ func runfinq() { // pass the object to a call of the KeepAlive function to mark the // last point in the function where the object must be reachable. // -// For example, if p points to a struct that contains a file descriptor d, -// and p has a finalizer that closes that file descriptor, and if the last -// use of p in a function is a call to syscall.Write(p.d, buf, size), then -// p may be unreachable as soon as the program enters syscall.Write. The -// finalizer may run at that moment, closing p.d, causing syscall.Write -// to fail because it is writing to a closed file descriptor (or, worse, -// to an entirely different file descriptor opened by a different goroutine). -// To avoid this problem, call runtime.KeepAlive(p) after the call to -// syscall.Write. +// For example, if p points to a struct, such as os.File, that contains +// a file descriptor d, and p has a finalizer that closes that file +// descriptor, and if the last use of p in a function is a call to +// syscall.Write(p.d, buf, size), then p may be unreachable as soon as +// the program enters syscall.Write. 
The finalizer may run at that moment, +// closing p.d, causing syscall.Write to fail because it is writing to +// a closed file descriptor (or, worse, to an entirely different +// file descriptor opened by a different goroutine). To avoid this problem, +// call runtime.KeepAlive(p) after the call to syscall.Write. // // A single goroutine runs all finalizers for a program, sequentially. // If a finalizer must run for a long time, it should do so by starting diff --git a/src/runtime/mfixalloc.go b/src/runtime/mfixalloc.go index f9dd6ca474..293c16b38b 100644 --- a/src/runtime/mfixalloc.go +++ b/src/runtime/mfixalloc.go @@ -32,7 +32,7 @@ type fixalloc struct { chunk uintptr // use uintptr instead of unsafe.Pointer to avoid write barriers nchunk uint32 inuse uintptr // in-use bytes now - stat *uint64 + stat *sysMemStat zero bool // zero allocations } @@ -49,7 +49,7 @@ type mlink struct { // Initialize f to allocate objects of the given size, // using the allocator to obtain chunks of memory. -func (f *fixalloc) init(size uintptr, first func(arg, p unsafe.Pointer), arg unsafe.Pointer, stat *uint64) { +func (f *fixalloc) init(size uintptr, first func(arg, p unsafe.Pointer), arg unsafe.Pointer, stat *sysMemStat) { f.size = size f.first = first f.arg = arg diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go index bd87144355..185d3201ca 100644 --- a/src/runtime/mgc.go +++ b/src/runtime/mgc.go @@ -290,10 +290,14 @@ func setGCPhase(x uint32) { type gcMarkWorkerMode int const ( + // gcMarkWorkerNotWorker indicates that the next scheduled G is not + // starting work and the mode should be ignored. + gcMarkWorkerNotWorker gcMarkWorkerMode = iota + // gcMarkWorkerDedicatedMode indicates that the P of a mark // worker is dedicated to running that mark worker. The mark // worker should run without preemption. 
- gcMarkWorkerDedicatedMode gcMarkWorkerMode = iota + gcMarkWorkerDedicatedMode // gcMarkWorkerFractionalMode indicates that a P is currently // running the "fractional" mark worker. The fractional worker @@ -313,6 +317,7 @@ const ( // gcMarkWorkerModeStrings are the strings labels of gcMarkWorkerModes // to use in execution traces. var gcMarkWorkerModeStrings = [...]string{ + "Not worker", "GC (dedicated)", "GC (fractional)", "GC (idle)", @@ -388,10 +393,24 @@ type gcControllerState struct { // bytes that should be performed by mutator assists. This is // computed at the beginning of each cycle and updated every // time heap_scan is updated. - assistWorkPerByte float64 + // + // Stored as a uint64, but it's actually a float64. Use + // float64frombits to get the value. + // + // Read and written atomically. + assistWorkPerByte uint64 // assistBytesPerWork is 1/assistWorkPerByte. - assistBytesPerWork float64 + // + // Stored as a uint64, but it's actually a float64. Use + // float64frombits to get the value. + // + // Read and written atomically. + // + // Note that because this is read and written independently + // from assistWorkPerByte users may notice a skew between + // the two values, and such a state should be safe. + assistBytesPerWork uint64 // fractionalUtilizationGoal is the fraction of wall clock // time that should be spent in the fractional mark worker on @@ -409,7 +428,8 @@ type gcControllerState struct { } // startCycle resets the GC controller's state and computes estimates -// for a new GC cycle. The caller must hold worldsema. +// for a new GC cycle. The caller must hold worldsema and the world +// must be stopped. 
func (c *gcControllerState) startCycle() { c.scanWork = 0 c.bgScanCredit = 0 @@ -469,7 +489,8 @@ func (c *gcControllerState) startCycle() { c.revise() if debug.gcpacertrace > 0 { - print("pacer: assist ratio=", c.assistWorkPerByte, + assistRatio := float64frombits(atomic.Load64(&c.assistWorkPerByte)) + print("pacer: assist ratio=", assistRatio, " (scan ", memstats.heap_scan>>20, " MB in ", work.initialHeapLive>>20, "->", memstats.next_gc>>20, " MB)", @@ -479,9 +500,22 @@ func (c *gcControllerState) startCycle() { } // revise updates the assist ratio during the GC cycle to account for -// improved estimates. This should be called either under STW or -// whenever memstats.heap_scan, memstats.heap_live, or -// memstats.next_gc is updated (with mheap_.lock held). +// improved estimates. This should be called whenever memstats.heap_scan, +// memstats.heap_live, or memstats.next_gc is updated. It is safe to +// call concurrently, but it may race with other calls to revise. +// +// The result of this race is that the two assist ratio values may not line +// up or may be stale. In practice this is OK because the assist ratio +// moves slowly throughout a GC cycle, and the assist ratio is a best-effort +// heuristic anyway. Furthermore, no part of the heuristic depends on +// the two assist ratio values being exact reciprocals of one another, since +// the two values are used to convert values from different sources. +// +// The worst case result of this raciness is that we may miss a larger shift +// in the ratio (say, if we decide to pace more aggressively against the +// hard heap goal) but even this "hard goal" is best-effort (see #40460). +// The dedicated GC should ensure we don't exceed the hard goal by too much +// in the rare case we do exceed it. 
// // It should only be called when gcBlackenEnabled != 0 (because this // is when assists are enabled and the necessary statistics are @@ -494,10 +528,12 @@ func (c *gcControllerState) revise() { gcpercent = 100000 } live := atomic.Load64(&memstats.heap_live) + scan := atomic.Load64(&memstats.heap_scan) + work := atomic.Loadint64(&c.scanWork) // Assume we're under the soft goal. Pace GC to complete at // next_gc assuming the heap is in steady-state. - heapGoal := int64(memstats.next_gc) + heapGoal := int64(atomic.Load64(&memstats.next_gc)) // Compute the expected scan work remaining. // @@ -508,17 +544,17 @@ func (c *gcControllerState) revise() { // // (This is a float calculation to avoid overflowing on // 100*heap_scan.) - scanWorkExpected := int64(float64(memstats.heap_scan) * 100 / float64(100+gcpercent)) + scanWorkExpected := int64(float64(scan) * 100 / float64(100+gcpercent)) - if live > memstats.next_gc || c.scanWork > scanWorkExpected { + if int64(live) > heapGoal || work > scanWorkExpected { // We're past the soft goal, or we've already done more scan // work than we expected. Pace GC so that in the worst case it // will complete by the hard goal. const maxOvershoot = 1.1 - heapGoal = int64(float64(memstats.next_gc) * maxOvershoot) + heapGoal = int64(float64(heapGoal) * maxOvershoot) // Compute the upper bound on the scan work remaining. - scanWorkExpected = int64(memstats.heap_scan) + scanWorkExpected = int64(scan) } // Compute the remaining scan work estimate. @@ -528,7 +564,7 @@ func (c *gcControllerState) revise() { // (scanWork), so allocation will change this difference // slowly in the soft regime and not at all in the hard // regime. 
- scanWorkRemaining := scanWorkExpected - c.scanWork + scanWorkRemaining := scanWorkExpected - work if scanWorkRemaining < 1000 { // We set a somewhat arbitrary lower bound on // remaining scan work since if we aim a little high, @@ -552,8 +588,15 @@ func (c *gcControllerState) revise() { // Compute the mutator assist ratio so by the time the mutator // allocates the remaining heap bytes up to next_gc, it will // have done (or stolen) the remaining amount of scan work. - c.assistWorkPerByte = float64(scanWorkRemaining) / float64(heapRemaining) - c.assistBytesPerWork = float64(heapRemaining) / float64(scanWorkRemaining) + // Note that the assist ratio values are updated atomically + // but not together. This means there may be some degree of + // skew between the two values. This is generally OK as the + // values shift relatively slowly over the course of a GC + // cycle. + assistWorkPerByte := float64(scanWorkRemaining) / float64(heapRemaining) + assistBytesPerWork := float64(heapRemaining) / float64(scanWorkRemaining) + atomic.Store64(&c.assistWorkPerByte, float64bits(assistWorkPerByte)) + atomic.Store64(&c.assistBytesPerWork, float64bits(assistBytesPerWork)) } // endCycle computes the trigger ratio for the next cycle. @@ -670,18 +713,12 @@ func (c *gcControllerState) enlistWorker() { } } -// findRunnableGCWorker returns the background mark worker for _p_ if it +// findRunnableGCWorker returns a background mark worker for _p_ if it // should be run. This must only be called when gcBlackenEnabled != 0. func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g { if gcBlackenEnabled == 0 { throw("gcControllerState.findRunnable: blackening not enabled") } - if _p_.gcBgMarkWorker == 0 { - // The mark worker associated with this P is blocked - // performing a mark transition. We can't run it - // because it may be on some other run or wait queue. - return nil - } if !gcMarkWorkAvailable(_p_) { // No work to be done right now. 
This can happen at @@ -691,15 +728,35 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g { return nil } + // Grab a worker before we commit to running below. + node := (*gcBgMarkWorkerNode)(gcBgMarkWorkerPool.pop()) + if node == nil { + // There is at least one worker per P, so normally there are + // enough workers to run on all Ps, if necessary. However, once + // a worker enters gcMarkDone it may park without rejoining the + // pool, thus freeing a P with no corresponding worker. + // gcMarkDone never depends on another worker doing work, so it + // is safe to simply do nothing here. + // + // If gcMarkDone bails out without completing the mark phase, + // it will always do so with queued global work. Thus, that P + // will be immediately eligible to re-run the worker G it was + // just using, ensuring work can complete. + return nil + } + decIfPositive := func(ptr *int64) bool { - if *ptr > 0 { - if atomic.Xaddint64(ptr, -1) >= 0 { + for { + v := atomic.Loadint64(ptr) + if v <= 0 { + return false + } + + // TODO: having atomic.Casint64 would be more pleasant. + if atomic.Cas64((*uint64)(unsafe.Pointer(ptr)), uint64(v), uint64(v-1)) { return true } - // We lost a race - atomic.Xaddint64(ptr, +1) } - return false } if decIfPositive(&c.dedicatedMarkWorkersNeeded) { @@ -708,6 +765,7 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g { _p_.gcMarkWorkerMode = gcMarkWorkerDedicatedMode } else if c.fractionalUtilizationGoal == 0 { // No need for fractional workers. + gcBgMarkWorkerPool.push(&node.node) return nil } else { // Is this P behind on the fractional utilization @@ -717,14 +775,15 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g { delta := nanotime() - gcController.markStartTime if delta > 0 && float64(_p_.gcFractionalMarkTime)/float64(delta) > c.fractionalUtilizationGoal { // Nope. No need to run a fractional worker. + gcBgMarkWorkerPool.push(&node.node) return nil } // Run a fractional worker. 
_p_.gcMarkWorkerMode = gcMarkWorkerFractionalMode } - // Run the background mark worker - gp := _p_.gcBgMarkWorker.ptr() + // Run the background mark worker. + gp := node.gp.ptr() casgstatus(gp, _Gwaiting, _Grunnable) if trace.enabled { traceGoUnpark(gp, 0) @@ -762,6 +821,8 @@ func pollFractionalWorkerExit() bool { // // mheap_.lock must be held or the world must be stopped. func gcSetTriggerRatio(triggerRatio float64) { + assertWorldStoppedOrLockHeld(&mheap_.lock) + // Compute the next GC goal, which is when the allocated heap // has grown by GOGC/100 over the heap marked by the last // cycle. @@ -844,7 +905,7 @@ func gcSetTriggerRatio(triggerRatio float64) { // Commit to the trigger and goal. memstats.gc_trigger = trigger - memstats.next_gc = goal + atomic.Store64(&memstats.next_gc, goal) if trace.enabled { traceNextGC() } @@ -901,7 +962,9 @@ func gcSetTriggerRatio(triggerRatio float64) { // // mheap_.lock must be held or the world must be stopped. func gcEffectiveGrowthRatio() float64 { - egogc := float64(memstats.next_gc-memstats.heap_marked) / float64(memstats.heap_marked) + assertWorldStoppedOrLockHeld(&mheap_.lock) + + egogc := float64(atomic.Load64(&memstats.next_gc)-memstats.heap_marked) / float64(memstats.heap_marked) if egogc < 0 { // Shouldn't happen, but just in case. egogc = 0 @@ -983,7 +1046,6 @@ var work struct { nproc uint32 tstart int64 nwait uint32 - ndone uint32 // Number of roots of various root types. Set by gcMarkRootPrepare. nFlushCacheRoots int @@ -1381,6 +1443,7 @@ func gcStart(trigger gcTrigger) { now = startTheWorldWithSema(trace.enabled) work.pauseNS += now - work.pauseStart work.tMark = now + memstats.gcPauseDist.record(now - work.pauseStart) }) // Release the world sema before Gosched() in STW mode @@ -1407,19 +1470,6 @@ func gcStart(trigger gcTrigger) { // This is protected by markDoneSema. var gcMarkDoneFlushed uint32 -// debugCachedWork enables extra checks for debugging premature mark -// termination. 
-// -// For debugging issue #27993. -const debugCachedWork = false - -// gcWorkPauseGen is for debugging the mark completion algorithm. -// gcWork put operations spin while gcWork.pauseGen == gcWorkPauseGen. -// Only used if debugCachedWork is true. -// -// For debugging issue #27993. -var gcWorkPauseGen uint32 = 1 - // gcMarkDone transitions the GC from mark to mark termination if all // reachable objects have been marked (that is, there are no grey // objects and can be no more in the future). Otherwise, it flushes @@ -1475,15 +1525,7 @@ top: // Flush the write barrier buffer, since this may add // work to the gcWork. wbBufFlush1(_p_) - // For debugging, shrink the write barrier - // buffer so it flushes immediately. - // wbBuf.reset will keep it at this size as - // long as throwOnGCWork is set. - if debugCachedWork { - b := &_p_.wbBuf - b.end = uintptr(unsafe.Pointer(&b.buf[wbBufEntryPointers])) - b.debugGen = gcWorkPauseGen - } + // Flush the gcWork, since this may create global work // and set the flushedWork flag. // @@ -1494,29 +1536,12 @@ top: if _p_.gcw.flushedWork { atomic.Xadd(&gcMarkDoneFlushed, 1) _p_.gcw.flushedWork = false - } else if debugCachedWork { - // For debugging, freeze the gcWork - // until we know whether we've reached - // completion or not. If we think - // we've reached completion, but - // there's a paused gcWork, then - // that's a bug. - _p_.gcw.pauseGen = gcWorkPauseGen - // Capture the G's stack. - for i := range _p_.gcw.pauseStack { - _p_.gcw.pauseStack[i] = 0 - } - callers(1, _p_.gcw.pauseStack[:]) } }) casgstatus(gp, _Gwaiting, _Grunning) }) if gcMarkDoneFlushed != 0 { - if debugCachedWork { - // Release paused gcWorks. - atomic.Xadd(&gcWorkPauseGen, 1) - } // More grey objects were discovered since the // previous termination check, so there may be more // work to do. Keep going. It's possible the @@ -1526,13 +1551,6 @@ top: goto top } - if debugCachedWork { - throwOnGCWork = true - // Release paused gcWorks. 
If there are any, they - // should now observe throwOnGCWork and panic. - atomic.Xadd(&gcWorkPauseGen, 1) - } - // There was no global work, no local work, and no Ps // communicated work since we took markDoneSema. Therefore // there are no grey objects and no more objects can be @@ -1549,59 +1567,34 @@ top: // below. The important thing is that the wb remains active until // all marking is complete. This includes writes made by the GC. - if debugCachedWork { - // For debugging, double check that no work was added after we - // went around above and disable write barrier buffering. + // There is sometimes work left over when we enter mark termination due + // to write barriers performed after the completion barrier above. + // Detect this and resume concurrent mark. This is obviously + // unfortunate. + // + // See issue #27993 for details. + // + // Switch to the system stack to call wbBufFlush1, though in this case + // it doesn't matter because we're non-preemptible anyway. + restart := false + systemstack(func() { for _, p := range allp { - gcw := &p.gcw - if !gcw.empty() { - printlock() - print("runtime: P ", p.id, " flushedWork ", gcw.flushedWork) - if gcw.wbuf1 == nil { - print(" wbuf1=<nil>") - } else { - print(" wbuf1.n=", gcw.wbuf1.nobj) - } - if gcw.wbuf2 == nil { - print(" wbuf2=<nil>") - } else { - print(" wbuf2.n=", gcw.wbuf2.nobj) - } - print("\n") - if gcw.pauseGen == gcw.putGen { - println("runtime: checkPut already failed at this generation") - } - throw("throwOnGCWork") + wbBufFlush1(p) + if !p.gcw.empty() { + restart = true + break } } - } else { - // For unknown reasons (see issue #27993), there is - // sometimes work left over when we enter mark - // termination. Detect this and resume concurrent - // mark. This is obviously unfortunate. - // - // Switch to the system stack to call wbBufFlush1, - // though in this case it doesn't matter because we're - // non-preemptible anyway. 
- restart := false + }) + if restart { + getg().m.preemptoff = "" systemstack(func() { - for _, p := range allp { - wbBufFlush1(p) - if !p.gcw.empty() { - restart = true - break - } - } + now := startTheWorldWithSema(true) + work.pauseNS += now - work.pauseStart + memstats.gcPauseDist.record(now - work.pauseStart) }) - if restart { - getg().m.preemptoff = "" - systemstack(func() { - now := startTheWorldWithSema(true) - work.pauseNS += now - work.pauseStart - }) - semrelease(&worldsema) - goto top - } + semrelease(&worldsema) + goto top } // Disable assists and background workers. We must do @@ -1630,10 +1623,10 @@ top: gcMarkTermination(nextTriggerRatio) } +// World must be stopped and mark assists and background workers must be +// disabled. func gcMarkTermination(nextTriggerRatio float64) { - // World is stopped. - // Start marktermination which includes enabling the write barrier. - atomic.Store(&gcBlackenEnabled, 0) + // Start marktermination (write barrier remains enabled for now). setGCPhase(_GCmarktermination) work.heap1 = memstats.heap_live @@ -1711,6 +1704,7 @@ func gcMarkTermination(nextTriggerRatio float64) { unixNow := sec*1e9 + int64(nsec) work.pauseNS += now - work.pauseStart work.tEnd = now + memstats.gcPauseDist.record(now - work.pauseStart) atomic.Store64(&memstats.last_gc_unix, uint64(unixNow)) // must be Unix time to make sense to user atomic.Store64(&memstats.last_gc_nanotime, uint64(now)) // monotonic time for us memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(work.pauseNS) @@ -1827,19 +1821,25 @@ func gcMarkTermination(nextTriggerRatio float64) { } } -// gcBgMarkStartWorkers prepares background mark worker goroutines. -// These goroutines will not run until the mark phase, but they must -// be started while the work is not stopped and from a regular G -// stack. The caller must hold worldsema. +// gcBgMarkStartWorkers prepares background mark worker goroutines. 
These +// goroutines will not run until the mark phase, but they must be started while +// the work is not stopped and from a regular G stack. The caller must hold +// worldsema. func gcBgMarkStartWorkers() { - // Background marking is performed by per-P G's. Ensure that - // each P has a background GC G. - for _, p := range allp { - if p.gcBgMarkWorker == 0 { - go gcBgMarkWorker(p) - notetsleepg(&work.bgMarkReady, -1) - noteclear(&work.bgMarkReady) - } + // Background marking is performed by per-P G's. Ensure that each P has + // a background GC G. + // + // Worker Gs don't exit if gomaxprocs is reduced. If it is raised + // again, we can reuse the old workers; no need to create new workers. + for gcBgMarkWorkerCount < gomaxprocs { + go gcBgMarkWorker() + + notetsleepg(&work.bgMarkReady, -1) + noteclear(&work.bgMarkReady) + // The worker is now guaranteed to be added to the pool before + // its P's next findRunnableGCWorker. + + gcBgMarkWorkerCount++ } } @@ -1859,82 +1859,104 @@ func gcBgMarkPrepare() { work.nwait = ^uint32(0) } -func gcBgMarkWorker(_p_ *p) { +// gcBgMarkWorker is an entry in the gcBgMarkWorkerPool. It points to a single +// gcBgMarkWorker goroutine. +type gcBgMarkWorkerNode struct { + // Unused workers are managed in a lock-free stack. This field must be first. + node lfnode + + // The g of this worker. + gp guintptr + + // Release this m on park. This is used to communicate with the unlock + // function, which cannot access the G's stack. It is unused outside of + // gcBgMarkWorker(). + m muintptr +} + +func gcBgMarkWorker() { gp := getg() - type parkInfo struct { - m muintptr // Release this m on park. - attach puintptr // If non-nil, attach to this p on park. - } - // We pass park to a gopark unlock function, so it can't be on + // We pass node to a gopark unlock function, so it can't be on // the stack (see gopark). Prevent deadlock from recursively // starting GC by disabling preemption. 
gp.m.preemptoff = "GC worker init" - park := new(parkInfo) + node := new(gcBgMarkWorkerNode) gp.m.preemptoff = "" - park.m.set(acquirem()) - park.attach.set(_p_) - // Inform gcBgMarkStartWorkers that this worker is ready. - // After this point, the background mark worker is scheduled - // cooperatively by gcController.findRunnable. Hence, it must - // never be preempted, as this would put it into _Grunnable - // and put it on a run queue. Instead, when the preempt flag - // is set, this puts itself into _Gwaiting to be woken up by - // gcController.findRunnable at the appropriate time. + node.gp.set(gp) + + node.m.set(acquirem()) notewakeup(&work.bgMarkReady) + // After this point, the background mark worker is generally scheduled + // cooperatively by gcController.findRunnableGCWorker. While performing + // work on the P, preemption is disabled because we are working on + // P-local work buffers. When the preempt flag is set, this puts itself + // into _Gwaiting to be woken up by gcController.findRunnableGCWorker + // at the appropriate time. + // + // When preemption is enabled (e.g., while in gcMarkDone), this worker + // may be preempted and schedule as a _Grunnable G from a runq. That is + // fine; it will eventually gopark again for further scheduling via + // findRunnableGCWorker. + // + // Since we disable preemption before notifying bgMarkReady, we + // guarantee that this G will be in the worker pool for the next + // findRunnableGCWorker. This isn't strictly necessary, but it reduces + // latency between _GCmark starting and the workers starting. for { - // Go to sleep until woken by gcController.findRunnable. - // We can't releasem yet since even the call to gopark - // may be preempted. - gopark(func(g *g, parkp unsafe.Pointer) bool { - park := (*parkInfo)(parkp) - - // The worker G is no longer running, so it's - // now safe to allow preemption. - releasem(park.m.ptr()) + // Go to sleep until woken by + // gcController.findRunnableGCWorker. 
+ gopark(func(g *g, nodep unsafe.Pointer) bool { + node := (*gcBgMarkWorkerNode)(nodep) - // If the worker isn't attached to its P, - // attach now. During initialization and after - // a phase change, the worker may have been - // running on a different P. As soon as we - // attach, the owner P may schedule the - // worker, so this must be done after the G is - // stopped. - if park.attach != 0 { - p := park.attach.ptr() - park.attach.set(nil) - // cas the worker because we may be - // racing with a new worker starting - // on this P. - if !p.gcBgMarkWorker.cas(0, guintptr(unsafe.Pointer(g))) { - // The P got a new worker. - // Exit this worker. - return false - } + if mp := node.m.ptr(); mp != nil { + // The worker G is no longer running; release + // the M. + // + // N.B. it is _safe_ to release the M as soon + // as we are no longer performing P-local mark + // work. + // + // However, since we cooperatively stop work + // when gp.preempt is set, if we releasem in + // the loop then the following call to gopark + // would immediately preempt the G. This is + // also safe, but inefficient: the G must + // schedule again only to enter gopark and park + // again. Thus, we defer the release until + // after parking the G. + releasem(mp) } + + // Release this G to the pool. + gcBgMarkWorkerPool.push(&node.node) + // Note that at this point, the G may immediately be + // rescheduled and may be running. return true - }, unsafe.Pointer(park), waitReasonGCWorkerIdle, traceEvGoBlock, 0) + }, unsafe.Pointer(node), waitReasonGCWorkerIdle, traceEvGoBlock, 0) - // Loop until the P dies and disassociates this - // worker (the P may later be reused, in which case - // it will get a new worker) or we failed to associate. - if _p_.gcBgMarkWorker.ptr() != gp { - break - } + // Preemption must not occur here, or another G might see + // p.gcMarkWorkerMode. // Disable preemption so we can use the gcw. 
If the // scheduler wants to preempt us, we'll stop draining, // dispose the gcw, and then preempt. - park.m.set(acquirem()) + node.m.set(acquirem()) + pp := gp.m.p.ptr() // P can't change with preemption disabled. if gcBlackenEnabled == 0 { + println("worker mode", pp.gcMarkWorkerMode) throw("gcBgMarkWorker: blackening not enabled") } + if pp.gcMarkWorkerMode == gcMarkWorkerNotWorker { + throw("gcBgMarkWorker: mode not set") + } + startTime := nanotime() - _p_.gcMarkWorkerStartTime = startTime + pp.gcMarkWorkerStartTime = startTime decnwait := atomic.Xadd(&work.nwait, -1) if decnwait == work.nproc { @@ -1951,11 +1973,11 @@ func gcBgMarkWorker(_p_ *p) { // disabled for mark workers, so it is safe to // read from the G stack. casgstatus(gp, _Grunning, _Gwaiting) - switch _p_.gcMarkWorkerMode { + switch pp.gcMarkWorkerMode { default: throw("gcBgMarkWorker: unexpected gcMarkWorkerMode") case gcMarkWorkerDedicatedMode: - gcDrain(&_p_.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit) + gcDrain(&pp.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit) if gp.preempt { // We were preempted. This is // a useful signal to kick @@ -1964,7 +1986,7 @@ func gcBgMarkWorker(_p_ *p) { // somewhere else. lock(&sched.lock) for { - gp, _ := runqget(_p_) + gp, _ := runqget(pp) if gp == nil { break } @@ -1974,24 +1996,24 @@ func gcBgMarkWorker(_p_ *p) { } // Go back to draining, this time // without preemption. - gcDrain(&_p_.gcw, gcDrainFlushBgCredit) + gcDrain(&pp.gcw, gcDrainFlushBgCredit) case gcMarkWorkerFractionalMode: - gcDrain(&_p_.gcw, gcDrainFractional|gcDrainUntilPreempt|gcDrainFlushBgCredit) + gcDrain(&pp.gcw, gcDrainFractional|gcDrainUntilPreempt|gcDrainFlushBgCredit) case gcMarkWorkerIdleMode: - gcDrain(&_p_.gcw, gcDrainIdle|gcDrainUntilPreempt|gcDrainFlushBgCredit) + gcDrain(&pp.gcw, gcDrainIdle|gcDrainUntilPreempt|gcDrainFlushBgCredit) } casgstatus(gp, _Gwaiting, _Grunning) }) // Account for time. 
duration := nanotime() - startTime - switch _p_.gcMarkWorkerMode { + switch pp.gcMarkWorkerMode { case gcMarkWorkerDedicatedMode: atomic.Xaddint64(&gcController.dedicatedMarkTime, duration) atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 1) case gcMarkWorkerFractionalMode: atomic.Xaddint64(&gcController.fractionalMarkTime, duration) - atomic.Xaddint64(&_p_.gcFractionalMarkTime, duration) + atomic.Xaddint64(&pp.gcFractionalMarkTime, duration) case gcMarkWorkerIdleMode: atomic.Xaddint64(&gcController.idleMarkTime, duration) } @@ -2000,31 +2022,27 @@ func gcBgMarkWorker(_p_ *p) { // of work? incnwait := atomic.Xadd(&work.nwait, +1) if incnwait > work.nproc { - println("runtime: p.gcMarkWorkerMode=", _p_.gcMarkWorkerMode, + println("runtime: p.gcMarkWorkerMode=", pp.gcMarkWorkerMode, "work.nwait=", incnwait, "work.nproc=", work.nproc) throw("work.nwait > work.nproc") } + // We'll releasem after this point and thus this P may run + // something else. We must clear the worker mode to avoid + // attributing the mode to a different (non-worker) G in + // traceGoStart. + pp.gcMarkWorkerMode = gcMarkWorkerNotWorker + // If this worker reached a background mark completion // point, signal the main GC goroutine. if incnwait == work.nproc && !gcMarkWorkAvailable(nil) { - // Make this G preemptible and disassociate it - // as the worker for this P so - // findRunnableGCWorker doesn't try to - // schedule it. - _p_.gcBgMarkWorker.set(nil) - releasem(park.m.ptr()) + // We don't need the P-local buffers here, allow + // preemption becuse we may schedule like a regular + // goroutine in gcMarkDone (block on locks, etc). + releasem(node.m.ptr()) + node.m.set(nil) gcMarkDone() - - // Disable preemption and prepare to reattach - // to the P. - // - // We may be running on a different P at this - // point, so we can't reattach until this G is - // parked. 
- park.m.set(acquirem()) - park.attach.set(_p_) } } } @@ -2085,7 +2103,7 @@ func gcMark(start_time int64) { // ensured all reachable objects were marked, all of // these must be pointers to black objects. Hence we // can just discard the write barrier buffer. - if debug.gccheckmark > 0 || throwOnGCWork { + if debug.gccheckmark > 0 { // For debugging, flush the buffer and make // sure it really was all marked. wbBufFlush1(p) @@ -2117,13 +2135,21 @@ func gcMark(start_time int64) { gcw.dispose() } - throwOnGCWork = false - - cachestats() - // Update the marked heap stat. memstats.heap_marked = work.bytesMarked + // Flush scanAlloc from each mcache since we're about to modify + // heap_scan directly. If we were to flush this later, then scanAlloc + // might have incorrect information. + for _, p := range allp { + c := p.mcache + if c == nil { + continue + } + memstats.heap_scan += uint64(c.scanAlloc) + c.scanAlloc = 0 + } + // Update other GC heap size stats. This must happen after // cachestats (which flushes local statistics to these) and // flushallmcaches (which modifies heap_live). @@ -2142,6 +2168,8 @@ func gcMark(start_time int64) { // //go:systemstack func gcSweep(mode gcMode) { + assertWorldStopped() + if gcphase != _GCoff { throw("gcSweep being done but phase is not GCoff") } diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go index 79df59d6d6..5a24cdac88 100644 --- a/src/runtime/mgcmark.go +++ b/src/runtime/mgcmark.go @@ -54,6 +54,8 @@ const ( // // The world must be stopped. func gcMarkRootPrepare() { + assertWorldStopped() + work.nFlushCacheRoots = 0 // Compute how many data and BSS root blocks there are. @@ -400,11 +402,13 @@ retry: // balance positive. When the required amount of work is low, // we over-assist to build up credit for future allocations // and amortize the cost of assisting. 
+ assistWorkPerByte := float64frombits(atomic.Load64(&gcController.assistWorkPerByte)) + assistBytesPerWork := float64frombits(atomic.Load64(&gcController.assistBytesPerWork)) debtBytes := -gp.gcAssistBytes - scanWork := int64(gcController.assistWorkPerByte * float64(debtBytes)) + scanWork := int64(assistWorkPerByte * float64(debtBytes)) if scanWork < gcOverAssistWork { scanWork = gcOverAssistWork - debtBytes = int64(gcController.assistBytesPerWork * float64(scanWork)) + debtBytes = int64(assistBytesPerWork * float64(scanWork)) } // Steal as much credit as we can from the background GC's @@ -418,7 +422,7 @@ retry: if bgScanCredit > 0 { if bgScanCredit < scanWork { stolen = bgScanCredit - gp.gcAssistBytes += 1 + int64(gcController.assistBytesPerWork*float64(stolen)) + gp.gcAssistBytes += 1 + int64(assistBytesPerWork*float64(stolen)) } else { stolen = scanWork gp.gcAssistBytes += debtBytes @@ -543,7 +547,8 @@ func gcAssistAlloc1(gp *g, scanWork int64) { // this scan work counts for. The "1+" is a poor man's // round-up, to ensure this adds credit even if // assistBytesPerWork is very low. - gp.gcAssistBytes += 1 + int64(gcController.assistBytesPerWork*float64(workDone)) + assistBytesPerWork := float64frombits(atomic.Load64(&gcController.assistBytesPerWork)) + gp.gcAssistBytes += 1 + int64(assistBytesPerWork*float64(workDone)) // If this is the last worker and we ran out of work, // signal a completion point. @@ -637,7 +642,8 @@ func gcFlushBgCredit(scanWork int64) { return } - scanBytes := int64(float64(scanWork) * gcController.assistBytesPerWork) + assistBytesPerWork := float64frombits(atomic.Load64(&gcController.assistBytesPerWork)) + scanBytes := int64(float64(scanWork) * assistBytesPerWork) lock(&work.assistQueue.lock) for !work.assistQueue.q.empty() && scanBytes > 0 { @@ -670,7 +676,8 @@ func gcFlushBgCredit(scanWork int64) { if scanBytes > 0 { // Convert from scan bytes back to work. 
- scanWork = int64(float64(scanBytes) * gcController.assistWorkPerByte) + assistWorkPerByte := float64frombits(atomic.Load64(&gcController.assistWorkPerByte)) + scanWork = int64(float64(scanBytes) * assistWorkPerByte) atomic.Xaddint64(&gcController.bgScanCredit, scanWork) } unlock(&work.assistQueue.lock) @@ -1530,6 +1537,8 @@ func gcmarknewobject(span *mspan, obj, size, scanSize uintptr) { // // The world must be stopped. func gcMarkTinyAllocs() { + assertWorldStopped() + for _, p := range allp { c := p.mcache if c == nil || c.tiny == 0 { diff --git a/src/runtime/mgcscavenge.go b/src/runtime/mgcscavenge.go index b74da1057a..38f09309dc 100644 --- a/src/runtime/mgcscavenge.go +++ b/src/runtime/mgcscavenge.go @@ -90,7 +90,7 @@ const ( // // This ratio is used as part of multiplicative factor to help the scavenger account // for the additional costs of using scavenged memory in its pacing. - scavengeCostRatio = 0.7 * sys.GoosDarwin + scavengeCostRatio = 0.7 * (sys.GoosDarwin + sys.GoosIos) // scavengeReservationShards determines the amount of memory the scavenger // should reserve for scavenging at a time. Specifically, the amount of @@ -100,7 +100,7 @@ const ( // heapRetained returns an estimate of the current heap RSS. func heapRetained() uint64 { - return atomic.Load64(&memstats.heap_sys) - atomic.Load64(&memstats.heap_released) + return memstats.heap_sys.load() - atomic.Load64(&memstats.heap_released) } // gcPaceScavenger updates the scavenger's pacing, particularly @@ -123,7 +123,7 @@ func gcPaceScavenger() { return } // Compute our scavenging goal. - goalRatio := float64(memstats.next_gc) / float64(memstats.last_next_gc) + goalRatio := float64(atomic.Load64(&memstats.next_gc)) / float64(memstats.last_next_gc) retainedGoal := uint64(float64(memstats.last_heap_inuse) * goalRatio) // Add retainExtraPercent overhead to retainedGoal. 
This calculation // looks strange but the purpose is to arrive at an integer division @@ -390,13 +390,15 @@ func bgscavenge(c chan int) { // // Returns the amount of memory scavenged in bytes. // -// s.mheapLock must be held, but may be temporarily released if +// p.mheapLock must be held, but may be temporarily released if // mayUnlock == true. // -// Must run on the system stack because s.mheapLock must be held. +// Must run on the system stack because p.mheapLock must be held. // //go:systemstack -func (s *pageAlloc) scavenge(nbytes uintptr, mayUnlock bool) uintptr { +func (p *pageAlloc) scavenge(nbytes uintptr, mayUnlock bool) uintptr { + assertLockHeld(p.mheapLock) + var ( addrs addrRange gen uint32 @@ -404,17 +406,17 @@ func (s *pageAlloc) scavenge(nbytes uintptr, mayUnlock bool) uintptr { released := uintptr(0) for released < nbytes { if addrs.size() == 0 { - if addrs, gen = s.scavengeReserve(); addrs.size() == 0 { + if addrs, gen = p.scavengeReserve(); addrs.size() == 0 { break } } - r, a := s.scavengeOne(addrs, nbytes-released, mayUnlock) + r, a := p.scavengeOne(addrs, nbytes-released, mayUnlock) released += r addrs = a } // Only unreserve the space which hasn't been scavenged or searched // to ensure we always make progress. - s.scavengeUnreserve(addrs, gen) + p.scavengeUnreserve(addrs, gen) return released } @@ -440,46 +442,48 @@ func printScavTrace(gen uint32, released uintptr, forced bool) { // scavengeStartGen starts a new scavenge generation, resetting // the scavenger's search space to the full in-use address space. // -// s.mheapLock must be held. +// p.mheapLock must be held. // -// Must run on the system stack because s.mheapLock must be held. +// Must run on the system stack because p.mheapLock must be held. 
// //go:systemstack -func (s *pageAlloc) scavengeStartGen() { +func (p *pageAlloc) scavengeStartGen() { + assertLockHeld(p.mheapLock) + if debug.scavtrace > 0 { - printScavTrace(s.scav.gen, s.scav.released, false) + printScavTrace(p.scav.gen, p.scav.released, false) } - s.inUse.cloneInto(&s.scav.inUse) + p.inUse.cloneInto(&p.scav.inUse) // Pick the new starting address for the scavenger cycle. var startAddr offAddr - if s.scav.scavLWM.lessThan(s.scav.freeHWM) { + if p.scav.scavLWM.lessThan(p.scav.freeHWM) { // The "free" high watermark exceeds the "scavenged" low watermark, // so there are free scavengable pages in parts of the address space // that the scavenger already searched, the high watermark being the // highest one. Pick that as our new starting point to ensure we // see those pages. - startAddr = s.scav.freeHWM + startAddr = p.scav.freeHWM } else { // The "free" high watermark does not exceed the "scavenged" low // watermark. This means the allocator didn't free any memory in // the range we scavenged last cycle, so we might as well continue // scavenging from where we were. - startAddr = s.scav.scavLWM + startAddr = p.scav.scavLWM } - s.scav.inUse.removeGreaterEqual(startAddr.addr()) + p.scav.inUse.removeGreaterEqual(startAddr.addr()) - // reservationBytes may be zero if s.inUse.totalBytes is small, or if + // reservationBytes may be zero if p.inUse.totalBytes is small, or if // scavengeReservationShards is large. This case is fine as the scavenger // will simply be turned off, but it does mean that scavengeReservationShards, // in concert with pallocChunkBytes, dictates the minimum heap size at which // the scavenger triggers. In practice this minimum is generally less than an // arena in size, so virtually every heap has the scavenger on. 
- s.scav.reservationBytes = alignUp(s.inUse.totalBytes, pallocChunkBytes) / scavengeReservationShards - s.scav.gen++ - s.scav.released = 0 - s.scav.freeHWM = minOffAddr - s.scav.scavLWM = maxOffAddr + p.scav.reservationBytes = alignUp(p.inUse.totalBytes, pallocChunkBytes) / scavengeReservationShards + p.scav.gen++ + p.scav.released = 0 + p.scav.freeHWM = minOffAddr + p.scav.scavLWM = maxOffAddr } // scavengeReserve reserves a contiguous range of the address space @@ -489,19 +493,21 @@ func (s *pageAlloc) scavengeStartGen() { // // Returns the reserved range and the scavenge generation number for it. // -// s.mheapLock must be held. +// p.mheapLock must be held. // -// Must run on the system stack because s.mheapLock must be held. +// Must run on the system stack because p.mheapLock must be held. // //go:systemstack -func (s *pageAlloc) scavengeReserve() (addrRange, uint32) { +func (p *pageAlloc) scavengeReserve() (addrRange, uint32) { + assertLockHeld(p.mheapLock) + // Start by reserving the minimum. - r := s.scav.inUse.removeLast(s.scav.reservationBytes) + r := p.scav.inUse.removeLast(p.scav.reservationBytes) // Return early if the size is zero; we don't want to use // the bogus address below. if r.size() == 0 { - return r, s.scav.gen + return r, p.scav.gen } // The scavenger requires that base be aligned to a @@ -511,27 +517,29 @@ func (s *pageAlloc) scavengeReserve() (addrRange, uint32) { newBase := alignDown(r.base.addr(), pallocChunkBytes) // Remove from inUse however much extra we just pulled out. - s.scav.inUse.removeGreaterEqual(newBase) + p.scav.inUse.removeGreaterEqual(newBase) r.base = offAddr{newBase} - return r, s.scav.gen + return r, p.scav.gen } // scavengeUnreserve returns an unscavenged portion of a range that was // previously reserved with scavengeReserve. // -// s.mheapLock must be held. +// p.mheapLock must be held. // -// Must run on the system stack because s.mheapLock must be held. 
+// Must run on the system stack because p.mheapLock must be held. // //go:systemstack -func (s *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) { - if r.size() == 0 || gen != s.scav.gen { +func (p *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) { + assertLockHeld(p.mheapLock) + + if r.size() == 0 || gen != p.scav.gen { return } if r.base.addr()%pallocChunkBytes != 0 { throw("unreserving unaligned region") } - s.scav.inUse.add(r) + p.scav.inUse.add(r) } // scavengeOne walks over address range work until it finds @@ -545,13 +553,15 @@ func (s *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) { // // work's base address must be aligned to pallocChunkBytes. // -// s.mheapLock must be held, but may be temporarily released if +// p.mheapLock must be held, but may be temporarily released if // mayUnlock == true. // -// Must run on the system stack because s.mheapLock must be held. +// Must run on the system stack because p.mheapLock must be held. // //go:systemstack -func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (uintptr, addrRange) { +func (p *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (uintptr, addrRange) { + assertLockHeld(p.mheapLock) + // Defensively check if we've recieved an empty address range. // If so, just return. if work.size() == 0 { @@ -586,12 +596,12 @@ func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui // Helpers for locking and unlocking only if mayUnlock == true. lockHeap := func() { if mayUnlock { - lock(s.mheapLock) + lock(p.mheapLock) } } unlockHeap := func() { if mayUnlock { - unlock(s.mheapLock) + unlock(p.mheapLock) } } @@ -602,14 +612,16 @@ func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui // by subtracting 1. 
maxAddr := work.limit.addr() - 1 maxChunk := chunkIndex(maxAddr) - if s.summary[len(s.summary)-1][maxChunk].max() >= uint(minPages) { + if p.summary[len(p.summary)-1][maxChunk].max() >= uint(minPages) { // We only bother looking for a candidate if there at least // minPages free pages at all. - base, npages := s.chunkOf(maxChunk).findScavengeCandidate(chunkPageIndex(maxAddr), minPages, maxPages) + base, npages := p.chunkOf(maxChunk).findScavengeCandidate(chunkPageIndex(maxAddr), minPages, maxPages) // If we found something, scavenge it and return! if npages != 0 { - work.limit = offAddr{s.scavengeRangeLocked(maxChunk, base, npages)} + work.limit = offAddr{p.scavengeRangeLocked(maxChunk, base, npages)} + + assertLockHeld(p.mheapLock) // Must be locked on return. return uintptr(npages) * pageSize, work } } @@ -631,7 +643,7 @@ func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui // that's fine. We're being optimistic anyway. // Check quickly if there are enough free pages at all. - if s.summary[len(s.summary)-1][i].max() < uint(minPages) { + if p.summary[len(p.summary)-1][i].max() < uint(minPages) { continue } @@ -641,7 +653,7 @@ func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui // avoid races with heap growth. It may or may not be possible to also // see a nil pointer in this case if we do race with heap growth, but // just defensively ignore the nils. This operation is optimistic anyway. - l2 := (*[1 << pallocChunksL2Bits]pallocData)(atomic.Loadp(unsafe.Pointer(&s.chunks[i.l1()]))) + l2 := (*[1 << pallocChunksL2Bits]pallocData)(atomic.Loadp(unsafe.Pointer(&p.chunks[i.l1()]))) if l2 != nil && l2[i.l2()].hasScavengeCandidate(minPages) { return i, true } @@ -670,16 +682,20 @@ func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui } // Find, verify, and scavenge if we can. 
- chunk := s.chunkOf(candidateChunkIdx) + chunk := p.chunkOf(candidateChunkIdx) base, npages := chunk.findScavengeCandidate(pallocChunkPages-1, minPages, maxPages) if npages > 0 { - work.limit = offAddr{s.scavengeRangeLocked(candidateChunkIdx, base, npages)} + work.limit = offAddr{p.scavengeRangeLocked(candidateChunkIdx, base, npages)} + + assertLockHeld(p.mheapLock) // Must be locked on return. return uintptr(npages) * pageSize, work } // We were fooled, so let's continue from where we left off. work.limit = offAddr{chunkBase(candidateChunkIdx)} } + + assertLockHeld(p.mheapLock) // Must be locked on return. return 0, work } @@ -690,28 +706,38 @@ func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui // // Returns the base address of the scavenged region. // -// s.mheapLock must be held. -func (s *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr { - s.chunkOf(ci).scavenged.setRange(base, npages) +// p.mheapLock must be held. +func (p *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr { + assertLockHeld(p.mheapLock) + + p.chunkOf(ci).scavenged.setRange(base, npages) // Compute the full address for the start of the range. addr := chunkBase(ci) + uintptr(base)*pageSize // Update the scavenge low watermark. - if oAddr := (offAddr{addr}); oAddr.lessThan(s.scav.scavLWM) { - s.scav.scavLWM = oAddr + if oAddr := (offAddr{addr}); oAddr.lessThan(p.scav.scavLWM) { + p.scav.scavLWM = oAddr } // Only perform the actual scavenging if we're not in a test. // It's dangerous to do so otherwise. - if s.test { + if p.test { return addr } sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize) // Update global accounting only when not in test, otherwise // the runtime's accounting will be wrong. - mSysStatInc(&memstats.heap_released, uintptr(npages)*pageSize) + nbytes := int64(npages) * pageSize + atomic.Xadd64(&memstats.heap_released, nbytes) + + // Update consistent accounting too. 
+ stats := memstats.heapStats.acquire() + atomic.Xaddint64(&stats.committed, -nbytes) + atomic.Xaddint64(&stats.released, nbytes) + memstats.heapStats.release() + return addr } diff --git a/src/runtime/mgcscavenge_test.go b/src/runtime/mgcscavenge_test.go index 7f619b1e7d..250343077f 100644 --- a/src/runtime/mgcscavenge_test.go +++ b/src/runtime/mgcscavenge_test.go @@ -235,26 +235,43 @@ func TestPallocDataFindScavengeCandidate(t *testing.T) { if PhysHugePageSize > uintptr(PageSize) { // Check hugepage preserving behavior. bits := uint(PhysHugePageSize / uintptr(PageSize)) - tests["PreserveHugePageBottom"] = test{ - alloc: []BitRange{{bits + 2, PallocChunkPages - (bits + 2)}}, - min: 1, - max: 3, // Make it so that max would have us try to break the huge page. - want: BitRange{0, bits + 2}, - } - if 3*bits < PallocChunkPages { - // We need at least 3 huge pages in a chunk for this test to make sense. - tests["PreserveHugePageMiddle"] = test{ - alloc: []BitRange{{0, bits - 10}, {2*bits + 10, PallocChunkPages - (2*bits + 10)}}, + if bits < PallocChunkPages { + tests["PreserveHugePageBottom"] = test{ + alloc: []BitRange{{bits + 2, PallocChunkPages - (bits + 2)}}, min: 1, - max: 12, // Make it so that max would have us try to break the huge page. - want: BitRange{bits, bits + 10}, + max: 3, // Make it so that max would have us try to break the huge page. + want: BitRange{0, bits + 2}, + } + if 3*bits < PallocChunkPages { + // We need at least 3 huge pages in a chunk for this test to make sense. + tests["PreserveHugePageMiddle"] = test{ + alloc: []BitRange{{0, bits - 10}, {2*bits + 10, PallocChunkPages - (2*bits + 10)}}, + min: 1, + max: 12, // Make it so that max would have us try to break the huge page. + want: BitRange{bits, bits + 10}, + } + } + tests["PreserveHugePageTop"] = test{ + alloc: []BitRange{{0, PallocChunkPages - bits}}, + min: 1, + max: 1, // Even one page would break a huge page in this case. 
+ want: BitRange{PallocChunkPages - bits, bits}, + } + } else if bits == PallocChunkPages { + tests["PreserveHugePageAll"] = test{ + min: 1, + max: 1, // Even one page would break a huge page in this case. + want: BitRange{0, PallocChunkPages}, + } + } else { + // The huge page size is greater than pallocChunkPages, so it should + // be effectively disabled. There's no way we can possible scavenge + // a huge page out of this bitmap chunk. + tests["PreserveHugePageNone"] = test{ + min: 1, + max: 1, + want: BitRange{PallocChunkPages - 1, 1}, } - } - tests["PreserveHugePageTop"] = test{ - alloc: []BitRange{{0, PallocChunkPages - bits}}, - min: 1, - max: 1, // Even one page would break a huge page in this case. - want: BitRange{PallocChunkPages - bits, bits}, } } for name, v := range tests { diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go index 6b8c56ce35..76bc4246e5 100644 --- a/src/runtime/mgcsweep.go +++ b/src/runtime/mgcsweep.go @@ -123,6 +123,8 @@ func (h *mheap) nextSpanForSweep() *mspan { // //go:nowritebarrier func finishsweep_m() { + assertWorldStopped() + // Sweeping must be complete before marking commences, so // sweep any unswept spans. If this is a concurrent GC, there // shouldn't be any spans left to sweep, so this should finish @@ -337,8 +339,6 @@ func (s *mspan) sweep(preserve bool) bool { spc := s.spanclass size := s.elemsize - c := _g_.m.p.ptr().mcache - // The allocBits indicate which unmarked objects don't need to be // processed since they were free at the end of the last GC cycle // and were not allocated since then. @@ -503,7 +503,9 @@ func (s *mspan) sweep(preserve bool) bool { // wasn't totally filled, but then swept, still has all of its // free slots zeroed. 
s.needzero = 1 - c.local_nsmallfree[spc.sizeclass()] += uintptr(nfreed) + stats := memstats.heapStats.acquire() + atomic.Xadduintptr(&stats.smallFreeCount[spc.sizeclass()], uintptr(nfreed)) + memstats.heapStats.release() } if !preserve { // The caller may not have removed this span from whatever @@ -548,8 +550,10 @@ func (s *mspan) sweep(preserve bool) bool { } else { mheap_.freeSpan(s) } - c.local_nlargefree++ - c.local_largefree += size + stats := memstats.heapStats.acquire() + atomic.Xadduintptr(&stats.largeFreeCount, 1) + atomic.Xadduintptr(&stats.largeFree, size) + memstats.heapStats.release() return true } diff --git a/src/runtime/mgcwork.go b/src/runtime/mgcwork.go index 46101657d5..b3a068661e 100644 --- a/src/runtime/mgcwork.go +++ b/src/runtime/mgcwork.go @@ -22,13 +22,6 @@ const ( workbufAlloc = 32 << 10 ) -// throwOnGCWork causes any operations that add pointers to a gcWork -// buffer to throw. -// -// TODO(austin): This is a temporary debugging measure for issue -// #27993. To be removed before release. -var throwOnGCWork bool - func init() { if workbufAlloc%pageSize != 0 || workbufAlloc%_WorkbufSize != 0 { throw("bad workbufAlloc") @@ -93,17 +86,6 @@ type gcWork struct { // termination check. Specifically, this indicates that this // gcWork may have communicated work to another gcWork. flushedWork bool - - // pauseGen causes put operations to spin while pauseGen == - // gcWorkPauseGen if debugCachedWork is true. - pauseGen uint32 - - // putGen is the pauseGen of the last putGen. - putGen uint32 - - // pauseStack is the stack at which this P was paused if - // debugCachedWork is true. 
- pauseStack [16]uintptr } // Most of the methods of gcWork are go:nowritebarrierrec because the @@ -122,60 +104,10 @@ func (w *gcWork) init() { w.wbuf2 = wbuf2 } -func (w *gcWork) checkPut(ptr uintptr, ptrs []uintptr) { - if debugCachedWork { - alreadyFailed := w.putGen == w.pauseGen - w.putGen = w.pauseGen - if !canPreemptM(getg().m) { - // If we were to spin, the runtime may - // deadlock. Since we can't be preempted, the - // spin could prevent gcMarkDone from - // finishing the ragged barrier, which is what - // releases us from the spin. - return - } - for atomic.Load(&gcWorkPauseGen) == w.pauseGen { - } - if throwOnGCWork { - printlock() - if alreadyFailed { - println("runtime: checkPut already failed at this generation") - } - println("runtime: late gcWork put") - if ptr != 0 { - gcDumpObject("ptr", ptr, ^uintptr(0)) - } - for _, ptr := range ptrs { - gcDumpObject("ptrs", ptr, ^uintptr(0)) - } - println("runtime: paused at") - for _, pc := range w.pauseStack { - if pc == 0 { - break - } - f := findfunc(pc) - if f.valid() { - // Obviously this doesn't - // relate to ancestor - // tracebacks, but this - // function prints what we - // want. - printAncestorTracebackFuncInfo(f, pc) - } else { - println("\tunknown PC ", hex(pc), "\n") - } - } - throw("throwOnGCWork") - } - } -} - // put enqueues a pointer for the garbage collector to trace. // obj must point to the beginning of a heap object or an oblet. //go:nowritebarrierrec func (w *gcWork) put(obj uintptr) { - w.checkPut(obj, nil) - flushed := false wbuf := w.wbuf1 // Record that this may acquire the wbufSpans or heap lock to @@ -214,8 +146,6 @@ func (w *gcWork) put(obj uintptr) { // otherwise it returns false and the caller needs to call put. 
//go:nowritebarrierrec func (w *gcWork) putFast(obj uintptr) bool { - w.checkPut(obj, nil) - wbuf := w.wbuf1 if wbuf == nil { return false @@ -237,8 +167,6 @@ func (w *gcWork) putBatch(obj []uintptr) { return } - w.checkPut(0, obj) - flushed := false wbuf := w.wbuf1 if wbuf == nil { @@ -360,12 +288,10 @@ func (w *gcWork) balance() { return } if wbuf := w.wbuf2; wbuf.nobj != 0 { - w.checkPut(0, wbuf.obj[:wbuf.nobj]) putfull(wbuf) w.flushedWork = true w.wbuf2 = getempty() } else if wbuf := w.wbuf1; wbuf.nobj > 4 { - w.checkPut(0, wbuf.obj[:wbuf.nobj]) w.wbuf1 = handoff(wbuf) w.flushedWork = true // handoff did putfull } else { @@ -445,7 +371,7 @@ func getempty() *workbuf { } if s == nil { systemstack(func() { - s = mheap_.allocManual(workbufAlloc/pageSize, &memstats.gc_sys) + s = mheap_.allocManual(workbufAlloc/pageSize, spanAllocWorkBuf) }) if s == nil { throw("out of memory") @@ -547,7 +473,7 @@ func freeSomeWbufs(preemptible bool) bool { break } work.wbufSpans.free.remove(span) - mheap_.freeManual(span, &memstats.gc_sys) + mheap_.freeManual(span, spanAllocWorkBuf) } }) more := !work.wbufSpans.free.isEmpty() diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 1a57bcd66e..1855330da5 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -44,6 +44,11 @@ const ( // Must be a multiple of the pageInUse bitmap element size and // must also evenly divide pagesPerArena. pagesPerReclaimerChunk = 512 + + // physPageAlignedStacks indicates whether stack allocations must be + // physical page aligned. This is a requirement for MAP_STACK on + // OpenBSD. + physPageAlignedStacks = GOOS == "openbsd" ) // Main malloc heap. @@ -128,13 +133,6 @@ type mheap struct { // This is accessed atomically. reclaimCredit uintptr - // Malloc stats. 
- largealloc uint64 // bytes allocated for large objects - nlargealloc uint64 // number of large object allocations - largefree uint64 // bytes freed for large objects (>maxsmallsize) - nlargefree uint64 // number of frees for large objects (>maxsmallsize) - nsmallfree [_NumSizeClasses]uint64 // number of frees for small objects (<=maxsmallsize) - // arenas is the heap arena map. It points to the metadata for // the heap for every arena frame of the entire usable virtual // address space. @@ -490,10 +488,15 @@ func (s *mspan) layout() (size, n, total uintptr) { // indirect call from the fixalloc initializer, the compiler can't see // this. // +// The heap lock must be held. +// //go:nowritebarrierrec func recordspan(vh unsafe.Pointer, p unsafe.Pointer) { h := (*mheap)(vh) s := (*mspan)(p) + + assertLockHeld(&h.lock) + if len(h.allspans) >= cap(h.allspans) { n := 64 * 1024 / sys.PtrSize if n < cap(h.allspans)*3/2 { @@ -720,7 +723,7 @@ func (h *mheap) init() { h.central[i].mcentral.init(spanClass(i)) } - h.pages.init(&h.lock, &memstats.gc_sys) + h.pages.init(&h.lock, &memstats.gcMiscSys) } // reclaim sweeps and reclaims at least npage pages into the heap. @@ -728,7 +731,7 @@ func (h *mheap) init() { // // reclaim implements the page-reclaimer half of the sweeper. // -// h must NOT be locked. +// h.lock must NOT be held. func (h *mheap) reclaim(npage uintptr) { // TODO(austin): Half of the time spent freeing spans is in // locking/unlocking the heap (even with low contention). We @@ -811,6 +814,8 @@ func (h *mheap) reclaimChunk(arenas []arenaIdx, pageIdx, n uintptr) uintptr { // In particular, if a span were freed and merged concurrently // with this probing heapArena.spans, it would be possible to // observe arbitrary, stale span pointers. 
+ assertLockHeld(&h.lock) + n0 := n var nFreed uintptr sg := h.sweepgen @@ -865,9 +870,27 @@ func (h *mheap) reclaimChunk(arenas []arenaIdx, pageIdx, n uintptr) uintptr { traceGCSweepSpan((n0 - nFreed) * pageSize) lock(&h.lock) } + + assertLockHeld(&h.lock) // Must be locked on return. return nFreed } +// spanAllocType represents the type of allocation to make, or +// the type of allocation to be freed. +type spanAllocType uint8 + +const ( + spanAllocHeap spanAllocType = iota // heap span + spanAllocStack // stack span + spanAllocPtrScalarBits // unrolled GC prog bitmap span + spanAllocWorkBuf // work buf span +) + +// manual returns true if the span allocation is manually managed. +func (s spanAllocType) manual() bool { + return s != spanAllocHeap +} + // alloc allocates a new span of npage pages from the GC'd heap. // // spanclass indicates the span's size class and scannability. @@ -884,7 +907,7 @@ func (h *mheap) alloc(npages uintptr, spanclass spanClass, needzero bool) *mspan if h.sweepdone == 0 { h.reclaim(npages) } - s = h.allocSpan(npages, false, spanclass, &memstats.heap_inuse) + s = h.allocSpan(npages, spanAllocHeap, spanclass) }) if s != nil { @@ -909,9 +932,15 @@ func (h *mheap) alloc(npages uintptr, spanclass spanClass, needzero bool) *mspan // allocManual must be called on the system stack because it may // acquire the heap lock via allocSpan. See mheap for details. // +// If new code is written to call allocManual, do NOT use an +// existing spanAllocType value and instead declare a new one. 
+// //go:systemstack -func (h *mheap) allocManual(npages uintptr, stat *uint64) *mspan { - return h.allocSpan(npages, true, 0, stat) +func (h *mheap) allocManual(npages uintptr, typ spanAllocType) *mspan { + if !typ.manual() { + throw("manual span allocation called with non-manually-managed type") + } + return h.allocSpan(npages, typ, 0) } // setSpans modifies the span map so [spanOf(base), spanOf(base+npage*pageSize)) @@ -996,7 +1025,7 @@ func (h *mheap) allocNeedsZero(base, npage uintptr) (needZero bool) { // tryAllocMSpan attempts to allocate an mspan object from // the P-local cache, but may fail. // -// h need not be locked. +// h.lock need not be held. // // This caller must ensure that its P won't change underneath // it during this function. Currently to ensure that we enforce @@ -1020,7 +1049,7 @@ func (h *mheap) tryAllocMSpan() *mspan { // allocMSpanLocked allocates an mspan object. // -// h must be locked. +// h.lock must be held. // // allocMSpanLocked must be called on the system stack because // its caller holds the heap lock. See mheap for details. @@ -1029,6 +1058,8 @@ func (h *mheap) tryAllocMSpan() *mspan { // //go:systemstack func (h *mheap) allocMSpanLocked() *mspan { + assertLockHeld(&h.lock) + pp := getg().m.p.ptr() if pp == nil { // We don't have a p so just do the normal thing. @@ -1050,7 +1081,7 @@ func (h *mheap) allocMSpanLocked() *mspan { // freeMSpanLocked free an mspan object. // -// h must be locked. +// h.lock must be held. // // freeMSpanLocked must be called on the system stack because // its caller holds the heap lock. See mheap for details. @@ -1059,6 +1090,8 @@ func (h *mheap) allocMSpanLocked() *mspan { // //go:systemstack func (h *mheap) freeMSpanLocked(s *mspan) { + assertLockHeld(&h.lock) + pp := getg().m.p.ptr() // First try to free the mspan directly to the cache. 
if pp != nil && pp.mspancache.len < len(pp.mspancache.buf) { @@ -1073,7 +1106,7 @@ func (h *mheap) freeMSpanLocked(s *mspan) { // allocSpan allocates an mspan which owns npages worth of memory. // -// If manual == false, allocSpan allocates a heap span of class spanclass +// If typ.manual() == false, allocSpan allocates a heap span of class spanclass // and updates heap accounting. If manual == true, allocSpan allocates a // manually-managed span (spanclass is ignored), and the caller is // responsible for any accounting related to its use of the span. Either @@ -1082,20 +1115,27 @@ func (h *mheap) freeMSpanLocked(s *mspan) { // // The returned span is fully initialized. // -// h must not be locked. +// h.lock must not be held. // // allocSpan must be called on the system stack both because it acquires // the heap lock and because it must block GC transitions. // //go:systemstack -func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysStat *uint64) (s *mspan) { +func (h *mheap) allocSpan(npages uintptr, typ spanAllocType, spanclass spanClass) (s *mspan) { // Function-global state. gp := getg() base, scav := uintptr(0), uintptr(0) + // On some platforms we need to provide physical page aligned stack + // allocations. Where the page size is less than the physical page + // size, we already manage to do this by default. + needPhysPageAlign := physPageAlignedStacks && typ == spanAllocStack && pageSize < physPageSize + // If the allocation is small enough, try the page cache! + // The page cache does not support aligned allocations, so we cannot use + // it if we need to provide a physical page aligned stack allocation. pp := gp.m.p.ptr() - if pp != nil && npages < pageCachePages/4 { + if !needPhysPageAlign && pp != nil && npages < pageCachePages/4 { c := &pp.pcache // If the cache is empty, refill it. 
@@ -1109,23 +1149,11 @@ func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysS base, scav = c.alloc(npages) if base != 0 { s = h.tryAllocMSpan() - - if s != nil && gcBlackenEnabled == 0 && (manual || spanclass.sizeclass() != 0) { + if s != nil { goto HaveSpan } - // We're either running duing GC, failed to acquire a mspan, - // or the allocation is for a large object. This means we - // have to lock the heap and do a bunch of extra work, - // so go down the HaveBaseLocked path. - // - // We must do this during GC to avoid skew with heap_scan - // since we flush mcache stats whenever we lock. - // - // TODO(mknyszek): It would be nice to not have to - // lock the heap if it's a large allocation, but - // it's fine for now. The critical section here is - // short and large object allocations are relatively - // infrequent. + // We have a base but no mspan, so we need + // to lock the heap. } } @@ -1133,6 +1161,11 @@ func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysS // whole job done without the heap lock. lock(&h.lock) + if needPhysPageAlign { + // Overallocate by a physical page to allow for later alignment. + npages += physPageSize / pageSize + } + if base == 0 { // Try to acquire a base address. base, scav = h.pages.alloc(npages) @@ -1152,39 +1185,23 @@ func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysS // one now that we have the heap lock. s = h.allocMSpanLocked() } - if !manual { - // This is a heap span, so we should do some additional accounting - // which may only be done with the heap locked. - // Transfer stats from mcache to global. - var c *mcache - if gp.m.p != 0 { - c = gp.m.p.ptr().mcache - } else { - // This case occurs while bootstrapping. - // See the similar code in mallocgc. 
- c = mcache0 - if c == nil { - throw("mheap.allocSpan called with no P") - } - } - memstats.heap_scan += uint64(c.local_scan) - c.local_scan = 0 - memstats.tinyallocs += uint64(c.local_tinyallocs) - c.local_tinyallocs = 0 + if needPhysPageAlign { + allocBase, allocPages := base, npages + base = alignUp(allocBase, physPageSize) + npages -= physPageSize / pageSize - // Do some additional accounting if it's a large allocation. - if spanclass.sizeclass() == 0 { - mheap_.largealloc += uint64(npages * pageSize) - mheap_.nlargealloc++ - atomic.Xadd64(&memstats.heap_live, int64(npages*pageSize)) + // Return memory around the aligned allocation. + spaceBefore := base - allocBase + if spaceBefore > 0 { + h.pages.free(allocBase, spaceBefore/pageSize) } - - // Either heap_live or heap_scan could have been updated. - if gcBlackenEnabled != 0 { - gcController.revise() + spaceAfter := (allocPages-npages)*pageSize - spaceBefore + if spaceAfter > 0 { + h.pages.free(base+npages*pageSize, spaceAfter/pageSize) } } + unlock(&h.lock) HaveSpan: @@ -1195,12 +1212,10 @@ HaveSpan: s.needzero = 1 } nbytes := npages * pageSize - if manual { + if typ.manual() { s.manualFreeList = 0 s.nelems = 0 s.limit = s.base() + s.npages*pageSize - // Manually managed memory doesn't count toward heap_sys. - mSysStatDec(&memstats.heap_sys, s.npages*pageSize) s.state.set(mSpanManual) } else { // We must set span properties before the span is published anywhere @@ -1254,11 +1269,31 @@ HaveSpan: // sysUsed all the pages that are actually available // in the span since some of them might be scavenged. sysUsed(unsafe.Pointer(base), nbytes) - mSysStatDec(&memstats.heap_released, scav) + atomic.Xadd64(&memstats.heap_released, -int64(scav)) } // Update stats. - mSysStatInc(sysStat, nbytes) - mSysStatDec(&memstats.heap_idle, nbytes) + if typ == spanAllocHeap { + atomic.Xadd64(&memstats.heap_inuse, int64(nbytes)) + } + if typ.manual() { + // Manually managed memory doesn't count toward heap_sys. 
+ memstats.heap_sys.add(-int64(nbytes)) + } + // Update consistent stats. + stats := memstats.heapStats.acquire() + atomic.Xaddint64(&stats.committed, int64(scav)) + atomic.Xaddint64(&stats.released, -int64(scav)) + switch typ { + case spanAllocHeap: + atomic.Xaddint64(&stats.inHeap, int64(nbytes)) + case spanAllocStack: + atomic.Xaddint64(&stats.inStacks, int64(nbytes)) + case spanAllocPtrScalarBits: + atomic.Xaddint64(&stats.inPtrScalarBits, int64(nbytes)) + case spanAllocWorkBuf: + atomic.Xaddint64(&stats.inWorkBufs, int64(nbytes)) + } + memstats.heapStats.release() // Publish the span in various locations. @@ -1269,7 +1304,7 @@ HaveSpan: // before that happens) or pageInUse is updated. h.setSpans(s.base(), npages, s) - if !manual { + if !typ.manual() { // Mark in-use span in arena page bitmap. // // This publishes the span to the page sweeper, so @@ -1280,11 +1315,6 @@ HaveSpan: // Update related page sweeper stats. atomic.Xadd64(&h.pagesInUse, int64(npages)) - - if trace.enabled { - // Trace that a heap alloc occurred. - traceHeapAlloc() - } } // Make sure the newly allocated span will be observed @@ -1297,8 +1327,10 @@ HaveSpan: // Try to add at least npage pages of memory to the heap, // returning whether it worked. // -// h must be locked. +// h.lock must be held. func (h *mheap) grow(npage uintptr) bool { + assertLockHeld(&h.lock) + // We must grow the heap in whole palloc chunks. ask := alignUp(npage, pallocChunkPages) * pageSize @@ -1340,8 +1372,10 @@ func (h *mheap) grow(npage uintptr) bool { // The allocation is always aligned to the heap arena // size which is always > physPageSize, so its safe to // just add directly to heap_released. - mSysStatInc(&memstats.heap_released, asize) - mSysStatInc(&memstats.heap_idle, asize) + atomic.Xadd64(&memstats.heap_released, int64(asize)) + stats := memstats.heapStats.acquire() + atomic.Xaddint64(&stats.released, int64(asize)) + memstats.heapStats.release() // Recalculate nBase. 
// We know this won't overflow, because sysAlloc returned @@ -1373,29 +1407,20 @@ func (h *mheap) grow(npage uintptr) bool { // Free the span back into the heap. func (h *mheap) freeSpan(s *mspan) { systemstack(func() { - c := getg().m.p.ptr().mcache lock(&h.lock) - memstats.heap_scan += uint64(c.local_scan) - c.local_scan = 0 - memstats.tinyallocs += uint64(c.local_tinyallocs) - c.local_tinyallocs = 0 if msanenabled { // Tell msan that this entire span is no longer in use. base := unsafe.Pointer(s.base()) bytes := s.npages << _PageShift msanfree(base, bytes) } - if gcBlackenEnabled != 0 { - // heap_scan changed. - gcController.revise() - } - h.freeSpanLocked(s, true, true) + h.freeSpanLocked(s, spanAllocHeap) unlock(&h.lock) }) } // freeManual frees a manually-managed span returned by allocManual. -// stat must be the same as the stat passed to the allocManual that +// typ must be the same as the spanAllocType passed to the allocManual that // allocated s. // // This must only be called when gcphase == _GCoff. See mSpanState for @@ -1405,16 +1430,16 @@ func (h *mheap) freeSpan(s *mspan) { // the heap lock. See mheap for details. // //go:systemstack -func (h *mheap) freeManual(s *mspan, stat *uint64) { +func (h *mheap) freeManual(s *mspan, typ spanAllocType) { s.needzero = 1 lock(&h.lock) - mSysStatDec(stat, s.npages*pageSize) - mSysStatInc(&memstats.heap_sys, s.npages*pageSize) - h.freeSpanLocked(s, false, true) + h.freeSpanLocked(s, typ) unlock(&h.lock) } -func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool) { +func (h *mheap) freeSpanLocked(s *mspan, typ spanAllocType) { + assertLockHeld(&h.lock) + switch s.state.get() { case mSpanManual: if s.allocCount != 0 { @@ -1434,12 +1459,30 @@ func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool) { throw("mheap.freeSpanLocked - invalid span state") } - if acctinuse { - mSysStatDec(&memstats.heap_inuse, s.npages*pageSize) + // Update stats. + // + // Mirrors the code in allocSpan. 
+ nbytes := s.npages * pageSize + if typ == spanAllocHeap { + atomic.Xadd64(&memstats.heap_inuse, -int64(nbytes)) + } + if typ.manual() { + // Manually managed memory doesn't count toward heap_sys, so add it back. + memstats.heap_sys.add(int64(nbytes)) } - if acctidle { - mSysStatInc(&memstats.heap_idle, s.npages*pageSize) + // Update consistent stats. + stats := memstats.heapStats.acquire() + switch typ { + case spanAllocHeap: + atomic.Xaddint64(&stats.inHeap, -int64(nbytes)) + case spanAllocStack: + atomic.Xaddint64(&stats.inStacks, -int64(nbytes)) + case spanAllocPtrScalarBits: + atomic.Xaddint64(&stats.inPtrScalarBits, -int64(nbytes)) + case spanAllocWorkBuf: + atomic.Xaddint64(&stats.inWorkBufs, -int64(nbytes)) } + memstats.heapStats.release() // Mark the space as free. h.pages.free(s.base(), s.npages) @@ -1982,7 +2025,7 @@ func newArenaMayUnlock() *gcBitsArena { var result *gcBitsArena if gcBitsArenas.free == nil { unlock(&gcBitsArenas.lock) - result = (*gcBitsArena)(sysAlloc(gcBitsChunkBytes, &memstats.gc_sys)) + result = (*gcBitsArena)(sysAlloc(gcBitsChunkBytes, &memstats.gcMiscSys)) if result == nil { throw("runtime: cannot allocate memory") } diff --git a/src/runtime/mkduff.go b/src/runtime/mkduff.go index 8859ed68cc..94ae75fbfe 100644 --- a/src/runtime/mkduff.go +++ b/src/runtime/mkduff.go @@ -27,8 +27,8 @@ import ( "bytes" "fmt" "io" - "io/ioutil" "log" + "os" ) func main() { @@ -38,6 +38,7 @@ func main() { gen("arm64", notags, zeroARM64, copyARM64) gen("ppc64x", tagsPPC64x, zeroPPC64x, copyPPC64x) gen("mips64x", tagsMIPS64x, zeroMIPS64x, copyMIPS64x) + gen("riscv64", notags, zeroRISCV64, copyRISCV64) } func gen(arch string, tags, zero, copy func(io.Writer)) { @@ -53,7 +54,7 @@ func gen(arch string, tags, zero, copy func(io.Writer)) { fmt.Fprintln(&buf) copy(&buf) - if err := ioutil.WriteFile("duff_"+arch+".s", buf.Bytes(), 0644); err != nil { + if err := os.WriteFile("duff_"+arch+".s", buf.Bytes(), 0644); err != nil { log.Fatalln(err) } } @@ -227,3 
+228,30 @@ func copyMIPS64x(w io.Writer) { } fmt.Fprintln(w, "\tRET") } + +func zeroRISCV64(w io.Writer) { + // ZERO: always zero + // X10: ptr to memory to be zeroed + // X10 is updated as a side effect. + fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT|NOFRAME, $0-0") + for i := 0; i < 128; i++ { + fmt.Fprintln(w, "\tMOV\tZERO, (X10)") + fmt.Fprintln(w, "\tADD\t$8, X10") + } + fmt.Fprintln(w, "\tRET") +} + +func copyRISCV64(w io.Writer) { + // X10: ptr to source memory + // X11: ptr to destination memory + // X10 and X11 are updated as a side effect + fmt.Fprintln(w, "TEXT runtime·duffcopy(SB), NOSPLIT|NOFRAME, $0-0") + for i := 0; i < 128; i++ { + fmt.Fprintln(w, "\tMOV\t(X10), X31") + fmt.Fprintln(w, "\tADD\t$8, X10") + fmt.Fprintln(w, "\tMOV\tX31, (X11)") + fmt.Fprintln(w, "\tADD\t$8, X11") + fmt.Fprintln(w) + } + fmt.Fprintln(w, "\tRET") +} diff --git a/src/runtime/mkfastlog2table.go b/src/runtime/mkfastlog2table.go index 305c84a7c1..d650292394 100644 --- a/src/runtime/mkfastlog2table.go +++ b/src/runtime/mkfastlog2table.go @@ -12,9 +12,9 @@ package main import ( "bytes" "fmt" - "io/ioutil" "log" "math" + "os" ) func main() { @@ -36,7 +36,7 @@ func main() { } fmt.Fprintln(&buf, "}") - if err := ioutil.WriteFile("fastlog2table.go", buf.Bytes(), 0644); err != nil { + if err := os.WriteFile("fastlog2table.go", buf.Bytes(), 0644); err != nil { log.Fatalln(err) } } diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go index 44dea22ef3..1d614dd003 100644 --- a/src/runtime/mkpreempt.go +++ b/src/runtime/mkpreempt.go @@ -126,7 +126,8 @@ func header(arch string) { } fmt.Fprintf(out, "#include \"go_asm.h\"\n") fmt.Fprintf(out, "#include \"textflag.h\"\n\n") - fmt.Fprintf(out, "TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0\n") + fmt.Fprintf(out, "// Note: asyncPreempt doesn't use the internal ABI, but we must be able to inject calls to it from the signal handler, so Go code has to see the PC of this function literally.\n") + fmt.Fprintf(out, "TEXT 
·asyncPreempt<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-0\n") } func p(f string, args ...interface{}) { @@ -189,7 +190,6 @@ func (l *layout) restore() { func gen386() { p("PUSHFL") - // Save general purpose registers. var l = layout{sp: "SP"} for _, reg := range regNames386 { @@ -199,12 +199,6 @@ func gen386() { l.add("MOVL", reg, 4) } - // Save the 387 state. - l.addSpecial( - "FSAVE %d(SP)\nFLDCW runtime·controlWord64(SB)", - "FRSTOR %d(SP)", - 108) - // Save SSE state only if supported. lSSE := layout{stack: l.stack, sp: "SP"} for i := 0; i < 8; i++ { @@ -355,10 +349,10 @@ func genARM64() { p("MOVD R29, -8(RSP)") // save frame pointer (only used on Linux) p("SUB $8, RSP, R29") // set up new frame pointer p("#endif") - // On darwin, save the LR again after decrementing SP. We run the - // signal handler on the G stack (as it doesn't support SA_ONSTACK), + // On iOS, save the LR again after decrementing SP. We run the + // signal handler on the G stack (as it doesn't support sigaltstack), // so any writes below SP may be clobbered. - p("#ifdef GOOS_darwin") + p("#ifdef GOOS_ios") p("MOVD R30, (RSP)") p("#endif") @@ -502,12 +496,12 @@ func genPPC64() { } func genRISCV64() { - // X0 (zero), X1 (LR), X2 (SP), X4 (g), X31 (TMP) are special. + // X0 (zero), X1 (LR), X2 (SP), X4 (TP), X27 (g), X31 (TMP) are special. var l = layout{sp: "X2", stack: 8} - // Add integer registers (X3, X5-X30). + // Add integer registers (X3, X5-X26, X28-30). 
for i := 3; i < 31; i++ { - if i == 4 { + if i == 4 || i == 27 { continue } reg := fmt.Sprintf("X%d", i) diff --git a/src/runtime/mksizeclasses.go b/src/runtime/mksizeclasses.go index cacbb64207..b92d1fed5f 100644 --- a/src/runtime/mksizeclasses.go +++ b/src/runtime/mksizeclasses.go @@ -35,7 +35,6 @@ import ( "fmt" "go/format" "io" - "io/ioutil" "log" "os" ) @@ -65,7 +64,7 @@ func main() { if *stdout { _, err = os.Stdout.Write(out) } else { - err = ioutil.WriteFile("sizeclasses.go", out, 0666) + err = os.WriteFile("sizeclasses.go", out, 0666) } if err != nil { log.Fatal(err) @@ -110,8 +109,8 @@ func makeClasses() []class { align = 256 } else if size >= 128 { align = size / 8 - } else if size >= 16 { - align = 16 // required for x86 SSE instructions, if we want to use them + } else if size >= 32 { + align = 16 // heap bitmaps assume 16 byte alignment for allocations >= 32 bytes. } } if !powerOfTwo(align) { @@ -157,7 +156,7 @@ func makeClasses() []class { } } - if len(classes) != 67 { + if len(classes) != 68 { panic("number of size classes has changed") } diff --git a/src/runtime/mpagealloc.go b/src/runtime/mpagealloc.go index 8b3c62c375..dac1f39969 100644 --- a/src/runtime/mpagealloc.go +++ b/src/runtime/mpagealloc.go @@ -293,13 +293,13 @@ type pageAlloc struct { // sysStat is the runtime memstat to update when new system // memory is committed by the pageAlloc for allocation metadata. - sysStat *uint64 + sysStat *sysMemStat // Whether or not this struct is being used in tests. 
test bool } -func (s *pageAlloc) init(mheapLock *mutex, sysStat *uint64) { +func (p *pageAlloc) init(mheapLock *mutex, sysStat *sysMemStat) { if levelLogPages[0] > logMaxPackedValue { // We can't represent 1<<levelLogPages[0] pages, the maximum number // of pages we need to represent at the root level, in a summary, which @@ -308,34 +308,49 @@ func (s *pageAlloc) init(mheapLock *mutex, sysStat *uint64) { print("runtime: summary max pages = ", maxPackedValue, "\n") throw("root level max pages doesn't fit in summary") } - s.sysStat = sysStat + p.sysStat = sysStat - // Initialize s.inUse. - s.inUse.init(sysStat) + // Initialize p.inUse. + p.inUse.init(sysStat) // System-dependent initialization. - s.sysInit() + p.sysInit() // Start with the searchAddr in a state indicating there's no free memory. - s.searchAddr = maxSearchAddr + p.searchAddr = maxSearchAddr // Set the mheapLock. - s.mheapLock = mheapLock + p.mheapLock = mheapLock // Initialize scavenge tracking state. - s.scav.scavLWM = maxSearchAddr + p.scav.scavLWM = maxSearchAddr +} + +// tryChunkOf returns the bitmap data for the given chunk. +// +// Returns nil if the chunk data has not been mapped. +func (p *pageAlloc) tryChunkOf(ci chunkIdx) *pallocData { + l2 := p.chunks[ci.l1()] + if l2 == nil { + return nil + } + return &l2[ci.l2()] } // chunkOf returns the chunk at the given chunk index. -func (s *pageAlloc) chunkOf(ci chunkIdx) *pallocData { - return &s.chunks[ci.l1()][ci.l2()] +// +// The chunk index must be valid or this method may throw. +func (p *pageAlloc) chunkOf(ci chunkIdx) *pallocData { + return &p.chunks[ci.l1()][ci.l2()] } // grow sets up the metadata for the address range [base, base+size). -// It may allocate metadata, in which case *s.sysStat will be updated. +// It may allocate metadata, in which case *p.sysStat will be updated. // -// s.mheapLock must be held. -func (s *pageAlloc) grow(base, size uintptr) { +// p.mheapLock must be held. 
+func (p *pageAlloc) grow(base, size uintptr) { + assertLockHeld(p.mheapLock) + // Round up to chunks, since we can't deal with increments smaller // than chunks. Also, sysGrow expects aligned values. limit := alignUp(base+size, pallocChunkBytes) @@ -343,29 +358,29 @@ func (s *pageAlloc) grow(base, size uintptr) { // Grow the summary levels in a system-dependent manner. // We just update a bunch of additional metadata here. - s.sysGrow(base, limit) + p.sysGrow(base, limit) - // Update s.start and s.end. + // Update p.start and p.end. // If no growth happened yet, start == 0. This is generally // safe since the zero page is unmapped. - firstGrowth := s.start == 0 + firstGrowth := p.start == 0 start, end := chunkIndex(base), chunkIndex(limit) - if firstGrowth || start < s.start { - s.start = start + if firstGrowth || start < p.start { + p.start = start } - if end > s.end { - s.end = end + if end > p.end { + p.end = end } // Note that [base, limit) will never overlap with any existing // range inUse because grow only ever adds never-used memory // regions to the page allocator. - s.inUse.add(makeAddrRange(base, limit)) + p.inUse.add(makeAddrRange(base, limit)) // A grow operation is a lot like a free operation, so if our - // chunk ends up below s.searchAddr, update s.searchAddr to the + // chunk ends up below p.searchAddr, update p.searchAddr to the // new address, just like in free. - if b := (offAddr{base}); b.lessThan(s.searchAddr) { - s.searchAddr = b + if b := (offAddr{base}); b.lessThan(p.searchAddr) { + p.searchAddr = b } // Add entries into chunks, which is sparse, if needed. Then, @@ -374,21 +389,21 @@ func (s *pageAlloc) grow(base, size uintptr) { // Newly-grown memory is always considered scavenged. // Set all the bits in the scavenged bitmaps high. for c := chunkIndex(base); c < chunkIndex(limit); c++ { - if s.chunks[c.l1()] == nil { + if p.chunks[c.l1()] == nil { // Create the necessary l2 entry. 
// // Store it atomically to avoid races with readers which // don't acquire the heap lock. - r := sysAlloc(unsafe.Sizeof(*s.chunks[0]), s.sysStat) - atomic.StorepNoWB(unsafe.Pointer(&s.chunks[c.l1()]), r) + r := sysAlloc(unsafe.Sizeof(*p.chunks[0]), p.sysStat) + atomic.StorepNoWB(unsafe.Pointer(&p.chunks[c.l1()]), r) } - s.chunkOf(c).scavenged.setRange(0, pallocChunkPages) + p.chunkOf(c).scavenged.setRange(0, pallocChunkPages) } // Update summaries accordingly. The grow acts like a free, so // we need to ensure this newly-free memory is visible in the // summaries. - s.update(base, size/pageSize, true, false) + p.update(base, size/pageSize, true, false) } // update updates heap metadata. It must be called each time the bitmap @@ -398,8 +413,10 @@ func (s *pageAlloc) grow(base, size uintptr) { // a contiguous allocation or free between addr and addr+npages. alloc indicates // whether the operation performed was an allocation or a free. // -// s.mheapLock must be held. -func (s *pageAlloc) update(base, npages uintptr, contig, alloc bool) { +// p.mheapLock must be held. +func (p *pageAlloc) update(base, npages uintptr, contig, alloc bool) { + assertLockHeld(p.mheapLock) + // base, limit, start, and end are inclusive. limit := base + npages*pageSize - 1 sc, ec := chunkIndex(base), chunkIndex(limit) @@ -408,23 +425,23 @@ func (s *pageAlloc) update(base, npages uintptr, contig, alloc bool) { if sc == ec { // Fast path: the allocation doesn't span more than one chunk, // so update this one and if the summary didn't change, return. - x := s.summary[len(s.summary)-1][sc] - y := s.chunkOf(sc).summarize() + x := p.summary[len(p.summary)-1][sc] + y := p.chunkOf(sc).summarize() if x == y { return } - s.summary[len(s.summary)-1][sc] = y + p.summary[len(p.summary)-1][sc] = y } else if contig { // Slow contiguous path: the allocation spans more than one chunk // and at least one summary is guaranteed to change. 
- summary := s.summary[len(s.summary)-1] + summary := p.summary[len(p.summary)-1] // Update the summary for chunk sc. - summary[sc] = s.chunkOf(sc).summarize() + summary[sc] = p.chunkOf(sc).summarize() // Update the summaries for chunks in between, which are // either totally allocated or freed. - whole := s.summary[len(s.summary)-1][sc+1 : ec] + whole := p.summary[len(p.summary)-1][sc+1 : ec] if alloc { // Should optimize into a memclr. for i := range whole { @@ -437,22 +454,22 @@ func (s *pageAlloc) update(base, npages uintptr, contig, alloc bool) { } // Update the summary for chunk ec. - summary[ec] = s.chunkOf(ec).summarize() + summary[ec] = p.chunkOf(ec).summarize() } else { // Slow general path: the allocation spans more than one chunk // and at least one summary is guaranteed to change. // // We can't assume a contiguous allocation happened, so walk over // every chunk in the range and manually recompute the summary. - summary := s.summary[len(s.summary)-1] + summary := p.summary[len(p.summary)-1] for c := sc; c <= ec; c++ { - summary[c] = s.chunkOf(c).summarize() + summary[c] = p.chunkOf(c).summarize() } } // Walk up the radix tree and update the summaries appropriately. changed := true - for l := len(s.summary) - 2; l >= 0 && changed; l-- { + for l := len(p.summary) - 2; l >= 0 && changed; l-- { // Update summaries at level l from summaries at level l+1. changed = false @@ -466,12 +483,12 @@ func (s *pageAlloc) update(base, npages uintptr, contig, alloc bool) { // Iterate over each block, updating the corresponding summary in the less-granular level. 
for i := lo; i < hi; i++ { - children := s.summary[l+1][i<<logEntriesPerBlock : (i+1)<<logEntriesPerBlock] + children := p.summary[l+1][i<<logEntriesPerBlock : (i+1)<<logEntriesPerBlock] sum := mergeSummaries(children, logMaxPages) - old := s.summary[l][i] + old := p.summary[l][i] if old != sum { changed = true - s.summary[l][i] = sum + p.summary[l][i] = sum } } } @@ -484,8 +501,10 @@ func (s *pageAlloc) update(base, npages uintptr, contig, alloc bool) { // Returns the amount of scavenged memory in bytes present in the // allocated range. // -// s.mheapLock must be held. -func (s *pageAlloc) allocRange(base, npages uintptr) uintptr { +// p.mheapLock must be held. +func (p *pageAlloc) allocRange(base, npages uintptr) uintptr { + assertLockHeld(p.mheapLock) + limit := base + npages*pageSize - 1 sc, ec := chunkIndex(base), chunkIndex(limit) si, ei := chunkPageIndex(base), chunkPageIndex(limit) @@ -493,24 +512,24 @@ func (s *pageAlloc) allocRange(base, npages uintptr) uintptr { scav := uint(0) if sc == ec { // The range doesn't cross any chunk boundaries. - chunk := s.chunkOf(sc) + chunk := p.chunkOf(sc) scav += chunk.scavenged.popcntRange(si, ei+1-si) chunk.allocRange(si, ei+1-si) } else { // The range crosses at least one chunk boundary. - chunk := s.chunkOf(sc) + chunk := p.chunkOf(sc) scav += chunk.scavenged.popcntRange(si, pallocChunkPages-si) chunk.allocRange(si, pallocChunkPages-si) for c := sc + 1; c < ec; c++ { - chunk := s.chunkOf(c) + chunk := p.chunkOf(c) scav += chunk.scavenged.popcntRange(0, pallocChunkPages) chunk.allocAll() } - chunk = s.chunkOf(ec) + chunk = p.chunkOf(ec) scav += chunk.scavenged.popcntRange(0, ei+1) chunk.allocRange(0, ei+1) } - s.update(base, npages, true, true) + p.update(base, npages, true, true) return uintptr(scav) * pageSize } @@ -519,13 +538,15 @@ func (s *pageAlloc) allocRange(base, npages uintptr) uintptr { // returned. If addr is higher than any mapped region, then // it returns maxOffAddr. // -// s.mheapLock must be held. 
-func (s *pageAlloc) findMappedAddr(addr offAddr) offAddr { +// p.mheapLock must be held. +func (p *pageAlloc) findMappedAddr(addr offAddr) offAddr { + assertLockHeld(p.mheapLock) + // If we're not in a test, validate first by checking mheap_.arenas. // This is a fast path which is only safe to use outside of testing. ai := arenaIndex(addr.addr()) - if s.test || mheap_.arenas[ai.l1()] == nil || mheap_.arenas[ai.l1()][ai.l2()] == nil { - vAddr, ok := s.inUse.findAddrGreaterEqual(addr.addr()) + if p.test || mheap_.arenas[ai.l1()] == nil || mheap_.arenas[ai.l1()][ai.l2()] == nil { + vAddr, ok := p.inUse.findAddrGreaterEqual(addr.addr()) if ok { return offAddr{vAddr} } else { @@ -541,20 +562,22 @@ func (s *pageAlloc) findMappedAddr(addr offAddr) offAddr { // find searches for the first (address-ordered) contiguous free region of // npages in size and returns a base address for that region. // -// It uses s.searchAddr to prune its search and assumes that no palloc chunks -// below chunkIndex(s.searchAddr) contain any free memory at all. +// It uses p.searchAddr to prune its search and assumes that no palloc chunks +// below chunkIndex(p.searchAddr) contain any free memory at all. // -// find also computes and returns a candidate s.searchAddr, which may or -// may not prune more of the address space than s.searchAddr already does. -// This candidate is always a valid s.searchAddr. +// find also computes and returns a candidate p.searchAddr, which may or +// may not prune more of the address space than p.searchAddr already does. +// This candidate is always a valid p.searchAddr. // // find represents the slow path and the full radix tree search. // // Returns a base address of 0 on failure, in which case the candidate // searchAddr returned is invalid and must be ignored. // -// s.mheapLock must be held. -func (s *pageAlloc) find(npages uintptr) (uintptr, offAddr) { +// p.mheapLock must be held. 
+func (p *pageAlloc) find(npages uintptr) (uintptr, offAddr) { + assertLockHeld(p.mheapLock) + // Search algorithm. // // This algorithm walks each level l of the radix tree from the root level @@ -634,7 +657,7 @@ func (s *pageAlloc) find(npages uintptr) (uintptr, offAddr) { lastSumIdx := -1 nextLevel: - for l := 0; l < len(s.summary); l++ { + for l := 0; l < len(p.summary); l++ { // For the root level, entriesPerBlock is the whole level. entriesPerBlock := 1 << levelBits[l] logMaxPages := levelLogPages[l] @@ -644,14 +667,14 @@ nextLevel: i <<= levelBits[l] // Slice out the block of entries we care about. - entries := s.summary[l][i : i+entriesPerBlock] + entries := p.summary[l][i : i+entriesPerBlock] // Determine j0, the first index we should start iterating from. // The searchAddr may help us eliminate iterations if we followed the // searchAddr on the previous level or we're on the root leve, in which // case the searchAddr should be the same as i after levelShift. j0 := 0 - if searchIdx := offAddrToLevelIndex(l, s.searchAddr); searchIdx&^(entriesPerBlock-1) == i { + if searchIdx := offAddrToLevelIndex(l, p.searchAddr); searchIdx&^(entriesPerBlock-1) == i { j0 = searchIdx & (entriesPerBlock - 1) } @@ -716,7 +739,7 @@ nextLevel: // We found a sufficiently large run of free pages straddling // some boundary, so compute the address and return it. addr := levelIndexToOffAddr(l, i).add(uintptr(base) * pageSize).addr() - return addr, s.findMappedAddr(firstFree.base) + return addr, p.findMappedAddr(firstFree.base) } if l == 0 { // We're at level zero, so that means we've exhausted our search. @@ -728,7 +751,7 @@ nextLevel: // lied to us. In either case, dump some useful state and throw. 
print("runtime: summary[", l-1, "][", lastSumIdx, "] = ", lastSum.start(), ", ", lastSum.max(), ", ", lastSum.end(), "\n") print("runtime: level = ", l, ", npages = ", npages, ", j0 = ", j0, "\n") - print("runtime: s.searchAddr = ", hex(s.searchAddr.addr()), ", i = ", i, "\n") + print("runtime: p.searchAddr = ", hex(p.searchAddr.addr()), ", i = ", i, "\n") print("runtime: levelShift[level] = ", levelShift[l], ", levelBits[level] = ", levelBits[l], "\n") for j := 0; j < len(entries); j++ { sum := entries[j] @@ -745,12 +768,12 @@ nextLevel: // After iterating over all levels, i must contain a chunk index which // is what the final level represents. ci := chunkIdx(i) - j, searchIdx := s.chunkOf(ci).find(npages, 0) + j, searchIdx := p.chunkOf(ci).find(npages, 0) if j == ^uint(0) { // We couldn't find any space in this chunk despite the summaries telling // us it should be there. There's likely a bug, so dump some state and throw. - sum := s.summary[len(s.summary)-1][i] - print("runtime: summary[", len(s.summary)-1, "][", i, "] = (", sum.start(), ", ", sum.max(), ", ", sum.end(), ")\n") + sum := p.summary[len(p.summary)-1][i] + print("runtime: summary[", len(p.summary)-1, "][", i, "] = (", sum.start(), ", ", sum.max(), ", ", sum.end(), ")\n") print("runtime: npages = ", npages, "\n") throw("bad summary data") } @@ -762,7 +785,7 @@ nextLevel: // found an even narrower free window. searchAddr := chunkBase(ci) + uintptr(searchIdx)*pageSize foundFree(offAddr{searchAddr}, chunkBase(ci+1)-searchAddr) - return addr, s.findMappedAddr(firstFree.base) + return addr, p.findMappedAddr(firstFree.base) } // alloc allocates npages worth of memory from the page heap, returning the base @@ -772,25 +795,31 @@ nextLevel: // Returns a 0 base address on failure, in which case other returned values // should be ignored. // -// s.mheapLock must be held. -func (s *pageAlloc) alloc(npages uintptr) (addr uintptr, scav uintptr) { +// p.mheapLock must be held. 
+// +// Must run on the system stack because p.mheapLock must be held. +// +//go:systemstack +func (p *pageAlloc) alloc(npages uintptr) (addr uintptr, scav uintptr) { + assertLockHeld(p.mheapLock) + // If the searchAddr refers to a region which has a higher address than // any known chunk, then we know we're out of memory. - if chunkIndex(s.searchAddr.addr()) >= s.end { + if chunkIndex(p.searchAddr.addr()) >= p.end { return 0, 0 } // If npages has a chance of fitting in the chunk where the searchAddr is, // search it directly. searchAddr := minOffAddr - if pallocChunkPages-chunkPageIndex(s.searchAddr.addr()) >= uint(npages) { + if pallocChunkPages-chunkPageIndex(p.searchAddr.addr()) >= uint(npages) { // npages is guaranteed to be no greater than pallocChunkPages here. - i := chunkIndex(s.searchAddr.addr()) - if max := s.summary[len(s.summary)-1][i].max(); max >= uint(npages) { - j, searchIdx := s.chunkOf(i).find(npages, chunkPageIndex(s.searchAddr.addr())) + i := chunkIndex(p.searchAddr.addr()) + if max := p.summary[len(p.summary)-1][i].max(); max >= uint(npages) { + j, searchIdx := p.chunkOf(i).find(npages, chunkPageIndex(p.searchAddr.addr())) if j == ^uint(0) { print("runtime: max = ", max, ", npages = ", npages, "\n") - print("runtime: searchIdx = ", chunkPageIndex(s.searchAddr.addr()), ", s.searchAddr = ", hex(s.searchAddr.addr()), "\n") + print("runtime: searchIdx = ", chunkPageIndex(p.searchAddr.addr()), ", p.searchAddr = ", hex(p.searchAddr.addr()), "\n") throw("bad summary data") } addr = chunkBase(i) + uintptr(j)*pageSize @@ -800,7 +829,7 @@ func (s *pageAlloc) alloc(npages uintptr) (addr uintptr, scav uintptr) { } // We failed to use a searchAddr for one reason or another, so try // the slow path. 
- addr, searchAddr = s.find(npages) + addr, searchAddr = p.find(npages) if addr == 0 { if npages == 1 { // We failed to find a single free page, the smallest unit @@ -808,41 +837,47 @@ func (s *pageAlloc) alloc(npages uintptr) (addr uintptr, scav uintptr) { // exhausted. Otherwise, the heap still might have free // space in it, just not enough contiguous space to // accommodate npages. - s.searchAddr = maxSearchAddr + p.searchAddr = maxSearchAddr } return 0, 0 } Found: // Go ahead and actually mark the bits now that we have an address. - scav = s.allocRange(addr, npages) + scav = p.allocRange(addr, npages) // If we found a higher searchAddr, we know that all the // heap memory before that searchAddr in an offset address space is - // allocated, so bump s.searchAddr up to the new one. - if s.searchAddr.lessThan(searchAddr) { - s.searchAddr = searchAddr + // allocated, so bump p.searchAddr up to the new one. + if p.searchAddr.lessThan(searchAddr) { + p.searchAddr = searchAddr } return addr, scav } // free returns npages worth of memory starting at base back to the page heap. // -// s.mheapLock must be held. -func (s *pageAlloc) free(base, npages uintptr) { - // If we're freeing pages below the s.searchAddr, update searchAddr. - if b := (offAddr{base}); b.lessThan(s.searchAddr) { - s.searchAddr = b +// p.mheapLock must be held. +// +// Must run on the system stack because p.mheapLock must be held. +// +//go:systemstack +func (p *pageAlloc) free(base, npages uintptr) { + assertLockHeld(p.mheapLock) + + // If we're freeing pages below the p.searchAddr, update searchAddr. + if b := (offAddr{base}); b.lessThan(p.searchAddr) { + p.searchAddr = b } // Update the free high watermark for the scavenger. 
limit := base + npages*pageSize - 1 - if offLimit := (offAddr{limit}); s.scav.freeHWM.lessThan(offLimit) { - s.scav.freeHWM = offLimit + if offLimit := (offAddr{limit}); p.scav.freeHWM.lessThan(offLimit) { + p.scav.freeHWM = offLimit } if npages == 1 { // Fast path: we're clearing a single bit, and we know exactly // where it is, so mark it directly. i := chunkIndex(base) - s.chunkOf(i).free1(chunkPageIndex(base)) + p.chunkOf(i).free1(chunkPageIndex(base)) } else { // Slow path: we're clearing more bits so we may need to iterate. sc, ec := chunkIndex(base), chunkIndex(limit) @@ -850,17 +885,17 @@ func (s *pageAlloc) free(base, npages uintptr) { if sc == ec { // The range doesn't cross any chunk boundaries. - s.chunkOf(sc).free(si, ei+1-si) + p.chunkOf(sc).free(si, ei+1-si) } else { // The range crosses at least one chunk boundary. - s.chunkOf(sc).free(si, pallocChunkPages-si) + p.chunkOf(sc).free(si, pallocChunkPages-si) for c := sc + 1; c < ec; c++ { - s.chunkOf(c).freeAll() + p.chunkOf(c).freeAll() } - s.chunkOf(ec).free(0, ei+1) + p.chunkOf(ec).free(0, ei+1) } } - s.update(base, npages, true, false) + p.update(base, npages, true, false) } const ( diff --git a/src/runtime/mpagealloc_32bit.go b/src/runtime/mpagealloc_32bit.go index 6658a900ac..331dadade9 100644 --- a/src/runtime/mpagealloc_32bit.go +++ b/src/runtime/mpagealloc_32bit.go @@ -2,14 +2,14 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build 386 arm mips mipsle wasm darwin,arm64 +// +build 386 arm mips mipsle wasm ios,arm64 // wasm is a treated as a 32-bit architecture for the purposes of the page // allocator, even though it has 64-bit pointers. This is because any wasm // pointer always has its top 32 bits as zero, so the effective heap address // space is only 2^32 bytes in size (see heapAddrBits). 
-// darwin/arm64 is treated as a 32-bit architecture for the purposes of the +// ios/arm64 is treated as a 32-bit architecture for the purposes of the // page allocator, even though it has 64-bit pointers and a 33-bit address // space (see heapAddrBits). The 33 bit address space cannot be rounded up // to 64 bits because there are too many summary levels to fit in just 33 @@ -60,7 +60,7 @@ var levelLogPages = [summaryLevels]uint{ } // See mpagealloc_64bit.go for details. -func (s *pageAlloc) sysInit() { +func (p *pageAlloc) sysInit() { // Calculate how much memory all our entries will take up. // // This should be around 12 KiB or less. @@ -76,7 +76,7 @@ func (s *pageAlloc) sysInit() { throw("failed to reserve page summary memory") } // There isn't much. Just map it and mark it as used immediately. - sysMap(reservation, totalSize, s.sysStat) + sysMap(reservation, totalSize, p.sysStat) sysUsed(reservation, totalSize) // Iterate over the reservation and cut it up into slices. @@ -88,29 +88,29 @@ func (s *pageAlloc) sysInit() { // Put this reservation into a slice. sl := notInHeapSlice{(*notInHeap)(reservation), 0, entries} - s.summary[l] = *(*[]pallocSum)(unsafe.Pointer(&sl)) + p.summary[l] = *(*[]pallocSum)(unsafe.Pointer(&sl)) reservation = add(reservation, uintptr(entries)*pallocSumBytes) } } // See mpagealloc_64bit.go for details. -func (s *pageAlloc) sysGrow(base, limit uintptr) { +func (p *pageAlloc) sysGrow(base, limit uintptr) { if base%pallocChunkBytes != 0 || limit%pallocChunkBytes != 0 { print("runtime: base = ", hex(base), ", limit = ", hex(limit), "\n") throw("sysGrow bounds not aligned to pallocChunkBytes") } // Walk up the tree and update the summary slices. - for l := len(s.summary) - 1; l >= 0; l-- { + for l := len(p.summary) - 1; l >= 0; l-- { // Figure out what part of the summary array this new address space needs. 
// Note that we need to align the ranges to the block width (1<<levelBits[l]) // at this level because the full block is needed to compute the summary for // the next level. lo, hi := addrsToSummaryRange(l, base, limit) _, hi = blockAlignSummaryRange(l, lo, hi) - if hi > len(s.summary[l]) { - s.summary[l] = s.summary[l][:hi] + if hi > len(p.summary[l]) { + p.summary[l] = p.summary[l][:hi] } } } diff --git a/src/runtime/mpagealloc_64bit.go b/src/runtime/mpagealloc_64bit.go index 831626e4b2..ffacb46c18 100644 --- a/src/runtime/mpagealloc_64bit.go +++ b/src/runtime/mpagealloc_64bit.go @@ -2,9 +2,9 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build amd64 !darwin,arm64 mips64 mips64le ppc64 ppc64le riscv64 s390x +// +build amd64 !ios,arm64 mips64 mips64le ppc64 ppc64le riscv64 s390x -// See mpagealloc_32bit.go for why darwin/arm64 is excluded here. +// See mpagealloc_32bit.go for why ios/arm64 is excluded here. package runtime @@ -67,7 +67,7 @@ var levelLogPages = [summaryLevels]uint{ // sysInit performs architecture-dependent initialization of fields // in pageAlloc. pageAlloc should be uninitialized except for sysStat // if any runtime statistic should be updated. -func (s *pageAlloc) sysInit() { +func (p *pageAlloc) sysInit() { // Reserve memory for each level. This will get mapped in // as R/W by setArenas. for l, shift := range levelShift { @@ -82,21 +82,21 @@ func (s *pageAlloc) sysInit() { // Put this reservation into a slice. sl := notInHeapSlice{(*notInHeap)(r), 0, entries} - s.summary[l] = *(*[]pallocSum)(unsafe.Pointer(&sl)) + p.summary[l] = *(*[]pallocSum)(unsafe.Pointer(&sl)) } } // sysGrow performs architecture-dependent operations on heap // growth for the page allocator, such as mapping in new memory // for summaries. It also updates the length of the slices in -// s.summary. +// [.summary. 
// // base is the base of the newly-added heap memory and limit is // the first address past the end of the newly-added heap memory. // Both must be aligned to pallocChunkBytes. // -// The caller must update s.start and s.end after calling sysGrow. -func (s *pageAlloc) sysGrow(base, limit uintptr) { +// The caller must update p.start and p.end after calling sysGrow. +func (p *pageAlloc) sysGrow(base, limit uintptr) { if base%pallocChunkBytes != 0 || limit%pallocChunkBytes != 0 { print("runtime: base = ", hex(base), ", limit = ", hex(limit), "\n") throw("sysGrow bounds not aligned to pallocChunkBytes") @@ -111,12 +111,12 @@ func (s *pageAlloc) sysGrow(base, limit uintptr) { } // summaryRangeToSumAddrRange converts a range of indices in any - // level of s.summary into page-aligned addresses which cover that + // level of p.summary into page-aligned addresses which cover that // range of indices. summaryRangeToSumAddrRange := func(level, sumIdxBase, sumIdxLimit int) addrRange { baseOffset := alignDown(uintptr(sumIdxBase)*pallocSumBytes, physPageSize) limitOffset := alignUp(uintptr(sumIdxLimit)*pallocSumBytes, physPageSize) - base := unsafe.Pointer(&s.summary[level][0]) + base := unsafe.Pointer(&p.summary[level][0]) return addrRange{ offAddr{uintptr(add(base, baseOffset))}, offAddr{uintptr(add(base, limitOffset))}, @@ -140,10 +140,10 @@ func (s *pageAlloc) sysGrow(base, limit uintptr) { // // This will be used to look at what memory in the summary array is already // mapped before and after this new range. - inUseIndex := s.inUse.findSucc(base) + inUseIndex := p.inUse.findSucc(base) // Walk up the radix tree and map summaries in as needed. - for l := range s.summary { + for l := range p.summary { // Figure out what part of the summary array this new address space needs. 
needIdxBase, needIdxLimit := addrRangeToSummaryRange(l, makeAddrRange(base, limit)) @@ -151,8 +151,8 @@ func (s *pageAlloc) sysGrow(base, limit uintptr) { // we get tight bounds checks on at least the top bound. // // We must do this regardless of whether we map new memory. - if needIdxLimit > len(s.summary[l]) { - s.summary[l] = s.summary[l][:needIdxLimit] + if needIdxLimit > len(p.summary[l]) { + p.summary[l] = p.summary[l][:needIdxLimit] } // Compute the needed address range in the summary array for level l. @@ -163,10 +163,10 @@ func (s *pageAlloc) sysGrow(base, limit uintptr) { // for mapping. prune's invariants are guaranteed by the fact that this // function will never be asked to remap the same memory twice. if inUseIndex > 0 { - need = need.subtract(addrRangeToSumAddrRange(l, s.inUse.ranges[inUseIndex-1])) + need = need.subtract(addrRangeToSumAddrRange(l, p.inUse.ranges[inUseIndex-1])) } - if inUseIndex < len(s.inUse.ranges) { - need = need.subtract(addrRangeToSumAddrRange(l, s.inUse.ranges[inUseIndex])) + if inUseIndex < len(p.inUse.ranges) { + need = need.subtract(addrRangeToSumAddrRange(l, p.inUse.ranges[inUseIndex])) } // It's possible that after our pruning above, there's nothing new to map. if need.size() == 0 { @@ -174,7 +174,7 @@ func (s *pageAlloc) sysGrow(base, limit uintptr) { } // Map and commit need. 
- sysMap(unsafe.Pointer(need.base.addr()), need.size(), s.sysStat) + sysMap(unsafe.Pointer(need.base.addr()), need.size(), p.sysStat) sysUsed(unsafe.Pointer(need.base.addr()), need.size()) } } diff --git a/src/runtime/mpagealloc_test.go b/src/runtime/mpagealloc_test.go index 65ba71d459..5d979fa95b 100644 --- a/src/runtime/mpagealloc_test.go +++ b/src/runtime/mpagealloc_test.go @@ -54,7 +54,7 @@ func TestPageAllocGrow(t *testing.T) { BaseChunkIdx, }, inUse: []AddrRange{ - {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)}, + MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)), }, }, "Contiguous2": { @@ -63,7 +63,7 @@ func TestPageAllocGrow(t *testing.T) { BaseChunkIdx + 1, }, inUse: []AddrRange{ - {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+2, 0)}, + MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+2, 0)), }, }, "Contiguous5": { @@ -75,7 +75,7 @@ func TestPageAllocGrow(t *testing.T) { BaseChunkIdx + 4, }, inUse: []AddrRange{ - {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+5, 0)}, + MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+5, 0)), }, }, "Discontiguous": { @@ -85,9 +85,9 @@ func TestPageAllocGrow(t *testing.T) { BaseChunkIdx + 4, }, inUse: []AddrRange{ - {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)}, - {PageBase(BaseChunkIdx+2, 0), PageBase(BaseChunkIdx+3, 0)}, - {PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)}, + MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+2, 0), PageBase(BaseChunkIdx+3, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)), }, }, "Mixed": { @@ -98,8 +98,8 @@ func TestPageAllocGrow(t *testing.T) { BaseChunkIdx + 4, }, inUse: []AddrRange{ - {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+3, 0)}, - {PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)}, + MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+3, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+4, 0), 
PageBase(BaseChunkIdx+5, 0)), }, }, "WildlyDiscontiguous": { @@ -110,9 +110,9 @@ func TestPageAllocGrow(t *testing.T) { BaseChunkIdx + 0x21, }, inUse: []AddrRange{ - {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+2, 0)}, - {PageBase(BaseChunkIdx+0x10, 0), PageBase(BaseChunkIdx+0x11, 0)}, - {PageBase(BaseChunkIdx+0x21, 0), PageBase(BaseChunkIdx+0x22, 0)}, + MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+2, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+0x10, 0), PageBase(BaseChunkIdx+0x11, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+0x21, 0), PageBase(BaseChunkIdx+0x22, 0)), }, }, "ManyDiscontiguous": { @@ -129,39 +129,39 @@ func TestPageAllocGrow(t *testing.T) { BaseChunkIdx + 64, }, inUse: []AddrRange{ - {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)}, - {PageBase(BaseChunkIdx+2, 0), PageBase(BaseChunkIdx+3, 0)}, - {PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)}, - {PageBase(BaseChunkIdx+6, 0), PageBase(BaseChunkIdx+7, 0)}, - {PageBase(BaseChunkIdx+8, 0), PageBase(BaseChunkIdx+9, 0)}, - {PageBase(BaseChunkIdx+10, 0), PageBase(BaseChunkIdx+11, 0)}, - {PageBase(BaseChunkIdx+12, 0), PageBase(BaseChunkIdx+13, 0)}, - {PageBase(BaseChunkIdx+14, 0), PageBase(BaseChunkIdx+15, 0)}, - {PageBase(BaseChunkIdx+16, 0), PageBase(BaseChunkIdx+17, 0)}, - {PageBase(BaseChunkIdx+18, 0), PageBase(BaseChunkIdx+19, 0)}, - {PageBase(BaseChunkIdx+20, 0), PageBase(BaseChunkIdx+21, 0)}, - {PageBase(BaseChunkIdx+22, 0), PageBase(BaseChunkIdx+23, 0)}, - {PageBase(BaseChunkIdx+24, 0), PageBase(BaseChunkIdx+25, 0)}, - {PageBase(BaseChunkIdx+26, 0), PageBase(BaseChunkIdx+27, 0)}, - {PageBase(BaseChunkIdx+28, 0), PageBase(BaseChunkIdx+29, 0)}, - {PageBase(BaseChunkIdx+30, 0), PageBase(BaseChunkIdx+31, 0)}, - {PageBase(BaseChunkIdx+32, 0), PageBase(BaseChunkIdx+33, 0)}, - {PageBase(BaseChunkIdx+34, 0), PageBase(BaseChunkIdx+35, 0)}, - {PageBase(BaseChunkIdx+36, 0), PageBase(BaseChunkIdx+37, 0)}, - {PageBase(BaseChunkIdx+38, 0), PageBase(BaseChunkIdx+39, 0)}, - 
{PageBase(BaseChunkIdx+40, 0), PageBase(BaseChunkIdx+41, 0)}, - {PageBase(BaseChunkIdx+42, 0), PageBase(BaseChunkIdx+43, 0)}, - {PageBase(BaseChunkIdx+44, 0), PageBase(BaseChunkIdx+45, 0)}, - {PageBase(BaseChunkIdx+46, 0), PageBase(BaseChunkIdx+47, 0)}, - {PageBase(BaseChunkIdx+48, 0), PageBase(BaseChunkIdx+49, 0)}, - {PageBase(BaseChunkIdx+50, 0), PageBase(BaseChunkIdx+51, 0)}, - {PageBase(BaseChunkIdx+52, 0), PageBase(BaseChunkIdx+53, 0)}, - {PageBase(BaseChunkIdx+54, 0), PageBase(BaseChunkIdx+55, 0)}, - {PageBase(BaseChunkIdx+56, 0), PageBase(BaseChunkIdx+57, 0)}, - {PageBase(BaseChunkIdx+58, 0), PageBase(BaseChunkIdx+59, 0)}, - {PageBase(BaseChunkIdx+60, 0), PageBase(BaseChunkIdx+61, 0)}, - {PageBase(BaseChunkIdx+62, 0), PageBase(BaseChunkIdx+63, 0)}, - {PageBase(BaseChunkIdx+64, 0), PageBase(BaseChunkIdx+65, 0)}, + MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+2, 0), PageBase(BaseChunkIdx+3, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+6, 0), PageBase(BaseChunkIdx+7, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+8, 0), PageBase(BaseChunkIdx+9, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+10, 0), PageBase(BaseChunkIdx+11, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+12, 0), PageBase(BaseChunkIdx+13, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+14, 0), PageBase(BaseChunkIdx+15, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+16, 0), PageBase(BaseChunkIdx+17, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+18, 0), PageBase(BaseChunkIdx+19, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+20, 0), PageBase(BaseChunkIdx+21, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+22, 0), PageBase(BaseChunkIdx+23, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+24, 0), PageBase(BaseChunkIdx+25, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+26, 0), PageBase(BaseChunkIdx+27, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+28, 0), PageBase(BaseChunkIdx+29, 0)), + 
MakeAddrRange(PageBase(BaseChunkIdx+30, 0), PageBase(BaseChunkIdx+31, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+32, 0), PageBase(BaseChunkIdx+33, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+34, 0), PageBase(BaseChunkIdx+35, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+36, 0), PageBase(BaseChunkIdx+37, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+38, 0), PageBase(BaseChunkIdx+39, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+40, 0), PageBase(BaseChunkIdx+41, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+42, 0), PageBase(BaseChunkIdx+43, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+44, 0), PageBase(BaseChunkIdx+45, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+46, 0), PageBase(BaseChunkIdx+47, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+48, 0), PageBase(BaseChunkIdx+49, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+50, 0), PageBase(BaseChunkIdx+51, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+52, 0), PageBase(BaseChunkIdx+53, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+54, 0), PageBase(BaseChunkIdx+55, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+56, 0), PageBase(BaseChunkIdx+57, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+58, 0), PageBase(BaseChunkIdx+59, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+60, 0), PageBase(BaseChunkIdx+61, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+62, 0), PageBase(BaseChunkIdx+63, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+64, 0), PageBase(BaseChunkIdx+65, 0)), }, }, } @@ -172,8 +172,8 @@ func TestPageAllocGrow(t *testing.T) { BaseChunkIdx + 0x100000, // constant translates to O(TiB) }, inUse: []AddrRange{ - {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)}, - {PageBase(BaseChunkIdx+0x100000, 0), PageBase(BaseChunkIdx+0x100001, 0)}, + MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)), + MakeAddrRange(PageBase(BaseChunkIdx+0x100000, 0), PageBase(BaseChunkIdx+0x100001, 0)), }, } } @@ -197,7 +197,7 @@ func TestPageAllocGrow(t *testing.T) { t.Fail() } else { for i := range want { - if want[i] != got[i] { + if !want[i].Equals(got[i]) { 
t.Fail() break } @@ -207,11 +207,11 @@ func TestPageAllocGrow(t *testing.T) { t.Logf("found inUse mismatch") t.Logf("got:") for i, r := range got { - t.Logf("\t#%d [0x%x, 0x%x)", i, r.Base, r.Limit) + t.Logf("\t#%d [0x%x, 0x%x)", i, r.Base(), r.Limit()) } t.Logf("want:") for i, r := range want { - t.Logf("\t#%d [0x%x, 0x%x)", i, r.Base, r.Limit) + t.Logf("\t#%d [0x%x, 0x%x)", i, r.Base(), r.Limit()) } } }) diff --git a/src/runtime/mpagecache.go b/src/runtime/mpagecache.go index 683a997136..4b5c66d8d6 100644 --- a/src/runtime/mpagecache.go +++ b/src/runtime/mpagecache.go @@ -71,8 +71,14 @@ func (c *pageCache) allocN(npages uintptr) (uintptr, uintptr) { // into s. Then, it clears the cache, such that empty returns // true. // -// s.mheapLock must be held or the world must be stopped. -func (c *pageCache) flush(s *pageAlloc) { +// p.mheapLock must be held. +// +// Must run on the system stack because p.mheapLock must be held. +// +//go:systemstack +func (c *pageCache) flush(p *pageAlloc) { + assertLockHeld(p.mheapLock) + if c.empty() { return } @@ -83,18 +89,18 @@ func (c *pageCache) flush(s *pageAlloc) { // slower, safer thing by iterating over each bit individually. for i := uint(0); i < 64; i++ { if c.cache&(1<<i) != 0 { - s.chunkOf(ci).free1(pi + i) + p.chunkOf(ci).free1(pi + i) } if c.scav&(1<<i) != 0 { - s.chunkOf(ci).scavenged.setRange(pi+i, 1) + p.chunkOf(ci).scavenged.setRange(pi+i, 1) } } // Since this is a lot like a free, we need to make sure // we update the searchAddr just like free does. - if b := (offAddr{c.base}); b.lessThan(s.searchAddr) { - s.searchAddr = b + if b := (offAddr{c.base}); b.lessThan(p.searchAddr) { + p.searchAddr = b } - s.update(c.base, pageCachePages, false, false) + p.update(c.base, pageCachePages, false, false) *c = pageCache{} } @@ -102,19 +108,25 @@ func (c *pageCache) flush(s *pageAlloc) { // may not be contiguous, and returns a pageCache structure which owns the // chunk. // -// s.mheapLock must be held. 
-func (s *pageAlloc) allocToCache() pageCache { +// p.mheapLock must be held. +// +// Must run on the system stack because p.mheapLock must be held. +// +//go:systemstack +func (p *pageAlloc) allocToCache() pageCache { + assertLockHeld(p.mheapLock) + // If the searchAddr refers to a region which has a higher address than // any known chunk, then we know we're out of memory. - if chunkIndex(s.searchAddr.addr()) >= s.end { + if chunkIndex(p.searchAddr.addr()) >= p.end { return pageCache{} } c := pageCache{} - ci := chunkIndex(s.searchAddr.addr()) // chunk index - if s.summary[len(s.summary)-1][ci] != 0 { + ci := chunkIndex(p.searchAddr.addr()) // chunk index + if p.summary[len(p.summary)-1][ci] != 0 { // Fast path: there's free pages at or near the searchAddr address. - chunk := s.chunkOf(ci) - j, _ := chunk.find(1, chunkPageIndex(s.searchAddr.addr())) + chunk := p.chunkOf(ci) + j, _ := chunk.find(1, chunkPageIndex(p.searchAddr.addr())) if j == ^uint(0) { throw("bad summary data") } @@ -126,15 +138,15 @@ func (s *pageAlloc) allocToCache() pageCache { } else { // Slow path: the searchAddr address had nothing there, so go find // the first free page the slow way. - addr, _ := s.find(1) + addr, _ := p.find(1) if addr == 0 { // We failed to find adequate free space, so mark the searchAddr as OoM // and return an empty pageCache. - s.searchAddr = maxSearchAddr + p.searchAddr = maxSearchAddr return pageCache{} } ci := chunkIndex(addr) - chunk := s.chunkOf(ci) + chunk := p.chunkOf(ci) c = pageCache{ base: alignDown(addr, 64*pageSize), cache: ^chunk.pages64(chunkPageIndex(addr)), @@ -143,19 +155,19 @@ func (s *pageAlloc) allocToCache() pageCache { } // Set the bits as allocated and clear the scavenged bits. - s.allocRange(c.base, pageCachePages) + p.allocRange(c.base, pageCachePages) // Update as an allocation, but note that it's not contiguous. 
- s.update(c.base, pageCachePages, false, true) + p.update(c.base, pageCachePages, false, true) // Set the search address to the last page represented by the cache. // Since all of the pages in this block are going to the cache, and we // searched for the first free page, we can confidently start at the // next page. // - // However, s.searchAddr is not allowed to point into unmapped heap memory + // However, p.searchAddr is not allowed to point into unmapped heap memory // unless it is maxSearchAddr, so make it the last page as opposed to // the page after. - s.searchAddr = offAddr{c.base + pageSize*(pageCachePages-1)} + p.searchAddr = offAddr{c.base + pageSize*(pageCachePages-1)} return c } diff --git a/src/runtime/mranges.go b/src/runtime/mranges.go index 2c0eb2c2dd..84a2c06dbb 100644 --- a/src/runtime/mranges.go +++ b/src/runtime/mranges.go @@ -160,10 +160,10 @@ type addrRanges struct { totalBytes uintptr // sysStat is the stat to track allocations by this type - sysStat *uint64 + sysStat *sysMemStat } -func (a *addrRanges) init(sysStat *uint64) { +func (a *addrRanges) init(sysStat *sysMemStat) { ranges := (*notInHeapSlice)(unsafe.Pointer(&a.ranges)) ranges.len = 0 ranges.cap = 16 @@ -172,20 +172,46 @@ func (a *addrRanges) init(sysStat *uint64) { a.totalBytes = 0 } -// findSucc returns the first index in a such that base is +// findSucc returns the first index in a such that addr is // less than the base of the addrRange at that index. func (a *addrRanges) findSucc(addr uintptr) int { - // TODO(mknyszek): Consider a binary search for large arrays. - // While iterating over these ranges is potentially expensive, - // the expected number of ranges is small, ideally just 1, - // since Go heaps are usually mostly contiguous. base := offAddr{addr} - for i := range a.ranges { + + // Narrow down the search space via a binary search + // for large addrRanges until we have at most iterMax + // candidates left. 
+ const iterMax = 8 + bot, top := 0, len(a.ranges) + for top-bot > iterMax { + i := ((top - bot) / 2) + bot + if a.ranges[i].contains(base.addr()) { + // a.ranges[i] contains base, so + // its successor is the next index. + return i + 1 + } + if base.lessThan(a.ranges[i].base) { + // In this case i might actually be + // the successor, but we can't be sure + // until we check the ones before it. + top = i + } else { + // In this case we know base is + // greater than or equal to a.ranges[i].limit-1, + // so i is definitely not the successor. + // We already checked i, so pick the next + // one. + bot = i + 1 + } + } + // There are top-bot candidates left, so + // iterate over them and find the first that + // base is strictly less than. + for i := bot; i < top; i++ { if base.lessThan(a.ranges[i].base) { return i } } - return len(a.ranges) + return top } // findAddrGreaterEqual returns the smallest address represented by a @@ -218,7 +244,7 @@ func (a *addrRanges) contains(addr uintptr) bool { // add inserts a new address range to a. // -// r must not overlap with any address range in a. +// r must not overlap with any address range in a and r.size() must be > 0. func (a *addrRanges) add(r addrRange) { // The copies in this function are potentially expensive, but this data // structure is meant to represent the Go heap. At worst, copying this @@ -229,6 +255,12 @@ func (a *addrRanges) add(r addrRange) { // of 16) and Go heaps are usually mostly contiguous, so the chance that // an addrRanges even grows to that size is extremely low. + // An empty range has no effect on the set of addresses represented + // by a, but passing a zero-sized range is almost always a bug. + if r.size() == 0 { + print("runtime: range = {", hex(r.base.addr()), ", ", hex(r.limit.addr()), "}\n") + throw("attempted to add zero-sized address range") + } // Because we assume r is not currently represented in a, // findSucc gives us our insertion index. 
i := a.findSucc(r.base.addr()) diff --git a/src/runtime/mranges_test.go b/src/runtime/mranges_test.go new file mode 100644 index 0000000000..ed439c56c2 --- /dev/null +++ b/src/runtime/mranges_test.go @@ -0,0 +1,275 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + . "runtime" + "testing" +) + +func validateAddrRanges(t *testing.T, a *AddrRanges, want ...AddrRange) { + ranges := a.Ranges() + if len(ranges) != len(want) { + t.Errorf("want %v, got %v", want, ranges) + t.Fatal("different lengths") + } + gotTotalBytes := uintptr(0) + wantTotalBytes := uintptr(0) + for i := range ranges { + gotTotalBytes += ranges[i].Size() + wantTotalBytes += want[i].Size() + if ranges[i].Base() >= ranges[i].Limit() { + t.Error("empty range found") + } + // Ensure this is equivalent to what we want. + if !ranges[i].Equals(want[i]) { + t.Errorf("range %d: got [0x%x, 0x%x), want [0x%x, 0x%x)", i, + ranges[i].Base(), ranges[i].Limit(), + want[i].Base(), want[i].Limit(), + ) + } + if i != 0 { + // Ensure the ranges are sorted. + if ranges[i-1].Base() >= ranges[i].Base() { + t.Errorf("ranges %d and %d are out of sorted order", i-1, i) + } + // Check for a failure to coalesce. + if ranges[i-1].Limit() == ranges[i].Base() { + t.Errorf("ranges %d and %d should have coalesced", i-1, i) + } + // Check if any ranges overlap. Because the ranges are sorted + // by base, it's sufficient to just check neighbors. 
+ if ranges[i-1].Limit() > ranges[i].Base() { + t.Errorf("ranges %d and %d overlap", i-1, i) + } + } + } + if wantTotalBytes != gotTotalBytes { + t.Errorf("expected %d total bytes, got %d", wantTotalBytes, gotTotalBytes) + } + if b := a.TotalBytes(); b != gotTotalBytes { + t.Errorf("inconsistent total bytes: want %d, got %d", gotTotalBytes, b) + } + if t.Failed() { + t.Errorf("addrRanges: %v", ranges) + t.Fatal("detected bad addrRanges") + } +} + +func TestAddrRangesAdd(t *testing.T) { + a := NewAddrRanges() + + // First range. + a.Add(MakeAddrRange(512, 1024)) + validateAddrRanges(t, &a, + MakeAddrRange(512, 1024), + ) + + // Coalesce up. + a.Add(MakeAddrRange(1024, 2048)) + validateAddrRanges(t, &a, + MakeAddrRange(512, 2048), + ) + + // Add new independent range. + a.Add(MakeAddrRange(4096, 8192)) + validateAddrRanges(t, &a, + MakeAddrRange(512, 2048), + MakeAddrRange(4096, 8192), + ) + + // Coalesce down. + a.Add(MakeAddrRange(3776, 4096)) + validateAddrRanges(t, &a, + MakeAddrRange(512, 2048), + MakeAddrRange(3776, 8192), + ) + + // Coalesce up and down. + a.Add(MakeAddrRange(2048, 3776)) + validateAddrRanges(t, &a, + MakeAddrRange(512, 8192), + ) + + // Push a bunch of independent ranges to the end to try and force growth. + expectedRanges := []AddrRange{MakeAddrRange(512, 8192)} + for i := uintptr(0); i < 64; i++ { + dRange := MakeAddrRange(8192+(i+1)*2048, 8192+(i+1)*2048+10) + a.Add(dRange) + expectedRanges = append(expectedRanges, dRange) + validateAddrRanges(t, &a, expectedRanges...) + } + + // Push a bunch of independent ranges to the beginning to try and force growth. + var bottomRanges []AddrRange + for i := uintptr(0); i < 63; i++ { + dRange := MakeAddrRange(8+i*8, 8+i*8+4) + a.Add(dRange) + bottomRanges = append(bottomRanges, dRange) + validateAddrRanges(t, &a, append(bottomRanges, expectedRanges...)...) 
+ } +} + +func TestAddrRangesFindSucc(t *testing.T) { + var large []AddrRange + for i := 0; i < 100; i++ { + large = append(large, MakeAddrRange(5+uintptr(i)*5, 5+uintptr(i)*5+3)) + } + + type testt struct { + name string + base uintptr + expect int + ranges []AddrRange + } + tests := []testt{ + { + name: "Empty", + base: 12, + expect: 0, + ranges: []AddrRange{}, + }, + { + name: "OneBefore", + base: 12, + expect: 0, + ranges: []AddrRange{ + MakeAddrRange(14, 16), + }, + }, + { + name: "OneWithin", + base: 14, + expect: 1, + ranges: []AddrRange{ + MakeAddrRange(14, 16), + }, + }, + { + name: "OneAfterLimit", + base: 16, + expect: 1, + ranges: []AddrRange{ + MakeAddrRange(14, 16), + }, + }, + { + name: "OneAfter", + base: 17, + expect: 1, + ranges: []AddrRange{ + MakeAddrRange(14, 16), + }, + }, + { + name: "ThreeBefore", + base: 3, + expect: 0, + ranges: []AddrRange{ + MakeAddrRange(6, 10), + MakeAddrRange(12, 16), + MakeAddrRange(19, 22), + }, + }, + { + name: "ThreeAfter", + base: 24, + expect: 3, + ranges: []AddrRange{ + MakeAddrRange(6, 10), + MakeAddrRange(12, 16), + MakeAddrRange(19, 22), + }, + }, + { + name: "ThreeBetween", + base: 11, + expect: 1, + ranges: []AddrRange{ + MakeAddrRange(6, 10), + MakeAddrRange(12, 16), + MakeAddrRange(19, 22), + }, + }, + { + name: "ThreeWithin", + base: 9, + expect: 1, + ranges: []AddrRange{ + MakeAddrRange(6, 10), + MakeAddrRange(12, 16), + MakeAddrRange(19, 22), + }, + }, + { + name: "Zero", + base: 0, + expect: 1, + ranges: []AddrRange{ + MakeAddrRange(0, 10), + }, + }, + { + name: "Max", + base: ^uintptr(0), + expect: 1, + ranges: []AddrRange{ + MakeAddrRange(^uintptr(0)-5, ^uintptr(0)), + }, + }, + { + name: "LargeBefore", + base: 2, + expect: 0, + ranges: large, + }, + { + name: "LargeAfter", + base: 5 + uintptr(len(large))*5 + 30, + expect: len(large), + ranges: large, + }, + { + name: "LargeBetweenLow", + base: 14, + expect: 2, + ranges: large, + }, + { + name: "LargeBetweenHigh", + base: 249, + expect: 49, + 
ranges: large, + }, + { + name: "LargeWithinLow", + base: 25, + expect: 5, + ranges: large, + }, + { + name: "LargeWithinHigh", + base: 396, + expect: 79, + ranges: large, + }, + { + name: "LargeWithinMiddle", + base: 250, + expect: 50, + ranges: large, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + a := MakeAddrRanges(test.ranges...) + i := a.FindSucc(test.base) + if i != test.expect { + t.Fatalf("expected %d, got %d", test.expect, i) + } + }) + } +} diff --git a/src/runtime/msan.go b/src/runtime/msan.go index c0f3957e28..6a5960b0a8 100644 --- a/src/runtime/msan.go +++ b/src/runtime/msan.go @@ -50,8 +50,12 @@ func msanmalloc(addr unsafe.Pointer, sz uintptr) //go:noescape func msanfree(addr unsafe.Pointer, sz uintptr) -// These are called from msan_amd64.s +//go:noescape +func msanmove(dst, src unsafe.Pointer, sz uintptr) + +// These are called from msan_GOARCH.s //go:cgo_import_static __msan_read_go //go:cgo_import_static __msan_write_go //go:cgo_import_static __msan_malloc_go //go:cgo_import_static __msan_free_go +//go:cgo_import_static __msan_memmove diff --git a/src/runtime/msan_amd64.s b/src/runtime/msan_amd64.s index cbe739df53..669e9ca73f 100644 --- a/src/runtime/msan_amd64.s +++ b/src/runtime/msan_amd64.s @@ -58,6 +58,15 @@ TEXT runtime·msanfree(SB), NOSPLIT, $0-16 MOVQ $__msan_free_go(SB), AX JMP msancall<>(SB) +// func runtime·msanmove(dst, src unsafe.Pointer, sz uintptr) +TEXT runtime·msanmove(SB), NOSPLIT, $0-24 + MOVQ dst+0(FP), RARG0 + MOVQ src+8(FP), RARG1 + MOVQ size+16(FP), RARG2 + // void __msan_memmove(void *dst, void *src, uintptr_t sz); + MOVQ $__msan_memmove(SB), AX + JMP msancall<>(SB) + // Switches SP to g0 stack and calls (AX). Arguments already set. 
TEXT msancall<>(SB), NOSPLIT, $0-0 get_tls(R12) diff --git a/src/runtime/msan_arm64.s b/src/runtime/msan_arm64.s index 5e29f1aefb..f19906cfc8 100644 --- a/src/runtime/msan_arm64.s +++ b/src/runtime/msan_arm64.s @@ -9,6 +9,7 @@ #define RARG0 R0 #define RARG1 R1 +#define RARG2 R2 #define FARG R3 // func runtime·domsanread(addr unsafe.Pointer, sz uintptr) @@ -45,6 +46,15 @@ TEXT runtime·msanfree(SB), NOSPLIT, $0-16 MOVD $__msan_free_go(SB), FARG JMP msancall<>(SB) +// func runtime·msanmove(dst, src unsafe.Pointer, sz uintptr) +TEXT runtime·msanmove(SB), NOSPLIT, $0-24 + MOVD dst+0(FP), RARG0 + MOVD src+8(FP), RARG1 + MOVD size+16(FP), RARG2 + // void __msan_memmove(void *dst, void *src, uintptr_t sz); + MOVD $__msan_memmove(SB), FARG + JMP msancall<>(SB) + // Switches SP to g0 stack and calls (FARG). Arguments already set. TEXT msancall<>(SB), NOSPLIT, $0-0 MOVD RSP, R19 // callee-saved diff --git a/src/runtime/mspanset.go b/src/runtime/mspanset.go index 490eed4549..10d2596c38 100644 --- a/src/runtime/mspanset.go +++ b/src/runtime/mspanset.go @@ -102,7 +102,7 @@ retry: if newCap == 0 { newCap = spanSetInitSpineCap } - newSpine := persistentalloc(newCap*sys.PtrSize, cpu.CacheLineSize, &memstats.gc_sys) + newSpine := persistentalloc(newCap*sys.PtrSize, cpu.CacheLineSize, &memstats.gcMiscSys) if b.spineCap != 0 { // Blocks are allocated off-heap, so // no write barriers. @@ -283,7 +283,7 @@ func (p *spanSetBlockAlloc) alloc() *spanSetBlock { if s := (*spanSetBlock)(p.stack.pop()); s != nil { return s } - return (*spanSetBlock)(persistentalloc(unsafe.Sizeof(spanSetBlock{}), cpu.CacheLineSize, &memstats.gc_sys)) + return (*spanSetBlock)(persistentalloc(unsafe.Sizeof(spanSetBlock{}), cpu.CacheLineSize, &memstats.gcMiscSys)) } // free returns a spanSetBlock back to the pool. 
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go index 6a8a34d1ed..6defaedabe 100644 --- a/src/runtime/mstats.go +++ b/src/runtime/mstats.go @@ -8,13 +8,10 @@ package runtime import ( "runtime/internal/atomic" - "runtime/internal/sys" "unsafe" ) // Statistics. -// If you edit this structure, also edit type MemStats below. -// Their layouts must match exactly. // // For detailed descriptions see the documentation for MemStats. // Fields that differ from MemStats are further documented here. @@ -35,31 +32,43 @@ type mstats struct { // // Like MemStats, heap_sys and heap_inuse do not count memory // in manually-managed spans. - heap_alloc uint64 // bytes allocated and not yet freed (same as alloc above) - heap_sys uint64 // virtual address space obtained from system for GC'd heap - heap_idle uint64 // bytes in idle spans - heap_inuse uint64 // bytes in mSpanInUse spans - heap_released uint64 // bytes released to the os + heap_sys sysMemStat // virtual address space obtained from system for GC'd heap + heap_inuse uint64 // bytes in mSpanInUse spans + heap_released uint64 // bytes released to the os // heap_objects is not used by the runtime directly and instead // computed on the fly by updatememstats. heap_objects uint64 // total number of allocated objects + // Statistics about stacks. + stacks_inuse uint64 // bytes in manually-managed stack spans; computed by updatememstats + stacks_sys sysMemStat // only counts newosproc0 stack in mstats; differs from MemStats.StackSys + // Statistics about allocation of low-level fixed-size structures. // Protected by FixAlloc locks. 
- stacks_inuse uint64 // bytes in manually-managed stack spans; updated atomically or during STW - stacks_sys uint64 // only counts newosproc0 stack in mstats; differs from MemStats.StackSys mspan_inuse uint64 // mspan structures - mspan_sys uint64 + mspan_sys sysMemStat mcache_inuse uint64 // mcache structures - mcache_sys uint64 - buckhash_sys uint64 // profiling bucket hash table - gc_sys uint64 // updated atomically or during STW - other_sys uint64 // updated atomically or during STW + mcache_sys sysMemStat + buckhash_sys sysMemStat // profiling bucket hash table + + // Statistics about GC overhead. + gcWorkBufInUse uint64 // computed by updatememstats + gcProgPtrScalarBitsInUse uint64 // computed by updatememstats + gcMiscSys sysMemStat // updated atomically or during STW + + // Miscellaneous statistics. + other_sys sysMemStat // updated atomically or during STW + + // Statistics about the garbage collector. + + // next_gc is the goal heap_live for when next GC ends. + // Set to ^uint64(0) if disabled. + // + // Read and written atomically, unless the world is stopped. + next_gc uint64 - // Statistics about garbage collector. // Protected by mheap or stopping the world during GC. - next_gc uint64 // goal heap_live for when next GC ends; ^0 if disabled last_gc_unix uint64 // last gc (in unix time) pause_total_ns uint64 pause_ns [256]uint64 // circular buffer of recent gc pause lengths @@ -78,7 +87,9 @@ type mstats struct { nfree uint64 } - // Statistics below here are not exported to MemStats directly. + // Add an uint32 for even number of size classes to align below fields + // to 64 bits for atomic operations on 32 bit platforms. + _ [1 - _NumSizeClasses%2]uint32 last_gc_nanotime uint64 // last gc (monotonic time) tinyallocs uint64 // number of tiny allocations that didn't cause actual allocation; not exported to go directly @@ -106,11 +117,10 @@ type mstats struct { // heap_live is the number of bytes considered live by the GC. 
// That is: retained by the most recent GC plus allocated - // since then. heap_live <= heap_alloc, since heap_alloc - // includes unmarked objects that have not yet been swept (and - // hence goes up as we allocate and down as we sweep) while - // heap_live excludes these objects (and hence only goes up - // between GCs). + // since then. heap_live <= alloc, since alloc includes unmarked + // objects that have not yet been swept (and hence goes up as we + // allocate and down as we sweep) while heap_live excludes these + // objects (and hence only goes up between GCs). // // This is updated atomically without locking. To reduce // contention, this is updated only when obtaining a span from @@ -135,6 +145,8 @@ type mstats struct { // no-scan objects and no-scan tails of objects. // // Whenever this is updated, call gcController.revise(). + // + // Read and written atomically or with the world stopped. heap_scan uint64 // heap_marked is the number of bytes marked by the previous @@ -142,6 +154,17 @@ type mstats struct { // unlike heap_live, heap_marked does not change until the // next mark termination. heap_marked uint64 + + // heapStats is a set of statistics + heapStats consistentHeapStats + + // _ uint32 // ensure gcPauseDist is aligned + + // gcPauseDist represents the distribution of all GC-related + // application pauses in the runtime. + // + // Each individual pause is counted separately, unlike pause_ns. + gcPauseDist timeHistogram } var memstats mstats @@ -419,24 +442,25 @@ type MemStats struct { } } -// Size of the trailing by_size array differs between mstats and MemStats, -// and all data after by_size is local to runtime, not exported. -// NumSizeClasses was changed, but we cannot change MemStats because of backward compatibility. -// sizeof_C_MStats is the size of the prefix of mstats that -// corresponds to MemStats. It should match Sizeof(MemStats{}). 
-var sizeof_C_MStats = unsafe.Offsetof(memstats.by_size) + 61*unsafe.Sizeof(memstats.by_size[0]) - func init() { - var memStats MemStats - if sizeof_C_MStats != unsafe.Sizeof(memStats) { - println(sizeof_C_MStats, unsafe.Sizeof(memStats)) - throw("MStats vs MemStatsType size mismatch") - } - - if unsafe.Offsetof(memstats.heap_live)%8 != 0 { - println(unsafe.Offsetof(memstats.heap_live)) + if offset := unsafe.Offsetof(memstats.heap_live); offset%8 != 0 { + println(offset) throw("memstats.heap_live not aligned to 8 bytes") } + if offset := unsafe.Offsetof(memstats.heapStats); offset%8 != 0 { + println(offset) + throw("memstats.heapStats not aligned to 8 bytes") + } + if offset := unsafe.Offsetof(memstats.gcPauseDist); offset%8 != 0 { + println(offset) + throw("memstats.gcPauseDist not aligned to 8 bytes") + } + // Ensure the size of heapStatsDelta causes adjacent fields/slots (e.g. + // [3]heapStatsDelta) to be 8-byte aligned. + if size := unsafe.Sizeof(heapStatsDelta{}); size%8 != 0 { + println(size) + throw("heapStatsDelta not a multiple of 8 bytes in size") + } } // ReadMemStats populates m with memory allocator statistics. @@ -458,14 +482,74 @@ func ReadMemStats(m *MemStats) { func readmemstats_m(stats *MemStats) { updatememstats() - // The size of the trailing by_size array differs between - // mstats and MemStats. NumSizeClasses was changed, but we - // cannot change MemStats because of backward compatibility. - memmove(unsafe.Pointer(stats), unsafe.Pointer(&memstats), sizeof_C_MStats) - + stats.Alloc = memstats.alloc + stats.TotalAlloc = memstats.total_alloc + stats.Sys = memstats.sys + stats.Mallocs = memstats.nmalloc + stats.Frees = memstats.nfree + stats.HeapAlloc = memstats.alloc + stats.HeapSys = memstats.heap_sys.load() + // By definition, HeapIdle is memory that was mapped + // for the heap but is not currently used to hold heap + // objects. 
It also specifically is memory that can be + // used for other purposes, like stacks, but this memory + // is subtracted out of HeapSys before it makes that + // transition. Put another way: + // + // heap_sys = bytes allocated from the OS for the heap - bytes ultimately used for non-heap purposes + // heap_idle = bytes allocated from the OS for the heap - bytes ultimately used for any purpose + // + // or + // + // heap_sys = sys - stacks_inuse - gcWorkBufInUse - gcProgPtrScalarBitsInUse + // heap_idle = sys - stacks_inuse - gcWorkBufInUse - gcProgPtrScalarBitsInUse - heap_inuse + // + // => heap_idle = heap_sys - heap_inuse + stats.HeapIdle = memstats.heap_sys.load() - memstats.heap_inuse + stats.HeapInuse = memstats.heap_inuse + stats.HeapReleased = memstats.heap_released + stats.HeapObjects = memstats.heap_objects + stats.StackInuse = memstats.stacks_inuse // memstats.stacks_sys is only memory mapped directly for OS stacks. // Add in heap-allocated stack memory for user consumption. - stats.StackSys += stats.StackInuse + stats.StackSys = memstats.stacks_inuse + memstats.stacks_sys.load() + stats.MSpanInuse = memstats.mspan_inuse + stats.MSpanSys = memstats.mspan_sys.load() + stats.MCacheInuse = memstats.mcache_inuse + stats.MCacheSys = memstats.mcache_sys.load() + stats.BuckHashSys = memstats.buckhash_sys.load() + // MemStats defines GCSys as an aggregate of all memory related + // to the memory management system, but we track this memory + // at a more granular level in the runtime. 
+ stats.GCSys = memstats.gcMiscSys.load() + memstats.gcWorkBufInUse + memstats.gcProgPtrScalarBitsInUse + stats.OtherSys = memstats.other_sys.load() + stats.NextGC = memstats.next_gc + stats.LastGC = memstats.last_gc_unix + stats.PauseTotalNs = memstats.pause_total_ns + stats.PauseNs = memstats.pause_ns + stats.PauseEnd = memstats.pause_end + stats.NumGC = memstats.numgc + stats.NumForcedGC = memstats.numforcedgc + stats.GCCPUFraction = memstats.gc_cpu_fraction + stats.EnableGC = true + + // Handle BySize. Copy N values, where N is + // the minimum of the lengths of the two arrays. + // Unfortunately copy() won't work here because + // the arrays have different structs. + // + // TODO(mknyszek): Consider renaming the fields + // of by_size's elements to align so we can use + // the copy built-in. + bySizeLen := len(stats.BySize) + if l := len(memstats.by_size); l < bySizeLen { + bySizeLen = l + } + for i := 0; i < bySizeLen; i++ { + stats.BySize[i].Size = memstats.by_size[i].size + stats.BySize[i].Mallocs = memstats.by_size[i].nmalloc + stats.BySize[i].Frees = memstats.by_size[i].nfree + } } //go:linkname readGCStats runtime/debug.readGCStats @@ -511,8 +595,14 @@ func readGCStats_m(pauses *[]uint64) { *pauses = p[:n+n+3] } +// Updates the memstats structure. +// +// The world must be stopped. +// //go:nowritebarrier func updatememstats() { + assertWorldStopped() + // Flush mcaches to mcentral before doing anything else. // // Flushing to the mcentral may in general cause stats to @@ -521,11 +611,9 @@ func updatememstats() { memstats.mcache_inuse = uint64(mheap_.cachealloc.inuse) memstats.mspan_inuse = uint64(mheap_.spanalloc.inuse) - memstats.sys = memstats.heap_sys + memstats.stacks_sys + memstats.mspan_sys + - memstats.mcache_sys + memstats.buckhash_sys + memstats.gc_sys + memstats.other_sys - - // We also count stacks_inuse as sys memory. 
- memstats.sys += memstats.stacks_inuse + memstats.sys = memstats.heap_sys.load() + memstats.stacks_sys.load() + memstats.mspan_sys.load() + + memstats.mcache_sys.load() + memstats.buckhash_sys.load() + memstats.gcMiscSys.load() + + memstats.other_sys.load() // Calculate memory allocator stats. // During program execution we only count number of frees and amount of freed memory. @@ -542,62 +630,75 @@ func updatememstats() { memstats.by_size[i].nmalloc = 0 memstats.by_size[i].nfree = 0 } + // Collect consistent stats, which are the source-of-truth in the some cases. + var consStats heapStatsDelta + memstats.heapStats.unsafeRead(&consStats) - // Aggregate local stats. - cachestats() + // Collect large allocation stats. + totalAlloc := uint64(consStats.largeAlloc) + memstats.nmalloc += uint64(consStats.largeAllocCount) + totalFree := uint64(consStats.largeFree) + memstats.nfree += uint64(consStats.largeFreeCount) - // Collect allocation stats. This is safe and consistent - // because the world is stopped. - var smallFree, totalAlloc, totalFree uint64 - // Collect per-spanclass stats. - for spc := range mheap_.central { - // The mcaches are now empty, so mcentral stats are - // up-to-date. - c := &mheap_.central[spc].mcentral - memstats.nmalloc += c.nmalloc - i := spanClass(spc).sizeclass() - memstats.by_size[i].nmalloc += c.nmalloc - totalAlloc += c.nmalloc * uint64(class_to_size[i]) - } // Collect per-sizeclass stats. for i := 0; i < _NumSizeClasses; i++ { - if i == 0 { - memstats.nmalloc += mheap_.nlargealloc - totalAlloc += mheap_.largealloc - totalFree += mheap_.largefree - memstats.nfree += mheap_.nlargefree - continue - } + // Malloc stats. + a := uint64(consStats.smallAllocCount[i]) + totalAlloc += a * uint64(class_to_size[i]) + memstats.nmalloc += a + memstats.by_size[i].nmalloc = a - // The mcache stats have been flushed to mheap_. 
- memstats.nfree += mheap_.nsmallfree[i] - memstats.by_size[i].nfree = mheap_.nsmallfree[i] - smallFree += mheap_.nsmallfree[i] * uint64(class_to_size[i]) + // Free stats. + f := uint64(consStats.smallFreeCount[i]) + totalFree += f * uint64(class_to_size[i]) + memstats.nfree += f + memstats.by_size[i].nfree = f } - totalFree += smallFree + // Account for tiny allocations. memstats.nfree += memstats.tinyallocs memstats.nmalloc += memstats.tinyallocs // Calculate derived stats. memstats.total_alloc = totalAlloc memstats.alloc = totalAlloc - totalFree - memstats.heap_alloc = memstats.alloc memstats.heap_objects = memstats.nmalloc - memstats.nfree -} -// cachestats flushes all mcache stats. -// -// The world must be stopped. -// -//go:nowritebarrier -func cachestats() { - for _, p := range allp { - c := p.mcache - if c == nil { - continue - } - purgecachedstats(c) + memstats.stacks_inuse = uint64(consStats.inStacks) + memstats.gcWorkBufInUse = uint64(consStats.inWorkBufs) + memstats.gcProgPtrScalarBitsInUse = uint64(consStats.inPtrScalarBits) + + // We also count stacks_inuse, gcWorkBufInUse, and gcProgPtrScalarBitsInUse as sys memory. + memstats.sys += memstats.stacks_inuse + memstats.gcWorkBufInUse + memstats.gcProgPtrScalarBitsInUse + + // The world is stopped, so the consistent stats (after aggregation) + // should be identical to some combination of memstats. In particular: + // + // * heap_inuse == inHeap + // * heap_released == released + // * heap_sys - heap_released == committed - inStacks - inWorkBufs - inPtrScalarBits + // + // Check if that's actually true. + // + // TODO(mknyszek): Maybe don't throw here. It would be bad if a + // bug in otherwise benign accounting caused the whole application + // to crash. 
+ if memstats.heap_inuse != uint64(consStats.inHeap) { + print("runtime: heap_inuse=", memstats.heap_inuse, "\n") + print("runtime: consistent value=", consStats.inHeap, "\n") + throw("heap_inuse and consistent stats are not equal") + } + if memstats.heap_released != uint64(consStats.released) { + print("runtime: heap_released=", memstats.heap_released, "\n") + print("runtime: consistent value=", consStats.released, "\n") + throw("heap_released and consistent stats are not equal") + } + globalRetained := memstats.heap_sys.load() - memstats.heap_released + consRetained := uint64(consStats.committed - consStats.inStacks - consStats.inWorkBufs - consStats.inPtrScalarBits) + if globalRetained != consRetained { + print("runtime: global value=", globalRetained, "\n") + print("runtime: consistent value=", consRetained, "\n") + throw("measures of the retained heap are not equal") } } @@ -607,6 +708,8 @@ func cachestats() { // //go:nowritebarrier func flushmcache(i int) { + assertWorldStopped() + p := allp[i] c := p.mcache if c == nil { @@ -622,69 +725,256 @@ func flushmcache(i int) { // //go:nowritebarrier func flushallmcaches() { + assertWorldStopped() + for i := 0; i < int(gomaxprocs); i++ { flushmcache(i) } } +// sysMemStat represents a global system statistic that is managed atomically. +// +// This type must structurally be a uint64 so that mstats aligns with MemStats. +type sysMemStat uint64 + +// load atomically reads the value of the stat. +// +// Must be nosplit as it is called in runtime initialization, e.g. newosproc0. //go:nosplit -func purgecachedstats(c *mcache) { - // Protected by either heap or GC lock. 
- h := &mheap_ - memstats.heap_scan += uint64(c.local_scan) - c.local_scan = 0 - memstats.tinyallocs += uint64(c.local_tinyallocs) - c.local_tinyallocs = 0 - h.largefree += uint64(c.local_largefree) - c.local_largefree = 0 - h.nlargefree += uint64(c.local_nlargefree) - c.local_nlargefree = 0 - for i := 0; i < len(c.local_nsmallfree); i++ { - h.nsmallfree[i] += uint64(c.local_nsmallfree[i]) - c.local_nsmallfree[i] = 0 - } +func (s *sysMemStat) load() uint64 { + return atomic.Load64((*uint64)(s)) } -// Atomically increases a given *system* memory stat. We are counting on this -// stat never overflowing a uintptr, so this function must only be used for -// system memory stats. -// -// The current implementation for little endian architectures is based on -// xadduintptr(), which is less than ideal: xadd64() should really be used. -// Using xadduintptr() is a stop-gap solution until arm supports xadd64() that -// doesn't use locks. (Locks are a problem as they require a valid G, which -// restricts their useability.) +// add atomically adds the sysMemStat by n. // -// A side-effect of using xadduintptr() is that we need to check for -// overflow errors. +// Must be nosplit as it is called in runtime initialization, e.g. newosproc0. //go:nosplit -func mSysStatInc(sysStat *uint64, n uintptr) { - if sysStat == nil { +func (s *sysMemStat) add(n int64) { + if s == nil { return } - if sys.BigEndian { - atomic.Xadd64(sysStat, int64(n)) - return + val := atomic.Xadd64((*uint64)(s), n) + if (n > 0 && int64(val) < n) || (n < 0 && int64(val)+n < n) { + print("runtime: val=", val, " n=", n, "\n") + throw("sysMemStat overflow") + } +} + +// heapStatsDelta contains deltas of various runtime memory statistics +// that need to be updated together in order for them to be kept +// consistent with one another. +type heapStatsDelta struct { + // Memory stats. 
+ committed int64 // byte delta of memory committed + released int64 // byte delta of released memory generated + inHeap int64 // byte delta of memory placed in the heap + inStacks int64 // byte delta of memory reserved for stacks + inWorkBufs int64 // byte delta of memory reserved for work bufs + inPtrScalarBits int64 // byte delta of memory reserved for unrolled GC prog bits + + // Allocator stats. + largeAlloc uintptr // bytes allocated for large objects + largeAllocCount uintptr // number of large object allocations + smallAllocCount [_NumSizeClasses]uintptr // number of allocs for small objects + largeFree uintptr // bytes freed for large objects (>maxSmallSize) + largeFreeCount uintptr // number of frees for large objects (>maxSmallSize) + smallFreeCount [_NumSizeClasses]uintptr // number of frees for small objects (<=maxSmallSize) + + // Add a uint32 to ensure this struct is a multiple of 8 bytes in size. + // Only necessary on 32-bit platforms. + // _ [(sys.PtrSize / 4) % 2]uint32 +} + +// merge adds in the deltas from b into a. +func (a *heapStatsDelta) merge(b *heapStatsDelta) { + a.committed += b.committed + a.released += b.released + a.inHeap += b.inHeap + a.inStacks += b.inStacks + a.inWorkBufs += b.inWorkBufs + a.inPtrScalarBits += b.inPtrScalarBits + + a.largeAlloc += b.largeAlloc + a.largeAllocCount += b.largeAllocCount + for i := range b.smallAllocCount { + a.smallAllocCount[i] += b.smallAllocCount[i] } - if val := atomic.Xadduintptr((*uintptr)(unsafe.Pointer(sysStat)), n); val < n { - print("runtime: stat overflow: val ", val, ", n ", n, "\n") - exit(2) + a.largeFree += b.largeFree + a.largeFreeCount += b.largeFreeCount + for i := range b.smallFreeCount { + a.smallFreeCount[i] += b.smallFreeCount[i] } } -// Atomically decreases a given *system* memory stat. Same comments as -// mSysStatInc apply. 
-//go:nosplit -func mSysStatDec(sysStat *uint64, n uintptr) { - if sysStat == nil { - return +// consistentHeapStats represents a set of various memory statistics +// whose updates must be viewed completely to get a consistent +// state of the world. +// +// To write updates to memory stats use the acquire and release +// methods. To obtain a consistent global snapshot of these statistics, +// use read. +type consistentHeapStats struct { + // stats is a ring buffer of heapStatsDelta values. + // Writers always atomically update the delta at index gen. + // + // Readers operate by rotating gen (0 -> 1 -> 2 -> 0 -> ...) + // and synchronizing with writers by observing each P's + // statsSeq field. If the reader observes a P not writing, + // it can be sure that it will pick up the new gen value the + // next time it writes. + // + // The reader then takes responsibility by clearing space + // in the ring buffer for the next reader to rotate gen to + // that space (i.e. it merges in values from index (gen-2) mod 3 + // to index (gen-1) mod 3, then clears the former). + // + // Note that this means only one reader can be reading at a time. + // There is no way for readers to synchronize. + // + // This process is why we need a ring buffer of size 3 instead + // of 2: one is for the writers, one contains the most recent + // data, and the last one is clear so writers can begin writing + // to it the moment gen is updated. + stats [3]heapStatsDelta + + // gen represents the current index into which writers + // are writing, and can take on the value of 0, 1, or 2. + // This value is updated atomically. + gen uint32 + + // noPLock is intended to provide mutual exclusion for updating + // stats when no P is available. It does not block other writers + // with a P, only other writers without a P and the reader. Because + // stats are usually updated when a P is available, contention on + // this lock should be minimal. 
+ noPLock mutex +} + +// acquire returns a heapStatsDelta to be updated. In effect, +// it acquires the shard for writing. release must be called +// as soon as the relevant deltas are updated. +// +// The returned heapStatsDelta must be updated atomically. +// +// The caller's P must not change between acquire and +// release. This also means that the caller should not +// acquire a P or release its P in between. +func (m *consistentHeapStats) acquire() *heapStatsDelta { + if pp := getg().m.p.ptr(); pp != nil { + seq := atomic.Xadd(&pp.statsSeq, 1) + if seq%2 == 0 { + // Should have been incremented to odd. + print("runtime: seq=", seq, "\n") + throw("bad sequence number") + } + } else { + lock(&m.noPLock) } - if sys.BigEndian { - atomic.Xadd64(sysStat, -int64(n)) - return + gen := atomic.Load(&m.gen) % 3 + return &m.stats[gen] +} + +// release indicates that the writer is done modifying +// the delta. The value returned by the corresponding +// acquire must no longer be accessed or modified after +// release is called. +// +// The caller's P must not change between acquire and +// release. This also means that the caller should not +// acquire a P or release its P in between. +func (m *consistentHeapStats) release() { + if pp := getg().m.p.ptr(); pp != nil { + seq := atomic.Xadd(&pp.statsSeq, 1) + if seq%2 != 0 { + // Should have been incremented to even. + print("runtime: seq=", seq, "\n") + throw("bad sequence number") + } + } else { + unlock(&m.noPLock) } - if val := atomic.Xadduintptr((*uintptr)(unsafe.Pointer(sysStat)), uintptr(-int64(n))); val+n < n { - print("runtime: stat underflow: val ", val, ", n ", n, "\n") - exit(2) +} + +// unsafeRead aggregates the delta for this shard into out. +// +// Unsafe because it does so without any synchronization. The +// world must be stopped. 
+func (m *consistentHeapStats) unsafeRead(out *heapStatsDelta) { + assertWorldStopped() + + for i := range m.stats { + out.merge(&m.stats[i]) + } +} + +// unsafeClear clears the shard. +// +// Unsafe because the world must be stopped and values should +// be donated elsewhere before clearing. +func (m *consistentHeapStats) unsafeClear() { + assertWorldStopped() + + for i := range m.stats { + m.stats[i] = heapStatsDelta{} } } + +// read takes a globally consistent snapshot of m +// and puts the aggregated value in out. Even though out is a +// heapStatsDelta, the resulting values should be complete and +// valid statistic values. +// +// Not safe to call concurrently. The world must be stopped +// or metricsSema must be held. +func (m *consistentHeapStats) read(out *heapStatsDelta) { + // Getting preempted after this point is not safe because + // we read allp. We need to make sure a STW can't happen + // so it doesn't change out from under us. + mp := acquirem() + + // Get the current generation. We can be confident that this + // will not change since read is serialized and is the only + // one that modifies currGen. + currGen := atomic.Load(&m.gen) + prevGen := currGen - 1 + if currGen == 0 { + prevGen = 2 + } + + // Prevent writers without a P from writing while we update gen. + lock(&m.noPLock) + + // Rotate gen, effectively taking a snapshot of the state of + // these statistics at the point of the exchange by moving + // writers to the next set of deltas. + // + // This exchange is safe to do because we won't race + // with anyone else trying to update this value. + atomic.Xchg(&m.gen, (currGen+1)%3) + + // Allow P-less writers to continue. They'll be writing to the + // next generation now. + unlock(&m.noPLock) + + for _, p := range allp { + // Spin until there are no more writers. 
+ for atomic.Load(&p.statsSeq)%2 != 0 { + } + } + + // At this point we've observed that each sequence + // number is even, so any future writers will observe + // the new gen value. That means it's safe to read from + // the other deltas in the stats buffer. + + // Perform our responsibilities and free up + // stats[prevGen] for the next time we want to take + // a snapshot. + m.stats[currGen].merge(&m.stats[prevGen]) + m.stats[prevGen] = heapStatsDelta{} + + // Finally, copy out the complete delta. + *out = m.stats[currGen] + + releasem(mp) +} diff --git a/src/runtime/mwbbuf.go b/src/runtime/mwbbuf.go index 632769c114..6efc00007d 100644 --- a/src/runtime/mwbbuf.go +++ b/src/runtime/mwbbuf.go @@ -57,12 +57,6 @@ type wbBuf struct { // on. This must be a multiple of wbBufEntryPointers because // the write barrier only checks for overflow once per entry. buf [wbBufEntryPointers * wbBufEntries]uintptr - - // debugGen causes the write barrier buffer to flush after - // every write barrier if equal to gcWorkPauseGen. This is for - // debugging #27993. This is only set if debugCachedWork is - // set. - debugGen uint32 } const ( @@ -86,7 +80,7 @@ const ( func (b *wbBuf) reset() { start := uintptr(unsafe.Pointer(&b.buf[0])) b.next = start - if writeBarrier.cgo || (debugCachedWork && (throwOnGCWork || b.debugGen == atomic.Load(&gcWorkPauseGen))) { + if writeBarrier.cgo { // Effectively disable the buffer by forcing a flush // on every barrier. b.end = uintptr(unsafe.Pointer(&b.buf[wbBufEntryPointers])) @@ -204,32 +198,10 @@ func wbBufFlush(dst *uintptr, src uintptr) { // Switch to the system stack so we don't have to worry about // the untyped stack slots or safe points. systemstack(func() { - if debugCachedWork { - // For debugging, include the old value of the - // slot and some other data in the traceback. - wbBuf := &getg().m.p.ptr().wbBuf - var old uintptr - if dst != nil { - // dst may be nil in direct calls to wbBufFlush. 
- old = *dst - } - wbBufFlush1Debug(old, wbBuf.buf[0], wbBuf.buf[1], &wbBuf.buf[0], wbBuf.next) - } else { - wbBufFlush1(getg().m.p.ptr()) - } + wbBufFlush1(getg().m.p.ptr()) }) } -// wbBufFlush1Debug is a temporary function for debugging issue -// #27993. It exists solely to add some context to the traceback. -// -//go:nowritebarrierrec -//go:systemstack -//go:noinline -func wbBufFlush1Debug(old, buf1, buf2 uintptr, start *uintptr, next uintptr) { - wbBufFlush1(getg().m.p.ptr()) -} - // wbBufFlush1 flushes p's write barrier buffer to the GC work queue. // // This must not have write barriers because it is part of the write diff --git a/src/runtime/netpoll.go b/src/runtime/netpoll.go index 34ea82a7fa..77eb3aa4c6 100644 --- a/src/runtime/netpoll.go +++ b/src/runtime/netpoll.go @@ -79,16 +79,17 @@ type pollDesc struct { lock mutex // protects the following fields fd uintptr closing bool - everr bool // marks event scanning error happened - user uint32 // user settable cookie - rseq uintptr // protects from stale read timers - rg uintptr // pdReady, pdWait, G waiting for read or nil - rt timer // read deadline timer (set if rt.f != nil) - rd int64 // read deadline - wseq uintptr // protects from stale write timers - wg uintptr // pdReady, pdWait, G waiting for write or nil - wt timer // write deadline timer - wd int64 // write deadline + everr bool // marks event scanning error happened + user uint32 // user settable cookie + rseq uintptr // protects from stale read timers + rg uintptr // pdReady, pdWait, G waiting for read or nil + rt timer // read deadline timer (set if rt.f != nil) + rd int64 // read deadline + wseq uintptr // protects from stale write timers + wg uintptr // pdReady, pdWait, G waiting for write or nil + wt timer // write deadline timer + wd int64 // write deadline + self *pollDesc // storage for indirect interface. See (*pollDesc).makeArg. 
} type pollCache struct { @@ -157,6 +158,7 @@ func poll_runtime_pollOpen(fd uintptr) (*pollDesc, int) { pd.wseq++ pd.wg = 0 pd.wd = 0 + pd.self = pd unlock(&pd.lock) var errno int32 @@ -271,14 +273,14 @@ func poll_runtime_pollSetDeadline(pd *pollDesc, d int64, mode int) { // Copy current seq into the timer arg. // Timer func will check the seq against current descriptor seq, // if they differ the descriptor was reused or timers were reset. - pd.rt.arg = pd + pd.rt.arg = pd.makeArg() pd.rt.seq = pd.rseq resettimer(&pd.rt, pd.rd) } } else if pd.rd != rd0 || combo != combo0 { pd.rseq++ // invalidate current timers if pd.rd > 0 { - modtimer(&pd.rt, pd.rd, 0, rtf, pd, pd.rseq) + modtimer(&pd.rt, pd.rd, 0, rtf, pd.makeArg(), pd.rseq) } else { deltimer(&pd.rt) pd.rt.f = nil @@ -287,14 +289,14 @@ func poll_runtime_pollSetDeadline(pd *pollDesc, d int64, mode int) { if pd.wt.f == nil { if pd.wd > 0 && !combo { pd.wt.f = netpollWriteDeadline - pd.wt.arg = pd + pd.wt.arg = pd.makeArg() pd.wt.seq = pd.wseq resettimer(&pd.wt, pd.wd) } } else if pd.wd != wd0 || combo != combo0 { pd.wseq++ // invalidate current timers if pd.wd > 0 && !combo { - modtimer(&pd.wt, pd.wd, 0, netpollWriteDeadline, pd, pd.wseq) + modtimer(&pd.wt, pd.wd, 0, netpollWriteDeadline, pd.makeArg(), pd.wseq) } else { deltimer(&pd.wt) pd.wt.f = nil @@ -547,3 +549,20 @@ func (c *pollCache) alloc() *pollDesc { unlock(&c.lock) return pd } + +// makeArg converts pd to an interface{}. +// makeArg does not do any allocation. Normally, such +// a conversion requires an allocation because pointers to +// go:notinheap types (which pollDesc is) must be stored +// in interfaces indirectly. See issue 42076. 
+func (pd *pollDesc) makeArg() (i interface{}) { + x := (*eface)(unsafe.Pointer(&i)) + x._type = pdType + x.data = unsafe.Pointer(&pd.self) + return +} + +var ( + pdEface interface{} = (*pollDesc)(nil) + pdType *_type = efaceOf(&pdEface)._type +) diff --git a/src/runtime/os2_aix.go b/src/runtime/os2_aix.go index 31ac6ddf79..428ff7f225 100644 --- a/src/runtime/os2_aix.go +++ b/src/runtime/os2_aix.go @@ -518,9 +518,10 @@ func sigaltstack(new, old *stackt) { } //go:nosplit -func getsystemcfg(label uint) uintptr { +//go:linkname internal_cpu_getsystemcfg internal/cpu.getsystemcfg +func internal_cpu_getsystemcfg(label uint) uint { r, _ := syscall1(&libc_getsystemcfg, uintptr(label)) - return r + return uint(r) } func usleep1(us uint32) diff --git a/src/runtime/os_aix.go b/src/runtime/os_aix.go index 9a6b8aec7c..0c501be96a 100644 --- a/src/runtime/os_aix.go +++ b/src/runtime/os_aix.go @@ -7,7 +7,6 @@ package runtime import ( - "internal/cpu" "unsafe" ) @@ -94,7 +93,6 @@ func semawakeup(mp *m) { func osinit() { ncpu = int32(sysconf(__SC_NPROCESSORS_ONLN)) physPageSize = sysconf(__SC_PAGE_SIZE) - setupSystemConf() } // newosproc0 is a version of newosproc that can be called before the runtime @@ -340,25 +338,6 @@ func walltime1() (sec int64, nsec int32) { return ts.tv_sec, int32(ts.tv_nsec) } -const ( - // getsystemcfg constants - _SC_IMPL = 2 - _IMPL_POWER8 = 0x10000 - _IMPL_POWER9 = 0x20000 -) - -// setupSystemConf retrieves information about the CPU and updates -// cpu.HWCap variables. 
-func setupSystemConf() { - impl := getsystemcfg(_SC_IMPL) - if impl&_IMPL_POWER8 != 0 { - cpu.HWCap2 |= cpu.PPC_FEATURE2_ARCH_2_07 - } - if impl&_IMPL_POWER9 != 0 { - cpu.HWCap2 |= cpu.PPC_FEATURE2_ARCH_3_00 - } -} - //go:nosplit func fcntl(fd, cmd, arg int32) int32 { r, _ := syscall3(&libc_fcntl, uintptr(fd), uintptr(cmd), uintptr(arg)) diff --git a/src/runtime/os_darwin.go b/src/runtime/os_darwin.go index 01c40b4813..e0a43c28aa 100644 --- a/src/runtime/os_darwin.go +++ b/src/runtime/os_darwin.go @@ -130,6 +130,18 @@ func osinit() { physPageSize = getPageSize() } +func sysctlbynameInt32(name []byte) (int32, int32) { + out := int32(0) + nout := unsafe.Sizeof(out) + ret := sysctlbyname(&name[0], (*byte)(unsafe.Pointer(&out)), &nout, nil, 0) + return ret, out +} + +//go:linkname internal_cpu_getsysctlbyname internal/cpu.getsysctlbyname +func internal_cpu_getsysctlbyname(name []byte) (int32, int32) { + return sysctlbynameInt32(name) +} + const ( _CTL_HW = 6 _HW_NCPU = 3 @@ -198,7 +210,6 @@ func newosproc(mp *m) { exit(1) } mp.g0.stack.hi = stacksize // for mstart - //mSysStatInc(&memstats.stacks_sys, stacksize) //TODO: do this? // Tell the pthread library we won't join with this thread. if pthread_attr_setdetachstate(&attr, _PTHREAD_CREATE_DETACHED) != 0 { @@ -247,7 +258,7 @@ func newosproc0(stacksize uintptr, fn uintptr) { exit(1) } g0.stack.hi = stacksize // for mstart - mSysStatInc(&memstats.stacks_sys, stacksize) + memstats.stacks_sys.add(int64(stacksize)) // Tell the pthread library we won't join with this thread. if pthread_attr_setdetachstate(&attr, _PTHREAD_CREATE_DETACHED) != 0 { @@ -284,14 +295,20 @@ func libpreinit() { func mpreinit(mp *m) { mp.gsignal = malg(32 * 1024) // OS X wants >= 8K mp.gsignal.m = mp + if GOOS == "darwin" && GOARCH == "arm64" { + // mlock the signal stack to work around a kernel bug where it may + // SIGILL when the signal stack is not faulted in while a signal + // arrives. See issue 42774. 
+ mlock(unsafe.Pointer(mp.gsignal.stack.hi-physPageSize), physPageSize) + } } // Called to initialize a new m (including the bootstrap m). // Called on the new thread, cannot allocate memory. func minit() { - // The alternate signal stack is buggy on arm64. + // iOS does not support alternate signal stack. // The signal handler handles it directly. - if GOARCH != "arm64" { + if !(GOOS == "ios" && GOARCH == "arm64") { minitSignalStack() } minitSignalMask() @@ -301,9 +318,9 @@ func minit() { // Called from dropm to undo the effect of an minit. //go:nosplit func unminit() { - // The alternate signal stack is buggy on arm64. + // iOS does not support alternate signal stack. // See minit. - if GOARCH != "arm64" { + if !(GOOS == "ios" && GOARCH == "arm64") { unminitSignals() } } diff --git a/src/runtime/os_freebsd_arm64.go b/src/runtime/os_freebsd_arm64.go index 51ebf9d478..b5b25f0dc5 100644 --- a/src/runtime/os_freebsd_arm64.go +++ b/src/runtime/os_freebsd_arm64.go @@ -4,149 +4,6 @@ package runtime -import "internal/cpu" - -const ( - hwcap_FP = 1 << 0 - hwcap_ASIMD = 1 << 1 - hwcap_EVTSTRM = 1 << 2 - hwcap_AES = 1 << 3 - hwcap_PMULL = 1 << 4 - hwcap_SHA1 = 1 << 5 - hwcap_SHA2 = 1 << 6 - hwcap_CRC32 = 1 << 7 - hwcap_ATOMICS = 1 << 8 - hwcap_FPHP = 1 << 9 - hwcap_ASIMDHP = 1 << 10 - hwcap_CPUID = 1 << 11 - hwcap_ASIMDRDM = 1 << 12 - hwcap_JSCVT = 1 << 13 - hwcap_FCMA = 1 << 14 - hwcap_LRCPC = 1 << 15 - hwcap_DCPOP = 1 << 16 - hwcap_SHA3 = 1 << 17 - hwcap_SM3 = 1 << 18 - hwcap_SM4 = 1 << 19 - hwcap_ASIMDDP = 1 << 20 - hwcap_SHA512 = 1 << 21 - hwcap_SVE = 1 << 22 - hwcap_ASIMDFHM = 1 << 23 -) - -func getisar0() uint64 -func getisar1() uint64 -func getpfr0() uint64 - -// no hwcap support on FreeBSD aarch64, we need to retrieve the info from -// ID_AA64ISAR0_EL1, ID_AA64ISAR1_EL1 and ID_AA64PFR0_EL1 -func archauxv(tag, val uintptr) { - var isar0, isar1, pfr0 uint64 - - isar0 = getisar0() - isar1 = getisar1() - pfr0 = getpfr0() - - // ID_AA64ISAR0_EL1 - switch 
extractBits(isar0, 4, 7) { - case 1: - cpu.HWCap |= hwcap_AES - case 2: - cpu.HWCap |= hwcap_PMULL | hwcap_AES - } - - switch extractBits(isar0, 8, 11) { - case 1: - cpu.HWCap |= hwcap_SHA1 - } - - switch extractBits(isar0, 12, 15) { - case 1: - cpu.HWCap |= hwcap_SHA2 - case 2: - cpu.HWCap |= hwcap_SHA2 | hwcap_SHA512 - } - - switch extractBits(isar0, 16, 19) { - case 1: - cpu.HWCap |= hwcap_CRC32 - } - - switch extractBits(isar0, 20, 23) { - case 2: - cpu.HWCap |= hwcap_ATOMICS - } - - switch extractBits(isar0, 28, 31) { - case 1: - cpu.HWCap |= hwcap_ASIMDRDM - } - - switch extractBits(isar0, 32, 35) { - case 1: - cpu.HWCap |= hwcap_SHA3 - } - - switch extractBits(isar0, 36, 39) { - case 1: - cpu.HWCap |= hwcap_SM3 - } - - switch extractBits(isar0, 40, 43) { - case 1: - cpu.HWCap |= hwcap_SM4 - } - - switch extractBits(isar0, 44, 47) { - case 1: - cpu.HWCap |= hwcap_ASIMDDP - } - - // ID_AA64ISAR1_EL1 - switch extractBits(isar1, 0, 3) { - case 1: - cpu.HWCap |= hwcap_DCPOP - } - - switch extractBits(isar1, 12, 15) { - case 1: - cpu.HWCap |= hwcap_JSCVT - } - - switch extractBits(isar1, 16, 19) { - case 1: - cpu.HWCap |= hwcap_FCMA - } - - switch extractBits(isar1, 20, 23) { - case 1: - cpu.HWCap |= hwcap_LRCPC - } - - // ID_AA64PFR0_EL1 - switch extractBits(pfr0, 16, 19) { - case 0: - cpu.HWCap |= hwcap_FP - case 1: - cpu.HWCap |= hwcap_FP | hwcap_FPHP - } - - switch extractBits(pfr0, 20, 23) { - case 0: - cpu.HWCap |= hwcap_ASIMD - case 1: - cpu.HWCap |= hwcap_ASIMD | hwcap_ASIMDHP - } - - switch extractBits(pfr0, 32, 35) { - case 1: - cpu.HWCap |= hwcap_SVE - } -} - -func extractBits(data uint64, start, end uint) uint { - return (uint)(data>>start) & ((1 << (end - start + 1)) - 1) -} - //go:nosplit func cputicks() int64 { // Currently cputicks() is used in blocking profiler and to seed fastrand(). 
diff --git a/src/runtime/os_freebsd_noauxv.go b/src/runtime/os_freebsd_noauxv.go index c6a49927c8..01efb9b7c9 100644 --- a/src/runtime/os_freebsd_noauxv.go +++ b/src/runtime/os_freebsd_noauxv.go @@ -3,7 +3,7 @@ // license that can be found in the LICENSE file. // +build freebsd -// +build !arm,!arm64 +// +build !arm package runtime diff --git a/src/runtime/os_js.go b/src/runtime/os_js.go index ff0ee3aa6b..94983b358d 100644 --- a/src/runtime/os_js.go +++ b/src/runtime/os_js.go @@ -59,7 +59,7 @@ func mpreinit(mp *m) { } //go:nosplit -func msigsave(mp *m) { +func sigsave(p *sigset) { } //go:nosplit diff --git a/src/runtime/os_linux_arm64.go b/src/runtime/os_linux_arm64.go index 19968dc164..c5fd742048 100644 --- a/src/runtime/os_linux_arm64.go +++ b/src/runtime/os_linux_arm64.go @@ -11,19 +11,7 @@ import "internal/cpu" func archauxv(tag, val uintptr) { switch tag { case _AT_HWCAP: - // arm64 doesn't have a 'cpuid' instruction equivalent and relies on - // HWCAP/HWCAP2 bits for hardware capabilities. - hwcap := uint(val) - if GOOS == "android" { - // The Samsung S9+ kernel reports support for atomics, but not all cores - // actually support them, resulting in SIGILL. See issue #28431. - // TODO(elias.naur): Only disable the optimization on bad chipsets. 
- const hwcap_ATOMICS = 1 << 8 - hwcap &= ^uint(hwcap_ATOMICS) - } - cpu.HWCap = hwcap - case _AT_HWCAP2: - cpu.HWCap2 = uint(val) + cpu.HWCap = uint(val) } } diff --git a/src/runtime/os_linux_be64.go b/src/runtime/os_linux_be64.go index 14fbad5d5f..9860002ee4 100644 --- a/src/runtime/os_linux_be64.go +++ b/src/runtime/os_linux_be64.go @@ -38,6 +38,7 @@ func sigdelset(mask *sigset, i int) { *mask &^= 1 << (uint(i) - 1) } +//go:nosplit func sigfillset(mask *uint64) { *mask = ^uint64(0) } diff --git a/src/runtime/os_linux_generic.go b/src/runtime/os_linux_generic.go index 14810e3cc3..e1d0952ddf 100644 --- a/src/runtime/os_linux_generic.go +++ b/src/runtime/os_linux_generic.go @@ -38,6 +38,7 @@ func sigdelset(mask *sigset, i int) { (*mask)[(i-1)/32] &^= 1 << ((uint32(i) - 1) & 31) } +//go:nosplit func sigfillset(mask *uint64) { *mask = ^uint64(0) } diff --git a/src/runtime/os_linux_mips64x.go b/src/runtime/os_linux_mips64x.go index 4ff66f9538..815a83a04b 100644 --- a/src/runtime/os_linux_mips64x.go +++ b/src/runtime/os_linux_mips64x.go @@ -48,6 +48,7 @@ func sigdelset(mask *sigset, i int) { (*mask)[(i-1)/64] &^= 1 << ((uint32(i) - 1) & 63) } +//go:nosplit func sigfillset(mask *[2]uint64) { (*mask)[0], (*mask)[1] = ^uint64(0), ^uint64(0) } diff --git a/src/runtime/os_linux_mipsx.go b/src/runtime/os_linux_mipsx.go index 87962ed982..00fb02e4bf 100644 --- a/src/runtime/os_linux_mipsx.go +++ b/src/runtime/os_linux_mipsx.go @@ -42,6 +42,7 @@ func sigdelset(mask *sigset, i int) { (*mask)[(i-1)/32] &^= 1 << ((uint32(i) - 1) & 31) } +//go:nosplit func sigfillset(mask *[4]uint32) { (*mask)[0], (*mask)[1], (*mask)[2], (*mask)[3] = ^uint32(0), ^uint32(0), ^uint32(0), ^uint32(0) } diff --git a/src/runtime/os_linux_s390x.go b/src/runtime/os_linux_s390x.go index ee18fd1dc2..b9651f186c 100644 --- a/src/runtime/os_linux_s390x.go +++ b/src/runtime/os_linux_s390x.go @@ -6,15 +6,10 @@ package runtime import "internal/cpu" -const ( - // bit masks taken from bits/hwcap.h - _HWCAP_S390_VX = 
2048 // vector facility -) - func archauxv(tag, val uintptr) { switch tag { - case _AT_HWCAP: // CPU capability bit flags - cpu.S390X.HasVX = val&_HWCAP_S390_VX != 0 + case _AT_HWCAP: + cpu.HWCap = uint(val) } } diff --git a/src/runtime/os_openbsd.go b/src/runtime/os_openbsd.go index cd3565df5b..d7960f4c91 100644 --- a/src/runtime/os_openbsd.go +++ b/src/runtime/os_openbsd.go @@ -236,7 +236,11 @@ func goenvs() { // Called to initialize a new m (including the bootstrap m). // Called on the parent thread (main thread in case of bootstrap), can allocate memory. func mpreinit(mp *m) { - mp.gsignal = malg(32 * 1024) + gsignalSize := int32(32 * 1024) + if GOARCH == "mips64" { + gsignalSize = int32(64 * 1024) + } + mp.gsignal = malg(gsignalSize) mp.gsignal.m = mp } diff --git a/src/runtime/os_openbsd_arm64.go b/src/runtime/os_openbsd_arm64.go index d559a2a3e5..d71de7d196 100644 --- a/src/runtime/os_openbsd_arm64.go +++ b/src/runtime/os_openbsd_arm64.go @@ -4,20 +4,9 @@ package runtime -import ( - "internal/cpu" -) - //go:nosplit func cputicks() int64 { // Currently cputicks() is used in blocking profiler and to seed runtime·fastrand(). // runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler. return nanotime() } - -func sysargs(argc int32, argv **byte) { - // OpenBSD does not have auxv, however we still need to initialise cpu.HWCaps. - // For now specify the bare minimum until we add some form of capabilities - // detection. See issue #31746. - cpu.HWCap = 1<<1 | 1<<0 // ASIMD, FP -} diff --git a/src/runtime/os_openbsd_mips64.go b/src/runtime/os_openbsd_mips64.go new file mode 100644 index 0000000000..ae220cd683 --- /dev/null +++ b/src/runtime/os_openbsd_mips64.go @@ -0,0 +1,12 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package runtime + +//go:nosplit +func cputicks() int64 { + // Currently cputicks() is used in blocking profiler and to seed runtime·fastrand(). + // runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler. + return nanotime() +} diff --git a/src/runtime/os_plan9.go b/src/runtime/os_plan9.go index 128c30adeb..62aecea060 100644 --- a/src/runtime/os_plan9.go +++ b/src/runtime/os_plan9.go @@ -92,9 +92,12 @@ func sigpanic() { } addr := note[i:] g.sigcode1 = uintptr(atolwhex(addr)) - if g.sigcode1 < 0x1000 || g.paniconfault { + if g.sigcode1 < 0x1000 { panicmem() } + if g.paniconfault { + panicmemAddr(g.sigcode1) + } print("unexpected fault address ", hex(g.sigcode1), "\n") throw("fault") case _SIGTRAP: @@ -181,7 +184,7 @@ func mpreinit(mp *m) { mp.errstr = (*byte)(mallocgc(_ERRMAX, nil, true)) } -func msigsave(mp *m) { +func sigsave(p *sigset) { } func msigrestore(sigmask sigset) { diff --git a/src/runtime/os_windows.go b/src/runtime/os_windows.go index a62e941229..ffb087f9db 100644 --- a/src/runtime/os_windows.go +++ b/src/runtime/os_windows.go @@ -21,6 +21,7 @@ const ( //go:cgo_import_dynamic runtime._CreateIoCompletionPort CreateIoCompletionPort%4 "kernel32.dll" //go:cgo_import_dynamic runtime._CreateThread CreateThread%6 "kernel32.dll" //go:cgo_import_dynamic runtime._CreateWaitableTimerA CreateWaitableTimerA%3 "kernel32.dll" +//go:cgo_import_dynamic runtime._CreateWaitableTimerExW CreateWaitableTimerExW%4 "kernel32.dll" //go:cgo_import_dynamic runtime._DuplicateHandle DuplicateHandle%7 "kernel32.dll" //go:cgo_import_dynamic runtime._ExitProcess ExitProcess%1 "kernel32.dll" //go:cgo_import_dynamic runtime._FreeEnvironmentStringsW FreeEnvironmentStringsW%1 "kernel32.dll" @@ -68,6 +69,7 @@ var ( _CreateIoCompletionPort, _CreateThread, _CreateWaitableTimerA, + _CreateWaitableTimerExW, _DuplicateHandle, _ExitProcess, _FreeEnvironmentStringsW, @@ -151,6 +153,8 @@ type mOS struct { waitsema uintptr // semaphore for parking on locks 
resumesema uintptr // semaphore to indicate suspend/resume + highResTimer uintptr // high resolution timer handle used in usleep + // preemptExtLock synchronizes preemptM with entry/exit from // external C code. // @@ -402,11 +406,21 @@ const osRelaxMinNS = 60 * 1e6 // osRelax is called by the scheduler when transitioning to and from // all Ps being idle. // -// On Windows, it adjusts the system-wide timer resolution. Go needs a +// Some versions of Windows have high resolution timer. For those +// versions osRelax is noop. +// For Windows versions without high resolution timer, osRelax +// adjusts the system-wide timer resolution. Go needs a // high resolution timer while running and there's little extra cost // if we're already using the CPU, but if all Ps are idle there's no // need to consume extra power to drive the high-res timer. func osRelax(relax bool) uint32 { + if haveHighResTimer { + // If the high resolution timer is available, the runtime uses the timer + // to sleep for short durations. This means there's no need to adjust + // the global clock frequency. + return 0 + } + if relax { return uint32(stdcall1(_timeEndPeriod, 1)) } else { @@ -414,6 +428,42 @@ func osRelax(relax bool) uint32 { } } +// haveHighResTimer indicates that the CreateWaitableTimerEx +// CREATE_WAITABLE_TIMER_HIGH_RESOLUTION flag is available. +var haveHighResTimer = false + +// createHighResTimer calls CreateWaitableTimerEx with +// CREATE_WAITABLE_TIMER_HIGH_RESOLUTION flag to create high +// resolution timer. createHighResTimer returns new timer +// handle or 0, if CreateWaitableTimerEx failed. 
+func createHighResTimer() uintptr { + const ( + // As per @jstarks, see + // https://github.com/golang/go/issues/8687#issuecomment-656259353 + _CREATE_WAITABLE_TIMER_HIGH_RESOLUTION = 0x00000002 + + _SYNCHRONIZE = 0x00100000 + _TIMER_QUERY_STATE = 0x0001 + _TIMER_MODIFY_STATE = 0x0002 + ) + return stdcall4(_CreateWaitableTimerExW, 0, 0, + _CREATE_WAITABLE_TIMER_HIGH_RESOLUTION, + _SYNCHRONIZE|_TIMER_QUERY_STATE|_TIMER_MODIFY_STATE) +} + +func initHighResTimer() { + if GOARCH == "arm" { + // TODO: Not yet implemented. + return + } + h := createHighResTimer() + if h != 0 { + haveHighResTimer = true + usleep2Addr = unsafe.Pointer(funcPC(usleep2HighRes)) + stdcall1(_CloseHandle, h) + } +} + func osinit() { asmstdcallAddr = unsafe.Pointer(funcPC(asmstdcall)) usleep2Addr = unsafe.Pointer(funcPC(usleep2)) @@ -429,6 +479,7 @@ func osinit() { stdcall2(_SetConsoleCtrlHandler, funcPC(ctrlhandler), 1) + initHighResTimer() timeBeginPeriodRetValue = osRelax(false) ncpu = getproccount() @@ -822,7 +873,7 @@ func mpreinit(mp *m) { } //go:nosplit -func msigsave(mp *m) { +func sigsave(p *sigset) { } //go:nosplit @@ -844,9 +895,20 @@ func minit() { var thandle uintptr stdcall7(_DuplicateHandle, currentProcess, currentThread, currentProcess, uintptr(unsafe.Pointer(&thandle)), 0, 0, _DUPLICATE_SAME_ACCESS) + // Configure usleep timer, if possible. + var timer uintptr + if haveHighResTimer { + timer = createHighResTimer() + if timer == 0 { + print("runtime: CreateWaitableTimerEx failed; errno=", getlasterror(), "\n") + throw("CreateWaitableTimerEx when creating timer failed") + } + } + mp := getg().m lock(&mp.threadLock) mp.thread = thandle + mp.highResTimer = timer unlock(&mp.threadLock) // Query the true stack base from the OS. 
Currently we're @@ -884,6 +946,10 @@ func unminit() { lock(&mp.threadLock) stdcall1(_CloseHandle, mp.thread) mp.thread = 0 + if mp.highResTimer != 0 { + stdcall1(_CloseHandle, mp.highResTimer) + mp.highResTimer = 0 + } unlock(&mp.threadLock) } @@ -976,9 +1042,12 @@ func stdcall7(fn stdFunction, a0, a1, a2, a3, a4, a5, a6 uintptr) uintptr { return stdcall(fn) } -// in sys_windows_386.s and sys_windows_amd64.s +// In sys_windows_386.s and sys_windows_amd64.s. func onosstack(fn unsafe.Pointer, arg uint32) + +// These are not callable functions. They should only be called via onosstack. func usleep2(usec uint32) +func usleep2HighRes(usec uint32) func switchtothread() var usleep2Addr unsafe.Pointer diff --git a/src/runtime/panic.go b/src/runtime/panic.go index 127843b081..aed17d6fc6 100644 --- a/src/runtime/panic.go +++ b/src/runtime/panic.go @@ -212,6 +212,11 @@ func panicmem() { panic(memoryError) } +func panicmemAddr(addr uintptr) { + panicCheck2("invalid memory address or nil pointer dereference") + panic(errorAddressString{msg: "invalid memory address or nil pointer dereference", addr: addr}) +} + // Create a new deferred function fn with siz bytes of arguments. // The compiler turns a defer statement into a call to this. //go:nosplit @@ -416,15 +421,6 @@ func newdefer(siz int32) *_defer { total := roundupsize(totaldefersize(uintptr(siz))) d = (*_defer)(mallocgc(total, deferType, true)) }) - if debugCachedWork { - // Duplicate the tail below so if there's a - // crash in checkPut we can tell if d was just - // allocated or came from the pool. 
- d.siz = siz - d.link = gp._defer - gp._defer = d - return d - } } d.siz = siz d.heap = true diff --git a/src/runtime/pprof/mprof_test.go b/src/runtime/pprof/mprof_test.go index f253f07def..c11a45fd69 100644 --- a/src/runtime/pprof/mprof_test.go +++ b/src/runtime/pprof/mprof_test.go @@ -70,7 +70,7 @@ func TestMemoryProfiler(t *testing.T) { runtime.MemProfileRate = oldRate }() - // Allocate a meg to ensure that mcache.next_sample is updated to 1. + // Allocate a meg to ensure that mcache.nextSample is updated to 1. for i := 0; i < 1024; i++ { memSink = make([]byte, 1024) } diff --git a/src/runtime/pprof/pprof_rusage.go b/src/runtime/pprof/pprof_rusage.go index d42e6ed473..7954673811 100644 --- a/src/runtime/pprof/pprof_rusage.go +++ b/src/runtime/pprof/pprof_rusage.go @@ -19,7 +19,7 @@ func addMaxRSS(w io.Writer) { switch runtime.GOOS { case "linux", "android": rssToBytes = 1024 - case "darwin": + case "darwin", "ios": rssToBytes = 1 default: panic("unsupported OS") diff --git a/src/runtime/pprof/pprof_test.go b/src/runtime/pprof/pprof_test.go index 7149bfb31f..b6ee160e84 100644 --- a/src/runtime/pprof/pprof_test.go +++ b/src/runtime/pprof/pprof_test.go @@ -13,7 +13,6 @@ import ( "internal/profile" "internal/testenv" "io" - "io/ioutil" "math/big" "os" "os/exec" @@ -262,7 +261,7 @@ func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []*profile.Loca // as interpreted by matches, and returns the parsed profile. 
func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []string, f func(dur time.Duration)) *profile.Profile { switch runtime.GOOS { - case "darwin": + case "darwin", "ios": switch runtime.GOARCH { case "arm64": // nothing @@ -280,12 +279,16 @@ func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []stri broken := false switch runtime.GOOS { - case "darwin", "dragonfly", "netbsd", "illumos", "solaris": + case "darwin", "ios", "dragonfly", "netbsd", "illumos", "solaris": broken = true case "openbsd": if runtime.GOARCH == "arm" || runtime.GOARCH == "arm64" { broken = true } + case "windows": + if runtime.GOARCH == "arm" { + broken = true // See https://golang.org/issues/42862 + } } maxDuration := 5 * time.Second @@ -1175,7 +1178,7 @@ func TestLabelRace(t *testing.T) { // Check that there is no deadlock when the program receives SIGPROF while in // 64bit atomics' critical section. Used to happen on mips{,le}. See #20146. func TestAtomicLoadStore64(t *testing.T) { - f, err := ioutil.TempFile("", "profatomic") + f, err := os.CreateTemp("", "profatomic") if err != nil { t.Fatalf("TempFile: %v", err) } @@ -1204,7 +1207,7 @@ func TestAtomicLoadStore64(t *testing.T) { func TestTracebackAll(t *testing.T) { // With gccgo, if a profiling signal arrives at the wrong time // during traceback, it may crash or hang. See issue #29448. - f, err := ioutil.TempFile("", "proftraceback") + f, err := os.CreateTemp("", "proftraceback") if err != nil { t.Fatalf("TempFile: %v", err) } diff --git a/src/runtime/pprof/proto.go b/src/runtime/pprof/proto.go index 8519af6985..bdb4454b6e 100644 --- a/src/runtime/pprof/proto.go +++ b/src/runtime/pprof/proto.go @@ -9,7 +9,7 @@ import ( "compress/gzip" "fmt" "io" - "io/ioutil" + "os" "runtime" "strconv" "time" @@ -575,7 +575,7 @@ func (b *profileBuilder) emitLocation() uint64 { // It saves the address ranges of the mappings in b.mem for use // when emitting locations. 
func (b *profileBuilder) readMapping() { - data, _ := ioutil.ReadFile("/proc/self/maps") + data, _ := os.ReadFile("/proc/self/maps") parseProcSelfMaps(data, b.addMapping) if len(b.mem) == 0 { // pprof expects a map entry, so fake one. b.addMappingEntry(0, 0, 0, "", "", true) diff --git a/src/runtime/pprof/proto_test.go b/src/runtime/pprof/proto_test.go index 3043d5353f..5eb1aab140 100644 --- a/src/runtime/pprof/proto_test.go +++ b/src/runtime/pprof/proto_test.go @@ -10,7 +10,6 @@ import ( "fmt" "internal/profile" "internal/testenv" - "io/ioutil" "os" "os/exec" "reflect" @@ -78,7 +77,7 @@ func testPCs(t *testing.T) (addr1, addr2 uint64, map1, map2 *profile.Mapping) { switch runtime.GOOS { case "linux", "android", "netbsd": // Figure out two addresses from /proc/self/maps. - mmap, err := ioutil.ReadFile("/proc/self/maps") + mmap, err := os.ReadFile("/proc/self/maps") if err != nil { t.Fatal(err) } diff --git a/src/runtime/preempt.go b/src/runtime/preempt.go index 17ef2c90d3..372185266f 100644 --- a/src/runtime/preempt.go +++ b/src/runtime/preempt.go @@ -58,11 +58,6 @@ import ( "unsafe" ) -// Keep in sync with cmd/compile/internal/gc/plive.go:go115ReduceLiveness. -const go115ReduceLiveness = true - -const go115RestartSeq = go115ReduceLiveness && true // enable restartable sequences - type suspendGState struct { g *g @@ -402,24 +397,12 @@ func isAsyncSafePoint(gp *g, pc, sp, lr uintptr) (bool, uintptr) { // use the LR for unwinding, which will be bad. return false, 0 } - var up int32 - var startpc uintptr - if !go115ReduceLiveness { - smi := pcdatavalue(f, _PCDATA_RegMapIndex, pc, nil) - if smi == _PCDATA_RegMapUnsafe { - // Unsafe-point marked by compiler. This includes - // atomic sequences (e.g., write barrier) and nosplit - // functions (except at calls). - return false, 0 - } - } else { - up, startpc = pcdatavalue2(f, _PCDATA_UnsafePoint, pc) - if up != _PCDATA_UnsafePointSafe { - // Unsafe-point marked by compiler. 
This includes - // atomic sequences (e.g., write barrier) and nosplit - // functions (except at calls). - return false, 0 - } + up, startpc := pcdatavalue2(f, _PCDATA_UnsafePoint, pc) + if up != _PCDATA_UnsafePointSafe { + // Unsafe-point marked by compiler. This includes + // atomic sequences (e.g., write barrier) and nosplit + // functions (except at calls). + return false, 0 } if fd := funcdata(f, _FUNCDATA_LocalsPointerMaps); fd == nil || fd == unsafe.Pointer(&no_pointers_stackmap) { // This is assembly code. Don't assume it's @@ -455,25 +438,17 @@ func isAsyncSafePoint(gp *g, pc, sp, lr uintptr) (bool, uintptr) { // in incrementally. return false, 0 } - if go115RestartSeq { - switch up { - case _PCDATA_Restart1, _PCDATA_Restart2: - // Restartable instruction sequence. Back off PC to - // the start PC. - if startpc == 0 || startpc > pc || pc-startpc > 20 { - throw("bad restart PC") - } - return true, startpc - case _PCDATA_RestartAtEntry: - // Restart from the function entry at resumption. - return true, f.entry - } - } else { - switch up { - case _PCDATA_Restart1, _PCDATA_Restart2, _PCDATA_RestartAtEntry: - // go115RestartSeq is not enabled. Treat it as unsafe point. - return false, 0 + switch up { + case _PCDATA_Restart1, _PCDATA_Restart2: + // Restartable instruction sequence. Back off PC to + // the start PC. + if startpc == 0 || startpc > pc || pc-startpc > 20 { + throw("bad restart PC") } + return true, startpc + case _PCDATA_RestartAtEntry: + // Restart from the function entry at resumption. 
+ return true, f.entry } return true, pc } diff --git a/src/runtime/preempt_386.s b/src/runtime/preempt_386.s index a00ac8f385..a803b24dc6 100644 --- a/src/runtime/preempt_386.s +++ b/src/runtime/preempt_386.s @@ -3,9 +3,10 @@ #include "go_asm.h" #include "textflag.h" -TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 +// Note: asyncPreempt doesn't use the internal ABI, but we must be able to inject calls to it from the signal handler, so Go code has to see the PC of this function literally. +TEXT ·asyncPreempt<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-0 PUSHFL - ADJSP $264 + ADJSP $156 NOP SP MOVL AX, 0(SP) MOVL CX, 4(SP) @@ -14,32 +15,29 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 MOVL BP, 16(SP) MOVL SI, 20(SP) MOVL DI, 24(SP) - FSAVE 28(SP) - FLDCW runtime·controlWord64(SB) CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1 JNE nosse - MOVUPS X0, 136(SP) - MOVUPS X1, 152(SP) - MOVUPS X2, 168(SP) - MOVUPS X3, 184(SP) - MOVUPS X4, 200(SP) - MOVUPS X5, 216(SP) - MOVUPS X6, 232(SP) - MOVUPS X7, 248(SP) + MOVUPS X0, 28(SP) + MOVUPS X1, 44(SP) + MOVUPS X2, 60(SP) + MOVUPS X3, 76(SP) + MOVUPS X4, 92(SP) + MOVUPS X5, 108(SP) + MOVUPS X6, 124(SP) + MOVUPS X7, 140(SP) nosse: CALL ·asyncPreempt2(SB) CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1 JNE nosse2 - MOVUPS 248(SP), X7 - MOVUPS 232(SP), X6 - MOVUPS 216(SP), X5 - MOVUPS 200(SP), X4 - MOVUPS 184(SP), X3 - MOVUPS 168(SP), X2 - MOVUPS 152(SP), X1 - MOVUPS 136(SP), X0 + MOVUPS 140(SP), X7 + MOVUPS 124(SP), X6 + MOVUPS 108(SP), X5 + MOVUPS 92(SP), X4 + MOVUPS 76(SP), X3 + MOVUPS 60(SP), X2 + MOVUPS 44(SP), X1 + MOVUPS 28(SP), X0 nosse2: - FRSTOR 28(SP) MOVL 24(SP), DI MOVL 20(SP), SI MOVL 16(SP), BP @@ -47,6 +45,6 @@ nosse2: MOVL 8(SP), DX MOVL 4(SP), CX MOVL 0(SP), AX - ADJSP $-264 + ADJSP $-156 POPFL RET diff --git a/src/runtime/preempt_amd64.s b/src/runtime/preempt_amd64.s index 4765e9f448..92c664d79a 100644 --- a/src/runtime/preempt_amd64.s +++ b/src/runtime/preempt_amd64.s @@ -3,7 +3,8 @@ #include "go_asm.h" 
#include "textflag.h" -TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 +// Note: asyncPreempt doesn't use the internal ABI, but we must be able to inject calls to it from the signal handler, so Go code has to see the PC of this function literally. +TEXT ·asyncPreempt<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-0 PUSHQ BP MOVQ SP, BP // Save flags before clobbering them diff --git a/src/runtime/preempt_arm.s b/src/runtime/preempt_arm.s index 8f243c0dcd..bbc9fbb1ea 100644 --- a/src/runtime/preempt_arm.s +++ b/src/runtime/preempt_arm.s @@ -3,7 +3,8 @@ #include "go_asm.h" #include "textflag.h" -TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 +// Note: asyncPreempt doesn't use the internal ABI, but we must be able to inject calls to it from the signal handler, so Go code has to see the PC of this function literally. +TEXT ·asyncPreempt<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-0 MOVW.W R14, -188(R13) MOVW R0, 4(R13) MOVW R1, 8(R13) diff --git a/src/runtime/preempt_arm64.s b/src/runtime/preempt_arm64.s index 3c27b52de1..2b70a28479 100644 --- a/src/runtime/preempt_arm64.s +++ b/src/runtime/preempt_arm64.s @@ -3,14 +3,15 @@ #include "go_asm.h" #include "textflag.h" -TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 +// Note: asyncPreempt doesn't use the internal ABI, but we must be able to inject calls to it from the signal handler, so Go code has to see the PC of this function literally. 
+TEXT ·asyncPreempt<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-0 MOVD R30, -496(RSP) SUB $496, RSP #ifdef GOOS_linux MOVD R29, -8(RSP) SUB $8, RSP, R29 #endif - #ifdef GOOS_darwin + #ifdef GOOS_ios MOVD R30, (RSP) #endif MOVD R0, 8(RSP) diff --git a/src/runtime/preempt_mips64x.s b/src/runtime/preempt_mips64x.s index 1e123e8077..0d0c157c36 100644 --- a/src/runtime/preempt_mips64x.s +++ b/src/runtime/preempt_mips64x.s @@ -5,7 +5,8 @@ #include "go_asm.h" #include "textflag.h" -TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 +// Note: asyncPreempt doesn't use the internal ABI, but we must be able to inject calls to it from the signal handler, so Go code has to see the PC of this function literally. +TEXT ·asyncPreempt<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-0 MOVV R31, -488(R29) SUBV $488, R29 MOVV R1, 8(R29) diff --git a/src/runtime/preempt_mipsx.s b/src/runtime/preempt_mipsx.s index afac33e0a0..86d3a918d3 100644 --- a/src/runtime/preempt_mipsx.s +++ b/src/runtime/preempt_mipsx.s @@ -5,7 +5,8 @@ #include "go_asm.h" #include "textflag.h" -TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 +// Note: asyncPreempt doesn't use the internal ABI, but we must be able to inject calls to it from the signal handler, so Go code has to see the PC of this function literally. +TEXT ·asyncPreempt<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-0 MOVW R31, -244(R29) SUB $244, R29 MOVW R1, 4(R29) diff --git a/src/runtime/preempt_ppc64x.s b/src/runtime/preempt_ppc64x.s index b2d7e30ec7..90634386db 100644 --- a/src/runtime/preempt_ppc64x.s +++ b/src/runtime/preempt_ppc64x.s @@ -5,7 +5,8 @@ #include "go_asm.h" #include "textflag.h" -TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 +// Note: asyncPreempt doesn't use the internal ABI, but we must be able to inject calls to it from the signal handler, so Go code has to see the PC of this function literally. 
+TEXT ·asyncPreempt<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-0 MOVD R31, -488(R1) MOVD LR, R31 MOVDU R31, -520(R1) diff --git a/src/runtime/preempt_riscv64.s b/src/runtime/preempt_riscv64.s index 0338c22a94..d4f9cc277f 100644 --- a/src/runtime/preempt_riscv64.s +++ b/src/runtime/preempt_riscv64.s @@ -3,9 +3,10 @@ #include "go_asm.h" #include "textflag.h" -TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 - MOV X1, -480(X2) - ADD $-480, X2 +// Note: asyncPreempt doesn't use the internal ABI, but we must be able to inject calls to it from the signal handler, so Go code has to see the PC of this function literally. +TEXT ·asyncPreempt<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-0 + MOV X1, -472(X2) + ADD $-472, X2 MOV X3, 8(X2) MOV X5, 16(X2) MOV X6, 24(X2) @@ -29,79 +30,77 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 MOV X24, 168(X2) MOV X25, 176(X2) MOV X26, 184(X2) - MOV X27, 192(X2) - MOV X28, 200(X2) - MOV X29, 208(X2) - MOV X30, 216(X2) - MOVD F0, 224(X2) - MOVD F1, 232(X2) - MOVD F2, 240(X2) - MOVD F3, 248(X2) - MOVD F4, 256(X2) - MOVD F5, 264(X2) - MOVD F6, 272(X2) - MOVD F7, 280(X2) - MOVD F8, 288(X2) - MOVD F9, 296(X2) - MOVD F10, 304(X2) - MOVD F11, 312(X2) - MOVD F12, 320(X2) - MOVD F13, 328(X2) - MOVD F14, 336(X2) - MOVD F15, 344(X2) - MOVD F16, 352(X2) - MOVD F17, 360(X2) - MOVD F18, 368(X2) - MOVD F19, 376(X2) - MOVD F20, 384(X2) - MOVD F21, 392(X2) - MOVD F22, 400(X2) - MOVD F23, 408(X2) - MOVD F24, 416(X2) - MOVD F25, 424(X2) - MOVD F26, 432(X2) - MOVD F27, 440(X2) - MOVD F28, 448(X2) - MOVD F29, 456(X2) - MOVD F30, 464(X2) - MOVD F31, 472(X2) + MOV X28, 192(X2) + MOV X29, 200(X2) + MOV X30, 208(X2) + MOVD F0, 216(X2) + MOVD F1, 224(X2) + MOVD F2, 232(X2) + MOVD F3, 240(X2) + MOVD F4, 248(X2) + MOVD F5, 256(X2) + MOVD F6, 264(X2) + MOVD F7, 272(X2) + MOVD F8, 280(X2) + MOVD F9, 288(X2) + MOVD F10, 296(X2) + MOVD F11, 304(X2) + MOVD F12, 312(X2) + MOVD F13, 320(X2) + MOVD F14, 328(X2) + MOVD F15, 336(X2) + MOVD F16, 344(X2) + MOVD F17, 352(X2) + MOVD F18, 360(X2) + 
MOVD F19, 368(X2) + MOVD F20, 376(X2) + MOVD F21, 384(X2) + MOVD F22, 392(X2) + MOVD F23, 400(X2) + MOVD F24, 408(X2) + MOVD F25, 416(X2) + MOVD F26, 424(X2) + MOVD F27, 432(X2) + MOVD F28, 440(X2) + MOVD F29, 448(X2) + MOVD F30, 456(X2) + MOVD F31, 464(X2) CALL ·asyncPreempt2(SB) - MOVD 472(X2), F31 - MOVD 464(X2), F30 - MOVD 456(X2), F29 - MOVD 448(X2), F28 - MOVD 440(X2), F27 - MOVD 432(X2), F26 - MOVD 424(X2), F25 - MOVD 416(X2), F24 - MOVD 408(X2), F23 - MOVD 400(X2), F22 - MOVD 392(X2), F21 - MOVD 384(X2), F20 - MOVD 376(X2), F19 - MOVD 368(X2), F18 - MOVD 360(X2), F17 - MOVD 352(X2), F16 - MOVD 344(X2), F15 - MOVD 336(X2), F14 - MOVD 328(X2), F13 - MOVD 320(X2), F12 - MOVD 312(X2), F11 - MOVD 304(X2), F10 - MOVD 296(X2), F9 - MOVD 288(X2), F8 - MOVD 280(X2), F7 - MOVD 272(X2), F6 - MOVD 264(X2), F5 - MOVD 256(X2), F4 - MOVD 248(X2), F3 - MOVD 240(X2), F2 - MOVD 232(X2), F1 - MOVD 224(X2), F0 - MOV 216(X2), X30 - MOV 208(X2), X29 - MOV 200(X2), X28 - MOV 192(X2), X27 + MOVD 464(X2), F31 + MOVD 456(X2), F30 + MOVD 448(X2), F29 + MOVD 440(X2), F28 + MOVD 432(X2), F27 + MOVD 424(X2), F26 + MOVD 416(X2), F25 + MOVD 408(X2), F24 + MOVD 400(X2), F23 + MOVD 392(X2), F22 + MOVD 384(X2), F21 + MOVD 376(X2), F20 + MOVD 368(X2), F19 + MOVD 360(X2), F18 + MOVD 352(X2), F17 + MOVD 344(X2), F16 + MOVD 336(X2), F15 + MOVD 328(X2), F14 + MOVD 320(X2), F13 + MOVD 312(X2), F12 + MOVD 304(X2), F11 + MOVD 296(X2), F10 + MOVD 288(X2), F9 + MOVD 280(X2), F8 + MOVD 272(X2), F7 + MOVD 264(X2), F6 + MOVD 256(X2), F5 + MOVD 248(X2), F4 + MOVD 240(X2), F3 + MOVD 232(X2), F2 + MOVD 224(X2), F1 + MOVD 216(X2), F0 + MOV 208(X2), X30 + MOV 200(X2), X29 + MOV 192(X2), X28 MOV 184(X2), X26 MOV 176(X2), X25 MOV 168(X2), X24 @@ -125,7 +124,7 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 MOV 24(X2), X6 MOV 16(X2), X5 MOV 8(X2), X3 - MOV 480(X2), X1 + MOV 472(X2), X1 MOV (X2), X31 - ADD $488, X2 + ADD $480, X2 JMP (X31) diff --git a/src/runtime/preempt_s390x.s b/src/runtime/preempt_s390x.s 
index ca9e47cde1..c6f11571df 100644 --- a/src/runtime/preempt_s390x.s +++ b/src/runtime/preempt_s390x.s @@ -3,7 +3,8 @@ #include "go_asm.h" #include "textflag.h" -TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 +// Note: asyncPreempt doesn't use the internal ABI, but we must be able to inject calls to it from the signal handler, so Go code has to see the PC of this function literally. +TEXT ·asyncPreempt<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-0 IPM R10 MOVD R14, -248(R15) ADD $-248, R15 diff --git a/src/runtime/preempt_wasm.s b/src/runtime/preempt_wasm.s index 0cf57d3d22..da90e8aa6d 100644 --- a/src/runtime/preempt_wasm.s +++ b/src/runtime/preempt_wasm.s @@ -3,6 +3,7 @@ #include "go_asm.h" #include "textflag.h" -TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 +// Note: asyncPreempt doesn't use the internal ABI, but we must be able to inject calls to it from the signal handler, so Go code has to see the PC of this function literally. +TEXT ·asyncPreempt<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-0 // No async preemption on wasm UNDEF diff --git a/src/runtime/print.go b/src/runtime/print.go index e605eb34cb..64055a34cc 100644 --- a/src/runtime/print.go +++ b/src/runtime/print.go @@ -237,6 +237,9 @@ func printhex(v uint64) { func printpointer(p unsafe.Pointer) { printhex(uint64(uintptr(p))) } +func printuintptr(p uintptr) { + printhex(uint64(p)) +} func printstring(s string) { gwrite(bytes(s)) diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 341d52aea8..64e102fb0a 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -128,10 +128,19 @@ func main() { maxstacksize = 250000000 } + // An upper limit for max stack size. Used to avoid random crashes + // after calling SetMaxStack and trying to allocate a stack that is too big, + // since stackalloc works with 32-bit sizes. + maxstackceiling = 2 * maxstacksize + // Allow newproc to start new Ms. 
mainStarted = true if GOARCH != "wasm" { // no threads on wasm yet, so no sysmon + // For runtime_syscall_doAllThreadsSyscall, we + // register sysmon is not ready for the world to be + // stopped. + atomic.Store(&sched.sysmonStarting, 1) systemstack(func() { newm(sysmon, nil, -1) }) @@ -148,12 +157,22 @@ func main() { if g.m != &m0 { throw("runtime.main not on m0") } + m0.doesPark = true - doInit(&runtime_inittask) // must be before defer - if nanotime() == 0 { + // Record when the world started. + // Must be before doInit for tracing init. + runtimeInitTime = nanotime() + if runtimeInitTime == 0 { throw("nanotime returning zero") } + if debug.inittrace != 0 { + inittrace.id = getg().goid + inittrace.active = true + } + + doInit(&runtime_inittask) // Must be before defer. + // Defer unlock so that runtime.Goexit during init does the unlock too. needUnlock := true defer func() { @@ -162,9 +181,6 @@ func main() { } }() - // Record when the world started. - runtimeInitTime = nanotime() - gcenable() main_init_done = make(chan bool) @@ -191,6 +207,10 @@ func main() { doInit(&main_inittask) + // Disable init tracing after main init done to avoid overhead + // of collecting statistics in malloc and newproc + inittrace.active = false + close(main_init_done) needUnlock = false @@ -279,14 +299,23 @@ func goschedguarded() { mcall(goschedguarded_m) } -// Puts the current goroutine into a waiting state and calls unlockf. +// Puts the current goroutine into a waiting state and calls unlockf on the +// system stack. +// // If unlockf returns false, the goroutine is resumed. +// // unlockf must not access this G's stack, as it may be moved between // the call to gopark and the call to unlockf. -// Reason explains why the goroutine has been parked. -// It is displayed in stack traces and heap dumps. -// Reasons should be unique and descriptive. -// Do not re-use reasons, add new ones. 
+// +// Note that because unlockf is called after putting the G into a waiting +// state, the G may have already been readied by the time unlockf is called +// unless there is external synchronization preventing the G from being +// readied. If unlockf returns false, it must guarantee that the G cannot be +// externally readied. +// +// Reason explains why the goroutine has been parked. It is displayed in stack +// traces and heap dumps. Reasons should be unique and descriptive. Do not +// re-use reasons, add new ones. func gopark(unlockf func(*g, unsafe.Pointer) bool, lock unsafe.Pointer, reason waitReason, traceEv byte, traceskip int) { if reason != waitReasonSleep { checkTimeouts() // timeouts may expire while two goroutines keep the scheduler busy @@ -489,7 +518,7 @@ func cpuinit() { var env string switch GOOS { - case "aix", "darwin", "dragonfly", "freebsd", "netbsd", "openbsd", "illumos", "solaris", "linux": + case "aix", "darwin", "ios", "dragonfly", "freebsd", "netbsd", "openbsd", "illumos", "solaris", "linux": cpu.DebugOptions = true // Similar to goenv_unix but extracts the environment value for @@ -548,6 +577,10 @@ func schedinit() { lockInit(&trace.lock, lockRankTrace) lockInit(&cpuprof.lock, lockRankCpuprof) lockInit(&trace.stackTab.lock, lockRankTraceStackTab) + // Enforce that this lock is always a leaf lock. + // All of this lock's critical sections should be + // extremely short. + lockInit(&memstats.heapStats.noPLock, lockRankLeafRank) // raceinit must be the first call to race detector. // In particular, it must be done before mallocinit below calls racemapshadow. @@ -558,6 +591,9 @@ func schedinit() { sched.maxmcount = 10000 + // The world starts stopped. 
+ worldStopped() + moduledataverify() stackinit() mallocinit() @@ -569,7 +605,7 @@ func schedinit() { typelinksinit() // uses maps, activeModules itabsinit() // uses activeModules - msigsave(_g_.m) + sigsave(&_g_.m.sigmask) initSigmask = _g_.m.sigmask goargs() @@ -577,6 +613,7 @@ func schedinit() { parsedebugvars() gcinit() + lock(&sched.lock) sched.lastpoll = uint64(nanotime()) procs := ncpu if n, ok := atoi32(gogetenv("GOMAXPROCS")); ok && n > 0 { @@ -585,6 +622,10 @@ func schedinit() { if procresize(procs) != nil { throw("unknown runnable goroutine during bootstrap") } + unlock(&sched.lock) + + // World is effectively started now, as P's can run. + worldStarted() // For cgocheck > 1, we turn on the write barrier at all times // and check all pointer writes. We can't do this until after @@ -615,8 +656,10 @@ func dumpgstatus(gp *g) { print("runtime: g: g=", _g_, ", goid=", _g_.goid, ", g->atomicstatus=", readgstatus(_g_), "\n") } +// sched.lock must be held. func checkmcount() { - // sched lock is held + assertLockHeld(&sched.lock) + if mcount() > sched.maxmcount { print("runtime: program exceeds ", sched.maxmcount, "-thread limit\n") throw("thread exhaustion") @@ -628,6 +671,8 @@ func checkmcount() { // // sched.lock must be held. func mReserveID() int64 { + assertLockHeld(&sched.lock) + if sched.mnext+1 < sched.mnext { throw("runtime: thread ID overflow") } @@ -920,10 +965,26 @@ func stopTheWorld(reason string) { // startTheWorld undoes the effects of stopTheWorld. func startTheWorld() { systemstack(func() { startTheWorldWithSema(false) }) + // worldsema must be held over startTheWorldWithSema to ensure // gomaxprocs cannot change while worldsema is held. - semrelease(&worldsema) - getg().m.preemptoff = "" + // + // Release worldsema with direct handoff to the next waiter, but + // acquirem so that semrelease1 doesn't try to yield our time. + // + // Otherwise if e.g. 
ReadMemStats is being called in a loop, + // it might stomp on other attempts to stop the world, such as + // for starting or ending GC. The operation this blocks is + // so heavy-weight that we should just try to be as fair as + // possible here. + // + // We don't want to just allow us to get preempted between now + // and releasing the semaphore because then we keep everyone + // (including, for example, GCs) waiting longer. + mp := acquirem() + mp.preemptoff = "" + semrelease1(&worldsema, true, 0) + releasem(mp) } // stopTheWorldGC has the same effect as stopTheWorld, but blocks @@ -1047,9 +1108,13 @@ func stopTheWorldWithSema() { if bad != "" { throw(bad) } + + worldStopped() } func startTheWorldWithSema(emitTraceEvent bool) int64 { + assertWorldStopped() + mp := acquirem() // disable preemption because it can be holding p in a local var if netpollinited() { list := netpoll(0) // non-blocking @@ -1070,6 +1135,8 @@ func startTheWorldWithSema(emitTraceEvent bool) int64 { } unlock(&sched.lock) + worldStarted() + for p1 != nil { p := p1 p1 = p1.link.ptr() @@ -1138,7 +1205,7 @@ func mstart() { // Exit this thread. switch GOOS { - case "windows", "solaris", "illumos", "plan9", "darwin", "aix": + case "windows", "solaris", "illumos", "plan9", "darwin", "ios", "aix": // Windows, Solaris, illumos, Darwin, AIX and Plan 9 always system-allocate // the stack, but put it in _g_.stack before mstart, // so the logic above hasn't set osStack yet. @@ -1196,6 +1263,21 @@ func mstartm0() { initsig(false) } +// mPark causes a thread to park itself - temporarily waking for +// fixups but otherwise waiting to be fully woken. This is the +// only way that m's should park themselves. +//go:nosplit +func mPark() { + g := getg() + for { + notesleep(&g.m.park) + noteclear(&g.m.park) + if !mDoFixup() { + return + } + } +} + // mexit tears down and exits the current thread. 
// // Don't call this directly to exit the thread, since it must run at @@ -1227,7 +1309,7 @@ func mexit(osStack bool) { sched.nmfreed++ checkdead() unlock(&sched.lock) - notesleep(&m.park) + mPark() throw("locked m0 woke up") } @@ -1281,6 +1363,14 @@ found: checkdead() unlock(&sched.lock) + if GOOS == "darwin" || GOOS == "ios" { + // Make sure pendingPreemptSignals is correct when an M exits. + // For #41702. + if atomic.Load(&m.signalPending) != 0 { + atomic.Xadd(&pendingPreemptSignals, -1) + } + } + if osStack { // Return from mstart and let the system thread // library free the g0 stack and terminate the thread. @@ -1386,6 +1476,127 @@ func forEachP(fn func(*p)) { releasem(mp) } +// syscall_runtime_doAllThreadsSyscall serializes Go execution and +// executes a specified fn() call on all m's. +// +// The boolean argument to fn() indicates whether the function's +// return value will be consulted or not. That is, fn(true) should +// return true if fn() succeeds, and fn(true) should return false if +// it failed. When fn(false) is called, its return status will be +// ignored. +// +// syscall_runtime_doAllThreadsSyscall first invokes fn(true) on a +// single, coordinating, m, and only if it returns true does it go on +// to invoke fn(false) on all of the other m's known to the process. +// +//go:linkname syscall_runtime_doAllThreadsSyscall syscall.runtime_doAllThreadsSyscall +func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) { + if iscgo { + panic("doAllThreadsSyscall not supported with cgo enabled") + } + if fn == nil { + return + } + for atomic.Load(&sched.sysmonStarting) != 0 { + osyield() + } + stopTheWorldGC("doAllThreadsSyscall") + if atomic.Load(&newmHandoff.haveTemplateThread) != 0 { + // Ensure that there are no in-flight thread + // creations: don't want to race with allm. 
+ lock(&newmHandoff.lock) + for !newmHandoff.waiting { + unlock(&newmHandoff.lock) + osyield() + lock(&newmHandoff.lock) + } + unlock(&newmHandoff.lock) + } + if netpollinited() { + netpollBreak() + } + _g_ := getg() + if raceenabled { + // For m's running without racectx, we loan out the + // racectx of this call. + lock(&mFixupRace.lock) + mFixupRace.ctx = _g_.racectx + unlock(&mFixupRace.lock) + } + if ok := fn(true); ok { + tid := _g_.m.procid + for mp := allm; mp != nil; mp = mp.alllink { + if mp.procid == tid { + // This m has already completed fn() + // call. + continue + } + // Be wary of mp's without procid values if + // they are known not to park. If they are + // marked as parking with a zero procid, then + // they will be racing with this code to be + // allocated a procid and we will annotate + // them with the need to execute the fn when + // they acquire a procid to run it. + if mp.procid == 0 && !mp.doesPark { + // Reaching here, we are either + // running Windows, or cgo linked + // code. Neither of which are + // currently supported by this API. + throw("unsupported runtime environment") + } + // stopTheWorldGC() doesn't guarantee stopping + // all the threads, so we lock here to avoid + // the possibility of racing with mp. + lock(&mp.mFixup.lock) + mp.mFixup.fn = fn + if mp.doesPark { + // For non-service threads this will + // cause the wakeup to be short lived + // (once the mutex is unlocked). The + // next real wakeup will occur after + // startTheWorldGC() is called. + notewakeup(&mp.park) + } + unlock(&mp.mFixup.lock) + } + for { + done := true + for mp := allm; done && mp != nil; mp = mp.alllink { + if mp.procid == tid { + continue + } + lock(&mp.mFixup.lock) + done = done && (mp.mFixup.fn == nil) + unlock(&mp.mFixup.lock) + } + if done { + break + } + // if needed force sysmon and/or newmHandoff to wakeup. 
+ lock(&sched.lock) + if atomic.Load(&sched.sysmonwait) != 0 { + atomic.Store(&sched.sysmonwait, 0) + notewakeup(&sched.sysmonnote) + } + unlock(&sched.lock) + lock(&newmHandoff.lock) + if newmHandoff.waiting { + newmHandoff.waiting = false + notewakeup(&newmHandoff.wake) + } + unlock(&newmHandoff.lock) + osyield() + } + } + if raceenabled { + lock(&mFixupRace.lock) + mFixupRace.ctx = 0 + unlock(&mFixupRace.lock) + } + startTheWorldGC() +} + // runSafePointFn runs the safe point function, if any, for this P. // This should be called like // @@ -1454,7 +1665,12 @@ func allocm(_p_ *p, fn func(), id int64) *m { freem = next continue } - stackfree(freem.g0.stack) + // stackfree must be on the system stack, but allocm is + // reachable off the system stack transitively from + // startm. + systemstack(func() { + stackfree(freem.g0.stack) + }) freem = freem.freelink } sched.freem = newList @@ -1467,7 +1683,7 @@ func allocm(_p_ *p, fn func(), id int64) *m { // In case of cgo or Solaris or illumos or Darwin, pthread_create will make us a stack. // Windows and Plan 9 will layout sched stack on OS stack. - if iscgo || GOOS == "solaris" || GOOS == "illumos" || GOOS == "windows" || GOOS == "plan9" || GOOS == "darwin" { + if iscgo || GOOS == "solaris" || GOOS == "illumos" || GOOS == "windows" || GOOS == "plan9" || GOOS == "darwin" || GOOS == "ios" { mp.g0 = malg(-1) } else { mp.g0 = malg(8192 * sys.StackGuardMultiplier) @@ -1516,7 +1732,7 @@ func allocm(_p_ *p, fn func(), id int64) *m { // When the callback is done with the m, it calls dropm to // put the m back on the list. //go:nosplit -func needm(x byte) { +func needm() { if (iscgo || GOOS == "windows") && !cgoHasExtraM { // Can happen if C/C++ code calls Go from a global ctor. // Can also happen on Windows if a global ctor uses a @@ -1528,6 +1744,18 @@ func needm(x byte) { exit(1) } + // Save and block signals before getting an M. + // The signal handler may call needm itself, + // and we must avoid a deadlock. 
Also, once g is installed, + // any incoming signals will try to execute, + // but we won't have the sigaltstack settings and other data + // set up appropriately until the end of minit, which will + // unblock the signals. This is the same dance as when + // starting a new m to run Go code via newosproc. + var sigmask sigset + sigsave(&sigmask) + sigblock() + // Lock extra list, take head, unlock popped list. // nilokay=false is safe here because of the invariant above, // that the extra list always contains or will soon contain @@ -1545,14 +1773,8 @@ func needm(x byte) { extraMCount-- unlockextra(mp.schedlink.ptr()) - // Save and block signals before installing g. - // Once g is installed, any incoming signals will try to execute, - // but we won't have the sigaltstack settings and other data - // set up appropriately until the end of minit, which will - // unblock the signals. This is the same dance as when - // starting a new m to run Go code via newosproc. - msigsave(mp) - sigblock() + // Store the original signal mask for use by minit. + mp.sigmask = sigmask // Install g (= m->g0) and set the stack bounds // to match the current stack. We don't actually know @@ -1561,8 +1783,8 @@ func needm(x byte) { // which is more than enough for us. setg(mp.g0) _g_ := getg() - _g_.stack.hi = uintptr(noescape(unsafe.Pointer(&x))) + 1024 - _g_.stack.lo = uintptr(noescape(unsafe.Pointer(&x))) - 32*1024 + _g_.stack.hi = getcallersp() + 1024 + _g_.stack.lo = getcallersp() - 32*1024 _g_.stackguard0 = _g_.stack.lo + _StackGuard // Initialize this thread to use the m. 
@@ -1778,6 +2000,7 @@ var newmHandoff struct { //go:nowritebarrierrec func newm(fn func(), _p_ *p, id int64) { mp := allocm(_p_, fn, id) + mp.doesPark = (_p_ != nil) mp.nextp.set(_p_) mp.sigmask = initSigmask if gp := getg(); gp != nil && gp.m != nil && (gp.m.lockedExt != 0 || gp.m.incgo) && GOOS != "plan9" { @@ -1850,6 +2073,57 @@ func startTemplateThread() { releasem(mp) } +// mFixupRace is used to temporarily borrow the race context from the +// coordinating m during a syscall_runtime_doAllThreadsSyscall and +// loan it out to each of the m's of the runtime so they can execute a +// mFixup.fn in that context. +var mFixupRace struct { + lock mutex + ctx uintptr +} + +// mDoFixup runs any outstanding fixup function for the running m. +// Returns true if a fixup was outstanding and actually executed. +// +//go:nosplit +func mDoFixup() bool { + _g_ := getg() + lock(&_g_.m.mFixup.lock) + fn := _g_.m.mFixup.fn + if fn != nil { + if gcphase != _GCoff { + // We can't have a write barrier in this + // context since we may not have a P, but we + // clear fn to signal that we've executed the + // fixup. As long as fn is kept alive + // elsewhere, technically we should have no + // issues with the GC, but fn is likely + // generated in a different package altogether + // that may change independently. Just assert + // the GC is off so this lack of write barrier + // is more obviously safe. + throw("GC must be disabled to protect validity of fn value") + } + *(*uintptr)(unsafe.Pointer(&_g_.m.mFixup.fn)) = 0 + if _g_.racectx != 0 || !raceenabled { + fn(false) + } else { + // temporarily acquire the context of the + // originator of the + // syscall_runtime_doAllThreadsSyscall and + // block others from using it for the duration + // of the fixup call. 
+ lock(&mFixupRace.lock) + _g_.racectx = mFixupRace.ctx + fn(false) + _g_.racectx = 0 + unlock(&mFixupRace.lock) + } + } + unlock(&_g_.m.mFixup.lock) + return fn != nil +} + // templateThread is a thread in a known-good state that exists solely // to start new threads in known-good states when the calling thread // may not be in a good state. @@ -1886,6 +2160,7 @@ func templateThread() { noteclear(&newmHandoff.wake) unlock(&newmHandoff.lock) notesleep(&newmHandoff.wake) + mDoFixup() } } @@ -1907,8 +2182,7 @@ func stopm() { lock(&sched.lock) mput(_g_.m) unlock(&sched.lock) - notesleep(&_g_.m.park) - noteclear(&_g_.m.park) + mPark() acquirep(_g_.m.nextp.ptr()) _g_.m.nextp = 0 } @@ -1923,8 +2197,30 @@ func mspinning() { // May run with m.p==nil, so write barriers are not allowed. // If spinning is set, the caller has incremented nmspinning and startm will // either decrement nmspinning or set m.spinning in the newly started M. +// +// Callers passing a non-nil P must call from a non-preemptible context. See +// comment on acquirem below. +// +// Must not have write barriers because this may be called without a P. //go:nowritebarrierrec func startm(_p_ *p, spinning bool) { + // Disable preemption. + // + // Every owned P must have an owner that will eventually stop it in the + // event of a GC stop request. startm takes transient ownership of a P + // (either from argument or pidleget below) and transfers ownership to + // a started M, which will be responsible for performing the stop. + // + // Preemption must be disabled during this transient ownership, + // otherwise the P this is running on may enter GC stop while still + // holding the transient P, leaving that P in limbo and deadlocking the + // STW. + // + // Callers passing a non-nil P must already be in non-preemptible + // context, otherwise such preemption could occur on function entry to + // startm. 
Callers passing a nil P may be preemptible, so we must + // disable preemption before acquiring a P from pidleget below. + mp := acquirem() lock(&sched.lock) if _p_ == nil { _p_ = pidleget() @@ -1937,11 +2233,12 @@ func startm(_p_ *p, spinning bool) { throw("startm: negative nmspinning") } } + releasem(mp) return } } - mp := mget() - if mp == nil { + nmp := mget() + if nmp == nil { // No M is available, we must drop sched.lock and call newm. // However, we already own a P to assign to the M. // @@ -1963,22 +2260,28 @@ func startm(_p_ *p, spinning bool) { fn = mspinning } newm(fn, _p_, id) + // Ownership transfer of _p_ committed by start in newm. + // Preemption is now safe. + releasem(mp) return } unlock(&sched.lock) - if mp.spinning { + if nmp.spinning { throw("startm: m is spinning") } - if mp.nextp != 0 { + if nmp.nextp != 0 { throw("startm: m has p") } if spinning && !runqempty(_p_) { throw("startm: p has runnable gs") } // The caller incremented nmspinning, so set m.spinning in the new M. - mp.spinning = spinning - mp.nextp.set(_p_) - notewakeup(&mp.park) + nmp.spinning = spinning + nmp.nextp.set(_p_) + notewakeup(&nmp.park) + // Ownership transfer of _p_ committed by wakeup. Preemption is now + // safe. + releasem(mp) } // Hands off P from syscall or locked M. @@ -2033,11 +2336,16 @@ func handoffp(_p_ *p) { startm(_p_, false) return } - if when := nobarrierWakeTime(_p_); when != 0 { - wakeNetPoller(when) - } + + // The scheduler lock cannot be held when calling wakeNetPoller below + // because wakeNetPoller may call wakep which may call startm. + when := nobarrierWakeTime(_p_) pidleput(_p_) unlock(&sched.lock) + + if when != 0 { + wakeNetPoller(when) + } } // Tries to add one more P to execute G's. @@ -2068,12 +2376,11 @@ func stoplockedm() { } incidlelocked(1) // Wait until another thread schedules lockedg again. 
- notesleep(&_g_.m.park) - noteclear(&_g_.m.park) + mPark() status := readgstatus(_g_.m.lockedg.ptr()) if status&^_Gscan != _Grunnable { - print("runtime:stoplockedm: g is not Grunnable or Gscanrunnable\n") - dumpgstatus(_g_) + print("runtime:stoplockedm: lockedg (atomicstatus=", status, ") is not Grunnable or Gscanrunnable\n") + dumpgstatus(_g_.m.lockedg.ptr()) throw("stoplockedm: not runnable") } acquirep(_g_.m.nextp.ptr()) @@ -2247,31 +2554,33 @@ top: _g_.m.spinning = true atomic.Xadd(&sched.nmspinning, 1) } - for i := 0; i < 4; i++ { + const stealTries = 4 + for i := 0; i < stealTries; i++ { + stealTimersOrRunNextG := i == stealTries-1 + for enum := stealOrder.start(fastrand()); !enum.done(); enum.next() { if sched.gcwaiting != 0 { goto top } - stealRunNextG := i > 2 // first look for ready queues with more than 1 g p2 := allp[enum.position()] if _p_ == p2 { continue } - if gp := runqsteal(_p_, p2, stealRunNextG); gp != nil { - return gp, false - } - // Consider stealing timers from p2. - // This call to checkTimers is the only place where - // we hold a lock on a different P's timers. - // Lock contention can be a problem here, so - // initially avoid grabbing the lock if p2 is running - // and is not marked for preemption. If p2 is running - // and not being preempted we assume it will handle its - // own timers. - // If we're still looking for work after checking all - // the P's, then go ahead and steal from an active P. - if i > 2 || (i > 1 && shouldStealTimers(p2)) { + // Steal timers from p2. This call to checkTimers is the only place + // where we might hold a lock on a different P's timers. We do this + // once on the last pass before checking runnext because stealing + // from the other P's runnext should be the last resort, so if there + // are timers to steal do that first. 
+ // + // We only check timers on one of the stealing iterations because + // the time stored in now doesn't change in this loop and checking + // the timers for each P more than once with the same value of now + // is probably a waste of time. + // + // timerpMask tells us whether the P may have timers at all. If it + // can't, no need to check at all. + if stealTimersOrRunNextG && timerpMask.read(enum.position()) { tnow, w, ran := checkTimers(p2, now) now = tnow if w != 0 && (pollUntil == 0 || w < pollUntil) { @@ -2292,6 +2601,13 @@ top: ranTimer = true } } + + // Don't bother to attempt to steal if p2 is idle. + if !idlepMask.read(enum.position()) { + if gp := runqsteal(_p_, p2, stealTimersOrRunNextG); gp != nil { + return gp, false + } + } } } if ranTimer { @@ -2304,14 +2620,17 @@ stop: // We have nothing to do. If we're in the GC mark phase, can // safely scan and blacken objects, and have work to do, run // idle-time marking rather than give up the P. - if gcBlackenEnabled != 0 && _p_.gcBgMarkWorker != 0 && gcMarkWorkAvailable(_p_) { - _p_.gcMarkWorkerMode = gcMarkWorkerIdleMode - gp := _p_.gcBgMarkWorker.ptr() - casgstatus(gp, _Gwaiting, _Grunnable) - if trace.enabled { - traceGoUnpark(gp, 0) + if gcBlackenEnabled != 0 && gcMarkWorkAvailable(_p_) { + node := (*gcBgMarkWorkerNode)(gcBgMarkWorkerPool.pop()) + if node != nil { + _p_.gcMarkWorkerMode = gcMarkWorkerIdleMode + gp := node.gp.ptr() + casgstatus(gp, _Gwaiting, _Grunnable) + if trace.enabled { + traceGoUnpark(gp, 0) + } + return gp, false } - return gp, false } delta := int64(-1) @@ -2341,6 +2660,10 @@ stop: // safe-points. We don't need to snapshot the contents because // everything up to cap(allp) is immutable. allpSnapshot := allp + // Also snapshot masks. Value changes are OK, but we can't allow + // len to change out from under us. 
+ idlepMaskSnapshot := idlepMask + timerpMaskSnapshot := timerpMask // return P and block lock(&sched.lock) @@ -2364,7 +2687,7 @@ stop: // drop nmspinning first and then check all per-P queues again (with // #StoreLoad memory barrier in between). If we do it the other way around, // another thread can submit a goroutine after we've checked all run queues - // but before we drop nmspinning; as the result nobody will unpark a thread + // but before we drop nmspinning; as a result nobody will unpark a thread // to run the goroutine. // If we discover new work below, we need to restore m.spinning as a signal // for resetspinning to unpark a new worker thread (because there can be more @@ -2381,8 +2704,8 @@ stop: } // check all runqueues once again - for _, _p_ := range allpSnapshot { - if !runqempty(_p_) { + for id, _p_ := range allpSnapshot { + if !idlepMaskSnapshot.read(uint32(id)) && !runqempty(_p_) { lock(&sched.lock) _p_ = pidleget() unlock(&sched.lock) @@ -2398,13 +2721,56 @@ stop: } } + // Similar to above, check for timer creation or expiry concurrently with + // transitioning from spinning to non-spinning. Note that we cannot use + // checkTimers here because it calls adjusttimers which may need to allocate + // memory, and that isn't allowed when we don't have an active P. + for id, _p_ := range allpSnapshot { + if timerpMaskSnapshot.read(uint32(id)) { + w := nobarrierWakeTime(_p_) + if w != 0 && (pollUntil == 0 || w < pollUntil) { + pollUntil = w + } + } + } + if pollUntil != 0 { + if now == 0 { + now = nanotime() + } + delta = pollUntil - now + if delta < 0 { + delta = 0 + } + } + // Check for idle-priority GC work again. - if gcBlackenEnabled != 0 && gcMarkWorkAvailable(nil) { + // + // N.B. Since we have no P, gcBlackenEnabled may change at any time; we + // must check again after acquiring a P. 
+ if atomic.Load(&gcBlackenEnabled) != 0 && gcMarkWorkAvailable(nil) { + // Work is available; we can start an idle GC worker only if + // there is an available P and available worker G. + // + // We can attempt to acquire these in either order. Workers are + // almost always available (see comment in findRunnableGCWorker + // for the one case there may be none). Since we're slightly + // less likely to find a P, check for that first. lock(&sched.lock) + var node *gcBgMarkWorkerNode _p_ = pidleget() - if _p_ != nil && _p_.gcBgMarkWorker == 0 { - pidleput(_p_) - _p_ = nil + if _p_ != nil { + // Now that we own a P, gcBlackenEnabled can't change + // (as it requires STW). + if gcBlackenEnabled != 0 { + node = (*gcBgMarkWorkerNode)(gcBgMarkWorkerPool.pop()) + if node == nil { + pidleput(_p_) + _p_ = nil + } + } else { + pidleput(_p_) + _p_ = nil + } } unlock(&sched.lock) if _p_ != nil { @@ -2413,8 +2779,15 @@ stop: _g_.m.spinning = true atomic.Xadd(&sched.nmspinning, 1) } - // Go back to idle GC check. - goto stop + + // Run the idle worker. + _p_.gcMarkWorkerMode = gcMarkWorkerIdleMode + gp := node.gp.ptr() + casgstatus(gp, _Gwaiting, _Grunnable) + if trace.enabled { + traceGoUnpark(gp, 0) + } + return gp, false } } @@ -2493,9 +2866,9 @@ func pollWork() bool { return false } -// wakeNetPoller wakes up the thread sleeping in the network poller, -// if there is one, and if it isn't going to wake up anyhow before -// the when argument. +// wakeNetPoller wakes up the thread sleeping in the network poller if it isn't +// going to wake up before the when argument; or it wakes an idle P to service +// timers and the network poller if there isn't one already. 
func wakeNetPoller(when int64) { if atomic.Load64(&sched.lastpoll) == 0 { // In findrunnable we ensure that when polling the pollUntil @@ -2506,6 +2879,10 @@ func wakeNetPoller(when int64) { if pollerPollUntil == 0 || pollerPollUntil > when { netpollBreak() } + } else { + // There are no threads in the network poller, try to get + // one there so it can handle new timers. + wakep() } } @@ -2531,7 +2908,7 @@ func resetspinning() { // Otherwise, for each idle P, this adds a G to the global queue // and starts an M. Any remaining G's are added to the current P's // local run queue. -// This may temporarily acquire the scheduler lock. +// This may temporarily acquire sched.lock. // Can run concurrently with GC. func injectglist(glist *gList) { if glist.empty() { @@ -2737,40 +3114,40 @@ func dropg() { // We pass now in and out to avoid extra calls of nanotime. //go:yeswritebarrierrec func checkTimers(pp *p, now int64) (rnow, pollUntil int64, ran bool) { - // If there are no timers to adjust, and the first timer on - // the heap is not yet ready to run, then there is nothing to do. - if atomic.Load(&pp.adjustTimers) == 0 { - next := int64(atomic.Load64(&pp.timer0When)) - if next == 0 { - return now, 0, false - } - if now == 0 { - now = nanotime() - } - if now < next { - // Next timer is not ready to run. - // But keep going if we would clear deleted timers. - // This corresponds to the condition below where - // we decide whether to call clearDeletedTimers. - if pp != getg().m.p.ptr() || int(atomic.Load(&pp.deletedTimers)) <= int(atomic.Load(&pp.numTimers)/4) { - return now, next, false - } + // If it's not yet time for the first timer, or the first adjusted + // timer, then there is nothing to do. + next := int64(atomic.Load64(&pp.timer0When)) + nextAdj := int64(atomic.Load64(&pp.timerModifiedEarliest)) + if next == 0 || (nextAdj != 0 && nextAdj < next) { + next = nextAdj + } + + if next == 0 { + // No timers to run or adjust. 
+ return now, 0, false + } + + if now == 0 { + now = nanotime() + } + if now < next { + // Next timer is not ready to run, but keep going + // if we would clear deleted timers. + // This corresponds to the condition below where + // we decide whether to call clearDeletedTimers. + if pp != getg().m.p.ptr() || int(atomic.Load(&pp.deletedTimers)) <= int(atomic.Load(&pp.numTimers)/4) { + return now, next, false } } lock(&pp.timersLock) - adjusttimers(pp) - - rnow = now if len(pp.timers) > 0 { - if rnow == 0 { - rnow = nanotime() - } + adjusttimers(pp, now) for len(pp.timers) > 0 { // Note that runtimer may temporarily unlock // pp.timersLock. - if tw := runtimer(pp, rnow); tw != 0 { + if tw := runtimer(pp, now); tw != 0 { if tw > 0 { pollUntil = tw } @@ -2789,26 +3166,7 @@ func checkTimers(pp *p, now int64) (rnow, pollUntil int64, ran bool) { unlock(&pp.timersLock) - return rnow, pollUntil, ran -} - -// shouldStealTimers reports whether we should try stealing the timers from p2. -// We don't steal timers from a running P that is not marked for preemption, -// on the assumption that it will run its own timers. This reduces -// contention on the timers lock. -func shouldStealTimers(p2 *p) bool { - if p2.status != _Prunning { - return true - } - mp := p2.m.ptr() - if mp == nil || mp.locks > 0 { - return false - } - gp := mp.curg - if gp == nil || gp.atomicstatus != _Grunning || !gp.preempt { - return false - } - return true + return now, pollUntil, ran } func parkunlock_c(gp *g, lock unsafe.Pointer) bool { @@ -2966,7 +3324,8 @@ func goexit0(gp *g) { // Flush assist credit to the global pool. This gives // better information to pacing if the application is // rapidly creating an exiting goroutines. 
- scanCredit := int64(gcController.assistWorkPerByte * float64(gp.gcAssistBytes)) + assistWorkPerByte := float64frombits(atomic.Load64(&gcController.assistWorkPerByte)) + scanCredit := int64(assistWorkPerByte * float64(gp.gcAssistBytes)) atomic.Xaddint64(&gcController.bgScanCredit, scanCredit) gp.gcAssistBytes = 0 } @@ -3414,7 +3773,7 @@ func beforefork() { // a signal handler before exec if a signal is sent to the process // group. See issue #18600. gp.m.locks++ - msigsave(gp.m) + sigsave(&gp.m.sigmask) sigblock() // This function is called before fork in syscall package. @@ -3480,11 +3839,24 @@ func syscall_runtime_AfterForkInChild() { inForkedChild = false } +// pendingPreemptSignals is the number of preemption signals +// that have been sent but not received. This is only used on Darwin. +// For #41702. +var pendingPreemptSignals uint32 + // Called from syscall package before Exec. //go:linkname syscall_runtime_BeforeExec syscall.runtime_BeforeExec func syscall_runtime_BeforeExec() { // Prevent thread creation during exec. execLock.lock() + + // On Darwin, wait for all pending preemption signals to + // be received. See issue #41702. + if GOOS == "darwin" || GOOS == "ios" { + for int32(atomic.Load(&pendingPreemptSignals)) > 0 { + osyield() + } + } } // Called from syscall package after Exec. @@ -3928,6 +4300,13 @@ func sigprof(pc, sp, lr uintptr, gp *g, mp *m) { return } + // If mp.profilehz is 0, then profiling is not enabled for this thread. + // We must check this to avoid a deadlock between setcpuprofilerate + // and the call to cpuprof.add, below. + if mp != nil && mp.profilehz == 0 { + return + } + // On mips{,le}, 64bit atomics are emulated with spinlocks, in // runtime/internal/atomic. If SIGPROF arrives while the program is inside // the critical section, it creates a deadlock (when writing the sample). 
@@ -3981,7 +4360,7 @@ func sigprof(pc, sp, lr uintptr, gp *g, mp *m) { // First, it may be that the g switch has no PC update, because the SP // either corresponds to a user g throughout (as in asmcgocall) // or because it has been arranged to look like a user g frame - // (as in cgocallback_gofunc). In this case, since the entire + // (as in cgocallback). In this case, since the entire // transition is a g+SP update, a partial transition updating just one of // those will be detected by the stack bounds check. // @@ -4050,7 +4429,7 @@ func sigprof(pc, sp, lr uintptr, gp *g, mp *m) { // Normal traceback is impossible or has failed. // See if it falls into several common cases. n = 0 - if (GOOS == "windows" || GOOS == "solaris" || GOOS == "illumos" || GOOS == "darwin" || GOOS == "aix") && mp.libcallg != 0 && mp.libcallpc != 0 && mp.libcallsp != 0 { + if (GOOS == "windows" || GOOS == "solaris" || GOOS == "illumos" || GOOS == "darwin" || GOOS == "ios" || GOOS == "aix") && mp.libcallg != 0 && mp.libcallpc != 0 && mp.libcallsp != 0 { // Libcall, i.e. runtime syscall on windows. // Collect Go stack that leads to the call. n = gentraceback(mp.libcallpc, mp.libcallsp, 0, mp.libcallg.ptr(), 0, &stk[0], len(stk), nil, nil, 0) @@ -4214,6 +4593,13 @@ func (pp *p) init(id int32) { } } lockInit(&pp.timersLock, lockRankTimers) + + // This P may get timers when it starts running. Set the mask here + // since the P may not go through pidleget (notably P 0 on startup). + timerpMask.set(id) + // Similarly, we may not go through pidleget before this P starts + // running if it is P 0 on startup. + idlepMask.clear(id) } // destroy releases all of the resources associated with pp and @@ -4221,6 +4607,9 @@ func (pp *p) init(id int32) { // // sched.lock must be held and the world must be stopped. 
func (pp *p) destroy() { + assertLockHeld(&sched.lock) + assertWorldStopped() + // Move all runnable goroutines to the global queue for pp.runqhead != pp.runqtail { // Pop from tail of local queue @@ -4250,18 +4639,6 @@ func (pp *p) destroy() { unlock(&pp.timersLock) unlock(&plocal.timersLock) } - // If there's a background worker, make it runnable and put - // it on the global queue so it can clean itself up. - if gp := pp.gcBgMarkWorker.ptr(); gp != nil { - casgstatus(gp, _Gwaiting, _Grunnable) - if trace.enabled { - traceGoUnpark(gp, 0) - } - globrunqput(gp) - // This assignment doesn't race because the - // world is stopped. - pp.gcBgMarkWorker.set(nil) - } // Flush p's write barrier buffer. if gcphase != _GCoff { wbBufFlush1(pp) @@ -4283,7 +4660,9 @@ func (pp *p) destroy() { mheap_.spanalloc.free(unsafe.Pointer(pp.mspancache.buf[i])) } pp.mspancache.len = 0 + lock(&mheap_.lock) pp.pcache.flush(&mheap_.pages) + unlock(&mheap_.lock) }) freemcache(pp.mcache) pp.mcache = nil @@ -4312,11 +4691,18 @@ func (pp *p) destroy() { pp.status = _Pdead } -// Change number of processors. The world is stopped, sched is locked. -// gcworkbufs are not being modified by either the GC or -// the write barrier code. +// Change number of processors. +// +// sched.lock must be held, and the world must be stopped. +// +// gcworkbufs must not be being modified by either the GC or the write barrier +// code, so the GC must not be running if the number of Ps actually changes. +// // Returns list of Ps with local work, they need to be scheduled by the caller. func procresize(nprocs int32) *p { + assertLockHeld(&sched.lock) + assertWorldStopped() + old := gomaxprocs if old < 0 || nprocs <= 0 { throw("procresize: invalid arg") @@ -4332,6 +4718,8 @@ func procresize(nprocs int32) *p { } sched.procresizetime = now + maskWords := (nprocs + 31) / 32 + // Grow allp if necessary. 
if nprocs > int32(len(allp)) { // Synchronize with retake, which could be running @@ -4346,6 +4734,20 @@ func procresize(nprocs int32) *p { copy(nallp, allp[:cap(allp)]) allp = nallp } + + if maskWords <= int32(cap(idlepMask)) { + idlepMask = idlepMask[:maskWords] + timerpMask = timerpMask[:maskWords] + } else { + nidlepMask := make([]uint32, maskWords) + // No need to copy beyond len, old Ps are irrelevant. + copy(nidlepMask, idlepMask) + idlepMask = nidlepMask + + ntimerpMask := make([]uint32, maskWords) + copy(ntimerpMask, timerpMask) + timerpMask = ntimerpMask + } unlock(&allpLock) } @@ -4404,6 +4806,8 @@ func procresize(nprocs int32) *p { if int32(len(allp)) != nprocs { lock(&allpLock) allp = allp[:nprocs] + idlepMask = idlepMask[:maskWords] + timerpMask = timerpMask[:maskWords] unlock(&allpLock) } @@ -4508,6 +4912,8 @@ func incidlelocked(v int32) { // The check is based on number of running M's, if 0 -> deadlock. // sched.lock must be held. func checkdead() { + assertLockHeld(&sched.lock) + // For -buildmode=c-shared or -buildmode=c-archive it's OK if // there are no running goroutines. The calling program is // assumed to be running. @@ -4623,9 +5029,14 @@ func sysmon() { checkdead() unlock(&sched.lock) + // For syscall_runtime_doAllThreadsSyscall, sysmon is + // sufficiently up to participate in fixups. + atomic.Store(&sched.sysmonStarting, 0) + lasttrace := int64(0) idle := 0 // how many cycles in succession we had not wokeup somebody delay := uint32(0) + for { if idle == 0 { // start with 20us sleep... delay = 20 @@ -4636,11 +5047,29 @@ func sysmon() { delay = 10 * 1000 } usleep(delay) + mDoFixup() + + // sysmon should not enter deep sleep if schedtrace is enabled so that + // it can print that information at the right time. + // + // It should also not enter deep sleep if there are any active P's so + // that it can retake P's from syscalls, preempt long running G's, and + // poll the network if all P's are busy for long stretches. 
+ // + // It should wakeup from deep sleep if any P's become active either due + // to exiting a syscall or waking up due to a timer expiring so that it + // can resume performing those duties. If it wakes from a syscall it + // resets idle and delay as a bet that since it had retaken a P from a + // syscall before, it may need to do it again shortly after the + // application starts work again. It does not reset idle when waking + // from a timer to avoid adding system load to applications that spend + // most of their time sleeping. now := nanotime() - next, _ := timeSleepUntil() if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs)) { lock(&sched.lock) if atomic.Load(&sched.gcwaiting) != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs) { + syscallWake := false + next, _ := timeSleepUntil() if next > now { atomic.Store(&sched.sysmonwait, 1) unlock(&sched.lock) @@ -4654,32 +5083,27 @@ func sysmon() { if shouldRelax { osRelax(true) } - notetsleep(&sched.sysmonnote, sleep) + syscallWake = notetsleep(&sched.sysmonnote, sleep) + mDoFixup() if shouldRelax { osRelax(false) } - now = nanotime() - next, _ = timeSleepUntil() lock(&sched.lock) atomic.Store(&sched.sysmonwait, 0) noteclear(&sched.sysmonnote) } - idle = 0 - delay = 20 + if syscallWake { + idle = 0 + delay = 20 + } } unlock(&sched.lock) } + lock(&sched.sysmonlock) - { - // If we spent a long time blocked on sysmonlock - // then we want to update now and next since it's - // likely stale. - now1 := nanotime() - if now1-now > 50*1000 /* 50µs */ { - next, _ = timeSleepUntil() - } - now = now1 - } + // Update now in case we blocked on sysmonnote or spent a long time + // blocked on schedlock or sysmonlock above. 
+ now = nanotime() // trigger libc interceptors if needed if *cgo_yield != nil { @@ -4703,12 +5127,7 @@ func sysmon() { incidlelocked(1) } } - if next < now { - // There are timers that should have already run, - // perhaps because there is an unpreemptible P. - // Try to start an M to run them. - startm(nil, false) - } + mDoFixup() if atomic.Load(&scavenge.sysmonWake) != 0 { // Kick the scavenger awake if someone requested it. wakeScavenger() @@ -4982,7 +5401,11 @@ func schedEnableUser(enable bool) { // schedEnabled reports whether gp should be scheduled. It returns // false is scheduling of gp is disabled. +// +// sched.lock must be held. func schedEnabled(gp *g) bool { + assertLockHeld(&sched.lock) + if sched.disable.user { return isSystemGoroutine(gp, true) } @@ -4990,10 +5413,12 @@ func schedEnabled(gp *g) bool { } // Put mp on midle list. -// Sched must be locked. +// sched.lock must be held. // May run during STW, so write barriers are not allowed. //go:nowritebarrierrec func mput(mp *m) { + assertLockHeld(&sched.lock) + mp.schedlink = sched.midle sched.midle.set(mp) sched.nmidle++ @@ -5001,10 +5426,12 @@ func mput(mp *m) { } // Try to get an m from midle list. -// Sched must be locked. +// sched.lock must be held. // May run during STW, so write barriers are not allowed. //go:nowritebarrierrec func mget() *m { + assertLockHeld(&sched.lock) + mp := sched.midle.ptr() if mp != nil { sched.midle = mp.schedlink @@ -5014,35 +5441,43 @@ func mget() *m { } // Put gp on the global runnable queue. -// Sched must be locked. +// sched.lock must be held. // May run during STW, so write barriers are not allowed. //go:nowritebarrierrec func globrunqput(gp *g) { + assertLockHeld(&sched.lock) + sched.runq.pushBack(gp) sched.runqsize++ } // Put gp at the head of the global runnable queue. -// Sched must be locked. +// sched.lock must be held. // May run during STW, so write barriers are not allowed. 
//go:nowritebarrierrec func globrunqputhead(gp *g) { + assertLockHeld(&sched.lock) + sched.runq.push(gp) sched.runqsize++ } // Put a batch of runnable goroutines on the global runnable queue. // This clears *batch. -// Sched must be locked. +// sched.lock must be held. func globrunqputbatch(batch *gQueue, n int32) { + assertLockHeld(&sched.lock) + sched.runq.pushBackAll(*batch) sched.runqsize += n *batch = gQueue{} } // Try get a batch of G's from the global runnable queue. -// Sched must be locked. +// sched.lock must be held. func globrunqget(_p_ *p, max int32) *g { + assertLockHeld(&sched.lock) + if sched.runqsize == 0 { return nil } @@ -5069,26 +5504,106 @@ func globrunqget(_p_ *p, max int32) *g { return gp } -// Put p to on _Pidle list. -// Sched must be locked. +// pMask is an atomic bitstring with one bit per P. +type pMask []uint32 + +// read returns true if P id's bit is set. +func (p pMask) read(id uint32) bool { + word := id / 32 + mask := uint32(1) << (id % 32) + return (atomic.Load(&p[word]) & mask) != 0 +} + +// set sets P id's bit. +func (p pMask) set(id int32) { + word := id / 32 + mask := uint32(1) << (id % 32) + atomic.Or(&p[word], mask) +} + +// clear clears P id's bit. +func (p pMask) clear(id int32) { + word := id / 32 + mask := uint32(1) << (id % 32) + atomic.And(&p[word], ^mask) +} + +// updateTimerPMask clears pp's timer mask if it has no timers on its heap. +// +// Ideally, the timer mask would be kept immediately consistent on any timer +// operations. Unfortunately, updating a shared global data structure in the +// timer hot path adds too much overhead in applications frequently switching +// between no timers and some timers. +// +// As a compromise, the timer mask is updated only on pidleget / pidleput. A +// running P (returned by pidleget) may add a timer at any time, so its mask +// must be set. An idle P (passed to pidleput) cannot add new timers while +// idle, so if it has no timers at that time, its mask may be cleared. 
+// +// Thus, we get the following effects on timer-stealing in findrunnable: +// +// * Idle Ps with no timers when they go idle are never checked in findrunnable +// (for work- or timer-stealing; this is the ideal case). +// * Running Ps must always be checked. +// * Idle Ps whose timers are stolen must continue to be checked until they run +// again, even after timer expiration. +// +// When the P starts running again, the mask should be set, as a timer may be +// added at any time. +// +// TODO(prattmic): Additional targeted updates may improve the above cases. +// e.g., updating the mask when stealing a timer. +func updateTimerPMask(pp *p) { + if atomic.Load(&pp.numTimers) > 0 { + return + } + + // Looks like there are no timers, however another P may transiently + // decrement numTimers when handling a timerModified timer in + // checkTimers. We must take timersLock to serialize with these changes. + lock(&pp.timersLock) + if atomic.Load(&pp.numTimers) == 0 { + timerpMask.clear(pp.id) + } + unlock(&pp.timersLock) +} + +// pidleput puts p to on the _Pidle list. +// +// This releases ownership of p. Once sched.lock is released it is no longer +// safe to use p. +// +// sched.lock must be held. +// // May run during STW, so write barriers are not allowed. //go:nowritebarrierrec func pidleput(_p_ *p) { + assertLockHeld(&sched.lock) + if !runqempty(_p_) { throw("pidleput: P has non-empty run queue") } + updateTimerPMask(_p_) // clear if there are no timers. + idlepMask.set(_p_.id) _p_.link = sched.pidle sched.pidle.set(_p_) atomic.Xadd(&sched.npidle, 1) // TODO: fast atomic } -// Try get a p from _Pidle list. -// Sched must be locked. +// pidleget tries to get a p from the _Pidle list, acquiring ownership. +// +// sched.lock must be held. +// // May run during STW, so write barriers are not allowed. //go:nowritebarrierrec func pidleget() *p { + assertLockHeld(&sched.lock) + _p_ := sched.pidle.ptr() if _p_ != nil { + // Timer may get added at any time now. 
+ timerpMask.set(_p_.id) + idlepMask.clear(_p_.id) sched.pidle = _p_.link atomic.Xadd(&sched.npidle, -1) // TODO: fast atomic } @@ -5608,6 +6123,17 @@ type initTask struct { // followed by nfns pcs, one per init function to run } +// inittrace stores statistics for init functions which are +// updated by malloc and newproc when active is true. +var inittrace tracestat + +type tracestat struct { + active bool // init tracing activation status + id int64 // init go routine id + allocs uint64 // heap allocations + bytes uint64 // heap allocated bytes +} + func doInit(t *initTask) { switch t.state { case 2: // fully initialized @@ -5616,16 +6142,52 @@ func doInit(t *initTask) { throw("recursive call during initialization - linker skew") default: // not initialized yet t.state = 1 // initialization in progress + for i := uintptr(0); i < t.ndeps; i++ { p := add(unsafe.Pointer(t), (3+i)*sys.PtrSize) t2 := *(**initTask)(p) doInit(t2) } + + if t.nfns == 0 { + t.state = 2 // initialization done + return + } + + var ( + start int64 + before tracestat + ) + + if inittrace.active { + start = nanotime() + // Load stats non-atomically since tracinit is updated only by this init go routine. + before = inittrace + } + + firstFunc := add(unsafe.Pointer(t), (3+t.ndeps)*sys.PtrSize) for i := uintptr(0); i < t.nfns; i++ { - p := add(unsafe.Pointer(t), (3+t.ndeps+i)*sys.PtrSize) + p := add(firstFunc, i*sys.PtrSize) f := *(*func())(unsafe.Pointer(&p)) f() } + + if inittrace.active { + end := nanotime() + // Load stats non-atomically since tracinit is updated only by this init go routine. 
+ after := inittrace + + pkg := funcpkgpath(findfunc(funcPC(firstFunc))) + + var sbuf [24]byte + print("init ", pkg, " @") + print(string(fmtNSAsMS(sbuf[:], uint64(start-runtimeInitTime))), " ms, ") + print(string(fmtNSAsMS(sbuf[:], uint64(end-start))), " ms clock, ") + print(string(itoa(sbuf[:], after.bytes-before.bytes)), " bytes, ") + print(string(itoa(sbuf[:], after.allocs-before.allocs)), " allocs") + print("\n") + } + t.state = 2 // initialization done } } diff --git a/src/runtime/proc_test.go b/src/runtime/proc_test.go index de4dec36ce..767bde15b4 100644 --- a/src/runtime/proc_test.go +++ b/src/runtime/proc_test.go @@ -523,9 +523,17 @@ func BenchmarkPingPongHog(b *testing.B) { <-done } +var padData [128]uint64 + func stackGrowthRecursive(i int) { var pad [128]uint64 - if i != 0 && pad[0] == 0 { + pad = padData + for j := range pad { + if pad[j] != 0 { + return + } + } + if i != 0 { stackGrowthRecursive(i - 1) } } diff --git a/src/runtime/race.go b/src/runtime/race.go index 53910f991c..79fd21765d 100644 --- a/src/runtime/race.go +++ b/src/runtime/race.go @@ -268,6 +268,9 @@ var __tsan_acquire byte //go:linkname __tsan_release __tsan_release var __tsan_release byte +//go:linkname __tsan_release_acquire __tsan_release_acquire +var __tsan_release_acquire byte + //go:linkname __tsan_release_merge __tsan_release_merge var __tsan_release_merge byte @@ -293,6 +296,7 @@ var __tsan_report_count byte //go:cgo_import_static __tsan_free //go:cgo_import_static __tsan_acquire //go:cgo_import_static __tsan_release +//go:cgo_import_static __tsan_release_acquire //go:cgo_import_static __tsan_release_merge //go:cgo_import_static __tsan_go_ignore_sync_begin //go:cgo_import_static __tsan_go_ignore_sync_end @@ -536,6 +540,19 @@ func racereleaseg(gp *g, addr unsafe.Pointer) { } //go:nosplit +func racereleaseacquire(addr unsafe.Pointer) { + racereleaseacquireg(getg(), addr) +} + +//go:nosplit +func racereleaseacquireg(gp *g, addr unsafe.Pointer) { + if getg().raceignore != 0 || 
!isvalidaddr(addr) { + return + } + racecall(&__tsan_release_acquire, gp.racectx, uintptr(addr), 0, 0) +} + +//go:nosplit func racereleasemerge(addr unsafe.Pointer) { racereleasemergeg(getg(), addr) } diff --git a/src/runtime/race/README b/src/runtime/race/README index 34485f0fb2..178ab94ab5 100644 --- a/src/runtime/race/README +++ b/src/runtime/race/README @@ -4,10 +4,11 @@ the LLVM project (https://github.com/llvm/llvm-project/tree/master/compiler-rt). To update the .syso files use golang.org/x/build/cmd/racebuild. -race_darwin_amd64.syso built with LLVM 3496d6e4bea9cb99cb382939b7e79a50a3b863a5 and Go 553e003414d3aa90cc39830ee22f08453d9f3408. -race_freebsd_amd64.syso built with LLVM 3496d6e4bea9cb99cb382939b7e79a50a3b863a5 and Go 553e003414d3aa90cc39830ee22f08453d9f3408. -race_linux_amd64.syso built with LLVM 6c75db8b4bc59eace18143ce086419d37da24746 and Go 7388956b76ce15a11346cebefcf6193db044caaf. -race_linux_ppc64le.syso built with LLVM 6c75db8b4bc59eace18143ce086419d37da24746 and Go 7388956b76ce15a11346cebefcf6193db044caaf. -race_netbsd_amd64.syso built with LLVM 3496d6e4bea9cb99cb382939b7e79a50a3b863a5 and Go 553e003414d3aa90cc39830ee22f08453d9f3408. -race_windows_amd64.syso built with LLVM 3496d6e4bea9cb99cb382939b7e79a50a3b863a5 and Go 553e003414d3aa90cc39830ee22f08453d9f3408. -race_linux_arm64.syso built with LLVM 6c75db8b4bc59eace18143ce086419d37da24746 and Go 7388956b76ce15a11346cebefcf6193db044caaf. +race_darwin_amd64.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153 and Go f62d3202bf9dbb3a00ad2a2c63ff4fa4188c5d3b. +race_freebsd_amd64.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153 and Go f62d3202bf9dbb3a00ad2a2c63ff4fa4188c5d3b. +race_linux_amd64.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153 and Go f62d3202bf9dbb3a00ad2a2c63ff4fa4188c5d3b. +race_linux_ppc64le.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153 and Go f62d3202bf9dbb3a00ad2a2c63ff4fa4188c5d3b. 
+race_netbsd_amd64.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153 and Go f62d3202bf9dbb3a00ad2a2c63ff4fa4188c5d3b. +race_windows_amd64.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153 and Go f62d3202bf9dbb3a00ad2a2c63ff4fa4188c5d3b. +race_linux_arm64.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153 and Go f62d3202bf9dbb3a00ad2a2c63ff4fa4188c5d3b. +race_darwin_arm64.syso built with LLVM 00da38ce2d36c07f12c287dc515d37bb7bc410e9 and Go fe70a3a0fd31441bcbb9932ecab11a6083cf2119. diff --git a/src/runtime/race/output_test.go b/src/runtime/race/output_test.go index d3e7762175..986667332f 100644 --- a/src/runtime/race/output_test.go +++ b/src/runtime/race/output_test.go @@ -8,7 +8,6 @@ package race_test import ( "internal/testenv" - "io/ioutil" "os" "os/exec" "path/filepath" @@ -19,7 +18,7 @@ import ( ) func TestOutput(t *testing.T) { - pkgdir, err := ioutil.TempDir("", "go-build-race-output") + pkgdir, err := os.MkdirTemp("", "go-build-race-output") if err != nil { t.Fatal(err) } @@ -34,7 +33,7 @@ func TestOutput(t *testing.T) { t.Logf("test %v runs only on %v, skipping: ", test.name, test.goos) continue } - dir, err := ioutil.TempDir("", "go-build") + dir, err := os.MkdirTemp("", "go-build") if err != nil { t.Fatalf("failed to create temp directory: %v", err) } @@ -309,7 +308,7 @@ Read at 0x[0-9,a-f]+ by main goroutine: Previous write at 0x[0-9,a-f]+ by goroutine [0-9]: main\.goCallback\(\) .*/main\.go:27 \+0x[0-9,a-f]+ - main._cgoexpwrap_[0-9a-z]+_goCallback\(\) + _cgoexp_[0-9a-z]+_goCallback\(\) .*_cgo_gotypes\.go:[0-9]+ \+0x[0-9,a-f]+ Goroutine [0-9] \(running\) created at: @@ -338,4 +337,77 @@ func TestPass(t *testing.T) { --- FAIL: TestFail \(0...s\) .*testing.go:.*: race detected during execution of test FAIL`}, + {"mutex", "run", "", "atexit_sleep_ms=0", ` +package main +import ( + "sync" + "fmt" +) +func main() { + c := make(chan bool, 1) + threads := 1 + iterations := 20000 + data := 0 + var wg sync.WaitGroup + for i := 0; 
i < threads; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < iterations; i++ { + c <- true + data += 1 + <- c + } + }() + } + for i := 0; i < iterations; i++ { + c <- true + data += 1 + <- c + } + wg.Wait() + if (data == iterations*(threads+1)) { fmt.Println("pass") } +}`, `pass`}, + // Test for https://github.com/golang/go/issues/37355 + {"chanmm", "run", "", "atexit_sleep_ms=0", ` +package main +import ( + "sync" + "time" +) +func main() { + c := make(chan bool, 1) + var data uint64 + var wg sync.WaitGroup + wg.Add(2) + c <- true + go func() { + defer wg.Done() + c <- true + }() + go func() { + defer wg.Done() + time.Sleep(time.Second) + <-c + data = 2 + }() + data = 1 + <-c + wg.Wait() + _ = data +} +`, `================== +WARNING: DATA RACE +Write at 0x[0-9,a-f]+ by goroutine [0-9]: + main\.main\.func2\(\) + .*/main\.go:21 \+0x[0-9,a-f]+ + +Previous write at 0x[0-9,a-f]+ by main goroutine: + main\.main\(\) + .*/main\.go:23 \+0x[0-9,a-f]+ + +Goroutine [0-9] \(running\) created at: + main\.main\(\) + .*/main.go:[0-9]+ \+0x[0-9,a-f]+ +==================`}, } diff --git a/src/runtime/race/race.go b/src/runtime/race/race.go index c894de5f72..d6a14b79e7 100644 --- a/src/runtime/race/race.go +++ b/src/runtime/race/race.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
-// +build race,linux,amd64 race,freebsd,amd64 race,netbsd,amd64 race,darwin,amd64 race,windows,amd64 race,linux,ppc64le race,linux,arm64 +// +build race,linux,amd64 race,freebsd,amd64 race,netbsd,amd64 race,darwin,amd64 race,windows,amd64 race,linux,ppc64le race,linux,arm64 race,darwin,arm64 package race diff --git a/src/runtime/race/race_darwin_amd64.syso b/src/runtime/race/race_darwin_amd64.syso Binary files differindex d03a593f5a..3f95ecc8ee 100644 --- a/src/runtime/race/race_darwin_amd64.syso +++ b/src/runtime/race/race_darwin_amd64.syso diff --git a/src/runtime/race/race_darwin_arm64.syso b/src/runtime/race/race_darwin_arm64.syso Binary files differnew file mode 100644 index 0000000000..f6eaa62ae3 --- /dev/null +++ b/src/runtime/race/race_darwin_arm64.syso diff --git a/src/runtime/race/race_freebsd_amd64.syso b/src/runtime/race/race_freebsd_amd64.syso Binary files differindex 573591c56f..2a5b46f4ce 100644 --- a/src/runtime/race/race_freebsd_amd64.syso +++ b/src/runtime/race/race_freebsd_amd64.syso diff --git a/src/runtime/race/race_linux_amd64.syso b/src/runtime/race/race_linux_amd64.syso Binary files differindex d31f85df56..e00398c964 100644 --- a/src/runtime/race/race_linux_amd64.syso +++ b/src/runtime/race/race_linux_amd64.syso diff --git a/src/runtime/race/race_linux_arm64.syso b/src/runtime/race/race_linux_arm64.syso Binary files differindex 7c74171b0f..9dae738700 100644 --- a/src/runtime/race/race_linux_arm64.syso +++ b/src/runtime/race/race_linux_arm64.syso diff --git a/src/runtime/race/race_linux_ppc64le.syso b/src/runtime/race/race_linux_ppc64le.syso Binary files differindex a3c72bec55..b562656d56 100644 --- a/src/runtime/race/race_linux_ppc64le.syso +++ b/src/runtime/race/race_linux_ppc64le.syso diff --git a/src/runtime/race/race_netbsd_amd64.syso b/src/runtime/race/race_netbsd_amd64.syso Binary files differindex 54e276bcff..11af16f046 100644 --- a/src/runtime/race/race_netbsd_amd64.syso +++ b/src/runtime/race/race_netbsd_amd64.syso diff --git 
a/src/runtime/race/race_test.go b/src/runtime/race/race_test.go index a0b8531b42..d433af6bd0 100644 --- a/src/runtime/race/race_test.go +++ b/src/runtime/race/race_test.go @@ -177,6 +177,10 @@ func runTests(t *testing.T) ([]byte, error) { ) // There are races: we expect tests to fail and the exit code to be non-zero. out, _ := cmd.CombinedOutput() + if bytes.Contains(out, []byte("fatal error:")) { + // But don't expect runtime to crash. + return out, fmt.Errorf("runtime fatal error") + } return out, nil } diff --git a/src/runtime/race/race_windows_amd64.syso b/src/runtime/race/race_windows_amd64.syso Binary files differindex abaf42649f..9fbf9b4391 100644 --- a/src/runtime/race/race_windows_amd64.syso +++ b/src/runtime/race/race_windows_amd64.syso diff --git a/src/runtime/race/testdata/atomic_test.go b/src/runtime/race/testdata/atomic_test.go index 769c8d7398..4ce72604a4 100644 --- a/src/runtime/race/testdata/atomic_test.go +++ b/src/runtime/race/testdata/atomic_test.go @@ -299,3 +299,27 @@ func TestNoRaceAtomicCrash(t *testing.T) { }() atomic.AddInt32(nilptr, 1) } + +func TestNoRaceDeferAtomicStore(t *testing.T) { + // Test that when an atomic function is deferred directly, the + // GC scans it correctly. See issue 42599. + type foo struct { + bar int64 + } + + var doFork func(f *foo, depth int) + doFork = func(f *foo, depth int) { + atomic.StoreInt64(&f.bar, 1) + defer atomic.StoreInt64(&f.bar, 0) + if depth > 0 { + for i := 0; i < 2; i++ { + f2 := &foo{} + go doFork(f2, depth-1) + } + } + runtime.GC() + } + + f := &foo{} + doFork(f, 11) +} diff --git a/src/runtime/race/testdata/chan_test.go b/src/runtime/race/testdata/chan_test.go index 3e57b8221c..e39ad4f99c 100644 --- a/src/runtime/race/testdata/chan_test.go +++ b/src/runtime/race/testdata/chan_test.go @@ -763,3 +763,25 @@ func TestNoRaceCloseHappensBeforeRead(t *testing.T) { <-read } } + +// Test that we call the proper race detector function when c.elemsize==0. 
+// See https://github.com/golang/go/issues/42598 +func TestNoRaceElemetSize0(t *testing.T) { + var x, y int + var c = make(chan struct{}, 2) + c <- struct{}{} + c <- struct{}{} + go func() { + x += 1 + <-c + }() + go func() { + y += 1 + <-c + }() + time.Sleep(10 * time.Millisecond) + c <- struct{}{} + c <- struct{}{} + x += 1 + y += 1 +} diff --git a/src/runtime/race/testdata/io_test.go b/src/runtime/race/testdata/io_test.go index 30a121bee4..c5055f7837 100644 --- a/src/runtime/race/testdata/io_test.go +++ b/src/runtime/race/testdata/io_test.go @@ -6,7 +6,6 @@ package race_test import ( "fmt" - "io/ioutil" "net" "net/http" "os" @@ -18,7 +17,7 @@ import ( func TestNoRaceIOFile(t *testing.T) { x := 0 - path, _ := ioutil.TempDir("", "race_test") + path, _ := os.MkdirTemp("", "race_test") fname := filepath.Join(path, "data") go func() { x = 42 diff --git a/src/runtime/race/testdata/sync_test.go b/src/runtime/race/testdata/sync_test.go index 2b2d95d76b..b5fcd6c4cf 100644 --- a/src/runtime/race/testdata/sync_test.go +++ b/src/runtime/race/testdata/sync_test.go @@ -126,11 +126,11 @@ func TestNoRaceAfterFunc1(t *testing.T) { func TestNoRaceAfterFunc2(t *testing.T) { var x int + _ = x timer := time.AfterFunc(10, func() { x = 1 }) defer timer.Stop() - _ = x } func TestNoRaceAfterFunc3(t *testing.T) { diff --git a/src/runtime/race0.go b/src/runtime/race0.go index 6f26afa854..180f707b1a 100644 --- a/src/runtime/race0.go +++ b/src/runtime/race0.go @@ -32,6 +32,8 @@ func raceacquireg(gp *g, addr unsafe.Pointer) { th func raceacquirectx(racectx uintptr, addr unsafe.Pointer) { throw("race") } func racerelease(addr unsafe.Pointer) { throw("race") } func racereleaseg(gp *g, addr unsafe.Pointer) { throw("race") } +func racereleaseacquire(addr unsafe.Pointer) { throw("race") } +func racereleaseacquireg(gp *g, addr unsafe.Pointer) { throw("race") } func racereleasemerge(addr unsafe.Pointer) { throw("race") } func racereleasemergeg(gp *g, addr unsafe.Pointer) { throw("race") } func 
racefingo() { throw("race") } diff --git a/src/runtime/race_amd64.s b/src/runtime/race_amd64.s index 758d543203..9818bc6ddf 100644 --- a/src/runtime/race_amd64.s +++ b/src/runtime/race_amd64.s @@ -41,7 +41,9 @@ // func runtime·raceread(addr uintptr) // Called from instrumented code. -TEXT runtime·raceread(SB), NOSPLIT, $0-8 +// Defined as ABIInternal so as to avoid introducing a wrapper, +// which would render runtime.getcallerpc ineffective. +TEXT runtime·raceread<ABIInternal>(SB), NOSPLIT, $0-8 MOVQ addr+0(FP), RARG1 MOVQ (SP), RARG2 // void __tsan_read(ThreadState *thr, void *addr, void *pc); @@ -65,7 +67,9 @@ TEXT runtime·racereadpc(SB), NOSPLIT, $0-24 // func runtime·racewrite(addr uintptr) // Called from instrumented code. -TEXT runtime·racewrite(SB), NOSPLIT, $0-8 +// Defined as ABIInternal so as to avoid introducing a wrapper, +// which would render runtime.getcallerpc ineffective. +TEXT runtime·racewrite<ABIInternal>(SB), NOSPLIT, $0-8 MOVQ addr+0(FP), RARG1 MOVQ (SP), RARG2 // void __tsan_write(ThreadState *thr, void *addr, void *pc); @@ -114,7 +118,9 @@ TEXT runtime·racereadrangepc1(SB), NOSPLIT, $0-24 // func runtime·racewriterange(addr, size uintptr) // Called from instrumented code. -TEXT runtime·racewriterange(SB), NOSPLIT, $0-16 +// Defined as ABIInternal so as to avoid introducing a wrapper, +// which would render runtime.getcallerpc ineffective. +TEXT runtime·racewriterange<ABIInternal>(SB), NOSPLIT, $0-16 MOVQ addr+0(FP), RARG1 MOVQ size+8(FP), RARG2 MOVQ (SP), RARG3 @@ -201,110 +207,136 @@ TEXT runtime·racefuncexit(SB), NOSPLIT, $0-0 // Atomic operations for sync/atomic package. 
// Load -TEXT sync∕atomic·LoadInt32(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·LoadInt32(SB), NOSPLIT, $0-12 + GO_ARGS MOVQ $__tsan_go_atomic32_load(SB), AX CALL racecallatomic<>(SB) RET -TEXT sync∕atomic·LoadInt64(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·LoadInt64(SB), NOSPLIT, $0-16 + GO_ARGS MOVQ $__tsan_go_atomic64_load(SB), AX CALL racecallatomic<>(SB) RET -TEXT sync∕atomic·LoadUint32(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·LoadUint32(SB), NOSPLIT, $0-12 + GO_ARGS JMP sync∕atomic·LoadInt32(SB) -TEXT sync∕atomic·LoadUint64(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·LoadUint64(SB), NOSPLIT, $0-16 + GO_ARGS JMP sync∕atomic·LoadInt64(SB) -TEXT sync∕atomic·LoadUintptr(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·LoadUintptr(SB), NOSPLIT, $0-16 + GO_ARGS JMP sync∕atomic·LoadInt64(SB) -TEXT sync∕atomic·LoadPointer(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·LoadPointer(SB), NOSPLIT, $0-16 + GO_ARGS JMP sync∕atomic·LoadInt64(SB) // Store -TEXT sync∕atomic·StoreInt32(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·StoreInt32(SB), NOSPLIT, $0-12 + GO_ARGS MOVQ $__tsan_go_atomic32_store(SB), AX CALL racecallatomic<>(SB) RET -TEXT sync∕atomic·StoreInt64(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·StoreInt64(SB), NOSPLIT, $0-16 + GO_ARGS MOVQ $__tsan_go_atomic64_store(SB), AX CALL racecallatomic<>(SB) RET -TEXT sync∕atomic·StoreUint32(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·StoreUint32(SB), NOSPLIT, $0-12 + GO_ARGS JMP sync∕atomic·StoreInt32(SB) -TEXT sync∕atomic·StoreUint64(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·StoreUint64(SB), NOSPLIT, $0-16 + GO_ARGS JMP sync∕atomic·StoreInt64(SB) -TEXT sync∕atomic·StoreUintptr(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·StoreUintptr(SB), NOSPLIT, $0-16 + GO_ARGS JMP sync∕atomic·StoreInt64(SB) // Swap -TEXT sync∕atomic·SwapInt32(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·SwapInt32(SB), NOSPLIT, $0-20 + GO_ARGS MOVQ $__tsan_go_atomic32_exchange(SB), AX CALL racecallatomic<>(SB) RET -TEXT sync∕atomic·SwapInt64(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·SwapInt64(SB), NOSPLIT, $0-24 + GO_ARGS MOVQ 
$__tsan_go_atomic64_exchange(SB), AX CALL racecallatomic<>(SB) RET -TEXT sync∕atomic·SwapUint32(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·SwapUint32(SB), NOSPLIT, $0-20 + GO_ARGS JMP sync∕atomic·SwapInt32(SB) -TEXT sync∕atomic·SwapUint64(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·SwapUint64(SB), NOSPLIT, $0-24 + GO_ARGS JMP sync∕atomic·SwapInt64(SB) -TEXT sync∕atomic·SwapUintptr(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·SwapUintptr(SB), NOSPLIT, $0-24 + GO_ARGS JMP sync∕atomic·SwapInt64(SB) // Add -TEXT sync∕atomic·AddInt32(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·AddInt32(SB), NOSPLIT, $0-20 + GO_ARGS MOVQ $__tsan_go_atomic32_fetch_add(SB), AX CALL racecallatomic<>(SB) MOVL add+8(FP), AX // convert fetch_add to add_fetch ADDL AX, ret+16(FP) RET -TEXT sync∕atomic·AddInt64(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·AddInt64(SB), NOSPLIT, $0-24 + GO_ARGS MOVQ $__tsan_go_atomic64_fetch_add(SB), AX CALL racecallatomic<>(SB) MOVQ add+8(FP), AX // convert fetch_add to add_fetch ADDQ AX, ret+16(FP) RET -TEXT sync∕atomic·AddUint32(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·AddUint32(SB), NOSPLIT, $0-20 + GO_ARGS JMP sync∕atomic·AddInt32(SB) -TEXT sync∕atomic·AddUint64(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·AddUint64(SB), NOSPLIT, $0-24 + GO_ARGS JMP sync∕atomic·AddInt64(SB) -TEXT sync∕atomic·AddUintptr(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·AddUintptr(SB), NOSPLIT, $0-24 + GO_ARGS JMP sync∕atomic·AddInt64(SB) // CompareAndSwap -TEXT sync∕atomic·CompareAndSwapInt32(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·CompareAndSwapInt32(SB), NOSPLIT, $0-17 + GO_ARGS MOVQ $__tsan_go_atomic32_compare_exchange(SB), AX CALL racecallatomic<>(SB) RET -TEXT sync∕atomic·CompareAndSwapInt64(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·CompareAndSwapInt64(SB), NOSPLIT, $0-25 + GO_ARGS MOVQ $__tsan_go_atomic64_compare_exchange(SB), AX CALL racecallatomic<>(SB) RET -TEXT sync∕atomic·CompareAndSwapUint32(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·CompareAndSwapUint32(SB), NOSPLIT, $0-17 + GO_ARGS JMP sync∕atomic·CompareAndSwapInt32(SB) -TEXT 
sync∕atomic·CompareAndSwapUint64(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·CompareAndSwapUint64(SB), NOSPLIT, $0-25 + GO_ARGS JMP sync∕atomic·CompareAndSwapInt64(SB) -TEXT sync∕atomic·CompareAndSwapUintptr(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·CompareAndSwapUintptr(SB), NOSPLIT, $0-25 + GO_ARGS JMP sync∕atomic·CompareAndSwapInt64(SB) // Generic atomic operation implementation. diff --git a/src/runtime/race_arm64.s b/src/runtime/race_arm64.s index 9b909ac021..8aa17742b8 100644 --- a/src/runtime/race_arm64.s +++ b/src/runtime/race_arm64.s @@ -25,11 +25,20 @@ // The race ctx, ThreadState *thr below, is passed in R0 and loaded in racecalladdr. +// Darwin may return unaligned thread pointer. Align it. (See tls_arm64.s) +// No-op on other OSes. +#ifdef TLS_darwin +#define TP_ALIGN AND $~7, R0 +#else +#define TP_ALIGN +#endif + +// Load g from TLS. (See tls_arm64.s) #define load_g \ MRS_TPIDR_R0 \ + TP_ALIGN \ MOVD runtime·tls_g(SB), R11 \ - ADD R11, R0 \ - MOVD 0(R0), g + MOVD (R0)(R11), g // func runtime·raceread(addr uintptr) // Called from instrumented code. 
@@ -191,86 +200,86 @@ TEXT runtime·racefuncexit(SB), NOSPLIT, $0-0 // R0, R1, R2 set in racecallatomic // Load -TEXT sync∕atomic·LoadInt32(SB), NOSPLIT, $0 +TEXT sync∕atomic·LoadInt32(SB), NOSPLIT, $0-12 GO_ARGS MOVD $__tsan_go_atomic32_load(SB), R9 BL racecallatomic<>(SB) RET -TEXT sync∕atomic·LoadInt64(SB), NOSPLIT, $0 +TEXT sync∕atomic·LoadInt64(SB), NOSPLIT, $0-16 GO_ARGS MOVD $__tsan_go_atomic64_load(SB), R9 BL racecallatomic<>(SB) RET -TEXT sync∕atomic·LoadUint32(SB), NOSPLIT, $0 +TEXT sync∕atomic·LoadUint32(SB), NOSPLIT, $0-12 GO_ARGS JMP sync∕atomic·LoadInt32(SB) -TEXT sync∕atomic·LoadUint64(SB), NOSPLIT, $0 +TEXT sync∕atomic·LoadUint64(SB), NOSPLIT, $0-16 GO_ARGS JMP sync∕atomic·LoadInt64(SB) -TEXT sync∕atomic·LoadUintptr(SB), NOSPLIT, $0 +TEXT sync∕atomic·LoadUintptr(SB), NOSPLIT, $0-16 GO_ARGS JMP sync∕atomic·LoadInt64(SB) -TEXT sync∕atomic·LoadPointer(SB), NOSPLIT, $0 +TEXT sync∕atomic·LoadPointer(SB), NOSPLIT, $0-16 GO_ARGS JMP sync∕atomic·LoadInt64(SB) // Store -TEXT sync∕atomic·StoreInt32(SB), NOSPLIT, $0 +TEXT sync∕atomic·StoreInt32(SB), NOSPLIT, $0-12 GO_ARGS MOVD $__tsan_go_atomic32_store(SB), R9 BL racecallatomic<>(SB) RET -TEXT sync∕atomic·StoreInt64(SB), NOSPLIT, $0 +TEXT sync∕atomic·StoreInt64(SB), NOSPLIT, $0-16 GO_ARGS MOVD $__tsan_go_atomic64_store(SB), R9 BL racecallatomic<>(SB) RET -TEXT sync∕atomic·StoreUint32(SB), NOSPLIT, $0 +TEXT sync∕atomic·StoreUint32(SB), NOSPLIT, $0-12 GO_ARGS JMP sync∕atomic·StoreInt32(SB) -TEXT sync∕atomic·StoreUint64(SB), NOSPLIT, $0 +TEXT sync∕atomic·StoreUint64(SB), NOSPLIT, $0-16 GO_ARGS JMP sync∕atomic·StoreInt64(SB) -TEXT sync∕atomic·StoreUintptr(SB), NOSPLIT, $0 +TEXT sync∕atomic·StoreUintptr(SB), NOSPLIT, $0-16 GO_ARGS JMP sync∕atomic·StoreInt64(SB) // Swap -TEXT sync∕atomic·SwapInt32(SB), NOSPLIT, $0 +TEXT sync∕atomic·SwapInt32(SB), NOSPLIT, $0-20 GO_ARGS MOVD $__tsan_go_atomic32_exchange(SB), R9 BL racecallatomic<>(SB) RET -TEXT sync∕atomic·SwapInt64(SB), NOSPLIT, $0 +TEXT sync∕atomic·SwapInt64(SB), 
NOSPLIT, $0-24 GO_ARGS MOVD $__tsan_go_atomic64_exchange(SB), R9 BL racecallatomic<>(SB) RET -TEXT sync∕atomic·SwapUint32(SB), NOSPLIT, $0 +TEXT sync∕atomic·SwapUint32(SB), NOSPLIT, $0-20 GO_ARGS JMP sync∕atomic·SwapInt32(SB) -TEXT sync∕atomic·SwapUint64(SB), NOSPLIT, $0 +TEXT sync∕atomic·SwapUint64(SB), NOSPLIT, $0-24 GO_ARGS JMP sync∕atomic·SwapInt64(SB) -TEXT sync∕atomic·SwapUintptr(SB), NOSPLIT, $0 +TEXT sync∕atomic·SwapUintptr(SB), NOSPLIT, $0-24 GO_ARGS JMP sync∕atomic·SwapInt64(SB) // Add -TEXT sync∕atomic·AddInt32(SB), NOSPLIT, $0 +TEXT sync∕atomic·AddInt32(SB), NOSPLIT, $0-20 GO_ARGS MOVD $__tsan_go_atomic32_fetch_add(SB), R9 BL racecallatomic<>(SB) @@ -280,7 +289,7 @@ TEXT sync∕atomic·AddInt32(SB), NOSPLIT, $0 MOVW R0, ret+16(FP) RET -TEXT sync∕atomic·AddInt64(SB), NOSPLIT, $0 +TEXT sync∕atomic·AddInt64(SB), NOSPLIT, $0-24 GO_ARGS MOVD $__tsan_go_atomic64_fetch_add(SB), R9 BL racecallatomic<>(SB) @@ -290,40 +299,40 @@ TEXT sync∕atomic·AddInt64(SB), NOSPLIT, $0 MOVD R0, ret+16(FP) RET -TEXT sync∕atomic·AddUint32(SB), NOSPLIT, $0 +TEXT sync∕atomic·AddUint32(SB), NOSPLIT, $0-20 GO_ARGS JMP sync∕atomic·AddInt32(SB) -TEXT sync∕atomic·AddUint64(SB), NOSPLIT, $0 +TEXT sync∕atomic·AddUint64(SB), NOSPLIT, $0-24 GO_ARGS JMP sync∕atomic·AddInt64(SB) -TEXT sync∕atomic·AddUintptr(SB), NOSPLIT, $0 +TEXT sync∕atomic·AddUintptr(SB), NOSPLIT, $0-24 GO_ARGS JMP sync∕atomic·AddInt64(SB) // CompareAndSwap -TEXT sync∕atomic·CompareAndSwapInt32(SB), NOSPLIT, $0 +TEXT sync∕atomic·CompareAndSwapInt32(SB), NOSPLIT, $0-17 GO_ARGS MOVD $__tsan_go_atomic32_compare_exchange(SB), R9 BL racecallatomic<>(SB) RET -TEXT sync∕atomic·CompareAndSwapInt64(SB), NOSPLIT, $0 +TEXT sync∕atomic·CompareAndSwapInt64(SB), NOSPLIT, $0-25 GO_ARGS MOVD $__tsan_go_atomic64_compare_exchange(SB), R9 BL racecallatomic<>(SB) RET -TEXT sync∕atomic·CompareAndSwapUint32(SB), NOSPLIT, $0 +TEXT sync∕atomic·CompareAndSwapUint32(SB), NOSPLIT, $0-17 GO_ARGS JMP sync∕atomic·CompareAndSwapInt32(SB) -TEXT 
sync∕atomic·CompareAndSwapUint64(SB), NOSPLIT, $0 +TEXT sync∕atomic·CompareAndSwapUint64(SB), NOSPLIT, $0-25 GO_ARGS JMP sync∕atomic·CompareAndSwapInt64(SB) -TEXT sync∕atomic·CompareAndSwapUintptr(SB), NOSPLIT, $0 +TEXT sync∕atomic·CompareAndSwapUintptr(SB), NOSPLIT, $0-25 GO_ARGS JMP sync∕atomic·CompareAndSwapInt64(SB) @@ -423,7 +432,13 @@ TEXT runtime·racecallbackthunk(SB), NOSPLIT|NOFRAME, $0 // benefit from this fast path. CBNZ R0, rest MOVD g, R13 +#ifdef TLS_darwin + MOVD R27, R12 // save R27 a.k.a. REGTMP (callee-save in C). load_g clobbers it +#endif load_g +#ifdef TLS_darwin + MOVD R12, R27 +#endif MOVD g_m(g), R0 MOVD m_p(R0), R0 MOVD p_raceprocctx(R0), R0 @@ -477,5 +492,7 @@ noswitch: BL runtime·racecallback(SB) JMP ret +#ifndef TLSG_IS_VARIABLE // tls_g, g value for each thread in TLS GLOBL runtime·tls_g+0(SB), TLSBSS+DUPOK, $8 +#endif diff --git a/src/runtime/race_ppc64le.s b/src/runtime/race_ppc64le.s index 7421d539ca..8961254ea6 100644 --- a/src/runtime/race_ppc64le.s +++ b/src/runtime/race_ppc64le.s @@ -207,78 +207,95 @@ TEXT runtime·racefuncexit(SB), NOSPLIT, $0-0 // R3, R4, R5 set in racecallatomic // Load atomic in tsan -TEXT sync∕atomic·LoadInt32(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·LoadInt32(SB), NOSPLIT, $0-12 + GO_ARGS // void __tsan_go_atomic32_load(ThreadState *thr, uptr cpc, uptr pc, u8 *a); MOVD $__tsan_go_atomic32_load(SB), R8 ADD $32, R1, R6 // addr of caller's 1st arg BR racecallatomic<>(SB) RET -TEXT sync∕atomic·LoadInt64(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·LoadInt64(SB), NOSPLIT, $0-16 + GO_ARGS // void __tsan_go_atomic64_load(ThreadState *thr, uptr cpc, uptr pc, u8 *a); MOVD $__tsan_go_atomic64_load(SB), R8 ADD $32, R1, R6 // addr of caller's 1st arg BR racecallatomic<>(SB) RET -TEXT sync∕atomic·LoadUint32(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·LoadUint32(SB), NOSPLIT, $0-12 + GO_ARGS BR sync∕atomic·LoadInt32(SB) -TEXT sync∕atomic·LoadUint64(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·LoadUint64(SB), NOSPLIT, $0-16 + GO_ARGS BR 
sync∕atomic·LoadInt64(SB) -TEXT sync∕atomic·LoadUintptr(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·LoadUintptr(SB), NOSPLIT, $0-16 + GO_ARGS BR sync∕atomic·LoadInt64(SB) -TEXT sync∕atomic·LoadPointer(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·LoadPointer(SB), NOSPLIT, $0-16 + GO_ARGS BR sync∕atomic·LoadInt64(SB) // Store atomic in tsan -TEXT sync∕atomic·StoreInt32(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·StoreInt32(SB), NOSPLIT, $0-12 + GO_ARGS // void __tsan_go_atomic32_store(ThreadState *thr, uptr cpc, uptr pc, u8 *a); MOVD $__tsan_go_atomic32_store(SB), R8 ADD $32, R1, R6 // addr of caller's 1st arg BR racecallatomic<>(SB) -TEXT sync∕atomic·StoreInt64(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·StoreInt64(SB), NOSPLIT, $0-16 + GO_ARGS // void __tsan_go_atomic64_store(ThreadState *thr, uptr cpc, uptr pc, u8 *a); MOVD $__tsan_go_atomic64_store(SB), R8 ADD $32, R1, R6 // addr of caller's 1st arg BR racecallatomic<>(SB) -TEXT sync∕atomic·StoreUint32(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·StoreUint32(SB), NOSPLIT, $0-12 + GO_ARGS BR sync∕atomic·StoreInt32(SB) -TEXT sync∕atomic·StoreUint64(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·StoreUint64(SB), NOSPLIT, $0-16 + GO_ARGS BR sync∕atomic·StoreInt64(SB) -TEXT sync∕atomic·StoreUintptr(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·StoreUintptr(SB), NOSPLIT, $0-16 + GO_ARGS BR sync∕atomic·StoreInt64(SB) // Swap in tsan -TEXT sync∕atomic·SwapInt32(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·SwapInt32(SB), NOSPLIT, $0-20 + GO_ARGS // void __tsan_go_atomic32_exchange(ThreadState *thr, uptr cpc, uptr pc, u8 *a); MOVD $__tsan_go_atomic32_exchange(SB), R8 ADD $32, R1, R6 // addr of caller's 1st arg BR racecallatomic<>(SB) -TEXT sync∕atomic·SwapInt64(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·SwapInt64(SB), NOSPLIT, $0-24 + GO_ARGS // void __tsan_go_atomic64_exchange(ThreadState *thr, uptr cpc, uptr pc, u8 *a) MOVD $__tsan_go_atomic64_exchange(SB), R8 ADD $32, R1, R6 // addr of caller's 1st arg BR racecallatomic<>(SB) -TEXT sync∕atomic·SwapUint32(SB), NOSPLIT, $0-0 +TEXT 
sync∕atomic·SwapUint32(SB), NOSPLIT, $0-20 + GO_ARGS BR sync∕atomic·SwapInt32(SB) -TEXT sync∕atomic·SwapUint64(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·SwapUint64(SB), NOSPLIT, $0-24 + GO_ARGS BR sync∕atomic·SwapInt64(SB) -TEXT sync∕atomic·SwapUintptr(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·SwapUintptr(SB), NOSPLIT, $0-24 + GO_ARGS BR sync∕atomic·SwapInt64(SB) // Add atomic in tsan -TEXT sync∕atomic·AddInt32(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·AddInt32(SB), NOSPLIT, $0-20 + GO_ARGS // void __tsan_go_atomic32_fetch_add(ThreadState *thr, uptr cpc, uptr pc, u8 *a); MOVD $__tsan_go_atomic32_fetch_add(SB), R8 ADD $64, R1, R6 // addr of caller's 1st arg @@ -291,7 +308,8 @@ TEXT sync∕atomic·AddInt32(SB), NOSPLIT, $0-0 MOVW R3, ret+16(FP) RET -TEXT sync∕atomic·AddInt64(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·AddInt64(SB), NOSPLIT, $0-24 + GO_ARGS // void __tsan_go_atomic64_fetch_add(ThreadState *thr, uptr cpc, uptr pc, u8 *a); MOVD $__tsan_go_atomic64_fetch_add(SB), R8 ADD $64, R1, R6 // addr of caller's 1st arg @@ -304,37 +322,45 @@ TEXT sync∕atomic·AddInt64(SB), NOSPLIT, $0-0 MOVD R3, ret+16(FP) RET -TEXT sync∕atomic·AddUint32(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·AddUint32(SB), NOSPLIT, $0-20 + GO_ARGS BR sync∕atomic·AddInt32(SB) -TEXT sync∕atomic·AddUint64(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·AddUint64(SB), NOSPLIT, $0-24 + GO_ARGS BR sync∕atomic·AddInt64(SB) -TEXT sync∕atomic·AddUintptr(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·AddUintptr(SB), NOSPLIT, $0-24 + GO_ARGS BR sync∕atomic·AddInt64(SB) // CompareAndSwap in tsan -TEXT sync∕atomic·CompareAndSwapInt32(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·CompareAndSwapInt32(SB), NOSPLIT, $0-17 + GO_ARGS // void __tsan_go_atomic32_compare_exchange( // ThreadState *thr, uptr cpc, uptr pc, u8 *a) MOVD $__tsan_go_atomic32_compare_exchange(SB), R8 ADD $32, R1, R6 // addr of caller's 1st arg BR racecallatomic<>(SB) -TEXT sync∕atomic·CompareAndSwapInt64(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·CompareAndSwapInt64(SB), NOSPLIT, $0-25 + GO_ARGS // void 
__tsan_go_atomic32_compare_exchange( // ThreadState *thr, uptr cpc, uptr pc, u8 *a) MOVD $__tsan_go_atomic64_compare_exchange(SB), R8 ADD $32, R1, R6 // addr of caller's 1st arg BR racecallatomic<>(SB) -TEXT sync∕atomic·CompareAndSwapUint32(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·CompareAndSwapUint32(SB), NOSPLIT, $0-17 + GO_ARGS BR sync∕atomic·CompareAndSwapInt32(SB) -TEXT sync∕atomic·CompareAndSwapUint64(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·CompareAndSwapUint64(SB), NOSPLIT, $0-25 + GO_ARGS BR sync∕atomic·CompareAndSwapInt64(SB) -TEXT sync∕atomic·CompareAndSwapUintptr(SB), NOSPLIT, $0-0 +TEXT sync∕atomic·CompareAndSwapUintptr(SB), NOSPLIT, $0-25 + GO_ARGS BR sync∕atomic·CompareAndSwapInt64(SB) // Common function used to call tsan's atomic functions diff --git a/src/runtime/rt0_darwin_arm64.s b/src/runtime/rt0_darwin_arm64.s index e3972f4924..0040361215 100644 --- a/src/runtime/rt0_darwin_arm64.s +++ b/src/runtime/rt0_darwin_arm64.s @@ -4,11 +4,14 @@ #include "textflag.h" -// No need for _rt0_arm64_darwin as darwin/arm64 only -// supports external linking. TEXT _rt0_arm64_darwin(SB),NOSPLIT|NOFRAME,$0 - MOVD $42, R0 - BL libc_exit(SB) + MOVD $runtime·rt0_go(SB), R2 + BL (R2) +exit: + MOVD $0, R0 + MOVD $1, R16 // sys_exit + SVC $0x80 + B exit // When linking with -buildmode=c-archive or -buildmode=c-shared, // this symbol is called from a global initialization function. @@ -86,11 +89,6 @@ GLOBL _rt0_arm64_darwin_lib_argc<>(SB),NOPTR, $8 DATA _rt0_arm64_darwin_lib_argv<>(SB)/8, $0 GLOBL _rt0_arm64_darwin_lib_argv<>(SB),NOPTR, $8 +// external linking entry point. TEXT main(SB),NOSPLIT|NOFRAME,$0 - MOVD $runtime·rt0_go(SB), R2 - BL (R2) -exit: - MOVD $0, R0 - MOVD $1, R16 // sys_exit - SVC $0x80 - B exit + JMP _rt0_arm64_darwin(SB) diff --git a/src/runtime/rt0_ios_amd64.s b/src/runtime/rt0_ios_amd64.s new file mode 100644 index 0000000000..c6990324f4 --- /dev/null +++ b/src/runtime/rt0_ios_amd64.s @@ -0,0 +1,14 @@ +// Copyright 2020 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// internal linking executable entry point. +// ios/amd64 only supports external linking. +TEXT _rt0_amd64_ios(SB),NOSPLIT|NOFRAME,$0 + UNDEF + +// library entry point. +TEXT _rt0_amd64_ios_lib(SB),NOSPLIT|NOFRAME,$0 + JMP _rt0_amd64_darwin_lib(SB) diff --git a/src/runtime/rt0_ios_arm64.s b/src/runtime/rt0_ios_arm64.s new file mode 100644 index 0000000000..dcc83656e2 --- /dev/null +++ b/src/runtime/rt0_ios_arm64.s @@ -0,0 +1,14 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// internal linking executable entry point. +// ios/arm64 only supports external linking. +TEXT _rt0_arm64_ios(SB),NOSPLIT|NOFRAME,$0 + UNDEF + +// library entry point. +TEXT _rt0_arm64_ios_lib(SB),NOSPLIT|NOFRAME,$0 + JMP _rt0_arm64_darwin_lib(SB) diff --git a/src/runtime/rt0_openbsd_mips64.s b/src/runtime/rt0_openbsd_mips64.s new file mode 100644 index 0000000000..82a8dfaba6 --- /dev/null +++ b/src/runtime/rt0_openbsd_mips64.s @@ -0,0 +1,36 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +TEXT _rt0_mips64_openbsd(SB),NOSPLIT,$0 + JMP _main<>(SB) + +TEXT _rt0_mips64le_openbsd(SB),NOSPLIT,$0 + JMP _main<>(SB) + +TEXT _main<>(SB),NOSPLIT|NOFRAME,$0 + // In a statically linked binary, the stack contains argc, + // argv as argc string pointers followed by a NULL, envv as a + // sequence of string pointers followed by a NULL, and auxv. + // There is no TLS base pointer. 
+#ifdef GOARCH_mips64 + MOVW 4(R29), R4 // argc, big-endian ABI places int32 at offset 4 +#else + MOVW 0(R29), R4 // argc +#endif + ADDV $8, R29, R5 // argv + JMP main(SB) + +TEXT main(SB),NOSPLIT|NOFRAME,$0 + // in external linking, glibc jumps to main with argc in R4 + // and argv in R5 + + // initialize REGSB = PC&0xffffffff00000000 + BGEZAL R0, 1(PC) + SRLV $32, R31, RSB + SLLV $32, RSB + + MOVV $runtime·rt0_go(SB), R1 + JMP (R1) diff --git a/src/runtime/runtime-gdb_test.go b/src/runtime/runtime-gdb_test.go index e52bd1c4c4..5df8c3c745 100644 --- a/src/runtime/runtime-gdb_test.go +++ b/src/runtime/runtime-gdb_test.go @@ -8,7 +8,6 @@ import ( "bytes" "fmt" "internal/testenv" - "io/ioutil" "os" "os/exec" "path/filepath" @@ -170,7 +169,7 @@ func testGdbPython(t *testing.T, cgo bool) { checkGdbVersion(t) checkGdbPython(t) - dir, err := ioutil.TempDir("", "go-build") + dir, err := os.MkdirTemp("", "go-build") if err != nil { t.Fatalf("failed to create temp directory: %v", err) } @@ -195,7 +194,7 @@ func testGdbPython(t *testing.T, cgo bool) { } } - err = ioutil.WriteFile(filepath.Join(dir, "main.go"), src, 0644) + err = os.WriteFile(filepath.Join(dir, "main.go"), src, 0644) if err != nil { t.Fatalf("failed to create file: %v", err) } @@ -404,7 +403,7 @@ func TestGdbBacktrace(t *testing.T) { t.Parallel() checkGdbVersion(t) - dir, err := ioutil.TempDir("", "go-build") + dir, err := os.MkdirTemp("", "go-build") if err != nil { t.Fatalf("failed to create temp directory: %v", err) } @@ -412,7 +411,7 @@ func TestGdbBacktrace(t *testing.T) { // Build the source code. 
src := filepath.Join(dir, "main.go") - err = ioutil.WriteFile(src, []byte(backtraceSource), 0644) + err = os.WriteFile(src, []byte(backtraceSource), 0644) if err != nil { t.Fatalf("failed to create file: %v", err) } @@ -482,7 +481,7 @@ func TestGdbAutotmpTypes(t *testing.T) { t.Skip("TestGdbAutotmpTypes is too slow on aix/ppc64") } - dir, err := ioutil.TempDir("", "go-build") + dir, err := os.MkdirTemp("", "go-build") if err != nil { t.Fatalf("failed to create temp directory: %v", err) } @@ -490,7 +489,7 @@ func TestGdbAutotmpTypes(t *testing.T) { // Build the source code. src := filepath.Join(dir, "main.go") - err = ioutil.WriteFile(src, []byte(autotmpTypeSource), 0644) + err = os.WriteFile(src, []byte(autotmpTypeSource), 0644) if err != nil { t.Fatalf("failed to create file: %v", err) } @@ -551,7 +550,7 @@ func TestGdbConst(t *testing.T) { t.Parallel() checkGdbVersion(t) - dir, err := ioutil.TempDir("", "go-build") + dir, err := os.MkdirTemp("", "go-build") if err != nil { t.Fatalf("failed to create temp directory: %v", err) } @@ -559,7 +558,7 @@ func TestGdbConst(t *testing.T) { // Build the source code. src := filepath.Join(dir, "main.go") - err = ioutil.WriteFile(src, []byte(constsSource), 0644) + err = os.WriteFile(src, []byte(constsSource), 0644) if err != nil { t.Fatalf("failed to create file: %v", err) } @@ -618,7 +617,7 @@ func TestGdbPanic(t *testing.T) { t.Parallel() checkGdbVersion(t) - dir, err := ioutil.TempDir("", "go-build") + dir, err := os.MkdirTemp("", "go-build") if err != nil { t.Fatalf("failed to create temp directory: %v", err) } @@ -626,7 +625,7 @@ func TestGdbPanic(t *testing.T) { // Build the source code. 
src := filepath.Join(dir, "main.go") - err = ioutil.WriteFile(src, []byte(panicSource), 0644) + err = os.WriteFile(src, []byte(panicSource), 0644) if err != nil { t.Fatalf("failed to create file: %v", err) } @@ -696,7 +695,7 @@ func TestGdbInfCallstack(t *testing.T) { t.Parallel() checkGdbVersion(t) - dir, err := ioutil.TempDir("", "go-build") + dir, err := os.MkdirTemp("", "go-build") if err != nil { t.Fatalf("failed to create temp directory: %v", err) } @@ -704,7 +703,7 @@ func TestGdbInfCallstack(t *testing.T) { // Build the source code. src := filepath.Join(dir, "main.go") - err = ioutil.WriteFile(src, []byte(InfCallstackSource), 0644) + err = os.WriteFile(src, []byte(InfCallstackSource), 0644) if err != nil { t.Fatalf("failed to create file: %v", err) } diff --git a/src/runtime/runtime-lldb_test.go b/src/runtime/runtime-lldb_test.go index 1e2e5d5be9..c923b872aa 100644 --- a/src/runtime/runtime-lldb_test.go +++ b/src/runtime/runtime-lldb_test.go @@ -6,7 +6,6 @@ package runtime_test import ( "internal/testenv" - "io/ioutil" "os" "os/exec" "path/filepath" @@ -143,20 +142,20 @@ func TestLldbPython(t *testing.T) { checkLldbPython(t) - dir, err := ioutil.TempDir("", "go-build") + dir, err := os.MkdirTemp("", "go-build") if err != nil { t.Fatalf("failed to create temp directory: %v", err) } defer os.RemoveAll(dir) src := filepath.Join(dir, "main.go") - err = ioutil.WriteFile(src, []byte(lldbHelloSource), 0644) + err = os.WriteFile(src, []byte(lldbHelloSource), 0644) if err != nil { t.Fatalf("failed to create src file: %v", err) } mod := filepath.Join(dir, "go.mod") - err = ioutil.WriteFile(mod, []byte("module lldbtest"), 0644) + err = os.WriteFile(mod, []byte("module lldbtest"), 0644) if err != nil { t.Fatalf("failed to create mod file: %v", err) } @@ -172,7 +171,7 @@ func TestLldbPython(t *testing.T) { } src = filepath.Join(dir, "script.py") - err = ioutil.WriteFile(src, []byte(lldbScriptSource), 0755) + err = os.WriteFile(src, []byte(lldbScriptSource), 0755) if err 
!= nil { t.Fatalf("failed to create script: %v", err) } diff --git a/src/runtime/runtime1.go b/src/runtime/runtime1.go index 7c893aa25c..30b7044bff 100644 --- a/src/runtime/runtime1.go +++ b/src/runtime/runtime1.go @@ -300,7 +300,6 @@ type dbgVar struct { // existing int var for that value, which may // already have an initial value. var debug struct { - allocfreetrace int32 cgocheck int32 clobberfree int32 efence int32 @@ -311,13 +310,20 @@ var debug struct { gctrace int32 invalidptr int32 madvdontneed int32 // for Linux; issue 28466 - sbrk int32 scavenge int32 scavtrace int32 scheddetail int32 schedtrace int32 tracebackancestors int32 asyncpreemptoff int32 + + // debug.malloc is used as a combined debug check + // in the malloc function and should be set + // if any of the below debug options is != 0. + malloc bool + allocfreetrace int32 + inittrace int32 + sbrk int32 } var dbgvars = []dbgVar{ @@ -339,12 +345,24 @@ var dbgvars = []dbgVar{ {"schedtrace", &debug.schedtrace}, {"tracebackancestors", &debug.tracebackancestors}, {"asyncpreemptoff", &debug.asyncpreemptoff}, + {"inittrace", &debug.inittrace}, } func parsedebugvars() { // defaults debug.cgocheck = 1 debug.invalidptr = 1 + if GOOS == "linux" { + // On Linux, MADV_FREE is faster than MADV_DONTNEED, + // but doesn't affect many of the statistics that + // MADV_DONTNEED does until the memory is actually + // reclaimed. This generally leads to poor user + // experience, like confusing stats in top and other + // monitoring tools; and bad integration with + // management systems that respond to memory usage. + // Hence, default to MADV_DONTNEED. 
+ debug.madvdontneed = 1 + } for p := gogetenv("GODEBUG"); p != ""; { field := "" @@ -378,6 +396,8 @@ func parsedebugvars() { } } + debug.malloc = (debug.allocfreetrace | debug.inittrace | debug.sbrk) != 0 + setTraceback(gogetenv("GOTRACEBACK")) traceback_env = traceback_cache } diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index a3157037e7..c9376827da 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -387,14 +387,6 @@ type libcall struct { err uintptr // error number } -// describes how to handle callback -type wincallbackcontext struct { - gobody unsafe.Pointer // go function to call - argsize uintptr // callback arguments size (in bytes) - restorestack uintptr // adjust stack on return by (in bytes) (386 only) - cleanstack bool -} - // Stack describes a Go execution stack. // The bounds of the stack are exactly [lo, hi), // with no implicit data structures on either side. @@ -453,6 +445,10 @@ type g struct { // copying needs to acquire channel locks to protect these // areas of the stack. activeStackChans bool + // parkingOnChan indicates that the goroutine is about to + // park on a chansend or chanrecv. Used to signal an unsafe point + // for stack shrinking. It's a boolean value, but is updated atomically. + parkingOnChan uint8 raceignore int8 // ignore race detection events sysblocktraced bool // StartTrace has emitted EvGoInSyscall about this goroutine @@ -524,6 +520,7 @@ type m struct { ncgo int32 // number of cgo calls currently in progress cgoCallersUse uint32 // if non-zero, cgoCallers in use temporarily cgoCallers *cgoCallers // cgo traceback if crashing in cgo call + doesPark bool // non-P running threads: sysmon and newmHandoff never use .park park note alllink *m // on allm schedlink muintptr @@ -540,6 +537,13 @@ type m struct { syscalltick uint32 freelink *m // on sched.freem + // mFixup is used to synchronize OS related m state (credentials etc) + // use mutex to access. 
+ mFixup struct { + lock mutex + fn func(bool) bool + } + // these are here because they are too large to be on the stack // of low-level NOSPLIT functions. libcall libcall @@ -642,14 +646,25 @@ type p struct { // This is 0 if the timer heap is empty. timer0When uint64 + // The earliest known nextwhen field of a timer with + // timerModifiedEarlier status. Because the timer may have been + // modified again, there need not be any timer with this value. + // This is updated using atomic functions. + // This is 0 if the value is unknown. + timerModifiedEarliest uint64 + // Per-P GC state - gcAssistTime int64 // Nanoseconds in assistAlloc - gcFractionalMarkTime int64 // Nanoseconds in fractional mark worker (atomic) - gcBgMarkWorker guintptr // (atomic) - gcMarkWorkerMode gcMarkWorkerMode + gcAssistTime int64 // Nanoseconds in assistAlloc + gcFractionalMarkTime int64 // Nanoseconds in fractional mark worker (atomic) - // gcMarkWorkerStartTime is the nanotime() at which this mark - // worker started. + // gcMarkWorkerMode is the mode for the next mark worker to run in. + // That is, this is used to communicate with the worker goroutine + // selected for immediate execution by + // gcController.findRunnableGCWorker. When scheduling other goroutines, + // this field must be set to gcMarkWorkerNotWorker. + gcMarkWorkerMode gcMarkWorkerMode + // gcMarkWorkerStartTime is the nanotime() at which the most recent + // mark worker started. gcMarkWorkerStartTime int64 // gcw is this P's GC work buffer cache. The work buffer is @@ -664,6 +679,10 @@ type p struct { runSafePointFn uint32 // if 1, run sched.safePointFn at next safe point + // statsSeq is a counter indicating whether this P is currently + // writing any stats. Its value is even when not, odd when it is. + statsSeq uint32 + // Lock for timers. We normally access the timers while running // on this P, but the scheduler can also do it from a different P. 
timersLock mutex @@ -764,6 +783,10 @@ type schedt struct { sysmonwait uint32 sysmonnote note + // While true, sysmon not ready for mFixup calls. + // Accessed atomically. + sysmonStarting uint32 + // safepointFn should be called on each P at the next GC // safepoint if p.runSafePointFn is set. safePointFn func(*p) @@ -806,13 +829,14 @@ type _func struct { args int32 // in/out args size deferreturn uint32 // offset of start of a deferreturn call instruction from entry, if any. - pcsp int32 - pcfile int32 - pcln int32 - npcdata int32 - cuIndex uint16 // TODO(jfaller): 16 bits is never enough, make this larger. - funcID funcID // set for certain special runtime functions - nfuncdata uint8 // must be last + pcsp uint32 + pcfile uint32 + pcln uint32 + npcdata uint32 + cuOffset uint32 // runtime.cutab offset of this function's CU + funcID funcID // set for certain special runtime functions + _ [2]byte // pad + nfuncdata uint8 // must be last } // Pseudo-Func that is returned for PCs that occur in inlined code. @@ -1030,14 +1054,40 @@ func (w waitReason) String() string { var ( allglen uintptr allm *m - allp []*p // len(allp) == gomaxprocs; may change at safe points, otherwise immutable - allpLock mutex // Protects P-less reads of allp and all writes gomaxprocs int32 ncpu int32 forcegc forcegcstate sched schedt newprocs int32 + // allpLock protects P-less reads and size changes of allp, idlepMask, + // and timerpMask, and all writes to allp. + allpLock mutex + // len(allp) == gomaxprocs; may change at safe points, otherwise + // immutable. + allp []*p + // Bitmask of Ps in _Pidle list, one bit per P. Reads and writes must + // be atomic. Length may change at safe points. + // + // Each P must update only its own bit. In order to maintain + // consistency, a P going idle must the idle mask simultaneously with + // updates to the idle P list under the sched.lock, otherwise a racing + // pidleget may clear the mask before pidleput sets the mask, + // corrupting the bitmap. 
+ // + // N.B., procresize takes ownership of all Ps in stopTheWorldWithSema. + idlepMask pMask + // Bitmask of Ps that may have a timer, one bit per P. Reads and writes + // must be atomic. Length may change at safe points. + timerpMask pMask + + // Pool of GC parked background workers. Entries are type + // *gcBgMarkWorkerNode. + gcBgMarkWorkerPool lfstack + + // Total number of gcBgMarkWorker goroutines. Protected by worldsema. + gcBgMarkWorkerCount int32 + // Information about what cpu features are available. // Packages outside the runtime should not use these // as they are not an external api. @@ -1056,4 +1106,4 @@ var ( ) // Must agree with cmd/internal/objabi.Framepointer_enabled. -const framepointer_enabled = GOARCH == "amd64" || GOARCH == "arm64" && (GOOS == "linux" || GOOS == "darwin") +const framepointer_enabled = GOARCH == "amd64" || GOARCH == "arm64" && (GOOS == "linux" || GOOS == "darwin" || GOOS == "ios") diff --git a/src/runtime/select.go b/src/runtime/select.go index a506747910..e72761bfa9 100644 --- a/src/runtime/select.go +++ b/src/runtime/select.go @@ -7,6 +7,7 @@ package runtime // This file contains the implementation of Go select statements. import ( + "runtime/internal/atomic" "unsafe" ) @@ -61,7 +62,20 @@ func selunlock(scases []scase, lockorder []uint16) { func selparkcommit(gp *g, _ unsafe.Pointer) bool { // There are unlocked sudogs that point into gp's stack. Stack // copying must lock the channels of those sudogs. + // Set activeStackChans here instead of before we try parking + // because we could self-deadlock in stack growth on a + // channel lock. gp.activeStackChans = true + // Mark that it's safe for stack shrinking to occur now, + // because any thread acquiring this G's stack for shrinking + // is guaranteed to observe activeStackChans after this store. + atomic.Store8(&gp.parkingOnChan, 0) + // Make sure we unlock after setting activeStackChans and + // unsetting parkingOnChan. 
The moment we unlock any of the + // channel locks we risk gp getting readied by a channel operation + // and so gp could continue running before everything before the + // unlock is visible (even to gp itself). + // This must not access gp's stack (see gopark). In // particular, it must not access the *hselect. That's okay, // because by the time this is called, gp.waiting has all @@ -305,6 +319,11 @@ func selectgo(cas0 *scase, order0 *uint16, pc0 *uintptr, nsends, nrecvs int, blo // wait for someone to wake us up gp.param = nil + // Signal to anyone trying to shrink our stack that we're about + // to park on a channel. The window between when this G's status + // changes and when we set gp.activeStackChans is not safe for + // stack shrinking. + atomic.Store8(&gp.parkingOnChan, 1) gopark(selparkcommit, nil, waitReasonSelect, traceEvGoBlockSelect, 1) gp.activeStackChans = false @@ -396,8 +415,7 @@ bufrecv: if cas.elem != nil { raceWriteObjectPC(c.elemtype, cas.elem, casePC(casi), chanrecvpc) } - raceacquire(chanbuf(c, c.recvx)) - racerelease(chanbuf(c, c.recvx)) + racenotify(c, c.recvx, nil) } if msanenabled && cas.elem != nil { msanwrite(cas.elem, c.elemtype.size) @@ -419,8 +437,7 @@ bufrecv: bufsend: // can send to buffer if raceenabled { - raceacquire(chanbuf(c, c.sendx)) - racerelease(chanbuf(c, c.sendx)) + racenotify(c, c.sendx, nil) raceReadObjectPC(c.elemtype, cas.elem, casePC(casi), chansendpc) } if msanenabled { diff --git a/src/runtime/signal_mips64x.go b/src/runtime/signal_mips64x.go index 040c959f04..2a347ff4cb 100644 --- a/src/runtime/signal_mips64x.go +++ b/src/runtime/signal_mips64x.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
-// +build linux +// +build linux openbsd // +build mips64 mips64le package runtime diff --git a/src/runtime/signal_openbsd_mips64.go b/src/runtime/signal_openbsd_mips64.go new file mode 100644 index 0000000000..54ed523c7b --- /dev/null +++ b/src/runtime/signal_openbsd_mips64.go @@ -0,0 +1,78 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "unsafe" +) + +type sigctxt struct { + info *siginfo + ctxt unsafe.Pointer +} + +//go:nosplit +//go:nowritebarrierrec +func (c *sigctxt) regs() *sigcontext { + return (*sigcontext)(c.ctxt) +} + +func (c *sigctxt) r0() uint64 { return c.regs().sc_regs[0] } +func (c *sigctxt) r1() uint64 { return c.regs().sc_regs[1] } +func (c *sigctxt) r2() uint64 { return c.regs().sc_regs[2] } +func (c *sigctxt) r3() uint64 { return c.regs().sc_regs[3] } +func (c *sigctxt) r4() uint64 { return c.regs().sc_regs[4] } +func (c *sigctxt) r5() uint64 { return c.regs().sc_regs[5] } +func (c *sigctxt) r6() uint64 { return c.regs().sc_regs[6] } +func (c *sigctxt) r7() uint64 { return c.regs().sc_regs[7] } +func (c *sigctxt) r8() uint64 { return c.regs().sc_regs[8] } +func (c *sigctxt) r9() uint64 { return c.regs().sc_regs[9] } +func (c *sigctxt) r10() uint64 { return c.regs().sc_regs[10] } +func (c *sigctxt) r11() uint64 { return c.regs().sc_regs[11] } +func (c *sigctxt) r12() uint64 { return c.regs().sc_regs[12] } +func (c *sigctxt) r13() uint64 { return c.regs().sc_regs[13] } +func (c *sigctxt) r14() uint64 { return c.regs().sc_regs[14] } +func (c *sigctxt) r15() uint64 { return c.regs().sc_regs[15] } +func (c *sigctxt) r16() uint64 { return c.regs().sc_regs[16] } +func (c *sigctxt) r17() uint64 { return c.regs().sc_regs[17] } +func (c *sigctxt) r18() uint64 { return c.regs().sc_regs[18] } +func (c *sigctxt) r19() uint64 { return c.regs().sc_regs[19] } +func (c *sigctxt) r20() uint64 { return 
c.regs().sc_regs[20] } +func (c *sigctxt) r21() uint64 { return c.regs().sc_regs[21] } +func (c *sigctxt) r22() uint64 { return c.regs().sc_regs[22] } +func (c *sigctxt) r23() uint64 { return c.regs().sc_regs[23] } +func (c *sigctxt) r24() uint64 { return c.regs().sc_regs[24] } +func (c *sigctxt) r25() uint64 { return c.regs().sc_regs[25] } +func (c *sigctxt) r26() uint64 { return c.regs().sc_regs[26] } +func (c *sigctxt) r27() uint64 { return c.regs().sc_regs[27] } +func (c *sigctxt) r28() uint64 { return c.regs().sc_regs[28] } +func (c *sigctxt) r29() uint64 { return c.regs().sc_regs[29] } +func (c *sigctxt) r30() uint64 { return c.regs().sc_regs[30] } +func (c *sigctxt) r31() uint64 { return c.regs().sc_regs[31] } +func (c *sigctxt) sp() uint64 { return c.regs().sc_regs[29] } + +//go:nosplit +//go:nowritebarrierrec +func (c *sigctxt) pc() uint64 { return c.regs().sc_pc } + +func (c *sigctxt) link() uint64 { return c.regs().sc_regs[31] } +func (c *sigctxt) lo() uint64 { return c.regs().mullo } +func (c *sigctxt) hi() uint64 { return c.regs().mulhi } + +func (c *sigctxt) sigcode() uint32 { return uint32(c.info.si_code) } +func (c *sigctxt) sigaddr() uint64 { + return *(*uint64)(add(unsafe.Pointer(c.info), 16)) +} + +func (c *sigctxt) set_r28(x uint64) { c.regs().sc_regs[28] = x } +func (c *sigctxt) set_r30(x uint64) { c.regs().sc_regs[30] = x } +func (c *sigctxt) set_pc(x uint64) { c.regs().sc_pc = x } +func (c *sigctxt) set_sp(x uint64) { c.regs().sc_regs[29] = x } +func (c *sigctxt) set_link(x uint64) { c.regs().sc_regs[31] = x } + +func (c *sigctxt) set_sigcode(x uint32) { c.info.si_code = int32(x) } +func (c *sigctxt) set_sigaddr(x uint64) { + *(*uint64)(add(unsafe.Pointer(c.info), 16)) = x +} diff --git a/src/runtime/signal_unix.go b/src/runtime/signal_unix.go index 064a0ea100..e8f39c3321 100644 --- a/src/runtime/signal_unix.go +++ b/src/runtime/signal_unix.go @@ -335,6 +335,10 @@ func doSigPreempt(gp *g, ctxt *sigctxt) { // Acknowledge the preemption. 
atomic.Xadd(&gp.m.preemptGen, 1) atomic.Store(&gp.m.signalPending, 0) + + if GOOS == "darwin" || GOOS == "ios" { + atomic.Xadd(&pendingPreemptSignals, -1) + } } const preemptMSupported = true @@ -346,17 +350,17 @@ const preemptMSupported = true // safe-point, it will preempt the goroutine. It always atomically // increments mp.preemptGen after handling a preemption request. func preemptM(mp *m) { - if GOOS == "darwin" && GOARCH == "arm64" && !iscgo { - // On darwin, we use libc calls, and cgo is required on ARM64 - // so we have TLS set up to save/restore G during C calls. If cgo is - // absent, we cannot save/restore G in TLS, and if a signal is - // received during C execution we cannot get the G. Therefore don't - // send signals. - // This can only happen in the go_bootstrap program (otherwise cgo is - // required). - return + // On Darwin, don't try to preempt threads during exec. + // Issue #41702. + if GOOS == "darwin" || GOOS == "ios" { + execLock.rlock() } + if atomic.Cas(&mp.signalPending, 0, 1) { + if GOOS == "darwin" || GOOS == "ios" { + atomic.Xadd(&pendingPreemptSignals, 1) + } + // If multiple threads are preempting the same M, it may send many // signals to the same M such that it hardly make progress, causing // live-lock problem. Apparently this could happen on darwin. See @@ -364,6 +368,10 @@ func preemptM(mp *m) { // Only send a signal if there isn't already one pending. signalM(mp, sigPreempt) } + + if GOOS == "darwin" || GOOS == "ios" { + execLock.runlock() + } } // sigFetchG fetches the value of G safely when running in a signal handler. @@ -424,6 +432,9 @@ func sigtrampgo(sig uint32, info *siginfo, ctx unsafe.Pointer) { // no non-Go signal handler for sigPreempt. // The default behavior for sigPreempt is to ignore // the signal, so badsignal will be a no-op anyway. 
+ if GOOS == "darwin" || GOOS == "ios" { + atomic.Xadd(&pendingPreemptSignals, -1) + } return } c.fixsigcode(sig) @@ -482,14 +493,14 @@ func adjustSignalStack(sig uint32, mp *m, gsigStack *gsignalStack) bool { sigaltstack(nil, &st) if st.ss_flags&_SS_DISABLE != 0 { setg(nil) - needm(0) + needm() noSignalStack(sig) dropm() } stsp := uintptr(unsafe.Pointer(st.ss_sp)) if sp < stsp || sp >= stsp+st.ss_size { setg(nil) - needm(0) + needm() sigNotOnStack(sig) dropm() } @@ -710,7 +721,7 @@ func sigpanic() { } // Support runtime/debug.SetPanicOnFault. if g.paniconfault { - panicmem() + panicmemAddr(g.sigcode1) } print("unexpected fault address ", hex(g.sigcode1), "\n") throw("fault") @@ -720,7 +731,7 @@ func sigpanic() { } // Support runtime/debug.SetPanicOnFault. if g.paniconfault { - panicmem() + panicmemAddr(g.sigcode1) } print("unexpected fault address ", hex(g.sigcode1), "\n") throw("fault") @@ -929,7 +940,7 @@ func badsignal(sig uintptr, c *sigctxt) { exit(2) *(*uintptr)(unsafe.Pointer(uintptr(123))) = 2 } - needm(0) + needm() if !sigsend(uint32(sig)) { // A foreign thread received the signal sig, and the // Go code does not want to handle it. @@ -975,7 +986,7 @@ func sigfwdgo(sig uint32, info *siginfo, ctx unsafe.Pointer) bool { // This function and its caller sigtrampgo assumes SIGPIPE is delivered on the // originating thread. This property does not hold on macOS (golang.org/issue/33384), // so we have no choice but to ignore SIGPIPE. - if GOOS == "darwin" && sig == _SIGPIPE { + if (GOOS == "darwin" || GOOS == "ios") && sig == _SIGPIPE { return true } @@ -1009,15 +1020,15 @@ func sigfwdgo(sig uint32, info *siginfo, ctx unsafe.Pointer) bool { return true } -// msigsave saves the current thread's signal mask into mp.sigmask. +// sigsave saves the current thread's signal mask into *p. // This is used to preserve the non-Go signal mask when a non-Go // thread calls a Go function. 
// This is nosplit and nowritebarrierrec because it is called by needm // which may be called on a non-Go thread with no g available. //go:nosplit //go:nowritebarrierrec -func msigsave(mp *m) { - sigprocmask(_SIG_SETMASK, nil, &mp.sigmask) +func sigsave(p *sigset) { + sigprocmask(_SIG_SETMASK, nil, p) } // msigrestore sets the current thread's signal mask to sigmask. @@ -1089,7 +1100,7 @@ func minitSignalStack() { // thread's signal mask. When this is called all signals have been // blocked for the thread. This starts with m.sigmask, which was set // either from initSigmask for a newly created thread or by calling -// msigsave if this is a non-Go thread calling a Go function. It +// sigsave if this is a non-Go thread calling a Go function. It // removes all essential signals from the mask, thus causing those // signals to not be blocked. Then it sets the thread's signal mask. // After this is called the thread can receive signals. diff --git a/src/runtime/signal_windows.go b/src/runtime/signal_windows.go index d123276d3e..3af2e39b08 100644 --- a/src/runtime/signal_windows.go +++ b/src/runtime/signal_windows.go @@ -43,16 +43,9 @@ func initExceptionHandler() { // //go:nosplit func isAbort(r *context) bool { - switch GOARCH { - case "386", "amd64": - // In the case of an abort, the exception IP is one byte after - // the INT3 (this differs from UNIX OSes). - return isAbortPC(r.ip() - 1) - case "arm": - return isAbortPC(r.ip()) - default: - return false - } + // In the case of an abort, the exception IP is one byte after + // the INT3 (this differs from UNIX OSes). 
+ return isAbortPC(r.ip() - 1) } // isgoexception reports whether this exception should be translated @@ -242,9 +235,12 @@ func sigpanic() { switch g.sig { case _EXCEPTION_ACCESS_VIOLATION: - if g.sigcode1 < 0x1000 || g.paniconfault { + if g.sigcode1 < 0x1000 { panicmem() } + if g.paniconfault { + panicmemAddr(g.sigcode1) + } print("unexpected fault address ", hex(g.sigcode1), "\n") throw("fault") case _EXCEPTION_INT_DIVIDE_BY_ZERO: diff --git a/src/runtime/signal_windows_test.go b/src/runtime/signal_windows_test.go index f99857193c..a5a885c2f7 100644 --- a/src/runtime/signal_windows_test.go +++ b/src/runtime/signal_windows_test.go @@ -7,7 +7,6 @@ import ( "bytes" "fmt" "internal/testenv" - "io/ioutil" "os" "os/exec" "path/filepath" @@ -28,7 +27,7 @@ func TestVectoredHandlerDontCrashOnLibrary(t *testing.T) { testenv.MustHaveExecPath(t, "gcc") testprog.Lock() defer testprog.Unlock() - dir, err := ioutil.TempDir("", "go-build") + dir, err := os.MkdirTemp("", "go-build") if err != nil { t.Fatalf("failed to create temp directory: %v", err) } @@ -93,7 +92,7 @@ func TestLibraryCtrlHandler(t *testing.T) { testenv.MustHaveExecPath(t, "gcc") testprog.Lock() defer testprog.Unlock() - dir, err := ioutil.TempDir("", "go-build") + dir, err := os.MkdirTemp("", "go-build") if err != nil { t.Fatalf("failed to create temp directory: %v", err) } diff --git a/src/runtime/sigqueue.go b/src/runtime/sigqueue.go index 3bf07cb5a6..0605f5da80 100644 --- a/src/runtime/sigqueue.go +++ b/src/runtime/sigqueue.go @@ -105,7 +105,7 @@ Send: break Send case sigReceiving: if atomic.Cas(&sig.state, sigReceiving, sigIdle) { - if GOOS == "darwin" { + if GOOS == "darwin" || GOOS == "ios" { sigNoteWakeup(&sig.note) break Send } @@ -140,7 +140,7 @@ func signal_recv() uint32 { throw("signal_recv: inconsistent state") case sigIdle: if atomic.Cas(&sig.state, sigIdle, sigReceiving) { - if GOOS == "darwin" { + if GOOS == "darwin" || GOOS == "ios" { sigNoteSleep(&sig.note) break Receive } @@ -194,7 +194,7 @@ 
func signal_enable(s uint32) { if !sig.inuse { // This is the first call to signal_enable. Initialize. sig.inuse = true // enable reception of signals; cannot disable - if GOOS == "darwin" { + if GOOS == "darwin" || GOOS == "ios" { sigNoteSetup(&sig.note) } else { noteclear(&sig.note) diff --git a/src/runtime/sigtab_linux_generic.go b/src/runtime/sigtab_linux_generic.go index b26040b803..38d686544f 100644 --- a/src/runtime/sigtab_linux_generic.go +++ b/src/runtime/sigtab_linux_generic.go @@ -45,7 +45,7 @@ var sigtable = [...]sigTabT{ /* 31 */ {_SigThrow, "SIGSYS: bad system call"}, /* 32 */ {_SigSetStack + _SigUnblock, "signal 32"}, /* SIGCANCEL; see issue 6997 */ /* 33 */ {_SigSetStack + _SigUnblock, "signal 33"}, /* SIGSETXID; see issues 3871, 9400, 12498 */ - /* 34 */ {_SigNotify, "signal 34"}, + /* 34 */ {_SigSetStack + _SigUnblock, "signal 34"}, /* musl SIGSYNCCALL; see issue 39343 */ /* 35 */ {_SigNotify, "signal 35"}, /* 36 */ {_SigNotify, "signal 36"}, /* 37 */ {_SigNotify, "signal 37"}, diff --git a/src/runtime/sigtab_linux_mipsx.go b/src/runtime/sigtab_linux_mipsx.go index 81dd2314c5..51ef470ce7 100644 --- a/src/runtime/sigtab_linux_mipsx.go +++ b/src/runtime/sigtab_linux_mipsx.go @@ -42,7 +42,7 @@ var sigtable = [...]sigTabT{ /* 31 */ {_SigNotify, "SIGXFSZ: file size limit exceeded"}, /* 32 */ {_SigSetStack + _SigUnblock, "signal 32"}, /* SIGCANCEL; see issue 6997 */ /* 33 */ {_SigSetStack + _SigUnblock, "signal 33"}, /* SIGSETXID; see issues 3871, 9400, 12498 */ - /* 34 */ {_SigNotify, "signal 34"}, + /* 34 */ {_SigSetStack + _SigUnblock, "signal 34"}, /* musl SIGSYNCCALL; see issue 39343 */ /* 35 */ {_SigNotify, "signal 35"}, /* 36 */ {_SigNotify, "signal 36"}, /* 37 */ {_SigNotify, "signal 37"}, diff --git a/src/runtime/sizeclasses.go b/src/runtime/sizeclasses.go index 9c1b44fe0b..c5521ce1bd 100644 --- a/src/runtime/sizeclasses.go +++ b/src/runtime/sizeclasses.go @@ -6,82 +6,83 @@ package runtime // class bytes/obj bytes/span objects tail waste max 
waste // 1 8 8192 1024 0 87.50% // 2 16 8192 512 0 43.75% -// 3 32 8192 256 0 46.88% -// 4 48 8192 170 32 31.52% -// 5 64 8192 128 0 23.44% -// 6 80 8192 102 32 19.07% -// 7 96 8192 85 32 15.95% -// 8 112 8192 73 16 13.56% -// 9 128 8192 64 0 11.72% -// 10 144 8192 56 128 11.82% -// 11 160 8192 51 32 9.73% -// 12 176 8192 46 96 9.59% -// 13 192 8192 42 128 9.25% -// 14 208 8192 39 80 8.12% -// 15 224 8192 36 128 8.15% -// 16 240 8192 34 32 6.62% -// 17 256 8192 32 0 5.86% -// 18 288 8192 28 128 12.16% -// 19 320 8192 25 192 11.80% -// 20 352 8192 23 96 9.88% -// 21 384 8192 21 128 9.51% -// 22 416 8192 19 288 10.71% -// 23 448 8192 18 128 8.37% -// 24 480 8192 17 32 6.82% -// 25 512 8192 16 0 6.05% -// 26 576 8192 14 128 12.33% -// 27 640 8192 12 512 15.48% -// 28 704 8192 11 448 13.93% -// 29 768 8192 10 512 13.94% -// 30 896 8192 9 128 15.52% -// 31 1024 8192 8 0 12.40% -// 32 1152 8192 7 128 12.41% -// 33 1280 8192 6 512 15.55% -// 34 1408 16384 11 896 14.00% -// 35 1536 8192 5 512 14.00% -// 36 1792 16384 9 256 15.57% -// 37 2048 8192 4 0 12.45% -// 38 2304 16384 7 256 12.46% -// 39 2688 8192 3 128 15.59% -// 40 3072 24576 8 0 12.47% -// 41 3200 16384 5 384 6.22% -// 42 3456 24576 7 384 8.83% -// 43 4096 8192 2 0 15.60% -// 44 4864 24576 5 256 16.65% -// 45 5376 16384 3 256 10.92% -// 46 6144 24576 4 0 12.48% -// 47 6528 32768 5 128 6.23% -// 48 6784 40960 6 256 4.36% -// 49 6912 49152 7 768 3.37% -// 50 8192 8192 1 0 15.61% -// 51 9472 57344 6 512 14.28% -// 52 9728 49152 5 512 3.64% -// 53 10240 40960 4 0 4.99% -// 54 10880 32768 3 128 6.24% -// 55 12288 24576 2 0 11.45% -// 56 13568 40960 3 256 9.99% -// 57 14336 57344 4 0 5.35% -// 58 16384 16384 1 0 12.49% -// 59 18432 73728 4 0 11.11% -// 60 19072 57344 3 128 3.57% -// 61 20480 40960 2 0 6.87% -// 62 21760 65536 3 256 6.25% -// 63 24576 24576 1 0 11.45% -// 64 27264 81920 3 128 10.00% -// 65 28672 57344 2 0 4.91% -// 66 32768 32768 1 0 12.50% +// 3 24 8192 341 8 29.24% +// 4 32 8192 256 0 21.88% +// 5 48 
8192 170 32 31.52% +// 6 64 8192 128 0 23.44% +// 7 80 8192 102 32 19.07% +// 8 96 8192 85 32 15.95% +// 9 112 8192 73 16 13.56% +// 10 128 8192 64 0 11.72% +// 11 144 8192 56 128 11.82% +// 12 160 8192 51 32 9.73% +// 13 176 8192 46 96 9.59% +// 14 192 8192 42 128 9.25% +// 15 208 8192 39 80 8.12% +// 16 224 8192 36 128 8.15% +// 17 240 8192 34 32 6.62% +// 18 256 8192 32 0 5.86% +// 19 288 8192 28 128 12.16% +// 20 320 8192 25 192 11.80% +// 21 352 8192 23 96 9.88% +// 22 384 8192 21 128 9.51% +// 23 416 8192 19 288 10.71% +// 24 448 8192 18 128 8.37% +// 25 480 8192 17 32 6.82% +// 26 512 8192 16 0 6.05% +// 27 576 8192 14 128 12.33% +// 28 640 8192 12 512 15.48% +// 29 704 8192 11 448 13.93% +// 30 768 8192 10 512 13.94% +// 31 896 8192 9 128 15.52% +// 32 1024 8192 8 0 12.40% +// 33 1152 8192 7 128 12.41% +// 34 1280 8192 6 512 15.55% +// 35 1408 16384 11 896 14.00% +// 36 1536 8192 5 512 14.00% +// 37 1792 16384 9 256 15.57% +// 38 2048 8192 4 0 12.45% +// 39 2304 16384 7 256 12.46% +// 40 2688 8192 3 128 15.59% +// 41 3072 24576 8 0 12.47% +// 42 3200 16384 5 384 6.22% +// 43 3456 24576 7 384 8.83% +// 44 4096 8192 2 0 15.60% +// 45 4864 24576 5 256 16.65% +// 46 5376 16384 3 256 10.92% +// 47 6144 24576 4 0 12.48% +// 48 6528 32768 5 128 6.23% +// 49 6784 40960 6 256 4.36% +// 50 6912 49152 7 768 3.37% +// 51 8192 8192 1 0 15.61% +// 52 9472 57344 6 512 14.28% +// 53 9728 49152 5 512 3.64% +// 54 10240 40960 4 0 4.99% +// 55 10880 32768 3 128 6.24% +// 56 12288 24576 2 0 11.45% +// 57 13568 40960 3 256 9.99% +// 58 14336 57344 4 0 5.35% +// 59 16384 16384 1 0 12.49% +// 60 18432 73728 4 0 11.11% +// 61 19072 57344 3 128 3.57% +// 62 20480 40960 2 0 6.87% +// 63 21760 65536 3 256 6.25% +// 64 24576 24576 1 0 11.45% +// 65 27264 81920 3 128 10.00% +// 66 28672 57344 2 0 4.91% +// 67 32768 32768 1 0 12.50% const ( _MaxSmallSize = 32768 smallSizeDiv = 8 smallSizeMax = 1024 largeSizeDiv = 128 - _NumSizeClasses = 67 + _NumSizeClasses = 68 _PageShift = 13 ) -var 
class_to_size = [_NumSizeClasses]uint16{0, 8, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 576, 640, 704, 768, 896, 1024, 1152, 1280, 1408, 1536, 1792, 2048, 2304, 2688, 3072, 3200, 3456, 4096, 4864, 5376, 6144, 6528, 6784, 6912, 8192, 9472, 9728, 10240, 10880, 12288, 13568, 14336, 16384, 18432, 19072, 20480, 21760, 24576, 27264, 28672, 32768} -var class_to_allocnpages = [_NumSizeClasses]uint8{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 3, 2, 3, 1, 3, 2, 3, 4, 5, 6, 1, 7, 6, 5, 4, 3, 5, 7, 2, 9, 7, 5, 8, 3, 10, 7, 4} +var class_to_size = [_NumSizeClasses]uint16{0, 8, 16, 24, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 576, 640, 704, 768, 896, 1024, 1152, 1280, 1408, 1536, 1792, 2048, 2304, 2688, 3072, 3200, 3456, 4096, 4864, 5376, 6144, 6528, 6784, 6912, 8192, 9472, 9728, 10240, 10880, 12288, 13568, 14336, 16384, 18432, 19072, 20480, 21760, 24576, 27264, 28672, 32768} +var class_to_allocnpages = [_NumSizeClasses]uint8{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 3, 2, 3, 1, 3, 2, 3, 4, 5, 6, 1, 7, 6, 5, 4, 3, 5, 7, 2, 9, 7, 5, 8, 3, 10, 7, 4} type divMagic struct { shift uint8 @@ -90,6 +91,6 @@ type divMagic struct { baseMask uint16 } -var class_to_divmagic = [_NumSizeClasses]divMagic{{0, 0, 0, 0}, {3, 0, 1, 65528}, {4, 0, 1, 65520}, {5, 0, 1, 65504}, {4, 11, 683, 0}, {6, 0, 1, 65472}, {4, 10, 205, 0}, {5, 9, 171, 0}, {4, 11, 293, 0}, {7, 0, 1, 65408}, {4, 13, 911, 0}, {5, 10, 205, 0}, {4, 12, 373, 0}, {6, 9, 171, 0}, {4, 13, 631, 0}, {5, 11, 293, 0}, {4, 13, 547, 0}, {8, 0, 1, 65280}, {5, 9, 57, 0}, {6, 9, 103, 0}, {5, 12, 373, 0}, {7, 7, 43, 0}, {5, 10, 79, 0}, {6, 10, 147, 0}, {5, 11, 137, 0}, {9, 0, 1, 65024}, {6, 9, 57, 0}, {7, 9, 103, 0}, {6, 11, 187, 0}, {8, 7, 43, 0}, {7, 
8, 37, 0}, {10, 0, 1, 64512}, {7, 9, 57, 0}, {8, 6, 13, 0}, {7, 11, 187, 0}, {9, 5, 11, 0}, {8, 8, 37, 0}, {11, 0, 1, 63488}, {8, 9, 57, 0}, {7, 10, 49, 0}, {10, 5, 11, 0}, {7, 10, 41, 0}, {7, 9, 19, 0}, {12, 0, 1, 61440}, {8, 9, 27, 0}, {8, 10, 49, 0}, {11, 5, 11, 0}, {7, 13, 161, 0}, {7, 13, 155, 0}, {8, 9, 19, 0}, {13, 0, 1, 57344}, {8, 12, 111, 0}, {9, 9, 27, 0}, {11, 6, 13, 0}, {7, 14, 193, 0}, {12, 3, 3, 0}, {8, 13, 155, 0}, {11, 8, 37, 0}, {14, 0, 1, 49152}, {11, 8, 29, 0}, {7, 13, 55, 0}, {12, 5, 7, 0}, {8, 14, 193, 0}, {13, 3, 3, 0}, {7, 14, 77, 0}, {12, 7, 19, 0}, {15, 0, 1, 32768}} -var size_to_class8 = [smallSizeMax/smallSizeDiv + 1]uint8{0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31} -var size_to_class128 = [(_MaxSmallSize-smallSizeMax)/largeSizeDiv + 1]uint8{31, 32, 33, 34, 35, 36, 36, 37, 37, 38, 38, 39, 39, 39, 40, 40, 40, 41, 42, 42, 43, 43, 43, 43, 43, 44, 44, 44, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 52, 52, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 57, 57, 57, 57, 57, 57, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 60, 60, 60, 60, 60, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66} +var class_to_divmagic = [_NumSizeClasses]divMagic{{0, 0, 0, 0}, {3, 0, 1, 65528}, {4, 0, 1, 65520}, {3, 11, 683, 0}, {5, 0, 1, 65504}, {4, 11, 683, 0}, {6, 0, 1, 65472}, {4, 10, 205, 0}, {5, 9, 171, 0}, {4, 11, 293, 0}, {7, 0, 1, 65408}, {4, 13, 911, 0}, {5, 10, 205, 0}, {4, 12, 373, 0}, {6, 9, 171, 0}, {4, 13, 631, 0}, {5, 11, 293, 0}, {4, 13, 547, 0}, {8, 0, 1, 65280}, {5, 9, 57, 0}, {6, 9, 103, 0}, {5, 12, 373, 0}, {7, 7, 43, 0}, {5, 10, 79, 0}, {6, 10, 147, 0}, {5, 11, 137, 0}, {9, 0, 1, 65024}, {6, 9, 57, 0}, {7, 9, 103, 0}, {6, 11, 187, 0}, {8, 7, 43, 0}, {7, 8, 37, 0}, {10, 0, 1, 64512}, {7, 9, 57, 0}, {8, 6, 13, 0}, {7, 11, 187, 0}, {9, 5, 11, 0}, {8, 8, 37, 0}, {11, 0, 1, 63488}, {8, 9, 57, 0}, {7, 10, 49, 0}, {10, 5, 11, 0}, {7, 10, 41, 0}, {7, 9, 19, 0}, {12, 0, 1, 61440}, {8, 9, 27, 0}, {8, 10, 49, 0}, {11, 5, 11, 0}, {7, 13, 161, 0}, {7, 13, 155, 0}, {8, 9, 19, 0}, {13, 0, 1, 57344}, {8, 12, 111, 0}, {9, 9, 27, 0}, {11, 6, 13, 0}, {7, 14, 193, 0}, {12, 3, 3, 0}, {8, 13, 155, 0}, {11, 8, 37, 0}, {14, 0, 1, 49152}, {11, 8, 29, 0}, {7, 13, 55, 0}, {12, 5, 7, 0}, {8, 14, 193, 0}, {13, 3, 3, 0}, {7, 14, 77, 0}, {12, 7, 19, 0}, {15, 0, 1, 32768}} +var size_to_class8 = [smallSizeMax/smallSizeDiv + 1]uint8{0, 1, 2, 3, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32} +var size_to_class128 = 
[(_MaxSmallSize-smallSizeMax)/largeSizeDiv + 1]uint8{32, 33, 34, 35, 36, 37, 37, 38, 38, 39, 39, 40, 40, 40, 41, 41, 41, 42, 43, 43, 44, 44, 44, 44, 44, 45, 45, 45, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 54, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 58, 58, 58, 58, 58, 58, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 61, 61, 61, 61, 61, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67} diff --git a/src/runtime/slice.go b/src/runtime/slice.go index 0418ace25a..c0647d95a0 100644 --- a/src/runtime/slice.go +++ b/src/runtime/slice.go @@ -146,7 +146,7 @@ func growslice(et *_type, old slice, cap int) slice { if cap > doublecap { newcap = cap } else { - if old.len < 1024 { + if old.cap < 1024 { newcap = doublecap } else { // Check 0 < newcap to detect overflow @@ -243,12 +243,13 @@ func isPowerOfTwo(x uintptr) bool { return x&(x-1) == 0 } -func slicecopy(toPtr unsafe.Pointer, toLen int, fmPtr unsafe.Pointer, fmLen int, width uintptr) int { - if fmLen == 0 || toLen == 0 { +// slicecopy is used to copy from a string or slice of pointerless elements into a slice. 
+func slicecopy(toPtr unsafe.Pointer, toLen int, fromPtr unsafe.Pointer, fromLen int, width uintptr) int { + if fromLen == 0 || toLen == 0 { return 0 } - n := fmLen + n := fromLen if toLen < n { n = toLen } @@ -257,46 +258,23 @@ func slicecopy(toPtr unsafe.Pointer, toLen int, fmPtr unsafe.Pointer, fmLen int, return n } + size := uintptr(n) * width if raceenabled { callerpc := getcallerpc() pc := funcPC(slicecopy) - racereadrangepc(fmPtr, uintptr(n*int(width)), callerpc, pc) - racewriterangepc(toPtr, uintptr(n*int(width)), callerpc, pc) + racereadrangepc(fromPtr, size, callerpc, pc) + racewriterangepc(toPtr, size, callerpc, pc) } if msanenabled { - msanread(fmPtr, uintptr(n*int(width))) - msanwrite(toPtr, uintptr(n*int(width))) + msanread(fromPtr, size) + msanwrite(toPtr, size) } - size := uintptr(n) * width if size == 1 { // common case worth about 2x to do here // TODO: is this still worth it with new memmove impl? - *(*byte)(toPtr) = *(*byte)(fmPtr) // known to be a byte pointer + *(*byte)(toPtr) = *(*byte)(fromPtr) // known to be a byte pointer } else { - memmove(toPtr, fmPtr, size) - } - return n -} - -func slicestringcopy(toPtr *byte, toLen int, fm string) int { - if len(fm) == 0 || toLen == 0 { - return 0 - } - - n := len(fm) - if toLen < n { - n = toLen + memmove(toPtr, fromPtr, size) } - - if raceenabled { - callerpc := getcallerpc() - pc := funcPC(slicestringcopy) - racewriterangepc(unsafe.Pointer(toPtr), uintptr(n), callerpc, pc) - } - if msanenabled { - msanwrite(unsafe.Pointer(toPtr), uintptr(n)) - } - - memmove(unsafe.Pointer(toPtr), stringStructOf(&fm).str, uintptr(n)) return n } diff --git a/src/runtime/stack.go b/src/runtime/stack.go index 403b3c313e..7b9dce5393 100644 --- a/src/runtime/stack.go +++ b/src/runtime/stack.go @@ -66,7 +66,7 @@ const ( // to each stack below the usual guard area for OS-specific // purposes like signal handling. Used on Windows, Plan 9, // and iOS because they do not use a separate stack. 
- _StackSystem = sys.GoosWindows*512*sys.PtrSize + sys.GoosPlan9*512 + sys.GoosDarwin*sys.GoarchArm*1024 + sys.GoosDarwin*sys.GoarchArm64*1024 + _StackSystem = sys.GoosWindows*512*sys.PtrSize + sys.GoosPlan9*512 + sys.GoosIos*sys.GoarchArm64*1024 // The minimum size of stack used by Go code _StackMin = 2048 @@ -187,7 +187,7 @@ func stackpoolalloc(order uint8) gclinkptr { lockWithRankMayAcquire(&mheap_.lock, lockRankMheap) if s == nil { // no free stacks. Allocate another span worth. - s = mheap_.allocManual(_StackCacheSize>>_PageShift, &memstats.stacks_inuse) + s = mheap_.allocManual(_StackCacheSize>>_PageShift, spanAllocStack) if s == nil { throw("out of memory") } @@ -251,7 +251,7 @@ func stackpoolfree(x gclinkptr, order uint8) { stackpool[order].item.span.remove(s) s.manualFreeList = 0 osStackFree(s) - mheap_.freeManual(s, &memstats.stacks_inuse) + mheap_.freeManual(s, spanAllocStack) } } @@ -396,7 +396,7 @@ func stackalloc(n uint32) stack { if s == nil { // Allocate a new stack from the heap. - s = mheap_.allocManual(npage, &memstats.stacks_inuse) + s = mheap_.allocManual(npage, spanAllocStack) if s == nil { throw("out of memory") } @@ -480,7 +480,7 @@ func stackfree(stk stack) { // Free the stack immediately if we're // sweeping. osStackFree(s) - mheap_.freeManual(s, &memstats.stacks_inuse) + mheap_.freeManual(s, spanAllocStack) } else { // If the GC is running, we can't return a // stack span to the heap because it could be @@ -497,6 +497,8 @@ func stackfree(stk stack) { var maxstacksize uintptr = 1 << 20 // enough until runtime.main sets it for real +var maxstackceiling = maxstacksize + var ptrnames = []string{ 0: "scalar", 1: "ptr", @@ -860,6 +862,13 @@ func copystack(gp *g, newsize uintptr) { // Adjust sudogs, synchronizing with channel ops if necessary. 
ncopy := used if !gp.activeStackChans { + if newsize < old.hi-old.lo && atomic.Load8(&gp.parkingOnChan) != 0 { + // It's not safe for someone to shrink this stack while we're actively + // parking on a channel, but it is safe to grow since we do that + // ourselves and explicitly don't want to synchronize with channels + // since we could self-deadlock. + throw("racy sudog adjustment due to parking on channel") + } adjustsudogs(gp, &adjinfo) } else { // sudogs may be pointing in to the stack and gp has @@ -1050,8 +1059,12 @@ func newstack() { } } - if newsize > maxstacksize { - print("runtime: goroutine stack exceeds ", maxstacksize, "-byte limit\n") + if newsize > maxstacksize || newsize > maxstackceiling { + if maxstacksize < maxstackceiling { + print("runtime: goroutine stack exceeds ", maxstacksize, "-byte limit\n") + } else { + print("runtime: goroutine stack exceeds ", maxstackceiling, "-byte limit\n") + } print("runtime: sp=", hex(sp), " stack=[", hex(gp.stack.lo), ", ", hex(gp.stack.hi), "]\n") throw("stack overflow") } @@ -1099,7 +1112,11 @@ func isShrinkStackSafe(gp *g) bool { // We also can't copy the stack if we're at an asynchronous // safe-point because we don't have precise pointer maps for // all frames. - return gp.syscallsp == 0 && !gp.asyncSafePoint + // + // We also can't *shrink* the stack in the window between the + // goroutine calling gopark to park on a channel and + // gp.activeStackChans being set. + return gp.syscallsp == 0 && !gp.asyncSafePoint && atomic.Load8(&gp.parkingOnChan) == 0 } // Maybe shrink the stack being used by gp. 
@@ -1176,7 +1193,7 @@ func freeStackSpans() { list.remove(s) s.manualFreeList = 0 osStackFree(s) - mheap_.freeManual(s, &memstats.stacks_inuse) + mheap_.freeManual(s, spanAllocStack) } s = next } @@ -1190,7 +1207,7 @@ func freeStackSpans() { next := s.next stackLarge.free[i].remove(s) osStackFree(s) - mheap_.freeManual(s, &memstats.stacks_inuse) + mheap_.freeManual(s, spanAllocStack) s = next } } diff --git a/src/runtime/stack_test.go b/src/runtime/stack_test.go index adfc65384a..43fc5cac55 100644 --- a/src/runtime/stack_test.go +++ b/src/runtime/stack_test.go @@ -17,6 +17,7 @@ import ( "sync/atomic" "testing" "time" + _ "unsafe" // for go:linkname ) // TestStackMem measures per-thread stack segment cache behavior. @@ -851,3 +852,43 @@ func deferHeapAndStack(n int) (r int) { // Pass a value to escapeMe to force it to escape. var escapeMe = func(x interface{}) {} + +// Test that when F -> G is inlined and F is excluded from stack +// traces, G still appears. +func TestTracebackInlineExcluded(t *testing.T) { + defer func() { + recover() + buf := make([]byte, 4<<10) + stk := string(buf[:Stack(buf, false)]) + + t.Log(stk) + + if not := "tracebackExcluded"; strings.Contains(stk, not) { + t.Errorf("found but did not expect %q", not) + } + if want := "tracebackNotExcluded"; !strings.Contains(stk, want) { + t.Errorf("expected %q in stack", want) + } + }() + tracebackExcluded() +} + +// tracebackExcluded should be excluded from tracebacks. There are +// various ways this could come up. Linking it to a "runtime." name is +// rather synthetic, but it's easy and reliable. See issue #42754 for +// one way this happened in real code. +// +//go:linkname tracebackExcluded runtime.tracebackExcluded +//go:noinline +func tracebackExcluded() { + // Call an inlined function that should not itself be excluded + // from tracebacks. + tracebackNotExcluded() +} + +// tracebackNotExcluded should be inlined into tracebackExcluded, but +// should not itself be excluded from the traceback. 
+func tracebackNotExcluded() { + var x *int + *x = 0 +} diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go index b891a12fdd..b55c3c0590 100644 --- a/src/runtime/stubs.go +++ b/src/runtime/stubs.go @@ -130,6 +130,12 @@ func fastrandn(n uint32) uint32 { //go:linkname sync_fastrand sync.fastrand func sync_fastrand() uint32 { return fastrand() } +//go:linkname net_fastrand net.fastrand +func net_fastrand() uint32 { return fastrand() } + +//go:linkname os_fastrand os.fastrand +func os_fastrand() uint32 { return fastrand() } + // in internal/bytealg/equal_*.s //go:noescape func memequal(a, b unsafe.Pointer, size uintptr) bool @@ -145,7 +151,13 @@ func noescape(p unsafe.Pointer) unsafe.Pointer { return unsafe.Pointer(x ^ 0) } -func cgocallback(fn, frame unsafe.Pointer, framesize, ctxt uintptr) +// Not all cgocallback frames are actually cgocallback, +// so not all have these arguments. Mark them uintptr so that the GC +// does not misinterpret memory when the arguments are not present. +// cgocallback is not called from Go, only from crosscall2. +// This in turn calls cgocallbackg, which is where we'll find +// pointer-declared arguments. +func cgocallback(fn, frame, ctxt uintptr) func gogo(buf *gobuf) func gosave(buf *gobuf) @@ -160,10 +172,11 @@ func breakpoint() // back into arg+retoffset before returning. If copying result bytes back, // the caller should pass the argument frame type as argtype, so that // call can execute appropriate write barriers during the copy. -// Package reflect passes a frame type. In package runtime, there is only -// one call that copies results back, in cgocallbackg1, and it does NOT pass a -// frame type, meaning there are no write barriers invoked. See that call -// site for justification. +// +// Package reflect always passes a frame type. In package runtime, +// Windows callbacks are the only use of this that copies results +// back, and those cannot have pointers in their results, so runtime +// passes nil for the frame type. 
// // Package reflect accesses this symbol through a linkname. func reflectcall(argtype *_type, fn, arg unsafe.Pointer, argsize uint32, retoffset uint32) @@ -184,14 +197,6 @@ type neverCallThisFunction struct{} // prematurely and if there is leftover state it may panic. func goexit(neverCallThisFunction) -// Not all cgocallback_gofunc frames are actually cgocallback_gofunc, -// so not all have these arguments. Mark them uintptr so that the GC -// does not misinterpret memory when the arguments are not present. -// cgocallback_gofunc is not called from go, only from cgocallback, -// so the arguments will be found via cgocallback's pointer-declared arguments. -// See the assembly implementations for more details. -func cgocallback_gofunc(fv, frame, framesize, ctxt uintptr) - // publicationBarrier performs a store/store barrier (a "publication" // or "export" barrier). Some form of synchronization is required // between initializing an object and making that object accessible to @@ -271,6 +276,7 @@ func return0() // in asm_*.s // not called directly; definitions here supply type information for traceback. +func call16(typ, fn, arg unsafe.Pointer, n, retoffset uint32) func call32(typ, fn, arg unsafe.Pointer, n, retoffset uint32) func call64(typ, fn, arg unsafe.Pointer, n, retoffset uint32) func call128(typ, fn, arg unsafe.Pointer, n, retoffset uint32) diff --git a/src/runtime/stubs32.go b/src/runtime/stubs32.go deleted file mode 100644 index a7f52f6b9e..0000000000 --- a/src/runtime/stubs32.go +++ /dev/null @@ -1,14 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build 386 arm mips mipsle - -package runtime - -import "unsafe" - -// Declarations for runtime services implemented in C or assembly that -// are only present on 32 bit systems. 
- -func call16(fn, arg unsafe.Pointer, n, retoffset uint32) diff --git a/src/runtime/symtab.go b/src/runtime/symtab.go index fa8d17035e..7667f23f1d 100644 --- a/src/runtime/symtab.go +++ b/src/runtime/symtab.go @@ -268,25 +268,20 @@ func (f *Func) funcInfo() funcInfo { // // See funcdata.h and ../cmd/internal/objabi/funcdata.go. const ( - _PCDATA_RegMapIndex = 0 // if !go115ReduceLiveness - _PCDATA_UnsafePoint = 0 // if go115ReduceLiveness + _PCDATA_UnsafePoint = 0 _PCDATA_StackMapIndex = 1 _PCDATA_InlTreeIndex = 2 _FUNCDATA_ArgsPointerMaps = 0 _FUNCDATA_LocalsPointerMaps = 1 - _FUNCDATA_RegPointerMaps = 2 // if !go115ReduceLiveness - _FUNCDATA_StackObjects = 3 - _FUNCDATA_InlTree = 4 - _FUNCDATA_OpenCodedDeferInfo = 5 + _FUNCDATA_StackObjects = 2 + _FUNCDATA_InlTree = 3 + _FUNCDATA_OpenCodedDeferInfo = 4 _ArgsSizeUnknown = -0x80000000 ) const ( - // Only if !go115ReduceLiveness. - _PCDATA_RegMapUnsafe = _PCDATA_UnsafePointUnsafe // Unsafe for async preemption - // PCDATA_UnsafePoint values. _PCDATA_UnsafePointSafe = -1 // Safe for async preemption _PCDATA_UnsafePointUnsafe = -2 // Unsafe for async preemption @@ -326,7 +321,7 @@ const ( funcID_gcBgMarkWorker funcID_systemstack_switch funcID_systemstack - funcID_cgocallback_gofunc + funcID_cgocallback funcID_gogo funcID_externalthreadhandler funcID_debugCallV1 @@ -337,14 +332,18 @@ const ( funcID_wrapper // any autogenerated code (hash/eq algorithms, method wrappers, etc.) ) -// PCHeader holds data used by the pclntab lookups. +// pcHeader holds data used by the pclntab lookups. type pcHeader struct { magic uint32 // 0xFFFFFFFA pad1, pad2 uint8 // 0,0 minLC uint8 // min instruction size ptrSize uint8 // size of a ptr in bytes nfunc int // number of functions in the module + nfiles uint // number of entries in the file tab. 
funcnameOffset uintptr // offset to the funcnametab variable from pcHeader + cuOffset uintptr // offset to the cutab variable from pcHeader + filetabOffset uintptr // offset to the filetab variable from pcHeader + pctabOffset uintptr // offset to the pctab varible from pcHeader pclnOffset uintptr // offset to the pclntab variable from pcHeader } @@ -356,9 +355,11 @@ type pcHeader struct { type moduledata struct { pcHeader *pcHeader funcnametab []byte + cutab []uint32 + filetab []byte + pctab []byte pclntable []byte ftab []functab - filetab []uint32 findfunctab uintptr minpc, maxpc uintptr @@ -720,7 +721,7 @@ type pcvalueCache struct { type pcvalueCacheEnt struct { // targetpc and off together are the key of this cache entry. targetpc uintptr - off int32 + off uint32 // val is the value of this cached pcvalue entry. val int32 } @@ -735,7 +736,7 @@ func pcvalueCacheKey(targetpc uintptr) uintptr { // Returns the PCData value, and the PC where this value starts. // TODO: the start PC is returned only when cache is nil. 
-func pcvalue(f funcInfo, off int32, targetpc uintptr, cache *pcvalueCache, strict bool) (int32, uintptr) { +func pcvalue(f funcInfo, off uint32, targetpc uintptr, cache *pcvalueCache, strict bool) (int32, uintptr) { if off == 0 { return -1, 0 } @@ -769,7 +770,7 @@ func pcvalue(f funcInfo, off int32, targetpc uintptr, cache *pcvalueCache, stric return -1, 0 } datap := f.datap - p := datap.pclntable[off:] + p := datap.pctab[off:] pc := f.entry prevpc := pc val := int32(-1) @@ -811,7 +812,7 @@ func pcvalue(f funcInfo, off int32, targetpc uintptr, cache *pcvalueCache, stric print("runtime: invalid pc-encoded table f=", funcname(f), " pc=", hex(pc), " targetpc=", hex(targetpc), " tab=", p, "\n") - p = datap.pclntable[off:] + p = datap.pctab[off:] pc = f.entry val = -1 for { @@ -838,6 +839,22 @@ func funcname(f funcInfo) string { return gostringnocopy(cfuncname(f)) } +func funcpkgpath(f funcInfo) string { + name := funcname(f) + i := len(name) - 1 + for ; i > 0; i-- { + if name[i] == '/' { + break + } + } + for ; i < len(name); i++ { + if name[i] == '.' { + break + } + } + return name[:i] +} + func cfuncnameFromNameoff(f funcInfo, nameoff int32) *byte { if !f.valid() { return nil @@ -854,7 +871,12 @@ func funcfile(f funcInfo, fileno int32) string { if !f.valid() { return "?" } - return gostringnocopy(&datap.pclntable[datap.filetab[fileno]]) + // Make sure the cu index and file offset are valid + if fileoff := datap.cutab[f.cuOffset+uint32(fileno)]; fileoff != ^uint32(0) { + return gostringnocopy(&datap.filetab[fileoff]) + } + // pcln section is corrupt. + return "?" 
} func funcline1(f funcInfo, targetpc uintptr, strict bool) (file string, line int32) { @@ -868,7 +890,7 @@ func funcline1(f funcInfo, targetpc uintptr, strict bool) (file string, line int // print("looking for ", hex(targetpc), " in ", funcname(f), " got file=", fileno, " line=", lineno, "\n") return "?", 0 } - file = gostringnocopy(&datap.pclntable[datap.filetab[fileno]]) + file = funcfile(f, fileno) return } @@ -887,7 +909,7 @@ func funcspdelta(f funcInfo, targetpc uintptr, cache *pcvalueCache) int32 { // funcMaxSPDelta returns the maximum spdelta at any point in f. func funcMaxSPDelta(f funcInfo) int32 { datap := f.datap - p := datap.pclntable[f.pcsp:] + p := datap.pctab[f.pcsp:] pc := f.entry val := int32(-1) max := int32(0) @@ -903,20 +925,20 @@ func funcMaxSPDelta(f funcInfo) int32 { } } -func pcdatastart(f funcInfo, table int32) int32 { - return *(*int32)(add(unsafe.Pointer(&f.nfuncdata), unsafe.Sizeof(f.nfuncdata)+uintptr(table)*4)) +func pcdatastart(f funcInfo, table uint32) uint32 { + return *(*uint32)(add(unsafe.Pointer(&f.nfuncdata), unsafe.Sizeof(f.nfuncdata)+uintptr(table)*4)) } -func pcdatavalue(f funcInfo, table int32, targetpc uintptr, cache *pcvalueCache) int32 { - if table < 0 || table >= f.npcdata { +func pcdatavalue(f funcInfo, table uint32, targetpc uintptr, cache *pcvalueCache) int32 { + if table >= f.npcdata { return -1 } r, _ := pcvalue(f, pcdatastart(f, table), targetpc, cache, true) return r } -func pcdatavalue1(f funcInfo, table int32, targetpc uintptr, cache *pcvalueCache, strict bool) int32 { - if table < 0 || table >= f.npcdata { +func pcdatavalue1(f funcInfo, table uint32, targetpc uintptr, cache *pcvalueCache, strict bool) int32 { + if table >= f.npcdata { return -1 } r, _ := pcvalue(f, pcdatastart(f, table), targetpc, cache, strict) @@ -925,8 +947,8 @@ func pcdatavalue1(f funcInfo, table int32, targetpc uintptr, cache *pcvalueCache // Like pcdatavalue, but also return the start PC of this PCData value. // It doesn't take a cache. 
-func pcdatavalue2(f funcInfo, table int32, targetpc uintptr) (int32, uintptr) { - if table < 0 || table >= f.npcdata { +func pcdatavalue2(f funcInfo, table uint32, targetpc uintptr) (int32, uintptr) { + if table >= f.npcdata { return -1, 0 } return pcvalue(f, pcdatastart(f, table), targetpc, nil, true) @@ -1008,7 +1030,7 @@ type inlinedCall struct { parent int16 // index of parent in the inltree, or < 0 funcID funcID // type of the called function _ byte - file int32 // fileno index into filetab + file int32 // perCU file index for inlined call. See cmd/link:pcln.go line int32 // line number of the call site func_ int32 // offset into pclntab for name of called function parentPc int32 // position of an instruction whose source position is the call site (offset from entry) diff --git a/src/runtime/sys_darwin.go b/src/runtime/sys_darwin.go index e4f19bbf41..c89ce78012 100644 --- a/src/runtime/sys_darwin.go +++ b/src/runtime/sys_darwin.go @@ -228,6 +228,13 @@ func madvise_trampoline() //go:nosplit //go:cgo_unsafe_args +func mlock(addr unsafe.Pointer, n uintptr) { + libcCall(unsafe.Pointer(funcPC(mlock_trampoline)), unsafe.Pointer(&addr)) +} +func mlock_trampoline() + +//go:nosplit +//go:cgo_unsafe_args func read(fd int32, p unsafe.Pointer, n int32) int32 { return libcCall(unsafe.Pointer(funcPC(read_trampoline)), unsafe.Pointer(&fd)) } @@ -303,9 +310,9 @@ func nanotime_trampoline() //go:nosplit //go:cgo_unsafe_args func walltime1() (int64, int32) { - var t timeval + var t timespec libcCall(unsafe.Pointer(funcPC(walltime_trampoline)), unsafe.Pointer(&t)) - return int64(t.tv_sec), 1000 * t.tv_usec + return t.tv_sec, int32(t.tv_nsec) } func walltime_trampoline() @@ -353,13 +360,20 @@ func setitimer_trampoline() //go:nosplit //go:cgo_unsafe_args -func sysctl(mib *uint32, miblen uint32, out *byte, size *uintptr, dst *byte, ndst uintptr) int32 { +func sysctl(mib *uint32, miblen uint32, oldp *byte, oldlenp *uintptr, newp *byte, newlen uintptr) int32 { return 
libcCall(unsafe.Pointer(funcPC(sysctl_trampoline)), unsafe.Pointer(&mib)) } func sysctl_trampoline() //go:nosplit //go:cgo_unsafe_args +func sysctlbyname(name *byte, oldp *byte, oldlenp *uintptr, newp *byte, newlen uintptr) int32 { + return libcCall(unsafe.Pointer(funcPC(sysctlbyname_trampoline)), unsafe.Pointer(&name)) +} +func sysctlbyname_trampoline() + +//go:nosplit +//go:cgo_unsafe_args func fcntl(fd, cmd, arg int32) int32 { return libcCall(unsafe.Pointer(funcPC(fcntl_trampoline)), unsafe.Pointer(&fd)) } @@ -465,12 +479,13 @@ func setNonblock(fd int32) { //go:cgo_import_dynamic libc_mmap mmap "/usr/lib/libSystem.B.dylib" //go:cgo_import_dynamic libc_munmap munmap "/usr/lib/libSystem.B.dylib" //go:cgo_import_dynamic libc_madvise madvise "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_mlock mlock "/usr/lib/libSystem.B.dylib" //go:cgo_import_dynamic libc_error __error "/usr/lib/libSystem.B.dylib" //go:cgo_import_dynamic libc_usleep usleep "/usr/lib/libSystem.B.dylib" //go:cgo_import_dynamic libc_mach_timebase_info mach_timebase_info "/usr/lib/libSystem.B.dylib" //go:cgo_import_dynamic libc_mach_absolute_time mach_absolute_time "/usr/lib/libSystem.B.dylib" -//go:cgo_import_dynamic libc_gettimeofday gettimeofday "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_clock_gettime clock_gettime "/usr/lib/libSystem.B.dylib" //go:cgo_import_dynamic libc_sigaction sigaction "/usr/lib/libSystem.B.dylib" //go:cgo_import_dynamic libc_pthread_sigmask pthread_sigmask "/usr/lib/libSystem.B.dylib" //go:cgo_import_dynamic libc_sigaltstack sigaltstack "/usr/lib/libSystem.B.dylib" @@ -478,6 +493,7 @@ func setNonblock(fd int32) { //go:cgo_import_dynamic libc_kill kill "/usr/lib/libSystem.B.dylib" //go:cgo_import_dynamic libc_setitimer setitimer "/usr/lib/libSystem.B.dylib" //go:cgo_import_dynamic libc_sysctl sysctl "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_sysctlbyname sysctlbyname "/usr/lib/libSystem.B.dylib" //go:cgo_import_dynamic libc_fcntl 
fcntl "/usr/lib/libSystem.B.dylib" //go:cgo_import_dynamic libc_kqueue kqueue "/usr/lib/libSystem.B.dylib" //go:cgo_import_dynamic libc_kevent kevent "/usr/lib/libSystem.B.dylib" diff --git a/src/runtime/sys_darwin_amd64.s b/src/runtime/sys_darwin_amd64.s index 825852d673..630fb5df64 100644 --- a/src/runtime/sys_darwin_amd64.s +++ b/src/runtime/sys_darwin_amd64.s @@ -10,6 +10,8 @@ #include "go_tls.h" #include "textflag.h" +#define CLOCK_REALTIME 0 + // Exit the entire program (like C exit) TEXT runtime·exit_trampoline(SB),NOSPLIT,$0 PUSHQ BP @@ -103,6 +105,9 @@ TEXT runtime·madvise_trampoline(SB), NOSPLIT, $0 POPQ BP RET +TEXT runtime·mlock_trampoline(SB), NOSPLIT, $0 + UNDEF // unimplemented + GLOBL timebase<>(SB),NOPTR,$(machTimebaseInfo__size) TEXT runtime·nanotime_trampoline(SB),NOSPLIT,$0 @@ -137,9 +142,9 @@ initialized: TEXT runtime·walltime_trampoline(SB),NOSPLIT,$0 PUSHQ BP // make a frame; keep stack aligned MOVQ SP, BP - // DI already has *timeval - XORL SI, SI // no timezone needed - CALL libc_gettimeofday(SB) + MOVQ DI, SI // arg 2 timespec + MOVL $CLOCK_REALTIME, DI // arg 1 clock_id + CALL libc_clock_gettime(SB) POPQ BP RET @@ -366,15 +371,27 @@ TEXT runtime·sysctl_trampoline(SB),NOSPLIT,$0 PUSHQ BP MOVQ SP, BP MOVL 8(DI), SI // arg 2 miblen - MOVQ 16(DI), DX // arg 3 out - MOVQ 24(DI), CX // arg 4 size - MOVQ 32(DI), R8 // arg 5 dst - MOVQ 40(DI), R9 // arg 6 ndst + MOVQ 16(DI), DX // arg 3 oldp + MOVQ 24(DI), CX // arg 4 oldlenp + MOVQ 32(DI), R8 // arg 5 newp + MOVQ 40(DI), R9 // arg 6 newlen MOVQ 0(DI), DI // arg 1 mib CALL libc_sysctl(SB) POPQ BP RET +TEXT runtime·sysctlbyname_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVQ 8(DI), SI // arg 2 oldp + MOVQ 16(DI), DX // arg 3 oldlenp + MOVQ 24(DI), CX // arg 4 newp + MOVQ 32(DI), R8 // arg 5 newlen + MOVQ 0(DI), DI // arg 1 name + CALL libc_sysctlbyname(SB) + POPQ BP + RET + TEXT runtime·kqueue_trampoline(SB),NOSPLIT,$0 PUSHQ BP MOVQ SP, BP diff --git a/src/runtime/sys_darwin_arm64.go 
b/src/runtime/sys_darwin_arm64.go new file mode 100644 index 0000000000..9c14f33a1c --- /dev/null +++ b/src/runtime/sys_darwin_arm64.go @@ -0,0 +1,62 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "runtime/internal/sys" + "unsafe" +) + +// libc function wrappers. Must run on system stack. + +//go:nosplit +//go:cgo_unsafe_args +func g0_pthread_key_create(k *pthreadkey, destructor uintptr) int32 { + return asmcgocall(unsafe.Pointer(funcPC(pthread_key_create_trampoline)), unsafe.Pointer(&k)) +} +func pthread_key_create_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func g0_pthread_setspecific(k pthreadkey, value uintptr) int32 { + return asmcgocall(unsafe.Pointer(funcPC(pthread_setspecific_trampoline)), unsafe.Pointer(&k)) +} +func pthread_setspecific_trampoline() + +//go:cgo_import_dynamic libc_pthread_key_create pthread_key_create "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_pthread_setspecific pthread_setspecific "/usr/lib/libSystem.B.dylib" + +// tlsinit allocates a thread-local storage slot for g. +// +// It finds the first available slot using pthread_key_create and uses +// it as the offset value for runtime.tlsg. +// +// This runs at startup on g0 stack, but before g is set, so it must +// not split stack (transitively). g is expected to be nil, so things +// (e.g. asmcgocall) will skip saving or reading g. 
+// +//go:nosplit +func tlsinit(tlsg *uintptr, tlsbase *[_PTHREAD_KEYS_MAX]uintptr) { + var k pthreadkey + err := g0_pthread_key_create(&k, 0) + if err != 0 { + abort() + } + + const magic = 0xc476c475c47957 + err = g0_pthread_setspecific(k, magic) + if err != 0 { + abort() + } + + for i, x := range tlsbase { + if x == magic { + *tlsg = uintptr(i * sys.PtrSize) + g0_pthread_setspecific(k, 0) + return + } + } + abort() +} diff --git a/src/runtime/sys_darwin_arm64.s b/src/runtime/sys_darwin_arm64.s index 585d4f2c64..96d2ed1076 100644 --- a/src/runtime/sys_darwin_arm64.s +++ b/src/runtime/sys_darwin_arm64.s @@ -10,6 +10,8 @@ #include "go_tls.h" #include "textflag.h" +#define CLOCK_REALTIME 0 + TEXT notok<>(SB),NOSPLIT,$0 MOVD $0, R8 MOVD R8, (R8) @@ -118,6 +120,12 @@ TEXT runtime·madvise_trampoline(SB),NOSPLIT,$0 BL libc_madvise(SB) RET +TEXT runtime·mlock_trampoline(SB),NOSPLIT,$0 + MOVD 8(R0), R1 // arg 2 len + MOVD 0(R0), R0 // arg 1 addr + BL libc_mlock(SB) + RET + TEXT runtime·setitimer_trampoline(SB),NOSPLIT,$0 MOVD 8(R0), R1 // arg 2 new MOVD 16(R0), R2 // arg 3 old @@ -126,9 +134,9 @@ TEXT runtime·setitimer_trampoline(SB),NOSPLIT,$0 RET TEXT runtime·walltime_trampoline(SB),NOSPLIT,$0 - // R0 already has *timeval - MOVD $0, R1 // no timezone needed - BL libc_gettimeofday(SB) + MOVD R0, R1 // arg 2 timespec + MOVW $CLOCK_REALTIME, R0 // arg 1 clock_id + BL libc_clock_gettime(SB) RET GLOBL timebase<>(SB),NOPTR,$(machTimebaseInfo__size) @@ -197,11 +205,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$192 // this might be called in external code context, // where g is not set. - MOVB runtime·iscgo(SB), R0 - CMP $0, R0 - BEQ 2(PC) BL runtime·load_g(SB) +#ifdef GOOS_ios MOVD RSP, R6 CMP $0, g BEQ nog @@ -226,16 +232,21 @@ nog: // Switch to gsignal stack. MOVD R6, RSP - // Call sigtrampgo. + // Save arguments. MOVW R0, (8*1)(RSP) MOVD R1, (8*2)(RSP) MOVD R2, (8*3)(RSP) +#endif + + // Call sigtrampgo. 
MOVD $runtime·sigtrampgo(SB), R11 BL (R11) +#ifdef GOOS_ios // Switch to old stack. MOVD (8*4)(RSP), R5 MOVD R5, RSP +#endif // Restore callee-save registers. MOVD (8*4)(RSP), R19 @@ -290,14 +301,24 @@ TEXT runtime·usleep_trampoline(SB),NOSPLIT,$0 TEXT runtime·sysctl_trampoline(SB),NOSPLIT,$0 MOVW 8(R0), R1 // arg 2 miblen - MOVD 16(R0), R2 // arg 3 out - MOVD 24(R0), R3 // arg 4 size - MOVD 32(R0), R4 // arg 5 dst - MOVD 40(R0), R5 // arg 6 ndst + MOVD 16(R0), R2 // arg 3 oldp + MOVD 24(R0), R3 // arg 4 oldlenp + MOVD 32(R0), R4 // arg 5 newp + MOVD 40(R0), R5 // arg 6 newlen MOVD 0(R0), R0 // arg 1 mib BL libc_sysctl(SB) RET +TEXT runtime·sysctlbyname_trampoline(SB),NOSPLIT,$0 + MOVD 8(R0), R1 // arg 2 oldp + MOVD 16(R0), R2 // arg 3 oldlenp + MOVD 24(R0), R3 // arg 4 newp + MOVD 32(R0), R4 // arg 5 newlen + MOVD 0(R0), R0 // arg 1 name + BL libc_sysctlbyname(SB) + RET + + TEXT runtime·kqueue_trampoline(SB),NOSPLIT,$0 BL libc_kqueue(SB) RET @@ -329,12 +350,20 @@ TEXT runtime·fcntl_trampoline(SB),NOSPLIT,$0 ADD $16, RSP RET -// sigaltstack on iOS is not supported and will always -// run the signal handler on the main stack, so our sigtramp has -// to do the stack switch ourselves. TEXT runtime·sigaltstack_trampoline(SB),NOSPLIT,$0 +#ifdef GOOS_ios + // sigaltstack on iOS is not supported and will always + // run the signal handler on the main stack, so our sigtramp has + // to do the stack switch ourselves. 
MOVW $43, R0 BL libc_exit(SB) +#else + MOVD 8(R0), R1 // arg 2 old + MOVD 0(R0), R0 // arg 1 new + CALL libc_sigaltstack(SB) + CBZ R0, 2(PC) + BL notok<>(SB) +#endif RET // Thread related functions @@ -367,7 +396,8 @@ TEXT runtime·mstart_stub(SB),NOSPLIT,$160 FMOVD F14, 144(RSP) FMOVD F15, 152(RSP) - MOVD m_g0(R0), g + MOVD m_g0(R0), g + BL ·save_g(SB) BL runtime·mstart(SB) @@ -483,6 +513,18 @@ TEXT runtime·pthread_kill_trampoline(SB),NOSPLIT,$0 BL libc_pthread_kill(SB) RET +TEXT runtime·pthread_key_create_trampoline(SB),NOSPLIT,$0 + MOVD 8(R0), R1 // arg 2 destructor + MOVD 0(R0), R0 // arg 1 *key + BL libc_pthread_key_create(SB) + RET + +TEXT runtime·pthread_setspecific_trampoline(SB),NOSPLIT,$0 + MOVD 8(R0), R1 // arg 2 value + MOVD 0(R0), R0 // arg 1 key + BL libc_pthread_setspecific(SB) + RET + // syscall calls a function in libc on behalf of the syscall package. // syscall takes a pointer to a struct like: // struct { @@ -693,3 +735,23 @@ TEXT runtime·syscall6X(SB),NOSPLIT,$0 MOVD R0, 72(R2) // save err ok: RET + +// syscallNoErr is like syscall6 but does not check for errors, and +// only returns one value, for use with standard C ABI library functions. 
+TEXT runtime·syscallNoErr(SB),NOSPLIT,$0 + SUB $16, RSP // push structure pointer + MOVD R0, (RSP) + + MOVD 0(R0), R12 // fn + MOVD 16(R0), R1 // a2 + MOVD 24(R0), R2 // a3 + MOVD 32(R0), R3 // a4 + MOVD 40(R0), R4 // a5 + MOVD 48(R0), R5 // a6 + MOVD 8(R0), R0 // a1 + BL (R12) + + MOVD (RSP), R2 // pop structure pointer + ADD $16, RSP + MOVD R0, 56(R2) // save r1 + RET diff --git a/src/runtime/sys_freebsd_arm64.s b/src/runtime/sys_freebsd_arm64.s index 2330f2ffe2..8a4f9b7fa1 100644 --- a/src/runtime/sys_freebsd_arm64.s +++ b/src/runtime/sys_freebsd_arm64.s @@ -515,24 +515,3 @@ TEXT runtime·getCntxct(SB),NOSPLIT,$0 MOVW R0, ret+8(FP) RET - -// func getisar0() uint64 -TEXT runtime·getisar0(SB),NOSPLIT,$0 - // get Instruction Set Attributes 0 into R0 - MRS ID_AA64ISAR0_EL1, R0 - MOVD R0, ret+0(FP) - RET - -// func getisar1() uint64 -TEXT runtime·getisar1(SB),NOSPLIT,$0 - // get Instruction Set Attributes 1 into R0 - MRS ID_AA64ISAR1_EL1, R0 - MOVD R0, ret+0(FP) - RET - -// func getpfr0() uint64 -TEXT runtime·getpfr0(SB),NOSPLIT,$0 - // get Processor Feature Register 0 into R0 - MRS ID_AA64PFR0_EL1, R0 - MOVD R0, ret+0(FP) - RET diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s index 8d90813589..37cb8dad03 100644 --- a/src/runtime/sys_linux_amd64.s +++ b/src/runtime/sys_linux_amd64.s @@ -40,6 +40,7 @@ #define SYS_futex 202 #define SYS_sched_getaffinity 204 #define SYS_epoll_create 213 +#define SYS_clock_gettime 228 #define SYS_exit_group 231 #define SYS_epoll_ctl 233 #define SYS_tgkill 234 @@ -241,15 +242,15 @@ noswitch: SUBQ $16, SP // Space for results ANDQ $~15, SP // Align for C code + MOVL $0, DI // CLOCK_REALTIME + LEAQ 0(SP), SI MOVQ runtime·vdsoClockgettimeSym(SB), AX CMPQ AX, $0 JEQ fallback - MOVL $0, DI // CLOCK_REALTIME - LEAQ 0(SP), SI CALL AX +ret: MOVQ 0(SP), AX // sec MOVQ 8(SP), DX // nsec -ret: MOVQ R12, SP // Restore real SP // Restore vdsoPC, vdsoSP // We don't worry about being signaled between the two stores. 
@@ -264,13 +265,8 @@ ret: MOVL DX, nsec+8(FP) RET fallback: - LEAQ 0(SP), DI - MOVQ $0, SI - MOVQ runtime·vdsoGettimeofdaySym(SB), AX - CALL AX - MOVQ 0(SP), AX // sec - MOVL 8(SP), DX // usec - IMULQ $1000, DX + MOVQ $SYS_clock_gettime, AX + SYSCALL JMP ret // func nanotime1() int64 @@ -306,15 +302,15 @@ noswitch: SUBQ $16, SP // Space for results ANDQ $~15, SP // Align for C code + MOVL $1, DI // CLOCK_MONOTONIC + LEAQ 0(SP), SI MOVQ runtime·vdsoClockgettimeSym(SB), AX CMPQ AX, $0 JEQ fallback - MOVL $1, DI // CLOCK_MONOTONIC - LEAQ 0(SP), SI CALL AX +ret: MOVQ 0(SP), AX // sec MOVQ 8(SP), DX // nsec -ret: MOVQ R12, SP // Restore real SP // Restore vdsoPC, vdsoSP // We don't worry about being signaled between the two stores. @@ -332,13 +328,8 @@ ret: MOVQ AX, ret+0(FP) RET fallback: - LEAQ 0(SP), DI - MOVQ $0, SI - MOVQ runtime·vdsoGettimeofdaySym(SB), AX - CALL AX - MOVQ 0(SP), AX // sec - MOVL 8(SP), DX // usec - IMULQ $1000, DX + MOVQ $SYS_clock_gettime, AX + SYSCALL JMP ret TEXT runtime·rtsigprocmask(SB),NOSPLIT,$0-28 @@ -389,7 +380,8 @@ TEXT runtime·sigfwd(SB),NOSPLIT,$0-32 POPQ BP RET -TEXT runtime·sigtramp(SB),NOSPLIT,$72 +// Defined as ABIInternal since it does not use the stack-based Go ABI. +TEXT runtime·sigtramp<ABIInternal>(SB),NOSPLIT,$72 // Save callee-saved C registers, since the caller may be a C signal handler. MOVQ BX, bx-8(SP) MOVQ BP, bp-16(SP) // save in case GOEXPERIMENT=noframepointer is set @@ -416,7 +408,8 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$72 // Used instead of sigtramp in programs that use cgo. // Arguments from kernel are in DI, SI, DX. -TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0 +// Defined as ABIInternal since it does not use the stack-based Go ABI. +TEXT runtime·cgoSigtramp<ABIInternal>(SB),NOSPLIT,$0 // If no traceback function, do usual sigtramp. MOVQ runtime·cgoTraceback(SB), AX TESTQ AX, AX @@ -459,12 +452,12 @@ TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0 // The first three arguments, and the fifth, are already in registers. 
// Set the two remaining arguments now. MOVQ runtime·cgoTraceback(SB), CX - MOVQ $runtime·sigtramp(SB), R9 + MOVQ $runtime·sigtramp<ABIInternal>(SB), R9 MOVQ _cgo_callers(SB), AX JMP AX sigtramp: - JMP runtime·sigtramp(SB) + JMP runtime·sigtramp<ABIInternal>(SB) sigtrampnog: // Signal arrived on a non-Go thread. If this is SIGPROF, get a @@ -495,7 +488,8 @@ sigtrampnog: // https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/x86_64/sigaction.c // The code that cares about the precise instructions used is: // https://gcc.gnu.org/viewcvs/gcc/trunk/libgcc/config/i386/linux-unwind.h?revision=219188&view=markup -TEXT runtime·sigreturn(SB),NOSPLIT,$0 +// Defined as ABIInternal since it does not use the stack-based Go ABI. +TEXT runtime·sigreturn<ABIInternal>(SB),NOSPLIT,$0 MOVQ $SYS_rt_sigreturn, AX SYSCALL INT $3 // not reached diff --git a/src/runtime/sys_netbsd_arm64.s b/src/runtime/sys_netbsd_arm64.s index e70be0fa74..4d9b05478f 100644 --- a/src/runtime/sys_netbsd_arm64.s +++ b/src/runtime/sys_netbsd_arm64.s @@ -152,28 +152,23 @@ ok: // func pipe() (r, w int32, errno int32) TEXT runtime·pipe(SB),NOSPLIT|NOFRAME,$0-12 - MOVW $0, R0 + ADD $8, RSP, R0 + MOVW $0, R1 SVC $SYS_pipe2 BCC pipeok - MOVW $-1,R1 - MOVW R1, r+0(FP) - MOVW R1, w+4(FP) NEG R0, R0 - MOVW R0, errno+8(FP) - RET pipeok: - MOVW R0, r+0(FP) - MOVW R1, w+4(FP) - MOVW ZR, errno+8(FP) + MOVW R0, errno+8(FP) RET // func pipe2(flags int32) (r, w int32, errno int32) TEXT runtime·pipe2(SB),NOSPLIT|NOFRAME,$0-20 - ADD $8, RSP, R0 + ADD $16, RSP, R0 MOVW flags+0(FP), R1 SVC $SYS_pipe2 - BCC 2(PC) + BCC pipe2ok NEG R0, R0 +pipe2ok: MOVW R0, errno+16(FP) RET @@ -319,6 +314,12 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$192 MOVD R26, 8*11(RSP) MOVD R27, 8*12(RSP) MOVD g, 8*13(RSP) + // Unclobber g for now (kernel uses it as ucontext ptr) + // See https://github.com/golang/go/issues/30824#issuecomment-492772426 + // This is only correct in the non-cgo case. 
+ // XXX should use lwp_getprivate as suggested. + // 8*36 is ucontext.uc_mcontext.__gregs[_REG_X28] + MOVD 8*36(g), g MOVD R29, 8*14(RSP) FMOVD F8, 8*15(RSP) FMOVD F9, 8*16(RSP) diff --git a/src/runtime/sys_openbsd_mips64.s b/src/runtime/sys_openbsd_mips64.s new file mode 100644 index 0000000000..3e4d209081 --- /dev/null +++ b/src/runtime/sys_openbsd_mips64.s @@ -0,0 +1,400 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +// System calls and other sys.stuff for mips64, OpenBSD +// /usr/src/sys/kern/syscalls.master for syscall numbers. +// + +#include "go_asm.h" +#include "go_tls.h" +#include "textflag.h" + +#define CLOCK_REALTIME $0 +#define CLOCK_MONOTONIC $3 + +// Exit the entire program (like C exit) +TEXT runtime·exit(SB),NOSPLIT|NOFRAME,$0 + MOVW code+0(FP), R4 // arg 1 - status + MOVV $1, R2 // sys_exit + SYSCALL + BEQ R7, 3(PC) + MOVV $0, R2 // crash on syscall failure + MOVV R2, (R2) + RET + +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$0 + MOVV wait+0(FP), R4 // arg 1 - notdead + MOVV $302, R2 // sys___threxit + SYSCALL + MOVV $0, R2 // crash on syscall failure + MOVV R2, (R2) + JMP 0(PC) + +TEXT runtime·open(SB),NOSPLIT|NOFRAME,$0 + MOVV name+0(FP), R4 // arg 1 - path + MOVW mode+8(FP), R5 // arg 2 - mode + MOVW perm+12(FP), R6 // arg 3 - perm + MOVV $5, R2 // sys_open + SYSCALL + BEQ R7, 2(PC) + MOVW $-1, R2 + MOVW R2, ret+16(FP) + RET + +TEXT runtime·closefd(SB),NOSPLIT|NOFRAME,$0 + MOVW fd+0(FP), R4 // arg 1 - fd + MOVV $6, R2 // sys_close + SYSCALL + BEQ R7, 2(PC) + MOVW $-1, R2 + MOVW R2, ret+8(FP) + RET + +TEXT runtime·read(SB),NOSPLIT|NOFRAME,$0 + MOVW fd+0(FP), R4 // arg 1 - fd + MOVV p+8(FP), R5 // arg 2 - buf + MOVW n+16(FP), R6 // arg 3 - nbyte + MOVV $3, R2 // sys_read + SYSCALL + BEQ R7, 2(PC) + SUBVU R2, R0, R2 // caller expects negative errno + MOVW R2, ret+24(FP) + RET + +// func pipe() (r, w 
int32, errno int32) +TEXT runtime·pipe(SB),NOSPLIT|NOFRAME,$0-12 + MOVV $r+0(FP), R4 + MOVW $0, R5 + MOVV $101, R2 // sys_pipe2 + SYSCALL + BEQ R7, 2(PC) + SUBVU R2, R0, R2 // caller expects negative errno + MOVW R2, errno+8(FP) + RET + +// func pipe2(flags int32) (r, w int32, errno int32) +TEXT runtime·pipe2(SB),NOSPLIT|NOFRAME,$0-20 + MOVV $r+8(FP), R4 + MOVW flags+0(FP), R5 + MOVV $101, R2 // sys_pipe2 + SYSCALL + BEQ R7, 2(PC) + SUBVU R2, R0, R2 // caller expects negative errno + MOVW R2, errno+16(FP) + RET + +TEXT runtime·write1(SB),NOSPLIT|NOFRAME,$0 + MOVV fd+0(FP), R4 // arg 1 - fd + MOVV p+8(FP), R5 // arg 2 - buf + MOVW n+16(FP), R6 // arg 3 - nbyte + MOVV $4, R2 // sys_write + SYSCALL + BEQ R7, 2(PC) + SUBVU R2, R0, R2 // caller expects negative errno + MOVW R2, ret+24(FP) + RET + +TEXT runtime·usleep(SB),NOSPLIT,$24-4 + MOVWU usec+0(FP), R3 + MOVV R3, R5 + MOVW $1000000, R4 + DIVVU R4, R3 + MOVV LO, R3 + MOVV R3, 8(R29) // tv_sec + MOVW $1000, R4 + MULVU R3, R4 + MOVV LO, R4 + SUBVU R4, R5 + MOVV R5, 16(R29) // tv_nsec + + ADDV $8, R29, R4 // arg 1 - rqtp + MOVV $0, R5 // arg 2 - rmtp + MOVV $91, R2 // sys_nanosleep + SYSCALL + RET + +TEXT runtime·getthrid(SB),NOSPLIT,$0-4 + MOVV $299, R2 // sys_getthrid + SYSCALL + MOVW R2, ret+0(FP) + RET + +TEXT runtime·thrkill(SB),NOSPLIT,$0-16 + MOVW tid+0(FP), R4 // arg 1 - tid + MOVV sig+8(FP), R5 // arg 2 - signum + MOVW $0, R6 // arg 3 - tcb + MOVV $119, R2 // sys_thrkill + SYSCALL + RET + +TEXT runtime·raiseproc(SB),NOSPLIT,$0 + MOVV $20, R4 // sys_getpid + SYSCALL + MOVV R2, R4 // arg 1 - pid + MOVW sig+0(FP), R5 // arg 2 - signum + MOVV $122, R2 // sys_kill + SYSCALL + RET + +TEXT runtime·mmap(SB),NOSPLIT,$0 + MOVV addr+0(FP), R4 // arg 1 - addr + MOVV n+8(FP), R5 // arg 2 - len + MOVW prot+16(FP), R6 // arg 3 - prot + MOVW flags+20(FP), R7 // arg 4 - flags + MOVW fd+24(FP), R8 // arg 5 - fd + MOVW $0, R9 // arg 6 - pad + MOVW off+28(FP), R10 // arg 7 - offset + MOVV $197, R2 // sys_mmap + SYSCALL + MOVV $0, 
R4 + BEQ R7, 3(PC) + MOVV R2, R4 // if error, move to R4 + MOVV $0, R2 + MOVV R2, p+32(FP) + MOVV R4, err+40(FP) + RET + +TEXT runtime·munmap(SB),NOSPLIT,$0 + MOVV addr+0(FP), R4 // arg 1 - addr + MOVV n+8(FP), R5 // arg 2 - len + MOVV $73, R2 // sys_munmap + SYSCALL + BEQ R7, 3(PC) + MOVV $0, R2 // crash on syscall failure + MOVV R2, (R2) + RET + +TEXT runtime·madvise(SB),NOSPLIT,$0 + MOVV addr+0(FP), R4 // arg 1 - addr + MOVV n+8(FP), R5 // arg 2 - len + MOVW flags+16(FP), R6 // arg 2 - flags + MOVV $75, R2 // sys_madvise + SYSCALL + BEQ R7, 2(PC) + MOVW $-1, R2 + MOVW R2, ret+24(FP) + RET + +TEXT runtime·setitimer(SB),NOSPLIT,$0 + MOVW mode+0(FP), R4 // arg 1 - mode + MOVV new+8(FP), R5 // arg 2 - new value + MOVV old+16(FP), R6 // arg 3 - old value + MOVV $69, R2 // sys_setitimer + SYSCALL + RET + +// func walltime1() (sec int64, nsec int32) +TEXT runtime·walltime1(SB), NOSPLIT, $32 + MOVW CLOCK_REALTIME, R4 // arg 1 - clock_id + MOVV $8(R29), R5 // arg 2 - tp + MOVV $87, R2 // sys_clock_gettime + SYSCALL + + MOVV 8(R29), R4 // sec + MOVV 16(R29), R5 // nsec + MOVV R4, sec+0(FP) + MOVW R5, nsec+8(FP) + + RET + +// int64 nanotime1(void) so really +// void nanotime1(int64 *nsec) +TEXT runtime·nanotime1(SB),NOSPLIT,$32 + MOVW CLOCK_MONOTONIC, R4 // arg 1 - clock_id + MOVV $8(R29), R5 // arg 2 - tp + MOVV $87, R2 // sys_clock_gettime + SYSCALL + + MOVV 8(R29), R3 // sec + MOVV 16(R29), R5 // nsec + + MOVV $1000000000, R4 + MULVU R4, R3 + MOVV LO, R3 + ADDVU R5, R3 + MOVV R3, ret+0(FP) + RET + +TEXT runtime·sigaction(SB),NOSPLIT,$0 + MOVW sig+0(FP), R4 // arg 1 - signum + MOVV new+8(FP), R5 // arg 2 - new sigaction + MOVV old+16(FP), R6 // arg 3 - old sigaction + MOVV $46, R2 // sys_sigaction + SYSCALL + BEQ R7, 3(PC) + MOVV $3, R2 // crash on syscall failure + MOVV R2, (R2) + RET + +TEXT runtime·obsdsigprocmask(SB),NOSPLIT,$0 + MOVW how+0(FP), R4 // arg 1 - mode + MOVW new+4(FP), R5 // arg 2 - new + MOVV $48, R2 // sys_sigprocmask + SYSCALL + BEQ R7, 3(PC) + MOVV 
$3, R2 // crash on syscall failure + MOVV R2, (R2) + MOVW R2, ret+8(FP) + RET + +TEXT runtime·sigfwd(SB),NOSPLIT,$0-32 + MOVW sig+8(FP), R4 + MOVV info+16(FP), R5 + MOVV ctx+24(FP), R6 + MOVV fn+0(FP), R25 // Must use R25, needed for PIC code. + CALL (R25) + RET + +TEXT runtime·sigtramp(SB),NOSPLIT,$192 + // initialize REGSB = PC&0xffffffff00000000 + BGEZAL R0, 1(PC) + SRLV $32, R31, RSB + SLLV $32, RSB + + // this might be called in external code context, + // where g is not set. + MOVB runtime·iscgo(SB), R1 + BEQ R1, 2(PC) + JAL runtime·load_g(SB) + + MOVW R4, 8(R29) + MOVV R5, 16(R29) + MOVV R6, 24(R29) + MOVV $runtime·sigtrampgo(SB), R1 + JAL (R1) + RET + +// int32 tfork(void *param, uintptr psize, M *mp, G *gp, void (*fn)(void)); +TEXT runtime·tfork(SB),NOSPLIT,$0 + + // Copy mp, gp and fn off parent stack for use by child. + MOVV mm+16(FP), R16 + MOVV gg+24(FP), R17 + MOVV fn+32(FP), R18 + + MOVV param+0(FP), R4 // arg 1 - param + MOVV psize+8(FP), R5 // arg 2 - psize + MOVV $8, R2 // sys___tfork + SYSCALL + + // Return if syscall failed. + BEQ R7, 4(PC) + SUBVU R2, R0, R2 // caller expects negative errno + MOVW R2, ret+40(FP) + RET + + // In parent, return. + BEQ R2, 3(PC) + MOVW R2, ret+40(FP) + RET + + // Initialise m, g. + MOVV R17, g + MOVV R16, g_m(g) + + // Call fn. + CALL (R18) + + // fn should never return. 
+ MOVV $2, R8 // crash if reached + MOVV R8, (R8) + RET + +TEXT runtime·sigaltstack(SB),NOSPLIT,$0 + MOVV new+0(FP), R4 // arg 1 - new sigaltstack + MOVV old+8(FP), R5 // arg 2 - old sigaltstack + MOVV $288, R2 // sys_sigaltstack + SYSCALL + BEQ R7, 3(PC) + MOVV $0, R8 // crash on syscall failure + MOVV R8, (R8) + RET + +TEXT runtime·osyield(SB),NOSPLIT,$0 + MOVV $298, R2 // sys_sched_yield + SYSCALL + RET + +TEXT runtime·thrsleep(SB),NOSPLIT,$0 + MOVV ident+0(FP), R4 // arg 1 - ident + MOVW clock_id+8(FP), R5 // arg 2 - clock_id + MOVV tsp+16(FP), R6 // arg 3 - tsp + MOVV lock+24(FP), R7 // arg 4 - lock + MOVV abort+32(FP), R8 // arg 5 - abort + MOVV $94, R2 // sys___thrsleep + SYSCALL + MOVW R2, ret+40(FP) + RET + +TEXT runtime·thrwakeup(SB),NOSPLIT,$0 + MOVV ident+0(FP), R4 // arg 1 - ident + MOVW n+8(FP), R5 // arg 2 - n + MOVV $301, R2 // sys___thrwakeup + SYSCALL + MOVW R2, ret+16(FP) + RET + +TEXT runtime·sysctl(SB),NOSPLIT,$0 + MOVV mib+0(FP), R4 // arg 1 - mib + MOVW miblen+8(FP), R5 // arg 2 - miblen + MOVV out+16(FP), R6 // arg 3 - out + MOVV size+24(FP), R7 // arg 4 - size + MOVV dst+32(FP), R8 // arg 5 - dest + MOVV ndst+40(FP), R9 // arg 6 - newlen + MOVV $202, R2 // sys___sysctl + SYSCALL + BEQ R7, 2(PC) + SUBVU R2, R0, R2 // caller expects negative errno + MOVW R2, ret+48(FP) + RET + +// int32 runtime·kqueue(void); +TEXT runtime·kqueue(SB),NOSPLIT,$0 + MOVV $269, R2 // sys_kqueue + SYSCALL + BEQ R7, 2(PC) + SUBVU R2, R0, R2 // caller expects negative errno + MOVW R2, ret+0(FP) + RET + +// int32 runtime·kevent(int kq, Kevent *changelist, int nchanges, Kevent *eventlist, int nevents, Timespec *timeout); +TEXT runtime·kevent(SB),NOSPLIT,$0 + MOVW kq+0(FP), R4 // arg 1 - kq + MOVV ch+8(FP), R5 // arg 2 - changelist + MOVW nch+16(FP), R6 // arg 3 - nchanges + MOVV ev+24(FP), R7 // arg 4 - eventlist + MOVW nev+32(FP), R8 // arg 5 - nevents + MOVV ts+40(FP), R9 // arg 6 - timeout + MOVV $72, R2 // sys_kevent + SYSCALL + BEQ R7, 2(PC) + SUBVU R2, R0, R2 // 
caller expects negative errno + MOVW R2, ret+48(FP) + RET + +// func closeonexec(fd int32) +TEXT runtime·closeonexec(SB),NOSPLIT,$0 + MOVW fd+0(FP), R4 // arg 1 - fd + MOVV $2, R5 // arg 2 - cmd (F_SETFD) + MOVV $1, R6 // arg 3 - arg (FD_CLOEXEC) + MOVV $92, R2 // sys_fcntl + SYSCALL + RET + +// func runtime·setNonblock(int32 fd) +TEXT runtime·setNonblock(SB),NOSPLIT|NOFRAME,$0-4 + MOVW fd+0(FP), R4 // arg 1 - fd + MOVV $3, R5 // arg 2 - cmd (F_GETFL) + MOVV $0, R6 // arg 3 + MOVV $92, R2 // sys_fcntl + SYSCALL + MOVV $4, R6 // O_NONBLOCK + OR R2, R6 // arg 3 - flags + MOVW fd+0(FP), R4 // arg 1 - fd + MOVV $4, R5 // arg 2 - cmd (F_SETFL) + MOVV $92, R2 // sys_fcntl + SYSCALL + RET diff --git a/src/runtime/sys_windows_386.s b/src/runtime/sys_windows_386.s index 9e1f40925d..ef8a3dd3c2 100644 --- a/src/runtime/sys_windows_386.s +++ b/src/runtime/sys_windows_386.s @@ -239,7 +239,7 @@ GLOBL runtime·cbctxts(SB), NOPTR, $4 TEXT runtime·callbackasm1(SB),NOSPLIT,$0 MOVL 0(SP), AX // will use to find our callback context - // remove return address from stack, we are not returning there + // remove return address from stack, we are not returning to callbackasm, but to its caller. ADDL $4, SP // address to callback parameters into CX @@ -251,50 +251,35 @@ TEXT runtime·callbackasm1(SB),NOSPLIT,$0 PUSHL BP PUSHL BX - // determine index into runtime·cbctxts table + // Go ABI requires DF flag to be cleared. 
+ CLD + + // determine index into runtime·cbs table SUBL $runtime·callbackasm(SB), AX MOVL $0, DX MOVL $5, BX // divide by 5 because each call instruction in runtime·callbacks is 5 bytes long DIVL BX + SUBL $1, AX // subtract 1 because return PC is to the next slot - // find correspondent runtime·cbctxts table entry - MOVL runtime·cbctxts(SB), BX - MOVL -4(BX)(AX*4), BX - - // extract callback context - MOVL wincallbackcontext_gobody(BX), AX - MOVL wincallbackcontext_argsize(BX), DX - - // preserve whatever's at the memory location that - // the callback will use to store the return value - PUSHL 0(CX)(DX*1) - - // extend argsize by size of return value - ADDL $4, DX - - // remember how to restore stack on return - MOVL wincallbackcontext_restorestack(BX), BX - PUSHL BX - - // call target Go function - PUSHL DX // argsize (including return value) - PUSHL CX // callback parameters - PUSHL AX // address of target Go function - CLD - CALL runtime·cgocallback_gofunc(SB) - POPL AX - POPL CX - POPL DX - - // how to restore stack on return - POPL BX + // Create a struct callbackArgs on our stack. + SUBL $(12+callbackArgs__size), SP + MOVL AX, (12+callbackArgs_index)(SP) // callback index + MOVL CX, (12+callbackArgs_args)(SP) // address of args vector + MOVL $0, (12+callbackArgs_result)(SP) // result + LEAL 12(SP), AX // AX = &callbackArgs{...} - // return value into AX (as per Windows spec) - // and restore previously preserved value - MOVL -4(CX)(DX*1), AX - POPL -4(CX)(DX*1) + // Call cgocallback, which will call callbackWrap(frame). + MOVL $0, 8(SP) // context + MOVL AX, 4(SP) // frame (address of callbackArgs) + LEAL ·callbackWrap(SB), AX + MOVL AX, 0(SP) // PC of function to call + CALL runtime·cgocallback(SB) - MOVL BX, CX // cannot use BX anymore + // Get callback result. + MOVL (12+callbackArgs_result)(SP), AX + // Get popRet. 
+ MOVL (12+callbackArgs_retPop)(SP), CX // Can't use a callee-save register + ADDL $(12+callbackArgs__size), SP // restore registers as required for windows callback POPL BX @@ -428,6 +413,51 @@ TEXT runtime·usleep2(SB),NOSPLIT,$20 MOVL BP, SP RET +// Runs on OS stack. duration (in 100ns units) is in BX. +TEXT runtime·usleep2HighRes(SB),NOSPLIT,$36 + get_tls(CX) + CMPL CX, $0 + JE gisnotset + + // Want negative 100ns units. + NEGL BX + MOVL $-1, hi-4(SP) + MOVL BX, lo-8(SP) + + MOVL g(CX), CX + MOVL g_m(CX), CX + MOVL (m_mOS+mOS_highResTimer)(CX), CX + MOVL CX, saved_timer-12(SP) + + MOVL $0, fResume-16(SP) + MOVL $0, lpArgToCompletionRoutine-20(SP) + MOVL $0, pfnCompletionRoutine-24(SP) + MOVL $0, lPeriod-28(SP) + LEAL lo-8(SP), BX + MOVL BX, lpDueTime-32(SP) + MOVL CX, hTimer-36(SP) + MOVL SP, BP + MOVL runtime·_SetWaitableTimer(SB), AX + CALL AX + MOVL BP, SP + + MOVL $0, ptime-28(SP) + MOVL $0, alertable-32(SP) + MOVL saved_timer-12(SP), CX + MOVL CX, handle-36(SP) + MOVL SP, BP + MOVL runtime·_NtWaitForSingleObject(SB), AX + CALL AX + MOVL BP, SP + + RET + +gisnotset: + // TLS is not configured. Call usleep2 instead. + MOVL $runtime·usleep2(SB), AX + CALL AX + RET + // Runs on OS stack. TEXT runtime·switchtothread(SB),NOSPLIT,$0 MOVL SP, BP diff --git a/src/runtime/sys_windows_amd64.s b/src/runtime/sys_windows_amd64.s index 6c8eecd4e7..d1690cad58 100644 --- a/src/runtime/sys_windows_amd64.s +++ b/src/runtime/sys_windows_amd64.s @@ -291,31 +291,20 @@ TEXT runtime·callbackasm1(SB),NOSPLIT,$0 MOVQ DX, (16+8)(SP) MOVQ R8, (16+16)(SP) MOVQ R9, (16+24)(SP) + // R8 = address of args vector + LEAQ (16+0)(SP), R8 - // remove return address from stack, we are not returning there + // remove return address from stack, we are not returning to callbackasm, but to its caller. 
MOVQ 0(SP), AX ADDQ $8, SP - // determine index into runtime·cbctxts table + // determine index into runtime·cbs table MOVQ $runtime·callbackasm(SB), DX SUBQ DX, AX MOVQ $0, DX MOVQ $5, CX // divide by 5 because each call instruction in runtime·callbacks is 5 bytes long DIVL CX - - // find correspondent runtime·cbctxts table entry - MOVQ runtime·cbctxts(SB), CX - MOVQ -8(CX)(AX*8), AX - - // extract callback context - MOVQ wincallbackcontext_argsize(AX), DX - MOVQ wincallbackcontext_gobody(AX), AX - - // preserve whatever's at the memory location that - // the callback will use to store the return value - LEAQ 8(SP), CX // args vector, skip return address - PUSHQ 0(CX)(DX*1) // store 8 bytes from just after the args array - ADDQ $8, DX // extend argsize by size of return value + SUBQ $1, AX // subtract 1 because return PC is to the next slot // DI SI BP BX R12 R13 R14 R15 registers and DF flag are preserved // as required by windows callback convention. @@ -330,18 +319,25 @@ TEXT runtime·callbackasm1(SB),NOSPLIT,$0 MOVQ R14, 8(SP) MOVQ R15, 0(SP) - // prepare call stack. use SUBQ to hide from stack frame checks - // cgocallback(Go func, void *frame, uintptr framesize) - SUBQ $24, SP - MOVQ DX, 16(SP) // argsize (including return value) - MOVQ CX, 8(SP) // callback parameters - MOVQ AX, 0(SP) // address of target Go function + // Go ABI requires DF flag to be cleared. CLD - CALL runtime·cgocallback_gofunc(SB) - MOVQ 0(SP), AX - MOVQ 8(SP), CX - MOVQ 16(SP), DX - ADDQ $24, SP + + // Create a struct callbackArgs on our stack to be passed as + // the "frame" to cgocallback and on to callbackWrap. + SUBQ $(24+callbackArgs__size), SP + MOVQ AX, (24+callbackArgs_index)(SP) // callback index + MOVQ R8, (24+callbackArgs_args)(SP) // address of args vector + MOVQ $0, (24+callbackArgs_result)(SP) // result + LEAQ 24(SP), AX + // Call cgocallback, which will call callbackWrap(frame). 
+ MOVQ $0, 16(SP) // context + MOVQ AX, 8(SP) // frame (address of callbackArgs) + LEAQ ·callbackWrap(SB), BX + MOVQ BX, 0(SP) // PC of function value to call (callbackWrap) + CALL ·cgocallback(SB) + // Get callback result. + MOVQ (24+callbackArgs_result)(SP), AX + ADDQ $(24+callbackArgs__size), SP // restore registers as required for windows callback MOVQ 0(SP), R15 @@ -355,8 +351,7 @@ TEXT runtime·callbackasm1(SB),NOSPLIT,$0 ADDQ $64, SP POPFQ - MOVQ -8(CX)(DX*1), AX // return value - POPQ -8(CX)(DX*1) // restore bytes just after the args + // The return value was placed in AX above. RET // uint32 tstart_stdcall(M *newm); @@ -457,6 +452,47 @@ TEXT runtime·usleep2(SB),NOSPLIT|NOFRAME,$48 MOVQ 40(SP), SP RET +// Runs on OS stack. duration (in 100ns units) is in BX. +TEXT runtime·usleep2HighRes(SB),NOSPLIT|NOFRAME,$72 + get_tls(CX) + CMPQ CX, $0 + JE gisnotset + + MOVQ SP, AX + ANDQ $~15, SP // alignment as per Windows requirement + MOVQ AX, 64(SP) + + MOVQ g(CX), CX + MOVQ g_m(CX), CX + MOVQ (m_mOS+mOS_highResTimer)(CX), CX // hTimer + MOVQ CX, 48(SP) // save hTimer for later + // Want negative 100ns units. + NEGQ BX + LEAQ 56(SP), DX // lpDueTime + MOVQ BX, (DX) + MOVQ $0, R8 // lPeriod + MOVQ $0, R9 // pfnCompletionRoutine + MOVQ $0, AX + MOVQ AX, 32(SP) // lpArgToCompletionRoutine + MOVQ AX, 40(SP) // fResume + MOVQ runtime·_SetWaitableTimer(SB), AX + CALL AX + + MOVQ 48(SP), CX // handle + MOVQ $0, DX // alertable + MOVQ $0, R8 // ptime + MOVQ runtime·_NtWaitForSingleObject(SB), AX + CALL AX + + MOVQ 64(SP), SP + RET + +gisnotset: + // TLS is not configured. Call usleep2 instead. + MOVQ $runtime·usleep2(SB), AX + CALL AX + RET + // Runs on OS stack. 
TEXT runtime·switchtothread(SB),NOSPLIT|NOFRAME,$0 MOVQ SP, AX diff --git a/src/runtime/sys_windows_arm.s b/src/runtime/sys_windows_arm.s index 256b5ff7f0..fe267080cc 100644 --- a/src/runtime/sys_windows_arm.s +++ b/src/runtime/sys_windows_arm.s @@ -314,45 +314,42 @@ TEXT runtime·externalthreadhandler(SB),NOSPLIT|NOFRAME,$0 GLOBL runtime·cbctxts(SB), NOPTR, $4 TEXT runtime·callbackasm1(SB),NOSPLIT|NOFRAME,$0 - MOVM.DB.W [R4-R11, R14], (R13) // push {r4-r11, lr} - SUB $36, R13 // space for locals + // On entry, the trampoline in zcallback_windows_arm.s left + // the callback index in R12 (which is volatile in the C ABI). - // save callback arguments to stack. We currently support up to 4 arguments - ADD $16, R13, R4 - MOVM.IA [R0-R3], (R4) + // Push callback register arguments r0-r3. We do this first so + // they're contiguous with stack arguments. + MOVM.DB.W [R0-R3], (R13) + // Push C callee-save registers r4-r11 and lr. + MOVM.DB.W [R4-R11, R14], (R13) + SUB $(16 + callbackArgs__size), R13 // space for locals - // load cbctxts[i]. The trampoline in zcallback_windows.s puts the callback - // index in R12 - MOVW runtime·cbctxts(SB), R4 - MOVW R12<<2(R4), R4 // R4 holds pointer to wincallbackcontext structure - - // extract callback context - MOVW wincallbackcontext_argsize(R4), R5 - MOVW wincallbackcontext_gobody(R4), R4 - - // we currently support up to 4 arguments - CMP $(4 * 4), R5 - BL.GT runtime·abort(SB) - - // extend argsize by size of return value - ADD $4, R5 - - // Build 'type args struct' - MOVW R4, 4(R13) // fn - ADD $16, R13, R0 // arg (points to r0-r3, ret on stack) - MOVW R0, 8(R13) - MOVW R5, 12(R13) // argsize + // Create a struct callbackArgs on our stack. + MOVW R12, (16+callbackArgs_index)(R13) // callback index + MOVW $(16+callbackArgs__size+4*9)(R13), R0 + MOVW R0, (16+callbackArgs_args)(R13) // address of args vector + MOVW $0, R0 + MOVW R0, (16+callbackArgs_result)(R13) // result + // Prepare for entry to Go. 
BL runtime·load_g(SB) - BL runtime·cgocallback_gofunc(SB) - ADD $16, R13, R0 // load arg - MOVW 12(R13), R1 // load argsize - SUB $4, R1 // offset to return value - MOVW R1<<0(R0), R0 // load return value + // Call cgocallback, which will call callbackWrap(frame). + MOVW $0, R0 + MOVW R0, 12(R13) // context + MOVW $16(R13), R1 // R1 = &callbackArgs{...} + MOVW R1, 8(R13) // frame (address of callbackArgs) + MOVW $·callbackWrap(SB), R1 + MOVW R1, 4(R13) // PC of function to call + BL runtime·cgocallback(SB) + + // Get callback result. + MOVW (16+callbackArgs_result)(R13), R0 - ADD $36, R13 // free locals - MOVM.IA.W (R13), [R4-R11, R15] // pop {r4-r11, pc} + ADD $(16 + callbackArgs__size), R13 // free locals + MOVM.IA.W (R13), [R4-R11, R12] // pop {r4-r11, lr=>r12} + ADD $(4*4), R13 // skip r0-r3 + B (R12) // return // uint32 tstart_stdcall(M *newm); TEXT runtime·tstart_stdcall(SB),NOSPLIT|NOFRAME,$0 @@ -468,6 +465,11 @@ TEXT runtime·usleep2(SB),NOSPLIT|NOFRAME,$0 MOVW R4, R13 // Restore SP MOVM.IA.W (R13), [R4, R15] // pop {R4, pc} +// Runs on OS stack. Duration (in 100ns units) is in R0. +// TODO: neeeds to be implemented properly. +TEXT runtime·usleep2HighRes(SB),NOSPLIT|NOFRAME,$0 + B runtime·abort(SB) + // Runs on OS stack. TEXT runtime·switchtothread(SB),NOSPLIT|NOFRAME,$0 MOVM.DB.W [R4, R14], (R13) // push {R4, lr} diff --git a/src/runtime/syscall_windows.go b/src/runtime/syscall_windows.go index 0e2fcfb02d..7835b492f7 100644 --- a/src/runtime/syscall_windows.go +++ b/src/runtime/syscall_windows.go @@ -5,27 +5,52 @@ package runtime import ( + "runtime/internal/sys" "unsafe" ) -type callbacks struct { - lock mutex - ctxt [cb_max]*wincallbackcontext - n int +// cbs stores all registered Go callbacks. +var cbs struct { + lock mutex + ctxt [cb_max]winCallback + index map[winCallbackKey]int + n int } -func (c *wincallbackcontext) isCleanstack() bool { - return c.cleanstack +// winCallback records information about a registered Go callback. 
+type winCallback struct { + fn *funcval // Go function + retPop uintptr // For 386 cdecl, how many bytes to pop on return + + // abiMap specifies how to translate from a C frame to a Go + // frame. This does not specify how to translate back because + // the result is always a uintptr. If the C ABI is fastcall, + // this assumes the four fastcall registers were first spilled + // to the shadow space. + abiMap []abiPart + // retOffset is the offset of the uintptr-sized result in the Go + // frame. + retOffset uintptr } -func (c *wincallbackcontext) setCleanstack(cleanstack bool) { - c.cleanstack = cleanstack +// abiPart encodes a step in translating between calling ABIs. +type abiPart struct { + src, dst uintptr + len uintptr } -var ( - cbs callbacks - cbctxts **wincallbackcontext = &cbs.ctxt[0] // to simplify access to cbs.ctxt in sys_windows_*.s -) +func (a *abiPart) tryMerge(b abiPart) bool { + if a.src+a.len == b.src && a.dst+a.len == b.dst { + a.len += b.len + return true + } + return false +} + +type winCallbackKey struct { + fn *funcval + cdecl bool +} func callbackasm() @@ -53,57 +78,174 @@ func callbackasmAddr(i int) uintptr { return funcPC(callbackasm) + uintptr(i*entrySize) } +const callbackMaxFrame = 64 * sys.PtrSize + +// compileCallback converts a Go function fn into a C function pointer +// that can be passed to Windows APIs. +// +// On 386, if cdecl is true, the returned C function will use the +// cdecl calling convention; otherwise, it will use stdcall. On amd64, +// it always uses fastcall. On arm, it always uses the ARM convention. +// //go:linkname compileCallback syscall.compileCallback -func compileCallback(fn eface, cleanstack bool) (code uintptr) { +func compileCallback(fn eface, cdecl bool) (code uintptr) { + if GOARCH != "386" { + // cdecl is only meaningful on 386. 
+ cdecl = false + } + if fn._type == nil || (fn._type.kind&kindMask) != kindFunc { panic("compileCallback: expected function with one uintptr-sized result") } ft := (*functype)(unsafe.Pointer(fn._type)) + + // Check arguments and construct ABI translation. + var abiMap []abiPart + var src, dst uintptr + for _, t := range ft.in() { + if t.size > sys.PtrSize { + // We don't support this right now. In + // stdcall/cdecl, 64-bit ints and doubles are + // passed as two words (little endian); and + // structs are pushed on the stack. In + // fastcall, arguments larger than the word + // size are passed by reference. On arm, + // 8-byte aligned arguments round up to the + // next even register and can be split across + // registers and the stack. + panic("compileCallback: argument size is larger than uintptr") + } + if k := t.kind & kindMask; (GOARCH == "amd64" || GOARCH == "arm") && (k == kindFloat32 || k == kindFloat64) { + // In fastcall, floating-point arguments in + // the first four positions are passed in + // floating-point registers, which we don't + // currently spill. arm passes floating-point + // arguments in VFP registers, which we also + // don't support. + panic("compileCallback: float arguments not supported") + } + + // The Go ABI aligns arguments. + dst = alignUp(dst, uintptr(t.align)) + // In the C ABI, we're already on a word boundary. + // Also, sub-word-sized fastcall register arguments + // are stored to the least-significant bytes of the + // argument word and all supported Windows + // architectures are little endian, so src is already + // pointing to the right place for smaller arguments. + // The same is true on arm. + + // Copy just the size of the argument. Note that this + // could be a small by-value struct, but C and Go + // struct layouts are compatible, so we can copy these + // directly, too. + part := abiPart{src, dst, t.size} + // Add this step to the adapter. 
+ if len(abiMap) == 0 || !abiMap[len(abiMap)-1].tryMerge(part) { + abiMap = append(abiMap, part) + } + + // cdecl, stdcall, fastcall, and arm pad arguments to word size. + src += sys.PtrSize + // The Go ABI packs arguments. + dst += t.size + } + // The Go ABI aligns the result to the word size. src is + // already aligned. + dst = alignUp(dst, sys.PtrSize) + retOffset := dst + if len(ft.out()) != 1 { panic("compileCallback: expected function with one uintptr-sized result") } - uintptrSize := unsafe.Sizeof(uintptr(0)) - if ft.out()[0].size != uintptrSize { + if ft.out()[0].size != sys.PtrSize { panic("compileCallback: expected function with one uintptr-sized result") } - argsize := uintptr(0) - for _, t := range ft.in() { - if t.size > uintptrSize { - panic("compileCallback: argument size is larger than uintptr") - } - argsize += uintptrSize + if k := ft.out()[0].kind & kindMask; k == kindFloat32 || k == kindFloat64 { + // In cdecl and stdcall, float results are returned in + // ST(0). In fastcall, they're returned in XMM0. + // Either way, it's not AX. + panic("compileCallback: float results not supported") } + // Make room for the uintptr-sized result. + dst += sys.PtrSize - lock(&cbs.lock) // We don't unlock this in a defer because this is used from the system stack. + if dst > callbackMaxFrame { + panic("compileCallback: function argument frame too large") + } - n := cbs.n - for i := 0; i < n; i++ { - if cbs.ctxt[i].gobody == fn.data && cbs.ctxt[i].isCleanstack() == cleanstack { - r := callbackasmAddr(i) - unlock(&cbs.lock) - return r - } + // For cdecl, the callee is responsible for popping its + // arguments from the C stack. + var retPop uintptr + if cdecl { + retPop = src } - if n >= cb_max { + + key := winCallbackKey{(*funcval)(fn.data), cdecl} + + lock(&cbs.lock) // We don't unlock this in a defer because this is used from the system stack. + + // Check if this callback is already registered. 
+ if n, ok := cbs.index[key]; ok { unlock(&cbs.lock) - throw("too many callback functions") + return callbackasmAddr(n) } - c := new(wincallbackcontext) - c.gobody = fn.data - c.argsize = argsize - c.setCleanstack(cleanstack) - if cleanstack && argsize != 0 { - c.restorestack = argsize - } else { - c.restorestack = 0 + // Register the callback. + if cbs.index == nil { + cbs.index = make(map[winCallbackKey]int) + } + n := cbs.n + if n >= len(cbs.ctxt) { + unlock(&cbs.lock) + throw("too many callback functions") } + c := winCallback{key.fn, retPop, abiMap, retOffset} cbs.ctxt[n] = c + cbs.index[key] = n cbs.n++ - r := callbackasmAddr(n) unlock(&cbs.lock) - return r + return callbackasmAddr(n) +} + +type callbackArgs struct { + index uintptr + // args points to the argument block. + // + // For cdecl and stdcall, all arguments are on the stack. + // + // For fastcall, the trampoline spills register arguments to + // the reserved spill slots below the stack arguments, + // resulting in a layout equivalent to stdcall. + // + // For arm, the trampoline stores the register arguments just + // below the stack arguments, so again we can treat it as one + // big stack arguments frame. + args unsafe.Pointer + // Below are out-args from callbackWrap + result uintptr + retPop uintptr // For 386 cdecl, how many bytes to pop on return +} + +// callbackWrap is called by callbackasm to invoke a registered C callback. +func callbackWrap(a *callbackArgs) { + c := cbs.ctxt[a.index] + a.retPop = c.retPop + + // Convert from C to Go ABI. + var frame [callbackMaxFrame]byte + goArgs := unsafe.Pointer(&frame) + for _, part := range c.abiMap { + memmove(add(goArgs, part.dst), add(a.args, part.src), part.len) + } + + // Even though this is copying back results, we can pass a nil + // type because those results must not require write barriers. + reflectcall(nil, unsafe.Pointer(c.fn), noescape(goArgs), uint32(c.retOffset)+sys.PtrSize, uint32(c.retOffset)) + + // Extract the result. 
+ a.result = *(*uintptr)(unsafe.Pointer(&frame[c.retOffset])) } const _LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 diff --git a/src/runtime/syscall_windows_test.go b/src/runtime/syscall_windows_test.go index 2e74546e38..fb215b3c31 100644 --- a/src/runtime/syscall_windows_test.go +++ b/src/runtime/syscall_windows_test.go @@ -9,11 +9,12 @@ import ( "fmt" "internal/syscall/windows/sysdll" "internal/testenv" - "io/ioutil" + "io" "math" "os" "os/exec" "path/filepath" + "reflect" "runtime" "strconv" "strings" @@ -285,99 +286,108 @@ func TestCallbackInAnotherThread(t *testing.T) { } } -type cbDLLFunc int // int determines number of callback parameters - -func (f cbDLLFunc) stdcallName() string { - return fmt.Sprintf("stdcall%d", f) +type cbFunc struct { + goFunc interface{} } -func (f cbDLLFunc) cdeclName() string { - return fmt.Sprintf("cdecl%d", f) +func (f cbFunc) cName(cdecl bool) string { + name := "stdcall" + if cdecl { + name = "cdecl" + } + t := reflect.TypeOf(f.goFunc) + for i := 0; i < t.NumIn(); i++ { + name += "_" + t.In(i).Name() + } + return name } -func (f cbDLLFunc) buildOne(stdcall bool) string { - var funcname, attr string - if stdcall { - funcname = f.stdcallName() - attr = "__stdcall" - } else { - funcname = f.cdeclName() +func (f cbFunc) cSrc(w io.Writer, cdecl bool) { + // Construct a C function that takes a callback with + // f.goFunc's signature, and calls it with integers 1..N. 
+ funcname := f.cName(cdecl) + attr := "__stdcall" + if cdecl { attr = "__cdecl" } typename := "t" + funcname - p := make([]string, f) - for i := range p { - p[i] = "uintptr_t" - } - params := strings.Join(p, ",") - for i := range p { - p[i] = fmt.Sprintf("%d", i+1) - } - args := strings.Join(p, ",") - return fmt.Sprintf(` -typedef void %s (*%s)(%s); -void %s(%s f, uintptr_t n) { - uintptr_t i; - for(i=0;i<n;i++){ - f(%s); + t := reflect.TypeOf(f.goFunc) + cTypes := make([]string, t.NumIn()) + cArgs := make([]string, t.NumIn()) + for i := range cTypes { + // We included stdint.h, so this works for all sized + // integer types, and uint8Pair_t. + cTypes[i] = t.In(i).Name() + "_t" + if t.In(i).Name() == "uint8Pair" { + cArgs[i] = fmt.Sprintf("(uint8Pair_t){%d,1}", i) + } else { + cArgs[i] = fmt.Sprintf("%d", i+1) + } } + fmt.Fprintf(w, ` +typedef uintptr_t %s (*%s)(%s); +uintptr_t %s(%s f) { + return f(%s); } - `, attr, typename, params, funcname, typename, args) + `, attr, typename, strings.Join(cTypes, ","), funcname, typename, strings.Join(cArgs, ",")) } -func (f cbDLLFunc) build() string { - return "#include <stdint.h>\n\n" + f.buildOne(false) + f.buildOne(true) +func (f cbFunc) testOne(t *testing.T, dll *syscall.DLL, cdecl bool, cb uintptr) { + r1, _, _ := dll.MustFindProc(f.cName(cdecl)).Call(cb) + + want := 0 + for i := 0; i < reflect.TypeOf(f.goFunc).NumIn(); i++ { + want += i + 1 + } + if int(r1) != want { + t.Errorf("wanted result %d; got %d", want, r1) + } } -var cbFuncs = [...]interface{}{ - 2: func(i1, i2 uintptr) uintptr { - if i1+i2 != 3 { - panic("bad input") - } - return 0 - }, - 3: func(i1, i2, i3 uintptr) uintptr { - if i1+i2+i3 != 6 { - panic("bad input") - } - return 0 - }, - 4: func(i1, i2, i3, i4 uintptr) uintptr { - if i1+i2+i3+i4 != 10 { - panic("bad input") - } - return 0 - }, - 5: func(i1, i2, i3, i4, i5 uintptr) uintptr { - if i1+i2+i3+i4+i5 != 15 { - panic("bad input") - } - return 0 - }, - 6: func(i1, i2, i3, i4, i5, i6 uintptr) uintptr 
{ - if i1+i2+i3+i4+i5+i6 != 21 { - panic("bad input") - } - return 0 - }, - 7: func(i1, i2, i3, i4, i5, i6, i7 uintptr) uintptr { - if i1+i2+i3+i4+i5+i6+i7 != 28 { - panic("bad input") - } - return 0 - }, - 8: func(i1, i2, i3, i4, i5, i6, i7, i8 uintptr) uintptr { - if i1+i2+i3+i4+i5+i6+i7+i8 != 36 { - panic("bad input") - } - return 0 - }, - 9: func(i1, i2, i3, i4, i5, i6, i7, i8, i9 uintptr) uintptr { - if i1+i2+i3+i4+i5+i6+i7+i8+i9 != 45 { - panic("bad input") - } - return 0 - }, +type uint8Pair struct{ x, y uint8 } + +var cbFuncs = []cbFunc{ + {func(i1, i2 uintptr) uintptr { + return i1 + i2 + }}, + {func(i1, i2, i3 uintptr) uintptr { + return i1 + i2 + i3 + }}, + {func(i1, i2, i3, i4 uintptr) uintptr { + return i1 + i2 + i3 + i4 + }}, + {func(i1, i2, i3, i4, i5 uintptr) uintptr { + return i1 + i2 + i3 + i4 + i5 + }}, + {func(i1, i2, i3, i4, i5, i6 uintptr) uintptr { + return i1 + i2 + i3 + i4 + i5 + i6 + }}, + {func(i1, i2, i3, i4, i5, i6, i7 uintptr) uintptr { + return i1 + i2 + i3 + i4 + i5 + i6 + i7 + }}, + {func(i1, i2, i3, i4, i5, i6, i7, i8 uintptr) uintptr { + return i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8 + }}, + {func(i1, i2, i3, i4, i5, i6, i7, i8, i9 uintptr) uintptr { + return i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9 + }}, + + // Non-uintptr parameters. 
+ {func(i1, i2, i3, i4, i5, i6, i7, i8, i9 uint8) uintptr { + return uintptr(i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9) + }}, + {func(i1, i2, i3, i4, i5, i6, i7, i8, i9 uint16) uintptr { + return uintptr(i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9) + }}, + {func(i1, i2, i3, i4, i5, i6, i7, i8, i9 int8) uintptr { + return uintptr(i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9) + }}, + {func(i1 int8, i2 int16, i3 int32, i4, i5 uintptr) uintptr { + return uintptr(i1) + uintptr(i2) + uintptr(i3) + i4 + i5 + }}, + {func(i1, i2, i3, i4, i5 uint8Pair) uintptr { + return uintptr(i1.x + i1.y + i2.x + i2.y + i3.x + i3.y + i4.x + i4.y + i5.x + i5.y) + }}, } type cbDLL struct { @@ -385,21 +395,26 @@ type cbDLL struct { buildArgs func(out, src string) []string } -func (d *cbDLL) buildSrc(t *testing.T, path string) { +func (d *cbDLL) makeSrc(t *testing.T, path string) { f, err := os.Create(path) if err != nil { t.Fatalf("failed to create source file: %v", err) } defer f.Close() - for i := 2; i < 10; i++ { - fmt.Fprint(f, cbDLLFunc(i).build()) + fmt.Fprint(f, ` +#include <stdint.h> +typedef struct { uint8_t x, y; } uint8Pair_t; +`) + for _, cbf := range cbFuncs { + cbf.cSrc(f, false) + cbf.cSrc(f, true) } } func (d *cbDLL) build(t *testing.T, dir string) string { srcname := d.name + ".c" - d.buildSrc(t, filepath.Join(dir, srcname)) + d.makeSrc(t, filepath.Join(dir, srcname)) outname := d.name + ".dll" args := d.buildArgs(outname, srcname) cmd := exec.Command(args[0], args[1:]...) 
@@ -426,66 +441,32 @@ var cbDLLs = []cbDLL{ }, } -type cbTest struct { - n int // number of callback parameters - param uintptr // dll function parameter -} - -func (test *cbTest) run(t *testing.T, dllpath string) { - dll := syscall.MustLoadDLL(dllpath) - defer dll.Release() - cb := cbFuncs[test.n] - stdcall := syscall.NewCallback(cb) - f := cbDLLFunc(test.n) - test.runOne(t, dll, f.stdcallName(), stdcall) - cdecl := syscall.NewCallbackCDecl(cb) - test.runOne(t, dll, f.cdeclName(), cdecl) -} - -func (test *cbTest) runOne(t *testing.T, dll *syscall.DLL, proc string, cb uintptr) { - defer func() { - if r := recover(); r != nil { - t.Errorf("dll call %v(..., %d) failed: %v", proc, test.param, r) - } - }() - dll.MustFindProc(proc).Call(cb, test.param) -} - -var cbTests = []cbTest{ - {2, 1}, - {2, 10000}, - {3, 3}, - {4, 5}, - {4, 6}, - {5, 2}, - {6, 7}, - {6, 8}, - {7, 6}, - {8, 1}, - {9, 8}, - {9, 10000}, - {3, 4}, - {5, 3}, - {7, 7}, - {8, 2}, - {9, 9}, -} - func TestStdcallAndCDeclCallbacks(t *testing.T) { if _, err := exec.LookPath("gcc"); err != nil { t.Skip("skipping test: gcc is missing") } - tmp, err := ioutil.TempDir("", "TestCDeclCallback") + tmp, err := os.MkdirTemp("", "TestCDeclCallback") if err != nil { t.Fatal("TempDir failed: ", err) } defer os.RemoveAll(tmp) for _, dll := range cbDLLs { - dllPath := dll.build(t, tmp) - for _, test := range cbTests { - test.run(t, dllPath) - } + t.Run(dll.name, func(t *testing.T) { + dllPath := dll.build(t, tmp) + dll := syscall.MustLoadDLL(dllPath) + defer dll.Release() + for _, cbf := range cbFuncs { + t.Run(cbf.cName(false), func(t *testing.T) { + stdcall := syscall.NewCallback(cbf.goFunc) + cbf.testOne(t, dll, false, stdcall) + }) + t.Run(cbf.cName(true), func(t *testing.T) { + cdecl := syscall.NewCallbackCDecl(cbf.goFunc) + cbf.testOne(t, dll, true, cdecl) + }) + } + }) } } @@ -620,14 +601,14 @@ uintptr_t cfunc(callback f, uintptr_t n) { return r; } ` - tmpdir, err := ioutil.TempDir("", 
"TestReturnAfterStackGrowInCallback") + tmpdir, err := os.MkdirTemp("", "TestReturnAfterStackGrowInCallback") if err != nil { t.Fatal("TempDir failed: ", err) } defer os.RemoveAll(tmpdir) srcname := "mydll.c" - err = ioutil.WriteFile(filepath.Join(tmpdir, srcname), []byte(src), 0) + err = os.WriteFile(filepath.Join(tmpdir, srcname), []byte(src), 0) if err != nil { t.Fatal(err) } @@ -689,14 +670,14 @@ uintptr_t cfunc(uintptr_t a, double b, float c, double d) { return 0; } ` - tmpdir, err := ioutil.TempDir("", "TestFloatArgs") + tmpdir, err := os.MkdirTemp("", "TestFloatArgs") if err != nil { t.Fatal("TempDir failed: ", err) } defer os.RemoveAll(tmpdir) srcname := "mydll.c" - err = ioutil.WriteFile(filepath.Join(tmpdir, srcname), []byte(src), 0) + err = os.WriteFile(filepath.Join(tmpdir, srcname), []byte(src), 0) if err != nil { t.Fatal(err) } @@ -751,14 +732,14 @@ double cfuncDouble(uintptr_t a, double b, float c, double d) { return 0; } ` - tmpdir, err := ioutil.TempDir("", "TestFloatReturn") + tmpdir, err := os.MkdirTemp("", "TestFloatReturn") if err != nil { t.Fatal("TempDir failed: ", err) } defer os.RemoveAll(tmpdir) srcname := "mydll.c" - err = ioutil.WriteFile(filepath.Join(tmpdir, srcname), []byte(src), 0) + err = os.WriteFile(filepath.Join(tmpdir, srcname), []byte(src), 0) if err != nil { t.Fatal(err) } @@ -966,7 +947,7 @@ func TestDLLPreloadMitigation(t *testing.T) { t.Skip("skipping test: gcc is missing") } - tmpdir, err := ioutil.TempDir("", "TestDLLPreloadMitigation") + tmpdir, err := os.MkdirTemp("", "TestDLLPreloadMitigation") if err != nil { t.Fatal("TempDir failed: ", err) } @@ -987,12 +968,13 @@ func TestDLLPreloadMitigation(t *testing.T) { #include <stdint.h> #include <windows.h> -uintptr_t cfunc() { +uintptr_t cfunc(void) { SetLastError(123); + return 0; } ` srcname := "nojack.c" - err = ioutil.WriteFile(filepath.Join(tmpdir, srcname), []byte(src), 0) + err = os.WriteFile(filepath.Join(tmpdir, srcname), []byte(src), 0) if err != nil { 
t.Fatal(err) } @@ -1052,7 +1034,7 @@ func TestBigStackCallbackSyscall(t *testing.T) { t.Fatal("Abs failed: ", err) } - tmpdir, err := ioutil.TempDir("", "TestBigStackCallback") + tmpdir, err := os.MkdirTemp("", "TestBigStackCallback") if err != nil { t.Fatal("TempDir failed: ", err) } @@ -1161,17 +1143,19 @@ func BenchmarkSyscallToSyscallPing(b *testing.B) { go func() { for i := 0; i < n; i++ { syscall.WaitForSingleObject(event1, syscall.INFINITE) - err := setEvent(event2) - if err != nil { - b.Fatal(err) + if err := setEvent(event2); err != nil { + b.Errorf("Set event failed: %v", err) + return } } }() for i := 0; i < n; i++ { - err := setEvent(event1) - if err != nil { + if err := setEvent(event1); err != nil { b.Fatal(err) } + if b.Failed() { + break + } syscall.WaitForSingleObject(event2, syscall.INFINITE) } } @@ -1199,14 +1183,14 @@ func BenchmarkOsYield(b *testing.B) { } func BenchmarkRunningGoProgram(b *testing.B) { - tmpdir, err := ioutil.TempDir("", "BenchmarkRunningGoProgram") + tmpdir, err := os.MkdirTemp("", "BenchmarkRunningGoProgram") if err != nil { b.Fatal(err) } defer os.RemoveAll(tmpdir) src := filepath.Join(tmpdir, "main.go") - err = ioutil.WriteFile(src, []byte(benchmarkRunningGoProgram), 0666) + err = os.WriteFile(src, []byte(benchmarkRunningGoProgram), 0666) if err != nil { b.Fatal(err) } diff --git a/src/runtime/testdata/testprog/memprof.go b/src/runtime/testdata/testprog/memprof.go index 7b134bc078..0392e60f84 100644 --- a/src/runtime/testdata/testprog/memprof.go +++ b/src/runtime/testdata/testprog/memprof.go @@ -7,7 +7,6 @@ package main import ( "bytes" "fmt" - "io/ioutil" "os" "runtime" "runtime/pprof" @@ -31,7 +30,7 @@ func MemProf() { runtime.GC() - f, err := ioutil.TempFile("", "memprof") + f, err := os.CreateTemp("", "memprof") if err != nil { fmt.Fprintln(os.Stderr, err) os.Exit(2) diff --git a/src/runtime/testdata/testprog/syscalls_linux.go b/src/runtime/testdata/testprog/syscalls_linux.go index b8ac087626..48f8014237 100644 --- 
a/src/runtime/testdata/testprog/syscalls_linux.go +++ b/src/runtime/testdata/testprog/syscalls_linux.go @@ -7,7 +7,6 @@ package main import ( "bytes" "fmt" - "io/ioutil" "os" "syscall" ) @@ -17,7 +16,7 @@ func gettid() int { } func tidExists(tid int) (exists, supported bool) { - stat, err := ioutil.ReadFile(fmt.Sprintf("/proc/self/task/%d/stat", tid)) + stat, err := os.ReadFile(fmt.Sprintf("/proc/self/task/%d/stat", tid)) if os.IsNotExist(err) { return false, true } diff --git a/src/runtime/testdata/testprog/timeprof.go b/src/runtime/testdata/testprog/timeprof.go index 0702885369..1e90af4033 100644 --- a/src/runtime/testdata/testprog/timeprof.go +++ b/src/runtime/testdata/testprog/timeprof.go @@ -6,7 +6,6 @@ package main import ( "fmt" - "io/ioutil" "os" "runtime/pprof" "time" @@ -17,7 +16,7 @@ func init() { } func TimeProf() { - f, err := ioutil.TempFile("", "timeprof") + f, err := os.CreateTemp("", "timeprof") if err != nil { fmt.Fprintln(os.Stderr, err) os.Exit(2) diff --git a/src/runtime/testdata/testprog/vdso.go b/src/runtime/testdata/testprog/vdso.go index ef92f48758..d2a300d8f2 100644 --- a/src/runtime/testdata/testprog/vdso.go +++ b/src/runtime/testdata/testprog/vdso.go @@ -8,7 +8,6 @@ package main import ( "fmt" - "io/ioutil" "os" "runtime/pprof" "time" @@ -19,7 +18,7 @@ func init() { } func signalInVDSO() { - f, err := ioutil.TempFile("", "timeprofnow") + f, err := os.CreateTemp("", "timeprofnow") if err != nil { fmt.Fprintln(os.Stderr, err) os.Exit(2) diff --git a/src/runtime/testdata/testprogcgo/eintr.go b/src/runtime/testdata/testprogcgo/eintr.go index 791ff1bedc..1722a75eb9 100644 --- a/src/runtime/testdata/testprogcgo/eintr.go +++ b/src/runtime/testdata/testprogcgo/eintr.go @@ -32,7 +32,6 @@ import ( "errors" "fmt" "io" - "io/ioutil" "log" "net" "os" @@ -242,5 +241,5 @@ func testExec(wg *sync.WaitGroup) { // Block blocks until stdin is closed. 
func Block() { - io.Copy(ioutil.Discard, os.Stdin) + io.Copy(io.Discard, os.Stdin) } diff --git a/src/runtime/testdata/testprogcgo/exec.go b/src/runtime/testdata/testprogcgo/exec.go index 94da5dc526..15723c7369 100644 --- a/src/runtime/testdata/testprogcgo/exec.go +++ b/src/runtime/testdata/testprogcgo/exec.go @@ -31,6 +31,7 @@ import "C" import ( "fmt" + "io/fs" "os" "os/exec" "os/signal" @@ -98,7 +99,7 @@ func CgoExecSignalMask() { // isEAGAIN reports whether err is an EAGAIN error from a process execution. func isEAGAIN(err error) bool { - if p, ok := err.(*os.PathError); ok { + if p, ok := err.(*fs.PathError); ok { err = p.Err } return err == syscall.EAGAIN diff --git a/src/runtime/testdata/testprogcgo/needmdeadlock.go b/src/runtime/testdata/testprogcgo/needmdeadlock.go new file mode 100644 index 0000000000..5a9c359006 --- /dev/null +++ b/src/runtime/testdata/testprogcgo/needmdeadlock.go @@ -0,0 +1,95 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !plan9,!windows + +package main + +// This is for issue #42207. +// During a call to needm we could get a SIGCHLD signal +// which would itself call needm, causing a deadlock. + +/* +#include <signal.h> +#include <pthread.h> +#include <sched.h> +#include <unistd.h> + +extern void GoNeedM(); + +#define SIGNALERS 10 + +static void* needmSignalThread(void* p) { + pthread_t* pt = (pthread_t*)(p); + int i; + + for (i = 0; i < 100; i++) { + if (pthread_kill(*pt, SIGCHLD) < 0) { + return NULL; + } + usleep(1); + } + return NULL; +} + +// We don't need many calls, as the deadlock is only likely +// to occur the first couple of times that needm is called. +// After that there will likely be an extra M available. +#define CALLS 10 + +static void* needmCallbackThread(void* p) { + int i; + + for (i = 0; i < SIGNALERS; i++) { + sched_yield(); // Help the signal threads get started. 
+ } + for (i = 0; i < CALLS; i++) { + GoNeedM(); + } + return NULL; +} + +static void runNeedmSignalThread() { + int i; + pthread_t caller; + pthread_t s[SIGNALERS]; + + pthread_create(&caller, NULL, needmCallbackThread, NULL); + for (i = 0; i < SIGNALERS; i++) { + pthread_create(&s[i], NULL, needmSignalThread, &caller); + } + for (i = 0; i < SIGNALERS; i++) { + pthread_join(s[i], NULL); + } + pthread_join(caller, NULL); +} +*/ +import "C" + +import ( + "fmt" + "os" + "time" +) + +func init() { + register("NeedmDeadlock", NeedmDeadlock) +} + +//export GoNeedM +func GoNeedM() { +} + +func NeedmDeadlock() { + // The failure symptom is that the program hangs because of a + // deadlock in needm, so set an alarm. + go func() { + time.Sleep(5 * time.Second) + fmt.Println("Hung for 5 seconds") + os.Exit(1) + }() + + C.runNeedmSignalThread() + fmt.Println("OK") +} diff --git a/src/runtime/testdata/testprogcgo/pprof.go b/src/runtime/testdata/testprogcgo/pprof.go index 00f2c42e93..3b73fa0bdd 100644 --- a/src/runtime/testdata/testprogcgo/pprof.go +++ b/src/runtime/testdata/testprogcgo/pprof.go @@ -60,7 +60,6 @@ import "C" import ( "fmt" - "io/ioutil" "os" "runtime" "runtime/pprof" @@ -75,7 +74,7 @@ func init() { func CgoPprof() { runtime.SetCgoTraceback(0, unsafe.Pointer(C.pprofCgoTraceback), nil, nil) - f, err := ioutil.TempFile("", "prof") + f, err := os.CreateTemp("", "prof") if err != nil { fmt.Fprintln(os.Stderr, err) os.Exit(2) diff --git a/src/runtime/testdata/testprogcgo/threadpprof.go b/src/runtime/testdata/testprogcgo/threadpprof.go index 37a2a1ab65..feb774ba59 100644 --- a/src/runtime/testdata/testprogcgo/threadpprof.go +++ b/src/runtime/testdata/testprogcgo/threadpprof.go @@ -74,7 +74,6 @@ import "C" import ( "fmt" - "io/ioutil" "os" "runtime" "runtime/pprof" @@ -97,7 +96,7 @@ func CgoPprofThreadNoTraceback() { } func pprofThread() { - f, err := ioutil.TempFile("", "prof") + f, err := os.CreateTemp("", "prof") if err != nil { fmt.Fprintln(os.Stderr, err) 
os.Exit(2) diff --git a/src/runtime/testdata/testprogcgo/traceback.go b/src/runtime/testdata/testprogcgo/traceback.go index 2a023f66ca..e2d7599131 100644 --- a/src/runtime/testdata/testprogcgo/traceback.go +++ b/src/runtime/testdata/testprogcgo/traceback.go @@ -11,58 +11,11 @@ package main /* #cgo CFLAGS: -g -O0 -#include <stdint.h> - -char *p; - -static int f3(void) { - *p = 0; - return 0; -} - -static int f2(void) { - return f3(); -} - -static int f1(void) { - return f2(); -} - -struct cgoTracebackArg { - uintptr_t context; - uintptr_t sigContext; - uintptr_t* buf; - uintptr_t max; -}; - -struct cgoSymbolizerArg { - uintptr_t pc; - const char* file; - uintptr_t lineno; - const char* func; - uintptr_t entry; - uintptr_t more; - uintptr_t data; -}; - -void cgoTraceback(void* parg) { - struct cgoTracebackArg* arg = (struct cgoTracebackArg*)(parg); - arg->buf[0] = 1; - arg->buf[1] = 2; - arg->buf[2] = 3; - arg->buf[3] = 0; -} - -void cgoSymbolizer(void* parg) { - struct cgoSymbolizerArg* arg = (struct cgoSymbolizerArg*)(parg); - if (arg->pc != arg->data + 1) { - arg->file = "unexpected data"; - } else { - arg->file = "cgo symbolizer"; - } - arg->lineno = arg->data + 1; - arg->data++; -} +// Defined in traceback_c.c. 
+extern int crashInGo; +int tracebackF1(void); +void cgoTraceback(void* parg); +void cgoSymbolizer(void* parg); */ import "C" @@ -73,9 +26,29 @@ import ( func init() { register("CrashTraceback", CrashTraceback) + register("CrashTracebackGo", CrashTracebackGo) } func CrashTraceback() { runtime.SetCgoTraceback(0, unsafe.Pointer(C.cgoTraceback), nil, unsafe.Pointer(C.cgoSymbolizer)) - C.f1() + C.tracebackF1() +} + +func CrashTracebackGo() { + C.crashInGo = 1 + CrashTraceback() +} + +//export h1 +func h1() { + h2() +} + +func h2() { + h3() +} + +func h3() { + var x *int + *x = 0 } diff --git a/src/runtime/testdata/testprogcgo/traceback_c.c b/src/runtime/testdata/testprogcgo/traceback_c.c new file mode 100644 index 0000000000..56eda8fa8c --- /dev/null +++ b/src/runtime/testdata/testprogcgo/traceback_c.c @@ -0,0 +1,65 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// The C definitions for traceback.go. That file uses //export so +// it can't put function definitions in the "C" import comment. 
+ +#include <stdint.h> + +char *p; + +int crashInGo; +extern void h1(void); + +int tracebackF3(void) { + if (crashInGo) + h1(); + else + *p = 0; + return 0; +} + +int tracebackF2(void) { + return tracebackF3(); +} + +int tracebackF1(void) { + return tracebackF2(); +} + +struct cgoTracebackArg { + uintptr_t context; + uintptr_t sigContext; + uintptr_t* buf; + uintptr_t max; +}; + +struct cgoSymbolizerArg { + uintptr_t pc; + const char* file; + uintptr_t lineno; + const char* func; + uintptr_t entry; + uintptr_t more; + uintptr_t data; +}; + +void cgoTraceback(void* parg) { + struct cgoTracebackArg* arg = (struct cgoTracebackArg*)(parg); + arg->buf[0] = 1; + arg->buf[1] = 2; + arg->buf[2] = 3; + arg->buf[3] = 0; +} + +void cgoSymbolizer(void* parg) { + struct cgoSymbolizerArg* arg = (struct cgoSymbolizerArg*)(parg); + if (arg->pc != arg->data + 1) { + arg->file = "unexpected data"; + } else { + arg->file = "cgo symbolizer"; + } + arg->lineno = arg->data + 1; + arg->data++; +} diff --git a/src/runtime/time.go b/src/runtime/time.go index f895bf8443..d338705b7c 100644 --- a/src/runtime/time.go +++ b/src/runtime/time.go @@ -23,6 +23,8 @@ type timer struct { // Timer wakes up at when, and then at when+period, ... (period > 0 only) // each time calling f(arg, now) in the timer goroutine, so f must be // a well-behaved function and not block. + // + // when must be positive on an active timer. when int64 period int64 f func(interface{}, uintptr) @@ -185,6 +187,9 @@ func timeSleep(ns int64) { t.f = goroutineReady t.arg = gp t.nextwhen = nanotime() + ns + if t.nextwhen < 0 { // check for overflow. + t.nextwhen = maxWhen + } gopark(resetForSleep, unsafe.Pointer(t), waitReasonSleep, traceEvGoSleep, 1) } @@ -242,10 +247,14 @@ func goroutineReady(arg interface{}, seq uintptr) { // That avoids the risk of changing the when field of a timer in some P's heap, // which could cause the heap to become unsorted. 
func addtimer(t *timer) { - // when must never be negative; otherwise runtimer will overflow - // during its delta calculation and never expire other runtime timers. - if t.when < 0 { - t.when = maxWhen + // when must be positive. A negative value will cause runtimer to + // overflow during its delta calculation and never expire other runtime + // timers. Zero will cause checkTimers to fail to notice the timer. + if t.when <= 0 { + throw("timer when must be positive") + } + if t.period < 0 { + throw("timer period must be non-negative") } if t.status != timerNoStatus { throw("addtimer called with initialized timer") @@ -406,8 +415,11 @@ func dodeltimer0(pp *p) { // This is called by the netpoll code or time.Ticker.Reset or time.Timer.Reset. // Reports whether the timer was modified before it was run. func modtimer(t *timer, when, period int64, f func(interface{}, uintptr), arg interface{}, seq uintptr) bool { - if when < 0 { - when = maxWhen + if when <= 0 { + throw("timer when must be positive") + } + if period < 0 { + throw("timer period must be non-negative") } status := uint32(timerNoStatus) @@ -491,6 +503,8 @@ loop: newStatus = timerModifiedEarlier } + tpp := t.pp.ptr() + // Update the adjustTimers field. Subtract one if we // are removing a timerModifiedEarlier, add one if we // are adding a timerModifiedEarlier. @@ -500,9 +514,10 @@ loop: } if newStatus == timerModifiedEarlier { adjust++ + updateTimerModifiedEarliest(tpp, when) } if adjust != 0 { - atomic.Xadd(&t.pp.ptr().adjustTimers, adjust) + atomic.Xadd(&tpp.adjustTimers, adjust) } // Set the new status of the timer. @@ -637,16 +652,36 @@ func moveTimers(pp *p, timers []*timer) { // the correct place in the heap. While looking for those timers, // it also moves timers that have been modified to run later, // and removes deleted timers. The caller must have locked the timers for pp. 
-func adjusttimers(pp *p) { - if len(pp.timers) == 0 { - return - } +func adjusttimers(pp *p, now int64) { if atomic.Load(&pp.adjustTimers) == 0 { if verifyTimers { verifyTimerHeap(pp) } + // There are no timers to adjust, so it is safe to clear + // timerModifiedEarliest. Do so in case it is stale. + // Everything will work if we don't do this, + // but clearing here may save future calls to adjusttimers. + atomic.Store64(&pp.timerModifiedEarliest, 0) return } + + // If we haven't yet reached the time of the first timerModifiedEarlier + // timer, don't do anything. This speeds up programs that adjust + // a lot of timers back and forth if the timers rarely expire. + // We'll postpone looking through all the adjusted timers until + // one would actually expire. + if first := atomic.Load64(&pp.timerModifiedEarliest); first != 0 { + if int64(first) > now { + if verifyTimers { + verifyTimerHeap(pp) + } + return + } + + // We are going to clear all timerModifiedEarlier timers. + atomic.Store64(&pp.timerModifiedEarliest, 0) + } + var moved []*timer loop: for i := 0; i < len(pp.timers); i++ { @@ -719,16 +754,15 @@ func addAdjustedTimers(pp *p, moved []*timer) { // nobarrierWakeTime looks at P's timers and returns the time when we // should wake up the netpoller. It returns 0 if there are no timers. // This function is invoked when dropping a P, and must run without -// any write barriers. Therefore, if there are any timers that needs -// to be moved earlier, it conservatively returns the current time. -// The netpoller M will wake up and adjust timers before sleeping again. +// any write barriers. 
//go:nowritebarrierrec func nobarrierWakeTime(pp *p) int64 { - if atomic.Load(&pp.adjustTimers) > 0 { - return nanotime() - } else { - return int64(atomic.Load64(&pp.timer0When)) + next := int64(atomic.Load64(&pp.timer0When)) + nextAdj := int64(atomic.Load64(&pp.timerModifiedEarliest)) + if next == 0 || (nextAdj != 0 && nextAdj < next) { + next = nextAdj } + return next } // runtimer examines the first timer in timers. If it is ready based on now, @@ -824,6 +858,9 @@ func runOneTimer(pp *p, t *timer, now int64) { // Leave in heap but adjust next time to fire. delta := t.when - now t.when += t.period * (1 + -delta/t.period) + if t.when < 0 { // check for overflow. + t.when = maxWhen + } siftdownTimer(pp.timers, 0) if !atomic.Cas(&t.status, timerRunning, timerWaiting) { badTimer() @@ -868,6 +905,10 @@ func runOneTimer(pp *p, t *timer, now int64) { // // The caller must have locked the timers for pp. func clearDeletedTimers(pp *p) { + // We are going to clear all timerModifiedEarlier timers. + // Do this now in case new ones show up while we are looping. + atomic.Store64(&pp.timerModifiedEarliest, 0) + cdel := int32(0) cearlier := int32(0) to := 0 @@ -977,6 +1018,21 @@ func updateTimer0When(pp *p) { } } +// updateTimerModifiedEarliest updates the recorded nextwhen field of the +// earlier timerModifiedEarier value. +// The timers for pp will not be locked. +func updateTimerModifiedEarliest(pp *p, nextwhen int64) { + for { + old := atomic.Load64(&pp.timerModifiedEarliest) + if old != 0 && int64(old) < nextwhen { + return + } + if atomic.Cas64(&pp.timerModifiedEarliest, old, uint64(nextwhen)) { + return + } + } +} + // timeSleepUntil returns the time when the next timer should fire, // and the P that holds the timer heap that that timer is on. // This is only called by sysmon and checkdead. 
@@ -993,48 +1049,17 @@ func timeSleepUntil() (int64, *p) { continue } - c := atomic.Load(&pp.adjustTimers) - if c == 0 { - w := int64(atomic.Load64(&pp.timer0When)) - if w != 0 && w < next { - next = w - pret = pp - } - continue + w := int64(atomic.Load64(&pp.timer0When)) + if w != 0 && w < next { + next = w + pret = pp } - lock(&pp.timersLock) - for _, t := range pp.timers { - switch s := atomic.Load(&t.status); s { - case timerWaiting: - if t.when < next { - next = t.when - } - case timerModifiedEarlier, timerModifiedLater: - if t.nextwhen < next { - next = t.nextwhen - } - if s == timerModifiedEarlier { - c-- - } - } - // The timers are sorted, so we only have to check - // the first timer for each P, unless there are - // some timerModifiedEarlier timers. The number - // of timerModifiedEarlier timers is in the adjustTimers - // field, used to initialize c, above. - // - // We don't worry about cases like timerModifying. - // New timers can show up at any time, - // so this function is necessarily imprecise. - // Do a signed check here since we aren't - // synchronizing the read of pp.adjustTimers - // with the check of a timer status. - if int32(c) <= 0 { - break - } + w = int64(atomic.Load64(&pp.timerModifiedEarliest)) + if w != 0 && w < next { + next = w + pret = pp } - unlock(&pp.timersLock) } unlock(&allpLock) @@ -1054,6 +1079,9 @@ func siftupTimer(t []*timer, i int) { badTimer() } when := t[i].when + if when <= 0 { + badTimer() + } tmp := t[i] for i > 0 { p := (i - 1) / 4 // parent @@ -1074,6 +1102,9 @@ func siftdownTimer(t []*timer, i int) { badTimer() } when := t[i].when + if when <= 0 { + badTimer() + } tmp := t[i] for { c := i*4 + 1 // left child diff --git a/src/runtime/time_test.go b/src/runtime/time_test.go index bf29561144..afd9af2af4 100644 --- a/src/runtime/time_test.go +++ b/src/runtime/time_test.go @@ -20,6 +20,10 @@ func TestFakeTime(t *testing.T) { t.Skip("faketime not supported on windows") } + // Faketime is advanced in checkdead. 
External linking brings in cgo, + // causing checkdead not working. + testenv.MustInternalLink(t) + t.Parallel() exe, err := buildTestProg(t, "testfaketime", "-tags=faketime") @@ -38,7 +42,7 @@ func TestFakeTime(t *testing.T) { } t.Logf("raw stdout: %q", stdout.String()) - t.Logf("raw stderr: %q", stdout.String()) + t.Logf("raw stderr: %q", stderr.String()) f1, err1 := parseFakeTime(stdout.Bytes()) if err1 != nil { diff --git a/src/runtime/tls_arm64.h b/src/runtime/tls_arm64.h index f60f4f6d5b..0804fa3502 100644 --- a/src/runtime/tls_arm64.h +++ b/src/runtime/tls_arm64.h @@ -15,6 +15,12 @@ #endif #ifdef GOOS_darwin +#define TLS_darwin +#endif +#ifdef GOOS_ios +#define TLS_darwin +#endif +#ifdef TLS_darwin #define TPIDR TPIDRRO_EL0 #define TLSG_IS_VARIABLE #define MRS_TPIDR_R0 WORD $0xd53bd060 // MRS TPIDRRO_EL0, R0 diff --git a/src/runtime/tls_arm64.s b/src/runtime/tls_arm64.s index 999914d655..3f02974d5b 100644 --- a/src/runtime/tls_arm64.s +++ b/src/runtime/tls_arm64.s @@ -9,33 +9,35 @@ #include "tls_arm64.h" TEXT runtime·load_g(SB),NOSPLIT,$0 +#ifndef TLS_darwin MOVB runtime·iscgo(SB), R0 CBZ R0, nocgo +#endif MRS_TPIDR_R0 -#ifdef GOOS_darwin +#ifdef TLS_darwin // Darwin sometimes returns unaligned pointers AND $0xfffffffffffffff8, R0 #endif MOVD runtime·tls_g(SB), R27 - ADD R27, R0 - MOVD 0(R0), g + MOVD (R0)(R27), g nocgo: RET TEXT runtime·save_g(SB),NOSPLIT,$0 +#ifndef TLS_darwin MOVB runtime·iscgo(SB), R0 CBZ R0, nocgo +#endif MRS_TPIDR_R0 -#ifdef GOOS_darwin +#ifdef TLS_darwin // Darwin sometimes returns unaligned pointers AND $0xfffffffffffffff8, R0 #endif MOVD runtime·tls_g(SB), R27 - ADD R27, R0 - MOVD g, 0(R0) + MOVD g, (R0)(R27) nocgo: RET diff --git a/src/runtime/tls_riscv64.s b/src/runtime/tls_riscv64.s index 8386980421..22b550b761 100644 --- a/src/runtime/tls_riscv64.s +++ b/src/runtime/tls_riscv64.s @@ -9,10 +9,22 @@ // If !iscgo, this is a no-op. // -// NOTE: mcall() assumes this clobbers only R23 (REGTMP). 
-// FIXME: cgo +// NOTE: mcall() assumes this clobbers only X31 (REG_TMP). TEXT runtime·save_g(SB),NOSPLIT|NOFRAME,$0-0 + MOVB runtime·iscgo(SB), X31 + BEQ X0, X31, nocgo + + MOV runtime·tls_g(SB), X31 + ADD X4, X31 // add offset to thread pointer (X4) + MOV g, (X31) + +nocgo: RET TEXT runtime·load_g(SB),NOSPLIT|NOFRAME,$0-0 + MOV runtime·tls_g(SB), X31 + ADD X4, X31 // add offset to thread pointer (X4) + MOV (X31), g RET + +GLOBL runtime·tls_g(SB), TLSBSS, $8 diff --git a/src/runtime/trace.go b/src/runtime/trace.go index 169b650eb4..bcd0b9d56c 100644 --- a/src/runtime/trace.go +++ b/src/runtime/trace.go @@ -13,6 +13,7 @@ package runtime import ( + "runtime/internal/atomic" "runtime/internal/sys" "unsafe" ) @@ -1063,7 +1064,7 @@ func traceGoStart() { _g_ := getg().m.curg _p_ := _g_.m.p _g_.traceseq++ - if _g_ == _p_.ptr().gcBgMarkWorker.ptr() { + if _p_.ptr().gcMarkWorkerMode != gcMarkWorkerNotWorker { traceEvent(traceEvGoStartLabel, -1, uint64(_g_.goid), _g_.traceseq, trace.markWorkerLabels[_p_.ptr().gcMarkWorkerMode]) } else if _g_.tracelastp == _p_ { traceEvent(traceEvGoStartLocal, -1, uint64(_g_.goid)) @@ -1146,11 +1147,11 @@ func traceHeapAlloc() { } func traceNextGC() { - if memstats.next_gc == ^uint64(0) { + if nextGC := atomic.Load64(&memstats.next_gc); nextGC == ^uint64(0) { // Heap-based triggering is disabled. 
traceEvent(traceEvNextGC, -1, 0) } else { - traceEvent(traceEvNextGC, -1, memstats.next_gc) + traceEvent(traceEvNextGC, -1, nextGC) } } diff --git a/src/runtime/trace/trace_test.go b/src/runtime/trace/trace_test.go index 235845df4e..b316eafe4c 100644 --- a/src/runtime/trace/trace_test.go +++ b/src/runtime/trace/trace_test.go @@ -10,7 +10,6 @@ import ( "internal/race" "internal/trace" "io" - "io/ioutil" "net" "os" "runtime" @@ -586,7 +585,7 @@ func saveTrace(t *testing.T, buf *bytes.Buffer, name string) { if !*saveTraces { return } - if err := ioutil.WriteFile(name+".trace", buf.Bytes(), 0600); err != nil { + if err := os.WriteFile(name+".trace", buf.Bytes(), 0600); err != nil { t.Errorf("failed to write trace file: %s", err) } } diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go index 94f4a44976..0825e9e707 100644 --- a/src/runtime/traceback.go +++ b/src/runtime/traceback.go @@ -396,13 +396,21 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in // If there is inlining info, print the inner frames. if inldata := funcdata(f, _FUNCDATA_InlTree); inldata != nil { inltree := (*[1 << 20]inlinedCall)(inldata) + var inlFunc _func + inlFuncInfo := funcInfo{&inlFunc, f.datap} for { ix := pcdatavalue(f, _PCDATA_InlTreeIndex, tracepc, nil) if ix < 0 { break } - if (flags&_TraceRuntimeFrames) != 0 || showframe(f, gp, nprint == 0, inltree[ix].funcID, lastFuncID) { - name := funcnameFromNameoff(f, inltree[ix].func_) + + // Create a fake _func for the + // inlined function. 
+ inlFunc.nameoff = inltree[ix].func_ + inlFunc.funcID = inltree[ix].funcID + + if (flags&_TraceRuntimeFrames) != 0 || showframe(inlFuncInfo, gp, nprint == 0, inlFuncInfo.funcID, lastFuncID) { + name := funcname(inlFuncInfo) file, line := funcline(f, tracepc) print(name, "(...)\n") print("\t", file, ":", line, "\n") @@ -450,7 +458,7 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in } n++ - if f.funcID == funcID_cgocallback_gofunc && len(cgoCtxt) > 0 { + if f.funcID == funcID_cgocallback && len(cgoCtxt) > 0 { ctxt := cgoCtxt[len(cgoCtxt)-1] cgoCtxt = cgoCtxt[:len(cgoCtxt)-1] @@ -811,6 +819,9 @@ func showframe(f funcInfo, gp *g, firstFrame bool, funcID, childID funcID) bool // showfuncinfo reports whether a function with the given characteristics should // be printed during a traceback. func showfuncinfo(f funcInfo, firstFrame bool, funcID, childID funcID) bool { + // Note that f may be a synthesized funcInfo for an inlined + // function, in which case only nameoff and funcID are set. + level, _, _ := gotraceback() if level > 1 { // Show all frames. diff --git a/src/runtime/type.go b/src/runtime/type.go index 52b6cb30b4..81455f3532 100644 --- a/src/runtime/type.go +++ b/src/runtime/type.go @@ -217,7 +217,9 @@ func (t *_type) nameOff(off nameOff) name { } func resolveTypeOff(ptrInModule unsafe.Pointer, off typeOff) *_type { - if off == 0 { + if off == 0 || off == -1 { + // -1 is the sentinel value for unreachable code. + // See cmd/link/internal/ld/data.go:relocsym. return nil } base := uintptr(ptrInModule) @@ -257,6 +259,11 @@ func (t *_type) typeOff(off typeOff) *_type { } func (t *_type) textOff(off textOff) unsafe.Pointer { + if off == -1 { + // -1 is the sentinel value for unreachable code. + // See cmd/link/internal/ld/data.go:relocsym. 
+ return unsafe.Pointer(^uintptr(0)) + } base := uintptr(unsafe.Pointer(t)) var md *moduledata for next := &firstmoduledata; next != nil; next = next.next { diff --git a/src/runtime/vdso_linux_amd64.go b/src/runtime/vdso_linux_amd64.go index d9ab4ab3c6..4e9f748f4a 100644 --- a/src/runtime/vdso_linux_amd64.go +++ b/src/runtime/vdso_linux_amd64.go @@ -17,8 +17,7 @@ var vdsoSymbolKeys = []vdsoSymbolKey{ {"__vdso_clock_gettime", 0xd35ec75, 0x6e43a318, &vdsoClockgettimeSym}, } -// initialize with vsyscall fallbacks var ( - vdsoGettimeofdaySym uintptr = 0xffffffffff600000 - vdsoClockgettimeSym uintptr = 0 + vdsoGettimeofdaySym uintptr + vdsoClockgettimeSym uintptr ) diff --git a/src/runtime/vlrt.go b/src/runtime/vlrt.go index 38e0b32801..996c0611fd 100644 --- a/src/runtime/vlrt.go +++ b/src/runtime/vlrt.go @@ -263,7 +263,7 @@ func slowdodiv(n, d uint64) (q, r uint64) { return q, n } -// Floating point control word values for GOARCH=386 GO386=387. +// Floating point control word values. // Bits 0-5 are bits to disable floating-point exceptions. // Bits 8-9 are the precision control: // 0 = single precision a.k.a. 
float32 @@ -273,6 +273,5 @@ func slowdodiv(n, d uint64) (q, r uint64) { // 3 = round toward zero var ( controlWord64 uint16 = 0x3f + 2<<8 + 0<<10 - controlWord32 = 0x3f + 0<<8 + 0<<10 - controlWord64trunc = 0x3f + 2<<8 + 3<<10 + controlWord64trunc uint16 = 0x3f + 2<<8 + 3<<10 ) diff --git a/src/runtime/wincallback.go b/src/runtime/wincallback.go index c022916422..fb452222da 100644 --- a/src/runtime/wincallback.go +++ b/src/runtime/wincallback.go @@ -11,7 +11,6 @@ package main import ( "bytes" "fmt" - "io/ioutil" "os" ) @@ -38,7 +37,7 @@ TEXT runtime·callbackasm(SB),7,$0 } filename := fmt.Sprintf("zcallback_windows.s") - err := ioutil.WriteFile(filename, buf.Bytes(), 0666) + err := os.WriteFile(filename, buf.Bytes(), 0666) if err != nil { fmt.Fprintf(os.Stderr, "wincallback: %s\n", err) os.Exit(2) @@ -66,7 +65,7 @@ TEXT runtime·callbackasm(SB),NOSPLIT|NOFRAME,$0 buf.WriteString("\tB\truntime·callbackasm1(SB)\n") } - err := ioutil.WriteFile("zcallback_windows_arm.s", buf.Bytes(), 0666) + err := os.WriteFile("zcallback_windows_arm.s", buf.Bytes(), 0666) if err != nil { fmt.Fprintf(os.Stderr, "wincallback: %s\n", err) os.Exit(2) @@ -82,7 +81,7 @@ package runtime const cb_max = %d // maximum number of windows callbacks allowed `, maxCallback)) - err := ioutil.WriteFile("zcallback_windows.go", buf.Bytes(), 0666) + err := os.WriteFile("zcallback_windows.go", buf.Bytes(), 0666) if err != nil { fmt.Fprintf(os.Stderr, "wincallback: %s\n", err) os.Exit(2) |
