| author | Filippo Valsorda <filippo@golang.org> | 2018-09-06 13:25:27 -0400 |
|---|---|---|
| committer | Filippo Valsorda <filippo@golang.org> | 2018-09-06 13:25:27 -0400 |
| commit | 4d1aa482b8754c081d8f3f6b39fe61dd2e6504fc (patch) | |
| tree | b12a76ad02035d1206d09a7aa14a6cda0c411c3c /src/runtime | |
| parent | 7eb1677c01c3decc510270d532ed69d0bf42bffa (diff) | |
| parent | 3e5b5d69dcdb82494f550049986426d84dd6b8f8 (diff) | |
[dev.boringcrypto] all: merge master into dev.boringcrypto
Change-Id: Ia8ddd4e52dcfe87f9daef2edd37c8155fcae7f5a
Diffstat (limited to 'src/runtime')
144 files changed, 1697 insertions, 648 deletions
diff --git a/src/runtime/HACKING.md b/src/runtime/HACKING.md
index 0b390c34d9..72ba61970b 100644
--- a/src/runtime/HACKING.md
+++ b/src/runtime/HACKING.md
@@ -135,6 +135,47 @@ In summary,
 <tr><td>park</td><td>Y</td><td>N</td><td>N</td></tr>
 </table>

+Atomics
+=======
+
+The runtime uses its own atomics package at `runtime/internal/atomic`.
+This corresponds to `sync/atomic`, but functions have different names
+for historical reasons and there are a few additional functions needed
+by the runtime.
+
+In general, we think hard about the uses of atomics in the runtime and
+try to avoid unnecessary atomic operations. If access to a variable is
+sometimes protected by another synchronization mechanism, the
+already-protected accesses generally don't need to be atomic. There
+are several reasons for this:
+
+1. Using non-atomic or atomic access where appropriate makes the code
+   more self-documenting. Atomic access to a variable implies there's
+   somewhere else that may concurrently access the variable.
+
+2. Non-atomic access allows for automatic race detection. The runtime
+   doesn't currently have a race detector, but it may in the future.
+   Atomic access defeats the race detector, while non-atomic access
+   allows the race detector to check your assumptions.
+
+3. Non-atomic access may improve performance.
+
+Of course, any non-atomic access to a shared variable should be
+documented to explain how that access is protected.
+
+Some common patterns that mix atomic and non-atomic access are:
+
+* Read-mostly variables where updates are protected by a lock. Within
+  the locked region, reads do not need to be atomic, but the write
+  does. Outside the locked region, reads need to be atomic.
+
+* Reads that only happen during STW, where no writes can happen during
+  STW, do not need to be atomic.
+
+That said, the advice from the Go memory model stands: "Don't be
+[too] clever." The performance of the runtime matters, but its
+robustness matters more.
+
 Unmanaged memory
 ================

diff --git a/src/runtime/asm_386.s b/src/runtime/asm_386.s
index a6a81c3f63..7761415ecd 100644
--- a/src/runtime/asm_386.s
+++ b/src/runtime/asm_386.s
@@ -881,7 +881,7 @@ TEXT runtime·stackcheck(SB), NOSPLIT, $0-0

 // func cputicks() int64
 TEXT runtime·cputicks(SB),NOSPLIT,$0-8
-	CMPB	runtime·support_sse2(SB), $1
+	CMPB	internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1
 	JNE	done
 	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
 	JNE	mfence
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index 6902ce2c22..2a15910aea 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -228,7 +228,7 @@ TEXT runtime·asminit(SB),NOSPLIT,$0-0
 * go-routine
 */

-// void gosave(Gobuf*)
+// func gosave(buf *gobuf)
 // save state in Gobuf; setjmp
 TEXT runtime·gosave(SB), NOSPLIT, $0-8
 	MOVQ	buf+0(FP), AX	// gobuf
@@ -248,7 +248,7 @@ TEXT runtime·gosave(SB), NOSPLIT, $0-8
 	MOVQ	BX, gobuf_g(AX)
 	RET

-// void gogo(Gobuf*)
+// func gogo(buf *gobuf)
 // restore state from Gobuf; longjmp
 TEXT runtime·gogo(SB), NOSPLIT, $16-8
 	MOVQ	buf+0(FP), BX	// gobuf
@@ -560,7 +560,8 @@ TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
 	// compile barrier.
 	RET

-// void jmpdefer(fn, sp);
+// func jmpdefer(fv *funcval, argp uintptr)
+// argp is a caller SP.
 // called from deferreturn.
 // 1. pop the caller
 // 2. sub 5 bytes from the callers return
@@ -670,7 +671,7 @@ nosave:
 	MOVL	AX, ret+16(FP)
 	RET

-// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
+// func cgocallback(fn, frame unsafe.Pointer, framesize, ctxt uintptr)
 // Turn the fn into a Go func (by taking its address) and call
 // cgocallback_gofunc.
 TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
@@ -686,7 +687,7 @@ TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
 	CALL	AX
 	RET

-// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
+// func cgocallback_gofunc(fn, frame, framesize, ctxt uintptr)
 // See cgocall.go for more details.
 TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
 	NO_LOCAL_POINTERS
@@ -811,7 +812,8 @@ havem:
 	// Done!
 	RET

-// void setg(G*); set g. for use by needm.
+// func setg(gg *g)
+// set g. for use by needm.
 TEXT runtime·setg(SB), NOSPLIT, $0-8
 	MOVQ	gg+0(FP), BX
 #ifdef GOOS_windows
@@ -866,6 +868,7 @@ done:
 	MOVQ	AX, ret+0(FP)
 	RET

+// func aeshash(p unsafe.Pointer, h, s uintptr) uintptr
 // hash function using AES hardware instructions
 TEXT runtime·aeshash(SB),NOSPLIT,$0-32
 	MOVQ	p+0(FP), AX	// ptr to data
@@ -873,6 +876,7 @@ TEXT runtime·aeshash(SB),NOSPLIT,$0-32
 	LEAQ	ret+24(FP), DX
 	JMP	runtime·aeshashbody(SB)

+// func aeshashstr(p unsafe.Pointer, h uintptr) uintptr
 TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
 	MOVQ	p+0(FP), AX	// ptr to string struct
 	MOVQ	8(AX), CX	// length of string
@@ -1210,7 +1214,8 @@ aesloop:
 	PXOR	X9, X8
 	MOVQ	X8, (DX)
 	RET
-
+
+// func aeshash32(p unsafe.Pointer, h uintptr) uintptr
 TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
 	MOVQ	p+0(FP), AX	// ptr to data
 	MOVQ	h+8(FP), X0	// seed
@@ -1221,6 +1226,7 @@ TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
 	MOVQ	X0, ret+16(FP)
 	RET

+// func aeshash64(p unsafe.Pointer, h uintptr) uintptr
 TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
 	MOVQ	p+0(FP), AX	// ptr to data
 	MOVQ	h+8(FP), X0	// seed
@@ -1266,6 +1272,7 @@ DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
 DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
 GLOBL masks<>(SB),RODATA,$256

+// func checkASM() bool
 TEXT ·checkASM(SB),NOSPLIT,$0-1
 	// check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte
 	MOVQ	$masks<>(SB), AX
@@ -1465,7 +1472,7 @@ GLOBL debugCallFrameTooLarge<>(SB), RODATA, $0x14	// Size duplicated below
 // This function communicates back to the debugger by setting RAX and
 // invoking INT3 to raise a breakpoint signal. See the comments in the
 // implementation for the protocol the debugger is expected to
-// follow. InjectDebugCall in the runtime tests demonstates this protocol.
+// follow. InjectDebugCall in the runtime tests demonstrates this protocol.
 //
 // The debugger must ensure that any pointers passed to the function
 // obey escape analysis requirements. Specifically, it must not pass
@@ -1616,6 +1623,7 @@ DEBUG_CALL_FN(debugCall16384<>, 16384)
 DEBUG_CALL_FN(debugCall32768<>, 32768)
 DEBUG_CALL_FN(debugCall65536<>, 65536)

+// func debugCallPanicked(val interface{})
 TEXT runtime·debugCallPanicked(SB),NOSPLIT,$16-16
 	// Copy the panic value to the top of stack.
 	MOVQ	val_type+0(FP), AX
diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s
index af389be9fe..6a6a699241 100644
--- a/src/runtime/asm_arm64.s
+++ b/src/runtime/asm_arm64.s
@@ -39,10 +39,9 @@ TEXT runtime·rt0_go(SB),NOSPLIT,$0
 #endif
 	MOVD	$setg_gcc<>(SB), R1	// arg 1: setg
 	MOVD	g, R0			// arg 0: G
+	SUB	$16, RSP		// reserve 16 bytes for sp-8 where fp may be saved.
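The new Atomics section of HACKING.md above describes mixing atomic and non-atomic access to a read-mostly variable whose updates are protected by a lock. A minimal sketch of that first pattern, written against `sync/atomic` since `runtime/internal/atomic` is not importable; the variable names are invented for illustration:

```go
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// value is a hypothetical read-mostly variable: updates are serialized
// by mu, but readers may load it at any time without the lock.
var (
	mu    sync.Mutex
	value int64
)

// update follows the pattern from the doc: inside the locked region the
// read does not need to be atomic, but the write does, because
// lock-free readers exist concurrently.
func update(delta int64) {
	mu.Lock()
	old := value                         // non-atomic read: protected by mu
	atomic.StoreInt64(&value, old+delta) // atomic write: published to readers
	mu.Unlock()
}

// read runs outside the locked region, so it must be atomic.
func read() int64 {
	return atomic.LoadInt64(&value)
}

func main() {
	update(4)
	fmt.Println(read()) // 4
}
```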
BL (R12) - MOVD _cgo_init(SB), R12 - CMP $0, R12 - BEQ nocgo + ADD $16, RSP nocgo: // update stackguard after _cgo_init @@ -107,6 +106,7 @@ TEXT runtime·gosave(SB), NOSPLIT|NOFRAME, $0-8 MOVD buf+0(FP), R3 MOVD RSP, R0 MOVD R0, gobuf_sp(R3) + MOVD R29, gobuf_bp(R3) MOVD LR, gobuf_pc(R3) MOVD g, gobuf_g(R3) MOVD ZR, gobuf_lr(R3) @@ -128,10 +128,12 @@ TEXT runtime·gogo(SB), NOSPLIT, $24-8 MOVD 0(g), R4 // make sure g is not nil MOVD gobuf_sp(R5), R0 MOVD R0, RSP + MOVD gobuf_bp(R5), R29 MOVD gobuf_lr(R5), LR MOVD gobuf_ret(R5), R0 MOVD gobuf_ctxt(R5), R26 MOVD $0, gobuf_sp(R5) + MOVD $0, gobuf_bp(R5) MOVD $0, gobuf_ret(R5) MOVD $0, gobuf_lr(R5) MOVD $0, gobuf_ctxt(R5) @@ -147,6 +149,7 @@ TEXT runtime·mcall(SB), NOSPLIT|NOFRAME, $0-8 // Save caller state in g->sched MOVD RSP, R0 MOVD R0, (g_sched+gobuf_sp)(g) + MOVD R29, (g_sched+gobuf_bp)(g) MOVD LR, (g_sched+gobuf_pc)(g) MOVD $0, (g_sched+gobuf_lr)(g) MOVD g, (g_sched+gobuf_g)(g) @@ -163,6 +166,7 @@ TEXT runtime·mcall(SB), NOSPLIT|NOFRAME, $0-8 MOVD 0(R26), R4 // code pointer MOVD (g_sched+gobuf_sp)(g), R0 MOVD R0, RSP // sp = m->g0->sched.sp + MOVD (g_sched+gobuf_bp)(g), R29 MOVD R3, -8(RSP) MOVD $0, -16(RSP) SUB $16, RSP @@ -211,6 +215,7 @@ switch: MOVD R6, (g_sched+gobuf_pc)(g) MOVD RSP, R0 MOVD R0, (g_sched+gobuf_sp)(g) + MOVD R29, (g_sched+gobuf_bp)(g) MOVD $0, (g_sched+gobuf_lr)(g) MOVD g, (g_sched+gobuf_g)(g) @@ -224,6 +229,7 @@ switch: MOVD $runtime·mstart(SB), R4 MOVD R4, 0(R3) MOVD R3, RSP + MOVD (g_sched+gobuf_bp)(g), R29 // call target function MOVD 0(R26), R3 // code pointer @@ -235,7 +241,9 @@ switch: BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R0 MOVD R0, RSP + MOVD (g_sched+gobuf_bp)(g), R29 MOVD $0, (g_sched+gobuf_sp)(g) + MOVD $0, (g_sched+gobuf_bp)(g) RET noswitch: @@ -244,6 +252,7 @@ noswitch: // at an intermediate systemstack. MOVD 0(R26), R3 // code pointer MOVD.P 16(RSP), R30 // restore LR + SUB $8, RSP, R29 // restore FP B (R3) /* @@ -278,6 +287,7 @@ TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0 // Set g->sched to context in f MOVD RSP, R0 MOVD R0, (g_sched+gobuf_sp)(g) + MOVD R29, (g_sched+gobuf_bp)(g) MOVD LR, (g_sched+gobuf_pc)(g) MOVD R3, (g_sched+gobuf_lr)(g) MOVD R26, (g_sched+gobuf_ctxt)(g) @@ -294,6 +304,7 @@ TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0 BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R0 MOVD R0, RSP + MOVD (g_sched+gobuf_bp)(g), R29 MOVD.W $0, -16(RSP) // create a call frame on g0 (saved LR; keep 16-aligned) BL runtime·newstack(SB) @@ -843,8 +854,9 @@ TEXT runtime·jmpdefer(SB), NOSPLIT|NOFRAME, $0-16 // Save state of caller into g->sched. Smashes R0. TEXT gosave<>(SB),NOSPLIT|NOFRAME,$0 MOVD LR, (g_sched+gobuf_pc)(g) - MOVD RSP, R0 + MOVD RSP, R0 MOVD R0, (g_sched+gobuf_sp)(g) + MOVD R29, (g_sched+gobuf_bp)(g) MOVD $0, (g_sched+gobuf_lr)(g) MOVD $0, (g_sched+gobuf_ret)(g) // Assert ctxt is zero. See func save. @@ -885,6 +897,7 @@ TEXT ·asmcgocall(SB),NOSPLIT,$0-20 BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R0 MOVD R0, RSP + MOVD (g_sched+gobuf_bp)(g), R29 MOVD R9, R0 // Now on a scheduling stack (a pthread-created stack). @@ -996,6 +1009,7 @@ needm: MOVD m_g0(R8), R3 MOVD RSP, R0 MOVD R0, (g_sched+gobuf_sp)(R3) + MOVD R29, (g_sched+gobuf_bp)(R3) havem: // Now there's a valid m, and we're running on its m->g0. @@ -1003,7 +1017,7 @@ havem: // Save current sp in m->g0->sched.sp in preparation for // switch back to m->curg stack. // NOTE: unwindm knows that the saved g->sched.sp is at 16(RSP) aka savedsp-16(SP). - // Beware that the frame size is actually 32. 
+ // Beware that the frame size is actually 32+16. MOVD m_g0(R8), R3 MOVD (g_sched+gobuf_sp)(R3), R4 MOVD R4, savedsp-16(SP) @@ -1030,10 +1044,12 @@ havem: BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R4 // prepare stack as R4 MOVD (g_sched+gobuf_pc)(g), R5 - MOVD R5, -(24+8)(R4) + MOVD R5, -48(R4) + MOVD (g_sched+gobuf_bp)(g), R5 + MOVD R5, -56(R4) MOVD ctxt+24(FP), R0 - MOVD R0, -(16+8)(R4) - MOVD $-(24+8)(R4), R0 // maintain 16-byte SP alignment + MOVD R0, -40(R4) + MOVD $-48(R4), R0 // maintain 16-byte SP alignment MOVD R0, RSP BL runtime·cgocallbackg(SB) @@ -1041,7 +1057,7 @@ havem: MOVD 0(RSP), R5 MOVD R5, (g_sched+gobuf_pc)(g) MOVD RSP, R4 - ADD $(24+8), R4, R4 + ADD $48, R4, R4 MOVD R4, (g_sched+gobuf_sp)(g) // Switch back to m->g0's stack and restore m->g0->sched.sp. diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s index 0886de9f2b..57877c0194 100644 --- a/src/runtime/asm_ppc64x.s +++ b/src/runtime/asm_ppc64x.s @@ -723,18 +723,11 @@ TEXT runtime·abort(SB),NOSPLIT|NOFRAME,$0-0 MOVW (R0), R0 UNDEF -#define TBRL 268 -#define TBRU 269 /* Time base Upper/Lower */ +#define TBR 268 // int64 runtime·cputicks(void) TEXT runtime·cputicks(SB),NOSPLIT,$0-8 - MOVW SPR(TBRU), R4 - MOVW SPR(TBRL), R3 - MOVW SPR(TBRU), R5 - CMPW R4, R5 - BNE -4(PC) - SLD $32, R5 - OR R5, R3 + MOVD SPR(TBR), R3 MOVD R3, ret+0(FP) RET diff --git a/src/runtime/atomic_pointer.go b/src/runtime/atomic_pointer.go index 09cfbda9b1..b8f0c22c63 100644 --- a/src/runtime/atomic_pointer.go +++ b/src/runtime/atomic_pointer.go @@ -13,8 +13,6 @@ import ( // because while ptr does not escape, new does. // If new is marked as not escaping, the compiler will make incorrect // escape analysis decisions about the pointer value being stored. -// Instead, these are wrappers around the actual atomics (casp1 and so on) -// that use noescape to convey which arguments do not escape. // atomicwb performs a write barrier before an atomic pointer write. // The caller should guard the call with "if writeBarrier.enabled". @@ -37,17 +35,6 @@ func atomicstorep(ptr unsafe.Pointer, new unsafe.Pointer) { atomic.StorepNoWB(noescape(ptr), new) } -//go:nosplit -func casp(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool { - // The write barrier is only necessary if the CAS succeeds, - // but since it needs to happen before the write becomes - // public, we have to do it conservatively all the time. - if writeBarrier.enabled { - atomicwb(ptr, new) - } - return atomic.Casp1((*unsafe.Pointer)(noescape(unsafe.Pointer(ptr))), noescape(old), new) -} - // Like above, but implement in terms of sync/atomic's uintptr operations. // We cannot just call the runtime routines, because the race detector expects // to be able to intercept the sync/atomic forms but not the runtime forms. diff --git a/src/runtime/cgo/callbacks_traceback.go b/src/runtime/cgo/callbacks_traceback.go index f754846722..cdadf9e66f 100644 --- a/src/runtime/cgo/callbacks_traceback.go +++ b/src/runtime/cgo/callbacks_traceback.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build linux +// +build darwin linux package cgo diff --git a/src/runtime/cgo/gcc_traceback.c b/src/runtime/cgo/gcc_traceback.c index 667ea4c0cf..d86331c583 100644 --- a/src/runtime/cgo/gcc_traceback.c +++ b/src/runtime/cgo/gcc_traceback.c @@ -2,17 +2,10 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
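The cgo traceback hunks above move `struct cgoTracebackArg` into `libcgo.h` and extend the build tags to darwin. That struct mirrors the argument documented for the traceback hook of `runtime.SetCgoTraceback`. A hedged sketch of registering such a hook; the function names are invented and the no-op unwinder is purely illustrative:

```go
package main

/*
#include <stdint.h>

// Mirrors struct cgoTracebackArg from runtime/cgo/libcgo.h.
struct cgoTracebackArg {
	uintptr_t  Context;
	uintptr_t  SigContext;
	uintptr_t* Buf;
	uintptr_t  Max;
};

static void myTraceback(void* p) {
	struct cgoTracebackArg* arg = (struct cgoTracebackArg*)(p);
	arg->Buf[0] = 0; // report no C frames; a real hook would unwind the C stack
}

static void* tracebackFn() { return (void*)myTraceback; }
*/
import "C"

import "runtime"

func main() {
	// Version 0 of the SetCgoTraceback protocol; the context and
	// symbolizer hooks are optional and left nil here.
	runtime.SetCgoTraceback(0, C.tracebackFn(), nil, nil)
}
```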
-// +build cgo -// +build linux +// +build cgo,darwin cgo,linux #include <stdint.h> - -struct cgoTracebackArg { - uintptr_t Context; - uintptr_t SigContext; - uintptr_t* Buf; - uintptr_t Max; -}; +#include "libcgo.h" // Call the user's traceback function and then call sigtramp. // The runtime signal handler will jump to this code. diff --git a/src/runtime/cgo/gcc_windows_386.c b/src/runtime/cgo/gcc_windows_386.c index fa0c69bc13..f2ff710f60 100644 --- a/src/runtime/cgo/gcc_windows_386.c +++ b/src/runtime/cgo/gcc_windows_386.c @@ -11,16 +11,9 @@ static void threadentry(void*); -/* 1MB is default stack size for 32-bit Windows. - Allocation granularity on Windows is typically 64 KB. - The constant is also hardcoded in cmd/ld/pe.c (keep synchronized). */ -#define STACKSIZE (1*1024*1024) - void x_cgo_init(G *g) { - int tmp; - g->stacklo = (uintptr)&tmp - STACKSIZE + 8*1024; } @@ -44,8 +37,7 @@ threadentry(void *v) ts = *(ThreadStart*)v; free(v); - ts.g->stackhi = (uintptr)&ts; - ts.g->stacklo = (uintptr)&ts - STACKSIZE + 8*1024; + // minit queries stack bounds from the OS. /* * Set specific keys in thread local storage. diff --git a/src/runtime/cgo/gcc_windows_amd64.c b/src/runtime/cgo/gcc_windows_amd64.c index a3c3896edf..511ab44fa9 100644 --- a/src/runtime/cgo/gcc_windows_amd64.c +++ b/src/runtime/cgo/gcc_windows_amd64.c @@ -11,16 +11,9 @@ static void threadentry(void*); -/* 2MB is default stack size for 64-bit Windows. - Allocation granularity on Windows is typically 64 KB. - The constant is also hardcoded in cmd/ld/pe.c (keep synchronized). */ -#define STACKSIZE (2*1024*1024) - void x_cgo_init(G *g) { - int tmp; - g->stacklo = (uintptr)&tmp - STACKSIZE + 8*1024; } @@ -44,8 +37,7 @@ threadentry(void *v) ts = *(ThreadStart*)v; free(v); - ts.g->stackhi = (uintptr)&ts; - ts.g->stacklo = (uintptr)&ts - STACKSIZE + 8*1024; + // minit queries stack bounds from the OS. /* * Set specific keys in thread local storage. diff --git a/src/runtime/cgo/libcgo.h b/src/runtime/cgo/libcgo.h index c38fb643ff..60326720a7 100644 --- a/src/runtime/cgo/libcgo.h +++ b/src/runtime/cgo/libcgo.h @@ -97,6 +97,16 @@ struct context_arg { extern void (*(_cgo_get_context_function(void)))(struct context_arg*); /* + * The argument for the cgo traceback callback. See runtime.SetCgoTraceback. + */ +struct cgoTracebackArg { + uintptr_t Context; + uintptr_t SigContext; + uintptr_t* Buf; + uintptr_t Max; +}; + +/* * TSAN support. This is only useful when building with * CGO_CFLAGS="-fsanitize=thread" CGO_LDFLAGS="-fsanitize=thread" go install */ diff --git a/src/runtime/cgo/netbsd.go b/src/runtime/cgo/netbsd.go index 2cecd0c57a..74d0aed014 100644 --- a/src/runtime/cgo/netbsd.go +++ b/src/runtime/cgo/netbsd.go @@ -14,6 +14,8 @@ import _ "unsafe" // for go:linkname //go:linkname _environ environ //go:linkname _progname __progname +//go:linkname ___ps_strings __ps_strings var _environ uintptr var _progname uintptr +var ___ps_strings uintptr diff --git a/src/runtime/cgocall.go b/src/runtime/cgocall.go index c85033f4bc..86bd2fb01c 100644 --- a/src/runtime/cgocall.go +++ b/src/runtime/cgocall.go @@ -268,7 +268,8 @@ func cgocallbackg1(ctxt uintptr) { case "arm64": // On arm64, stack frame is four words and there's a saved LR between // SP and the stack frame and between the stack frame and the arguments. - cb = (*args)(unsafe.Pointer(sp + 5*sys.PtrSize)) + // Additional two words (16-byte alignment) are for saving FP. + cb = (*args)(unsafe.Pointer(sp + 7*sys.PtrSize)) case "amd64": // On amd64, stack frame is two words, plus caller PC. 
if framepointer_enabled { diff --git a/src/runtime/chan.go b/src/runtime/chan.go index ce71cee4c5..615643e6a6 100644 --- a/src/runtime/chan.go +++ b/src/runtime/chan.go @@ -343,7 +343,7 @@ func closechan(c *hchan) { c.closed = 1 - var glist *g + var glist gList // release all readers for { @@ -363,8 +363,7 @@ func closechan(c *hchan) { if raceenabled { raceacquireg(gp, unsafe.Pointer(c)) } - gp.schedlink.set(glist) - glist = gp + glist.push(gp) } // release all writers (they will panic) @@ -382,15 +381,13 @@ func closechan(c *hchan) { if raceenabled { raceacquireg(gp, unsafe.Pointer(c)) } - gp.schedlink.set(glist) - glist = gp + glist.push(gp) } unlock(&c.lock) // Ready all Gs now that we've dropped the channel lock. - for glist != nil { - gp := glist - glist = glist.schedlink.ptr() + for !glist.empty() { + gp := glist.pop() gp.schedlink = 0 goready(gp, 3) } diff --git a/src/runtime/cpuflags.go b/src/runtime/cpuflags.go new file mode 100644 index 0000000000..b65523766a --- /dev/null +++ b/src/runtime/cpuflags.go @@ -0,0 +1,19 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/cpu" + "unsafe" +) + +// Offsets into internal/cpu records for use in assembly. +const ( + offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2) + offsetX86HasERMS = unsafe.Offsetof(cpu.X86.HasERMS) + offsetX86HasSSE2 = unsafe.Offsetof(cpu.X86.HasSSE2) + + offsetARMHasIDIVA = unsafe.Offsetof(cpu.ARM.HasIDIVA) +) diff --git a/src/runtime/cpuflags_amd64.go b/src/runtime/cpuflags_amd64.go index 10ab5f5b00..8cca4bca8f 100644 --- a/src/runtime/cpuflags_amd64.go +++ b/src/runtime/cpuflags_amd64.go @@ -6,12 +6,6 @@ package runtime import ( "internal/cpu" - "unsafe" -) - -// Offsets into internal/cpu records for use in assembly. -const ( - offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2) ) var useAVXmemmove bool diff --git a/src/runtime/crash_cgo_test.go b/src/runtime/crash_cgo_test.go index d8f75a468b..6da8341e84 100644 --- a/src/runtime/crash_cgo_test.go +++ b/src/runtime/crash_cgo_test.go @@ -239,8 +239,12 @@ func TestCgoCCodeSIGPROF(t *testing.T) { func TestCgoCrashTraceback(t *testing.T) { t.Parallel() - if runtime.GOOS != "linux" || (runtime.GOARCH != "amd64" && runtime.GOARCH != "ppc64le") { - t.Skipf("not yet supported on %s/%s", runtime.GOOS, runtime.GOARCH) + switch platform := runtime.GOOS + "/" + runtime.GOARCH; platform { + case "darwin/amd64": + case "linux/amd64": + case "linux/ppc64le": + default: + t.Skipf("not yet supported on %s", platform) } got := runTestProg(t, "testprogcgo", "CrashTraceback") for i := 1; i <= 3; i++ { @@ -489,3 +493,19 @@ func TestCgoTracebackSigpanic(t *testing.T) { t.Fatalf("failure incorrectly contains %q. output:\n%s\n", nowant, got) } } + +// Test that C code called via cgo can use large Windows thread stacks +// and call back in to Go without crashing. See issue #20975. +// +// See also TestBigStackCallbackSyscall. 
+func TestBigStackCallbackCgo(t *testing.T) { + if runtime.GOOS != "windows" { + t.Skip("skipping windows specific test") + } + t.Parallel() + got := runTestProg(t, "testprogcgo", "BigStack") + want := "OK\n" + if got != want { + t.Errorf("expected %q got %v", want, got) + } +} diff --git a/src/runtime/crash_test.go b/src/runtime/crash_test.go index 843b415006..2766b8850a 100644 --- a/src/runtime/crash_test.go +++ b/src/runtime/crash_test.go @@ -640,17 +640,88 @@ func TestTimePprof(t *testing.T) { // Test that runtime.abort does so. func TestAbort(t *testing.T) { - output := runTestProg(t, "testprog", "Abort") + // Pass GOTRACEBACK to ensure we get runtime frames. + output := runTestProg(t, "testprog", "Abort", "GOTRACEBACK=system") if want := "runtime.abort"; !strings.Contains(output, want) { t.Errorf("output does not contain %q:\n%s", want, output) } if strings.Contains(output, "BAD") { t.Errorf("output contains BAD:\n%s", output) } - // Check that it's a signal-style traceback. - if runtime.GOOS != "windows" { - if want := "PC="; !strings.Contains(output, want) { - t.Errorf("output does not contain %q:\n%s", want, output) + // Check that it's a signal traceback. + want := "PC=" + // For systems that use a breakpoint, check specifically for that. + switch runtime.GOARCH { + case "386", "amd64": + switch runtime.GOOS { + case "plan9": + want = "sys: breakpoint" + case "windows": + want = "Exception 0x80000003" + default: + want = "SIGTRAP" + } + } + if !strings.Contains(output, want) { + t.Errorf("output does not contain %q:\n%s", want, output) + } +} + +// For TestRuntimePanic: test a panic in the runtime package without +// involving the testing harness. +func init() { + if os.Getenv("GO_TEST_RUNTIME_PANIC") == "1" { + defer func() { + if r := recover(); r != nil { + // We expect to crash, so exit 0 + // to indicate failure. + os.Exit(0) + } + }() + runtime.PanicForTesting(nil, 1) + // We expect to crash, so exit 0 to indicate failure. + os.Exit(0) + } +} + +func TestRuntimePanic(t *testing.T) { + testenv.MustHaveExec(t) + cmd := exec.Command(os.Args[0], "-test.run=TestRuntimePanic") + cmd.Env = append(cmd.Env, "GO_TEST_RUNTIME_PANIC=1") + out, err := cmd.CombinedOutput() + t.Logf("%s", out) + if err == nil { + t.Error("child process did not fail") + } else if want := "runtime.unexportedPanicForTesting"; !bytes.Contains(out, []byte(want)) { + t.Errorf("output did not contain expected string %q", want) + } +} + +// Test that g0 stack overflows are handled gracefully. +func TestG0StackOverflow(t *testing.T) { + testenv.MustHaveExec(t) + + switch runtime.GOOS { + case "darwin", "dragonfly", "freebsd", "linux", "netbsd", "openbsd", "android": + t.Skipf("g0 stack is wrong on pthread platforms (see golang.org/issue/26061)") + } + + if os.Getenv("TEST_G0_STACK_OVERFLOW") != "1" { + cmd := testenv.CleanCmdEnv(exec.Command(os.Args[0], "-test.run=TestG0StackOverflow", "-test.v")) + cmd.Env = append(cmd.Env, "TEST_G0_STACK_OVERFLOW=1") + out, err := cmd.CombinedOutput() + // Don't check err since it's expected to crash. + if n := strings.Count(string(out), "morestack on g0\n"); n != 1 { + t.Fatalf("%s\n(exit status %v)", out, err) + } + // Check that it's a signal-style traceback. 
+ if runtime.GOOS != "windows" { + if want := "PC="; !strings.Contains(string(out), want) { + t.Errorf("output does not contain %q:\n%s", want, out) + } } + return } + + runtime.G0StackOverflow() } diff --git a/src/runtime/debug_test.go b/src/runtime/debug_test.go index 4181d59c1f..a34f4c77f7 100644 --- a/src/runtime/debug_test.go +++ b/src/runtime/debug_test.go @@ -69,8 +69,8 @@ func debugCallWorker2(stop *uint32, x *int) { *x = 1 } -func debugCallTKill(tid int) { - syscall.Tgkill(syscall.Getpid(), tid, syscall.SIGTRAP) +func debugCallTKill(tid int) error { + return syscall.Tgkill(syscall.Getpid(), tid, syscall.SIGTRAP) } func TestDebugCall(t *testing.T) { diff --git a/src/runtime/defs_openbsd.go b/src/runtime/defs_openbsd.go index 9ff13dfcbf..a328d25db3 100644 --- a/src/runtime/defs_openbsd.go +++ b/src/runtime/defs_openbsd.go @@ -37,6 +37,7 @@ const ( MAP_ANON = C.MAP_ANON MAP_PRIVATE = C.MAP_PRIVATE MAP_FIXED = C.MAP_FIXED + MAP_STACK = C.MAP_STACK MADV_FREE = C.MADV_FREE diff --git a/src/runtime/defs_openbsd_386.go b/src/runtime/defs_openbsd_386.go index 1185530964..7b956c44f0 100644 --- a/src/runtime/defs_openbsd_386.go +++ b/src/runtime/defs_openbsd_386.go @@ -17,6 +17,7 @@ const ( _MAP_ANON = 0x1000 _MAP_PRIVATE = 0x2 _MAP_FIXED = 0x10 + _MAP_STACK = 0x4000 _MADV_FREE = 0x6 diff --git a/src/runtime/defs_openbsd_amd64.go b/src/runtime/defs_openbsd_amd64.go index 4bb8eac08f..0a93905717 100644 --- a/src/runtime/defs_openbsd_amd64.go +++ b/src/runtime/defs_openbsd_amd64.go @@ -17,6 +17,7 @@ const ( _MAP_ANON = 0x1000 _MAP_PRIVATE = 0x2 _MAP_FIXED = 0x10 + _MAP_STACK = 0x4000 _MADV_FREE = 0x6 diff --git a/src/runtime/defs_openbsd_arm.go b/src/runtime/defs_openbsd_arm.go index 38b77c92d0..1eea9ad45a 100644 --- a/src/runtime/defs_openbsd_arm.go +++ b/src/runtime/defs_openbsd_arm.go @@ -17,6 +17,7 @@ const ( _MAP_ANON = 0x1000 _MAP_PRIVATE = 0x2 _MAP_FIXED = 0x10 + _MAP_STACK = 0x4000 _MADV_FREE = 0x6 diff --git a/src/runtime/defs_windows.go b/src/runtime/defs_windows.go index 7ce6797414..9bd9107476 100644 --- a/src/runtime/defs_windows.go +++ b/src/runtime/defs_windows.go @@ -71,3 +71,4 @@ type FloatingSaveArea C.FLOATING_SAVE_AREA type M128a C.M128A type Context C.CONTEXT type Overlapped C.OVERLAPPED +type MemoryBasicInformation C.MEMORY_BASIC_INFORMATION diff --git a/src/runtime/defs_windows_386.go b/src/runtime/defs_windows_386.go index bac6ce78ce..589a7884cd 100644 --- a/src/runtime/defs_windows_386.go +++ b/src/runtime/defs_windows_386.go @@ -129,3 +129,13 @@ type overlapped struct { anon0 [8]byte hevent *byte } + +type memoryBasicInformation struct { + baseAddress uintptr + allocationBase uintptr + allocationProtect uint32 + regionSize uintptr + state uint32 + protect uint32 + type_ uint32 +} diff --git a/src/runtime/defs_windows_amd64.go b/src/runtime/defs_windows_amd64.go index 6e04568114..1e173e934d 100644 --- a/src/runtime/defs_windows_amd64.go +++ b/src/runtime/defs_windows_amd64.go @@ -151,3 +151,13 @@ type overlapped struct { anon0 [8]byte hevent *byte } + +type memoryBasicInformation struct { + baseAddress uintptr + allocationBase uintptr + allocationProtect uint32 + regionSize uintptr + state uint32 + protect uint32 + type_ uint32 +} diff --git a/src/runtime/error.go b/src/runtime/error.go index 1f77c0a0b5..9a2beaeb95 100644 --- a/src/runtime/error.go +++ b/src/runtime/error.go @@ -19,32 +19,37 @@ type Error interface { // A TypeAssertionError explains a failed type assertion. 
type TypeAssertionError struct { - interfaceString string - concreteString string - assertedString string - missingMethod string // one method needed by Interface, missing from Concrete + _interface *_type + concrete *_type + asserted *_type + missingMethod string // one method needed by Interface, missing from Concrete } func (*TypeAssertionError) RuntimeError() {} func (e *TypeAssertionError) Error() string { - inter := e.interfaceString - if inter == "" { - inter = "interface" + inter := "interface" + if e._interface != nil { + inter = e._interface.string() } - if e.concreteString == "" { - return "interface conversion: " + inter + " is nil, not " + e.assertedString + as := e.asserted.string() + if e.concrete == nil { + return "interface conversion: " + inter + " is nil, not " + as } + cs := e.concrete.string() if e.missingMethod == "" { - msg := "interface conversion: " + inter + " is " + e.concreteString + - ", not " + e.assertedString - if e.concreteString == e.assertedString { + msg := "interface conversion: " + inter + " is " + cs + ", not " + as + if cs == as { // provide slightly clearer error message - msg += " (types from different packages)" + if e.concrete.pkgpath() != e.asserted.pkgpath() { + msg += " (types from different packages)" + } else { + msg += " (types from different scopes)" + } } return msg } - return "interface conversion: " + e.concreteString + " is not " + e.assertedString + + return "interface conversion: " + cs + " is not " + as + ": missing method " + e.missingMethod } diff --git a/src/runtime/export_debug_test.go b/src/runtime/export_debug_test.go index 78436f36cf..74f8855de6 100644 --- a/src/runtime/export_debug_test.go +++ b/src/runtime/export_debug_test.go @@ -20,7 +20,7 @@ import ( // // On success, InjectDebugCall returns the panic value of fn or nil. // If fn did not panic, its results will be available in args. -func InjectDebugCall(gp *g, fn, args interface{}, tkill func(tid int)) (interface{}, error) { +func InjectDebugCall(gp *g, fn, args interface{}, tkill func(tid int) error) (interface{}, error) { if gp.lockedm == 0 { return nil, plainError("goroutine not locked to thread") } @@ -54,7 +54,9 @@ func InjectDebugCall(gp *g, fn, args interface{}, tkill func(tid int)) (interfac defer func() { testSigtrap = nil }() testSigtrap = h.inject - tkill(tid) + if err := tkill(tid); err != nil { + return nil, err + } // Wait for completion. 
notetsleepg(&h.done, -1) if len(h.err) != 0 { @@ -113,7 +115,7 @@ func (h *debugCallHandler) handle(info *siginfo, ctxt *sigctxt, gp2 *g) bool { return false } f := findfunc(uintptr(ctxt.rip())) - if !(hasprefix(funcname(f), "runtime.debugCall") || hasprefix(funcname(f), "debugCall")) { + if !(hasPrefix(funcname(f), "runtime.debugCall") || hasPrefix(funcname(f), "debugCall")) { println("trap in unknown function", funcname(f)) return false } diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index b21179cc8c..89f887b765 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -451,3 +451,24 @@ type G = g func Getg() *G { return getg() } + +//go:noinline +func PanicForTesting(b []byte, i int) byte { + return unexportedPanicForTesting(b, i) +} + +//go:noinline +func unexportedPanicForTesting(b []byte, i int) byte { + return b[i] +} + +func G0StackOverflow() { + systemstack(func() { + stackOverflow(nil) + }) +} + +func stackOverflow(x *byte) { + var buf [256]byte + stackOverflow(&buf[0]) +} diff --git a/src/runtime/extern.go b/src/runtime/extern.go index 7171b139c3..1773c8fe7e 100644 --- a/src/runtime/extern.go +++ b/src/runtime/extern.go @@ -176,7 +176,7 @@ func Caller(skip int) (pc uintptr, file string, line int, ok bool) { // what it called, so that CallersFrames can see if it "called" // sigpanic, and possibly a PC for skipPleaseUseCallersFrames. var rpc [3]uintptr - if callers(1+skip-1, rpc[:]) < 2 { + if callers(skip, rpc[:]) < 2 { return } var stackExpander stackExpander @@ -238,6 +238,7 @@ func Version() string { // GOOS is the running program's operating system target: // one of darwin, freebsd, linux, and so on. +// To view possible combinations of GOOS and GOARCH, run "go tool dist list". const GOOS string = sys.GOOS // GOARCH is the running program's architecture target: diff --git a/src/runtime/funcdata.h b/src/runtime/funcdata.h index 4c290b9b9a..e6e0306e65 100644 --- a/src/runtime/funcdata.h +++ b/src/runtime/funcdata.h @@ -6,7 +6,7 @@ // in Go binaries. It is included by assembly sources, so it must // be written using #defines. // -// These must agree with symtab.go and ../cmd/internal/obj/funcdata.go. +// These must agree with symtab.go and ../cmd/internal/objabi/funcdata.go. #define PCDATA_StackMapIndex 0 #define PCDATA_InlTreeIndex 1 diff --git a/src/runtime/gc_test.go b/src/runtime/gc_test.go index 4895a0e2ac..0da19cdf34 100644 --- a/src/runtime/gc_test.go +++ b/src/runtime/gc_test.go @@ -21,6 +21,9 @@ func TestGcSys(t *testing.T) { if os.Getenv("GOGC") == "off" { t.Skip("skipping test; GOGC=off in environment") } + if runtime.GOOS == "windows" { + t.Skip("skipping test; GOOS=windows http://golang.org/issue/27156") + } got := runTestProg(t, "testprog", "GCSys") want := "OK\n" if got != want { diff --git a/src/runtime/iface.go b/src/runtime/iface.go index 15c412c4e6..7ab731151e 100644 --- a/src/runtime/iface.go +++ b/src/runtime/iface.go @@ -41,7 +41,7 @@ func getitab(inter *interfacetype, typ *_type, canfail bool) *itab { return nil } name := inter.typ.nameOff(inter.mhdr[0].name) - panic(&TypeAssertionError{"", typ.string(), inter.typ.string(), name.name()}) + panic(&TypeAssertionError{nil, typ, &inter.typ, name.name()}) } var m *itab @@ -82,7 +82,7 @@ finish: // The cached result doesn't record which // interface function was missing, so initialize // the itab again to get the missing function name. 
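The error.go and iface.go hunks above change TypeAssertionError to carry `*_type` descriptors instead of pre-rendered strings, which is what lets the message distinguish same-name types from different packages or scopes. A small demonstration of what such an error looks like from user code; the message shown follows the standard format rather than anything specific to this diff:

```go
package main

import (
	"fmt"
	"runtime"
)

func main() {
	defer func() {
		// The recovered value is a *runtime.TypeAssertionError; its
		// Error method renders the message from the type descriptors.
		if e, ok := recover().(*runtime.TypeAssertionError); ok {
			fmt.Println(e)
		}
	}()
	var i interface{} = "hello"
	_ = i.(int) // panics: interface conversion: interface {} is string, not int
}
```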
- panic(&TypeAssertionError{concreteString: typ.string(), assertedString: inter.typ.string(), missingMethod: m.init()}) + panic(&TypeAssertionError{concrete: typ, asserted: &inter.typ, missingMethod: m.init()}) } // find finds the given interface/type pair in t. @@ -245,11 +245,7 @@ func itabsinit() { // want = the static type we're trying to convert to. // iface = the static type we're converting from. func panicdottypeE(have, want, iface *_type) { - haveString := "" - if have != nil { - haveString = have.string() - } - panic(&TypeAssertionError{iface.string(), haveString, want.string(), ""}) + panic(&TypeAssertionError{iface, have, want, ""}) } // panicdottypeI is called when doing an i.(T) conversion and the conversion fails. @@ -265,7 +261,7 @@ func panicdottypeI(have *itab, want, iface *_type) { // panicnildottype is called when doing a i.(T) conversion and the interface i is nil. // want = the static type we're trying to convert to. func panicnildottype(want *_type) { - panic(&TypeAssertionError{"", "", want.string(), ""}) + panic(&TypeAssertionError{nil, nil, want, ""}) // TODO: Add the static type we're converting from as well. // It might generate a better error message. // Just to match other nil conversion errors, we don't for now. @@ -516,7 +512,7 @@ func assertI2I(inter *interfacetype, i iface) (r iface) { tab := i.tab if tab == nil { // explicit conversions require non-nil interface value. - panic(&TypeAssertionError{"", "", inter.typ.string(), ""}) + panic(&TypeAssertionError{nil, nil, &inter.typ, ""}) } if tab.inter == inter { r.tab = tab @@ -549,7 +545,7 @@ func assertE2I(inter *interfacetype, e eface) (r iface) { t := e._type if t == nil { // explicit conversions require non-nil interface value. - panic(&TypeAssertionError{"", "", inter.typ.string(), ""}) + panic(&TypeAssertionError{nil, nil, &inter.typ, ""}) } r.tab = getitab(inter, t, false) r.data = e.data diff --git a/src/runtime/internal/atomic/atomic_arm.go b/src/runtime/internal/atomic/atomic_arm.go index 4ed7e991fe..1ecdb11db9 100644 --- a/src/runtime/internal/atomic/atomic_arm.go +++ b/src/runtime/internal/atomic/atomic_arm.go @@ -7,7 +7,7 @@ package atomic import ( - "runtime/internal/sys" + "internal/cpu" "unsafe" ) @@ -31,7 +31,7 @@ func (l *spinlock) unlock() { var locktab [57]struct { l spinlock - pad [sys.CacheLineSize - unsafe.Sizeof(spinlock{})]byte + pad [cpu.CacheLinePadSize - unsafe.Sizeof(spinlock{})]byte } func addrLock(addr *uint64) *spinlock { diff --git a/src/runtime/internal/atomic/atomic_mipsx.go b/src/runtime/internal/atomic/atomic_mipsx.go index 32be1c779d..55943f6925 100644 --- a/src/runtime/internal/atomic/atomic_mipsx.go +++ b/src/runtime/internal/atomic/atomic_mipsx.go @@ -7,14 +7,14 @@ package atomic import ( - "runtime/internal/sys" + "internal/cpu" "unsafe" ) // TODO implement lock striping var lock struct { state uint32 - pad [sys.CacheLineSize - 4]byte + pad [cpu.CacheLinePadSize - 4]byte } //go:noescape diff --git a/src/runtime/internal/sys/arch_386.go b/src/runtime/internal/sys/arch_386.go index 5fb1fba02b..5375701337 100644 --- a/src/runtime/internal/sys/arch_386.go +++ b/src/runtime/internal/sys/arch_386.go @@ -7,7 +7,6 @@ package sys const ( ArchFamily = I386 BigEndian = false - CacheLineSize = 64 DefaultPhysPageSize = GoosNacl*65536 + (1-GoosNacl)*4096 // 4k normally; 64k on NaCl PCQuantum = 1 Int64Align = 4 diff --git a/src/runtime/internal/sys/arch_amd64.go b/src/runtime/internal/sys/arch_amd64.go index 2f32bc469f..86fed4d531 100644 --- a/src/runtime/internal/sys/arch_amd64.go 
+++ b/src/runtime/internal/sys/arch_amd64.go @@ -7,7 +7,6 @@ package sys const ( ArchFamily = AMD64 BigEndian = false - CacheLineSize = 64 DefaultPhysPageSize = 4096 PCQuantum = 1 Int64Align = 8 diff --git a/src/runtime/internal/sys/arch_amd64p32.go b/src/runtime/internal/sys/arch_amd64p32.go index c560907c67..749d724809 100644 --- a/src/runtime/internal/sys/arch_amd64p32.go +++ b/src/runtime/internal/sys/arch_amd64p32.go @@ -7,7 +7,6 @@ package sys const ( ArchFamily = AMD64 BigEndian = false - CacheLineSize = 64 DefaultPhysPageSize = 65536*GoosNacl + 4096*(1-GoosNacl) PCQuantum = 1 Int64Align = 8 diff --git a/src/runtime/internal/sys/arch_arm.go b/src/runtime/internal/sys/arch_arm.go index f383d82027..2af09e0e35 100644 --- a/src/runtime/internal/sys/arch_arm.go +++ b/src/runtime/internal/sys/arch_arm.go @@ -7,7 +7,6 @@ package sys const ( ArchFamily = ARM BigEndian = false - CacheLineSize = 32 DefaultPhysPageSize = 65536 PCQuantum = 4 Int64Align = 4 diff --git a/src/runtime/internal/sys/arch_arm64.go b/src/runtime/internal/sys/arch_arm64.go index cb83ecc445..f13d2de129 100644 --- a/src/runtime/internal/sys/arch_arm64.go +++ b/src/runtime/internal/sys/arch_arm64.go @@ -7,7 +7,6 @@ package sys const ( ArchFamily = ARM64 BigEndian = false - CacheLineSize = 64 DefaultPhysPageSize = 65536 PCQuantum = 4 Int64Align = 8 diff --git a/src/runtime/internal/sys/arch_mips.go b/src/runtime/internal/sys/arch_mips.go index e12f32d0ee..e9bd69c928 100644 --- a/src/runtime/internal/sys/arch_mips.go +++ b/src/runtime/internal/sys/arch_mips.go @@ -7,7 +7,6 @@ package sys const ( ArchFamily = MIPS BigEndian = true - CacheLineSize = 32 DefaultPhysPageSize = 65536 PCQuantum = 4 Int64Align = 4 diff --git a/src/runtime/internal/sys/arch_mips64.go b/src/runtime/internal/sys/arch_mips64.go index 973ec10e17..5eb7b2b7b1 100644 --- a/src/runtime/internal/sys/arch_mips64.go +++ b/src/runtime/internal/sys/arch_mips64.go @@ -7,7 +7,6 @@ package sys const ( ArchFamily = MIPS64 BigEndian = true - CacheLineSize = 32 DefaultPhysPageSize = 16384 PCQuantum = 4 Int64Align = 8 diff --git a/src/runtime/internal/sys/arch_mips64le.go b/src/runtime/internal/sys/arch_mips64le.go index e96d962f36..14c804ed85 100644 --- a/src/runtime/internal/sys/arch_mips64le.go +++ b/src/runtime/internal/sys/arch_mips64le.go @@ -7,7 +7,6 @@ package sys const ( ArchFamily = MIPS64 BigEndian = false - CacheLineSize = 32 DefaultPhysPageSize = 16384 PCQuantum = 4 Int64Align = 8 diff --git a/src/runtime/internal/sys/arch_mipsle.go b/src/runtime/internal/sys/arch_mipsle.go index 25742ae9d3..91badb17d5 100644 --- a/src/runtime/internal/sys/arch_mipsle.go +++ b/src/runtime/internal/sys/arch_mipsle.go @@ -7,7 +7,6 @@ package sys const ( ArchFamily = MIPS BigEndian = false - CacheLineSize = 32 DefaultPhysPageSize = 65536 PCQuantum = 4 Int64Align = 4 diff --git a/src/runtime/internal/sys/arch_ppc64.go b/src/runtime/internal/sys/arch_ppc64.go index a538bbdec0..8cde4e18d0 100644 --- a/src/runtime/internal/sys/arch_ppc64.go +++ b/src/runtime/internal/sys/arch_ppc64.go @@ -7,7 +7,6 @@ package sys const ( ArchFamily = PPC64 BigEndian = true - CacheLineSize = 128 DefaultPhysPageSize = 65536 PCQuantum = 4 Int64Align = 8 diff --git a/src/runtime/internal/sys/arch_ppc64le.go b/src/runtime/internal/sys/arch_ppc64le.go index aa50689181..10c0066849 100644 --- a/src/runtime/internal/sys/arch_ppc64le.go +++ b/src/runtime/internal/sys/arch_ppc64le.go @@ -7,7 +7,6 @@ package sys const ( ArchFamily = PPC64 BigEndian = false - CacheLineSize = 128 DefaultPhysPageSize = 65536 
PCQuantum = 4 Int64Align = 8 diff --git a/src/runtime/internal/sys/arch_s390x.go b/src/runtime/internal/sys/arch_s390x.go index e42c420a54..77fd4bf07d 100644 --- a/src/runtime/internal/sys/arch_s390x.go +++ b/src/runtime/internal/sys/arch_s390x.go @@ -7,7 +7,6 @@ package sys const ( ArchFamily = S390X BigEndian = true - CacheLineSize = 256 DefaultPhysPageSize = 4096 PCQuantum = 2 Int64Align = 8 diff --git a/src/runtime/internal/sys/arch_wasm.go b/src/runtime/internal/sys/arch_wasm.go index 5463f934d6..203fc2e472 100644 --- a/src/runtime/internal/sys/arch_wasm.go +++ b/src/runtime/internal/sys/arch_wasm.go @@ -7,7 +7,6 @@ package sys const ( ArchFamily = WASM BigEndian = false - CacheLineSize = 64 DefaultPhysPageSize = 65536 PCQuantum = 1 Int64Align = 8 diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index e75edf05fd..07e0a67240 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -427,6 +427,14 @@ func mallocinit() { p = uintptr(i)<<40 | uintptrMask&(0x0013<<28) case GOARCH == "arm64": p = uintptr(i)<<40 | uintptrMask&(0x0040<<32) + case raceenabled: + // The TSAN runtime requires the heap + // to be in the range [0x00c000000000, + // 0x00e000000000). + p = uintptr(i)<<32 | uintptrMask&(0x00c0<<32) + if p >= uintptrMask&0x00e000000000 { + continue + } default: p = uintptr(i)<<40 | uintptrMask&(0x00c0<<32) } diff --git a/src/runtime/malloc_test.go b/src/runtime/malloc_test.go index 0bce059f7f..e6afc25ea9 100644 --- a/src/runtime/malloc_test.go +++ b/src/runtime/malloc_test.go @@ -19,7 +19,11 @@ import ( "unsafe" ) +var testMemStatsCount int + func TestMemStats(t *testing.T) { + testMemStatsCount++ + // Make sure there's at least one forced GC. GC() @@ -35,6 +39,13 @@ func TestMemStats(t *testing.T) { } le := func(thresh float64) func(interface{}) error { return func(x interface{}) error { + // These sanity tests aren't necessarily valid + // with high -test.count values, so only run + // them once. + if testMemStatsCount > 1 { + return nil + } + if reflect.ValueOf(x).Convert(reflect.TypeOf(thresh)).Float() < thresh { return nil } diff --git a/src/runtime/map.go b/src/runtime/map.go index 0e00f12974..c3fcfbfdbe 100644 --- a/src/runtime/map.go +++ b/src/runtime/map.go @@ -126,7 +126,7 @@ type mapextra struct { // If both key and value do not contain pointers and are inline, then we mark bucket // type as containing no pointers. This avoids scanning such maps. // However, bmap.overflow is a pointer. In order to keep overflow buckets - // alive, we store pointers to all overflow buckets in hmap.overflow and h.map.oldoverflow. + // alive, we store pointers to all overflow buckets in hmap.extra.overflow and hmap.extra.oldoverflow. // overflow and oldoverflow are only used if key and value do not contain pointers. // overflow contains overflow buckets for hmap.buckets. // oldoverflow contains overflow buckets for hmap.oldbuckets. @@ -567,7 +567,7 @@ func mapassign(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { // Set hashWriting after calling alg.hash, since alg.hash may panic, // in which case we have not actually done a write. - h.flags |= hashWriting + h.flags ^= hashWriting if h.buckets == nil { h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) @@ -679,7 +679,7 @@ func mapdelete(t *maptype, h *hmap, key unsafe.Pointer) { // Set hashWriting after calling alg.hash, since alg.hash may panic, // in which case we have not actually done a write (delete). 
- h.flags |= hashWriting + h.flags ^= hashWriting bucket := hash & bucketMask(h.B) if h.growing() { @@ -921,7 +921,7 @@ func mapclear(t *maptype, h *hmap) { throw("concurrent map writes") } - h.flags |= hashWriting + h.flags ^= hashWriting h.flags &^= sameSizeGrow h.oldbuckets = nil @@ -1282,6 +1282,11 @@ func reflect_mapiterkey(it *hiter) unsafe.Pointer { return it.key } +//go:linkname reflect_mapitervalue reflect.mapitervalue +func reflect_mapitervalue(it *hiter) unsafe.Pointer { + return it.value +} + //go:linkname reflect_maplen reflect.maplen func reflect_maplen(h *hmap) int { if h == nil { diff --git a/src/runtime/map_fast32.go b/src/runtime/map_fast32.go index bf0b23604b..671558545a 100644 --- a/src/runtime/map_fast32.go +++ b/src/runtime/map_fast32.go @@ -103,7 +103,7 @@ func mapassign_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer { hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) // Set hashWriting after calling alg.hash for consistency with mapassign. - h.flags |= hashWriting + h.flags ^= hashWriting if h.buckets == nil { h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) @@ -189,7 +189,7 @@ func mapassign_fast32ptr(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) // Set hashWriting after calling alg.hash for consistency with mapassign. - h.flags |= hashWriting + h.flags ^= hashWriting if h.buckets == nil { h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) @@ -276,7 +276,7 @@ func mapdelete_fast32(t *maptype, h *hmap, key uint32) { hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) // Set hashWriting after calling alg.hash for consistency with mapdelete - h.flags |= hashWriting + h.flags ^= hashWriting bucket := hash & bucketMask(h.B) if h.growing() { diff --git a/src/runtime/map_fast64.go b/src/runtime/map_fast64.go index 4bde9e2be0..164a4dd1ce 100644 --- a/src/runtime/map_fast64.go +++ b/src/runtime/map_fast64.go @@ -103,7 +103,7 @@ func mapassign_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer { hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) // Set hashWriting after calling alg.hash for consistency with mapassign. - h.flags |= hashWriting + h.flags ^= hashWriting if h.buckets == nil { h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) @@ -189,7 +189,7 @@ func mapassign_fast64ptr(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) // Set hashWriting after calling alg.hash for consistency with mapassign. - h.flags |= hashWriting + h.flags ^= hashWriting if h.buckets == nil { h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) @@ -276,7 +276,7 @@ func mapdelete_fast64(t *maptype, h *hmap, key uint64) { hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) // Set hashWriting after calling alg.hash for consistency with mapdelete - h.flags |= hashWriting + h.flags ^= hashWriting bucket := hash & bucketMask(h.B) if h.growing() { diff --git a/src/runtime/map_faststr.go b/src/runtime/map_faststr.go index 415bbff143..bee62dfb03 100644 --- a/src/runtime/map_faststr.go +++ b/src/runtime/map_faststr.go @@ -202,7 +202,7 @@ func mapassign_faststr(t *maptype, h *hmap, s string) unsafe.Pointer { hash := t.key.alg.hash(noescape(unsafe.Pointer(&s)), uintptr(h.hash0)) // Set hashWriting after calling alg.hash for consistency with mapassign. 
- h.flags |= hashWriting + h.flags ^= hashWriting if h.buckets == nil { h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) @@ -294,7 +294,7 @@ func mapdelete_faststr(t *maptype, h *hmap, ky string) { hash := t.key.alg.hash(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0)) // Set hashWriting after calling alg.hash for consistency with mapdelete - h.flags |= hashWriting + h.flags ^= hashWriting bucket := hash & bucketMask(h.B) if h.growing() { diff --git a/src/runtime/mbarrier.go b/src/runtime/mbarrier.go index b6c5ee0658..5142f4327a 100644 --- a/src/runtime/mbarrier.go +++ b/src/runtime/mbarrier.go @@ -226,8 +226,6 @@ func reflectcallmove(typ *_type, dst, src unsafe.Pointer, size uintptr) { //go:nosplit func typedslicecopy(typ *_type, dst, src slice) int { - // TODO(rsc): If typedslicecopy becomes faster than calling - // typedmemmove repeatedly, consider using during func growslice. n := dst.len if n > src.len { n = src.len diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go index 75f23a16b4..e217e7695f 100644 --- a/src/runtime/mbitmap.go +++ b/src/runtime/mbitmap.go @@ -647,6 +647,35 @@ func bulkBarrierPreWrite(dst, src, size uintptr) { } } +// bulkBarrierPreWriteSrcOnly is like bulkBarrierPreWrite but +// does not execute write barriers for [dst, dst+size). +// +// In addition to the requirements of bulkBarrierPreWrite +// callers need to ensure [dst, dst+size) is zeroed. +// +// This is used for special cases where e.g. dst was just +// created and zeroed with malloc. +//go:nosplit +func bulkBarrierPreWriteSrcOnly(dst, src, size uintptr) { + if (dst|src|size)&(sys.PtrSize-1) != 0 { + throw("bulkBarrierPreWrite: unaligned arguments") + } + if !writeBarrier.needed { + return + } + buf := &getg().m.p.ptr().wbBuf + h := heapBitsForAddr(dst) + for i := uintptr(0); i < size; i += sys.PtrSize { + if h.isPointer() { + srcx := (*uintptr)(unsafe.Pointer(src + i)) + if !buf.putFast(0, *srcx) { + wbBufFlush(nil, 0) + } + } + h = h.next() + } +} + // bulkBarrierBitmap executes write barriers for copying from [src, // src+size) to [dst, dst+size) using a 1-bit pointer bitmap. src is // assumed to start maskOffset bytes into the data covered by the diff --git a/src/runtime/memclr_386.s b/src/runtime/memclr_386.s index a6703b3641..65f7196312 100644 --- a/src/runtime/memclr_386.s +++ b/src/runtime/memclr_386.s @@ -4,6 +4,7 @@ // +build !plan9 +#include "go_asm.h" #include "textflag.h" // NOTE: Windows externalthreadhandler expects memclr to preserve DX. 
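The map.go and map_fast* hunks above flip `h.flags |= hashWriting` to `h.flags ^= hashWriting`. The motivation is not stated in this excerpt, but a plausible reading: XOR toggles the bit, so two overlapping writers cancel each other out and the `flags&hashWriting == 0` check on the completion path can detect the race, whereas OR would leave the bit set and mask it. A toy illustration of the bit behavior, with the flag value copied from map.go:

```go
package main

import "fmt"

const hashWriting = 4 // same flag bit as in runtime/map.go

func main() {
	var flags uint8

	// Writer A starts: the flag flips on.
	flags ^= hashWriting

	// A racing writer B starts before A finishes: XOR flips the bit
	// back off. With |= the bit would simply stay set.
	flags ^= hashWriting

	// Writer A's completion check (as in mapassign) now fires.
	if flags&hashWriting == 0 {
		fmt.Println("throw: concurrent map writes")
	}
}
```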
@@ -28,7 +29,7 @@ tail: JBE _5through8 CMPL BX, $16 JBE _9through16 - CMPB runtime·support_sse2(SB), $1 + CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1 JNE nosse2 PXOR X0, X0 CMPL BX, $32 diff --git a/src/runtime/memmove_386.s b/src/runtime/memmove_386.s index 172ea40820..7b54070f59 100644 --- a/src/runtime/memmove_386.s +++ b/src/runtime/memmove_386.s @@ -25,6 +25,7 @@ // +build !plan9 +#include "go_asm.h" #include "textflag.h" // func memmove(to, from unsafe.Pointer, n uintptr) @@ -51,7 +52,7 @@ tail: JBE move_5through8 CMPL BX, $16 JBE move_9through16 - CMPB runtime·support_sse2(SB), $1 + CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1 JNE nosse2 CMPL BX, $32 JBE move_17through32 @@ -72,7 +73,7 @@ nosse2: */ forward: // If REP MOVSB isn't fast, don't use it - CMPB runtime·support_erms(SB), $1 // enhanced REP MOVSB/STOSB + CMPB internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB JNE fwdBy4 // Check alignment diff --git a/src/runtime/memmove_amd64.s b/src/runtime/memmove_amd64.s index cb5cd02e45..b4243a833b 100644 --- a/src/runtime/memmove_amd64.s +++ b/src/runtime/memmove_amd64.s @@ -25,6 +25,7 @@ // +build !plan9 +#include "go_asm.h" #include "textflag.h" // func memmove(to, from unsafe.Pointer, n uintptr) @@ -83,7 +84,7 @@ forward: JLS move_256through2048 // If REP MOVSB isn't fast, don't use it - CMPB runtime·support_erms(SB), $1 // enhanced REP MOVSB/STOSB + CMPB internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB JNE fwdBy8 // Check alignment diff --git a/src/runtime/mfinal.go b/src/runtime/mfinal.go index 6ce0312712..a8c51e3e02 100644 --- a/src/runtime/mfinal.go +++ b/src/runtime/mfinal.go @@ -267,8 +267,8 @@ func runfinq() { // is not guaranteed to run, because there is no ordering that // respects the dependencies. // -// The finalizer for obj is scheduled to run at some arbitrary time after -// obj becomes unreachable. +// The finalizer is scheduled to run at some arbitrary time after the +// program can no longer reach the object to which obj points. // There is no guarantee that finalizers will run before a program exits, // so typically they are useful only for releasing non-memory resources // associated with an object during a long-running program. diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go index 6a3219de73..c95b5ed37f 100644 --- a/src/runtime/mgc.go +++ b/src/runtime/mgc.go @@ -137,8 +137,8 @@ package runtime import ( + "internal/cpu" "runtime/internal/atomic" - "runtime/internal/sys" "unsafe" ) @@ -407,14 +407,14 @@ type gcControllerState struct { // each P that isn't running a dedicated worker. // // For example, if the utilization goal is 25% and there are - // no dedicated workers, this will be 0.25. If there goal is + // no dedicated workers, this will be 0.25. If the goal is // 25%, there is one dedicated worker, and GOMAXPROCS is 5, // this will be 0.05 to make up the missing 5%. // // If this is zero, no fractional workers are needed. 
fractionalUtilizationGoal float64 - _ [sys.CacheLineSize]byte + _ cpu.CacheLinePad } // startCycle resets the GC controller's state and computes estimates @@ -919,9 +919,9 @@ const gcAssistTimeSlack = 5000 const gcOverAssistWork = 64 << 10 var work struct { - full lfstack // lock-free list of full blocks workbuf - empty lfstack // lock-free list of empty blocks workbuf - pad0 [sys.CacheLineSize]uint8 // prevents false-sharing between full/empty and nproc/nwait + full lfstack // lock-free list of full blocks workbuf + empty lfstack // lock-free list of empty blocks workbuf + pad0 cpu.CacheLinePad // prevents false-sharing between full/empty and nproc/nwait wbufSpans struct { lock mutex @@ -1023,15 +1023,15 @@ var work struct { // there was neither enough credit to steal or enough work to // do. assistQueue struct { - lock mutex - head, tail guintptr + lock mutex + q gQueue } // sweepWaiters is a list of blocked goroutines to wake when // we transition from mark termination to sweep. sweepWaiters struct { lock mutex - head guintptr + list gList } // cycles is the number of completed GC cycles, where a GC @@ -1146,9 +1146,7 @@ func gcWaitOnMark(n uint32) { // Wait until sweep termination, mark, and mark // termination of cycle N complete. - gp := getg() - gp.schedlink = work.sweepWaiters.head - work.sweepWaiters.head.set(gp) + work.sweepWaiters.list.push(getg()) goparkunlock(&work.sweepWaiters.lock, waitReasonWaitForGCCycle, traceEvGoBlock, 1) } } @@ -1632,8 +1630,7 @@ func gcMarkTermination(nextTriggerRatio float64) { // Bump GC cycle count and wake goroutines waiting on sweep. lock(&work.sweepWaiters.lock) memstats.numgc++ - injectglist(work.sweepWaiters.head.ptr()) - work.sweepWaiters.head = 0 + injectglist(&work.sweepWaiters.list) unlock(&work.sweepWaiters.lock) // Finish the current heap profiling cycle and start a new diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go index e8cfdce4fc..69ff895512 100644 --- a/src/runtime/mgcmark.go +++ b/src/runtime/mgcmark.go @@ -302,26 +302,27 @@ func markrootBlock(b0, n0 uintptr, ptrmask0 *uint8, gcw *gcWork, shard int) { //TODO go:nowritebarrier func markrootFreeGStacks() { // Take list of dead Gs with stacks. - lock(&sched.gflock) - list := sched.gfreeStack - sched.gfreeStack = nil - unlock(&sched.gflock) - if list == nil { + lock(&sched.gFree.lock) + list := sched.gFree.stack + sched.gFree.stack = gList{} + unlock(&sched.gFree.lock) + if list.empty() { return } // Free stacks. - tail := list - for gp := list; gp != nil; gp = gp.schedlink.ptr() { + q := gQueue{list.head, list.head} + for gp := list.head.ptr(); gp != nil; gp = gp.schedlink.ptr() { shrinkstack(gp) - tail = gp + // Manipulate the queue directly since the Gs are + // already all linked the right way. + q.tail.set(gp) } // Put Gs back on the free list. - lock(&sched.gflock) - tail.schedlink.set(sched.gfreeNoStack) - sched.gfreeNoStack = list - unlock(&sched.gflock) + lock(&sched.gFree.lock) + sched.gFree.noStack.pushAll(q) + unlock(&sched.gFree.lock) } // markrootSpans marks roots for one shard of work.spans. @@ -602,9 +603,8 @@ func gcAssistAlloc1(gp *g, scanWork int64) { // new assists from going to sleep after this point. 
func gcWakeAllAssists() { lock(&work.assistQueue.lock) - injectglist(work.assistQueue.head.ptr()) - work.assistQueue.head.set(nil) - work.assistQueue.tail.set(nil) + list := work.assistQueue.q.popList() + injectglist(&list) unlock(&work.assistQueue.lock) } @@ -625,24 +625,17 @@ func gcParkAssist() bool { } gp := getg() - oldHead, oldTail := work.assistQueue.head, work.assistQueue.tail - if oldHead == 0 { - work.assistQueue.head.set(gp) - } else { - oldTail.ptr().schedlink.set(gp) - } - work.assistQueue.tail.set(gp) - gp.schedlink.set(nil) + oldList := work.assistQueue.q + work.assistQueue.q.pushBack(gp) // Recheck for background credit now that this G is in // the queue, but can still back out. This avoids a // race in case background marking has flushed more // credit since we checked above. if atomic.Loadint64(&gcController.bgScanCredit) > 0 { - work.assistQueue.head = oldHead - work.assistQueue.tail = oldTail - if oldTail != 0 { - oldTail.ptr().schedlink.set(nil) + work.assistQueue.q = oldList + if oldList.tail != 0 { + oldList.tail.ptr().schedlink.set(nil) } unlock(&work.assistQueue.lock) return false @@ -663,7 +656,7 @@ func gcParkAssist() bool { // //go:nowritebarrierrec func gcFlushBgCredit(scanWork int64) { - if work.assistQueue.head == 0 { + if work.assistQueue.q.empty() { // Fast path; there are no blocked assists. There's a // small window here where an assist may add itself to // the blocked queue and park. If that happens, we'll @@ -675,23 +668,21 @@ func gcFlushBgCredit(scanWork int64) { scanBytes := int64(float64(scanWork) * gcController.assistBytesPerWork) lock(&work.assistQueue.lock) - gp := work.assistQueue.head.ptr() - for gp != nil && scanBytes > 0 { + for !work.assistQueue.q.empty() && scanBytes > 0 { + gp := work.assistQueue.q.pop() // Note that gp.gcAssistBytes is negative because gp // is in debt. Think carefully about the signs below. if scanBytes+gp.gcAssistBytes >= 0 { // Satisfy this entire assist debt. scanBytes += gp.gcAssistBytes gp.gcAssistBytes = 0 - xgp := gp - gp = gp.schedlink.ptr() - // It's important that we *not* put xgp in + // It's important that we *not* put gp in // runnext. Otherwise, it's possible for user // code to exploit the GC worker's high // scheduler priority to get itself always run // before other goroutines and always in the // fresh quantum started by GC. - ready(xgp, 0, false) + ready(gp, 0, false) } else { // Partially satisfy this assist. gp.gcAssistBytes += scanBytes @@ -700,23 +691,10 @@ func gcFlushBgCredit(scanWork int64) { // back of the queue so that large assists // can't clog up the assist queue and // substantially delay small assists. - xgp := gp - gp = gp.schedlink.ptr() - if gp == nil { - // gp is the only assist in the queue. - gp = xgp - } else { - xgp.schedlink = 0 - work.assistQueue.tail.ptr().schedlink.set(xgp) - work.assistQueue.tail.set(xgp) - } + work.assistQueue.q.pushBack(gp) break } } - work.assistQueue.head.set(gp) - if gp == nil { - work.assistQueue.tail.set(nil) - } if scanBytes > 0 { // Convert from scan bytes back to work. 
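Several hunks above (closechan, sweepWaiters, the assist queue, markrootFreeGStacks) replace hand-rolled `schedlink` manipulation with the new gList/gQueue helpers. A minimal sketch of the intrusive-list idea behind them, using an invented node type in place of the runtime's `*g`:

```go
package main

import "fmt"

// node plays the role of g: the link field lives in the element
// itself, so the list needs no allocation (the scheduler cannot
// allocate at the points where these lists are used).
type node struct {
	schedlink *node
	id        int
}

// list is a LIFO of nodes, like the runtime's gList.
type list struct{ head *node }

func (l *list) empty() bool { return l.head == nil }

func (l *list) push(n *node) {
	n.schedlink = l.head
	l.head = n
}

func (l *list) pop() *node {
	n := l.head
	if n != nil {
		l.head = n.schedlink
		n.schedlink = nil
	}
	return n
}

func main() {
	var glist list
	for i := 1; i <= 3; i++ {
		glist.push(&node{id: i})
	}
	// Matches the closechan pattern: drain the list and ready each G.
	for !glist.empty() {
		fmt.Println("ready", glist.pop().id)
	}
}
```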
diff --git a/src/runtime/mgcsweepbuf.go b/src/runtime/mgcsweepbuf.go index 6c1118e385..0491f7ccf6 100644 --- a/src/runtime/mgcsweepbuf.go +++ b/src/runtime/mgcsweepbuf.go @@ -5,6 +5,7 @@ package runtime import ( + "internal/cpu" "runtime/internal/atomic" "runtime/internal/sys" "unsafe" @@ -83,7 +84,7 @@ retry: if newCap == 0 { newCap = gcSweepBufInitSpineCap } - newSpine := persistentalloc(newCap*sys.PtrSize, sys.CacheLineSize, &memstats.gc_sys) + newSpine := persistentalloc(newCap*sys.PtrSize, cpu.CacheLineSize, &memstats.gc_sys) if b.spineCap != 0 { // Blocks are allocated off-heap, so // no write barriers. @@ -102,7 +103,7 @@ retry: } // Allocate a new block and add it to the spine. - block = (*gcSweepBlock)(persistentalloc(unsafe.Sizeof(gcSweepBlock{}), sys.CacheLineSize, &memstats.gc_sys)) + block = (*gcSweepBlock)(persistentalloc(unsafe.Sizeof(gcSweepBlock{}), cpu.CacheLineSize, &memstats.gc_sys)) blockp := add(b.spine, sys.PtrSize*top) // Blocks are allocated off-heap, so no write barrier. atomic.StorepNoWB(blockp, unsafe.Pointer(block)) diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index b11853ca18..00ecfa2d66 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -9,6 +9,7 @@ package runtime import ( + "internal/cpu" "runtime/internal/atomic" "runtime/internal/sys" "unsafe" @@ -137,12 +138,12 @@ type mheap struct { // central free lists for small size classes. // the padding makes sure that the MCentrals are - // spaced CacheLineSize bytes apart, so that each MCentral.lock + // spaced CacheLinePadSize bytes apart, so that each MCentral.lock // gets its own cache line. // central is indexed by spanClass. central [numSpanClasses]struct { mcentral mcentral - pad [sys.CacheLineSize - unsafe.Sizeof(mcentral{})%sys.CacheLineSize]byte + pad [cpu.CacheLinePadSize - unsafe.Sizeof(mcentral{})%cpu.CacheLinePadSize]byte } spanalloc fixalloc // allocator for span* diff --git a/src/runtime/mknacl.sh b/src/runtime/mknacl.sh index 3454b624d6..306ae3d9c1 100644 --- a/src/runtime/mknacl.sh +++ b/src/runtime/mknacl.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Copyright 2013 The Go Authors. All rights reserved. # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. diff --git a/src/runtime/mwbbuf.go b/src/runtime/mwbbuf.go index c02ccd8ab7..4df16d55b8 100644 --- a/src/runtime/mwbbuf.go +++ b/src/runtime/mwbbuf.go @@ -163,6 +163,13 @@ func wbBufFlush(dst *uintptr, src uintptr) { // Note: Every possible return from this function must reset // the buffer's next pointer to prevent buffer overflow. + // This *must not* modify its arguments because this + // function's argument slots do double duty in gcWriteBarrier + // as register spill slots. Currently, not modifying the + // arguments is sufficient to keep the spill slots unmodified + // (which seems unlikely to change since it costs little and + // helps with debugging). + if getg().m.dying > 0 { // We're going down. Not much point in write barriers // and this way we can allow write barriers in the diff --git a/src/runtime/netpoll.go b/src/runtime/netpoll.go index c8fb95d3aa..f9c422650a 100644 --- a/src/runtime/netpoll.go +++ b/src/runtime/netpoll.go @@ -289,24 +289,22 @@ func poll_runtime_pollUnblock(pd *pollDesc) { } } -// make pd ready, newly runnable goroutines (if any) are returned in rg/wg +// make pd ready, newly runnable goroutines (if any) are added to toRun. // May run during STW, so write barriers are not allowed. 
//go:nowritebarrier -func netpollready(gpp *guintptr, pd *pollDesc, mode int32) { - var rg, wg guintptr +func netpollready(toRun *gList, pd *pollDesc, mode int32) { + var rg, wg *g if mode == 'r' || mode == 'r'+'w' { - rg.set(netpollunblock(pd, 'r', true)) + rg = netpollunblock(pd, 'r', true) } if mode == 'w' || mode == 'r'+'w' { - wg.set(netpollunblock(pd, 'w', true)) + wg = netpollunblock(pd, 'w', true) } - if rg != 0 { - rg.ptr().schedlink = *gpp - *gpp = rg + if rg != nil { + toRun.push(rg) } - if wg != 0 { - wg.ptr().schedlink = *gpp - *gpp = wg + if wg != nil { + toRun.push(wg) } } diff --git a/src/runtime/netpoll_epoll.go b/src/runtime/netpoll_epoll.go index 1908220ebb..f764d6ff7c 100644 --- a/src/runtime/netpoll_epoll.go +++ b/src/runtime/netpoll_epoll.go @@ -58,9 +58,9 @@ func netpollarm(pd *pollDesc, mode int) { // polls for ready network connections // returns list of goroutines that become runnable -func netpoll(block bool) *g { +func netpoll(block bool) gList { if epfd == -1 { - return nil + return gList{} } waitms := int32(-1) if !block { @@ -76,7 +76,7 @@ retry: } goto retry } - var gp guintptr + var toRun gList for i := int32(0); i < n; i++ { ev := &events[i] if ev.events == 0 { @@ -92,11 +92,11 @@ retry: if mode != 0 { pd := *(**pollDesc)(unsafe.Pointer(&ev.data)) - netpollready(&gp, pd, mode) + netpollready(&toRun, pd, mode) } } - if block && gp == 0 { + if block && toRun.empty() { goto retry } - return gp.ptr() + return toRun } diff --git a/src/runtime/netpoll_fake.go b/src/runtime/netpoll_fake.go index aab18dc846..5b1a63a878 100644 --- a/src/runtime/netpoll_fake.go +++ b/src/runtime/netpoll_fake.go @@ -27,6 +27,6 @@ func netpollclose(fd uintptr) int32 { func netpollarm(pd *pollDesc, mode int) { } -func netpoll(block bool) *g { - return nil +func netpoll(block bool) gList { + return gList{} } diff --git a/src/runtime/netpoll_kqueue.go b/src/runtime/netpoll_kqueue.go index 0f73bf385e..fdaa1cd80d 100644 --- a/src/runtime/netpoll_kqueue.go +++ b/src/runtime/netpoll_kqueue.go @@ -59,9 +59,9 @@ func netpollarm(pd *pollDesc, mode int) { // Polls for ready network connections. // Returns list of goroutines that become runnable. 
-func netpoll(block bool) *g { +func netpoll(block bool) gList { if kq == -1 { - return nil + return gList{} } var tp *timespec var ts timespec @@ -78,7 +78,7 @@ retry: } goto retry } - var gp guintptr + var toRun gList for i := 0; i < int(n); i++ { ev := &events[i] var mode int32 @@ -102,11 +102,11 @@ retry: mode += 'w' } if mode != 0 { - netpollready(&gp, (*pollDesc)(unsafe.Pointer(ev.udata)), mode) + netpollready(&toRun, (*pollDesc)(unsafe.Pointer(ev.udata)), mode) } } - if block && gp == 0 { + if block && toRun.empty() { goto retry } - return gp.ptr() + return toRun } diff --git a/src/runtime/netpoll_solaris.go b/src/runtime/netpoll_solaris.go index 853e5f63e3..6bd484afaa 100644 --- a/src/runtime/netpoll_solaris.go +++ b/src/runtime/netpoll_solaris.go @@ -180,9 +180,9 @@ func netpollarm(pd *pollDesc, mode int) { // polls for ready network connections // returns list of goroutines that become runnable -func netpoll(block bool) *g { +func netpoll(block bool) gList { if portfd == -1 { - return nil + return gList{} } var wait *timespec @@ -202,7 +202,7 @@ retry: goto retry } - var gp guintptr + var toRun gList for i := 0; i < int(n); i++ { ev := &events[i] @@ -233,12 +233,12 @@ retry: } if mode != 0 { - netpollready(&gp, pd, mode) + netpollready(&toRun, pd, mode) } } - if block && gp == 0 { + if block && toRun.empty() { goto retry } - return gp.ptr() + return toRun } diff --git a/src/runtime/netpoll_stub.go b/src/runtime/netpoll_stub.go index a4d6b4608a..f585333579 100644 --- a/src/runtime/netpoll_stub.go +++ b/src/runtime/netpoll_stub.go @@ -10,10 +10,10 @@ var netpollWaiters uint32 // Polls for ready network connections. // Returns list of goroutines that become runnable. -func netpoll(block bool) (gp *g) { +func netpoll(block bool) gList { // Implementation for platforms that do not support // integrated network poller. - return + return gList{} } func netpollinited() bool { diff --git a/src/runtime/netpoll_windows.go b/src/runtime/netpoll_windows.go index 134071f5e3..07ef15ce2f 100644 --- a/src/runtime/netpoll_windows.go +++ b/src/runtime/netpoll_windows.go @@ -63,17 +63,17 @@ func netpollarm(pd *pollDesc, mode int) { // Polls for completed network IO. // Returns list of goroutines that become runnable. 
-func netpoll(block bool) *g { +func netpoll(block bool) gList { var entries [64]overlappedEntry var wait, qty, key, flags, n, i uint32 var errno int32 var op *net_op - var gp guintptr + var toRun gList mp := getg().m if iocphandle == _INVALID_HANDLE_VALUE { - return nil + return gList{} } wait = 0 if block { @@ -92,7 +92,7 @@ retry: mp.blocked = false errno = int32(getlasterror()) if !block && errno == _WAIT_TIMEOUT { - return nil + return gList{} } println("runtime: GetQueuedCompletionStatusEx failed (errno=", errno, ")") throw("runtime: netpoll failed") @@ -105,7 +105,7 @@ retry: if stdcall5(_WSAGetOverlappedResult, op.pd.fd, uintptr(unsafe.Pointer(op)), uintptr(unsafe.Pointer(&qty)), 0, uintptr(unsafe.Pointer(&flags))) == 0 { errno = int32(getlasterror()) } - handlecompletion(&gp, op, errno, qty) + handlecompletion(&toRun, op, errno, qty) } } else { op = nil @@ -118,7 +118,7 @@ retry: mp.blocked = false errno = int32(getlasterror()) if !block && errno == _WAIT_TIMEOUT { - return nil + return gList{} } if op == nil { println("runtime: GetQueuedCompletionStatus failed (errno=", errno, ")") @@ -127,15 +127,15 @@ retry: // dequeued failed IO packet, so report that } mp.blocked = false - handlecompletion(&gp, op, errno, qty) + handlecompletion(&toRun, op, errno, qty) } - if block && gp == 0 { + if block && toRun.empty() { goto retry } - return gp.ptr() + return toRun } -func handlecompletion(gpp *guintptr, op *net_op, errno int32, qty uint32) { +func handlecompletion(toRun *gList, op *net_op, errno int32, qty uint32) { if op == nil { println("runtime: GetQueuedCompletionStatus returned op == nil") throw("runtime: netpoll failed") @@ -147,5 +147,5 @@ func handlecompletion(gpp *guintptr, op *net_op, errno int32, qty uint32) { } op.errno = errno op.qty = qty - netpollready(gpp, op.pd, mode) + netpollready(toRun, op.pd, mode) } diff --git a/src/runtime/os3_plan9.go b/src/runtime/os3_plan9.go index 0e3a4c8024..15ca3359d2 100644 --- a/src/runtime/os3_plan9.go +++ b/src/runtime/os3_plan9.go @@ -44,7 +44,7 @@ func sighandler(_ureg *ureg, note *byte, gp *g) int { // level by the program but will otherwise be ignored. flags = _SigNotify for sig, t = range sigtable { - if hasprefix(notestr, t.name) { + if hasPrefix(notestr, t.name) { flags = t.flags break } diff --git a/src/runtime/os_darwin.go b/src/runtime/os_darwin.go index ff375004a3..d2144edf2e 100644 --- a/src/runtime/os_darwin.go +++ b/src/runtime/os_darwin.go @@ -274,7 +274,11 @@ func setsig(i uint32, fn uintptr) { sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK | _SA_RESTART sa.sa_mask = ^uint32(0) if fn == funcPC(sighandler) { - fn = funcPC(sigtramp) + if iscgo { + fn = funcPC(cgoSigtramp) + } else { + fn = funcPC(sigtramp) + } } *(*uintptr)(unsafe.Pointer(&sa.__sigaction_u)) = fn sigaction(i, &sa, nil) @@ -283,6 +287,7 @@ func setsig(i uint32, fn uintptr) { // sigtramp is the callback from libc when a signal is received. // It is called with the C calling convention. func sigtramp() +func cgoSigtramp() //go:nosplit //go:nowritebarrierrec diff --git a/src/runtime/os_darwin_arm.go b/src/runtime/os_darwin_arm.go index 8eb5655969..ee1bd174f1 100644 --- a/src/runtime/os_darwin_arm.go +++ b/src/runtime/os_darwin_arm.go @@ -4,8 +4,6 @@ package runtime -var hardDiv bool // TODO: set if a hardware divider is available - func checkgoarm() { // TODO(minux): FP checks like in os_linux_arm.go. 
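The netpoll hunks above convert every platform poller from returning a schedlink-chained *g to returning the new gList type by value, with netpollready appending woken goroutines into a caller-owned list. A compact self-contained sketch of that shape follows; g, gList, ready, and poll here are simplified stand-ins, not the runtime's definitions.

package main

import "fmt"

// g stands in for the runtime's g; link plays the role of schedlink.
type g struct {
	id   int
	link *g
}

// gList mirrors the new list type: one word, cheap to return by value.
type gList struct{ head *g }

func (l *gList) empty() bool { return l.head == nil }
func (l *gList) push(gp *g)  { gp.link = l.head; l.head = gp }
func (l *gList) pop() *g {
	gp := l.head
	if gp != nil {
		l.head = gp.link
	}
	return gp
}

// ready appends the goroutines woken by one event, like netpollready.
func ready(toRun *gList, woken ...*g) {
	for _, gp := range woken {
		toRun.push(gp)
	}
}

// poll mimics the new netpoll shape: fill a local list, return it by value.
func poll() gList {
	var toRun gList
	ready(&toRun, &g{id: 1}, &g{id: 2}) // e.g. one fd became readable and writable
	return toRun
}

func main() {
	// Caller pattern from the scheduler hunks later in this patch:
	// run one goroutine now, inject the rest.
	if list := poll(); !list.empty() {
		gp := list.pop()
		fmt.Println("run now:", gp.id)
		for !list.empty() {
			fmt.Println("inject:", list.pop().id)
		}
	}
}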
diff --git a/src/runtime/os_freebsd.go b/src/runtime/os_freebsd.go index 631dc20ab4..08f7b0ecf0 100644 --- a/src/runtime/os_freebsd.go +++ b/src/runtime/os_freebsd.go @@ -389,6 +389,7 @@ const ( _AT_PAGESZ = 6 // Page size in bytes _AT_TIMEKEEP = 22 // Pointer to timehands. _AT_HWCAP = 25 // CPU feature flags + _AT_HWCAP2 = 26 // CPU feature flags 2 ) func sysauxv(auxv []uintptr) { diff --git a/src/runtime/os_freebsd_arm.go b/src/runtime/os_freebsd_arm.go index d2dc26f0c4..eb4de9bc21 100644 --- a/src/runtime/os_freebsd_arm.go +++ b/src/runtime/os_freebsd_arm.go @@ -4,22 +4,29 @@ package runtime +import "internal/cpu" + const ( _HWCAP_VFP = 1 << 6 _HWCAP_VFPv3 = 1 << 13 - _HWCAP_IDIVA = 1 << 17 ) -var hwcap = ^uint32(0) // set by archauxv -var hardDiv bool // set if a hardware divider is available +// AT_HWCAP is not available on FreeBSD-11.1-RELEASE or earlier. +// Default to mandatory VFP hardware support for arm being available. +// If AT_HWCAP is available goarmHWCap will be updated in archauxv. +// TODO(moehrmann) remove once all go supported FreeBSD versions support _AT_HWCAP. +var goarmHWCap uint = (_HWCAP_VFP | _HWCAP_VFPv3) func checkgoarm() { - if goarm > 5 && hwcap&_HWCAP_VFP == 0 { + // Update cpu.HWCap to match goarmHWCap in case they were not updated in archauxv. + cpu.HWCap = goarmHWCap + + if goarm > 5 && cpu.HWCap&_HWCAP_VFP == 0 { print("runtime: this CPU has no floating point hardware, so it cannot run\n") print("this GOARM=", goarm, " binary. Recompile using GOARM=5.\n") exit(1) } - if goarm > 6 && hwcap&_HWCAP_VFPv3 == 0 { + if goarm > 6 && cpu.HWCap&_HWCAP_VFPv3 == 0 { print("runtime: this CPU has no VFPv3 floating point hardware, so it cannot run\n") print("this GOARM=", goarm, " binary. Recompile using GOARM=5 or GOARM=6.\n") exit(1) @@ -35,9 +42,11 @@ func checkgoarm() { func archauxv(tag, val uintptr) { switch tag { - case _AT_HWCAP: // CPU capability bit flags - hwcap = uint32(val) - hardDiv = (hwcap & _HWCAP_IDIVA) != 0 + case _AT_HWCAP: + cpu.HWCap = uint(val) + goarmHWCap = cpu.HWCap + case _AT_HWCAP2: + cpu.HWCap2 = uint(val) } } diff --git a/src/runtime/os_linux.go b/src/runtime/os_linux.go index 68f99de115..a04c995c00 100644 --- a/src/runtime/os_linux.go +++ b/src/runtime/os_linux.go @@ -415,9 +415,18 @@ func (c *sigctxt) fixsigcode(sig uint32) { //go:nosplit func sysSigaction(sig uint32, new, old *sigactiont) { if rt_sigaction(uintptr(sig), new, old, unsafe.Sizeof(sigactiont{}.sa_mask)) != 0 { - // Workaround for bug in Qemu user mode emulation. (qemu - // rejects rt_sigaction of signal 64, SIGRTMAX). - if sig != 64 { + // Workaround for bugs in QEMU user mode emulation. + // + // QEMU turns calls to the sigaction system call into + // calls to the C library sigaction call; the C + // library call rejects attempts to call sigaction for + // SIGCANCEL (32) or SIGSETXID (33). + // + // QEMU rejects calling sigaction on SIGRTMAX (64). + // + // Just ignore the error in these cases. There isn't + // anything we can do about it anyhow. + if sig != 32 && sig != 33 && sig != 64 { // Use system stack to avoid split stack overflow on ppc64/ppc64le.
systemstack(func() { throw("sigaction failed") diff --git a/src/runtime/os_linux_arm.go b/src/runtime/os_linux_arm.go index 14f1cfeaef..8f082ba6a0 100644 --- a/src/runtime/os_linux_arm.go +++ b/src/runtime/os_linux_arm.go @@ -4,20 +4,20 @@ package runtime -import "unsafe" +import ( + "internal/cpu" + "unsafe" +) const ( _AT_PLATFORM = 15 // introduced in at least 2.6.11 _HWCAP_VFP = 1 << 6 // introduced in at least 2.6.11 _HWCAP_VFPv3 = 1 << 13 // introduced in 2.6.30 - _HWCAP_IDIVA = 1 << 17 ) var randomNumber uint32 var armArch uint8 = 6 // we default to ARMv6 -var hwcap uint32 // set by archauxv -var hardDiv bool // set if a hardware divider is available func checkgoarm() { // On Android, /proc/self/auxv might be unreadable and hwcap won't @@ -26,12 +26,12 @@ func checkgoarm() { if GOOS == "android" { return } - if goarm > 5 && hwcap&_HWCAP_VFP == 0 { + if goarm > 5 && cpu.HWCap&_HWCAP_VFP == 0 { print("runtime: this CPU has no floating point hardware, so it cannot run\n") print("this GOARM=", goarm, " binary. Recompile using GOARM=5.\n") exit(1) } - if goarm > 6 && hwcap&_HWCAP_VFPv3 == 0 { + if goarm > 6 && cpu.HWCap&_HWCAP_VFPv3 == 0 { print("runtime: this CPU has no VFPv3 floating point hardware, so it cannot run\n") print("this GOARM=", goarm, " binary. Recompile using GOARM=5 or GOARM=6.\n") exit(1) @@ -53,9 +53,10 @@ func archauxv(tag, val uintptr) { armArch = t - '0' } - case _AT_HWCAP: // CPU capability bit flags - hwcap = uint32(val) - hardDiv = (hwcap & _HWCAP_IDIVA) != 0 + case _AT_HWCAP: + cpu.HWCap = uint(val) + case _AT_HWCAP2: + cpu.HWCap2 = uint(val) } } diff --git a/src/runtime/os_linux_arm64.go b/src/runtime/os_linux_arm64.go index 28a0319f10..cbe528b4af 100644 --- a/src/runtime/os_linux_arm64.go +++ b/src/runtime/os_linux_arm64.go @@ -6,20 +6,10 @@ package runtime -// For go:linkname -import _ "unsafe" +import "internal/cpu" var randomNumber uint32 -// arm64 doesn't have a 'cpuid' instruction equivalent and relies on -// HWCAP/HWCAP2 bits for hardware capabilities. - -//go:linkname cpu_hwcap internal/cpu.hwcap -var cpu_hwcap uint - -//go:linkname cpu_hwcap2 internal/cpu.hwcap2 -var cpu_hwcap2 uint - func archauxv(tag, val uintptr) { switch tag { case _AT_RANDOM: @@ -28,10 +18,13 @@ func archauxv(tag, val uintptr) { // it as a byte array. randomNumber = uint32(startupRandomData[4]) | uint32(startupRandomData[5])<<8 | uint32(startupRandomData[6])<<16 | uint32(startupRandomData[7])<<24 + case _AT_HWCAP: - cpu_hwcap = uint(val) + // arm64 doesn't have a 'cpuid' instruction equivalent and relies on + // HWCAP/HWCAP2 bits for hardware capabilities. + cpu.HWCap = uint(val) case _AT_HWCAP2: - cpu_hwcap2 = uint(val) + cpu.HWCap2 = uint(val) } } diff --git a/src/runtime/os_linux_novdso.go b/src/runtime/os_linux_novdso.go index ee4a7a95c2..e54c1c4dc1 100644 --- a/src/runtime/os_linux_novdso.go +++ b/src/runtime/os_linux_novdso.go @@ -3,7 +3,7 @@ // license that can be found in the LICENSE file. // +build linux -// +build !386,!amd64,!arm,!arm64 +// +build !386,!amd64,!arm,!arm64,!ppc64,!ppc64le package runtime diff --git a/src/runtime/os_linux_ppc64x.go b/src/runtime/os_linux_ppc64x.go index 2c67864a96..cc79cc4a66 100644 --- a/src/runtime/os_linux_ppc64x.go +++ b/src/runtime/os_linux_ppc64x.go @@ -7,23 +7,16 @@ package runtime -// For go:linkname -import _ "unsafe" - -// ppc64x doesn't have a 'cpuid' instruction equivalent and relies on -// HWCAP/HWCAP2 bits for hardware capabilities. 
- -//go:linkname cpu_hwcap internal/cpu.hwcap -var cpu_hwcap uint - -//go:linkname cpu_hwcap2 internal/cpu.hwcap2 -var cpu_hwcap2 uint +import "internal/cpu" func archauxv(tag, val uintptr) { switch tag { case _AT_HWCAP: - cpu_hwcap = uint(val) + // ppc64x doesn't have a 'cpuid' instruction + // equivalent and relies on HWCAP/HWCAP2 bits for + // hardware capabilities. + cpu.HWCap = uint(val) case _AT_HWCAP2: - cpu_hwcap2 = uint(val) + cpu.HWCap2 = uint(val) } } diff --git a/src/runtime/os_nacl_arm.go b/src/runtime/os_nacl_arm.go index c64ebf31d3..8669ee75b4 100644 --- a/src/runtime/os_nacl_arm.go +++ b/src/runtime/os_nacl_arm.go @@ -4,8 +4,6 @@ package runtime -var hardDiv bool // TODO: set if a hardware divider is available - func checkgoarm() { // TODO(minux): FP checks like in os_linux_arm.go. diff --git a/src/runtime/os_netbsd_arm.go b/src/runtime/os_netbsd_arm.go index b02e36a73a..95603da643 100644 --- a/src/runtime/os_netbsd_arm.go +++ b/src/runtime/os_netbsd_arm.go @@ -6,8 +6,6 @@ package runtime import "unsafe" -var hardDiv bool // TODO: set if a hardware divider is available - func lwp_mcontext_init(mc *mcontextt, stk unsafe.Pointer, mp *m, gp *g, fn uintptr) { // Machine dependent mcontext initialisation for LWP. mc.__gregs[_REG_R15] = uint32(funcPC(lwp_tramp)) diff --git a/src/runtime/os_nonopenbsd.go b/src/runtime/os_nonopenbsd.go new file mode 100644 index 0000000000..e65697bdb3 --- /dev/null +++ b/src/runtime/os_nonopenbsd.go @@ -0,0 +1,17 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !openbsd + +package runtime + +// osStackAlloc performs OS-specific initialization before s is used +// as stack memory. +func osStackAlloc(s *mspan) { +} + +// osStackFree undoes the effect of osStackAlloc before s is returned +// to the heap. +func osStackFree(s *mspan) { +} diff --git a/src/runtime/os_openbsd.go b/src/runtime/os_openbsd.go index c359ceb280..96112cb25b 100644 --- a/src/runtime/os_openbsd.go +++ b/src/runtime/os_openbsd.go @@ -6,6 +6,7 @@ package runtime import ( "runtime/internal/atomic" + "runtime/internal/sys" "unsafe" ) @@ -80,31 +81,42 @@ var sigset_all = ^sigset(0) // From OpenBSD's <sys/sysctl.h> const ( + _CTL_KERN = 1 + _KERN_OSREV = 3 + _CTL_HW = 6 _HW_NCPU = 3 _HW_PAGESIZE = 7 ) -func getncpu() int32 { - mib := [2]uint32{_CTL_HW, _HW_NCPU} - out := uint32(0) +func sysctlInt(mib []uint32) (int32, bool) { + var out int32 nout := unsafe.Sizeof(out) + ret := sysctl(&mib[0], uint32(len(mib)), (*byte)(unsafe.Pointer(&out)), &nout, nil, 0) + if ret < 0 { + return 0, false + } + return out, true +} +func getncpu() int32 { // Fetch hw.ncpu via sysctl. 
- ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0) - if ret >= 0 { - return int32(out) + if ncpu, ok := sysctlInt([]uint32{_CTL_HW, _HW_NCPU}); ok { + return int32(ncpu) } return 1 } func getPageSize() uintptr { - mib := [2]uint32{_CTL_HW, _HW_PAGESIZE} - out := uint32(0) - nout := unsafe.Sizeof(out) - ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0) - if ret >= 0 { - return uintptr(out) + if ps, ok := sysctlInt([]uint32{_CTL_HW, _HW_PAGESIZE}); ok { + return uintptr(ps) + } + return 0 +} + +func getOSRev() int { + if osrev, ok := sysctlInt([]uint32{_CTL_KERN, _KERN_OSREV}); ok { + return int(osrev) } return 0 } @@ -171,10 +183,12 @@ func newosproc(mp *m) { print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " id=", mp.id, " ostk=", &mp, "\n") } + // Stack pointer must point inside stack area (as marked with MAP_STACK), + // rather than at the top of it. param := tforkt{ tf_tcb: unsafe.Pointer(&mp.tls[0]), tf_tid: (*int32)(unsafe.Pointer(&mp.procid)), - tf_stack: uintptr(stk), + tf_stack: uintptr(stk) - sys.PtrSize, } var oset sigset @@ -194,6 +208,7 @@ func newosproc(mp *m) { func osinit() { ncpu = getncpu() physPageSize = getPageSize() + haveMapStack = getOSRev() >= 201805 // OpenBSD 6.3 } var urandom_dev = []byte("/dev/urandom\x00") @@ -286,3 +301,35 @@ func sigdelset(mask *sigset, i int) { func (c *sigctxt) fixsigcode(sig uint32) { } + +var haveMapStack = false + +func osStackAlloc(s *mspan) { + // OpenBSD 6.4+ requires that stacks be mapped with MAP_STACK. + // It will check this on entry to system calls, traps, and + // when switching to the alternate system stack. + // + // This function is called before s is used for any data, so + // it's safe to simply re-map it. + osStackRemap(s, _MAP_STACK) +} + +func osStackFree(s *mspan) { + // Undo MAP_STACK. + osStackRemap(s, 0) +} + +func osStackRemap(s *mspan, flags int32) { + if !haveMapStack { + // OpenBSD prior to 6.3 did not have MAP_STACK and so + // the following mmap will fail. But it also didn't + // require MAP_STACK (obviously), so there's no need + // to do the mmap. + return + } + a, err := mmap(unsafe.Pointer(s.base()), s.npages*pageSize, _PROT_READ|_PROT_WRITE, _MAP_PRIVATE|_MAP_ANON|_MAP_FIXED|flags, -1, 0) + if err != 0 || uintptr(a) != s.base() { + print("runtime: remapping stack memory ", hex(s.base()), " ", s.npages*pageSize, " a=", a, " err=", err, "\n") + throw("remapping stack memory failed") + } +} diff --git a/src/runtime/os_openbsd_arm.go b/src/runtime/os_openbsd_arm.go index c318578ab5..be2e1e9959 100644 --- a/src/runtime/os_openbsd_arm.go +++ b/src/runtime/os_openbsd_arm.go @@ -4,8 +4,6 @@ package runtime -var hardDiv bool // TODO: set if a hardware divider is available - func checkgoarm() { // TODO(minux): FP checks like in os_linux_arm.go. 
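The HWCAP hunks above delete the per-OS hwcap/hardDiv variables and route the kernel's AT_HWCAP/AT_HWCAP2 auxv values into internal/cpu, which checkgoarm then consults. A rough sketch of that bit-gating follows, reusing the _HWCAP_VFP and _HWCAP_VFPv3 constants shown in the diff; the auxv tag value (25, from the FreeBSD hunk) and the error-returning checkgoarm are simplifying assumptions, not the patch's code.

package main

import "fmt"

// Feature bits as used in the hunks above (ARM HWCAP).
const (
	_HWCAP_VFP   = 1 << 6
	_HWCAP_VFPv3 = 1 << 13
)

// hwCap plays the role of internal/cpu.HWCap, filled from AT_HWCAP.
var hwCap uint

// archauxv mimics how the runtime consumes auxv tag/value pairs.
// Tag 25 is _AT_HWCAP on FreeBSD per the diff; other OSes differ.
func archauxv(tag, val uintptr) {
	const _AT_HWCAP = 25
	if tag == _AT_HWCAP {
		hwCap = uint(val)
	}
}

// checkgoarm sketches the gate in the patch: GOARM>5 needs VFP,
// GOARM>6 needs VFPv3. The real function prints and exits.
func checkgoarm(goarm int) error {
	if goarm > 5 && hwCap&_HWCAP_VFP == 0 {
		return fmt.Errorf("GOARM=%d needs VFP; recompile with GOARM=5", goarm)
	}
	if goarm > 6 && hwCap&_HWCAP_VFPv3 == 0 {
		return fmt.Errorf("GOARM=%d needs VFPv3; recompile with GOARM=5 or 6", goarm)
	}
	return nil
}

func main() {
	archauxv(25, _HWCAP_VFP)   // pretend the kernel reported VFP only
	fmt.Println(checkgoarm(6)) // <nil>: VFP is enough for GOARM=6
	fmt.Println(checkgoarm(7)) // error: VFPv3 bit not set
}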
diff --git a/src/runtime/os_plan9.go b/src/runtime/os_plan9.go index 9f41c5ac83..5469114a2b 100644 --- a/src/runtime/os_plan9.go +++ b/src/runtime/os_plan9.go @@ -112,20 +112,20 @@ func sigpanic() { } func atolwhex(p string) int64 { - for hasprefix(p, " ") || hasprefix(p, "\t") { + for hasPrefix(p, " ") || hasPrefix(p, "\t") { p = p[1:] } neg := false - if hasprefix(p, "-") || hasprefix(p, "+") { + if hasPrefix(p, "-") || hasPrefix(p, "+") { neg = p[0] == '-' p = p[1:] - for hasprefix(p, " ") || hasprefix(p, "\t") { + for hasPrefix(p, " ") || hasPrefix(p, "\t") { p = p[1:] } } var n int64 switch { - case hasprefix(p, "0x"), hasprefix(p, "0X"): + case hasPrefix(p, "0x"), hasPrefix(p, "0X"): p = p[2:] for ; len(p) > 0; p = p[1:] { if '0' <= p[0] && p[0] <= '9' { @@ -138,7 +138,7 @@ func atolwhex(p string) int64 { break } } - case hasprefix(p, "0"): + case hasPrefix(p, "0"): for ; len(p) > 0 && '0' <= p[0] && p[0] <= '7'; p = p[1:] { n = n*8 + int64(p[0]-'0') } diff --git a/src/runtime/os_plan9_arm.go b/src/runtime/os_plan9_arm.go index 1ce0141ce2..fdce1e7a35 100644 --- a/src/runtime/os_plan9_arm.go +++ b/src/runtime/os_plan9_arm.go @@ -4,8 +4,6 @@ package runtime -var hardDiv bool // TODO: set if a hardware divider is available - func checkgoarm() { return // TODO(minux) } diff --git a/src/runtime/os_solaris.go b/src/runtime/os_solaris.go index 703a2e5430..4575b5e641 100644 --- a/src/runtime/os_solaris.go +++ b/src/runtime/os_solaris.go @@ -37,12 +37,14 @@ func sysvicall0(fn *libcFunc) uintptr { if gp != nil { mp = gp.m } - if mp != nil { + if mp != nil && mp.libcallsp == 0 { mp.libcallg.set(gp) mp.libcallpc = getcallerpc() // sp must be the last, because once async cpu profiler finds // all three values to be non-zero, it will use them mp.libcallsp = getcallersp() + } else { + mp = nil // See comment in sys_darwin.go:libcCall } var libcall libcall @@ -64,12 +66,14 @@ func sysvicall1(fn *libcFunc, a1 uintptr) uintptr { if gp != nil { mp = gp.m } - if mp != nil { + if mp != nil && mp.libcallsp == 0 { mp.libcallg.set(gp) mp.libcallpc = getcallerpc() // sp must be the last, because once async cpu profiler finds // all three values to be non-zero, it will use them mp.libcallsp = getcallersp() + } else { + mp = nil } var libcall libcall @@ -92,12 +96,14 @@ func sysvicall2(fn *libcFunc, a1, a2 uintptr) uintptr { if gp != nil { mp = gp.m } - if mp != nil { + if mp != nil && mp.libcallsp == 0 { mp.libcallg.set(gp) mp.libcallpc = getcallerpc() // sp must be the last, because once async cpu profiler finds // all three values to be non-zero, it will use them mp.libcallsp = getcallersp() + } else { + mp = nil } var libcall libcall @@ -119,12 +125,14 @@ func sysvicall3(fn *libcFunc, a1, a2, a3 uintptr) uintptr { if gp != nil { mp = gp.m } - if mp != nil { + if mp != nil && mp.libcallsp == 0 { mp.libcallg.set(gp) mp.libcallpc = getcallerpc() // sp must be the last, because once async cpu profiler finds // all three values to be non-zero, it will use them mp.libcallsp = getcallersp() + } else { + mp = nil } var libcall libcall @@ -146,12 +154,14 @@ func sysvicall4(fn *libcFunc, a1, a2, a3, a4 uintptr) uintptr { if gp != nil { mp = gp.m } - if mp != nil { + if mp != nil && mp.libcallsp == 0 { mp.libcallg.set(gp) mp.libcallpc = getcallerpc() // sp must be the last, because once async cpu profiler finds // all three values to be non-zero, it will use them mp.libcallsp = getcallersp() + } else { + mp = nil } var libcall libcall @@ -173,12 +183,14 @@ func sysvicall5(fn *libcFunc, a1, a2, a3, a4, a5 uintptr) 
uintptr { if gp != nil { mp = gp.m } - if mp != nil { + if mp != nil && mp.libcallsp == 0 { mp.libcallg.set(gp) mp.libcallpc = getcallerpc() // sp must be the last, because once async cpu profiler finds // all three values to be non-zero, it will use them mp.libcallsp = getcallersp() + } else { + mp = nil } var libcall libcall @@ -200,12 +212,14 @@ func sysvicall6(fn *libcFunc, a1, a2, a3, a4, a5, a6 uintptr) uintptr { if gp != nil { mp = gp.m } - if mp != nil { + if mp != nil && mp.libcallsp == 0 { mp.libcallg.set(gp) mp.libcallpc = getcallerpc() // sp must be the last, because once async cpu profiler finds // all three values to be non-zero, it will use them mp.libcallsp = getcallersp() + } else { + mp = nil } var libcall libcall diff --git a/src/runtime/os_windows.go b/src/runtime/os_windows.go index 68e404e675..5607bf95c1 100644 --- a/src/runtime/os_windows.go +++ b/src/runtime/os_windows.go @@ -45,6 +45,7 @@ const ( //go:cgo_import_dynamic runtime._SwitchToThread SwitchToThread%0 "kernel32.dll" //go:cgo_import_dynamic runtime._VirtualAlloc VirtualAlloc%4 "kernel32.dll" //go:cgo_import_dynamic runtime._VirtualFree VirtualFree%3 "kernel32.dll" +//go:cgo_import_dynamic runtime._VirtualQuery VirtualQuery%3 "kernel32.dll" //go:cgo_import_dynamic runtime._WSAGetOverlappedResult WSAGetOverlappedResult%5 "ws2_32.dll" //go:cgo_import_dynamic runtime._WaitForSingleObject WaitForSingleObject%2 "kernel32.dll" //go:cgo_import_dynamic runtime._WriteConsoleW WriteConsoleW%5 "kernel32.dll" @@ -92,6 +93,7 @@ var ( _SwitchToThread, _VirtualAlloc, _VirtualFree, + _VirtualQuery, _WSAGetOverlappedResult, _WaitForSingleObject, _WriteConsoleW, @@ -619,12 +621,10 @@ func semacreate(mp *m) { //go:nowritebarrierrec //go:nosplit func newosproc(mp *m) { - const _STACK_SIZE_PARAM_IS_A_RESERVATION = 0x00010000 - // stackSize must match SizeOfStackReserve in cmd/link/internal/ld/pe.go. - const stackSize = 0x00200000*_64bit + 0x00100000*(1-_64bit) - thandle := stdcall6(_CreateThread, 0, stackSize, + // We pass 0 for the stack size to use the default for this binary. + thandle := stdcall6(_CreateThread, 0, 0, funcPC(tstart_stdcall), uintptr(unsafe.Pointer(mp)), - _STACK_SIZE_PARAM_IS_A_RESERVATION, 0) + 0, 0) if thandle == 0 { if atomic.Load(&exiting) != 0 { @@ -689,6 +689,33 @@ func minit() { var thandle uintptr stdcall7(_DuplicateHandle, currentProcess, currentThread, currentProcess, uintptr(unsafe.Pointer(&thandle)), 0, 0, _DUPLICATE_SAME_ACCESS) atomic.Storeuintptr(&getg().m.thread, thandle) + + // Query the true stack base from the OS. Currently we're + // running on a small assumed stack. + var mbi memoryBasicInformation + res := stdcall3(_VirtualQuery, uintptr(unsafe.Pointer(&mbi)), uintptr(unsafe.Pointer(&mbi)), unsafe.Sizeof(mbi)) + if res == 0 { + print("runtime: VirtualQuery failed; errno=", getlasterror(), "\n") + throw("VirtualQuery for stack base failed") + } + // The system leaves an 8K PAGE_GUARD region at the bottom of + // the stack (in theory VirtualQuery isn't supposed to include + // that, but it does). Add an additional 8K of slop for + // calling C functions that don't have stack checks and for + // lastcontinuehandler. We shouldn't be anywhere near this + // bound anyway. + base := mbi.allocationBase + 16<<10 + // Sanity check the stack bounds. 
+ g0 := getg() + if base > g0.stack.hi || g0.stack.hi-base > 64<<20 { + print("runtime: g0 stack [", hex(base), ",", hex(g0.stack.hi), ")\n") + throw("bad g0 stack") + } + g0.stack.lo = base + g0.stackguard0 = g0.stack.lo + _StackGuard + g0.stackguard1 = g0.stackguard0 + // Sanity check the SP. + stackcheck() } // Called from dropm to undo the effect of an minit. @@ -707,17 +734,20 @@ func stdcall(fn stdFunction) uintptr { gp := getg() mp := gp.m mp.libcall.fn = uintptr(unsafe.Pointer(fn)) - - if mp.profilehz != 0 { + resetLibcall := false + if mp.profilehz != 0 && mp.libcallsp == 0 { // leave pc/sp for cpu profiler mp.libcallg.set(gp) mp.libcallpc = getcallerpc() // sp must be the last, because once async cpu profiler finds // all three values to be non-zero, it will use them mp.libcallsp = getcallersp() + resetLibcall = true // See comment in sys_darwin.go:libcCall } asmcgocall(asmstdcallAddr, unsafe.Pointer(&mp.libcall)) - mp.libcallsp = 0 + if resetLibcall { + mp.libcallsp = 0 + } return mp.libcall.r1 } diff --git a/src/runtime/panic.go b/src/runtime/panic.go index ce367cfa70..45be886196 100644 --- a/src/runtime/panic.go +++ b/src/runtime/panic.go @@ -23,7 +23,23 @@ func panicCheckMalloc(err error) { var indexError = error(errorString("index out of range")) +// The panicindex, panicslice, and panicdivide functions are called by +// code generated by the compiler for out of bounds index expressions, +// out of bounds slice expressions, and division by zero. The +// panicdivide (again), panicoverflow, panicfloat, and panicmem +// functions are called by the signal handler when a signal occurs +// indicating the respective problem. +// +// Since panicindex and panicslice are never called directly, and +// since the runtime package should never have an out of bounds slice +// or array reference, if we see those functions called from the +// runtime package we turn the panic into a throw. That will dump the +// entire runtime stack for easier debugging. + func panicindex() { + if hasPrefix(funcname(findfunc(getcallerpc())), "runtime.") { + throw(string(indexError.(errorString))) + } panicCheckMalloc(indexError) panic(indexError) } @@ -31,6 +47,9 @@ func panicindex() { var sliceError = error(errorString("slice bounds out of range")) func panicslice() { + if hasPrefix(funcname(findfunc(getcallerpc())), "runtime.") { + throw(string(sliceError.(errorString))) + } panicCheckMalloc(sliceError) panic(sliceError) } @@ -870,6 +889,11 @@ func shouldPushSigpanic(gp *g, pc, lr uintptr) bool { // isAbortPC returns true if pc is the program counter at which // runtime.abort raises a signal. +// +// It is nosplit because it's part of the isgoexception +// implementation. 
+// +//go:nosplit func isAbortPC(pc uintptr) bool { return pc == funcPC(abort) || ((GOARCH == "arm" || GOARCH == "arm64") && pc == funcPC(abort)+sys.PCQuantum) } diff --git a/src/runtime/pprof/pprof.go b/src/runtime/pprof/pprof.go index c1024c99ed..74cdd15cfb 100644 --- a/src/runtime/pprof/pprof.go +++ b/src/runtime/pprof/pprof.go @@ -28,6 +28,7 @@ // if err != nil { // log.Fatal("could not create CPU profile: ", err) // } +// defer f.Close() // if err := pprof.StartCPUProfile(f); err != nil { // log.Fatal("could not start CPU profile: ", err) // } @@ -41,11 +42,11 @@ // if err != nil { // log.Fatal("could not create memory profile: ", err) // } +// defer f.Close() // runtime.GC() // get up-to-date statistics // if err := pprof.WriteHeapProfile(f); err != nil { // log.Fatal("could not write memory profile: ", err) // } -// f.Close() // } // } // diff --git a/src/runtime/pprof/pprof_test.go b/src/runtime/pprof/pprof_test.go index 44d514393e..126ba50054 100644 --- a/src/runtime/pprof/pprof_test.go +++ b/src/runtime/pprof/pprof_test.go @@ -72,15 +72,24 @@ func cpuHog2(x int) int { return foo } +// Return a list of functions that we don't want to ever appear in CPU +// profiles. For gccgo, that list includes the sigprof handler itself. +func avoidFunctions() []string { + if runtime.Compiler == "gccgo" { + return []string{"runtime.sigprof"} + } + return nil +} + func TestCPUProfile(t *testing.T) { - testCPUProfile(t, []string{"runtime/pprof.cpuHog1"}, func(dur time.Duration) { + testCPUProfile(t, stackContains, []string{"runtime/pprof.cpuHog1"}, avoidFunctions(), func(dur time.Duration) { cpuHogger(cpuHog1, &salt1, dur) }) } func TestCPUProfileMultithreaded(t *testing.T) { defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(2)) - testCPUProfile(t, []string{"runtime/pprof.cpuHog1", "runtime/pprof.cpuHog2"}, func(dur time.Duration) { + testCPUProfile(t, stackContains, []string{"runtime/pprof.cpuHog1", "runtime/pprof.cpuHog2"}, avoidFunctions(), func(dur time.Duration) { c := make(chan int) go func() { cpuHogger(cpuHog1, &salt1, dur) @@ -92,7 +101,7 @@ func TestCPUProfileMultithreaded(t *testing.T) { } func TestCPUProfileInlining(t *testing.T) { - testCPUProfile(t, []string{"runtime/pprof.inlinedCallee", "runtime/pprof.inlinedCaller"}, func(dur time.Duration) { + testCPUProfile(t, stackContains, []string{"runtime/pprof.inlinedCallee", "runtime/pprof.inlinedCaller"}, avoidFunctions(), func(dur time.Duration) { cpuHogger(inlinedCaller, &salt1, dur) }) } @@ -130,7 +139,9 @@ func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []*profile.Loca } } -func testCPUProfile(t *testing.T, need []string, f func(dur time.Duration)) { +// testCPUProfile runs f under the CPU profiler, checking for some conditions specified by need, +// as interpreted by matches. +func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []string, f func(dur time.Duration)) { switch runtime.GOOS { case "darwin": switch runtime.GOARCH { @@ -169,7 +180,7 @@ func testCPUProfile(t *testing.T, need []string, f func(dur time.Duration)) { f(duration) StopCPUProfile() - if profileOk(t, need, prof, duration) { + if profileOk(t, matches, need, avoid, prof, duration) { return } @@ -202,29 +213,43 @@ func contains(slice []string, s string) bool { return false } -func profileOk(t *testing.T, need []string, prof bytes.Buffer, duration time.Duration) (ok bool) { +// stackContains matches if a function named spec appears anywhere in the stack trace. 
+func stackContains(spec string, count uintptr, stk []*profile.Location, labels map[string][]string) bool { + for _, loc := range stk { + for _, line := range loc.Line { + if strings.Contains(line.Function.Name, spec) { + return true + } + } + } + return false +} + +type matchFunc func(spec string, count uintptr, stk []*profile.Location, labels map[string][]string) bool + +func profileOk(t *testing.T, matches matchFunc, need []string, avoid []string, prof bytes.Buffer, duration time.Duration) (ok bool) { ok = true - // Check that profile is well formed and contains need. + // Check that profile is well formed, contains 'need', and does not contain + // anything from 'avoid'. have := make([]uintptr, len(need)) + avoidSamples := make([]uintptr, len(avoid)) var samples uintptr var buf bytes.Buffer parseProfile(t, prof.Bytes(), func(count uintptr, stk []*profile.Location, labels map[string][]string) { fmt.Fprintf(&buf, "%d:", count) fprintStack(&buf, stk) samples += count - for i, name := range need { - if semi := strings.Index(name, ";"); semi > -1 { - kv := strings.SplitN(name[semi+1:], "=", 2) - if len(kv) != 2 || !contains(labels[kv[0]], kv[1]) { - continue - } - name = name[:semi] + for i, spec := range need { + if matches(spec, count, stk, labels) { + have[i] += count } + } + for i, name := range avoid { for _, loc := range stk { for _, line := range loc.Line { if strings.Contains(line.Function.Name, name) { - have[i] += count + avoidSamples[i] += count } } } @@ -251,6 +276,14 @@ func profileOk(t *testing.T, need []string, prof bytes.Buffer, duration time.Dur ok = false } + for i, name := range avoid { + bad := avoidSamples[i] + if bad != 0 { + t.Logf("found %d samples in avoid-function %s\n", bad, name) + ok = false + } + } + if len(need) == 0 { return ok } @@ -318,6 +351,9 @@ func TestCPUProfileWithFork(t *testing.T) { // If it did, it would see inconsistent state and would either record an incorrect stack // or crash because the stack was malformed. func TestGoroutineSwitch(t *testing.T) { + if runtime.Compiler == "gccgo" { + t.Skip("not applicable for gccgo") + } // How much to try. These defaults take about 1 second // on a 2012 MacBook Pro. The ones in short mode take // about 0.1 seconds. @@ -377,7 +413,7 @@ func fprintStack(w io.Writer, stk []*profile.Location) { // Test that profiling of division operations is okay, especially on ARM. See issue 6681. func TestMathBigDivide(t *testing.T) { - testCPUProfile(t, nil, func(duration time.Duration) { + testCPUProfile(t, nil, nil, nil, func(duration time.Duration) { t := time.After(duration) pi := new(big.Int) for { @@ -395,6 +431,48 @@ func TestMathBigDivide(t *testing.T) { }) } +// stackContainsAll matches if all functions in spec (comma-separated) appear somewhere in the stack trace.
+func stackContainsAll(spec string, count uintptr, stk []*profile.Location, labels map[string][]string) bool { + for _, f := range strings.Split(spec, ",") { + if !stackContains(f, count, stk, labels) { + return false + } + } + return true +} + +func TestMorestack(t *testing.T) { + testCPUProfile(t, stackContainsAll, []string{"runtime.newstack,runtime/pprof.growstack"}, avoidFunctions(), func(duration time.Duration) { + t := time.After(duration) + c := make(chan bool) + for { + go func() { + growstack1() + c <- true + }() + select { + case <-t: + return + case <-c: + } + } + }) +} + +//go:noinline +func growstack1() { + growstack() +} + +//go:noinline +func growstack() { + var buf [8 << 10]byte + use(buf) +} + +//go:noinline +func use(x [8 << 10]byte) {} + func TestBlockProfile(t *testing.T) { type TestCase struct { name string @@ -848,8 +926,25 @@ func TestEmptyCallStack(t *testing.T) { } } +// stackContainsLabeled takes a spec like funcname;key=value and matches if the stack has that key +// and value and has funcname somewhere in the stack. +func stackContainsLabeled(spec string, count uintptr, stk []*profile.Location, labels map[string][]string) bool { + semi := strings.Index(spec, ";") + if semi == -1 { + panic("no semicolon in key/value spec") + } + kv := strings.SplitN(spec[semi+1:], "=", 2) + if len(kv) != 2 { + panic("missing = in key/value spec") + } + if !contains(labels[kv[0]], kv[1]) { + return false + } + return stackContains(spec[:semi], count, stk, labels) +} + func TestCPUProfileLabel(t *testing.T) { - testCPUProfile(t, []string{"runtime/pprof.cpuHogger;key=value"}, func(dur time.Duration) { + testCPUProfile(t, stackContainsLabeled, []string{"runtime/pprof.cpuHogger;key=value"}, avoidFunctions(), func(dur time.Duration) { Do(context.Background(), Labels("key", "value"), func(context.Context) { cpuHogger(cpuHog1, &salt1, dur) }) @@ -860,7 +955,7 @@ func TestLabelRace(t *testing.T) { // Test the race detector annotations for synchronization // between setting labels and consuming them from the // profile. - testCPUProfile(t, []string{"runtime/pprof.cpuHogger;key=value"}, func(dur time.Duration) { + testCPUProfile(t, stackContainsLabeled, []string{"runtime/pprof.cpuHogger;key=value"}, nil, func(dur time.Duration) { start := time.Now() var wg sync.WaitGroup for time.Since(start) < dur { diff --git a/src/runtime/pprof/proto.go b/src/runtime/pprof/proto.go index 1cf3a5154f..cbd0b83376 100644 --- a/src/runtime/pprof/proto.go +++ b/src/runtime/pprof/proto.go @@ -54,6 +54,7 @@ type memMap struct { file, buildID string funcs symbolizeFlag + fake bool // map entry was faked; /proc/self/maps wasn't available } // symbolizeFlag keeps track of symbolization result. @@ -267,7 +268,7 @@ func (b *profileBuilder) locForPC(addr uintptr) uint64 { frame, more = frames.Next() } for i := range b.mem { - if b.mem[i].start <= addr && addr < b.mem[i].end { + if b.mem[i].start <= addr && addr < b.mem[i].end || b.mem[i].fake { b.pb.uint64Opt(tagLocation_MappingID, uint64(i+1)) m := b.mem[i] @@ -440,6 +441,12 @@ func (b *profileBuilder) build() { func (b *profileBuilder) readMapping() { data, _ := ioutil.ReadFile("/proc/self/maps") parseProcSelfMaps(data, b.addMapping) + if len(b.mem) == 0 { // pprof expects a map entry, so fake one. + b.addMappingEntry(0, 0, 0, "", "", true) + // TODO(hyangah): make addMapping return *memMap or + // take a memMap struct, and get rid of addMappingEntry + // that takes a bunch of positional arguments.
+ } } func parseProcSelfMaps(data []byte, addMapping func(lo, hi, offset uint64, file, buildID string)) { @@ -540,11 +547,16 @@ func parseProcSelfMaps(data []byte, addMapping func(lo, hi, offset uint64, file, } func (b *profileBuilder) addMapping(lo, hi, offset uint64, file, buildID string) { + b.addMappingEntry(lo, hi, offset, file, buildID, false) +} + +func (b *profileBuilder) addMappingEntry(lo, hi, offset uint64, file, buildID string, fake bool) { b.mem = append(b.mem, memMap{ start: uintptr(lo), end: uintptr(hi), offset: offset, file: file, buildID: buildID, + fake: fake, }) } diff --git a/src/runtime/pprof/proto_test.go b/src/runtime/pprof/proto_test.go index 36c345b6d9..76bd46da02 100644 --- a/src/runtime/pprof/proto_test.go +++ b/src/runtime/pprof/proto_test.go @@ -97,9 +97,16 @@ func testPCs(t *testing.T) (addr1, addr2 uint64, map1, map2 *profile.Mapping) { addr2 = mprof.Mapping[1].Start map2 = mprof.Mapping[1] map2.BuildID, _ = elfBuildID(map2.File) + case "js": + addr1 = uint64(funcPC(f1)) + addr2 = uint64(funcPC(f2)) default: addr1 = uint64(funcPC(f1)) addr2 = uint64(funcPC(f2)) + // Fake mapping - HasFunctions will be true because two PCs from Go + // will be fully symbolized. + fake := &profile.Mapping{ID: 1, HasFunctions: true} + map1, map2 = fake, fake } return } @@ -301,3 +308,41 @@ func symbolized(loc *profile.Location) bool { } return true } + +// TestFakeMapping tests if at least one mapping exists +// (including a fake mapping), and their HasFunctions bits +// are set correctly. +func TestFakeMapping(t *testing.T) { + var buf bytes.Buffer + if err := Lookup("heap").WriteTo(&buf, 0); err != nil { + t.Fatalf("failed to write heap profile: %v", err) + } + prof, err := profile.Parse(&buf) + if err != nil { + t.Fatalf("failed to parse the generated profile data: %v", err) + } + t.Logf("Profile: %s", prof) + if len(prof.Mapping) == 0 { + t.Fatal("want profile with at least one mapping entry, got 0 mapping") + } + + hit := make(map[*profile.Mapping]bool) + miss := make(map[*profile.Mapping]bool) + for _, loc := range prof.Location { + if symbolized(loc) { + hit[loc.Mapping] = true + } else { + miss[loc.Mapping] = true + } + } + for _, m := range prof.Mapping { + if miss[m] && m.HasFunctions { + t.Errorf("mapping %+v has HasFunctions=true, but contains locations with failed symbolization", m) + continue + } + if !miss[m] && hit[m] && !m.HasFunctions { + t.Errorf("mapping %+v has HasFunctions=false, but all referenced locations from this mapping were symbolized successfully", m) + continue + } + } +} diff --git a/src/runtime/pprof/protomem.go b/src/runtime/pprof/protomem.go index 82565d5245..1c88aae43a 100644 --- a/src/runtime/pprof/protomem.go +++ b/src/runtime/pprof/protomem.go @@ -56,8 +56,8 @@ func writeHeapProto(w io.Writer, p []runtime.MemProfileRecord, rate int64, defau values[0], values[1] = scaleHeapSample(r.AllocObjects, r.AllocBytes, rate) values[2], values[3] = scaleHeapSample(r.InUseObjects(), r.InUseBytes(), rate) var blockSize int64 - if values[0] > 0 { - blockSize = values[1] / values[0] + if r.AllocObjects > 0 { + blockSize = r.AllocBytes / r.AllocObjects } b.pbSample(values, locs, func() { if blockSize != 0 { diff --git a/src/runtime/pprof/protomem_test.go b/src/runtime/pprof/protomem_test.go index 315d5f0b4d..471b1ae9c3 100644 --- a/src/runtime/pprof/protomem_test.go +++ b/src/runtime/pprof/protomem_test.go @@ -48,7 +48,7 @@ func TestConvertMemProfile(t *testing.T) { {ID: 3, Mapping: map2, Address: addr2 + 1}, {ID: 4, Mapping: map2, Address: addr2 + 2}, }, -
NumLabel: map[string][]int64{"bytes": {829411}}, + NumLabel: map[string][]int64{"bytes": {512 * 1024}}, }, { Value: []int64{1, 829411, 0, 0}, @@ -57,7 +57,7 @@ func TestConvertMemProfile(t *testing.T) { {ID: 6, Mapping: map1, Address: addr1 + 2}, {ID: 7, Mapping: map2, Address: addr2 + 3}, }, - NumLabel: map[string][]int64{"bytes": {829411}}, + NumLabel: map[string][]int64{"bytes": {512 * 1024}}, }, } for _, tc := range []struct { diff --git a/src/runtime/proc.go b/src/runtime/proc.go index b5486321ed..73b4a1d9d6 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -477,20 +477,14 @@ const ( _GoidCacheBatch = 16 ) -//go:linkname internal_cpu_initialize internal/cpu.initialize -func internal_cpu_initialize(env string) - -//go:linkname internal_cpu_debugOptions internal/cpu.debugOptions -var internal_cpu_debugOptions bool - // cpuinit extracts the environment variable GODEBUGCPU from the environment on -// Linux and Darwin if the GOEXPERIMENT debugcpu was set and calls internal/cpu.initialize. +// Linux and Darwin if the GOEXPERIMENT debugcpu was set and calls internal/cpu.Initialize. func cpuinit() { const prefix = "GODEBUGCPU=" var env string if haveexperiment("debugcpu") && (GOOS == "linux" || GOOS == "darwin") { - internal_cpu_debugOptions = true + cpu.DebugOptions = true // Similar to goenv_unix but extracts the environment value for // GODEBUGCPU directly. @@ -504,18 +498,18 @@ func cpuinit() { p := argv_index(argv, argc+1+i) s := *(*string)(unsafe.Pointer(&stringStruct{unsafe.Pointer(p), findnull(p)})) - if hasprefix(s, prefix) { + if hasPrefix(s, prefix) { env = gostring(p)[len(prefix):] break } } } - internal_cpu_initialize(env) + cpu.Initialize(env) - support_erms = cpu.X86.HasERMS + // Support cpu feature variables are used in code generated by the compiler + // to guard execution of instructions that can not be assumed to be always supported. support_popcnt = cpu.X86.HasPOPCNT - support_sse2 = cpu.X86.HasSSE2 support_sse41 = cpu.X86.HasSSE41 arm64_support_atomics = cpu.ARM64.HasATOMICS @@ -1148,8 +1142,8 @@ func startTheWorldWithSema(emitTraceEvent bool) int64 { _g_.m.locks++ // disable preemption because it can be holding p in a local var if netpollinited() { - gp := netpoll(false) // non-blocking - injectglist(gp) + list := netpoll(false) // non-blocking + injectglist(&list) } add := needaddgcproc() lock(&sched.lock) @@ -1233,6 +1227,7 @@ func mstart() { if osStack { // Initialize stack bounds from system stack. // Cgo may have left stack size in stack.hi. + // minit may update the stack bounds. size := _g_.stack.hi if size == 0 { size = 8192 * sys.StackGuardMultiplier @@ -1604,7 +1599,7 @@ func allocm(_p_ *p, fn func()) *m { // the following strategy: there is a stack of available m's // that can be stolen. Using compare-and-swap // to pop from the stack has ABA races, so we simulate -// a lock by doing an exchange (via casp) to steal the stack +// a lock by doing an exchange (via Casuintptr) to steal the stack // head and replace the top pointer with MLOCKED (1). // This serves as a simple spin lock that we can use even // without an m. The thread that locks the stack in this way @@ -2311,9 +2306,9 @@ top: // not set lastpoll yet), this thread will do blocking netpoll below // anyway. if netpollinited() && atomic.Load(&netpollWaiters) > 0 && atomic.Load64(&sched.lastpoll) != 0 { - if gp := netpoll(false); gp != nil { // non-blocking - // netpoll returns list of goroutines linked by schedlink. 
- injectglist(gp.schedlink.ptr()) + if list := netpoll(false); !list.empty() { // non-blocking + gp := list.pop() + injectglist(&list) casgstatus(gp, _Gwaiting, _Grunnable) if trace.enabled { traceGoUnpark(gp, 0) @@ -2465,22 +2460,23 @@ stop: if _g_.m.spinning { throw("findrunnable: netpoll with spinning") } - gp := netpoll(true) // block until new work is available + list := netpoll(true) // block until new work is available atomic.Store64(&sched.lastpoll, uint64(nanotime())) - if gp != nil { + if !list.empty() { lock(&sched.lock) _p_ = pidleget() unlock(&sched.lock) if _p_ != nil { acquirep(_p_) - injectglist(gp.schedlink.ptr()) + gp := list.pop() + injectglist(&list) casgstatus(gp, _Gwaiting, _Grunnable) if trace.enabled { traceGoUnpark(gp, 0) } return gp, false } - injectglist(gp) + injectglist(&list) } } stopm() @@ -2500,8 +2496,8 @@ func pollWork() bool { return true } if netpollinited() && atomic.Load(&netpollWaiters) > 0 && sched.lastpoll != 0 { - if gp := netpoll(false); gp != nil { - injectglist(gp) + if list := netpoll(false); !list.empty() { + injectglist(&list) return true } } @@ -2526,22 +2522,21 @@ func resetspinning() { } } -// Injects the list of runnable G's into the scheduler. +// Injects the list of runnable G's into the scheduler and clears glist. // Can run concurrently with GC. -func injectglist(glist *g) { - if glist == nil { +func injectglist(glist *gList) { + if glist.empty() { return } if trace.enabled { - for gp := glist; gp != nil; gp = gp.schedlink.ptr() { + for gp := glist.head.ptr(); gp != nil; gp = gp.schedlink.ptr() { traceGoUnpark(gp, 0) } } lock(&sched.lock) var n int - for n = 0; glist != nil; n++ { - gp := glist - glist = gp.schedlink.ptr() + for n = 0; !glist.empty(); n++ { + gp := glist.pop() casgstatus(gp, _Gwaiting, _Grunnable) globrunqput(gp) } @@ -2549,6 +2544,7 @@ func injectglist(glist *g) { for ; n != 0 && sched.npidle != 0; n-- { startm(nil, false) } + *glist = gList{} } // One round of scheduler: find a runnable goroutine and execute it. @@ -3469,25 +3465,21 @@ func gfput(_p_ *p, gp *g) { gp.stackguard0 = 0 } - gp.schedlink.set(_p_.gfree) - _p_.gfree = gp - _p_.gfreecnt++ - if _p_.gfreecnt >= 64 { - lock(&sched.gflock) - for _p_.gfreecnt >= 32 { - _p_.gfreecnt-- - gp = _p_.gfree - _p_.gfree = gp.schedlink.ptr() + _p_.gFree.push(gp) + _p_.gFree.n++ + if _p_.gFree.n >= 64 { + lock(&sched.gFree.lock) + for _p_.gFree.n >= 32 { + _p_.gFree.n-- + gp = _p_.gFree.pop() if gp.stack.lo == 0 { - gp.schedlink.set(sched.gfreeNoStack) - sched.gfreeNoStack = gp + sched.gFree.noStack.push(gp) } else { - gp.schedlink.set(sched.gfreeStack) - sched.gfreeStack = gp + sched.gFree.stack.push(gp) } - sched.ngfree++ + sched.gFree.n++ } - unlock(&sched.gflock) + unlock(&sched.gFree.lock) } } @@ -3495,44 +3487,42 @@ func gfput(_p_ *p, gp *g) { // If local list is empty, grab a batch from global list. func gfget(_p_ *p) *g { retry: - gp := _p_.gfree - if gp == nil && (sched.gfreeStack != nil || sched.gfreeNoStack != nil) { - lock(&sched.gflock) - for _p_.gfreecnt < 32 { - if sched.gfreeStack != nil { - // Prefer Gs with stacks. - gp = sched.gfreeStack - sched.gfreeStack = gp.schedlink.ptr() - } else if sched.gfreeNoStack != nil { - gp = sched.gfreeNoStack - sched.gfreeNoStack = gp.schedlink.ptr() - } else { - break + if _p_.gFree.empty() && (!sched.gFree.stack.empty() || !sched.gFree.noStack.empty()) { + lock(&sched.gFree.lock) + // Move a batch of free Gs to the P. + for _p_.gFree.n < 32 { + // Prefer Gs with stacks. 
+ gp := sched.gFree.stack.pop() + if gp == nil { + gp = sched.gFree.noStack.pop() + if gp == nil { + break + } } - _p_.gfreecnt++ - sched.ngfree-- - gp.schedlink.set(_p_.gfree) - _p_.gfree = gp + sched.gFree.n-- + _p_.gFree.push(gp) + _p_.gFree.n++ } - unlock(&sched.gflock) + unlock(&sched.gFree.lock) goto retry } - if gp != nil { - _p_.gfree = gp.schedlink.ptr() - _p_.gfreecnt-- - if gp.stack.lo == 0 { - // Stack was deallocated in gfput. Allocate a new one. - systemstack(func() { - gp.stack = stackalloc(_FixedStack) - }) - gp.stackguard0 = gp.stack.lo + _StackGuard - } else { - if raceenabled { - racemalloc(unsafe.Pointer(gp.stack.lo), gp.stack.hi-gp.stack.lo) - } - if msanenabled { - msanmalloc(unsafe.Pointer(gp.stack.lo), gp.stack.hi-gp.stack.lo) - } + gp := _p_.gFree.pop() + if gp == nil { + return nil + } + _p_.gFree.n-- + if gp.stack.lo == 0 { + // Stack was deallocated in gfput. Allocate a new one. + systemstack(func() { + gp.stack = stackalloc(_FixedStack) + }) + gp.stackguard0 = gp.stack.lo + _StackGuard + } else { + if raceenabled { + racemalloc(unsafe.Pointer(gp.stack.lo), gp.stack.hi-gp.stack.lo) + } + if msanenabled { + msanmalloc(unsafe.Pointer(gp.stack.lo), gp.stack.hi-gp.stack.lo) } } return gp @@ -3540,21 +3530,18 @@ retry: // Purge all cached G's from gfree list to the global list. func gfpurge(_p_ *p) { - lock(&sched.gflock) - for _p_.gfreecnt != 0 { - _p_.gfreecnt-- - gp := _p_.gfree - _p_.gfree = gp.schedlink.ptr() + lock(&sched.gFree.lock) + for !_p_.gFree.empty() { + gp := _p_.gFree.pop() + _p_.gFree.n-- if gp.stack.lo == 0 { - gp.schedlink.set(sched.gfreeNoStack) - sched.gfreeNoStack = gp + sched.gFree.noStack.push(gp) } else { - gp.schedlink.set(sched.gfreeStack) - sched.gfreeStack = gp + sched.gFree.stack.push(gp) } - sched.ngfree++ + sched.gFree.n++ } - unlock(&sched.gflock) + unlock(&sched.gFree.lock) } // Breakpoint executes a breakpoint trap. @@ -3667,9 +3654,9 @@ func badunlockosthread() { } func gcount() int32 { - n := int32(allglen) - sched.ngfree - int32(atomic.Load(&sched.ngsys)) + n := int32(allglen) - sched.gFree.n - int32(atomic.Load(&sched.ngsys)) for _, _p_ := range allp { - n -= _p_.gfreecnt + n -= _p_.gFree.n } // All these variables can be changed concurrently, so the result can be inconsistent. @@ -3715,7 +3702,7 @@ func sigprof(pc, sp, lr uintptr, gp *g, mp *m) { // received from somewhere else (with _LostSIGPROFDuringAtomic64 as pc). if GOARCH == "mips" || GOARCH == "mipsle" || GOARCH == "arm" { if f := findfunc(pc); f.valid() { - if hasprefix(funcname(f), "runtime/internal/atomic") { + if hasPrefix(funcname(f), "runtime/internal/atomic") { lostAtomic64Count++ return } @@ -4386,8 +4373,8 @@ func sysmon() { now := nanotime() if netpollinited() && lastpoll != 0 && lastpoll+10*1000*1000 < now { atomic.Cas64(&sched.lastpoll, uint64(lastpoll), uint64(now)) - gp := netpoll(false) // non-blocking - returns list of goroutines - if gp != nil { + list := netpoll(false) // non-blocking - returns list of goroutines + if !list.empty() { // Need to decrement number of idle locked M's // (pretending that one more is running) before injectglist. // Otherwise it can lead to the following situation: @@ -4396,7 +4383,7 @@ func sysmon() { // observes that there is no work to do and no other running M's // and reports deadlock. 
incidlelocked(-1) - injectglist(gp) + injectglist(&list) incidlelocked(1) } } @@ -4411,8 -4398,9 @@ func sysmon() { if t := (gcTrigger{kind: gcTriggerTime, now: now}); t.test() && atomic.Load(&forcegc.idle) != 0 { lock(&forcegc.lock) forcegc.idle = 0 - forcegc.g.schedlink = 0 - injectglist(forcegc.g) + var list gList + list.push(forcegc.g) + injectglist(&list) unlock(&forcegc.lock) } // scavenge heap once in a while @@ -4579,7 +4567,7 @@ func schedtrace(detailed bool) { if mp != nil { id = mp.id } - print(" P", i, ": status=", _p_.status, " schedtick=", _p_.schedtick, " syscalltick=", _p_.syscalltick, " m=", id, " runqsize=", t-h, " gfreecnt=", _p_.gfreecnt, "\n") + print(" P", i, ": status=", _p_.status, " schedtick=", _p_.schedtick, " syscalltick=", _p_.syscalltick, " m=", id, " runqsize=", t-h, " gfreecnt=", _p_.gFree.n, "\n") } else { // In non-detailed mode format lengths of per-P run queues as: // [len1 len2 len3 len4] @@ -4666,13 +4654,7 @@ func mget() *m { // May run during STW, so write barriers are not allowed. //go:nowritebarrierrec func globrunqput(gp *g) { - gp.schedlink = 0 - if sched.runqtail != 0 { - sched.runqtail.ptr().schedlink.set(gp) - } else { - sched.runqhead.set(gp) - } - sched.runqtail.set(gp) + sched.runq.pushBack(gp) sched.runqsize++ } @@ -4681,25 +4663,17 @@ func globrunqput(gp *g) { // May run during STW, so write barriers are not allowed. //go:nowritebarrierrec func globrunqputhead(gp *g) { - gp.schedlink = sched.runqhead - sched.runqhead.set(gp) - if sched.runqtail == 0 { - sched.runqtail.set(gp) - } + sched.runq.push(gp) sched.runqsize++ } // Put a batch of runnable goroutines on the global runnable queue. +// This clears *batch. // Sched must be locked. -func globrunqputbatch(ghead *g, gtail *g, n int32) { - gtail.schedlink = 0 - if sched.runqtail != 0 { - sched.runqtail.ptr().schedlink.set(ghead) - } else { - sched.runqhead.set(ghead) - } - sched.runqtail.set(gtail) +func globrunqputbatch(batch *gQueue, n int32) { + sched.runq.pushBackAll(*batch) sched.runqsize += n + *batch = gQueue{} } // Try to get a batch of G's from the global runnable queue. @@ -4721,16 +4695,11 @@ func globrunqget(_p_ *p, max int32) *g { } sched.runqsize -= n - if sched.runqsize == 0 { - sched.runqtail = 0 - } - gp := sched.runqhead.ptr() - sched.runqhead = gp.schedlink + gp := sched.runq.pop() n-- for ; n > 0; n-- { - gp1 := sched.runqhead.ptr() - sched.runqhead = gp1.schedlink + gp1 := sched.runq.pop() runqput(_p_, gp1, false) } return gp @@ -4858,10 +4827,13 @@ func runqputslow(_p_ *p, gp *g, h, t uint32) bool { for i := uint32(0); i < n; i++ { batch[i].schedlink.set(batch[i+1]) } + var q gQueue + q.head.set(batch[0]) + q.tail.set(batch[n]) // Now put the batch on global queue. lock(&sched.lock) - globrunqputbatch(batch[0], batch[n], int32(n+1)) + globrunqputbatch(&q, int32(n+1)) unlock(&sched.lock) return true } @@ -4973,6 +4945,107 @@ func runqsteal(_p_, p2 *p, stealRunNextG bool) *g { return gp } +// A gQueue is a deque of Gs linked through g.schedlink. A G can only +// be on one gQueue or gList at a time. +type gQueue struct { + head guintptr + tail guintptr +} + +// empty returns true if q is empty. +func (q *gQueue) empty() bool { + return q.head == 0 +} + +// push adds gp to the head of q. +func (q *gQueue) push(gp *g) { + gp.schedlink = q.head + q.head.set(gp) + if q.tail == 0 { + q.tail.set(gp) + } +} + +// pushBack adds gp to the tail of q.
+func (q *gQueue) pushBack(gp *g) { + gp.schedlink = 0 + if q.tail != 0 { + q.tail.ptr().schedlink.set(gp) + } else { + q.head.set(gp) + } + q.tail.set(gp) +} + +// pushBackAll adds all Gs in l2 to the tail of q. After this q2 must +// not be used. +func (q *gQueue) pushBackAll(q2 gQueue) { + if q2.tail == 0 { + return + } + q2.tail.ptr().schedlink = 0 + if q.tail != 0 { + q.tail.ptr().schedlink = q2.head + } else { + q.head = q2.head + } + q.tail = q2.tail +} + +// pop removes and returns the head of queue q. It returns nil if +// q is empty. +func (q *gQueue) pop() *g { + gp := q.head.ptr() + if gp != nil { + q.head = gp.schedlink + if q.head == 0 { + q.tail = 0 + } + } + return gp +} + +// popList takes all Gs in q and returns them as a gList. +func (q *gQueue) popList() gList { + stack := gList{q.head} + *q = gQueue{} + return stack +} + +// A gList is a list of Gs linked through g.schedlink. A G can only be +// on one gQueue or gList at a time. +type gList struct { + head guintptr +} + +// empty returns true if l is empty. +func (l *gList) empty() bool { + return l.head == 0 +} + +// push adds gp to the head of l. +func (l *gList) push(gp *g) { + gp.schedlink = l.head + l.head.set(gp) +} + +// pushAll prepends all Gs in q to l. +func (l *gList) pushAll(q gQueue) { + if !q.empty() { + q.tail.ptr().schedlink = l.head + l.head = q.head + } +} + +// pop removes and returns the head of l. If l is empty, it returns nil. +func (l *gList) pop() *g { + gp := l.head.ptr() + if gp != nil { + l.head = gp.schedlink + } + return gp +} + //go:linkname setMaxThreads runtime/debug.setMaxThreads func setMaxThreads(in int) (out int) { lock(&sched.lock) diff --git a/src/runtime/race/race.go b/src/runtime/race/race.go index f702c7a5d4..95e965411b 100644 --- a/src/runtime/race/race.go +++ b/src/runtime/race/race.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build race,linux,amd64 race,freebsd,amd64 race,darwin,amd64 race,windows,amd64 race,linux,ppc64le +// +build race,linux,amd64 race,freebsd,amd64 race,netbsd,amd64 race,darwin,amd64 race,windows,amd64 race,linux,ppc64le package race diff --git a/src/runtime/rt0_darwin_arm64.s b/src/runtime/rt0_darwin_arm64.s index d039a8e0ab..e3972f4924 100644 --- a/src/runtime/rt0_darwin_arm64.s +++ b/src/runtime/rt0_darwin_arm64.s @@ -49,7 +49,9 @@ TEXT _rt0_arm64_darwin_lib(SB),NOSPLIT,$168 MOVD _cgo_sys_thread_create(SB), R4 MOVD $_rt0_arm64_darwin_lib_go(SB), R0 MOVD $0, R1 + SUB $16, RSP // reserve 16 bytes for sp-8 where fp may be saved. BL (R4) + ADD $16, RSP // Restore callee-save registers. MOVD 24(RSP), R19 diff --git a/src/runtime/rt0_linux_arm64.s b/src/runtime/rt0_linux_arm64.s index 458f082159..a6bc99df56 100644 --- a/src/runtime/rt0_linux_arm64.s +++ b/src/runtime/rt0_linux_arm64.s @@ -48,7 +48,9 @@ TEXT _rt0_arm64_linux_lib(SB),NOSPLIT,$184 BEQ nocgo MOVD $_rt0_arm64_linux_lib_go(SB), R0 MOVD $0, R1 + SUB $16, RSP // reserve 16 bytes for sp-8 where fp may be saved. 
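The gQueue and gList added above are intrusive containers: the link pointer lives inside the G itself, so enqueueing allocates nothing, which is why a G can be on only one of them at a time. A self-contained Go sketch of the same shape, with a toy node standing in for the runtime's g:

	package main

	import "fmt"

	type g struct {
		id        int
		schedlink *g // intrusive link, like the runtime's g.schedlink
	}

	type gQueue struct{ head, tail *g }

	func (q *gQueue) pushBack(gp *g) {
		gp.schedlink = nil
		if q.tail != nil {
			q.tail.schedlink = gp
		} else {
			q.head = gp
		}
		q.tail = gp
	}

	func (q *gQueue) pop() *g {
		gp := q.head
		if gp != nil {
			q.head = gp.schedlink
			if q.head == nil {
				q.tail = nil
			}
		}
		return gp
	}

	func main() {
		var q gQueue
		for i := 1; i <= 3; i++ {
			q.pushBack(&g{id: i})
		}
		for gp := q.pop(); gp != nil; gp = q.pop() {
			fmt.Println(gp.id) // 1, 2, 3 — FIFO, matching the runtime's gQueue
		}
	}

gList drops the tail pointer and is LIFO-only, which is enough for the free-G caches and for netpoll's result list.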
diff --git a/src/runtime/race/race.go b/src/runtime/race/race.go
index f702c7a5d4..95e965411b 100644
--- a/src/runtime/race/race.go
+++ b/src/runtime/race/race.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build race,linux,amd64 race,freebsd,amd64 race,darwin,amd64 race,windows,amd64 race,linux,ppc64le
+// +build race,linux,amd64 race,freebsd,amd64 race,netbsd,amd64 race,darwin,amd64 race,windows,amd64 race,linux,ppc64le
 
 package race
diff --git a/src/runtime/rt0_darwin_arm64.s b/src/runtime/rt0_darwin_arm64.s
index d039a8e0ab..e3972f4924 100644
--- a/src/runtime/rt0_darwin_arm64.s
+++ b/src/runtime/rt0_darwin_arm64.s
@@ -49,7 +49,9 @@ TEXT _rt0_arm64_darwin_lib(SB),NOSPLIT,$168
 	MOVD	_cgo_sys_thread_create(SB), R4
 	MOVD	$_rt0_arm64_darwin_lib_go(SB), R0
 	MOVD	$0, R1
+	SUB	$16, RSP	// reserve 16 bytes for sp-8 where fp may be saved.
 	BL	(R4)
+	ADD	$16, RSP
 
 	// Restore callee-save registers.
 	MOVD	24(RSP), R19
diff --git a/src/runtime/rt0_linux_arm64.s b/src/runtime/rt0_linux_arm64.s
index 458f082159..a6bc99df56 100644
--- a/src/runtime/rt0_linux_arm64.s
+++ b/src/runtime/rt0_linux_arm64.s
@@ -48,7 +48,9 @@ TEXT _rt0_arm64_linux_lib(SB),NOSPLIT,$184
 	BEQ	nocgo
 	MOVD	$_rt0_arm64_linux_lib_go(SB), R0
 	MOVD	$0, R1
+	SUB	$16, RSP	// reserve 16 bytes for sp-8 where fp may be saved.
 	BL	(R4)
+	ADD	$16, RSP
 	B	restore
 
 nocgo:
diff --git a/src/runtime/runtime-gdb_test.go b/src/runtime/runtime-gdb_test.go
index 4733efba6d..d9c6f6d22a 100644
--- a/src/runtime/runtime-gdb_test.go
+++ b/src/runtime/runtime-gdb_test.go
@@ -162,7 +162,22 @@ func testGdbPython(t *testing.T, cgo bool) {
 	args := []string{"-nx", "-q", "--batch",
 		"-iex", "add-auto-load-safe-path " + filepath.Join(runtime.GOROOT(), "src", "runtime"),
 		"-ex", "set startup-with-shell off",
-		"-ex", "info auto-load python-scripts",
+	}
+	if cgo {
+		// When we build the cgo version of the program, the system's
+		// linker is used. Some external linkers, like GNU gold,
+		// compress the .debug_gdb_scripts into .zdebug_gdb_scripts.
+		// Until gold and gdb can work together, temporarily load the
+		// python script directly.
+		args = append(args,
+			"-ex", "source "+filepath.Join(runtime.GOROOT(), "src", "runtime", "runtime-gdb.py"),
+		)
+	} else {
+		args = append(args,
+			"-ex", "info auto-load python-scripts",
+		)
+	}
+	args = append(args,
 		"-ex", "set python print-stack full",
 		"-ex", "br fmt.Println",
 		"-ex", "run",
@@ -193,7 +208,7 @@ func testGdbPython(t *testing.T, cgo bool) {
 		"-ex", "goroutine 1 bt",
 		"-ex", "echo END\n",
 		filepath.Join(dir, "a.exe"),
-	}
+	)
 	got, _ := exec.Command("gdb", args...).CombinedOutput()
 	t.Logf("gdb output: %s\n", got)
diff --git a/src/runtime/runtime-lldb_test.go b/src/runtime/runtime-lldb_test.go
index a036fd8480..fe3a0eb90d 100644
--- a/src/runtime/runtime-lldb_test.go
+++ b/src/runtime/runtime-lldb_test.go
@@ -10,7 +10,6 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
-	"regexp"
 	"runtime"
 	"strings"
 	"testing"
@@ -83,12 +82,8 @@ target = debugger.CreateTargetWithFileAndArch("a.exe", None)
 if target:
   print "Created target"
   main_bp = target.BreakpointCreateByLocation("main.go", 10)
-  if main_bp.GetNumLocations() != 0:
+  if main_bp:
     print "Created breakpoint"
-  else:
-    # This happens if lldb can't read the program's DWARF. See https://golang.org/issue/25925.
-    print "SKIP: no matching locations for breakpoint"
-    exit(1)
   process = target.LaunchSimple(None, None, os.getcwd())
   if process:
     print "Process launched"
@@ -103,7 +98,7 @@ if target:
       if state in [lldb.eStateUnloaded, lldb.eStateLaunching, lldb.eStateRunning]:
         continue
       else:
-        print "SKIP: Timeout launching"
+        print "Timeout launching"
       break
     if state == lldb.eStateStopped:
       for t in process.threads:
@@ -159,7 +154,9 @@ func TestLldbPython(t *testing.T) {
 		t.Fatalf("failed to create file: %v", err)
 	}
 
-	cmd := exec.Command(testenv.GoToolPath(t), "build", "-gcflags=all=-N -l", "-o", "a.exe")
+	// As of 2018-07-17, lldb doesn't support compressed DWARF, so
+	// disable it for this test.
+	cmd := exec.Command(testenv.GoToolPath(t), "build", "-gcflags=all=-N -l", "-ldflags=-compressdwarf=false", "-o", "a.exe")
 	cmd.Dir = dir
 	out, err := cmd.CombinedOutput()
 	if err != nil {
@@ -177,9 +174,8 @@ func TestLldbPython(t *testing.T) {
 	got, _ := cmd.CombinedOutput()
 
 	if string(got) != expectedLldbOutput {
-		skipReason := regexp.MustCompile("SKIP: .*\n").Find(got)
-		if skipReason != nil {
-			t.Skip(string(skipReason))
+		if strings.Contains(string(got), "Timeout launching") {
+			t.Skip("Timeout launching")
 		}
 		t.Fatalf("Unexpected lldb output:\n%s", got)
 	}
diff --git a/src/runtime/runtime1.go b/src/runtime/runtime1.go
index a0769bbb67..d5f78baded 100644
--- a/src/runtime/runtime1.go
+++ b/src/runtime/runtime1.go
@@ -145,7 +145,7 @@ func check() {
 		h     uint64
 		i, i1 float32
 		j, j1 float64
-		k, k1 unsafe.Pointer
+		k     unsafe.Pointer
 		l     *uint16
 		m     [4]byte
 	)
@@ -234,21 +234,6 @@ func check() {
 		throw("cas6")
 	}
 
-	k = unsafe.Pointer(uintptr(0xfedcb123))
-	if sys.PtrSize == 8 {
-		k = unsafe.Pointer(uintptr(k) << 10)
-	}
-	if casp(&k, nil, nil) {
-		throw("casp1")
-	}
-	k1 = add(k, 1)
-	if !casp(&k, k, k1) {
-		throw("casp2")
-	}
-	if k != k1 {
-		throw("casp3")
-	}
-
 	m = [4]byte{1, 1, 1, 1}
 	atomic.Or8(&m[1], 0xf0)
 	if m[0] != 1 || m[1] != 0xf1 || m[2] != 1 || m[3] != 1 {
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index a3193b63c5..259bb376ae 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -5,6 +5,7 @@
 package runtime
 
 import (
+	"internal/cpu"
 	"runtime/internal/atomic"
 	"runtime/internal/sys"
 	"unsafe"
@@ -506,8 +507,10 @@ type p struct {
 	runnext guintptr
 
 	// Available G's (status == Gdead)
-	gfree    *g
-	gfreecnt int32
+	gFree struct {
+		gList
+		n int32
+	}
 
 	sudogcache []*sudog
 	sudogbuf   [128]*sudog
@@ -546,7 +549,7 @@ type p struct {
 
 	runSafePointFn uint32 // if 1, run sched.safePointFn at next safe point
 
-	pad [sys.CacheLineSize]byte
+	pad cpu.CacheLinePad
 }
 
 type schedt struct {
@@ -574,15 +577,16 @@ type schedt struct {
 	nmspinning uint32 // See "Worker thread parking/unparking" comment in proc.go.
 
 	// Global runnable queue.
-	runqhead guintptr
-	runqtail guintptr
+	runq     gQueue
 	runqsize int32
 
 	// Global cache of dead G's.
-	gflock       mutex
-	gfreeStack   *g
-	gfreeNoStack *g
-	ngfree       int32
+	gFree struct {
+		lock    mutex
+		stack   gList // Gs with stacks
+		noStack gList // Gs without stacks
+		n       int32
+	}
 
 	// Central cache of sudog structs.
 	sudoglock  mutex
@@ -833,20 +837,18 @@ var (
 	newprocs     int32
 
 	// Information about what cpu features are available.
-	// Set on startup in runtime.cpuinit.
 	// Packages outside the runtime should not use these
 	// as they are not an external api.
-	// TODO: deprecate these; use internal/cpu directly.
+	// Set on startup in asm_{386,amd64,amd64p32}.s
 	processorVersionInfo uint32
 	isIntel              bool
 	lfenceBeforeRdtsc    bool
 
 	// Set in runtime.cpuinit.
-	support_erms          bool
-	support_popcnt        bool
-	support_sse2          bool
-	support_sse41         bool
-	arm64_support_atomics bool
+	// TODO: deprecate these; use internal/cpu directly.
+	support_popcnt        bool
+	support_sse41         bool
+	arm64_support_atomics bool
 
 	goarm                uint8 // set by cmd/link on arm systems
 	framepointer_enabled bool  // set by cmd/link
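The `pad cpu.CacheLinePad` change above (and the matching edits in sema.go and time.go below) moves the false-sharing padding from a hand-sized byte array to a type provided by internal/cpu. The idea is easy to reproduce outside the runtime; here is a minimal sketch where the 64-byte line size is an assumption, not something queried from the CPU:

	package main

	import (
		"fmt"
		"unsafe"
	)

	// cacheLineSize is an assumption (64 bytes on most x86); the
	// runtime gets the real per-arch value from internal/cpu.
	const cacheLineSize = 64

	type bucket struct {
		count uint64
	}

	// padded rounds each bucket up to a cache-line multiple so that
	// adjacent array elements never share a line and concurrent
	// writers don't ping-pong the same line between cores.
	type padded struct {
		bucket
		_ [cacheLineSize - unsafe.Sizeof(bucket{})%cacheLineSize]byte
	}

	func main() {
		fmt.Println(unsafe.Sizeof(padded{})) // 64
	}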
diff --git a/src/runtime/sema.go b/src/runtime/sema.go
index aba9733127..18e0a398ba 100644
--- a/src/runtime/sema.go
+++ b/src/runtime/sema.go
@@ -20,8 +20,8 @@
 package runtime
 
 import (
+	"internal/cpu"
 	"runtime/internal/atomic"
-	"runtime/internal/sys"
 	"unsafe"
 )
@@ -48,7 +48,7 @@ const semTabSize = 251
 
 var semtable [semTabSize]struct {
 	root semaRoot
-	pad  [sys.CacheLineSize - unsafe.Sizeof(semaRoot{})]byte
+	pad  [cpu.CacheLinePadSize - unsafe.Sizeof(semaRoot{})]byte
 }
 
 //go:linkname sync_runtime_Semacquire sync.runtime_Semacquire
diff --git a/src/runtime/signal_windows.go b/src/runtime/signal_windows.go
index 500b02880d..a63450038d 100644
--- a/src/runtime/signal_windows.go
+++ b/src/runtime/signal_windows.go
@@ -38,6 +38,13 @@ func initExceptionHandler() {
 	}
 }
 
+// isgoexception returns true if this exception should be translated
+// into a Go panic.
+//
+// It is nosplit to avoid growing the stack in case we're aborting
+// because of a stack overflow.
+//
+//go:nosplit
 func isgoexception(info *exceptionrecord, r *context) bool {
 	// Only handle exception if executing instructions in Go binary
 	// (not Windows library code).
@@ -46,7 +53,9 @@ func isgoexception(info *exceptionrecord, r *context) bool {
 		return false
 	}
 
-	if isAbortPC(r.ip()) {
+	// In the case of an abort, the exception IP is one byte after
+	// the INT3 (this differs from UNIX OSes).
+	if isAbortPC(r.ip() - 1) {
 		// Never turn abort into a panic.
 		return false
 	}
@@ -71,11 +80,19 @@ func isgoexception(info *exceptionrecord, r *context) bool {
 // Called by sigtramp from Windows VEH handler.
 // Return value signals whether the exception has been handled (EXCEPTION_CONTINUE_EXECUTION)
 // or should be made available to other handlers in the chain (EXCEPTION_CONTINUE_SEARCH).
+//
+// This is the first entry into Go code for exception handling. This
+// is nosplit to avoid growing the stack until we've checked for
+// _EXCEPTION_BREAKPOINT, which is raised if we overflow the g0 stack.
+//
+//go:nosplit
 func exceptionhandler(info *exceptionrecord, r *context, gp *g) int32 {
 	if !isgoexception(info, r) {
 		return _EXCEPTION_CONTINUE_SEARCH
 	}
 
+	// After this point, it is safe to grow the stack.
+
 	if gp.throwsplit {
 		// We can't safely sigpanic because it may grow the
 		// stack. Let it fall through.
@@ -111,6 +128,10 @@ func exceptionhandler(info *exceptionrecord, r *context, gp *g) int32 {
 // if ExceptionHandler returns EXCEPTION_CONTINUE_EXECUTION.
 // firstcontinuehandler will stop that search,
 // if exceptionhandler did the same earlier.
+//
+// It is nosplit for the same reason as exceptionhandler.
+//
+//go:nosplit
 func firstcontinuehandler(info *exceptionrecord, r *context, gp *g) int32 {
 	if !isgoexception(info, r) {
 		return _EXCEPTION_CONTINUE_SEARCH
@@ -122,6 +143,10 @@ var testingWER bool
 
 // lastcontinuehandler is reached, because runtime cannot handle
 // current exception. lastcontinuehandler will print crash info and exit.
+//
+// It is nosplit for the same reason as exceptionhandler.
+//
+//go:nosplit
 func lastcontinuehandler(info *exceptionrecord, r *context, gp *g) int32 {
 	if testingWER {
 		return _EXCEPTION_CONTINUE_SEARCH
@@ -134,6 +159,13 @@ func lastcontinuehandler(info *exceptionrecord, r *context, gp *g) int32 {
 	}
 	panicking = 1
 
+	// In case we're handling a g0 stack overflow, blow away the
+	// g0 stack bounds so we have room to print the traceback. If
+	// this somehow overflows the stack, the OS will trap it.
+	_g_.stack.lo = 0
+	_g_.stackguard0 = _g_.stack.lo + _StackGuard
+	_g_.stackguard1 = _g_.stackguard0
+
 	print("Exception ", hex(info.exceptioncode), " ", hex(info.exceptioninformation[0]), " ", hex(info.exceptioninformation[1]), " ", hex(r.ip()), "\n")
 
 	print("PC=", hex(r.ip()), "\n")
diff --git a/src/runtime/sigqueue.go b/src/runtime/sigqueue.go
index 9f53240954..a425433b20 100644
--- a/src/runtime/sigqueue.go
+++ b/src/runtime/sigqueue.go
@@ -237,8 +237,10 @@ func signal_ignore(s uint32) {
 	atomic.Store(&sig.ignored[s/32], i)
 }
 
-// sigInitIgnored marks the signal as already ignored. This is called at
-// program start by siginit.
+// sigInitIgnored marks the signal as already ignored. This is called at
+// program start by initsig. In a shared library initsig is called by
+// libpreinit, so the runtime may not be initialized yet.
+//go:nosplit
 func sigInitIgnored(s uint32) {
 	i := sig.ignored[s/32]
 	i |= 1 << (s & 31)
diff --git a/src/runtime/slice.go b/src/runtime/slice.go
index fd5d08b52c..4206f4384a 100644
--- a/src/runtime/slice.go
+++ b/src/runtime/slice.go
@@ -195,21 +195,19 @@ func growslice(et *_type, old slice, cap int) slice {
 	var p unsafe.Pointer
 	if et.kind&kindNoPointers != 0 {
 		p = mallocgc(capmem, nil, false)
-		memmove(p, old.array, lenmem)
 		// The append() that calls growslice is going to overwrite from old.len to cap (which will be the new length).
 		// Only clear the part that will not be overwritten.
 		memclrNoHeapPointers(add(p, newlenmem), capmem-newlenmem)
 	} else {
 		// Note: can't use rawmem (which avoids zeroing of memory), because then GC can scan uninitialized memory.
 		p = mallocgc(capmem, et, true)
-		if !writeBarrier.enabled {
-			memmove(p, old.array, lenmem)
-		} else {
-			for i := uintptr(0); i < lenmem; i += et.size {
-				typedmemmove(et, add(p, i), add(old.array, i))
-			}
+		if writeBarrier.enabled {
+			// Only shade the pointers in old.array since we know the destination slice p
+			// only contains nil pointers because it has been cleared during alloc.
+			bulkBarrierPreWriteSrcOnly(uintptr(p), uintptr(old.array), lenmem)
 		}
 	}
+	memmove(p, old.array, lenmem)
 
 	return slice{p, old.len, newcap}
 }
diff --git a/src/runtime/stack.go b/src/runtime/stack.go
index 648603db35..c7bfc0434b 100644
--- a/src/runtime/stack.go
+++ b/src/runtime/stack.go
@@ -186,6 +186,7 @@ func stackpoolalloc(order uint8) gclinkptr {
 		if s.manualFreeList.ptr() != nil {
 			throw("bad manualFreeList")
 		}
+		osStackAlloc(s)
 		s.elemsize = _FixedStack << order
 		for i := uintptr(0); i < _StackCacheSize; i += s.elemsize {
 			x := gclinkptr(s.base() + i)
@@ -238,6 +239,7 @@ func stackpoolfree(x gclinkptr, order uint8) {
 		// By not freeing, we prevent step #4 until GC is done.
 		stackpool[order].remove(s)
 		s.manualFreeList = 0
+		osStackFree(s)
 		mheap_.freeManual(s, &memstats.stacks_inuse)
 	}
 }
@@ -385,6 +387,7 @@ func stackalloc(n uint32) stack {
 		if s == nil {
 			throw("out of memory")
 		}
+		osStackAlloc(s)
 		s.elemsize = uintptr(n)
 	}
 	v = unsafe.Pointer(s.base())
@@ -463,6 +466,7 @@ func stackfree(stk stack) {
 			if gcphase == _GCoff {
 				// Free the stack immediately if we're
 				// sweeping.
+				osStackFree(s)
 				mheap_.freeManual(s, &memstats.stacks_inuse)
 			} else {
 				// If the GC is running, we can't return a
@@ -1112,6 +1116,7 @@ func freeStackSpans() {
 			if s.allocCount == 0 {
 				list.remove(s)
 				s.manualFreeList = 0
+				osStackFree(s)
 				mheap_.freeManual(s, &memstats.stacks_inuse)
 			}
 			s = next
@@ -1126,6 +1131,7 @@ func freeStackSpans() {
 		for s := stackLarge.free[i].first; s != nil; {
 			next := s.next
 			stackLarge.free[i].remove(s)
+			osStackFree(s)
 			mheap_.freeManual(s, &memstats.stacks_inuse)
 			s = next
 		}
diff --git a/src/runtime/stack_test.go b/src/runtime/stack_test.go
index 5d674470c1..dc65395141 100644
--- a/src/runtime/stack_test.go
+++ b/src/runtime/stack_test.go
@@ -7,9 +7,11 @@ package runtime_test
 import (
 	"bytes"
 	"fmt"
+	"os"
 	"reflect"
 	"regexp"
 	. "runtime"
+	"strconv"
 	"strings"
 	"sync"
 	"sync/atomic"
@@ -126,9 +128,18 @@ func TestStackGrowth(t *testing.T) {
 		}()
 		<-done
 		GC()
+
+		timeout := 20 * time.Second
+		if s := os.Getenv("GO_TEST_TIMEOUT_SCALE"); s != "" {
+			scale, err := strconv.Atoi(s)
+			if err == nil {
+				timeout *= time.Duration(scale)
+			}
+		}
+
 		select {
 		case <-done:
-		case <-time.After(20 * time.Second):
+		case <-time.After(timeout):
 			if atomic.LoadUint32(&started) == 0 {
 				t.Log("finalizer did not start")
 			} else {
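The stack_test.go hunk above is the standard pattern for keeping a hard-coded deadline usable on slow builders: scale it by the GO_TEST_TIMEOUT_SCALE environment variable. Pulled out as a hypothetical reusable helper (the function name is ours, not the test's):

	package main

	import (
		"fmt"
		"os"
		"strconv"
		"time"
	)

	// scaledTimeout stretches a base deadline by GO_TEST_TIMEOUT_SCALE,
	// mirroring the inline logic added to TestStackGrowth.
	func scaledTimeout(base time.Duration) time.Duration {
		if s := os.Getenv("GO_TEST_TIMEOUT_SCALE"); s != "" {
			if scale, err := strconv.Atoi(s); err == nil {
				return base * time.Duration(scale)
			}
		}
		return base
	}

	func main() {
		fmt.Println(scaledTimeout(20 * time.Second))
	}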
diff --git a/src/runtime/string.go b/src/runtime/string.go
index 6e42483b13..d10bd96f43 100644
--- a/src/runtime/string.go
+++ b/src/runtime/string.go
@@ -333,7 +333,7 @@ func index(s, t string) int {
 		return 0
 	}
 	for i := 0; i < len(s); i++ {
-		if s[i] == t[0] && hasprefix(s[i:], t) {
+		if s[i] == t[0] && hasPrefix(s[i:], t) {
 			return i
 		}
 	}
@@ -344,8 +344,8 @@ func contains(s, t string) bool {
 	return index(s, t) >= 0
 }
 
-func hasprefix(s, t string) bool {
-	return len(s) >= len(t) && s[:len(t)] == t
+func hasPrefix(s, prefix string) bool {
+	return len(s) >= len(prefix) && s[:len(prefix)] == prefix
 }
 
 const (
diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go
index 74b385d596..632b1e2293 100644
--- a/src/runtime/stubs.go
+++ b/src/runtime/stubs.go
@@ -178,7 +178,7 @@ func goexit(neverCallThisFunction)
 // cgocallback_gofunc is not called from go, only from cgocallback,
 // so the arguments will be found via cgocallback's pointer-declared arguments.
 // See the assembly implementations for more details.
-func cgocallback_gofunc(fv uintptr, frame uintptr, framesize, ctxt uintptr)
+func cgocallback_gofunc(fv, frame, framesize, ctxt uintptr)
 
 // publicationBarrier performs a store/store barrier (a "publication"
 // or "export" barrier). Some form of synchronization is required
diff --git a/src/runtime/stubs_x86.go b/src/runtime/stubs_x86.go
new file mode 100644
index 0000000000..830c48bd01
--- /dev/null
+++ b/src/runtime/stubs_x86.go
@@ -0,0 +1,10 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64 amd64p32 386
+
+package runtime
+
+// stackcheck checks that SP is in range [g->stack.lo, g->stack.hi).
+func stackcheck()
diff --git a/src/runtime/sys_darwin.go b/src/runtime/sys_darwin.go
index f0d0815903..7efbef746c 100644
--- a/src/runtime/sys_darwin.go
+++ b/src/runtime/sys_darwin.go
@@ -18,12 +18,30 @@ func libcCall(fn, arg unsafe.Pointer) int32 {
 	if gp != nil {
 		mp = gp.m
 	}
-	if mp != nil {
+	if mp != nil && mp.libcallsp == 0 {
 		mp.libcallg.set(gp)
 		mp.libcallpc = getcallerpc()
 		// sp must be the last, because once async cpu profiler finds
 		// all three values to be non-zero, it will use them
 		mp.libcallsp = getcallersp()
+	} else {
+		// Make sure we don't reset libcallsp. This makes
+		// libcCall reentrant; we remember the g/pc/sp for the
+		// first call on an M, until that libcCall instance
+		// returns. Reentrance only matters for signals, as
+		// libc never calls back into Go. The tricky case is
+		// where we call libcX from an M and record g/pc/sp.
+		// Before that call returns, a signal arrives on the
+		// same M and the signal handling code calls another
+		// libc function. We don't want that second libcCall
+		// from within the handler to be recorded, and we
+		// don't want that call's completion to zero
+		// libcallsp.
+		// We don't need to set libcall* while we're in a sighandler
+		// (even if we're not currently in libc) because we block all
+		// signals while we're handling a signal. That includes the
+		// profile signal, which is the one that uses the libcall* info.
+		mp = nil
 	}
 	res := asmcgocall(fn, arg)
 	if mp != nil {
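The libcCall fix above is a "record the outermost call only" discipline: nested entries (from a signal handler) must neither overwrite the saved g/pc/sp nor zero it on their way out. A minimal sketch of the same invariant, with a hypothetical thread type standing in for the runtime's m:

	package main

	import "fmt"

	// thread stands in for the runtime's m; callPC/callSP sketch
	// mp.libcallpc and mp.libcallsp.
	type thread struct {
		callPC, callSP uintptr
	}

	// enterC records the Go caller's state only for the outermost
	// call, like libcCall: a nested call must leave the record alone.
	func enterC(t *thread, pc, sp uintptr) (outermost bool) {
		if t.callSP != 0 {
			return false // already in C on this thread
		}
		t.callPC, t.callSP = pc, sp
		return true
	}

	// exitC clears the record only when the outermost call returns.
	func exitC(t *thread, outermost bool) {
		if outermost {
			t.callPC, t.callSP = 0, 0
		}
	}

	func main() {
		var t thread
		outer := enterC(&t, 0x100, 0x200)
		inner := enterC(&t, 0x300, 0x400) // nested: no-op
		exitC(&t, inner)                  // must not zero the record
		fmt.Println(outer, inner, t.callSP != 0) // true false true
	}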
diff --git a/src/runtime/sys_darwin_386.s b/src/runtime/sys_darwin_386.s
index 09f12283a1..4bfb9b8362 100644
--- a/src/runtime/sys_darwin_386.s
+++ b/src/runtime/sys_darwin_386.s
@@ -326,6 +326,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$0
 	ADDL	$28, SP
 	RET
 
+TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0
+	JMP	runtime·sigtramp(SB)
+
 TEXT runtime·usleep_trampoline(SB),NOSPLIT,$0
 	PUSHL	BP
 	MOVL	SP, BP
diff --git a/src/runtime/sys_darwin_amd64.s b/src/runtime/sys_darwin_amd64.s
index 142933585d..db74352613 100644
--- a/src/runtime/sys_darwin_amd64.s
+++ b/src/runtime/sys_darwin_amd64.s
@@ -215,6 +215,82 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$0
 	POPQ	BP
 	RET
 
+// Used instead of sigtramp in programs that use cgo.
+// Arguments from kernel are in DI, SI, DX.
+TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0
+	// If no traceback function, do usual sigtramp.
+	MOVQ	runtime·cgoTraceback(SB), AX
+	TESTQ	AX, AX
+	JZ	sigtramp
+
+	// If no traceback support function, which means that
+	// runtime/cgo was not linked in, do usual sigtramp.
+	MOVQ	_cgo_callers(SB), AX
+	TESTQ	AX, AX
+	JZ	sigtramp
+
+	// Figure out if we are currently in a cgo call.
+	// If not, just do usual sigtramp.
+	get_tls(CX)
+	MOVQ	g(CX),AX
+	TESTQ	AX, AX
+	JZ	sigtrampnog	// g == nil
+	MOVQ	g_m(AX), AX
+	TESTQ	AX, AX
+	JZ	sigtramp	// g.m == nil
+	MOVL	m_ncgo(AX), CX
+	TESTL	CX, CX
+	JZ	sigtramp	// g.m.ncgo == 0
+	MOVQ	m_curg(AX), CX
+	TESTQ	CX, CX
+	JZ	sigtramp	// g.m.curg == nil
+	MOVQ	g_syscallsp(CX), CX
+	TESTQ	CX, CX
+	JZ	sigtramp	// g.m.curg.syscallsp == 0
+	MOVQ	m_cgoCallers(AX), R8
+	TESTQ	R8, R8
+	JZ	sigtramp	// g.m.cgoCallers == nil
+	MOVL	m_cgoCallersUse(AX), CX
+	TESTL	CX, CX
+	JNZ	sigtramp	// g.m.cgoCallersUse != 0
+
+	// Jump to a function in runtime/cgo.
+	// That function, written in C, will call the user's traceback
+	// function with proper unwind info, and will then call back here.
+	// The first three arguments, and the fifth, are already in registers.
+	// Set the two remaining arguments now.
+	MOVQ	runtime·cgoTraceback(SB), CX
+	MOVQ	$runtime·sigtramp(SB), R9
+	MOVQ	_cgo_callers(SB), AX
+	JMP	AX
+
+sigtramp:
+	JMP	runtime·sigtramp(SB)
+
+sigtrampnog:
+	// Signal arrived on a non-Go thread. If this is SIGPROF, get a
+	// stack trace.
+	CMPL	DI, $27 // 27 == SIGPROF
+	JNZ	sigtramp
+
+	// Lock sigprofCallersUse.
+	MOVL	$0, AX
+	MOVL	$1, CX
+	MOVQ	$runtime·sigprofCallersUse(SB), R11
+	LOCK
+	CMPXCHGL	CX, 0(R11)
+	JNZ	sigtramp	// Skip stack trace if already locked.
+
+	// Jump to the traceback function in runtime/cgo.
+	// It will call back to sigprofNonGo, which will ignore the
+	// arguments passed in registers.
+	// First three arguments to traceback function are in registers already.
+	MOVQ	runtime·cgoTraceback(SB), CX
+	MOVQ	$runtime·sigprofCallers(SB), R8
+	MOVQ	$runtime·sigprofNonGo(SB), R9
+	MOVQ	_cgo_callers(SB), AX
+	JMP	AX
+
 TEXT runtime·mmap_trampoline(SB),NOSPLIT,$0
 	PUSHQ	BP			// make a frame; keep stack aligned
 	MOVQ	SP, BP
@@ -230,7 +306,7 @@ TEXT runtime·mmap_trampoline(SB),NOSPLIT,$0
 	CMPQ	AX, $-1
 	JNE	ok
 	CALL	libc_error(SB)
-	MOVQ	(AX), DX		// errno
+	MOVLQSX	(AX), DX		// errno
 	XORL	AX, AX
 ok:
 	MOVQ	AX, 32(BX)
@@ -295,7 +371,7 @@ TEXT runtime·kevent_trampoline(SB),NOSPLIT,$0
 	CMPQ	AX, $-1
 	JNE	ok
 	CALL	libc_error(SB)
-	MOVQ	(AX), AX		// errno
+	MOVLQSX	(AX), AX		// errno
 	NEGQ	AX			// caller wants it as a negative error code
 ok:
 	POPQ	BP
diff --git a/src/runtime/sys_darwin_arm.s b/src/runtime/sys_darwin_arm.s
index 9b5c667f45..7a269cf576 100644
--- a/src/runtime/sys_darwin_arm.s
+++ b/src/runtime/sys_darwin_arm.s
@@ -227,6 +227,9 @@ nog:
 
 	RET
 
+TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0
+	JMP	runtime·sigtramp(SB)
+
 TEXT runtime·sigprocmask_trampoline(SB),NOSPLIT,$0
 	MOVW	4(R0), R1	// arg 2 new
 	MOVW	8(R0), R2	// arg 3 old
diff --git a/src/runtime/sys_darwin_arm64.s b/src/runtime/sys_darwin_arm64.s
index c324994d26..d7ba116b84 100644
--- a/src/runtime/sys_darwin_arm64.s
+++ b/src/runtime/sys_darwin_arm64.s
@@ -70,7 +70,7 @@ TEXT runtime·mmap_trampoline(SB),NOSPLIT,$0
 	CMP	R0, R2
 	BNE	ok
 	BL	libc_error(SB)
-	MOVD	(R0), R1
+	MOVW	(R0), R1
 	MOVD	$0, R0
 ok:
 	MOVD	R0, 32(R19)	// ret 1 p
@@ -223,6 +223,9 @@ nog:
 
 	RET
 
+TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0
+	JMP	runtime·sigtramp(SB)
+
 TEXT runtime·sigprocmask_trampoline(SB),NOSPLIT,$0
 	MOVD	8(R0), R1	// arg 2 new
 	MOVD	16(R0), R2	// arg 3 old
@@ -274,7 +277,7 @@ TEXT runtime·kevent_trampoline(SB),NOSPLIT,$0
 	CMP	R0, R2
 	BNE	ok
 	BL	libc_error(SB)
-	MOVD	(R0), R0	// errno
+	MOVW	(R0), R0	// errno
 	NEG	R0, R0	// caller wants it as a negative error code
 ok:
 	RET
diff --git a/src/runtime/sys_linux_arm64.s b/src/runtime/sys_linux_arm64.s
index c6afd76a65..1c8fce3db6 100644
--- a/src/runtime/sys_linux_arm64.s
+++ b/src/runtime/sys_linux_arm64.s
@@ -239,7 +239,7 @@ TEXT runtime·nanotime(SB),NOSPLIT,$24-8
 	MOVD	(g_sched+gobuf_sp)(R3), R1	// Set RSP to g0 stack
 
 noswitch:
-	SUB	$16, R1
+	SUB	$32, R1
 	BIC	$15, R1
 	MOVD	R1, RSP
@@ -298,7 +298,9 @@ TEXT runtime·callCgoSigaction(SB),NOSPLIT,$0
 	MOVD	new+8(FP), R1
 	MOVD	old+16(FP), R2
 	MOVD	_cgo_sigaction(SB), R3
+	SUB	$16, RSP	// reserve 16 bytes for sp-8 where fp may be saved.
 	BL	R3
+	ADD	$16, RSP
 	MOVW	R0, ret+24(FP)
 	RET
@@ -361,7 +363,9 @@ TEXT runtime·callCgoMmap(SB),NOSPLIT,$0
 	MOVW	fd+24(FP), R4
 	MOVW	off+28(FP), R5
 	MOVD	_cgo_mmap(SB), R9
+	SUB	$16, RSP	// reserve 16 bytes for sp-8 where fp may be saved.
 	BL	R9
+	ADD	$16, RSP
 	MOVD	R0, ret+32(FP)
 	RET
@@ -382,7 +386,9 @@ TEXT runtime·callCgoMunmap(SB),NOSPLIT,$0
 	MOVD	addr+0(FP), R0
 	MOVD	n+8(FP), R1
 	MOVD	_cgo_munmap(SB), R9
+	SUB	$16, RSP	// reserve 16 bytes for sp-8 where fp may be saved.
 	BL	R9
+	ADD	$16, RSP
 	RET
 
 TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0
diff --git a/src/runtime/sys_linux_mips64x.s b/src/runtime/sys_linux_mips64x.s
index 9ce810a6b6..8e64f1c562 100644
--- a/src/runtime/sys_linux_mips64x.s
+++ b/src/runtime/sys_linux_mips64x.s
@@ -218,6 +218,8 @@ TEXT runtime·rt_sigaction(SB),NOSPLIT|NOFRAME,$0-36
 	MOVV	size+24(FP), R7
 	MOVV	$SYS_rt_sigaction, R2
 	SYSCALL
+	BEQ	R7, 2(PC)
+	SUBVU	R2, R0, R2	// caller expects negative errno
 	MOVW	R2, ret+32(FP)
 	RET
@@ -299,6 +301,8 @@ TEXT runtime·futex(SB),NOSPLIT|NOFRAME,$0
 	MOVW	val3+32(FP), R9
 	MOVV	$SYS_futex, R2
 	SYSCALL
+	BEQ	R7, 2(PC)
+	SUBVU	R2, R0, R2	// caller expects negative errno
 	MOVW	R2, ret+40(FP)
 	RET
@@ -321,6 +325,8 @@ TEXT runtime·clone(SB),NOSPLIT|NOFRAME,$0
 	MOVV	$SYS_clone, R2
 	SYSCALL
+	BEQ	R7, 2(PC)
+	SUBVU	R2, R0, R2	// caller expects negative errno
 
 	// In parent, return.
 	BEQ	R2, 3(PC)
@@ -383,6 +389,8 @@ TEXT runtime·sched_getaffinity(SB),NOSPLIT|NOFRAME,$0
 	MOVV	buf+16(FP), R6
 	MOVV	$SYS_sched_getaffinity, R2
 	SYSCALL
+	BEQ	R7, 2(PC)
+	SUBVU	R2, R0, R2	// caller expects negative errno
 	MOVW	R2, ret+24(FP)
 	RET
@@ -391,6 +399,8 @@ TEXT runtime·epollcreate(SB),NOSPLIT|NOFRAME,$0
 	MOVW	size+0(FP), R4
 	MOVV	$SYS_epoll_create, R2
 	SYSCALL
+	BEQ	R7, 2(PC)
+	SUBVU	R2, R0, R2	// caller expects negative errno
 	MOVW	R2, ret+8(FP)
 	RET
@@ -399,6 +409,8 @@ TEXT runtime·epollcreate1(SB),NOSPLIT|NOFRAME,$0
 	MOVW	flags+0(FP), R4
 	MOVV	$SYS_epoll_create1, R2
 	SYSCALL
+	BEQ	R7, 2(PC)
+	SUBVU	R2, R0, R2	// caller expects negative errno
 	MOVW	R2, ret+8(FP)
 	RET
@@ -424,6 +436,8 @@ TEXT runtime·epollwait(SB),NOSPLIT|NOFRAME,$0
 	MOVV	$0, R8
 	MOVV	$SYS_epoll_pwait, R2
 	SYSCALL
+	BEQ	R7, 2(PC)
+	SUBVU	R2, R0, R2	// caller expects negative errno
 	MOVW	R2, ret+24(FP)
 	RET
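The repeated BEQ/SUBVU pairs added above (and the BVC/NEG pairs on ppc64 below) all implement one convention: MIPS and PPC kernels report failure out of band (the R7 register, or the summary-overflow bit) with a positive errno in the result register, while Go's callers expect the Linux/x86 style -errno. A sketch of the normalization in Go terms:

	package main

	import "fmt"

	// sysRet models what the added instructions do: if the kernel's
	// error flag is set, negate the positive errno in the result
	// register so callers see the usual -errno convention.
	func sysRet(r uintptr, errFlag bool) int32 {
		if errFlag {
			return -int32(r)
		}
		return int32(r)
	}

	func main() {
		fmt.Println(sysRet(3, false)) // 3: success, e.g. a file descriptor
		fmt.Println(sysRet(9, true))  // -9: failure, -EBADF
	}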
diff --git a/src/runtime/sys_linux_mipsx.s b/src/runtime/sys_linux_mipsx.s
index 95f6367893..a6bca3bebd 100644
--- a/src/runtime/sys_linux_mipsx.s
+++ b/src/runtime/sys_linux_mipsx.s
@@ -234,6 +234,8 @@ TEXT runtime·rt_sigaction(SB),NOSPLIT,$0-20
 	MOVW	size+12(FP), R7
 	MOVW	$SYS_rt_sigaction, R2
 	SYSCALL
+	BEQ	R7, 2(PC)
+	SUBU	R2, R0, R2	// caller expects negative errno
 	MOVW	R2, ret+16(FP)
 	RET
@@ -320,6 +322,8 @@ TEXT runtime·futex(SB),NOSPLIT,$20-28
 	MOVW	$SYS_futex, R2
 	SYSCALL
+	BEQ	R7, 2(PC)
+	SUBU	R2, R0, R2	// caller expects negative errno
 	MOVW	R2, ret+24(FP)
 	RET
@@ -351,11 +355,11 @@ TEXT runtime·clone(SB),NOSPLIT|NOFRAME,$0-24
 	MOVW	$SYS_clone, R2
 	SYSCALL
+	BEQ	R7, 2(PC)
+	SUBU	R2, R0, R2	// caller expects negative errno
 
 	// In parent, return.
-	BEQ	R2, 5(PC)
-	SUBU	R2, R0, R3
-	CMOVN	R7, R3, R2
+	BEQ	R2, 3(PC)
 	MOVW	R2, ret+20(FP)
 	RET
@@ -417,6 +421,8 @@ TEXT runtime·sched_getaffinity(SB),NOSPLIT,$0-16
 	MOVW	buf+8(FP), R6
 	MOVW	$SYS_sched_getaffinity, R2
 	SYSCALL
+	BEQ	R7, 2(PC)
+	SUBU	R2, R0, R2	// caller expects negative errno
 	MOVW	R2, ret+12(FP)
 	RET
@@ -425,6 +431,8 @@ TEXT runtime·epollcreate(SB),NOSPLIT,$0-8
 	MOVW	size+0(FP), R4
 	MOVW	$SYS_epoll_create, R2
 	SYSCALL
+	BEQ	R7, 2(PC)
+	SUBU	R2, R0, R2	// caller expects negative errno
 	MOVW	R2, ret+4(FP)
 	RET
@@ -433,6 +441,8 @@ TEXT runtime·epollcreate1(SB),NOSPLIT,$0-8
 	MOVW	flags+0(FP), R4
 	MOVW	$SYS_epoll_create1, R2
 	SYSCALL
+	BEQ	R7, 2(PC)
+	SUBU	R2, R0, R2	// caller expects negative errno
 	MOVW	R2, ret+4(FP)
 	RET
@@ -456,6 +466,8 @@ TEXT runtime·epollwait(SB),NOSPLIT,$0-20
 	MOVW	timeout+12(FP), R7
 	MOVW	$SYS_epoll_wait, R2
 	SYSCALL
+	BEQ	R7, 2(PC)
+	SUBU	R2, R0, R2	// caller expects negative errno
 	MOVW	R2, ret+16(FP)
 	RET
diff --git a/src/runtime/sys_linux_ppc64x.s b/src/runtime/sys_linux_ppc64x.s
index b7d34b00da..075adf2368 100644
--- a/src/runtime/sys_linux_ppc64x.s
+++ b/src/runtime/sys_linux_ppc64x.s
@@ -154,21 +154,87 @@ TEXT runtime·mincore(SB),NOSPLIT|NOFRAME,$0-28
 
 // func walltime() (sec int64, nsec int32)
 TEXT runtime·walltime(SB),NOSPLIT,$16
-	MOVD	$0, R3 // CLOCK_REALTIME
-	MOVD	$0(R1), R4
-	SYSCALL	$SYS_clock_gettime
-	MOVD	0(R1), R3	// sec
-	MOVD	8(R1), R5	// nsec
+	MOVD	R1, R15		// R15 is unchanged by C code
+	MOVD	g_m(g), R21	// R21 = m
+
+	MOVD	$0, R3 // CLOCK_REALTIME
+
+	MOVD	runtime·vdsoClockgettimeSym(SB), R12	// Check for VDSO availability
+	CMP	R12, R0
+	BEQ	fallback
+
+	// Set vdsoPC and vdsoSP for SIGPROF traceback.
+	MOVD	LR, R14
+	MOVD	R14, m_vdsoPC(R21)
+	MOVD	R15, m_vdsoSP(R21)
+
+	MOVD	m_curg(R21), R6
+	CMP	g, R6
+	BNE	noswitch
+
+	MOVD	m_g0(R21), R7
+	MOVD	(g_sched+gobuf_sp)(R7), R1	// Set SP to g0 stack
+
+noswitch:
+	SUB	$16, R1			// Space for results
+	RLDICR	$0, R1, $59, R1		// Align for C code
+	MOVD	R12, CTR
+	MOVD	R1, R4
+	BL	(CTR)			// Call from VDSO
+	MOVD	$0, R0			// Restore R0
+	MOVD	R0, m_vdsoSP(R21)	// Clear vdsoSP
+	MOVD	0(R1), R3	// sec
+	MOVD	8(R1), R5	// nsec
+	MOVD	R15, R1		// Restore SP
+
+finish:
 	MOVD	R3, sec+0(FP)
 	MOVW	R5, nsec+8(FP)
 	RET
 
+	// Syscall fallback
+fallback:
+	ADD	$32, R1, R4
+	SYSCALL	$SYS_clock_gettime
+	MOVD	32(R1), R3
+	MOVD	40(R1), R5
+	JMP	finish
+
 TEXT runtime·nanotime(SB),NOSPLIT,$16
-	MOVW	$1, R3 // CLOCK_MONOTONIC
-	MOVD	$0(R1), R4
-	SYSCALL	$SYS_clock_gettime
-	MOVD	0(R1), R3	// sec
-	MOVD	8(R1), R5	// nsec
+	MOVD	$1, R3		// CLOCK_MONOTONIC
+
+	MOVD	R1, R15		// R15 is unchanged by C code
+	MOVD	g_m(g), R21	// R21 = m
+
+	MOVD	runtime·vdsoClockgettimeSym(SB), R12	// Check for VDSO availability
+	CMP	R12, R0
+	BEQ	fallback
+
+	// Set vdsoPC and vdsoSP for SIGPROF traceback.
+	MOVD	LR, R14		// R14 is unchanged by C code
+	MOVD	R14, m_vdsoPC(R21)
+	MOVD	R15, m_vdsoSP(R21)
+
+	MOVD	m_curg(R21), R6
+	CMP	g, R6
+	BNE	noswitch
+
+	MOVD	m_g0(R21), R7
+	MOVD	(g_sched+gobuf_sp)(R7), R1	// Set SP to g0 stack
+
+noswitch:
+	SUB	$16, R1			// Space for results
+	RLDICR	$0, R1, $59, R1		// Align for C code
+	MOVD	R12, CTR
+	MOVD	R1, R4
+	BL	(CTR)			// Call from VDSO
+	MOVD	$0, R0			// Restore R0
+	MOVD	$0, m_vdsoSP(R21)	// Clear vdsoSP
+	MOVD	0(R1), R3	// sec
+	MOVD	8(R1), R5	// nsec
+	MOVD	R15, R1		// Restore SP
+
+finish:
 	// sec is in R3, nsec in R5
 	// return nsec in R3
 	MOVD	$1000000000, R4
@@ -177,6 +243,14 @@ TEXT runtime·nanotime(SB),NOSPLIT,$16
 	MOVD	R3, ret+0(FP)
 	RET
 
+	// Syscall fallback
+fallback:
+	ADD	$32, R1, R4
+	SYSCALL	$SYS_clock_gettime
+	MOVD	32(R1), R3
+	MOVD	48(R1), R5
+	JMP	finish
+
 TEXT runtime·rtsigprocmask(SB),NOSPLIT|NOFRAME,$0-28
 	MOVW	how+0(FP), R3
 	MOVD	new+8(FP), R4
@@ -193,6 +267,8 @@ TEXT runtime·rt_sigaction(SB),NOSPLIT|NOFRAME,$0-36
 	MOVD	old+16(FP), R5
 	MOVD	size+24(FP), R6
 	SYSCALL	$SYS_rt_sigaction
+	BVC	2(PC)
+	NEG	R3	// caller expects negative errno
 	MOVW	R3, ret+32(FP)
 	RET
@@ -388,6 +464,8 @@ TEXT runtime·futex(SB),NOSPLIT|NOFRAME,$0
 	MOVD	addr2+24(FP), R7
 	MOVW	val3+32(FP), R8
 	SYSCALL	$SYS_futex
+	BVC	2(PC)
+	NEG	R3	// caller expects negative errno
 	MOVW	R3, ret+40(FP)
 	RET
@@ -409,6 +487,8 @@ TEXT runtime·clone(SB),NOSPLIT|NOFRAME,$0
 	MOVD	R7, -32(R4)
 
 	SYSCALL	$SYS_clone
+	BVC	2(PC)
+	NEG	R3	// caller expects negative errno
 
 	// In parent, return.
 	CMP	R3, $0
@@ -472,6 +552,8 @@ TEXT runtime·sched_getaffinity(SB),NOSPLIT|NOFRAME,$0
 	MOVD	len+8(FP), R4
 	MOVD	buf+16(FP), R5
 	SYSCALL	$SYS_sched_getaffinity
+	BVC	2(PC)
+	NEG	R3	// caller expects negative errno
 	MOVW	R3, ret+24(FP)
 	RET
@@ -479,6 +561,8 @@ TEXT runtime·sched_getaffinity(SB),NOSPLIT|NOFRAME,$0
 TEXT runtime·epollcreate(SB),NOSPLIT|NOFRAME,$0
 	MOVW	size+0(FP), R3
 	SYSCALL	$SYS_epoll_create
+	BVC	2(PC)
+	NEG	R3	// caller expects negative errno
 	MOVW	R3, ret+8(FP)
 	RET
@@ -486,6 +570,8 @@ TEXT runtime·epollcreate(SB),NOSPLIT|NOFRAME,$0
 TEXT runtime·epollcreate1(SB),NOSPLIT|NOFRAME,$0
 	MOVW	flags+0(FP), R3
 	SYSCALL	$SYS_epoll_create1
+	BVC	2(PC)
+	NEG	R3	// caller expects negative errno
 	MOVW	R3, ret+8(FP)
 	RET
@@ -507,6 +593,8 @@ TEXT runtime·epollwait(SB),NOSPLIT|NOFRAME,$0
 	MOVW	nev+16(FP), R5
 	MOVW	timeout+20(FP), R6
 	SYSCALL	$SYS_epoll_wait
+	BVC	2(PC)
+	NEG	R3	// caller expects negative errno
 	MOVW	R3, ret+24(FP)
 	RET
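The rewritten walltime/nanotime above follow the same shape the other Linux ports already use: try the VDSO's __kernel_clock_gettime (a user-space call, no trap), and fall back to the real syscall when the symbol wasn't found at startup. Stripped of the register choreography, the control flow is roughly this (names here are illustrative, not the runtime's):

	package main

	import "fmt"

	// vdsoClockGettime stands in for the __kernel_clock_gettime entry
	// point looked up at startup; nil means "not found in the VDSO".
	var vdsoClockGettime func(clockid int32) int64

	func syscallClockGettime(clockid int32) int64 {
		// Placeholder for the real SYSCALL path.
		return 42
	}

	// nanotime mirrors the assembly's structure: prefer the VDSO,
	// fall back to the syscall when the symbol is absent.
	func nanotime() int64 {
		if vdsoClockGettime != nil {
			return vdsoClockGettime(1 /* CLOCK_MONOTONIC */)
		}
		return syscallClockGettime(1)
	}

	func main() {
		fmt.Println(nanotime()) // 42: no VDSO was "found", so it falls back
	}

The extra vdsoPC/vdsoSP bookkeeping exists so a SIGPROF that lands inside the VDSO call can still be unwound.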
diff --git a/src/runtime/sys_windows_386.s b/src/runtime/sys_windows_386.s
index 5f0d8b7c2a..3c091adcb1 100644
--- a/src/runtime/sys_windows_386.s
+++ b/src/runtime/sys_windows_386.s
@@ -315,7 +315,7 @@ TEXT runtime·tstart(SB),NOSPLIT,$0
 	// Layout new m scheduler stack on os stack.
 	MOVL	SP, AX
 	MOVL	AX, (g_stack+stack_hi)(DX)
-	SUBL	$(64*1024), AX	// stack size
+	SUBL	$(64*1024), AX	// initial stack size (adjusted later)
 	MOVL	AX, (g_stack+stack_lo)(DX)
 	ADDL	$const__StackGuard, AX
 	MOVL	AX, g_stackguard0(DX)
diff --git a/src/runtime/sys_windows_amd64.s b/src/runtime/sys_windows_amd64.s
index 2e5b29ba55..c9127ac2d2 100644
--- a/src/runtime/sys_windows_amd64.s
+++ b/src/runtime/sys_windows_amd64.s
@@ -363,7 +363,7 @@ TEXT runtime·tstart_stdcall(SB),NOSPLIT,$0
 	// Layout new m scheduler stack on os stack.
 	MOVQ	SP, AX
 	MOVQ	AX, (g_stack+stack_hi)(DX)
-	SUBQ	$(64*1024), AX	// stack size
+	SUBQ	$(64*1024), AX	// initial stack size (adjusted later)
 	MOVQ	AX, (g_stack+stack_lo)(DX)
 	ADDQ	$const__StackGuard, AX
 	MOVQ	AX, g_stackguard0(DX)
diff --git a/src/runtime/syscall_windows.go b/src/runtime/syscall_windows.go
index 134d4dbd99..8264070569 100644
--- a/src/runtime/syscall_windows.go
+++ b/src/runtime/syscall_windows.go
@@ -42,20 +42,20 @@ func callbackasmAddr(i int) uintptr {
 //go:linkname compileCallback syscall.compileCallback
 func compileCallback(fn eface, cleanstack bool) (code uintptr) {
 	if fn._type == nil || (fn._type.kind&kindMask) != kindFunc {
-		panic("compileCallback: not a function")
+		panic("compileCallback: expected function with one uintptr-sized result")
 	}
 	ft := (*functype)(unsafe.Pointer(fn._type))
 	if len(ft.out()) != 1 {
-		panic("compileCallback: function must have one output parameter")
+		panic("compileCallback: expected function with one uintptr-sized result")
 	}
 	uintptrSize := unsafe.Sizeof(uintptr(0))
 	if ft.out()[0].size != uintptrSize {
-		panic("compileCallback: output parameter size is wrong")
+		panic("compileCallback: expected function with one uintptr-sized result")
 	}
 	argsize := uintptr(0)
 	for _, t := range ft.in() {
 		if t.size > uintptrSize {
-			panic("compileCallback: input parameter size is wrong")
+			panic("compileCallback: argument size is larger than uintptr")
 		}
 		argsize += uintptrSize
 	}
@@ -107,11 +107,12 @@ func syscall_loadsystemlibrary(filename *uint16) (handle, err uintptr) {
 		}{filename, 0, _LOAD_LIBRARY_SEARCH_SYSTEM32}
 		c.args = uintptr(noescape(unsafe.Pointer(&args)))
 	} else {
-		// User is on Windows XP or something ancient.
-		// The caller wanted to only load the filename DLL
-		// from the System32 directory but that facility
-		// doesn't exist, so just load it the normal way. This
-		// is a potential security risk, but so is Windows XP.
+		// User doesn't have KB2533623 installed. The caller
+		// wanted to only load the filename DLL from the
+		// System32 directory but that facility doesn't exist,
+		// so just load it the normal way. This is a potential
+		// security risk, but so is not installing security
+		// updates.
		c.fn = getLoadLibrary()
 		c.n = 1
 		c.args = uintptr(noescape(unsafe.Pointer(&filename)))
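The constraints compileCallback enforces above surface to users through syscall.NewCallback: every parameter and the single result must be uintptr-sized. A minimal Windows-only usage example:

	// +build windows

	package main

	import (
		"fmt"
		"syscall"
	)

	func main() {
		// Each parameter and the single result must be uintptr-sized,
		// exactly what compileCallback checks.
		cb := syscall.NewCallback(func(x, y uintptr) uintptr {
			return x + y
		})
		fmt.Printf("callback at %#x\n", cb)

		// A func with no result, or a result of the wrong size, now
		// panics with the single unified message:
		// "compileCallback: expected function with one uintptr-sized result".
	}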
diff --git a/src/runtime/syscall_windows_test.go b/src/runtime/syscall_windows_test.go
index 0f5e13f97e..0882e9cb73 100644
--- a/src/runtime/syscall_windows_test.go
+++ b/src/runtime/syscall_windows_test.go
@@ -957,6 +957,52 @@ uintptr_t cfunc() {
 	}
 }
 
+// Test that C code called via a DLL can use large Windows thread
+// stacks and call back into Go without crashing. See issue #20975.
+//
+// See also TestBigStackCallbackCgo.
+func TestBigStackCallbackSyscall(t *testing.T) {
+	if _, err := exec.LookPath("gcc"); err != nil {
+		t.Skip("skipping test: gcc is missing")
+	}
+
+	srcname, err := filepath.Abs("testdata/testprogcgo/bigstack_windows.c")
+	if err != nil {
+		t.Fatal("Abs failed: ", err)
+	}
+
+	tmpdir, err := ioutil.TempDir("", "TestBigStackCallback")
+	if err != nil {
+		t.Fatal("TempDir failed: ", err)
+	}
+	defer os.RemoveAll(tmpdir)
+
+	outname := "mydll.dll"
+	cmd := exec.Command("gcc", "-shared", "-s", "-Werror", "-o", outname, srcname)
+	cmd.Dir = tmpdir
+	out, err := cmd.CombinedOutput()
+	if err != nil {
+		t.Fatalf("failed to build dll: %v - %v", err, string(out))
+	}
+	dllpath := filepath.Join(tmpdir, outname)
+
+	dll := syscall.MustLoadDLL(dllpath)
+	defer dll.Release()
+
+	var ok bool
+	proc := dll.MustFindProc("bigStack")
+	cb := syscall.NewCallback(func() uintptr {
+		// Do something interesting to force stack checks.
+		forceStackCopy()
+		ok = true
+		return 0
+	})
+	proc.Call(cb)
+	if !ok {
+		t.Fatalf("callback not called")
+	}
+}
+
 // wantLoadLibraryEx reports whether we expect LoadLibraryEx to work for tests.
 func wantLoadLibraryEx() bool {
 	return testenv.Builder() == "windows-amd64-gce" || testenv.Builder() == "windows-386-gce"
diff --git a/src/runtime/testdata/testprog/gc.go b/src/runtime/testdata/testprog/gc.go
index 744b6108e2..3ca74ba5fe 100644
--- a/src/runtime/testdata/testprog/gc.go
+++ b/src/runtime/testdata/testprog/gc.go
@@ -48,8 +48,11 @@ func GCSys() {
 	fmt.Printf("OK\n")
 }
 
+var sink []byte
+
 func workthegc() []byte {
-	return make([]byte, 1029)
+	sink = make([]byte, 1029)
+	return sink
 }
 
 func GCFairness() {
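The gc.go hunk above is the standard "sink" idiom: a compiler that can prove an allocation is never used may delete it, which would leave a GC stress test with nothing to stress. Storing the result to a package-level variable keeps the allocation observable:

	package main

	var sink []byte

	// allocate defeats dead-allocation elimination the same way the
	// workthegc change does: the store to a global forces the compiler
	// to keep the allocation, so the loop really produces garbage.
	func allocate() []byte {
		sink = make([]byte, 1029)
		return sink
	}

	func main() {
		for i := 0; i < 1000; i++ {
			allocate()
		}
	}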
diff --git a/src/runtime/testdata/testprogcgo/bigstack_windows.c b/src/runtime/testdata/testprogcgo/bigstack_windows.c
new file mode 100644
index 0000000000..cd85ac88d0
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/bigstack_windows.c
@@ -0,0 +1,46 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This test source is used by both TestBigStackCallbackCgo (linked
+// directly into the Go binary) and TestBigStackCallbackSyscall
+// (compiled into a DLL).
+
+#include <windows.h>
+#include <stdio.h>
+
+#ifndef STACK_SIZE_PARAM_IS_A_RESERVATION
+#define STACK_SIZE_PARAM_IS_A_RESERVATION 0x00010000
+#endif
+
+typedef void callback(char*);
+
+// Allocate a stack that's much larger than the default.
+static const int STACK_SIZE = 16<<20;
+
+static callback *bigStackCallback;
+
+static void useStack(int bytes) {
+	// Windows doesn't like huge frames, so we grow the stack 64k at a time.
+	char x[64<<10];
+	if (bytes < sizeof x) {
+		bigStackCallback(x);
+	} else {
+		useStack(bytes - sizeof x);
+	}
+}
+
+static DWORD WINAPI threadEntry(LPVOID lpParam) {
+	useStack(STACK_SIZE - (128<<10));
+	return 0;
+}
+
+void bigStack(callback *cb) {
+	bigStackCallback = cb;
+	HANDLE hThread = CreateThread(NULL, STACK_SIZE, threadEntry, NULL, STACK_SIZE_PARAM_IS_A_RESERVATION, NULL);
+	if (hThread == NULL) {
+		fprintf(stderr, "CreateThread failed\n");
+		exit(1);
+	}
+	WaitForSingleObject(hThread, INFINITE);
+}
diff --git a/src/runtime/testdata/testprogcgo/bigstack_windows.go b/src/runtime/testdata/testprogcgo/bigstack_windows.go
new file mode 100644
index 0000000000..f58fcf993f
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/bigstack_windows.go
@@ -0,0 +1,27 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+/*
+typedef void callback(char*);
+extern void goBigStack1(char*);
+extern void bigStack(callback*);
+*/
+import "C"
+
+func init() {
+	register("BigStack", BigStack)
+}
+
+func BigStack() {
+	// Create a large thread stack and call back into Go to test
+	// if Go correctly determines the stack bounds.
+	C.bigStack((*C.callback)(C.goBigStack1))
+}
+
+//export goBigStack1
+func goBigStack1(x *C.char) {
+	println("OK")
+}
diff --git a/src/runtime/time.go b/src/runtime/time.go
index 9de45f5e08..790819f259 100644
--- a/src/runtime/time.go
+++ b/src/runtime/time.go
@@ -7,7 +7,7 @@
 package runtime
 
 import (
-	"runtime/internal/sys"
+	"internal/cpu"
 	"unsafe"
 )
@@ -50,7 +50,7 @@ var timers [timersLen]struct {
 
 	// The padding should eliminate false sharing
 	// between timersBucket values.
-	pad [sys.CacheLineSize - unsafe.Sizeof(timersBucket{})%sys.CacheLineSize]byte
+	pad [cpu.CacheLinePadSize - unsafe.Sizeof(timersBucket{})%cpu.CacheLinePadSize]byte
 }
 
 func (t *timer) assignBucket() *timersBucket {
diff --git a/src/runtime/trace.go b/src/runtime/trace.go
index 61f7513ee0..08e92d2efe 100644
--- a/src/runtime/trace.go
+++ b/src/runtime/trace.go
@@ -532,12 +532,12 @@ func traceEvent(ev byte, skip int, args ...uint64) {
 }
 
 func traceEventLocked(extraBytes int, mp *m, pid int32, bufp *traceBufPtr, ev byte, skip int, args ...uint64) {
-	buf := (*bufp).ptr()
+	buf := bufp.ptr()
 	// TODO: test on non-zero extraBytes param.
 	maxSize := 2 + 5*traceBytesPerNumber + extraBytes // event type, length, sequence, timestamp, stack id and two add params
 	if buf == nil || len(buf.arr)-buf.pos < maxSize {
 		buf = traceFlush(traceBufPtrOf(buf), pid).ptr()
-		(*bufp).set(buf)
+		bufp.set(buf)
 	}
 
 	ticks := uint64(cputicks()) / traceTickDiv
@@ -584,10 +584,10 @@ func traceStackID(mp *m, buf []uintptr, skip int) uint64 {
 	gp := mp.curg
 	var nstk int
 	if gp == _g_ {
-		nstk = callers(skip+1, buf[:])
+		nstk = callers(skip+1, buf)
 	} else if gp != nil {
 		gp = mp.curg
-		nstk = gcallers(gp, skip, buf[:])
+		nstk = gcallers(gp, skip, buf)
 	}
 	if nstk > 0 {
 		nstk-- // skip runtime.goexit
@@ -689,11 +689,11 @@ func traceString(bufp *traceBufPtr, pid int32, s string) (uint64, *traceBufPtr)
 	// so there must be no memory allocation or any activities
 	// that causes tracing after this point.
 
-	buf := (*bufp).ptr()
+	buf := bufp.ptr()
 	size := 1 + 2*traceBytesPerNumber + len(s)
 	if buf == nil || len(buf.arr)-buf.pos < size {
 		buf = traceFlush(traceBufPtrOf(buf), pid).ptr()
-		(*bufp).set(buf)
+		bufp.set(buf)
 	}
 	buf.byte(traceEvString)
 	buf.varint(id)
@@ -708,7 +708,7 @@ func traceString(bufp *traceBufPtr, pid int32, s string) (uint64, *traceBufPtr)
 	buf.varint(uint64(slen))
 	buf.pos += copy(buf.arr[buf.pos:], s[:slen])
 
-	(*bufp).set(buf)
+	bufp.set(buf)
 	return id, bufp
 }
@@ -1206,7 +1206,7 @@ func trace_userLog(id uint64, category, message string) {
 	traceEventLocked(extraSpace, mp, pid, bufp, traceEvUserLog, 3, id, categoryID)
 	// traceEventLocked reserved extra space for val and len(val)
 	// in buf, so buf now has room for the following.
-	buf := (*bufp).ptr()
 
 	// double-check the message and its length can fit.
 	// Otherwise, truncate the message.
+	buf := bufp.ptr()
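The trace.go edits above are pure spelling cleanups: for a method whose receiver matches after one automatic dereference, Go inserts the indirection itself, so `(*bufp).ptr()` and `bufp.ptr()` compile to the same call. A tiny demonstration with a stand-in type:

	package main

	import "fmt"

	type bufPtr uintptr

	func (p bufPtr) ptr() uintptr { return uintptr(p) }

	func main() {
		bp := bufPtr(42)
		pp := &bp
		// Go inserts the dereference automatically, so both spellings
		// are identical; the cleanup just drops the noise.
		fmt.Println((*pp).ptr() == pp.ptr()) // true
	}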
diff --git a/src/runtime/trace/annotation.go b/src/runtime/trace/annotation.go
index 3545ef3bba..d5a7d003fe 100644
--- a/src/runtime/trace/annotation.go
+++ b/src/runtime/trace/annotation.go
@@ -24,13 +24,13 @@ type traceContextKey struct{}
 // If the end function is called multiple times, only the first
 // call is used in the latency measurement.
 //
-//   ctx, task := trace.NewTask(ctx, "awesome task")
-//   trace.WithRegion(ctx, prepWork)
+//   ctx, task := trace.NewTask(ctx, "awesomeTask")
+//   trace.WithRegion(ctx, "preparation", prepWork)
 //   // preparation of the task
 //   go func() {  // continue processing the task in a separate goroutine.
 //       defer task.End()
-//       trace.WithRegion(ctx, remainingWork)
-//   }
+//       trace.WithRegion(ctx, "remainingWork", remainingWork)
+//   }()
 func NewTask(pctx context.Context, taskType string) (ctx context.Context, task *Task) {
 	pid := fromContext(pctx).id
 	id := newID()
diff --git a/src/runtime/trace/trace.go b/src/runtime/trace/trace.go
index a40f87e53c..7f9d72a846 100644
--- a/src/runtime/trace/trace.go
+++ b/src/runtime/trace/trace.go
@@ -82,10 +82,10 @@
 //
 //	ctx, task := trace.NewTask(ctx, "makeCappuccino")
 //	trace.Log(ctx, "orderID", orderID)
-
+//
 //	milk := make(chan bool)
 //	espresso := make(chan bool)
-
+//
 //	go func() {
 //		trace.WithRegion(ctx, "steamMilk", steamMilk)
 //		milk <- true
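The doc-comment fixes above correct two real bugs in the examples: WithRegion takes a region-type string as its second argument, and the goroutine literal needs a trailing `()` to actually run. A compilable version of the corrected example:

	package main

	import (
		"context"
		"os"
		"runtime/trace"
	)

	func main() {
		if err := trace.Start(os.Stderr); err != nil {
			panic(err)
		}
		defer trace.Stop()

		ctx, task := trace.NewTask(context.Background(), "awesomeTask")
		trace.WithRegion(ctx, "preparation", func() { /* prep work */ })
		done := make(chan struct{})
		go func() { // continue processing the task in a separate goroutine.
			defer task.End()
			trace.WithRegion(ctx, "remainingWork", func() { /* work */ })
			close(done)
		}()
		<-done
	}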
diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go
index cc5e01eb8b..4c2010493a 100644
--- a/src/runtime/traceback.go
+++ b/src/runtime/traceback.go
@@ -99,8 +99,9 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in
 	if skip > 0 && callback != nil {
 		throw("gentraceback callback cannot be used with non-zero skip")
 	}
-	g := getg()
-	if g == gp && g == g.m.curg {
+
+	// Don't call this "g"; it's too easy to get "g" and "gp" confused.
+	if ourg := getg(); ourg == gp && ourg == ourg.m.curg {
 		// The starting sp has been passed in as a uintptr, and the caller may
 		// have other uintptr-typed stack references as well.
 		// If during one of the calls that got us here or during one of the
@@ -196,16 +197,29 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in
 			// Found an actual function.
 			// Derive frame pointer and link register.
 			if frame.fp == 0 {
-				// We want to jump over the systemstack switch. If we're running on the
-				// g0, this systemstack is at the top of the stack.
-				// if we're not on g0 or there's a no curg, then this is a regular call.
-				sp := frame.sp
-				if flags&_TraceJumpStack != 0 && f.funcID == funcID_systemstack && gp == g.m.g0 && gp.m.curg != nil {
-					sp = gp.m.curg.sched.sp
-					frame.sp = sp
-					cgoCtxt = gp.m.curg.cgoCtxt
+				// Jump over system stack transitions. If we're on g0 and there's a user
+				// goroutine, try to jump. Otherwise this is a regular call.
+				if flags&_TraceJumpStack != 0 && gp == gp.m.g0 && gp.m.curg != nil {
+					switch f.funcID {
+					case funcID_morestack:
+						// morestack does not return normally -- newstack()
+						// gogo's to curg.sched. Match that.
+						// This keeps morestack() from showing up in the backtrace,
+						// but that makes some sense since it'll never be returned
+						// to.
+						frame.pc = gp.m.curg.sched.pc
+						frame.fn = findfunc(frame.pc)
+						f = frame.fn
+						frame.sp = gp.m.curg.sched.sp
+						cgoCtxt = gp.m.curg.cgoCtxt
+					case funcID_systemstack:
+						// systemstack returns normally, so just follow the
+						// stack transition.
+						frame.sp = gp.m.curg.sched.sp
+						cgoCtxt = gp.m.curg.cgoCtxt
+					}
 				}
-				frame.fp = sp + uintptr(funcspdelta(f, frame.pc, &cache))
+				frame.fp = frame.sp + uintptr(funcspdelta(f, frame.pc, &cache))
 				if !usesLR {
 					// On x86, call instruction pushes return PC before entering new function.
 					frame.fp += sys.RegSize
@@ -271,7 +285,7 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in
 
 		// If framepointer_enabled and there's a frame, then
 		// there's a saved bp here.
-		if framepointer_enabled && GOARCH == "amd64" && frame.varp > frame.sp {
+		if frame.varp > frame.sp && (framepointer_enabled && GOARCH == "amd64" || GOARCH == "arm64") {
 			frame.varp -= sys.RegSize
 		}
@@ -425,7 +439,7 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in
 			if frame.pc > f.entry {
 				print(" +", hex(frame.pc-f.entry))
 			}
-			if g.m.throwing > 0 && gp == g.m.curg || level >= 2 {
+			if gp.m != nil && gp.m.throwing > 0 && gp == gp.m.curg || level >= 2 {
 				print(" fp=", hex(frame.fp), " sp=", hex(frame.sp), " pc=", hex(frame.pc))
 			}
 			print("\n")
@@ -679,7 +693,14 @@ func traceback(pc, sp, lr uintptr, gp *g) {
 // the initial PC must not be rewound to the previous instruction.
 // (All the saved pairs record a PC that is a return address, so we
 // rewind it into the CALL instruction.)
+// If gp.m.libcall{g,pc,sp} information is available, it uses that information in preference to
+// the pc/sp/lr passed in.
 func tracebacktrap(pc, sp, lr uintptr, gp *g) {
+	if gp.m.libcallsp != 0 {
+		// We're in C code somewhere, traceback from the saved position.
+		traceback1(gp.m.libcallpc, gp.m.libcallsp, 0, gp.m.libcallg.ptr(), 0)
+		return
+	}
 	traceback1(pc, sp, lr, gp, _TraceTrap)
 }
@@ -836,7 +857,7 @@ func showfuncinfo(f funcInfo, firstFrame, elideWrapper bool) bool {
 		return true
 	}
 
-	return contains(name, ".") && (!hasprefix(name, "runtime.") || isExportedRuntime(name))
+	return contains(name, ".") && (!hasPrefix(name, "runtime.") || isExportedRuntime(name))
 }
 
 // isExportedRuntime reports whether name is an exported runtime function.
@@ -1015,7 +1036,7 @@ func isSystemGoroutine(gp *g) bool {
 		// back into user code.
 		return !fingRunning
 	}
-	return hasprefix(funcname(f), "runtime.")
+	return hasPrefix(funcname(f), "runtime.")
 }
 
 // SetCgoTraceback records three C functions to use to gather
@@ -1115,6 +1136,13 @@ func isSystemGoroutine(gp *g) bool {
 // to the symbolizer function, return the file/line of the call
 // instruction. No additional subtraction is required or appropriate.
 //
+// On all platforms, the traceback function is invoked when a call from
+// Go to C to Go requests a stack trace. On linux/amd64, linux/ppc64le,
+// and freebsd/amd64, the traceback function is also invoked when a
+// signal is received by a thread that is executing a cgo call. The
+// traceback function should not make assumptions about when it is
+// called, as future versions of Go may make additional calls.
+//
 // The symbolizer function will be called with a single argument, a
 // pointer to a struct:
 //
diff --git a/src/runtime/type.go b/src/runtime/type.go
index d87d6e1507..88a44a37ed 100644
--- a/src/runtime/type.go
+++ b/src/runtime/type.go
@@ -112,10 +112,6 @@ func (t *_type) uncommon() *uncommontype {
 	}
 }
 
-func hasPrefix(s, prefix string) bool {
-	return len(s) >= len(prefix) && s[:len(prefix)] == prefix
-}
-
 func (t *_type) name() string {
 	if t.tflag&tflagNamed == 0 {
 		return ""
@@ -131,6 +127,25 @@ func (t *_type) name() string {
 	return s[i+1:]
 }
 
+// pkgpath returns the path of the package where t was defined, if
+// available. This is not the same as the reflect package's PkgPath
+// method, in that it returns the package path for struct and interface
+// types, not just named types.
+func (t *_type) pkgpath() string {
+	if u := t.uncommon(); u != nil {
+		return t.nameOff(u.pkgpath).name()
+	}
+	switch t.kind & kindMask {
+	case kindStruct:
+		st := (*structtype)(unsafe.Pointer(t))
+		return st.pkgPath.name()
+	case kindInterface:
+		it := (*interfacetype)(unsafe.Pointer(t))
+		return it.pkgpath.name()
+	}
+	return ""
+}
+
 // reflectOffs holds type offsets defined at run time by the reflect package.
 //
 // When a type is defined at run time, its *rtype data lives on the heap.
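The new pkgpath method above deliberately differs from reflect's PkgPath, which reports an empty string for unnamed types. That difference is easy to see from user code:

	package main

	import (
		"fmt"
		"reflect"
	)

	type named struct{ x int }

	func main() {
		// reflect's PkgPath is empty for unnamed types; the runtime's
		// new pkgpath also digs the path out of struct and interface
		// type data, which reflect does not expose.
		fmt.Println(reflect.TypeOf(named{}).PkgPath())           // main
		fmt.Println(reflect.TypeOf(struct{ x int }{}).PkgPath()) // "" (unnamed)
	}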
diff --git a/src/runtime/vdso_elf64.go b/src/runtime/vdso_elf64.go
index 8510250065..7c9bd96277 100644
--- a/src/runtime/vdso_elf64.go
+++ b/src/runtime/vdso_elf64.go
@@ -3,7 +3,7 @@
 // license that can be found in the LICENSE file.
 
 // +build linux
-// +build amd64 arm64
+// +build amd64 arm64 ppc64 ppc64le
 
 package runtime
diff --git a/src/runtime/vdso_in_none.go b/src/runtime/vdso_in_none.go
index 34cfac56d1..f2d6bb55d9 100644
--- a/src/runtime/vdso_in_none.go
+++ b/src/runtime/vdso_in_none.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build linux,!386,!amd64,!arm,!arm64 !linux
+// +build linux,!386,!amd64,!arm,!arm64,!ppc64,!ppc64le !linux
 
 package runtime
diff --git a/src/runtime/vdso_linux.go b/src/runtime/vdso_linux.go
index f6a285efd7..9827874bea 100644
--- a/src/runtime/vdso_linux.go
+++ b/src/runtime/vdso_linux.go
@@ -3,7 +3,7 @@
 // license that can be found in the LICENSE file.
 
 // +build linux
-// +build 386 amd64 arm arm64
+// +build 386 amd64 arm arm64 ppc64 ppc64le
 
 package runtime
@@ -42,6 +42,8 @@ const (
 
 	_STT_FUNC = 2 /* Symbol is a code object */
 
+	_STT_NOTYPE = 0 /* Symbol type is not specified */
+
 	_STB_GLOBAL = 1 /* Global symbol */
 	_STB_WEAK   = 2 /* Weak symbol */
@@ -212,7 +214,8 @@ func vdsoParseSymbols(info *vdsoInfo, version int32) {
 		sym := &info.symtab[symIndex]
 		typ := _ELF_ST_TYPE(sym.st_info)
 		bind := _ELF_ST_BIND(sym.st_info)
-		if typ != _STT_FUNC || bind != _STB_GLOBAL && bind != _STB_WEAK || sym.st_shndx == _SHN_UNDEF {
+		// On ppc64x, VDSO functions are of type _STT_NOTYPE.
+		if typ != _STT_FUNC && typ != _STT_NOTYPE || bind != _STB_GLOBAL && bind != _STB_WEAK || sym.st_shndx == _SHN_UNDEF {
 			return false
 		}
 		if k.name != gostringnocopy(&info.symstrings[sym.st_name]) {
diff --git a/src/runtime/vdso_linux_ppc64x.go b/src/runtime/vdso_linux_ppc64x.go
new file mode 100644
index 0000000000..f30946e4c5
--- /dev/null
+++ b/src/runtime/vdso_linux_ppc64x.go
@@ -0,0 +1,25 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build linux
+// +build ppc64 ppc64le
+
+package runtime
+
+const (
+	// vdsoArrayMax is the byte-size of a maximally sized array on this architecture.
+	// See cmd/compile/internal/ppc64/galign.go arch.MAXWIDTH initialization.
+	vdsoArrayMax = 1<<50 - 1
+)
+
+var vdsoLinuxVersion = vdsoVersionKey{"LINUX_2.6.15", 0x75fcba5}
+
+var vdsoSymbolKeys = []vdsoSymbolKey{
+	{"__kernel_clock_gettime", 0xb0cd725, 0xdfa941fd, &vdsoClockgettimeSym},
+}
+
+// initialize with vsyscall fallbacks
+var (
+	vdsoClockgettimeSym uintptr = 0
+)
diff --git a/src/runtime/vdso_linux_test.go b/src/runtime/vdso_linux_test.go
index b5221f90b7..ad083c61b4 100644
--- a/src/runtime/vdso_linux_test.go
+++ b/src/runtime/vdso_linux_test.go
@@ -3,7 +3,7 @@
 // license that can be found in the LICENSE file.
 
 // +build linux
-// +build 386 amd64 arm arm64
+// +build 386 amd64 arm arm64 ppc64 ppc64le
 
 package runtime_test
diff --git a/src/runtime/vlop_arm.s b/src/runtime/vlop_arm.s
index d48e515d32..729653488f 100644
--- a/src/runtime/vlop_arm.s
+++ b/src/runtime/vlop_arm.s
@@ -44,7 +44,7 @@
 // the RET instruction will clobber R12 on nacl, and the compiler's register
 // allocator needs to know.
 TEXT runtime·udiv(SB),NOSPLIT|NOFRAME,$0
-	MOVBU	runtime·hardDiv(SB), Ra
+	MOVBU	internal∕cpu·ARM+const_offsetARMHasIDIVA(SB), Ra
 	CMP	$0, Ra
 	BNE	udiv_hardware
