[dev.garbage] Merge remote-tracking branch 'origin/master' into HEAD

Change-Id: I282fd9ce9db435dfd35e882a9502ab1abc185297
author: Rick Hudson <rlh@golang.org> 2016-04-27 18:19:16 -0400
committer: Rick Hudson <rlh@golang.org> 2016-04-27 18:46:52 -0400
commit: 23aeb34df172b17b7bfaa85fb59ca64bef9073bb (patch)
tree: a8ab866f1e50f0059856ce628f036d93ab620155 /src/runtime
parent: 1354b32cd70f2702381764fd595dd2faa996840c (diff)
parent: d3c79d324acd7300b6f705e66af8ca711af00d9f (diff)
download: go-23aeb34df172b17b7bfaa85fb59ca64bef9073bb.tar.xz
161 files changed, 7417 insertions, 3569 deletions
diff --git a/src/runtime/alg.go b/src/runtime/alg.go
index 7aacc8cf9b..66943495b5 100644
--- a/src/runtime/alg.go
+++ b/src/runtime/alg.go
@@ -146,7 +146,7 @@ func interhash(p unsafe.Pointer, h uintptr) uintptr {
 	t := tab._type
 	fn := t.alg.hash
 	if fn == nil {
-		panic(errorString("hash of unhashable type " + t._string))
+		panic(errorString("hash of unhashable type " + t.string()))
 	}
 	if isDirectIface(t) {
 		return c1 * fn(unsafe.Pointer(&a.data), h^c0)
@@ -163,7 +163,7 @@ func nilinterhash(p unsafe.Pointer, h uintptr) uintptr {
 	}
 	fn := t.alg.hash
 	if fn == nil {
-		panic(errorString("hash of unhashable type " + t._string))
+		panic(errorString("hash of unhashable type " + t.string()))
 	}
 	if isDirectIface(t) {
 		return c1 * fn(unsafe.Pointer(&a.data), h^c0)
@@ -221,7 +221,7 @@ func efaceeq(x, y eface) bool {
 	}
 	eq := t.alg.equal
 	if eq == nil {
-		panic(errorString("comparing uncomparable type " + t._string))
+		panic(errorString("comparing uncomparable type " + t.string()))
 	}
 	if isDirectIface(t) {
 		return eq(noescape(unsafe.Pointer(&x.data)), noescape(unsafe.Pointer(&y.data)))
@@ -239,7 +239,7 @@ func ifaceeq(x, y iface) bool {
 	t := xtab._type
 	eq := t.alg.equal
 	if eq == nil {
-		panic(errorString("comparing uncomparable type " + t._string))
+		panic(errorString("comparing uncomparable type " + t.string()))
 	}
 	if isDirectIface(t) {
 		return eq(noescape(unsafe.Pointer(&x.data)), noescape(unsafe.Pointer(&y.data)))
diff --git a/src/runtime/append_test.go b/src/runtime/append_test.go
index 3170870b0e..cd28e3dca6 100644
--- a/src/runtime/append_test.go
+++ b/src/runtime/append_test.go
@@ -7,6 +7,14 @@ import "testing"
 
 const N = 20
 
+func BenchmarkMakeSlice(b *testing.B) { // times one make([]byte, 32) per benchmark iteration
+	var x []byte
+	for i := 0; i < b.N; i++ {
+		x = make([]byte, 32)
+		_ = x // reference x so the compiler does not reject it as declared and not used
+	}
+}
+
 func BenchmarkGrowSliceBytes(b *testing.B) {
 	b.StopTimer()
 	var x = make([]byte, 9)
@@ -226,3 +234,132 @@ func BenchmarkCopy16String(b *testing.B)   { benchmarkCopyStr(b, 16) }
 func BenchmarkCopy32String(b *testing.B)   { benchmarkCopyStr(b, 32) }
 func BenchmarkCopy128String(b *testing.B)  { benchmarkCopyStr(b, 128) }
 func BenchmarkCopy1024String(b *testing.B) { benchmarkCopyStr(b, 1024) }
+
+var (
+	sByte []byte
+	s1Ptr []uintptr
+	s2Ptr [][2]uintptr
+	s3Ptr [][3]uintptr
+	s4Ptr [][4]uintptr
+)
+
+// BenchmarkAppendInPlace tests the performance of append
+// when the result is being written back to the same slice.
+// In order for the in-place optimization to occur,
+// the slice must be referred to by address;
+// using a global is an easy way to trigger that.
+// We test the "grow" and "no grow" paths separately,
+// but not the "normal" (occasionally grow) path,
+// because it is a blend of the other two.
+// We use small numbers and small sizes in an attempt
+// to avoid benchmarking memory allocation and copying.
+// We use scalars instead of pointers in an attempt
+// to avoid benchmarking the write barriers.
+// We benchmark four common sizes (byte, pointer, string/interface, slice),
+// and one larger size.
+func BenchmarkAppendInPlace(b *testing.B) {
+	b.Run("NoGrow", func(b *testing.B) {
+		const C = 128 // capacity reserved up front; exactly C appends follow, so none has to grow
+
+		b.Run("Byte", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				sByte = make([]byte, 0, C) // len 0, cap C: starting at len C would force a grow on the first append
+				for j := 0; j < C; j++ {
+					sByte = append(sByte, 0x77)
+				}
+			}
+		})
+
+		b.Run("1Ptr", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				s1Ptr = make([]uintptr, 0, C)
+				for j := 0; j < C; j++ {
+					s1Ptr = append(s1Ptr, 0x77)
+				}
+			}
+		})
+
+		b.Run("2Ptr", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				s2Ptr = make([][2]uintptr, 0, C)
+				for j := 0; j < C; j++ {
+					s2Ptr = append(s2Ptr, [2]uintptr{0x77, 0x88})
+				}
+			}
+		})
+
+		b.Run("3Ptr", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				s3Ptr = make([][3]uintptr, 0, C)
+				for j := 0; j < C; j++ {
+					s3Ptr = append(s3Ptr, [3]uintptr{0x77, 0x88, 0x99})
+				}
+			}
+		})
+
+		b.Run("4Ptr", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				s4Ptr = make([][4]uintptr, 0, C)
+				for j := 0; j < C; j++ {
+					s4Ptr = append(s4Ptr, [4]uintptr{0x77, 0x88, 0x99, 0xAA})
+				}
+			}
+		})
+
+	})
+
+	b.Run("Grow", func(b *testing.B) {
+		const C = 5 // appends per iteration; each one is forced to grow (see the reslice to cap below)
+
+		b.Run("Byte", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				sByte = make([]byte, 0)
+				for j := 0; j < C; j++ {
+					sByte = append(sByte, 0x77)
+					sByte = sByte[:cap(sByte)] // fill to capacity so the next append must grow
+				}
+			}
+		})
+
+		b.Run("1Ptr", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				s1Ptr = make([]uintptr, 0)
+				for j := 0; j < C; j++ {
+					s1Ptr = append(s1Ptr, 0x77)
+					s1Ptr = s1Ptr[:cap(s1Ptr)]
+				}
+			}
+		})
+
+		b.Run("2Ptr", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				s2Ptr = make([][2]uintptr, 0)
+				for j := 0; j < C; j++ {
+					s2Ptr = append(s2Ptr, [2]uintptr{0x77, 0x88})
+					s2Ptr = s2Ptr[:cap(s2Ptr)]
+				}
+			}
+		})
+
+		b.Run("3Ptr", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				s3Ptr = make([][3]uintptr, 0)
+				for j := 0; j < C; j++ {
+					s3Ptr = append(s3Ptr, [3]uintptr{0x77, 0x88, 0x99})
+					s3Ptr = s3Ptr[:cap(s3Ptr)]
+				}
+			}
+		})
+
+		b.Run("4Ptr", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				s4Ptr = make([][4]uintptr, 0)
+				for j := 0; j < C; j++ {
+					s4Ptr = append(s4Ptr, [4]uintptr{0x77, 0x88, 0x99, 0xAA})
+					s4Ptr = s4Ptr[:cap(s4Ptr)]
+				}
+			}
+		})
+
+	})
+}
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index 83db4d3e81..cdda29f347 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -1666,122 +1666,126 @@ big_loop_avx2_exit:
 // TODO: Also use this in bytes.Index
 TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
 	MOVQ s+0(FP), DI
-	MOVQ s_len+8(FP), CX
-	MOVQ c+16(FP), AX
-	MOVQ c_len+24(FP), BX
-	CMPQ BX, CX
+	// We want len in DX and AX, because PCMPESTRI implicitly consumes them
+	MOVQ s_len+8(FP), DX
+	MOVQ c+16(FP), BP
+	MOVQ c_len+24(FP), AX
+	CMPQ AX, DX
 	JA fail
-	CMPQ BX, $2
+	CMPQ DX, $16
+	JAE sse42
+no_sse42:
+	CMPQ AX, $2
 	JA   _3_or_more
-	MOVW (AX), AX
-	LEAQ -1(DI)(CX*1), CX
+	MOVW (BP), BP
+	LEAQ -1(DI)(DX*1), DX
 loop2:
 	MOVW (DI), SI
-	CMPW SI,AX
+	CMPW SI,BP
 	JZ success
 	ADDQ $1,DI
-	CMPQ DI,CX
+	CMPQ DI,DX
 	JB loop2
 	JMP fail
 _3_or_more:
-	CMPQ BX, $3
+	CMPQ AX, $3
 	JA   _4_or_more
-	MOVW 1(AX), DX
-	MOVW (AX), AX
-	LEAQ -2(DI)(CX*1), CX
+	MOVW 1(BP), BX
+	MOVW (BP), BP
+	LEAQ -2(DI)(DX*1), DX
 loop3:
 	MOVW (DI), SI
-	CMPW SI,AX
+	CMPW SI,BP
 	JZ   partial_success3
 	ADDQ $1,DI
-	CMPQ DI,CX
+	CMPQ DI,DX
 	JB loop3
 	JMP fail
 partial_success3:
 	MOVW 1(DI), SI
-	CMPW SI,DX
+	CMPW SI,BX
 	JZ success
 	ADDQ $1,DI
-	CMPQ DI,CX
+	CMPQ DI,DX
 	JB loop3
 	JMP fail
 _4_or_more:
-	CMPQ BX, $4
+	CMPQ AX, $4
 	JA   _5_or_more
-	MOVL (AX), AX
-	LEAQ -3(DI)(CX*1), CX
+	MOVL (BP), BP
+	LEAQ -3(DI)(DX*1), DX
 loop4:
 	MOVL (DI), SI
-	CMPL SI,AX
+	CMPL SI,BP
 	JZ   success
 	ADDQ $1,DI
-	CMPQ DI,CX
+	CMPQ DI,DX
 	JB loop4
 	JMP fail
 _5_or_more:
-	CMPQ BX, $7
+	CMPQ AX, $7
 	JA   _8_or_more
-	LEAQ 1(DI)(CX*1), CX
-	SUBQ BX, CX
-	MOVL -4(AX)(BX*1), DX
-	MOVL (AX), AX
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	MOVL -4(BP)(AX*1), BX
+	MOVL (BP), BP
 loop5to7:
 	MOVL (DI), SI
-	CMPL SI,AX
+	CMPL SI,BP
 	JZ   partial_success5to7
 	ADDQ $1,DI
-	CMPQ DI,CX
+	CMPQ DI,DX
 	JB loop5to7
 	JMP fail
 partial_success5to7:
-	MOVL -4(BX)(DI*1), SI
-	CMPL SI,DX
+	MOVL -4(AX)(DI*1), SI
+	CMPL SI,BX
 	JZ success
 	ADDQ $1,DI
-	CMPQ DI,CX
+	CMPQ DI,DX
 	JB loop5to7
 	JMP fail
 _8_or_more:
-	CMPQ BX, $8
+	CMPQ AX, $8
 	JA   _9_or_more
-	MOVQ (AX), AX
-	LEAQ -7(DI)(CX*1), CX
+	MOVQ (BP), BP
+	LEAQ -7(DI)(DX*1), DX
 loop8:
 	MOVQ (DI), SI
-	CMPQ SI,AX
+	CMPQ SI,BP
 	JZ   success
 	ADDQ $1,DI
-	CMPQ DI,CX
+	CMPQ DI,DX
 	JB loop8
 	JMP fail
 _9_or_more:
-	CMPQ BX, $16
+	CMPQ AX, $16
 	JA   _16_or_more
-	LEAQ 1(DI)(CX*1), CX
-	SUBQ BX, CX
-	MOVQ -8(AX)(BX*1), DX
-	MOVQ (AX), AX
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	MOVQ -8(BP)(AX*1), BX
+	MOVQ (BP), BP
 loop9to15:
 	MOVQ (DI), SI
-	CMPQ SI,AX
+	CMPQ SI,BP
 	JZ   partial_success9to15
 	ADDQ $1,DI
-	CMPQ DI,CX
+	CMPQ DI,DX
 	JB loop9to15
 	JMP fail
 partial_success9to15:
-	MOVQ -8(BX)(DI*1), SI
-	CMPQ SI,DX
+	MOVQ -8(AX)(DI*1), SI
+	CMPQ SI,BX
 	JZ success
 	ADDQ $1,DI
-	CMPQ DI,CX
+	CMPQ DI,DX
 	JB loop9to15
 	JMP fail
 _16_or_more:
-	CMPQ BX, $16
+	CMPQ AX, $16 // _9_or_more only branches here when len(sep) > 16, so a 17-byte sep must go to _17_to_31, not the 16-byte-only loop16
 	JA   _17_to_31
-	MOVOU (AX), X1
-	LEAQ -15(DI)(CX*1), CX
+	MOVOU (BP), X1
+	LEAQ -15(DI)(DX*1), DX
 loop16:
 	MOVOU (DI), X2
 	PCMPEQB X1, X2
@@ -1789,14 +1793,14 @@ loop16:
 	CMPQ  SI, $0xffff
 	JE   success
 	ADDQ $1,DI
-	CMPQ DI,CX
+	CMPQ DI,DX
 	JB loop16
 	JMP fail
 _17_to_31:
-	LEAQ 1(DI)(CX*1), CX
-	SUBQ BX, CX
-	MOVOU -16(AX)(BX*1), X0
-	MOVOU (AX), X1
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	MOVOU -16(BP)(AX*1), X0
+	MOVOU (BP), X1
 loop17to31:
 	MOVOU (DI), X2
 	PCMPEQB X1,X2
@@ -1804,21 +1808,58 @@ loop17to31:
 	CMPQ  SI, $0xffff
 	JE   partial_success17to31
 	ADDQ $1,DI
-	CMPQ DI,CX
+	CMPQ DI,DX
 	JB loop17to31
 	JMP fail
 partial_success17to31:
-	MOVOU -16(BX)(DI*1), X3
+	MOVOU -16(AX)(DI*1), X3
 	PCMPEQB X0, X3
 	PMOVMSKB X3, SI
 	CMPQ  SI, $0xffff
 	JE success
 	ADDQ $1,DI
-	CMPQ DI,CX
+	CMPQ DI,DX
 	JB loop17to31
 fail:
 	MOVQ $-1, ret+32(FP)
 	RET
+sse42:
+	MOVL runtime·cpuid_ecx(SB), CX
+	ANDL $0x100000, CX
+	JZ no_sse42
+	CMPQ AX, $12
+	// PCMPESTRI is slower than normal compare,
+	// so using it makes sense only if we advance 4+ bytes per compare
+	// This value was determined experimentally and is the ~same
+	// on Nehalem (first with SSE42) and Haswell.
+	JAE _9_or_more
+	LEAQ 16(BP), SI
+	TESTW $0xff0, SI
+	JEQ no_sse42
+	MOVOU (BP), X1
+	LEAQ -15(DI)(DX*1), SI
+	MOVQ $16, R9
+	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
+loop_sse42:
+	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
+	// for equality (bits 2,3 are 11)
+	// result is not masked or inverted (bits 4,5 are 00)
+	// and corresponds to first matching byte (bit 6 is 0)
+	PCMPESTRI $0x0c, (DI), X1
+	// CX == 16 means no match,
+	// CX > R9 means partial match at the end of the string,
+	// otherwise sep is at offset CX from X1 start
+	CMPQ CX, R9
+	JBE sse42_success
+	ADDQ R9, DI
+	CMPQ DI, SI
+	JB loop_sse42
+	PCMPESTRI $0x0c, -1(SI), X1
+	CMPQ CX, R9
+	JA fail
+	LEAQ -1(SI), DI
+sse42_success:
+	ADDQ CX, DI
 success:
 	SUBQ s+0(FP), DI
 	MOVQ DI, ret+32(FP)
diff --git a/src/runtime/asm_s390x.s b/src/runtime/asm_s390x.s
new file mode 100644
index 0000000000..fc74b0ddf9
--- /dev/null
+++ b/src/runtime/asm_s390x.s
@@ -0,0 +1,1130 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "go_tls.h"
+#include "funcdata.h"
+#include "textflag.h"
+
+// Indicate the status of vector facility
+// -1: 	init value
+// 0:	vector not installed
+// 1:	vector installed and enabled
+// 2:	vector installed but not enabled
+
+DATA runtime·vectorfacility+0x00(SB)/4, $-1
+GLOBL runtime·vectorfacility(SB), NOPTR, $4
+
+TEXT runtime·checkvectorfacility(SB),NOSPLIT,$32-0
+	MOVD    $2, R0
+	MOVD	R1, tmp-32(SP)
+	MOVD    $x-24(SP), R1
+//      STFLE   0(R1)
+	WORD    $0xB2B01000
+	MOVBZ   z-8(SP), R1
+	AND     $0x40, R1
+	BNE     vectorinstalled
+	MOVB    $0, runtime·vectorfacility(SB) //Vector not installed
+	MOVD	tmp-32(SP), R1
+	MOVD    $0, R0
+	RET
+vectorinstalled:
+	// check if the vector instruction has been enabled
+	VLEIB   $0, $0xF, V16
+	VLGVB   $0, V16, R0
+	CMPBEQ  R0, $0xF, vectorenabled
+	MOVB    $2, runtime·vectorfacility(SB) //Vector installed but not enabled
+	MOVD    tmp-32(SP), R1
+	MOVD    $0, R0
+	RET
+vectorenabled:
+	MOVB    $1, runtime·vectorfacility(SB) //Vector installed and enabled
+	MOVD    tmp-32(SP), R1
+	MOVD    $0, R0
+	RET
+
+TEXT runtime·rt0_go(SB),NOSPLIT,$0
+	// R2 = argc; R3 = argv; R11 = temp; R13 = g; R15 = stack pointer
+	// C TLS base pointer in AR0:AR1
+
+	// initialize essential registers
+	XOR	R0, R0
+
+	SUB	$24, R15
+	MOVW	R2, 8(R15) // argc
+	MOVD	R3, 16(R15) // argv
+
+	// create istack out of the given (operating system) stack.
+	// _cgo_init may update stackguard.
+	MOVD	$runtime·g0(SB), g
+	MOVD	R15, R11
+	SUB	$(64*1024), R11
+	MOVD	R11, g_stackguard0(g)
+	MOVD	R11, g_stackguard1(g)
+	MOVD	R11, (g_stack+stack_lo)(g)
+	MOVD	R15, (g_stack+stack_hi)(g)
+
+	// if there is a _cgo_init, call it using the gcc ABI.
+	MOVD	_cgo_init(SB), R11
+	CMPBEQ	R11, $0, nocgo
+	MOVW	AR0, R4			// (AR0 << 32 | AR1) is the TLS base pointer; MOVD is translated to EAR
+	SLD	$32, R4, R4
+	MOVW	AR1, R4			// arg 2: TLS base pointer
+	MOVD	$setg_gcc<>(SB), R3 	// arg 1: setg
+	MOVD	g, R2			// arg 0: G
+	// C functions expect 160 bytes of space on caller stack frame
+	// and an 8-byte aligned stack pointer
+	MOVD	R15, R9			// save current stack (R9 is preserved in the Linux ABI)
+	SUB	$160, R15		// reserve 160 bytes
+	MOVD    $~7, R6
+	AND 	R6, R15			// 8-byte align
+	BL	R11			// this call clobbers volatile registers according to Linux ABI (R0-R5, R14)
+	MOVD	R9, R15			// restore stack
+	XOR	R0, R0			// zero R0
+
+nocgo:
+	// update stackguard after _cgo_init
+	MOVD	(g_stack+stack_lo)(g), R2
+	ADD	$const__StackGuard, R2
+	MOVD	R2, g_stackguard0(g)
+	MOVD	R2, g_stackguard1(g)
+
+	// set the per-goroutine and per-mach "registers"
+	MOVD	$runtime·m0(SB), R2
+
+	// save m->g0 = g0
+	MOVD	g, m_g0(R2)
+	// save m0 to g0->m
+	MOVD	R2, g_m(g)
+
+	BL	runtime·check(SB)
+
+	// argc/argv are already prepared on stack
+	BL	runtime·args(SB)
+	BL	runtime·osinit(SB)
+	BL	runtime·schedinit(SB)
+
+	// create a new goroutine to start program
+	MOVD	$runtime·mainPC(SB), R2		// entry
+	SUB     $24, R15
+	MOVD 	R2, 16(R15)
+	MOVD 	R0, 8(R15)
+	MOVD 	R0, 0(R15)
+	BL	runtime·newproc(SB)
+	ADD	$24, R15
+
+	// start this M
+	BL	runtime·mstart(SB)
+
+	MOVD	R0, 1(R0)
+	RET
+
+DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
+GLOBL	runtime·mainPC(SB),RODATA,$8
+
+TEXT runtime·breakpoint(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	R0, 2(R0)
+	RET
+
+TEXT runtime·asminit(SB),NOSPLIT|NOFRAME,$0-0
+	RET
+
+/*
+ *  go-routine
+ */
+
+// void gosave(Gobuf*)
+// save state in Gobuf; setjmp
+TEXT runtime·gosave(SB), NOSPLIT, $-8-8
+	MOVD	buf+0(FP), R3
+	MOVD	R15, gobuf_sp(R3)
+	MOVD	LR, gobuf_pc(R3)
+	MOVD	g, gobuf_g(R3)
+	MOVD	$0, gobuf_lr(R3)
+	MOVD	$0, gobuf_ret(R3)
+	MOVD	$0, gobuf_ctxt(R3)
+	RET
+
+// void gogo(Gobuf*)
+// restore state from Gobuf; longjmp
+TEXT runtime·gogo(SB), NOSPLIT, $-8-8
+	MOVD	buf+0(FP), R5
+	MOVD	gobuf_g(R5), g	// make sure g is not nil
+	BL	runtime·save_g(SB)
+
+	MOVD	0(g), R4
+	MOVD	gobuf_sp(R5), R15
+	MOVD	gobuf_lr(R5), LR
+	MOVD	gobuf_ret(R5), R3
+	MOVD	gobuf_ctxt(R5), R12
+	MOVD	$0, gobuf_sp(R5)
+	MOVD	$0, gobuf_ret(R5)
+	MOVD	$0, gobuf_lr(R5)
+	MOVD	$0, gobuf_ctxt(R5)
+	CMP	R0, R0 // set condition codes for == test, needed by stack split
+	MOVD	gobuf_pc(R5), R6
+	BR	(R6)
+
+// void mcall(fn func(*g))
+// Switch to m->g0's stack, call fn(g).
+// Fn must never return.  It should gogo(&g->sched)
+// to keep running g.
+TEXT runtime·mcall(SB), NOSPLIT, $-8-8
+	// Save caller state in g->sched
+	MOVD	R15, (g_sched+gobuf_sp)(g)
+	MOVD	LR, (g_sched+gobuf_pc)(g)
+	MOVD	R0, (g_sched+gobuf_lr)(g)
+	MOVD	g, (g_sched+gobuf_g)(g)
+
+	// Switch to m->g0 & its stack, call fn.
+	MOVD	g, R3
+	MOVD	g_m(g), R8
+	MOVD	m_g0(R8), g
+	BL	runtime·save_g(SB)
+	CMP	g, R3
+	BNE	2(PC)
+	BR	runtime·badmcall(SB)
+	MOVD	fn+0(FP), R12			// context
+	MOVD	0(R12), R4			// code pointer
+	MOVD	(g_sched+gobuf_sp)(g), R15	// sp = m->g0->sched.sp
+	SUB	$16, R15
+	MOVD	R3, 8(R15)
+	MOVD	$0, 0(R15)
+	BL	(R4)
+	BR	runtime·badmcall2(SB)
+
+// systemstack_switch is a dummy routine that systemstack leaves at the bottom
+// of the G stack.  We need to distinguish the routine that
+// lives at the bottom of the G stack from the one that lives
+// at the top of the system stack because the one at the top of
+// the system stack terminates the stack walk (see topofstack()).
+TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
+	UNDEF
+	BL	(LR)	// make sure this function is not leaf
+	RET
+
+// func systemstack(fn func())
+TEXT runtime·systemstack(SB), NOSPLIT, $0-8
+	MOVD	fn+0(FP), R3	// R3 = fn
+	MOVD	R3, R12		// context
+	MOVD	g_m(g), R4	// R4 = m
+
+	MOVD	m_gsignal(R4), R5	// R5 = gsignal
+	CMPBEQ	g, R5, noswitch
+
+	MOVD	m_g0(R4), R5	// R5 = g0
+	CMPBEQ	g, R5, noswitch
+
+	MOVD	m_curg(R4), R6
+	CMPBEQ	g, R6, switch
+
+	// Bad: g is not gsignal, not g0, not curg. What is it?
+	// Hide call from linker nosplit analysis.
+	MOVD	$runtime·badsystemstack(SB), R3
+	BL	(R3)
+
+switch:
+	// save our state in g->sched.  Pretend to
+	// be systemstack_switch if the G stack is scanned.
+	MOVD	$runtime·systemstack_switch(SB), R6
+	ADD	$16, R6	// get past prologue
+	MOVD	R6, (g_sched+gobuf_pc)(g)
+	MOVD	R15, (g_sched+gobuf_sp)(g)
+	MOVD	R0, (g_sched+gobuf_lr)(g)
+	MOVD	g, (g_sched+gobuf_g)(g)
+
+	// switch to g0
+	MOVD	R5, g
+	BL	runtime·save_g(SB)
+	MOVD	(g_sched+gobuf_sp)(g), R3
+	// make it look like mstart called systemstack on g0, to stop traceback
+	SUB	$8, R3
+	MOVD	$runtime·mstart(SB), R4
+	MOVD	R4, 0(R3)
+	MOVD	R3, R15
+
+	// call target function
+	MOVD	0(R12), R3	// code pointer
+	BL	(R3)
+
+	// switch back to g
+	MOVD	g_m(g), R3
+	MOVD	m_curg(R3), g
+	BL	runtime·save_g(SB)
+	MOVD	(g_sched+gobuf_sp)(g), R15
+	MOVD	$0, (g_sched+gobuf_sp)(g)
+	RET
+
+noswitch:
+	// already on m stack, just call directly
+	MOVD	0(R12), R3	// code pointer
+	BL	(R3)
+	RET
+
+/*
+ * support for morestack
+ */
+
+// Called during function prolog when more stack is needed.
+// Caller has already loaded:
+// R3: framesize, R4: argsize, R5: LR
+//
+// The traceback routines see morestack on a g0 as being
+// the top of a stack (for example, morestack calling newstack
+// calling the scheduler calling newm calling gc), so we must
+// record an argument size. For that purpose, it has no arguments.
+TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0
+	// Cannot grow scheduler stack (m->g0).
+	MOVD	g_m(g), R7
+	MOVD	m_g0(R7), R8
+	CMPBNE	g, R8, 2(PC)
+	BL	runtime·abort(SB)
+
+	// Cannot grow signal stack (m->gsignal).
+	MOVD	m_gsignal(R7), R8
+	CMP	g, R8
+	BNE	2(PC)
+	BL	runtime·abort(SB)
+
+	// Called from f.
+	// Set g->sched to context in f.
+	MOVD	R12, (g_sched+gobuf_ctxt)(g)
+	MOVD	R15, (g_sched+gobuf_sp)(g)
+	MOVD	LR, R8
+	MOVD	R8, (g_sched+gobuf_pc)(g)
+	MOVD	R5, (g_sched+gobuf_lr)(g)
+
+	// Called from f.
+	// Set m->morebuf to f's caller.
+	MOVD	R5, (m_morebuf+gobuf_pc)(R7)	// f's caller's PC
+	MOVD	R15, (m_morebuf+gobuf_sp)(R7)	// f's caller's SP
+	MOVD	g, (m_morebuf+gobuf_g)(R7)
+
+	// Call newstack on m->g0's stack.
+	MOVD	m_g0(R7), g
+	BL	runtime·save_g(SB)
+	MOVD	(g_sched+gobuf_sp)(g), R15
+	BL	runtime·newstack(SB)
+
+	// Not reached, but make sure the return PC from the call to newstack
+	// is still in this function, and not the beginning of the next.
+	UNDEF
+
+TEXT runtime·morestack_noctxt(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	$0, R12	// no closure context: morestack stores R12 into g->sched.ctxt, so clear it first
+	BR	runtime·morestack(SB)	// branch (not call) into morestack, which does the real work
+
+TEXT runtime·stackBarrier(SB),NOSPLIT,$0
+	// We came here via a RET to an overwritten LR.
+	// R3 may be live. Other registers are available.
+
+	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
+	MOVD	(g_stkbar+slice_array)(g), R4
+	MOVD	g_stkbarPos(g), R5
+	MOVD	$stkbar__size, R6
+	MULLD	R5, R6
+	ADD	R4, R6
+	MOVD	stkbar_savedLRVal(R6), R6
+	// Record that this stack barrier was hit.
+	ADD	$1, R5
+	MOVD	R5, g_stkbarPos(g)
+	// Jump to the original return PC.
+	BR	(R6)
+
+// reflectcall: call a function with the given argument list
+// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
+// we don't have variable-sized frames, so we use a small number
+// of constant-sized-frame functions to encode a few bits of size in the pc.
+// Caution: ugly multiline assembly macros in your future!
+
+#define DISPATCH(NAME,MAXSIZE)		\
+	MOVD	$MAXSIZE, R4;		\
+	CMP	R3, R4;		\
+	BGT	3(PC);			\
+	MOVD	$NAME(SB), R5;	\
+	BR	(R5)
+// Note: can't just "BR NAME(SB)" - bad inlining results.
+
+TEXT reflect·call(SB), NOSPLIT, $0-0
+	BR	·reflectcall(SB)
+
+TEXT ·reflectcall(SB), NOSPLIT, $-8-32
+	MOVWZ argsize+24(FP), R3
+	// NOTE(rsc): No call16, because CALLFN needs four words
+	// of argument space to invoke callwritebarrier.
+	DISPATCH(runtime·call32, 32)
+	DISPATCH(runtime·call64, 64)
+	DISPATCH(runtime·call128, 128)
+	DISPATCH(runtime·call256, 256)
+	DISPATCH(runtime·call512, 512)
+	DISPATCH(runtime·call1024, 1024)
+	DISPATCH(runtime·call2048, 2048)
+	DISPATCH(runtime·call4096, 4096)
+	DISPATCH(runtime·call8192, 8192)
+	DISPATCH(runtime·call16384, 16384)
+	DISPATCH(runtime·call32768, 32768)
+	DISPATCH(runtime·call65536, 65536)
+	DISPATCH(runtime·call131072, 131072)
+	DISPATCH(runtime·call262144, 262144)
+	DISPATCH(runtime·call524288, 524288)
+	DISPATCH(runtime·call1048576, 1048576)
+	DISPATCH(runtime·call2097152, 2097152)
+	DISPATCH(runtime·call4194304, 4194304)
+	DISPATCH(runtime·call8388608, 8388608)
+	DISPATCH(runtime·call16777216, 16777216)
+	DISPATCH(runtime·call33554432, 33554432)
+	DISPATCH(runtime·call67108864, 67108864)
+	DISPATCH(runtime·call134217728, 134217728)
+	DISPATCH(runtime·call268435456, 268435456)
+	DISPATCH(runtime·call536870912, 536870912)
+	DISPATCH(runtime·call1073741824, 1073741824)
+	MOVD	$runtime·badreflectcall(SB), R5
+	BR	(R5)
+
+#define CALLFN(NAME,MAXSIZE)			\
+TEXT NAME(SB), WRAPPER, $MAXSIZE-24;		\
+	NO_LOCAL_POINTERS;			\
+	/* copy arguments to stack */		\
+	MOVD	arg+16(FP), R3;			\
+	MOVWZ	argsize+24(FP), R4;			\
+	MOVD	R15, R5;				\
+	ADD	$(8-1), R5;			\
+	SUB	$1, R3;				\
+	ADD	R5, R4;				\
+	CMP	R5, R4;				\
+	BEQ	6(PC);				\
+	ADD	$1, R3;				\
+	ADD	$1, R5;				\
+	MOVBZ	0(R3), R6;			\
+	MOVBZ	R6, 0(R5);			\
+	BR	-6(PC);				\
+	/* call function */			\
+	MOVD	f+8(FP), R12;			\
+	MOVD	(R12), R8;			\
+	PCDATA  $PCDATA_StackMapIndex, $0;	\
+	BL	(R8);				\
+	/* copy return values back */		\
+	MOVD	arg+16(FP), R3;			\
+	MOVWZ	n+24(FP), R4;			\
+	MOVWZ	retoffset+28(FP), R6;		\
+	MOVD	R15, R5;				\
+	ADD	R6, R5; 			\
+	ADD	R6, R3;				\
+	SUB	R6, R4;				\
+	ADD	$(8-1), R5;			\
+	SUB	$1, R3;				\
+	ADD	R5, R4;				\
+loop:						\
+	CMP	R5, R4;				\
+	BEQ	end;				\
+	ADD	$1, R5;				\
+	ADD	$1, R3;				\
+	MOVBZ	0(R5), R6;			\
+	MOVBZ	R6, 0(R3);			\
+	BR	loop;				\
+end:						\
+	/* execute write barrier updates */	\
+	MOVD	argtype+0(FP), R7;		\
+	MOVD	arg+16(FP), R3;			\
+	MOVWZ	n+24(FP), R4;			\
+	MOVWZ	retoffset+28(FP), R6;		\
+	MOVD	R7, 8(R15);			\
+	MOVD	R3, 16(R15);			\
+	MOVD	R4, 24(R15);			\
+	MOVD	R6, 32(R15);			\
+	BL	runtime·callwritebarrier(SB);	\
+	RET
+
+CALLFN(·call32, 32)
+CALLFN(·call64, 64)
+CALLFN(·call128, 128)
+CALLFN(·call256, 256)
+CALLFN(·call512, 512)
+CALLFN(·call1024, 1024)
+CALLFN(·call2048, 2048)
+CALLFN(·call4096, 4096)
+CALLFN(·call8192, 8192)
+CALLFN(·call16384, 16384)
+CALLFN(·call32768, 32768)
+CALLFN(·call65536, 65536)
+CALLFN(·call131072, 131072)
+CALLFN(·call262144, 262144)
+CALLFN(·call524288, 524288)
+CALLFN(·call1048576, 1048576)
+CALLFN(·call2097152, 2097152)
+CALLFN(·call4194304, 4194304)
+CALLFN(·call8388608, 8388608)
+CALLFN(·call16777216, 16777216)
+CALLFN(·call33554432, 33554432)
+CALLFN(·call67108864, 67108864)
+CALLFN(·call134217728, 134217728)
+CALLFN(·call268435456, 268435456)
+CALLFN(·call536870912, 536870912)
+CALLFN(·call1073741824, 1073741824)
+
+TEXT runtime·procyield(SB),NOSPLIT,$0-0
+	RET
+
+// void jmpdefer(fv, sp);
+// called from deferreturn.
+// 1. grab stored LR for caller
+// 2. sub 6 bytes to get back to BL deferreturn (size of BRASL instruction)
+// 3. BR to fn
+TEXT runtime·jmpdefer(SB),NOSPLIT|NOFRAME,$0-16
+	MOVD	0(R15), R1
+	SUB	$6, R1, LR
+
+	MOVD	fv+0(FP), R12
+	MOVD	argp+8(FP), R15
+	SUB	$8, R15
+	MOVD	0(R12), R3
+	BR	(R3)
+
+// Save state of caller into g->sched: pc and sp are recorded; lr, ret and ctxt are zeroed.
+TEXT gosave<>(SB),NOSPLIT|NOFRAME,$0
+	MOVD	LR, (g_sched+gobuf_pc)(g)	// sched.pc = link register on entry
+	MOVD	R15, (g_sched+gobuf_sp)(g)	// sched.sp = current stack pointer
+	MOVD	$0, (g_sched+gobuf_lr)(g)
+	MOVD	$0, (g_sched+gobuf_ret)(g)
+	MOVD	$0, (g_sched+gobuf_ctxt)(g)
+	RET
+
+// func asmcgocall(fn, arg unsafe.Pointer) int32
+// Call fn(arg) on the scheduler stack,
+// aligned appropriately for the gcc ABI.
+// See cgocall.go for more details.
+TEXT ·asmcgocall(SB),NOSPLIT,$0-20
+	// R3 = fn; R4 = arg; R2 = saved stack pointer; R5 = saved g; R13 = g; R15 = stack pointer
+	// C TLS base pointer in AR0:AR1
+	MOVD	fn+0(FP), R3
+	MOVD	arg+8(FP), R4
+
+	MOVD	R15, R2		// save original stack pointer
+	MOVD	g, R5
+
+	// Figure out if we need to switch to m->g0 stack.
+	// We get called to create new OS threads too, and those
+	// come in on the m->g0 stack already.
+	MOVD	g_m(g), R6
+	MOVD	m_g0(R6), R6
+	CMPBEQ	R6, g, g0
+	BL	gosave<>(SB)
+	MOVD	R6, g
+	BL	runtime·save_g(SB)
+	MOVD	(g_sched+gobuf_sp)(g), R15
+
+	// Now on a scheduling stack (a pthread-created stack).
+g0:
+	// Save room for two of our pointers, plus 160 bytes of callee
+	// save area that lives on the caller stack.
+	SUB	$176, R15
+	MOVD	$~7, R6
+	AND	R6, R15                 // 8-byte alignment for gcc ABI
+	MOVD	R5, 168(R15)             // save old g on stack
+	MOVD	(g_stack+stack_hi)(R5), R5
+	SUB	R2, R5
+	MOVD	R5, 160(R15)             // save depth in old g stack (can't just save SP, as stack might be copied during a callback)
+	MOVD	R0, 0(R15)              // clear back chain pointer (TODO can we give it real back trace information?)
+	MOVD	R4, R2                  // arg in R2
+	BL	R3                      // can clobber: R0-R5, R14, F0-F3, F5, F7-F15
+
+	XOR	R0, R0                  // set R0 back to 0.
+	// Restore g, stack pointer.
+	MOVD	168(R15), g
+	BL	runtime·save_g(SB)
+	MOVD	(g_stack+stack_hi)(g), R5
+	MOVD	160(R15), R6
+	SUB	R6, R5
+	MOVD	R5, R15
+
+	MOVW	R2, ret+16(FP)
+	RET
+
+// cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
+// Turn the fn into a Go func (by taking its address) and call
+// cgocallback_gofunc.
+TEXT runtime·cgocallback(SB),NOSPLIT,$24-24
+	MOVD	$fn+0(FP), R3
+	MOVD	R3, 8(R15)
+	MOVD	frame+8(FP), R3
+	MOVD	R3, 16(R15)
+	MOVD	framesize+16(FP), R3
+	MOVD	R3, 24(R15)
+	MOVD	$runtime·cgocallback_gofunc(SB), R3
+	BL	(R3)
+	RET
+
+// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
+// See cgocall.go for more details.
+TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-24
+	NO_LOCAL_POINTERS
+
+	// Load m and g from thread-local storage.
+	MOVB	runtime·iscgo(SB), R3
+	CMPBEQ	R3, $0, nocgo
+	BL	runtime·load_g(SB)
+
+nocgo:
+	// If g is nil, Go did not create the current thread.
+	// Call needm to obtain one for temporary use.
+	// In this case, we're running on the thread stack, so there's
+	// lots of space, but the linker doesn't know. Hide the call from
+	// the linker analysis by using an indirect call.
+	CMPBEQ	g, $0, needm
+
+	MOVD	g_m(g), R8
+	MOVD	R8, savedm-8(SP)
+	BR	havem
+
+needm:
+	MOVD	g, savedm-8(SP) // g is zero, so is m.
+	MOVD	$runtime·needm(SB), R3
+	BL	(R3)
+
+	// Set m->sched.sp = SP, so that if a panic happens
+	// during the function we are about to execute, it will
+	// have a valid SP to run on the g0 stack.
+	// The next few lines (after the havem label)
+	// will save this SP onto the stack and then write
+	// the same SP back to m->sched.sp. That seems redundant,
+	// but if an unrecovered panic happens, unwindm will
+	// restore the g->sched.sp from the stack location
+	// and then systemstack will try to use it. If we don't set it here,
+	// that restored SP will be uninitialized (typically 0) and
+	// will not be usable.
+	MOVD	g_m(g), R8
+	MOVD	m_g0(R8), R3
+	MOVD	R15, (g_sched+gobuf_sp)(R3)
+
+havem:
+	// Now there's a valid m, and we're running on its m->g0.
+	// Save current m->g0->sched.sp on stack and then set it to SP.
+	// Save current sp in m->g0->sched.sp in preparation for
+	// switch back to m->curg stack.
+	// NOTE: unwindm knows that the saved g->sched.sp is at 8(R15) aka savedsp-16(SP).
+	MOVD	m_g0(R8), R3
+	MOVD	(g_sched+gobuf_sp)(R3), R4
+	MOVD	R4, savedsp-16(SP)
+	MOVD	R15, (g_sched+gobuf_sp)(R3)
+
+	// Switch to m->curg stack and call runtime.cgocallbackg.
+	// Because we are taking over the execution of m->curg
+	// but *not* resuming what had been running, we need to
+	// save that information (m->curg->sched) so we can restore it.
+	// We can restore m->curg->sched.sp easily, because calling
+	// runtime.cgocallbackg leaves SP unchanged upon return.
+	// To save m->curg->sched.pc, we push it onto the stack.
+	// This has the added benefit that it looks to the traceback
+	// routine like cgocallbackg is going to return to that
+	// PC (because the frame we allocate below has the same
+	// size as cgocallback_gofunc's frame declared above)
+	// so that the traceback will seamlessly trace back into
+	// the earlier calls.
+	//
+	// In the new goroutine, -16(SP) and -8(SP) are unused.
+	MOVD	m_curg(R8), g
+	BL	runtime·save_g(SB)
+	MOVD	(g_sched+gobuf_sp)(g), R4 // prepare stack as R4
+	MOVD	(g_sched+gobuf_pc)(g), R5
+	MOVD	R5, -24(R4)
+	MOVD	$-24(R4), R15
+	BL	runtime·cgocallbackg(SB)
+
+	// Restore g->sched (== m->curg->sched) from saved values.
+	MOVD	0(R15), R5
+	MOVD	R5, (g_sched+gobuf_pc)(g)
+	MOVD	$24(R15), R4
+	MOVD	R4, (g_sched+gobuf_sp)(g)
+
+	// Switch back to m->g0's stack and restore m->g0->sched.sp.
+	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
+	// so we do not have to restore it.)
+	MOVD	g_m(g), R8
+	MOVD	m_g0(R8), g
+	BL	runtime·save_g(SB)
+	MOVD	(g_sched+gobuf_sp)(g), R15
+	MOVD	savedsp-16(SP), R4
+	MOVD	R4, (g_sched+gobuf_sp)(g)
+
+	// If the m on entry was nil, we called needm above to borrow an m
+	// for the duration of the call. Since the call is over, return it with dropm.
+	MOVD	savedm-8(SP), R6
+	CMPBNE	R6, $0, droppedm
+	MOVD	$runtime·dropm(SB), R3
+	BL	(R3)
+droppedm:
+
+	// Done!
+	RET
+
+// void setg(G*); set g. for use by needm.
+TEXT runtime·setg(SB), NOSPLIT, $0-8
+	MOVD	gg+0(FP), g
+	// This only happens if iscgo, so jump straight to save_g
+	BL	runtime·save_g(SB)
+	RET
+
+// void setg_gcc(G*); set g in C TLS.
+// Must obey the gcc calling convention.
+TEXT setg_gcc<>(SB),NOSPLIT|NOFRAME,$0-0
+	// The standard prologue clobbers LR (R14), which is callee-save in
+	// the C ABI, so we have to use NOFRAME and save LR ourselves.
+	MOVD	LR, R1
+	// Also save g, R10, and R11 since they're callee-save in C ABI
+	MOVD	R10, R3
+	MOVD	g, R4
+	MOVD	R11, R5
+
+	MOVD	R2, g
+	BL	runtime·save_g(SB)
+
+	MOVD	R5, R11
+	MOVD	R4, g
+	MOVD	R3, R10
+	MOVD	R1, LR
+	RET
+
+TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
+	MOVD	16(R15), R3		// LR saved by caller
+	MOVD	runtime·stackBarrierPC(SB), R4
+	CMPBNE	R3, R4, nobar
+	// Get original return PC.
+	BL	runtime·nextBarrierPC(SB)
+	MOVD	8(R15), R3
+nobar:
+	MOVD	R3, ret+8(FP)
+	RET
+
+TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
+	MOVD	pc+8(FP), R3
+	MOVD	16(R15), R4
+	MOVD	runtime·stackBarrierPC(SB), R5
+	CMPBEQ	R4, R5, setbar
+	MOVD	R3, 16(R15)		// set LR in caller
+	RET
+setbar:
+	// Set the stack barrier return PC.
+	MOVD	R3, 8(R15)
+	BL	runtime·setNextBarrierPC(SB)
+	RET
+
+TEXT runtime·getcallersp(SB),NOSPLIT,$0-16
+	MOVD	argp+0(FP), R3	// R3 = the argp argument
+	SUB	$8, R3	// result is argp - 8; presumably steps over the slot below the caller's arguments - confirm
+	MOVD	R3, ret+8(FP)
+	RET
+
+TEXT runtime·abort(SB),NOSPLIT|NOFRAME,$0-0
+	MOVW	(R0), R0	// load through R0; other routines in this file keep R0 zero, so this presumably faults - confirm
+	UNDEF
+
+// int64 runtime·cputicks(void)
+// Returns the STORE CLOCK value with its most significant bit cleared.
+TEXT runtime·cputicks(SB),NOSPLIT,$0-8
+	// The TOD clock on s390 counts from the year 1900 in ~250ps intervals.
+	// This means that since about 1972 the msb has been set, making the
+	// result of a call to STORE CLOCK (stck) a negative number.
+	// We clear the msb to make it positive.
+	STCK	ret+0(FP)      // serialises before and after call
+	MOVD	ret+0(FP), R3  // R3 will wrap to 0 in the year 2043
+	SLD	$1, R3         // shift the msb out ...
+	SRD	$1, R3         // ... and shift back, leaving the top bit clear
+	MOVD	R3, ret+0(FP)
+	RET
+
+// memhash_varlen(p unsafe.Pointer, h seed) uintptr
+// redirects to memhash(p, h, size) using the size
+// stored in the closure.
+TEXT runtime·memhash_varlen(SB),NOSPLIT,$40-24
+	GO_ARGS
+	NO_LOCAL_POINTERS
+	MOVD	p+0(FP), R3
+	MOVD	h+8(FP), R4
+	MOVD	8(R12), R5	// size, read from offset 8 of the closure (R12)
+	MOVD	R3, 8(R15)	// outgoing arg 0: p
+	MOVD	R4, 16(R15)	// outgoing arg 1: h
+	MOVD	R5, 24(R15)	// outgoing arg 2: size
+	BL	runtime·memhash(SB)
+	MOVD	32(R15), R3	// memhash result
+	MOVD	R3, ret+16(FP)
+	RET
+
+// AES hashing not implemented for s390x
+// Each stub consists of a single load through R0 into R15.
+// NOTE(review): with R0 expected to be 0 this presumably faults on purpose
+// if a stub is ever reached - confirm.
+TEXT runtime·aeshash(SB),NOSPLIT|NOFRAME,$0-0
+	MOVW	(R0), R15
+TEXT runtime·aeshash32(SB),NOSPLIT|NOFRAME,$0-0
+	MOVW	(R0), R15
+TEXT runtime·aeshash64(SB),NOSPLIT|NOFRAME,$0-0
+	MOVW	(R0), R15
+TEXT runtime·aeshashstr(SB),NOSPLIT|NOFRAME,$0-0
+	MOVW	(R0), R15
+
+// memequal(p, q unsafe.Pointer, size uintptr) bool
+// Loads the arguments into the registers memeqbody expects and tail-branches
+// to it; memeqbody writes the result byte through R7.
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
+	MOVD	p+0(FP), R3
+	MOVD	q+8(FP), R5
+	MOVD	size+16(FP), R6
+	LA	ret+24(FP), R7
+	BR	runtime·memeqbody(SB)
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+// Like memequal, but the length comes from the closure rather than from an
+// argument. Tail-branches to memeqbody, which writes the result through R7.
+TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-17
+	MOVD	a+0(FP), R3
+	MOVD	b+8(FP), R5
+	MOVD	8(R12), R6    // compiler stores size at offset 8 in the closure
+	LA	ret+16(FP), R7
+	BR	runtime·memeqbody(SB)
+
+// eqstring tests whether two strings are equal.
+// The compiler guarantees that strings passed
+// to eqstring have equal length.
+// See runtime_test.go:eqstring_generic for
+// equivalent Go code.
+// Only s1's length is loaded; the comparison itself is done by memeqbody.
+TEXT runtime·eqstring(SB),NOSPLIT|NOFRAME,$0-33
+	MOVD	s1str+0(FP), R3
+	MOVD	s1len+8(FP), R6
+	MOVD	s2str+16(FP), R5
+	LA	ret+32(FP), R7
+	BR	runtime·memeqbody(SB)
+
+// bytes·Equal stores 0 in ret when the two slice lengths differ; otherwise
+// it tail-branches to memeqbody to compare the contents.
+TEXT bytes·Equal(SB),NOSPLIT|NOFRAME,$0-49
+	MOVD	a_len+8(FP), R2
+	MOVD	b_len+32(FP), R6
+	MOVD	a+0(FP), R3
+	MOVD	b+24(FP), R5
+	LA	ret+48(FP), R7
+	CMPBNE	R2, R6, notequal	// different lengths can never be equal
+	BR	runtime·memeqbody(SB)
+notequal:
+	MOVB	$0, ret+48(FP)
+	RET
+
+// memeqbody stores 1 through R7 when the len bytes at a and b are identical
+// and 0 otherwise.
+//
+// input:
+//   R3 = a
+//   R5 = b
+//   R6 = len
+//   R7 = address of output byte (stores 0 or 1 here)
+//   a and b have the same length
+TEXT runtime·memeqbody(SB),NOSPLIT|NOFRAME,$0-0
+	CMPBEQ	R3, R5, equal	// same pointer: nothing to compare
+loop:
+	CMPBEQ	R6, $0, equal
+	CMPBLT	R6, $32, tiny	// fewer than 32 bytes left: use plain loads
+	CMP	R6, $256
+	BLT	tail
+	// At least 256 bytes left: compare one full chunk and advance.
+	CLC	$256, 0(R3), 0(R5)
+	BNE	notequal
+	SUB	$256, R6
+	LA	256(R3), R3
+	LA	256(R5), R5
+	BR	loop
+tail:
+	// 32..255 bytes left: run the CLC in memeqbodyclc through EXRL,
+	// passing len-1 in R8.
+	SUB	$1, R6, R8
+	EXRL	$runtime·memeqbodyclc(SB), R8
+	BEQ	equal
+notequal:
+	MOVB	$0, 0(R7)
+	RET
+equal:
+	MOVB	$1, 0(R7)
+	RET
+tiny:
+	// R2 is the running byte offset into both buffers; R6 counts the
+	// bytes still to be compared.
+	MOVD	$0, R2
+	CMPBLT	R6, $16, lt16
+	MOVD	0(R3), R8
+	MOVD	0(R5), R9
+	CMPBNE	R8, R9, notequal
+	MOVD	8(R3), R8
+	MOVD	8(R5), R9
+	CMPBNE	R8, R9, notequal
+	LA	16(R2), R2
+	SUB	$16, R6
+lt16:
+	CMPBLT	R6, $8, lt8
+	MOVD	0(R3)(R2*1), R8
+	MOVD	0(R5)(R2*1), R9
+	CMPBNE	R8, R9, notequal
+	LA	8(R2), R2
+	SUB	$8, R6
+lt8:
+	CMPBLT	R6, $4, lt4
+	MOVWZ	0(R3)(R2*1), R8
+	MOVWZ	0(R5)(R2*1), R9
+	CMPBNE	R8, R9, notequal
+	LA	4(R2), R2
+	SUB	$4, R6
+lt4:
+// CHECK(n) finishes with "equal" once exactly n bytes remained; otherwise it
+// compares the byte at offset n past R2.
+#define CHECK(n) \
+	CMPBEQ	R6, $n, equal \
+	MOVB	n(R3)(R2*1), R8 \
+	MOVB	n(R5)(R2*1), R9 \
+	CMPBNE	R8, R9, notequal
+	CHECK(0)
+	CHECK(1)
+	CHECK(2)
+	CHECK(3)
+	BR	equal
+
+// memeqbodyclc holds the CLC instruction that memeqbody's tail executes
+// through EXRL, with the length supplied in R8.
+TEXT runtime·memeqbodyclc(SB),NOSPLIT|NOFRAME,$0-0
+	CLC	$1, 0(R3), 0(R5)
+	RET
+
+// fastrand1 advances the per-m state m.fastrand and returns the new value:
+// the state is doubled and, if the 32-bit result is negative, XORed with
+// 0x88888eef.
+TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
+	MOVD	g_m(g), R4
+	MOVWZ	m_fastrand(R4), R3
+	ADD	R3, R3			// x += x
+	CMPW	R3, $0
+	BGE	2(PC)			// skip the XOR when the 32-bit value is non-negative
+	XOR	$0x88888eef, R3
+	MOVW	R3, m_fastrand(R4)
+	MOVW	R3, ret+0(FP)
+	RET
+
+// bytes·IndexByte loads its arguments into the registers indexbytebody
+// expects and tail-branches to it.
+TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
+	MOVD	s+0(FP), R3     // s => R3
+	MOVD	s_len+8(FP), R4 // s_len => R4
+	MOVBZ	c+24(FP), R5    // c => R5
+	MOVD	$ret+32(FP), R2 // &ret => R2
+	BR	runtime·indexbytebody(SB)
+
+// strings·IndexByte loads its arguments into the registers indexbytebody
+// expects and tail-branches to it.
+TEXT strings·IndexByte(SB),NOSPLIT,$0-32
+	MOVD	s+0(FP), R3     // s => R3
+	MOVD	s_len+8(FP), R4 // s_len => R4
+	MOVBZ	c+16(FP), R5    // c => R5
+	MOVD	$ret+24(FP), R2 // &ret => R2
+	BR	runtime·indexbytebody(SB)
+
+// indexbytebody stores through R2 the index of the first byte equal to c in
+// s, or -1 if there is none. Inputs shorter than 16 bytes use a byte loop;
+// longer inputs use the vector facility when it is available and the SRST
+// instruction otherwise.
+//
+// input:
+// R3: s
+// R4: s_len
+// R5: c -- byte sought
+// R2: &ret -- address to put index into
+TEXT runtime·indexbytebody(SB),NOSPLIT,$0
+	CMPBEQ	R4, $0, notfound
+	MOVD	R3, R6          // store base for later
+	ADD	R3, R4, R8      // the address after the end of the string
+	//if the length is small, use loop; otherwise, use vector or srst search
+	CMPBGE	R4, $16, large
+
+residual:
+	// Byte-at-a-time scan of [R3, R8).
+	CMPBEQ	R3, R8, notfound
+	MOVBZ	0(R3), R7
+	LA	1(R3), R3
+	CMPBNE	R7, R5, residual
+
+found:
+	// R3 points one past the matching byte: index = R3 - base - 1.
+	SUB	R6, R3
+	SUB	$1, R3
+	MOVD	R3, 0(R2)
+	RET
+
+notfound:
+	MOVD	$-1, 0(R2)
+	RET
+
+large:
+	MOVB	runtime·vectorfacility(SB), R1
+	CMPBEQ	R1, $-1, checkvector	// vectorfacility = -1, vector not checked yet
+vectorchecked:
+	CMPBEQ	R1, $1, vectorimpl      // vectorfacility = 1, vector supported
+
+srstimpl:                       // vectorfacility != 1, not support or enable vector
+	MOVBZ	R5, R0          // c needs to be in R0, leave until last minute as currently R0 is expected to be 0
+srstloop:
+	WORD	$0xB25E0083     // srst %r8, %r3 (search the range [R3, R8))
+	BVS	srstloop        // interrupted - continue
+	BGT	notfoundr0
+foundr0:
+	XOR	R0, R0          // reset R0
+	SUB	R6, R8          // remove base
+	MOVD	R8, 0(R2)
+	RET
+notfoundr0:
+	XOR	R0, R0          // reset R0
+	MOVD	$-1, 0(R2)
+	RET
+
+vectorimpl:
+	//if the address is not 16byte aligned, use loop for the header
+	AND	$15, R3, R8
+	CMPBGT	R8, $0, notaligned
+
+aligned:
+	ADD	R6, R4, R8      // recompute the end address (R8 was used as scratch)
+	AND	$-16, R8, R7    // R7 = end address rounded down to a multiple of 16
+	// replicate c across V17
+	VLVGB	$0, R5, V19
+	VREPB	$0, V19, V17
+
+vectorloop:
+	CMPBGE	R3, R7, residual // fewer than 16 bytes left: finish with the byte loop
+	VL	0(R3), V16    // load string to be searched into V16
+	ADD	$16, R3
+	VFEEBS	V16, V17, V18 // search V17 in V16 and set conditional code accordingly
+	BVS	vectorloop
+
+	// when vector search found c in the string
+	VLGVB	$7, V18, R7   // load 7th element of V18 containing index into R7
+	SUB	$16, R3       // back to the start of the block that matched
+	SUB	R6, R3        // offset of that block from the base
+	ADD	R3, R7        // plus the index inside the block
+	MOVD	R7, 0(R2)
+	RET
+
+notaligned:
+	// Scan byte by byte up to the next 16-byte boundary (held in R8).
+	AND	$-16, R3, R8
+	ADD     $16, R8
+notalignedloop:
+	CMPBEQ	R3, R8, aligned
+	MOVBZ	0(R3), R7
+	LA	1(R3), R3
+	CMPBNE	R7, R5, notalignedloop
+	BR	found
+
+checkvector:
+	// First use: let checkvectorfacility set the flag, then re-read it.
+	CALL	runtime·checkvectorfacility(SB)
+	MOVB    runtime·vectorfacility(SB), R1
+	BR	vectorchecked
+
+// return0 sets R3 to 0 and returns.
+TEXT runtime·return0(SB), NOSPLIT, $0
+	MOVW	$0, R3
+	RET
+
+// Called from cgo wrappers, this function returns g->m->curg.stack.hi.
+// Must obey the gcc calling convention.
+// The result is left in R2.
+TEXT _cgo_topofstack(SB),NOSPLIT|NOFRAME,$0
+	// g (R13), R10, R11 and LR (R14) are callee-save in the C ABI, so save them
+	MOVD	g, R1
+	MOVD	R10, R3
+	MOVD	LR, R4
+	MOVD	R11, R5
+
+	BL	runtime·load_g(SB)	// clobbers g (R13), R10, R11
+	MOVD	g_m(g), R2		// R2 = g.m
+	MOVD	m_curg(R2), R2		// R2 = g.m.curg
+	MOVD	(g_stack+stack_hi)(R2), R2	// R2 = g.m.curg.stack.hi
+
+	// Restore the registers saved on entry.
+	MOVD	R1, g
+	MOVD	R3, R10
+	MOVD	R4, LR
+	MOVD	R5, R11
+	RET
+
+// The top-most function running on a goroutine
+// returns to goexit+PCQuantum.
+// The leading 2-byte nop is what sits at goexit itself, so the call to
+// goexit1 is the first instruction after it.
+TEXT runtime·goexit(SB),NOSPLIT|NOFRAME,$0-0
+	BYTE $0x07; BYTE $0x00; // 2-byte nop
+	BL	runtime·goexit1(SB)	// does not return
+	// traceback from goexit1 must hit code range of goexit
+	BYTE $0x07; BYTE $0x00; // 2-byte nop
+
+// The prefetch entry points below return immediately without touching
+// their argument.
+TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
+	RET
+
+TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
+	RET
+
+TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
+	RET
+
+TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
+	RET
+
+// sigreturn is an empty stub that returns immediately.
+TEXT runtime·sigreturn(SB),NOSPLIT,$0-8
+	RET
+
+// publicationBarrier executes a SYNC instruction and returns.
+TEXT ·publicationBarrier(SB),NOSPLIT|NOFRAME,$0-0
+	SYNC
+	RET
+
+// cmpstring loads both strings' pointers and lengths into the registers
+// cmpbody expects and tail-branches to it; cmpbody writes -1/0/1 through R7.
+TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
+	MOVD	s1_base+0(FP), R3
+	MOVD	s1_len+8(FP), R4
+	MOVD	s2_base+16(FP), R5
+	MOVD	s2_len+24(FP), R6
+	LA	ret+32(FP), R7
+	BR	runtime·cmpbody(SB)
+
+// bytes·Compare loads both slices' pointers and lengths into the registers
+// cmpbody expects and tail-branches to it; cmpbody writes -1/0/1 through R7.
+TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56
+	MOVD	s1+0(FP), R3	// s1 data pointer
+	MOVD	s1+8(FP), R4	// s1 length (passed to cmpbody as alen)
+	MOVD	s2+24(FP), R5	// s2 data pointer
+	MOVD	s2+32(FP), R6	// s2 length (passed to cmpbody as blen)
+	LA	res+48(FP), R7
+	BR	runtime·cmpbody(SB)
+
+// cmpbody lexicographically compares the byte sequences a and b and stores
+// the three-way result through R7.
+//
+// input:
+//   R3 = a
+//   R4 = alen
+//   R5 = b
+//   R6 = blen
+//   R7 = address of output word (stores -1/0/1 here)
+//
+// The first min(alen, blen) bytes are compared in 256-byte chunks with CLC,
+// followed by one EXRL-executed CLC (see cmpbodyclc) for the remaining
+// 1..256 bytes. If that common prefix is equal, the lengths decide.
+TEXT runtime·cmpbody(SB),NOSPLIT|NOFRAME,$0-0
+	CMPBEQ	R3, R5, cmplengths	// same base pointer: only the lengths can differ
+	MOVD	R4, R8			// R8 = min(alen, blen)
+	CMPBLE	R4, R6, amin
+	MOVD	R6, R8
+amin:
+	CMPBEQ	R8, $0, cmplengths
+	CMP	R8, $256
+	BLE	tail
+loop:
+	CLC	$256, 0(R3), 0(R5)
+	BGT	gt
+	BLT	lt
+	SUB	$256, R8
+	// Step both pointers past the chunk just compared. Without this every
+	// iteration, and the tail, would keep comparing the first 256 bytes.
+	LA	256(R3), R3
+	LA	256(R5), R5
+	CMP	R8, $256
+	BGT	loop
+tail:
+	// 1..256 bytes left: execute the CLC in cmpbodyclc with len-1 in R8.
+	SUB	$1, R8
+	EXRL	$runtime·cmpbodyclc(SB), R8
+	BGT	gt
+	BLT	lt
+cmplengths:
+	// Common prefix is equal: the shorter input sorts first.
+	CMP	R4, R6
+	BEQ	eq
+	BLT	lt
+gt:
+	MOVD	$1, 0(R7)
+	RET
+lt:
+	MOVD	$-1, 0(R7)
+	RET
+eq:
+	MOVD	$0, 0(R7)
+	RET
+
+// cmpbodyclc holds the CLC instruction that cmpbody's tail executes through
+// EXRL, with the length supplied in R8.
+TEXT runtime·cmpbodyclc(SB),NOSPLIT|NOFRAME,$0-0
+	CLC	$1, 0(R3), 0(R5)
+	RET
+
+// This is called from .init_array and follows the platform, not Go, ABI.
+// We are overly conservative. We could only save the registers we use.
+// However, since this function is only called once per loaded module
+// performance is unimportant.
+TEXT runtime·addmoduledata(SB),NOSPLIT|NOFRAME,$0-0
+	// Save R6-R15, F0, F2, F4 and F6 in the
+	// register save area of the calling function
+	STMG	R6, R15, 48(R15)
+	FMOVD	F0, 128(R15)
+	FMOVD	F2, 136(R15)
+	FMOVD	F4, 144(R15)
+	FMOVD	F6, 152(R15)
+
+	// append the argument (passed in R2, as per the ELF ABI) to the
+	// moduledata linked list.
+	MOVD	runtime·lastmoduledatap(SB), R1
+	MOVD	R2, moduledata_next(R1)
+	MOVD	R2, runtime·lastmoduledatap(SB)
+
+	// Restore R6-R15, F0, F2, F4 and F6.
+	// The floating-point registers are loaded back from the save area;
+	// storing them again, as this code did before, restores nothing.
+	LMG	48(R15), R6, R15
+	FMOVD	128(R15), F0
+	FMOVD	136(R15), F2
+	FMOVD	144(R15), F4
+	FMOVD	152(R15), F6
+	RET
+
+// checkASM stores 1 (true) in its result.
+TEXT ·checkASM(SB),NOSPLIT,$0-1
+	MOVB	$1, ret+0(FP)
+	RET
diff --git a/src/runtime/atomic_pointer.go b/src/runtime/atomic_pointer.go
index bd21b49945..4fe334014d 100644
--- a/src/runtime/atomic_pointer.go
+++ b/src/runtime/atomic_pointer.go
@@ -15,13 +15,12 @@ import (
 // escape analysis decisions about the pointer value being stored.
 // Instead, these are wrappers around the actual atomics (casp1 and so on)
 // that use noescape to convey which arguments do not escape.
-//
-// Additionally, these functions must update the shadow heap for
-// write barrier checking.
 
+// atomicstorep performs *ptr = new atomically and invokes a write barrier.
+//
 //go:nosplit
 func atomicstorep(ptr unsafe.Pointer, new unsafe.Pointer) {
-	atomic.Storep1(noescape(ptr), new)
+	atomic.StorepNoWB(noescape(ptr), new)
 	writebarrierptr_nostore((*uintptr)(ptr), uintptr(new))
 }
 
@@ -45,7 +44,6 @@ func sync_atomic_StoreUintptr(ptr *uintptr, new uintptr)
 //go:nosplit
 func sync_atomic_StorePointer(ptr *unsafe.Pointer, new unsafe.Pointer) {
 	sync_atomic_StoreUintptr((*uintptr)(unsafe.Pointer(ptr)), uintptr(new))
-	atomic.Storep1(noescape(unsafe.Pointer(ptr)), new)
 	writebarrierptr_nostore((*uintptr)(unsafe.Pointer(ptr)), uintptr(new))
 }
 
@@ -54,9 +52,9 @@ func sync_atomic_SwapUintptr(ptr *uintptr, new uintptr) uintptr
 
 //go:linkname sync_atomic_SwapPointer sync/atomic.SwapPointer
 //go:nosplit
-func sync_atomic_SwapPointer(ptr unsafe.Pointer, new unsafe.Pointer) unsafe.Pointer {
-	old := unsafe.Pointer(sync_atomic_SwapUintptr((*uintptr)(noescape(ptr)), uintptr(new)))
-	writebarrierptr_nostore((*uintptr)(ptr), uintptr(new))
+func sync_atomic_SwapPointer(ptr *unsafe.Pointer, new unsafe.Pointer) unsafe.Pointer {
+	old := unsafe.Pointer(sync_atomic_SwapUintptr((*uintptr)(noescape(unsafe.Pointer(ptr))), uintptr(new)))
+	writebarrierptr_nostore((*uintptr)(unsafe.Pointer(ptr)), uintptr(new))
 	return old
 }
 
diff --git a/src/runtime/cgo/asm_s390x.s b/src/runtime/cgo/asm_s390x.s
new file mode 100644
index 0000000000..5ed13cfe1e
--- /dev/null
+++ b/src/runtime/cgo/asm_s390x.s
@@ -0,0 +1,44 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+/*
+ * void crosscall2(void (*fn)(void*, int32), void*, int32)
+ * Save registers and call fn with two arguments.
+ * crosscall2 obeys the C ABI; fn obeys the Go ABI.
+ */
+TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0
+	// Start with standard C stack frame layout and linkage
+
+	// Save R6-R15, F0, F2, F4 and F6 in the
+	// register save area of the calling function
+	STMG	R6, R15, 48(R15)
+	FMOVD	F0, 128(R15)
+	FMOVD	F2, 136(R15)
+	FMOVD	F4, 144(R15)
+	FMOVD	F6, 152(R15)
+
+	// Initialize Go ABI environment
+	XOR	R0, R0
+	BL	runtime·load_g(SB)
+
+	// Allocate 24 bytes on the stack
+	SUB	$24, R15
+
+	MOVD	R3, 8(R15)  // arg1
+	MOVW	R4, 16(R15) // arg2
+	BL	(R2)        // fn(arg1, arg2)
+
+	ADD	$24, R15
+
+	// Restore R6-R15, then load F0, F2, F4 and F6 back from the save area
+	LMG	48(R15), R6, R15
+	FMOVD	128(R15), F0
+	FMOVD	136(R15), F2
+	FMOVD	144(R15), F4
+	FMOVD	152(R15), F6
+
+	RET
+
diff --git a/src/runtime/cgo/gcc_libinit.c b/src/runtime/cgo/gcc_libinit.c
index bdbaa2973c..06b9557709 100644
--- a/src/runtime/cgo/gcc_libinit.c
+++ b/src/runtime/cgo/gcc_libinit.c
@@ -4,7 +4,6 @@
 
 // +build cgo
 // +build darwin dragonfly freebsd linux netbsd solaris
-// +build !ppc64,!ppc64le
 
 #include <pthread.h>
 #include <stdio.h>
diff --git a/src/runtime/cgo/gcc_libinit_linux_ppc64x.c b/src/runtime/cgo/gcc_libinit_linux_ppc64x.c
deleted file mode 100644
index c133142f93..0000000000
--- a/src/runtime/cgo/gcc_libinit_linux_ppc64x.c
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// TODO: see issue #10410
-// +build linux
-// +build ppc64 ppc64le
-
-#include <stdio.h>
-#include <stdlib.h>
-
-void
-x_cgo_sys_thread_create(void* (*func)(void*), void* arg) {
-	fprintf(stderr, "x_cgo_sys_thread_create not implemented");
-	abort();
-}
-
-void
-_cgo_wait_runtime_init_done() {
-	// TODO(spetrovic): implement this method.
-}
-
-void
-x_cgo_notify_runtime_init_done(void* dummy) {
-	// TODO(spetrovic): implement this method.
-}
-\ No newline at end of file
diff --git a/src/runtime/cgo/gcc_linux_s390x.c b/src/runtime/cgo/gcc_linux_s390x.c
new file mode 100644
index 0000000000..81e3b339b0
--- /dev/null
+++ b/src/runtime/cgo/gcc_linux_s390x.c
@@ -0,0 +1,68 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <pthread.h>
+#include <string.h>
+#include <signal.h>
+#include "libcgo.h"
+
+static void *threadentry(void*);
+
+void (*x_cgo_inittls)(void **tlsg, void **tlsbase);
+static void (*setg_gcc)(void*);
+
+void
+x_cgo_init(G *g, void (*setg)(void*), void **tlsbase)
+{
+	pthread_attr_t attr;
+	size_t size;
+
+	setg_gcc = setg;	// remembered so threadentry can set g on new threads
+	pthread_attr_init(&attr);
+	pthread_attr_getstacksize(&attr, &size);	// stack size of a default-initialised attr
+	g->stacklo = (uintptr)&attr - size + 4096;	// address of a local, minus that size, plus 4096
+	pthread_attr_destroy(&attr);
+}
+
+void
+_cgo_sys_thread_start(ThreadStart *ts)
+{
+	pthread_attr_t attr;
+	sigset_t ign, oset;
+	pthread_t p;
+	size_t size;
+	int err;
+
+	sigfillset(&ign);
+	pthread_sigmask(SIG_SETMASK, &ign, &oset);	// block every signal around pthread_create
+
+	pthread_attr_init(&attr);
+	pthread_attr_getstacksize(&attr, &size);
+	// Leave stacklo=0 and set stackhi=size; mstack will do the rest.
+	ts->g->stackhi = size;
+	err = pthread_create(&p, &attr, threadentry, ts);
+
+	pthread_sigmask(SIG_SETMASK, &oset, nil);	// put the previous signal mask back
+
+	if (err != 0) {
+		fatalf("pthread_create failed: %s", strerror(err));
+	}
+}
+
+extern void crosscall_s390x(void (*fn)(void), void *g);
+
+static void*
+threadentry(void *v)
+{
+	ThreadStart ts;
+
+	ts = *(ThreadStart*)v;	// take a private copy before releasing v
+	free(v);
+
+	// Save g for this thread in C TLS
+	setg_gcc((void*)ts.g);
+
+	crosscall_s390x(ts.fn, (void*)ts.g);	// enter Go code through the assembly trampoline
+	return nil;
+}
diff --git a/src/runtime/cgo/gcc_s390x.S b/src/runtime/cgo/gcc_s390x.S
new file mode 100644
index 0000000000..6b163d0d21
--- /dev/null
+++ b/src/runtime/cgo/gcc_s390x.S
@@ -0,0 +1,43 @@
+// Copyright 2016 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/*
+ * void crosscall_s390x(void (*fn)(void), void *g)
+ *
+ * Calling into the go tool chain, where all registers are caller save.
+ * Called from standard s390x C ABI, where r6-r13, r15, and f0, f2, f4 and f6 are
+ * callee-save, so they must be saved explicitly.
+ */
+.globl crosscall_s390x
+crosscall_s390x:
+	/*
+	 * save r6-r15, f0, f2, f4 and f6 in the
+	 * register save area of the calling function
+	 */
+	stmg	%r6, %r15, 48(%r15)
+	stdy	%f0, 128(%r15)
+	stdy	%f2, 136(%r15)
+	stdy	%f4, 144(%r15)
+	stdy	%f6, 152(%r15)
+
+	/* clear r0 for the Go code; fn stays in r2 and the stack pointer in r15 */
+	xgr	%r0, %r0
+
+	/* grow stack 8 bytes and call fn */
+	agfi    %r15, -8
+	basr    %r14, %r2
+	agfi	%r15, 8
+
+	/* restore registers */
+	lmg	%r6, %r15, 48(%r15)
+	ldy	%f0, 128(%r15)
+	ldy	%f2, 136(%r15)
+	ldy	%f4, 144(%r15)
+	ldy	%f6, 152(%r15)
+
+	br      %r14 /* restored by lmg */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/runtime/cgo/signal_darwin_armx.go b/src/runtime/cgo/signal_darwin_armx.go
index 9c1ba5dee1..9f6741eb08 100644
--- a/src/runtime/cgo/signal_darwin_armx.go
+++ b/src/runtime/cgo/signal_darwin_armx.go
@@ -13,10 +13,14 @@ import "unsafe"
 //go:linkname x_cgo_panicmem x_cgo_panicmem
 var x_cgo_panicmem uintptr
 
+// use a pointer to avoid relocation of external symbol in __TEXT
+// make linker happy
+var _cgo_panicmem = &x_cgo_panicmem
+
 // TODO(crawshaw): move this into x_cgo_init, it will not run until
 // runtime has finished loading, which may be after its use.
 func init() {
-	x_cgo_panicmem = funcPC(panicmem)
+	*_cgo_panicmem = funcPC(panicmem)
 }
 
 func funcPC(f interface{}) uintptr {
diff --git a/src/runtime/cgocall.go b/src/runtime/cgocall.go
index d5248803a4..c6000bf98f 100644
--- a/src/runtime/cgocall.go
+++ b/src/runtime/cgocall.go
@@ -246,8 +246,8 @@ func cgocallbackg1() {
 	case "386":
 		// On 386, stack frame is three words, plus caller PC.
 		cb = (*args)(unsafe.Pointer(sp + 4*sys.PtrSize))
-	case "ppc64", "ppc64le":
-		// On ppc64, the callback arguments are in the arguments area of
+	case "ppc64", "ppc64le", "s390x":
+		// On ppc64 and s390x, the callback arguments are in the arguments area of
 		// cgocallback's stack frame. The stack looks like this:
 		// +--------------------+------------------------------+
 		// |                    | ...                          |
@@ -300,7 +300,7 @@ func unwindm(restore *bool) {
 	switch GOARCH {
 	default:
 		throw("unwindm not implemented")
-	case "386", "amd64", "arm", "ppc64", "ppc64le":
+	case "386", "amd64", "arm", "ppc64", "ppc64le", "s390x":
 		sched.sp = *(*uintptr)(unsafe.Pointer(sched.sp + sys.MinFrameSize))
 	case "arm64":
 		sched.sp = *(*uintptr)(unsafe.Pointer(sched.sp + 16))
diff --git a/src/runtime/chan.go b/src/runtime/chan.go
index 954b389f47..712ad8cef9 100644
--- a/src/runtime/chan.go
+++ b/src/runtime/chan.go
@@ -64,7 +64,7 @@ func makechan(t *chantype, size int64) *hchan {
 		throw("makechan: bad alignment")
 	}
 	if size < 0 || int64(uintptr(size)) != size || (elem.size > 0 && uintptr(size) > (_MaxMem-hchanSize)/elem.size) {
-		panic("makechan: size out of range")
+		panic(plainError("makechan: size out of range"))
 	}
 
 	var c *hchan
@@ -74,7 +74,7 @@ func makechan(t *chantype, size int64) *hchan {
 		// buf points into the same allocation, elemtype is persistent.
 		// SudoG's are referenced from their owning thread so they can't be collected.
 		// TODO(dvyukov,rlh): Rethink when collector can move allocated objects.
-		c = (*hchan)(mallocgc(hchanSize+uintptr(size)*elem.size, nil, flagNoScan))
+		c = (*hchan)(mallocgc(hchanSize+uintptr(size)*elem.size, nil, true))
 		if size > 0 && elem.size != 0 {
 			c.buf = add(unsafe.Pointer(c), hchanSize)
 		} else {
@@ -84,7 +84,7 @@ func makechan(t *chantype, size int64) *hchan {
 		}
 	} else {
 		c = new(hchan)
-		c.buf = newarray(elem, uintptr(size))
+		c.buf = newarray(elem, int(size))
 	}
 	c.elemsize = uint16(elem.size)
 	c.elemtype = elem
@@ -171,7 +171,7 @@ func chansend(t *chantype, c *hchan, ep unsafe.Pointer, block bool, callerpc uin
 
 	if c.closed != 0 {
 		unlock(&c.lock)
-		panic("send on closed channel")
+		panic(plainError("send on closed channel"))
 	}
 
 	if sg := c.recvq.dequeue(); sg != nil {
@@ -231,7 +231,7 @@ func chansend(t *chantype, c *hchan, ep unsafe.Pointer, block bool, callerpc uin
 		if c.closed == 0 {
 			throw("chansend: spurious wakeup")
 		}
-		panic("send on closed channel")
+		panic(plainError("send on closed channel"))
 	}
 	gp.param = nil
 	if mysg.releasetime > 0 {
@@ -302,13 +302,13 @@ func sendDirect(t *_type, sg *sudog, src unsafe.Pointer) {
 
 func closechan(c *hchan) {
 	if c == nil {
-		panic("close of nil channel")
+		panic(plainError("close of nil channel"))
 	}
 
 	lock(&c.lock)
 	if c.closed != 0 {
 		unlock(&c.lock)
-		panic("close of closed channel")
+		panic(plainError("close of closed channel"))
 	}
 
 	if raceenabled {
diff --git a/src/runtime/crash_test.go b/src/runtime/crash_test.go
index 85fcc69fed..2941b8e8f8 100644
--- a/src/runtime/crash_test.go
+++ b/src/runtime/crash_test.go
@@ -273,6 +273,52 @@ func TestGoexitInPanic(t *testing.T) {
 	}
 }
 
+// Issue 14965: Runtime panics should be of type runtime.Error
+//
+// Each case triggers one runtime panic; the value recovered from it must
+// implement runtime.Error.
+func TestRuntimePanicWithRuntimeError(t *testing.T) {
+	testCases := [...]func(){
+		0: func() {
+			var m map[uint64]bool
+			m[1234] = true // assignment to entry in nil map
+		},
+		1: func() {
+			ch := make(chan struct{})
+			close(ch)
+			close(ch) // close of closed channel
+		},
+		2: func() {
+			var ch = make(chan struct{})
+			close(ch)
+			ch <- struct{}{} // send on closed channel
+		},
+		3: func() {
+			var s = make([]int, 2)
+			_ = s[2] // index out of range
+		},
+		4: func() {
+			n := -1
+			_ = make(chan bool, n) // makechan: size out of range
+		},
+		5: func() {
+			close((chan bool)(nil)) // close of nil channel
+		},
+	}
+
+	for i, fn := range testCases {
+		got := panicValue(fn)
+		if _, ok := got.(runtime.Error); !ok {
+			t.Errorf("test #%d: recovered value %v(type %T) does not implement runtime.Error", i, got, got)
+		}
+	}
+}
+
+// panicValue calls fn and returns the value recovered from its panic,
+// or nil if fn returns normally.
+func panicValue(fn func()) (recovered interface{}) {
+	defer func() {
+		recovered = recover()
+	}()
+	fn()
+	return
+}
+
 func TestPanicAfterGoexit(t *testing.T) {
 	// an uncaught panic should still work after goexit
 	output := runTestProg(t, "testprog", "PanicAfterGoexit")
diff --git a/src/runtime/defs_linux_s390x.go b/src/runtime/defs_linux_s390x.go
new file mode 100644
index 0000000000..5f55d5a889
--- /dev/null
+++ b/src/runtime/defs_linux_s390x.go
@@ -0,0 +1,167 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+const (
+	_EINTR  = 0x4
+	_EAGAIN = 0xb
+	_ENOMEM = 0xc
+
+	_PROT_NONE  = 0x0
+	_PROT_READ  = 0x1
+	_PROT_WRITE = 0x2
+	_PROT_EXEC  = 0x4
+
+	_MAP_ANON    = 0x20
+	_MAP_PRIVATE = 0x2
+	_MAP_FIXED   = 0x10
+
+	_MADV_DONTNEED   = 0x4
+	_MADV_HUGEPAGE   = 0xe
+	_MADV_NOHUGEPAGE = 0xf
+
+	_SA_RESTART = 0x10000000
+	_SA_ONSTACK = 0x8000000
+	_SA_SIGINFO = 0x4
+
+	_SIGHUP    = 0x1
+	_SIGINT    = 0x2
+	_SIGQUIT   = 0x3
+	_SIGILL    = 0x4
+	_SIGTRAP   = 0x5
+	_SIGABRT   = 0x6
+	_SIGBUS    = 0x7
+	_SIGFPE    = 0x8
+	_SIGKILL   = 0x9
+	_SIGUSR1   = 0xa
+	_SIGSEGV   = 0xb
+	_SIGUSR2   = 0xc
+	_SIGPIPE   = 0xd
+	_SIGALRM   = 0xe
+	_SIGSTKFLT = 0x10
+	_SIGCHLD   = 0x11
+	_SIGCONT   = 0x12
+	_SIGSTOP   = 0x13
+	_SIGTSTP   = 0x14
+	_SIGTTIN   = 0x15
+	_SIGTTOU   = 0x16
+	_SIGURG    = 0x17
+	_SIGXCPU   = 0x18
+	_SIGXFSZ   = 0x19
+	_SIGVTALRM = 0x1a
+	_SIGPROF   = 0x1b
+	_SIGWINCH  = 0x1c
+	_SIGIO     = 0x1d
+	_SIGPWR    = 0x1e
+	_SIGSYS    = 0x1f
+
+	_FPE_INTDIV = 0x1
+	_FPE_INTOVF = 0x2
+	_FPE_FLTDIV = 0x3
+	_FPE_FLTOVF = 0x4
+	_FPE_FLTUND = 0x5
+	_FPE_FLTRES = 0x6
+	_FPE_FLTINV = 0x7
+	_FPE_FLTSUB = 0x8
+
+	_BUS_ADRALN = 0x1
+	_BUS_ADRERR = 0x2
+	_BUS_OBJERR = 0x3
+
+	_SEGV_MAPERR = 0x1
+	_SEGV_ACCERR = 0x2
+
+	_ITIMER_REAL    = 0x0
+	_ITIMER_VIRTUAL = 0x1
+	_ITIMER_PROF    = 0x2
+
+	_EPOLLIN       = 0x1
+	_EPOLLOUT      = 0x4
+	_EPOLLERR      = 0x8
+	_EPOLLHUP      = 0x10
+	_EPOLLRDHUP    = 0x2000
+	_EPOLLET       = 0x80000000
+	_EPOLL_CLOEXEC = 0x80000
+	_EPOLL_CTL_ADD = 0x1
+	_EPOLL_CTL_DEL = 0x2
+	_EPOLL_CTL_MOD = 0x3
+)
+
+// timespec holds a time value as a seconds field and a nanoseconds field.
+type timespec struct {
+	tv_sec  int64
+	tv_nsec int64
+}
+
+// set_sec stores x in tv_sec.
+func (ts *timespec) set_sec(x int64) {
+	ts.tv_sec = x
+}
+
+// set_nsec stores x, widened to int64, in tv_nsec.
+func (ts *timespec) set_nsec(x int32) {
+	ts.tv_nsec = int64(x)
+}
+
+// timeval holds a time value as a seconds field and a microseconds field.
+type timeval struct {
+	tv_sec  int64
+	tv_usec int64
+}
+
+// set_usec stores x, widened to int64, in tv_usec.
+func (tv *timeval) set_usec(x int32) {
+	tv.tv_usec = int64(x)
+}
+
+type sigactiont struct {
+	sa_handler  uintptr
+	sa_flags    uint64
+	sa_restorer uintptr
+	sa_mask     uint64
+}
+
+type siginfo struct {
+	si_signo int32
+	si_errno int32
+	si_code  int32
+	// below here is a union; si_addr is the only field we use
+	si_addr uint64
+}
+
+type itimerval struct {
+	it_interval timeval
+	it_value    timeval
+}
+
+type epollevent struct {
+	events    uint32
+	pad_cgo_0 [4]byte
+	data      [8]byte // unaligned uintptr
+}
+
+const (
+	_O_RDONLY    = 0x0
+	_O_CLOEXEC   = 0x80000
+	_SA_RESTORER = 0
+)
+
+type sigaltstackt struct {
+	ss_sp    *byte
+	ss_flags int32
+	ss_size  uintptr
+}
+
+type sigcontext struct {
+	psw_mask uint64
+	psw_addr uint64
+	gregs    [16]uint64
+	aregs    [16]uint32
+	fpc      uint32
+	fpregs   [16]uint64
+}
+
+type ucontext struct {
+	uc_flags    uint64
+	uc_link     *ucontext
+	uc_stack    sigaltstackt
+	uc_mcontext sigcontext
+	uc_sigmask  uint64
+}
diff --git a/src/runtime/error.go b/src/runtime/error.go
index 3e1ec4bc5a..0238c5e592 100644
--- a/src/runtime/error.go
+++ b/src/runtime/error.go
@@ -50,13 +50,24 @@ func (e errorString) Error() string {
 	return "runtime error: " + string(e)
 }
 
+// plainError represents a runtime error that is described by a plain
+// string: unlike errorString, its Error method does not add the
+// "runtime error: " prefix.
+// See Issue #14965.
+type plainError string
+
+// RuntimeError is a no-op method.
+func (e plainError) RuntimeError() {}
+
+// Error returns the string unchanged.
+func (e plainError) Error() string {
+	return string(e)
+}
+
 type stringer interface {
 	String() string
 }
 
 func typestring(x interface{}) string {
 	e := efaceOf(&x)
-	return e._type._string
+	return e._type.string()
 }
 
 // For calling from C.
@@ -82,5 +93,5 @@ func printany(i interface{}) {
 
 // called from generated code
 func panicwrap(pkg, typ, meth string) {
-	panic("value method " + pkg + "." + typ + "." + meth + " called using nil *" + typ + " pointer")
+	panic(plainError("value method " + pkg + "." + typ + "." + meth + " called using nil *" + typ + " pointer"))
 }
diff --git a/src/runtime/export_windows_test.go b/src/runtime/export_windows_test.go
index 66c103709c..536b398fd7 100644
--- a/src/runtime/export_windows_test.go
+++ b/src/runtime/export_windows_test.go
@@ -8,8 +8,11 @@ package runtime
 
 import "unsafe"
 
-var TestingWER = &testingWER
-var OsYield = osyield
+var (
+	TestingWER              = &testingWER
+	OsYield                 = osyield
+	TimeBeginPeriodRetValue = &timeBeginPeriodRetValue
+)
 
 func NumberOfProcessors() int32 {
 	var info systeminfo
diff --git a/src/runtime/extern.go b/src/runtime/extern.go
index 984b0ca817..1df8691cfc 100644
--- a/src/runtime/extern.go
+++ b/src/runtime/extern.go
@@ -224,8 +224,8 @@ func Version() string {
 
 // GOOS is the running program's operating system target:
 // one of darwin, freebsd, linux, and so on.
-const GOOS string = sys.TheGoos
+const GOOS string = sys.GOOS
 
 // GOARCH is the running program's architecture target:
-// 386, amd64, or arm.
-const GOARCH string = sys.TheGoarch
+// 386, amd64, arm, or s390x.
+const GOARCH string = sys.GOARCH
diff --git a/src/runtime/gcinfo_test.go b/src/runtime/gcinfo_test.go
index edb6361642..9a61b4f2b2 100644
--- a/src/runtime/gcinfo_test.go
+++ b/src/runtime/gcinfo_test.go
@@ -59,7 +59,7 @@ func TestGCInfo(t *testing.T) {
 
 func verifyGCInfo(t *testing.T, name string, p interface{}, mask0 []byte) {
 	mask := runtime.GCMask(p)
-	if bytes.Compare(mask, mask0) != 0 {
+	if !bytes.Equal(mask, mask0) {
 		t.Errorf("bad GC program for %v:\nwant %+v\ngot  %+v", name, mask0, mask)
 		return
 	}
@@ -144,7 +144,7 @@ func infoBigStruct() []byte {
 			typeScalar, typeScalar, typeScalar, typeScalar, // t int; y uint16; u uint64
 			typePointer, typeScalar, // i string
 		}
-	case "arm64", "amd64", "mips64", "mips64le", "ppc64", "ppc64le":
+	case "arm64", "amd64", "mips64", "mips64le", "ppc64", "ppc64le", "s390x":
 		return []byte{
 			typePointer,                        // q *int
 			typeScalar, typeScalar, typeScalar, // w byte; e [17]byte
diff --git a/src/runtime/hash64.go b/src/runtime/hash64.go
index fb3dba4000..d61f114475 100644
--- a/src/runtime/hash64.go
+++ b/src/runtime/hash64.go
@@ -6,7 +6,7 @@
 //   xxhash: https://code.google.com/p/xxhash/
 // cityhash: https://code.google.com/p/cityhash/
 
-// +build amd64 amd64p32 arm64 mips64 mips64le ppc64 ppc64le
+// +build amd64 amd64p32 arm64 mips64 mips64le ppc64 ppc64le s390x
 
 package runtime
 
diff --git a/src/runtime/hashmap.go b/src/runtime/hashmap.go
index 80b2b5338c..509cab2f0f 100644
--- a/src/runtime/hashmap.go
+++ b/src/runtime/hashmap.go
@@ -194,7 +194,7 @@ func makemap(t *maptype, hint int64, h *hmap, bucket unsafe.Pointer) *hmap {
 	}
 
 	if hint < 0 || int64(int32(hint)) != hint {
-		panic("makemap: size out of range")
+		panic(plainError("makemap: size out of range"))
 		// TODO: make hint an int, then none of this nonsense
 	}
 
@@ -236,9 +236,6 @@ func makemap(t *maptype, hint int64, h *hmap, bucket unsafe.Pointer) *hmap {
 		throw("need padding in bucket (value)")
 	}
 
-	// make sure zeroptr is large enough
-	mapzero(t.elem)
-
 	// find size parameter which will hold the requested # of elements
 	B := uint8(0)
 	for ; hint > bucketCnt && float32(hint) > loadFactor*float32(uintptr(1)<<B); B++ {
@@ -249,7 +246,7 @@ func makemap(t *maptype, hint int64, h *hmap, bucket unsafe.Pointer) *hmap {
 	// If hint is large zeroing this memory could take a while.
 	buckets := bucket
 	if B != 0 {
-		buckets = newarray(t.bucket, uintptr(1)<<B)
+		buckets = newarray(t.bucket, 1<<B)
 	}
 
 	// initialize Hmap
@@ -283,7 +280,7 @@ func mapaccess1(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer {
 		msanread(key, t.key.size)
 	}
 	if h == nil || h.count == 0 {
-		return atomic.Loadp(unsafe.Pointer(&zeroptr))
+		return unsafe.Pointer(&zeroVal[0])
 	}
 	if h.flags&hashWriting != 0 {
 		throw("concurrent map read and map write")
@@ -321,7 +318,7 @@ func mapaccess1(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer {
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return atomic.Loadp(unsafe.Pointer(&zeroptr))
+			return unsafe.Pointer(&zeroVal[0])
 		}
 	}
 }
@@ -337,7 +334,7 @@ func mapaccess2(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, bool)
 		msanread(key, t.key.size)
 	}
 	if h == nil || h.count == 0 {
-		return atomic.Loadp(unsafe.Pointer(&zeroptr)), false
+		return unsafe.Pointer(&zeroVal[0]), false
 	}
 	if h.flags&hashWriting != 0 {
 		throw("concurrent map read and map write")
@@ -375,7 +372,7 @@ func mapaccess2(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, bool)
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return atomic.Loadp(unsafe.Pointer(&zeroptr)), false
+			return unsafe.Pointer(&zeroVal[0]), false
 		}
 	}
 }
@@ -426,9 +423,25 @@ func mapaccessK(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, unsafe
 	}
 }
 
+// mapaccess1_fat behaves like mapaccess1, except that a miss (mapaccess1
+// returning the address of zeroVal) yields the caller-supplied zero pointer.
+func mapaccess1_fat(t *maptype, h *hmap, key, zero unsafe.Pointer) unsafe.Pointer {
+	if elem := mapaccess1(t, h, key); elem != unsafe.Pointer(&zeroVal[0]) {
+		return elem
+	}
+	return zero
+}
+
+// mapaccess2_fat looks key up with mapaccess1. A result equal to the address
+// of zeroVal is reported as (zero, false); anything else as (value, true).
+func mapaccess2_fat(t *maptype, h *hmap, key, zero unsafe.Pointer) (unsafe.Pointer, bool) {
+	if elem := mapaccess1(t, h, key); elem != unsafe.Pointer(&zeroVal[0]) {
+		return elem, true
+	}
+	return zero, false
+}
+
 func mapassign1(t *maptype, h *hmap, key unsafe.Pointer, val unsafe.Pointer) {
 	if h == nil {
-		panic("assignment to entry in nil map")
+		panic(plainError("assignment to entry in nil map"))
 	}
 	if raceenabled {
 		callerpc := getcallerpc(unsafe.Pointer(&t))
@@ -790,7 +803,9 @@ next:
 				}
 			}
 			it.bucket = bucket
-			it.bptr = b
+			if it.bptr != b { // avoid unnecessary write barrier; see issue 14921
+				it.bptr = b
+			}
 			it.i = i + 1
 			it.checkBucket = checkBucket
 			return
@@ -806,7 +821,7 @@ func hashGrow(t *maptype, h *hmap) {
 		throw("evacuation not done in time")
 	}
 	oldbuckets := h.buckets
-	newbuckets := newarray(t.bucket, uintptr(1)<<(h.B+1))
+	newbuckets := newarray(t.bucket, 1<<(h.B+1))
 	flags := h.flags &^ (iterator | oldIterator)
 	if h.flags&iterator != 0 {
 		flags |= oldIterator
@@ -1042,39 +1057,5 @@ func reflect_ismapkey(t *_type) bool {
 	return ismapkey(t)
 }
 
-var zerolock mutex
-
-const initialZeroSize = 1024
-
-var zeroinitial [initialZeroSize]byte
-
-// All accesses to zeroptr and zerosize must be atomic so that they
-// can be accessed without locks in the common case.
-var zeroptr unsafe.Pointer = unsafe.Pointer(&zeroinitial)
-var zerosize uintptr = initialZeroSize
-
-// mapzero ensures that zeroptr points to a buffer large enough to
-// serve as the zero value for t.
-func mapzero(t *_type) {
-	// Is the type small enough for existing buffer?
-	cursize := uintptr(atomic.Loadp(unsafe.Pointer(&zerosize)))
-	if t.size <= cursize {
-		return
-	}
-
-	// Allocate a new buffer.
-	lock(&zerolock)
-	cursize = uintptr(atomic.Loadp(unsafe.Pointer(&zerosize)))
-	if cursize < t.size {
-		for cursize < t.size {
-			cursize *= 2
-			if cursize == 0 {
-				// need >2GB zero on 32-bit machine
-				throw("map element too large")
-			}
-		}
-		atomic.Storep1(unsafe.Pointer(&zeroptr), persistentalloc(cursize, 64, &memstats.other_sys))
-		atomic.Storep1(unsafe.Pointer(&zerosize), unsafe.Pointer(zerosize))
-	}
-	unlock(&zerolock)
-}
+const maxZero = 1024 // must match value in ../cmd/compile/internal/gc/walk.go
+var zeroVal [maxZero]byte
diff --git a/src/runtime/hashmap_fast.go b/src/runtime/hashmap_fast.go
index 6a5484edee..8f9bb5a6fc 100644
--- a/src/runtime/hashmap_fast.go
+++ b/src/runtime/hashmap_fast.go
@@ -5,7 +5,6 @@
 package runtime
 
 import (
-	"runtime/internal/atomic"
 	"runtime/internal/sys"
 	"unsafe"
 )
@@ -16,7 +15,7 @@ func mapaccess1_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer {
 		racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess1_fast32))
 	}
 	if h == nil || h.count == 0 {
-		return atomic.Loadp(unsafe.Pointer(&zeroptr))
+		return unsafe.Pointer(&zeroVal[0])
 	}
 	if h.flags&hashWriting != 0 {
 		throw("concurrent map read and map write")
@@ -50,7 +49,7 @@ func mapaccess1_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer {
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return atomic.Loadp(unsafe.Pointer(&zeroptr))
+			return unsafe.Pointer(&zeroVal[0])
 		}
 	}
 }
@@ -61,7 +60,7 @@ func mapaccess2_fast32(t *maptype, h *hmap, key uint32) (unsafe.Pointer, bool) {
 		racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess2_fast32))
 	}
 	if h == nil || h.count == 0 {
-		return atomic.Loadp(unsafe.Pointer(&zeroptr)), false
+		return unsafe.Pointer(&zeroVal[0]), false
 	}
 	if h.flags&hashWriting != 0 {
 		throw("concurrent map read and map write")
@@ -95,7 +94,7 @@ func mapaccess2_fast32(t *maptype, h *hmap, key uint32) (unsafe.Pointer, bool) {
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return atomic.Loadp(unsafe.Pointer(&zeroptr)), false
+			return unsafe.Pointer(&zeroVal[0]), false
 		}
 	}
 }
@@ -106,7 +105,7 @@ func mapaccess1_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer {
 		racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess1_fast64))
 	}
 	if h == nil || h.count == 0 {
-		return atomic.Loadp(unsafe.Pointer(&zeroptr))
+		return unsafe.Pointer(&zeroVal[0])
 	}
 	if h.flags&hashWriting != 0 {
 		throw("concurrent map read and map write")
@@ -140,7 +139,7 @@ func mapaccess1_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer {
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return atomic.Loadp(unsafe.Pointer(&zeroptr))
+			return unsafe.Pointer(&zeroVal[0])
 		}
 	}
 }
@@ -151,7 +150,7 @@ func mapaccess2_fast64(t *maptype, h *hmap, key uint64) (unsafe.Pointer, bool) {
 		racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess2_fast64))
 	}
 	if h == nil || h.count == 0 {
-		return atomic.Loadp(unsafe.Pointer(&zeroptr)), false
+		return unsafe.Pointer(&zeroVal[0]), false
 	}
 	if h.flags&hashWriting != 0 {
 		throw("concurrent map read and map write")
@@ -185,7 +184,7 @@ func mapaccess2_fast64(t *maptype, h *hmap, key uint64) (unsafe.Pointer, bool) {
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return atomic.Loadp(unsafe.Pointer(&zeroptr)), false
+			return unsafe.Pointer(&zeroVal[0]), false
 		}
 	}
 }
@@ -196,7 +195,7 @@ func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer {
 		racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess1_faststr))
 	}
 	if h == nil || h.count == 0 {
-		return atomic.Loadp(unsafe.Pointer(&zeroptr))
+		return unsafe.Pointer(&zeroVal[0])
 	}
 	if h.flags&hashWriting != 0 {
 		throw("concurrent map read and map write")
@@ -220,7 +219,7 @@ func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer {
 					return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize))
 				}
 			}
-			return atomic.Loadp(unsafe.Pointer(&zeroptr))
+			return unsafe.Pointer(&zeroVal[0])
 		}
 		// long key, try not to do more comparisons than necessary
 		keymaybe := uintptr(bucketCnt)
@@ -258,7 +257,7 @@ func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer {
 				return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+keymaybe*uintptr(t.valuesize))
 			}
 		}
-		return atomic.Loadp(unsafe.Pointer(&zeroptr))
+		return unsafe.Pointer(&zeroVal[0])
 	}
 dohash:
 	hash := t.key.alg.hash(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0))
@@ -290,7 +289,7 @@ dohash:
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return atomic.Loadp(unsafe.Pointer(&zeroptr))
+			return unsafe.Pointer(&zeroVal[0])
 		}
 	}
 }
@@ -301,7 +300,7 @@ func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) {
 		racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess2_faststr))
 	}
 	if h == nil || h.count == 0 {
-		return atomic.Loadp(unsafe.Pointer(&zeroptr)), false
+		return unsafe.Pointer(&zeroVal[0]), false
 	}
 	if h.flags&hashWriting != 0 {
 		throw("concurrent map read and map write")
@@ -325,7 +324,7 @@ func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) {
 					return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize)), true
 				}
 			}
-			return atomic.Loadp(unsafe.Pointer(&zeroptr)), false
+			return unsafe.Pointer(&zeroVal[0]), false
 		}
 		// long key, try not to do more comparisons than necessary
 		keymaybe := uintptr(bucketCnt)
@@ -361,7 +360,7 @@ func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) {
 				return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+keymaybe*uintptr(t.valuesize)), true
 			}
 		}
-		return atomic.Loadp(unsafe.Pointer(&zeroptr)), false
+		return unsafe.Pointer(&zeroVal[0]), false
 	}
 dohash:
 	hash := t.key.alg.hash(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0))
@@ -393,7 +392,7 @@ dohash:
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return atomic.Loadp(unsafe.Pointer(&zeroptr)), false
+			return unsafe.Pointer(&zeroVal[0]), false
 		}
 	}
 }
diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go
index 96dd6ff867..6085c6866c 100644
--- a/src/runtime/heapdump.go
+++ b/src/runtime/heapdump.go
@@ -183,10 +183,11 @@ func dumptype(t *_type) {
 	dumpint(tagType)
 	dumpint(uint64(uintptr(unsafe.Pointer(t))))
 	dumpint(uint64(t.size))
-	if x := t.uncommon(); x == nil || x.pkgpath == nil {
-		dumpstr(t._string)
+	if x := t.uncommon(); x == nil || t.nameOff(x.pkgpath).name() == "" {
+		dumpstr(t.string())
 	} else {
-		pkgpath := stringStructOf(x.pkgpath)
+		pkgpathstr := t.nameOff(x.pkgpath).name()
+		pkgpath := stringStructOf(&pkgpathstr)
 		namestr := t.name()
 		name := stringStructOf(&namestr)
 		dumpint(uint64(uintptr(pkgpath.len) + 1 + uintptr(name.len)))
@@ -500,7 +501,7 @@ func dumpparams() {
 	dumpint(sys.PtrSize)
 	dumpint(uint64(mheap_.arena_start))
 	dumpint(uint64(mheap_.arena_used))
-	dumpint(sys.TheChar)
+	dumpstr(sys.GOARCH)
 	dumpstr(sys.Goexperiment)
 	dumpint(uint64(ncpu))
 }
diff --git a/src/runtime/iface.go b/src/runtime/iface.go
index 3ce1e237d3..b57d1cc63c 100644
--- a/src/runtime/iface.go
+++ b/src/runtime/iface.go
@@ -37,7 +37,8 @@ func getitab(inter *interfacetype, typ *_type, canfail bool) *itab {
 		if canfail {
 			return nil
 		}
-		panic(&TypeAssertionError{"", typ._string, inter.typ._string, inter.mhdr[0].name.name()})
+		name := inter.typ.nameOff(inter.mhdr[0].name)
+		panic(&TypeAssertionError{"", typ.string(), inter.typ.string(), name.name()})
 	}
 
 	h := itabhash(inter, typ)
@@ -93,26 +94,30 @@ func additab(m *itab, locked, canfail bool) {
 	// so can iterate over both in lock step;
 	// the loop is O(ni+nt) not O(ni*nt).
 	ni := len(inter.mhdr)
-	nt := len(x.mhdr)
+	nt := int(x.mcount)
+	xmhdr := (*[1 << 16]method)(add(unsafe.Pointer(x), uintptr(x.moff)))[:nt:nt]
 	j := 0
 	for k := 0; k < ni; k++ {
 		i := &inter.mhdr[k]
-		iname := i.name.name()
-		itype := i._type
-		ipkg := i.name.pkgPath()
-		if ipkg == nil {
-			ipkg = inter.pkgpath
+		itype := inter.typ.typeOff(i.ityp)
+		name := inter.typ.nameOff(i.name)
+		iname := name.name()
+		ipkg := name.pkgPath()
+		if ipkg == "" {
+			ipkg = inter.pkgpath.name()
 		}
 		for ; j < nt; j++ {
-			t := &x.mhdr[j]
-			if t.mtyp == itype && t.name.name() == iname {
-				pkgPath := t.name.pkgPath()
-				if pkgPath == nil {
-					pkgPath = x.pkgpath
+			t := &xmhdr[j]
+			tname := typ.nameOff(t.name)
+			if typ.typeOff(t.mtyp) == itype && tname.name() == iname {
+				pkgPath := tname.pkgPath()
+				if pkgPath == "" {
+					pkgPath = typ.nameOff(x.pkgpath).name()
 				}
-				if t.name.isExported() || pkgPath == ipkg {
+				if tname.isExported() || pkgPath == ipkg {
 					if m != nil {
-						*(*unsafe.Pointer)(add(unsafe.Pointer(&m.fun[0]), uintptr(k)*sys.PtrSize)) = t.ifn
+						ifn := typ.textOff(t.ifn)
+						*(*unsafe.Pointer)(add(unsafe.Pointer(&m.fun[0]), uintptr(k)*sys.PtrSize)) = ifn
 					}
 					goto nextimethod
 				}
@@ -123,7 +128,7 @@ func additab(m *itab, locked, canfail bool) {
 			if locked {
 				unlock(&ifaceLock)
 			}
-			panic(&TypeAssertionError{"", typ._string, inter.typ._string, iname})
+			panic(&TypeAssertionError{"", typ.string(), inter.typ.string(), iname})
 		}
 		m.bad = 1
 		break
@@ -155,58 +160,54 @@ func convT2E(t *_type, elem unsafe.Pointer, x unsafe.Pointer) (e eface) {
 		msanread(elem, t.size)
 	}
 	if isDirectIface(t) {
-		e._type = t
-		typedmemmove(t, unsafe.Pointer(&e.data), elem)
-	} else {
-		if x == nil {
-			x = newobject(t)
-		}
+		throw("direct convT2E")
+	}
+	if x == nil {
+		x = newobject(t)
 		// TODO: We allocate a zeroed object only to overwrite it with
 		// actual data. Figure out how to avoid zeroing. Also below in convT2I.
-		typedmemmove(t, x, elem)
-		e._type = t
-		e.data = x
 	}
+	typedmemmove(t, x, elem)
+	e._type = t
+	e.data = x
 	return
 }
 
 func convT2I(tab *itab, elem unsafe.Pointer, x unsafe.Pointer) (i iface) {
 	t := tab._type
 	if raceenabled {
-		raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&t)), funcPC(convT2I))
+		raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&tab)), funcPC(convT2I))
 	}
 	if msanenabled {
 		msanread(elem, t.size)
 	}
 	if isDirectIface(t) {
-		i.tab = tab
-		typedmemmove(t, unsafe.Pointer(&i.data), elem)
-	} else {
-		if x == nil {
-			x = newobject(t)
-		}
-		typedmemmove(t, x, elem)
-		i.tab = tab
-		i.data = x
+		throw("direct convT2I")
+	}
+	if x == nil {
+		x = newobject(t)
 	}
+	typedmemmove(t, x, elem)
+	i.tab = tab
+	i.data = x
 	return
 }
 
 func panicdottype(have, want, iface *_type) {
 	haveString := ""
 	if have != nil {
-		haveString = have._string
+		haveString = have.string()
 	}
-	panic(&TypeAssertionError{iface._string, haveString, want._string, ""})
+	panic(&TypeAssertionError{iface.string(), haveString, want.string(), ""})
 }
 
 func assertI2T(t *_type, i iface, r unsafe.Pointer) {
 	tab := i.tab
 	if tab == nil {
-		panic(&TypeAssertionError{"", "", t._string, ""})
+		panic(&TypeAssertionError{"", "", t.string(), ""})
 	}
 	if tab._type != t {
-		panic(&TypeAssertionError{tab.inter.typ._string, tab._type._string, t._string, ""})
+		panic(&TypeAssertionError{tab.inter.typ.string(), tab._type.string(), t.string(), ""})
 	}
 	if r != nil {
 		if isDirectIface(t) {
@@ -237,10 +238,10 @@ func assertI2T2(t *_type, i iface, r unsafe.Pointer) bool {
 
 func assertE2T(t *_type, e eface, r unsafe.Pointer) {
 	if e._type == nil {
-		panic(&TypeAssertionError{"", "", t._string, ""})
+		panic(&TypeAssertionError{"", "", t.string(), ""})
 	}
 	if e._type != t {
-		panic(&TypeAssertionError{"", e._type._string, t._string, ""})
+		panic(&TypeAssertionError{"", e._type.string(), t.string(), ""})
 	}
 	if r != nil {
 		if isDirectIface(t) {
@@ -284,7 +285,7 @@ func assertI2E(inter *interfacetype, i iface, r *eface) {
 	tab := i.tab
 	if tab == nil {
 		// explicit conversions require non-nil interface value.
-		panic(&TypeAssertionError{"", "", inter.typ._string, ""})
+		panic(&TypeAssertionError{"", "", inter.typ.string(), ""})
 	}
 	r._type = tab._type
 	r.data = i.data
@@ -321,7 +322,7 @@ func assertI2I(inter *interfacetype, i iface, r *iface) {
 	tab := i.tab
 	if tab == nil {
 		// explicit conversions require non-nil interface value.
-		panic(&TypeAssertionError{"", "", inter.typ._string, ""})
+		panic(&TypeAssertionError{"", "", inter.typ.string(), ""})
 	}
 	if tab.inter == inter {
 		r.tab = tab
@@ -360,7 +361,7 @@ func assertE2I(inter *interfacetype, e eface, r *iface) {
 	t := e._type
 	if t == nil {
 		// explicit conversions require non-nil interface value.
-		panic(&TypeAssertionError{"", "", inter.typ._string, ""})
+		panic(&TypeAssertionError{"", "", inter.typ.string(), ""})
 	}
 	r.tab = getitab(inter, t, false)
 	r.data = e.data
@@ -401,7 +402,7 @@ func reflect_ifaceE2I(inter *interfacetype, e eface, dst *iface) {
 func assertE2E(inter *interfacetype, e eface, r *eface) {
 	if e._type == nil {
 		// explicit conversions require non-nil interface value.
-		panic(&TypeAssertionError{"", "", inter.typ._string, ""})
+		panic(&TypeAssertionError{"", "", inter.typ.string(), ""})
 	}
 	*r = e
 }
diff --git a/src/runtime/internal/atomic/asm_386.s b/src/runtime/internal/atomic/asm_386.s
index ce84fd83d1..ebecd0b4cb 100644
--- a/src/runtime/internal/atomic/asm_386.s
+++ b/src/runtime/internal/atomic/asm_386.s
@@ -102,7 +102,7 @@ TEXT runtime∕internal∕atomic·Xchguintptr(SB), NOSPLIT, $0-12
 	JMP	runtime∕internal∕atomic·Xchg(SB)
 
 
-TEXT runtime∕internal∕atomic·Storep1(SB), NOSPLIT, $0-8
+TEXT runtime∕internal∕atomic·StorepNoWB(SB), NOSPLIT, $0-8
 	MOVL	ptr+0(FP), BX
 	MOVL	val+4(FP), AX
 	XCHGL	AX, 0(BX)
diff --git a/src/runtime/internal/atomic/asm_amd64.s b/src/runtime/internal/atomic/asm_amd64.s
index 7463fec4a1..94d4ac2698 100644
--- a/src/runtime/internal/atomic/asm_amd64.s
+++ b/src/runtime/internal/atomic/asm_amd64.s
@@ -115,7 +115,7 @@ TEXT runtime∕internal∕atomic·Xchg64(SB), NOSPLIT, $0-24
 TEXT runtime∕internal∕atomic·Xchguintptr(SB), NOSPLIT, $0-24
 	JMP	runtime∕internal∕atomic·Xchg64(SB)
 
-TEXT runtime∕internal∕atomic·Storep1(SB), NOSPLIT, $0-16
+TEXT runtime∕internal∕atomic·StorepNoWB(SB), NOSPLIT, $0-16
 	MOVQ	ptr+0(FP), BX
 	MOVQ	val+8(FP), AX
 	XCHGQ	AX, 0(BX)
diff --git a/src/runtime/internal/atomic/asm_amd64p32.s b/src/runtime/internal/atomic/asm_amd64p32.s
index f1e2c3aca6..74c79d08fd 100644
--- a/src/runtime/internal/atomic/asm_amd64p32.s
+++ b/src/runtime/internal/atomic/asm_amd64p32.s
@@ -115,7 +115,7 @@ TEXT runtime∕internal∕atomic·Xchg64(SB), NOSPLIT, $0-24
 TEXT runtime∕internal∕atomic·Xchguintptr(SB), NOSPLIT, $0-12
 	JMP	runtime∕internal∕atomic·Xchg(SB)
 
-TEXT runtime∕internal∕atomic·Storep1(SB), NOSPLIT, $0-8
+TEXT runtime∕internal∕atomic·StorepNoWB(SB), NOSPLIT, $0-8
 	MOVL	ptr+0(FP), BX
 	MOVL	val+4(FP), AX
 	XCHGL	AX, 0(BX)
diff --git a/src/runtime/internal/atomic/asm_mips64x.s b/src/runtime/internal/atomic/asm_mips64x.s
index a454f284ab..d0f5c7bdd3 100644
--- a/src/runtime/internal/atomic/asm_mips64x.s
+++ b/src/runtime/internal/atomic/asm_mips64x.s
@@ -155,7 +155,7 @@ TEXT ·Xchg64(SB), NOSPLIT, $0-24
 TEXT ·Xchguintptr(SB), NOSPLIT, $0-24
 	JMP	·Xchg64(SB)
 
-TEXT ·Storep1(SB), NOSPLIT, $0-16
+TEXT ·StorepNoWB(SB), NOSPLIT, $0-16
 	JMP	·Store64(SB)
 
 TEXT ·Store(SB), NOSPLIT, $0-12
diff --git a/src/runtime/internal/atomic/asm_ppc64x.s b/src/runtime/internal/atomic/asm_ppc64x.s
index 45a48b6203..4a776787a2 100644
--- a/src/runtime/internal/atomic/asm_ppc64x.s
+++ b/src/runtime/internal/atomic/asm_ppc64x.s
@@ -150,7 +150,7 @@ TEXT runtime∕internal∕atomic·Xchguintptr(SB), NOSPLIT, $0-24
 	BR	runtime∕internal∕atomic·Xchg64(SB)
 
 
-TEXT runtime∕internal∕atomic·Storep1(SB), NOSPLIT, $0-16
+TEXT runtime∕internal∕atomic·StorepNoWB(SB), NOSPLIT, $0-16
 	BR	runtime∕internal∕atomic·Store64(SB)
 
 TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-12
diff --git a/src/runtime/internal/atomic/asm_s390x.s b/src/runtime/internal/atomic/asm_s390x.s
new file mode 100644
index 0000000000..c84718cb8f
--- /dev/null
+++ b/src/runtime/internal/atomic/asm_s390x.s
@@ -0,0 +1,174 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// func Cas(ptr *uint32, old, new uint32) bool
+// Atomically:
+//	if *ptr == old {
+//		*ptr = new
+//		return 1
+//	} else {
+//		return 0
+//	}
+TEXT ·Cas(SB), NOSPLIT, $0-17
+	MOVD	ptr+0(FP), R3
+	MOVWZ	old+8(FP), R4
+	MOVWZ	new+12(FP), R5
+	CS	R4, R5, 0(R3)    //  if (R4 == 0(R3)) then 0(R3)= R5
+	BNE	cas_fail
+	MOVB	$1, ret+16(FP)
+	RET
+cas_fail:
+	MOVB	$0, ret+16(FP)
+	RET
+
+// func Cas64(ptr *uint64, old, new uint64) bool
+// Atomically:
+//	if *ptr == old {
+//		*ptr = new
+//		return 1
+//	} else {
+//		return 0
+//	}
+TEXT ·Cas64(SB), NOSPLIT, $0-25
+	MOVD	ptr+0(FP), R3
+	MOVD	old+8(FP), R4
+	MOVD	new+16(FP), R5
+	CSG	R4, R5, 0(R3)    //  if (R4 == 0(R3)) then 0(R3)= R5
+	BNE	cas64_fail
+	MOVB	$1, ret+24(FP)
+	RET
+cas64_fail:
+	MOVB	$0, ret+24(FP)
+	RET
+
+// func Casuintptr(ptr *uintptr, old, new uintptr) bool
+TEXT ·Casuintptr(SB), NOSPLIT, $0-25
+	BR	·Cas64(SB)
+
+// func Loaduintptr(ptr *uintptr) uintptr
+TEXT ·Loaduintptr(SB), NOSPLIT, $0-16
+	BR	·Load64(SB)
+
+// func Loaduint(ptr *uint) uint
+TEXT ·Loaduint(SB), NOSPLIT, $0-16
+	BR	·Load64(SB)
+
+// func Storeuintptr(ptr *uintptr, new uintptr)
+TEXT ·Storeuintptr(SB), NOSPLIT, $0-16
+	BR	·Store64(SB)
+
+// func Loadint64(ptr *int64) int64
+TEXT ·Loadint64(SB), NOSPLIT, $0-16
+	BR	·Load64(SB)
+
+// func Xadduintptr(ptr *uintptr, delta uintptr) uintptr
+TEXT ·Xadduintptr(SB), NOSPLIT, $0-24
+	BR	·Xadd64(SB)
+
+// func Xaddint64(ptr *int64, delta int64) int64
+TEXT ·Xaddint64(SB), NOSPLIT, $0-24 // argsize 24 = ptr(8) + delta(8) + ret(8), same frame layout as Xadd64
+	BR	·Xadd64(SB)
+
+// func Casp1(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool
+// Atomically:
+//	if *ptr == old {
+//		*ptr = new
+//		return 1
+//	} else {
+//		return 0
+//	}
+TEXT ·Casp1(SB), NOSPLIT, $0-25
+	BR ·Cas64(SB)
+
+// func Xadd(ptr *uint32, delta int32) uint32
+// Atomically:
+//	*ptr += delta
+//	return *ptr
+TEXT ·Xadd(SB), NOSPLIT, $0-20
+	MOVD	ptr+0(FP), R4
+	MOVW	delta+8(FP), R5
+	MOVW	(R4), R3
+repeat:
+	ADD	R5, R3, R6
+	CS	R3, R6, (R4) // if R3==(R4) then (R4)=R6 else R3=(R4)
+	BNE	repeat
+	MOVW	R6, ret+16(FP)
+	RET
+
+// func Xadd64(ptr *uint64, delta int64) uint64
+TEXT ·Xadd64(SB), NOSPLIT, $0-24
+	MOVD	ptr+0(FP), R4
+	MOVD	delta+8(FP), R5
+	MOVD	(R4), R3
+repeat:
+	ADD	R5, R3, R6
+	CSG	R3, R6, (R4) // if R3==(R4) then (R4)=R6 else R3=(R4)
+	BNE	repeat
+	MOVD	R6, ret+16(FP)
+	RET
+
+// func Xchg(ptr *uint32, new uint32) uint32
+TEXT ·Xchg(SB), NOSPLIT, $0-20
+	MOVD	ptr+0(FP), R4
+	MOVW	new+8(FP), R3
+	MOVW	(R4), R6
+repeat:
+	CS	R6, R3, (R4) // if R6==(R4) then (R4)=R3 else R6=(R4)
+	BNE	repeat
+	MOVW	R6, ret+16(FP)
+	RET
+
+// func Xchg64(ptr *uint64, new uint64) uint64
+TEXT ·Xchg64(SB), NOSPLIT, $0-24
+	MOVD	ptr+0(FP), R4
+	MOVD	new+8(FP), R3
+	MOVD	(R4), R6
+repeat:
+	CSG	R6, R3, (R4) // if R6==(R4) then (R4)=R3 else R6=(R4)
+	BNE	repeat
+	MOVD	R6, ret+16(FP)
+	RET
+
+// func Xchguintptr(ptr *uintptr, new uintptr) uintptr
+TEXT ·Xchguintptr(SB), NOSPLIT, $0-24
+	BR	·Xchg64(SB)
+
+// func Or8(addr *uint8, v uint8)
+TEXT ·Or8(SB), NOSPLIT, $0-9
+	MOVD    ptr+0(FP), R3
+	MOVBZ   val+8(FP), R4
+	// Calculate shift.
+	AND	$3, R3, R5
+	XOR	$3, R5 // big endian - flip direction
+	SLD	$3, R5 // MUL $8, R5
+	SLD	R5, R4
+	// Align ptr down to 4 bytes so we can use 32-bit load/store.
+	AND	$-4, R3
+	MOVWZ	0(R3), R6
+again:
+	OR	R4, R6, R7
+	CS	R6, R7, 0(R3) // if R6==(R3) then (R3)=R7 else R6=(R3)
+	BNE	again
+	RET
+
+// func And8(addr *uint8, v uint8)
+TEXT ·And8(SB), NOSPLIT, $0-9
+	MOVD    ptr+0(FP), R3
+	MOVBZ   val+8(FP), R4
+	// Calculate shift.
+	AND	$3, R3, R5
+	XOR	$3, R5 // big endian - flip direction
+	SLD	$3, R5 // MUL $8, R5
+	OR	$-256, R4 // create 0xffffffffffffffxx
+	RLLG	R5, R4
+	// Align ptr down to 4 bytes so we can use 32-bit load/store.
+	AND	$-4, R3
+	MOVWZ	0(R3), R6
+again:
+	AND	R4, R6, R7
+	CS	R6, R7, 0(R3) // if R6==(R3) then (R3)=R7 else R6=(R3)
+	BNE	again
+	RET
diff --git a/src/runtime/internal/atomic/atomic_386.go b/src/runtime/internal/atomic/atomic_386.go
index f4c50b0be1..23a8479515 100644
--- a/src/runtime/internal/atomic/atomic_386.go
+++ b/src/runtime/internal/atomic/atomic_386.go
@@ -73,4 +73,4 @@ func Store(ptr *uint32, val uint32)
 func Store64(ptr *uint64, val uint64)
 
 // NO go:noescape annotation; see atomic_pointer.go.
-func Storep1(ptr unsafe.Pointer, val unsafe.Pointer)
+func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
diff --git a/src/runtime/internal/atomic/atomic_amd64x.go b/src/runtime/internal/atomic/atomic_amd64x.go
index bd40fb3ea2..54851d30f4 100644
--- a/src/runtime/internal/atomic/atomic_amd64x.go
+++ b/src/runtime/internal/atomic/atomic_amd64x.go
@@ -61,5 +61,8 @@ func Store(ptr *uint32, val uint32)
 //go:noescape
 func Store64(ptr *uint64, val uint64)
 
+// StorepNoWB performs *ptr = val atomically and without a write
+// barrier.
+//
 // NO go:noescape annotation; see atomic_pointer.go.
-func Storep1(ptr unsafe.Pointer, val unsafe.Pointer)
+func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
diff --git a/src/runtime/internal/atomic/atomic_arm.go b/src/runtime/internal/atomic/atomic_arm.go
index c361aef382..244237df4d 100644
--- a/src/runtime/internal/atomic/atomic_arm.go
+++ b/src/runtime/internal/atomic/atomic_arm.go
@@ -85,7 +85,7 @@ func Loadp(addr unsafe.Pointer) unsafe.Pointer {
 }
 
 //go:nosplit
-func Storep1(addr unsafe.Pointer, v unsafe.Pointer) {
+func StorepNoWB(addr unsafe.Pointer, v unsafe.Pointer) {
 	for {
 		old := *(*unsafe.Pointer)(addr)
 		if Casp1((*unsafe.Pointer)(addr), old, v) {
diff --git a/src/runtime/internal/atomic/atomic_arm64.go b/src/runtime/internal/atomic/atomic_arm64.go
index 6b32346656..dc82c3396d 100644
--- a/src/runtime/internal/atomic/atomic_arm64.go
+++ b/src/runtime/internal/atomic/atomic_arm64.go
@@ -77,4 +77,4 @@ func Store(ptr *uint32, val uint32)
 func Store64(ptr *uint64, val uint64)
 
 // NO go:noescape annotation; see atomic_pointer.go.
-func Storep1(ptr unsafe.Pointer, val unsafe.Pointer)
+func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
diff --git a/src/runtime/internal/atomic/atomic_arm64.s b/src/runtime/internal/atomic/atomic_arm64.s
index 7b1b0efaf6..eb32f378aa 100644
--- a/src/runtime/internal/atomic/atomic_arm64.s
+++ b/src/runtime/internal/atomic/atomic_arm64.s
@@ -25,7 +25,7 @@ TEXT ·Loadp(SB),NOSPLIT,$-8-16
 	MOVD	R0, ret+8(FP)
 	RET
 
-TEXT runtime∕internal∕atomic·Storep1(SB), NOSPLIT, $0-16
+TEXT runtime∕internal∕atomic·StorepNoWB(SB), NOSPLIT, $0-16
 	B	runtime∕internal∕atomic·Store64(SB)
 
 TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-12
diff --git a/src/runtime/internal/atomic/atomic_mips64x.go b/src/runtime/internal/atomic/atomic_mips64x.go
index 8094db58a0..d06ea4809a 100644
--- a/src/runtime/internal/atomic/atomic_mips64x.go
+++ b/src/runtime/internal/atomic/atomic_mips64x.go
@@ -53,4 +53,4 @@ func Store(ptr *uint32, val uint32)
 func Store64(ptr *uint64, val uint64)
 
 // NO go:noescape annotation; see atomic_pointer.go.
-func Storep1(ptr unsafe.Pointer, val unsafe.Pointer)
+func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
diff --git a/src/runtime/internal/atomic/atomic_ppc64x.go b/src/runtime/internal/atomic/atomic_ppc64x.go
index bf82b82643..72c98eb0c5 100644
--- a/src/runtime/internal/atomic/atomic_ppc64x.go
+++ b/src/runtime/internal/atomic/atomic_ppc64x.go
@@ -53,4 +53,4 @@ func Store(ptr *uint32, val uint32)
 func Store64(ptr *uint64, val uint64)
 
 // NO go:noescape annotation; see atomic_pointer.go.
-func Storep1(ptr unsafe.Pointer, val unsafe.Pointer)
+func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
diff --git a/src/runtime/internal/atomic/atomic_s390x.go b/src/runtime/internal/atomic/atomic_s390x.go
new file mode 100644
index 0000000000..9343853485
--- /dev/null
+++ b/src/runtime/internal/atomic/atomic_s390x.go
@@ -0,0 +1,73 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package atomic
+
+import "unsafe"
+
+//go:nosplit
+//go:noinline
+func Load(ptr *uint32) uint32 {
+	return *ptr
+}
+
+//go:nosplit
+//go:noinline
+func Loadp(ptr unsafe.Pointer) unsafe.Pointer {
+	return *(*unsafe.Pointer)(ptr)
+}
+
+//go:nosplit
+//go:noinline
+func Load64(ptr *uint64) uint64 {
+	return *ptr
+}
+
+//go:noinline
+//go:nosplit
+func Store(ptr *uint32, val uint32) {
+	*ptr = val
+}
+
+//go:noinline
+//go:nosplit
+func Store64(ptr *uint64, val uint64) {
+	*ptr = val
+}
+
+// NO go:noescape annotation; see atomic_pointer.go.
+//go:noinline
+//go:nosplit
+func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer) {
+	*(*uintptr)(ptr) = uintptr(val)
+}
+
+//go:noescape
+func And8(ptr *uint8, val uint8)
+
+//go:noescape
+func Or8(ptr *uint8, val uint8)
+
+// NOTE: Do not add atomicxor8 (XOR is not idempotent).
+
+//go:noescape
+func Xadd(ptr *uint32, delta int32) uint32
+
+//go:noescape
+func Xadd64(ptr *uint64, delta int64) uint64
+
+//go:noescape
+func Xadduintptr(ptr *uintptr, delta uintptr) uintptr
+
+//go:noescape
+func Xchg(ptr *uint32, new uint32) uint32
+
+//go:noescape
+func Xchg64(ptr *uint64, new uint64) uint64
+
+//go:noescape
+func Xchguintptr(ptr *uintptr, new uintptr) uintptr
+
+//go:noescape
+func Cas64(ptr *uint64, old, new uint64) bool
diff --git a/src/runtime/internal/sys/arch.go b/src/runtime/internal/sys/arch.go
new file mode 100644
index 0000000000..c1757041d8
--- /dev/null
+++ b/src/runtime/internal/sys/arch.go
@@ -0,0 +1,17 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sys
+
+type ArchFamilyType int
+
+const (
+	AMD64 ArchFamilyType = iota
+	ARM
+	ARM64
+	I386
+	MIPS64
+	PPC64
+	S390X
+)
diff --git a/src/runtime/internal/sys/arch_386.go b/src/runtime/internal/sys/arch_386.go
index 1f1c704f9a..48c42f7584 100644
--- a/src/runtime/internal/sys/arch_386.go
+++ b/src/runtime/internal/sys/arch_386.go
@@ -5,7 +5,7 @@
 package sys
 
 const (
-	TheChar       = '8'
+	ArchFamily    = I386
 	BigEndian     = 0
 	CacheLineSize = 64
 	PhysPageSize  = GoosNacl*65536 + (1-GoosNacl)*4096 // 4k normally; 64k on NaCl
diff --git a/src/runtime/internal/sys/arch_amd64.go b/src/runtime/internal/sys/arch_amd64.go
index 80fff557f2..1bbdb99e07 100644
--- a/src/runtime/internal/sys/arch_amd64.go
+++ b/src/runtime/internal/sys/arch_amd64.go
@@ -5,7 +5,7 @@
 package sys
 
 const (
-	TheChar       = '6'
+	ArchFamily    = AMD64
 	BigEndian     = 0
 	CacheLineSize = 64
 	PhysPageSize  = 4096
diff --git a/src/runtime/internal/sys/arch_amd64p32.go b/src/runtime/internal/sys/arch_amd64p32.go
index ca29f698a2..b7011a4ff2 100644
--- a/src/runtime/internal/sys/arch_amd64p32.go
+++ b/src/runtime/internal/sys/arch_amd64p32.go
@@ -5,7 +5,7 @@
 package sys
 
 const (
-	TheChar       = '6'
+	ArchFamily    = AMD64
 	BigEndian     = 0
 	CacheLineSize = 64
 	PhysPageSize  = 65536*GoosNacl + 4096*(1-GoosNacl)
diff --git a/src/runtime/internal/sys/arch_arm.go b/src/runtime/internal/sys/arch_arm.go
index b185e8fb69..f90f52da7f 100644
--- a/src/runtime/internal/sys/arch_arm.go
+++ b/src/runtime/internal/sys/arch_arm.go
@@ -5,7 +5,7 @@
 package sys
 
 const (
-	TheChar       = '5'
+	ArchFamily    = ARM
 	BigEndian     = 0
 	CacheLineSize = 32
 	PhysPageSize  = 65536*GoosNacl + 4096*(1-GoosNacl)
diff --git a/src/runtime/internal/sys/arch_arm64.go b/src/runtime/internal/sys/arch_arm64.go
index b63a7a6f9a..aaaa4b0947 100644
--- a/src/runtime/internal/sys/arch_arm64.go
+++ b/src/runtime/internal/sys/arch_arm64.go
@@ -5,7 +5,7 @@
 package sys
 
 const (
-	TheChar       = '7'
+	ArchFamily    = ARM64
 	BigEndian     = 0
 	CacheLineSize = 32
 	PhysPageSize  = 65536
diff --git a/src/runtime/internal/sys/arch_mips64.go b/src/runtime/internal/sys/arch_mips64.go
index 5b933d4e1a..d5672599d2 100644
--- a/src/runtime/internal/sys/arch_mips64.go
+++ b/src/runtime/internal/sys/arch_mips64.go
@@ -5,7 +5,7 @@
 package sys
 
 const (
-	TheChar       = '0'
+	ArchFamily    = MIPS64
 	BigEndian     = 1
 	CacheLineSize = 32
 	PhysPageSize  = 16384
diff --git a/src/runtime/internal/sys/arch_mips64le.go b/src/runtime/internal/sys/arch_mips64le.go
index ce2e98b19f..f8cdf2b2d2 100644
--- a/src/runtime/internal/sys/arch_mips64le.go
+++ b/src/runtime/internal/sys/arch_mips64le.go
@@ -5,7 +5,7 @@
 package sys
 
 const (
-	TheChar       = '0'
+	ArchFamily    = MIPS64
 	BigEndian     = 0
 	CacheLineSize = 32
 	PhysPageSize  = 16384
diff --git a/src/runtime/internal/sys/arch_ppc64.go b/src/runtime/internal/sys/arch_ppc64.go
index 3aa07e1f56..cdec63ff71 100644
--- a/src/runtime/internal/sys/arch_ppc64.go
+++ b/src/runtime/internal/sys/arch_ppc64.go
@@ -5,7 +5,7 @@
 package sys
 
 const (
-	TheChar       = '9'
+	ArchFamily    = PPC64
 	BigEndian     = 1
 	CacheLineSize = 64
 	PhysPageSize  = 65536
diff --git a/src/runtime/internal/sys/arch_ppc64le.go b/src/runtime/internal/sys/arch_ppc64le.go
index 0f02f0bf3c..4fd68f9ce3 100644
--- a/src/runtime/internal/sys/arch_ppc64le.go
+++ b/src/runtime/internal/sys/arch_ppc64le.go
@@ -5,7 +5,7 @@
 package sys
 
 const (
-	TheChar       = '9'
+	ArchFamily    = PPC64
 	BigEndian     = 0
 	CacheLineSize = 64
 	PhysPageSize  = 65536
diff --git a/src/runtime/internal/sys/arch_s390x.go b/src/runtime/internal/sys/arch_s390x.go
index 8690571c81..ca1cb8646e 100644
--- a/src/runtime/internal/sys/arch_s390x.go
+++ b/src/runtime/internal/sys/arch_s390x.go
@@ -5,7 +5,7 @@
 package sys
 
 const (
-	TheChar       = 'z'
+	ArchFamily    = S390X
 	BigEndian     = 1
 	CacheLineSize = 256
 	PhysPageSize  = 4096
diff --git a/src/runtime/internal/sys/gengoos.go b/src/runtime/internal/sys/gengoos.go
index e2bd87de4e..4c45c0af02 100644
--- a/src/runtime/internal/sys/gengoos.go
+++ b/src/runtime/internal/sys/gengoos.go
@@ -50,7 +50,7 @@ func main() {
 			fmt.Fprintf(&buf, "// +build !android\n\n") // must explicitly exclude android for linux
 		}
 		fmt.Fprintf(&buf, "package sys\n\n")
-		fmt.Fprintf(&buf, "const TheGoos = `%s`\n\n", target)
+		fmt.Fprintf(&buf, "const GOOS = `%s`\n\n", target)
 		for _, goos := range gooses {
 			value := 0
 			if goos == target {
@@ -68,7 +68,7 @@ func main() {
 		var buf bytes.Buffer
 		fmt.Fprintf(&buf, "// generated by gengoos.go using 'go generate'\n\n")
 		fmt.Fprintf(&buf, "package sys\n\n")
-		fmt.Fprintf(&buf, "const TheGoarch = `%s`\n\n", target)
+		fmt.Fprintf(&buf, "const GOARCH = `%s`\n\n", target)
 		for _, goarch := range goarches {
 			value := 0
 			if goarch == target {
diff --git a/src/runtime/internal/sys/intrinsics.go b/src/runtime/internal/sys/intrinsics.go
index 8feb754dbd..1054c6948f 100644
--- a/src/runtime/internal/sys/intrinsics.go
+++ b/src/runtime/internal/sys/intrinsics.go
@@ -4,88 +4,97 @@
 
 package sys
 
+// The Ctz functions use de Bruijn sequence techniques from http://supertech.csail.mit.edu/papers/debruijn.pdf
+
+const deBruijn64 = 0x0218a392cd3d5dbf
+
+var deBruijnIdx64 = [64]byte{
+	0, 1, 2, 7, 3, 13, 8, 19,
+	4, 25, 14, 28, 9, 34, 20, 40,
+	5, 17, 26, 38, 15, 46, 29, 48,
+	10, 31, 35, 54, 21, 50, 41, 57,
+	63, 6, 12, 18, 24, 27, 33, 39,
+	16, 37, 45, 47, 30, 53, 49, 56,
+	62, 11, 23, 32, 36, 44, 52, 55,
+	61, 22, 43, 51, 60, 42, 59, 58,
+}
+
+const deBruijn32 = 0x04653adf
+
+var deBruijnIdx32 = [32]byte{
+	0, 1, 2, 6, 3, 11, 7, 16,
+	4, 14, 12, 21, 8, 23, 17, 26,
+	31, 5, 10, 15, 13, 20, 22, 25,
+	30, 9, 19, 24, 29, 18, 28, 27,
+}
+
+const deBruijn16 = 0x09af
+
+var deBruijnIdx16 = [16]byte{
+	0, 1, 2, 5, 3, 9, 6, 11,
+	15, 4, 8, 10, 14, 7, 13, 12,
+}
+
+const deBruijn8 = 0x17
+
+var deBruijnIdx8 = [8]byte{
+	0, 1, 2, 4, 7, 3, 6, 5,
+}
+
 // Ctz64 counts trailing (low-order) zeroes,
 // and if all are zero, then 64.
 func Ctz64(x uint64) uint64 {
-	if x&0xffffffff == 0 {
-		return 32 + uint64(Ctz32(uint32(x>>32)))
-	}
-	return uint64(Ctz32(uint32(x)))
-
+	x &= -x                      // isolate low-order bit
+	y := x * deBruijn64 >> 58    // extract part of deBruijn sequence
+	y = uint64(deBruijnIdx64[y]) // convert to bit index
+	z := (x - 1) >> 57 & 64      // adjustment if zero
+	return y + z
 }
 
 // Ctz32 counts trailing (low-order) zeroes,
 // and if all are zero, then 32.
 func Ctz32(x uint32) uint32 {
-	if x&0xffff == 0 {
-		return 16 + uint32(Ctz16(uint16(x>>16)))
-	}
-	return uint32(Ctz16(uint16(x)))
+	x &= -x                      // isolate low-order bit
+	y := x * deBruijn32 >> 27    // extract part of deBruijn sequence
+	y = uint32(deBruijnIdx32[y]) // convert to bit index
+	z := (x - 1) >> 26 & 32      // adjustment if zero
+	return y + z
 }
 
 // Ctz16 counts trailing (low-order) zeroes,
 // and if all are zero, then 16.
 func Ctz16(x uint16) uint16 {
-	if x&0xff == 0 {
-		return 8 + uint16(Ctz8(uint8(x>>8)))
-	}
-	return uint16(Ctz8(uint8(x)))
+	x &= -x                      // isolate low-order bit
+	y := x * deBruijn16 >> 12    // extract part of deBruijn sequence
+	y = uint16(deBruijnIdx16[y]) // convert to bit index
+	z := (x - 1) >> 11 & 16      // adjustment if zero
+	return y + z
 }
 
 // Ctz8 counts trailing (low-order) zeroes,
 // and if all are zero, then 8.
 func Ctz8(x uint8) uint8 {
-	return ctzVals[x]
+	x &= -x                    // isolate low-order bit
+	y := x * deBruijn8 >> 5    // extract part of deBruijn sequence
+	y = uint8(deBruijnIdx8[y]) // convert to bit index
+	z := (x - 1) >> 4 & 8      // adjustment if zero
+	return y + z
 }
 
-var ctzVals = [256]uint8{
-	8, 0, 1, 0, 2, 0, 1, 0,
-	3, 0, 1, 0, 2, 0, 1, 0,
-	4, 0, 1, 0, 2, 0, 1, 0,
-	3, 0, 1, 0, 2, 0, 1, 0,
-	5, 0, 1, 0, 2, 0, 1, 0,
-	3, 0, 1, 0, 2, 0, 1, 0,
-	4, 0, 1, 0, 2, 0, 1, 0,
-	3, 0, 1, 0, 2, 0, 1, 0,
-	6, 0, 1, 0, 2, 0, 1, 0,
-	3, 0, 1, 0, 2, 0, 1, 0,
-	4, 0, 1, 0, 2, 0, 1, 0,
-	3, 0, 1, 0, 2, 0, 1, 0,
-	5, 0, 1, 0, 2, 0, 1, 0,
-	3, 0, 1, 0, 2, 0, 1, 0,
-	4, 0, 1, 0, 2, 0, 1, 0,
-	3, 0, 1, 0, 2, 0, 1, 0,
-	7, 0, 1, 0, 2, 0, 1, 0,
-	3, 0, 1, 0, 2, 0, 1, 0,
-	4, 0, 1, 0, 2, 0, 1, 0,
-	3, 0, 1, 0, 2, 0, 1, 0,
-	5, 0, 1, 0, 2, 0, 1, 0,
-	3, 0, 1, 0, 2, 0, 1, 0,
-	4, 0, 1, 0, 2, 0, 1, 0,
-	3, 0, 1, 0, 2, 0, 1, 0,
-	6, 0, 1, 0, 2, 0, 1, 0,
-	3, 0, 1, 0, 2, 0, 1, 0,
-	4, 0, 1, 0, 2, 0, 1, 0,
-	3, 0, 1, 0, 2, 0, 1, 0,
-	5, 0, 1, 0, 2, 0, 1, 0,
-	3, 0, 1, 0, 2, 0, 1, 0,
-	4, 0, 1, 0, 2, 0, 1, 0,
-	3, 0, 1, 0, 2, 0, 1, 0}
-
 // Bswap64 returns its input with byte order reversed
 // 0x0102030405060708 -> 0x0807060504030201
 func Bswap64(x uint64) uint64 {
-	c8 := uint64(0xff00ff00ff00ff00)
-	a := (x & c8) >> 8
-	b := (x &^ c8) << 8
+	c8 := uint64(0x00ff00ff00ff00ff)
+	a := x >> 8 & c8
+	b := (x & c8) << 8
 	x = a | b
-	c16 := uint64(0xffff0000ffff0000)
-	a = (x & c16) >> 16
-	b = (x &^ c16) << 16
+	c16 := uint64(0x0000ffff0000ffff)
+	a = x >> 16 & c16
+	b = (x & c16) << 16
 	x = a | b
-	c32 := uint64(0xffffffff00000000)
-	a = (x & c32) >> 32
-	b = (x &^ c32) << 32
+	c32 := uint64(0x00000000ffffffff)
+	a = x >> 32 & c32
+	b = (x & c32) << 32
 	x = a | b
 	return x
 }
@@ -93,13 +102,13 @@ func Bswap64(x uint64) uint64 {
 // Bswap32 returns its input with byte order reversed
 // 0x01020304 -> 0x04030201
 func Bswap32(x uint32) uint32 {
-	c8 := uint32(0xff00ff00)
-	a := (x & c8) >> 8
-	b := (x &^ c8) << 8
+	c8 := uint32(0x00ff00ff)
+	a := x >> 8 & c8
+	b := (x & c8) << 8
 	x = a | b
-	c16 := uint32(0xffff0000)
-	a = (x & c16) >> 16
-	b = (x &^ c16) << 16
+	c16 := uint32(0x0000ffff)
+	a = x >> 16 & c16
+	b = (x & c16) << 16
 	x = a | b
 	return x
 }
diff --git a/src/runtime/internal/sys/intrinsics_test.go b/src/runtime/internal/sys/intrinsics_test.go
new file mode 100644
index 0000000000..097631bc1e
--- /dev/null
+++ b/src/runtime/internal/sys/intrinsics_test.go
@@ -0,0 +1,54 @@
+package sys_test
+
+import (
+	"runtime/internal/sys"
+	"testing"
+)
+
+func TestCtz64(t *testing.T) {
+	for i := uint(0); i <= 64; i++ {
+		x := uint64(5) << i
+		if got := sys.Ctz64(x); got != uint64(i) {
+			t.Errorf("Ctz64(%d)=%d, want %d", x, got, i)
+		}
+	}
+}
+func TestCtz32(t *testing.T) {
+	for i := uint(0); i <= 32; i++ {
+		x := uint32(5) << i
+		if got := sys.Ctz32(x); got != uint32(i) {
+			t.Errorf("Ctz32(%d)=%d, want %d", x, got, i)
+		}
+	}
+}
+func TestCtz16(t *testing.T) {
+	for i := uint(0); i <= 16; i++ {
+		x := uint16(5) << i
+		if got := sys.Ctz16(x); got != uint16(i) {
+			t.Errorf("Ctz16(%d)=%d, want %d", x, got, i)
+		}
+	}
+}
+func TestCtz8(t *testing.T) {
+	for i := uint(0); i <= 8; i++ {
+		x := uint8(5) << i
+		if got := sys.Ctz8(x); got != uint8(i) {
+			t.Errorf("Ctz8(%d)=%d, want %d", x, got, i)
+		}
+	}
+}
+
+func TestBswap64(t *testing.T) {
+	x := uint64(0x1122334455667788)
+	y := sys.Bswap64(x)
+	if y != 0x8877665544332211 {
+		t.Errorf("Bswap(%x)=%x, want 0x8877665544332211", x, y)
+	}
+}
+func TestBswap32(t *testing.T) {
+	x := uint32(0x11223344)
+	y := sys.Bswap32(x)
+	if y != 0x44332211 {
+		t.Errorf("Bswap(%x)=%x, want 0x44332211", x, y)
+	}
+}
diff --git a/src/runtime/internal/sys/zgoarch_386.go b/src/runtime/internal/sys/zgoarch_386.go
index 3ad244509d..3bcf83b8e3 100644
--- a/src/runtime/internal/sys/zgoarch_386.go
+++ b/src/runtime/internal/sys/zgoarch_386.go
@@ -2,7 +2,7 @@
 
 package sys
 
-const TheGoarch = `386`
+const GOARCH = `386`
 
 const Goarch386 = 1
 const GoarchAmd64 = 0
diff --git a/src/runtime/internal/sys/zgoarch_amd64.go b/src/runtime/internal/sys/zgoarch_amd64.go
index 7c858e3f5d..699f191fba 100644
--- a/src/runtime/internal/sys/zgoarch_amd64.go
+++ b/src/runtime/internal/sys/zgoarch_amd64.go
@@ -2,7 +2,7 @@
 
 package sys
 
-const TheGoarch = `amd64`
+const GOARCH = `amd64`
 
 const Goarch386 = 0
 const GoarchAmd64 = 1
diff --git a/src/runtime/internal/sys/zgoarch_amd64p32.go b/src/runtime/internal/sys/zgoarch_amd64p32.go
index 772031c090..cc2d658406 100644
--- a/src/runtime/internal/sys/zgoarch_amd64p32.go
+++ b/src/runtime/internal/sys/zgoarch_amd64p32.go
@@ -2,7 +2,7 @@
 
 package sys
 
-const TheGoarch = `amd64p32`
+const GOARCH = `amd64p32`
 
 const Goarch386 = 0
 const GoarchAmd64 = 0
diff --git a/src/runtime/internal/sys/zgoarch_arm.go b/src/runtime/internal/sys/zgoarch_arm.go
index 276e8a869b..a5fd789f13 100644
--- a/src/runtime/internal/sys/zgoarch_arm.go
+++ b/src/runtime/internal/sys/zgoarch_arm.go
@@ -2,7 +2,7 @@
 
 package sys
 
-const TheGoarch = `arm`
+const GOARCH = `arm`
 
 const Goarch386 = 0
 const GoarchAmd64 = 0
diff --git a/src/runtime/internal/sys/zgoarch_arm64.go b/src/runtime/internal/sys/zgoarch_arm64.go
index d124ec0343..084d2c7330 100644
--- a/src/runtime/internal/sys/zgoarch_arm64.go
+++ b/src/runtime/internal/sys/zgoarch_arm64.go
@@ -2,7 +2,7 @@
 
 package sys
 
-const TheGoarch = `arm64`
+const GOARCH = `arm64`
 
 const Goarch386 = 0
 const GoarchAmd64 = 0
diff --git a/src/runtime/internal/sys/zgoarch_mips64.go b/src/runtime/internal/sys/zgoarch_mips64.go
index b4a97d6da9..2ad62bd68c 100644
--- a/src/runtime/internal/sys/zgoarch_mips64.go
+++ b/src/runtime/internal/sys/zgoarch_mips64.go
@@ -2,7 +2,7 @@
 
 package sys
 
-const TheGoarch = `mips64`
+const GOARCH = `mips64`
 
 const Goarch386 = 0
 const GoarchAmd64 = 0
diff --git a/src/runtime/internal/sys/zgoarch_mips64le.go b/src/runtime/internal/sys/zgoarch_mips64le.go
index 3328a35bd2..047c8b425a 100644
--- a/src/runtime/internal/sys/zgoarch_mips64le.go
+++ b/src/runtime/internal/sys/zgoarch_mips64le.go
@@ -2,7 +2,7 @@
 
 package sys
 
-const TheGoarch = `mips64le`
+const GOARCH = `mips64le`
 
 const Goarch386 = 0
 const GoarchAmd64 = 0
diff --git a/src/runtime/internal/sys/zgoarch_ppc64.go b/src/runtime/internal/sys/zgoarch_ppc64.go
index 06f78b2023..748b5b562c 100644
--- a/src/runtime/internal/sys/zgoarch_ppc64.go
+++ b/src/runtime/internal/sys/zgoarch_ppc64.go
@@ -2,7 +2,7 @@
 
 package sys
 
-const TheGoarch = `ppc64`
+const GOARCH = `ppc64`
 
 const Goarch386 = 0
 const GoarchAmd64 = 0
diff --git a/src/runtime/internal/sys/zgoarch_ppc64le.go b/src/runtime/internal/sys/zgoarch_ppc64le.go
index 50b56dbe3f..d3dcba467d 100644
--- a/src/runtime/internal/sys/zgoarch_ppc64le.go
+++ b/src/runtime/internal/sys/zgoarch_ppc64le.go
@@ -2,7 +2,7 @@
 
 package sys
 
-const TheGoarch = `ppc64le`
+const GOARCH = `ppc64le`
 
 const Goarch386 = 0
 const GoarchAmd64 = 0
diff --git a/src/runtime/internal/sys/zgoarch_s390x.go b/src/runtime/internal/sys/zgoarch_s390x.go
index ce85f20e0a..1ead5d573c 100644
--- a/src/runtime/internal/sys/zgoarch_s390x.go
+++ b/src/runtime/internal/sys/zgoarch_s390x.go
@@ -2,7 +2,7 @@
 
 package sys
 
-const TheGoarch = `s390x`
+const GOARCH = `s390x`
 
 const Goarch386 = 0
 const GoarchAmd64 = 0
diff --git a/src/runtime/internal/sys/zgoos_android.go b/src/runtime/internal/sys/zgoos_android.go
index 03d91760ed..6503b15246 100644
--- a/src/runtime/internal/sys/zgoos_android.go
+++ b/src/runtime/internal/sys/zgoos_android.go
@@ -2,7 +2,7 @@
 
 package sys
 
-const TheGoos = `android`
+const GOOS = `android`
 
 const GoosAndroid = 1
 const GoosDarwin = 0
diff --git a/src/runtime/internal/sys/zgoos_darwin.go b/src/runtime/internal/sys/zgoos_darwin.go
index eb2efeb7af..6a285984bd 100644
--- a/src/runtime/internal/sys/zgoos_darwin.go
+++ b/src/runtime/internal/sys/zgoos_darwin.go
@@ -2,7 +2,7 @@
 
 package sys
 
-const TheGoos = `darwin`
+const GOOS = `darwin`
 
 const GoosAndroid = 0
 const GoosDarwin = 1
diff --git a/src/runtime/internal/sys/zgoos_dragonfly.go b/src/runtime/internal/sys/zgoos_dragonfly.go
index 403cf65311..886ac2698f 100644
--- a/src/runtime/internal/sys/zgoos_dragonfly.go
+++ b/src/runtime/internal/sys/zgoos_dragonfly.go
@@ -2,7 +2,7 @@
 
 package sys
 
-const TheGoos = `dragonfly`
+const GOOS = `dragonfly`
 
 const GoosAndroid = 0
 const GoosDarwin = 0
diff --git a/src/runtime/internal/sys/zgoos_freebsd.go b/src/runtime/internal/sys/zgoos_freebsd.go
index 632d5db9db..0bf2403eab 100644
--- a/src/runtime/internal/sys/zgoos_freebsd.go
+++ b/src/runtime/internal/sys/zgoos_freebsd.go
@@ -2,7 +2,7 @@
 
 package sys
 
-const TheGoos = `freebsd`
+const GOOS = `freebsd`
 
 const GoosAndroid = 0
 const GoosDarwin = 0
diff --git a/src/runtime/internal/sys/zgoos_linux.go b/src/runtime/internal/sys/zgoos_linux.go
index 2d43869a84..c8664db15d 100644
--- a/src/runtime/internal/sys/zgoos_linux.go
+++ b/src/runtime/internal/sys/zgoos_linux.go
@@ -4,7 +4,7 @@
 
 package sys
 
-const TheGoos = `linux`
+const GOOS = `linux`
 
 const GoosAndroid = 0
 const GoosDarwin = 0
diff --git a/src/runtime/internal/sys/zgoos_nacl.go b/src/runtime/internal/sys/zgoos_nacl.go
index a56b6ef3c9..054122638a 100644
--- a/src/runtime/internal/sys/zgoos_nacl.go
+++ b/src/runtime/internal/sys/zgoos_nacl.go
@@ -2,7 +2,7 @@
 
 package sys
 
-const TheGoos = `nacl`
+const GOOS = `nacl`
 
 const GoosAndroid = 0
 const GoosDarwin = 0
diff --git a/src/runtime/internal/sys/zgoos_netbsd.go b/src/runtime/internal/sys/zgoos_netbsd.go
index 46fd0a7cd5..5c509a1250 100644
--- a/src/runtime/internal/sys/zgoos_netbsd.go
+++ b/src/runtime/internal/sys/zgoos_netbsd.go
@@ -2,7 +2,7 @@
 
 package sys
 
-const TheGoos = `netbsd`
+const GOOS = `netbsd`
 
 const GoosAndroid = 0
 const GoosDarwin = 0
diff --git a/src/runtime/internal/sys/zgoos_openbsd.go b/src/runtime/internal/sys/zgoos_openbsd.go
index 7ee650afbb..dc43157d49 100644
--- a/src/runtime/internal/sys/zgoos_openbsd.go
+++ b/src/runtime/internal/sys/zgoos_openbsd.go
@@ -2,7 +2,7 @@
 
 package sys
 
-const TheGoos = `openbsd`
+const GOOS = `openbsd`
 
 const GoosAndroid = 0
 const GoosDarwin = 0
diff --git a/src/runtime/internal/sys/zgoos_plan9.go b/src/runtime/internal/sys/zgoos_plan9.go
index 162e7f6260..4b0934f77a 100644
--- a/src/runtime/internal/sys/zgoos_plan9.go
+++ b/src/runtime/internal/sys/zgoos_plan9.go
@@ -2,7 +2,7 @@
 
 package sys
 
-const TheGoos = `plan9`
+const GOOS = `plan9`
 
 const GoosAndroid = 0
 const GoosDarwin = 0
diff --git a/src/runtime/internal/sys/zgoos_solaris.go b/src/runtime/internal/sys/zgoos_solaris.go
index b2a8f98504..42511a36ad 100644
--- a/src/runtime/internal/sys/zgoos_solaris.go
+++ b/src/runtime/internal/sys/zgoos_solaris.go
@@ -2,7 +2,7 @@
 
 package sys
 
-const TheGoos = `solaris`
+const GOOS = `solaris`
 
 const GoosAndroid = 0
 const GoosDarwin = 0
diff --git a/src/runtime/internal/sys/zgoos_windows.go b/src/runtime/internal/sys/zgoos_windows.go
index 817ec79e4c..d77f62c396 100644
--- a/src/runtime/internal/sys/zgoos_windows.go
+++ b/src/runtime/internal/sys/zgoos_windows.go
@@ -2,7 +2,7 @@
 
 package sys
 
-const TheGoos = `windows`
+const GOOS = `windows`
 
 const GoosAndroid = 0
 const GoosDarwin = 0
diff --git a/src/runtime/lfstack.go b/src/runtime/lfstack.go
index ea640eb12f..db54ecb05e 100644
--- a/src/runtime/lfstack.go
+++ b/src/runtime/lfstack.go
@@ -3,6 +3,9 @@
 // license that can be found in the LICENSE file.
 
 // Lock-free stack.
+// Initialize head to 0, compare with 0 to test for emptiness.
+// The stack does not keep pointers to nodes,
+// so they can be garbage collected if there are no other pointers to nodes.
 // The following code runs only on g0 stack.
 
 package runtime
@@ -15,7 +18,7 @@ import (
 func lfstackpush(head *uint64, node *lfnode) {
 	node.pushcnt++
 	new := lfstackPack(node, node.pushcnt)
-	if node1, _ := lfstackUnpack(new); node1 != node {
+	if node1 := lfstackUnpack(new); node1 != node {
 		print("runtime: lfstackpush invalid packing: node=", node, " cnt=", hex(node.pushcnt), " packed=", hex(new), " -> node=", node1, "\n")
 		throw("lfstackpush")
 	}
@@ -34,7 +37,7 @@ func lfstackpop(head *uint64) unsafe.Pointer {
 		if old == 0 {
 			return nil
 		}
-		node, _ := lfstackUnpack(old)
+		node := lfstackUnpack(old)
 		next := atomic.Load64(&node.next)
 		if atomic.Cas64(head, old, next) {
 			return unsafe.Pointer(node)
diff --git a/src/runtime/lfstack_32bit.go b/src/runtime/lfstack_32bit.go
index 36811c1e47..2f59e0212e 100644
--- a/src/runtime/lfstack_32bit.go
+++ b/src/runtime/lfstack_32bit.go
@@ -14,8 +14,6 @@ func lfstackPack(node *lfnode, cnt uintptr) uint64 {
 	return uint64(uintptr(unsafe.Pointer(node)))<<32 | uint64(cnt)
 }
 
-func lfstackUnpack(val uint64) (node *lfnode, cnt uintptr) {
-	node = (*lfnode)(unsafe.Pointer(uintptr(val >> 32)))
-	cnt = uintptr(val)
-	return
+func lfstackUnpack(val uint64) *lfnode {
+	return (*lfnode)(unsafe.Pointer(uintptr(val >> 32)))
 }
diff --git a/src/runtime/lfstack_64bit.go b/src/runtime/lfstack_64bit.go
new file mode 100644
index 0000000000..5367f08c56
--- /dev/null
+++ b/src/runtime/lfstack_64bit.go
@@ -0,0 +1,48 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64 arm64 mips64 mips64le ppc64 ppc64le s390x
+
+package runtime
+
+import "unsafe"
+
+const (
+	// addrBits is the number of bits needed to represent a virtual address.
+	//
+	// On Linux the user address space for each architecture is limited as
+	// follows (taken from the processor.h file for the architecture):
+	//
+	// Architecture  Name              Maximum Value (exclusive)
+	// ---------------------------------------------------------------------
+	// arm64         TASK_SIZE_64      Depends on configuration.
+	// ppc64{,le}    TASK_SIZE_USER64  0x400000000000UL (46 bit addresses)
+	// mips64{,le}   TASK_SIZE64       0x010000000000UL (40 bit addresses)
+	// s390x         TASK_SIZE         0x020000000000UL (41 bit addresses)
+	//
+	// These values may increase over time.
+	//
+	// On AMD64, virtual addresses are 48-bit numbers sign extended to 64.
+	// We shift the address left 16 to eliminate the sign extended part and make
+	// room in the bottom for the count.
+	addrBits = 48
+
+	// In addition to the 16 bits taken from the top, we can take 3 from the
+	// bottom, because node must be pointer-aligned, giving a total of 19 bits
+	// of count.
+	cntBits = 64 - addrBits + 3
+)
+
+func lfstackPack(node *lfnode, cnt uintptr) uint64 {
+	return uint64(uintptr(unsafe.Pointer(node)))<<(64-addrBits) | uint64(cnt&(1<<cntBits-1))
+}
+
+func lfstackUnpack(val uint64) *lfnode {
+	if GOARCH == "amd64" {
+		// amd64 systems can place the stack above the VA hole, so we need to sign extend
+		// val before unpacking.
+		return (*lfnode)(unsafe.Pointer(uintptr(int64(val) >> cntBits << 3)))
+	}
+	return (*lfnode)(unsafe.Pointer(uintptr(val >> cntBits << 3)))
+}
diff --git a/src/runtime/lfstack_amd64.go b/src/runtime/lfstack_amd64.go
deleted file mode 100644
index 0a71455c6b..0000000000
--- a/src/runtime/lfstack_amd64.go
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-// On AMD64, virtual addresses are 48-bit numbers sign extended to 64.
-// We shift the address left 16 to eliminate the sign extended part and make
-// room in the bottom for the count.
-// In addition to the 16 bits taken from the top, we can take 3 from the
-// bottom, because node must be pointer-aligned, giving a total of 19 bits
-// of count.
-
-func lfstackPack(node *lfnode, cnt uintptr) uint64 {
-	return uint64(uintptr(unsafe.Pointer(node)))<<16 | uint64(cnt&(1<<19-1))
-}
-
-func lfstackUnpack(val uint64) (node *lfnode, cnt uintptr) {
-	node = (*lfnode)(unsafe.Pointer(uintptr(int64(val) >> 19 << 3)))
-	cnt = uintptr(val & (1<<19 - 1))
-	return
-}
diff --git a/src/runtime/lfstack_darwin_arm64.go b/src/runtime/lfstack_darwin_arm64.go
deleted file mode 100644
index f48d76382b..0000000000
--- a/src/runtime/lfstack_darwin_arm64.go
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-// In addition to the 16 bits taken from the top, we can take 3 from the
-// bottom, because node must be pointer-aligned, giving a total of 19 bits
-// of count.
-const (
-	addrBits = 48
-	cntBits  = 64 - addrBits + 3
-)
-
-func lfstackPack(node *lfnode, cnt uintptr) uint64 {
-	return uint64(uintptr(unsafe.Pointer(node)))<<(64-addrBits) | uint64(cnt&(1<<cntBits-1))
-}
-
-func lfstackUnpack(val uint64) (node *lfnode, cnt uintptr) {
-	node = (*lfnode)(unsafe.Pointer(uintptr(val >> cntBits << 3)))
-	cnt = uintptr(val & (1<<cntBits - 1))
-	return
-}
diff --git a/src/runtime/lfstack_linux_arm64.go b/src/runtime/lfstack_linux_arm64.go
deleted file mode 100644
index f48d76382b..0000000000
--- a/src/runtime/lfstack_linux_arm64.go
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-// In addition to the 16 bits taken from the top, we can take 3 from the
-// bottom, because node must be pointer-aligned, giving a total of 19 bits
-// of count.
-const (
-	addrBits = 48
-	cntBits  = 64 - addrBits + 3
-)
-
-func lfstackPack(node *lfnode, cnt uintptr) uint64 {
-	return uint64(uintptr(unsafe.Pointer(node)))<<(64-addrBits) | uint64(cnt&(1<<cntBits-1))
-}
-
-func lfstackUnpack(val uint64) (node *lfnode, cnt uintptr) {
-	node = (*lfnode)(unsafe.Pointer(uintptr(val >> cntBits << 3)))
-	cnt = uintptr(val & (1<<cntBits - 1))
-	return
-}
diff --git a/src/runtime/lfstack_linux_mips64x.go b/src/runtime/lfstack_linux_mips64x.go
deleted file mode 100644
index 7ff95f77ae..0000000000
--- a/src/runtime/lfstack_linux_mips64x.go
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build mips64 mips64le
-// +build linux
-
-package runtime
-
-import "unsafe"
-
-// On mips64, Linux limits the user address space to 40 bits (see
-// TASK_SIZE64 in the Linux kernel).  This has grown over time,
-// so here we allow 48 bit addresses.
-//
-// In addition to the 16 bits taken from the top, we can take 3 from the
-// bottom, because node must be pointer-aligned, giving a total of 19 bits
-// of count.
-const (
-	addrBits = 48
-	cntBits  = 64 - addrBits + 3
-)
-
-func lfstackPack(node *lfnode, cnt uintptr) uint64 {
-	return uint64(uintptr(unsafe.Pointer(node)))<<(64-addrBits) | uint64(cnt&(1<<cntBits-1))
-}
-
-func lfstackUnpack(val uint64) (node *lfnode, cnt uintptr) {
-	node = (*lfnode)(unsafe.Pointer(uintptr(val >> cntBits << 3)))
-	cnt = uintptr(val & (1<<cntBits - 1))
-	return
-}
diff --git a/src/runtime/lfstack_linux_ppc64x.go b/src/runtime/lfstack_linux_ppc64x.go
deleted file mode 100644
index 83b7cf4f58..0000000000
--- a/src/runtime/lfstack_linux_ppc64x.go
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build ppc64 ppc64le
-// +build linux
-
-package runtime
-
-import "unsafe"
-
-// On ppc64, Linux limits the user address space to 46 bits (see
-// TASK_SIZE_USER64 in the Linux kernel).  This has grown over time,
-// so here we allow 48 bit addresses.
-//
-// In addition to the 16 bits taken from the top, we can take 3 from the
-// bottom, because node must be pointer-aligned, giving a total of 19 bits
-// of count.
-const (
-	addrBits = 48
-	cntBits  = 64 - addrBits + 3
-)
-
-func lfstackPack(node *lfnode, cnt uintptr) uint64 {
-	return uint64(uintptr(unsafe.Pointer(node)))<<(64-addrBits) | uint64(cnt&(1<<cntBits-1))
-}
-
-func lfstackUnpack(val uint64) (node *lfnode, cnt uintptr) {
-	node = (*lfnode)(unsafe.Pointer(uintptr(val >> cntBits << 3)))
-	cnt = uintptr(val & (1<<cntBits - 1))
-	return
-}
diff --git a/src/runtime/lock_futex.go b/src/runtime/lock_futex.go
index d28fd92720..073136abd0 100644
--- a/src/runtime/lock_futex.go
+++ b/src/runtime/lock_futex.go
@@ -13,13 +13,13 @@ import (
 
 // This implementation depends on OS-specific implementations of
 //
-//	runtime·futexsleep(uint32 *addr, uint32 val, int64 ns)
+//	futexsleep(addr *uint32, val uint32, ns int64)
 //		Atomically,
-//			if(*addr == val) sleep
+//			if *addr == val { sleep }
 //		Might be woken up spuriously; that's allowed.
 //		Don't sleep longer than ns; ns < 0 means forever.
 //
-//	runtime·futexwakeup(uint32 *addr, uint32 cnt)
+//	futexwakeup(addr *uint32, cnt uint32)
 //		If any procs are sleeping on addr, wake up at most cnt.
 
 const (
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 31335dae80..6fe4656603 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -87,9 +87,6 @@ import (
 const (
 	debugMalloc = false
 
-	flagNoScan = _FlagNoScan
-	flagNoZero = _FlagNoZero
-
 	maxTinySize   = _TinySize
 	tinySizeClass = _TinySizeClass
 	maxSmallSize  = _MaxSmallSize
@@ -490,12 +487,6 @@ func (h *mheap) sysAlloc(n uintptr) unsafe.Pointer {
 // base address for all 0-byte allocations
 var zerobase uintptr
 
-const (
-	// flags to malloc
-	_FlagNoScan = 1 << 0 // GC doesn't have to scan object
-	_FlagNoZero = 1 << 1 // don't zero memory
-)
-
 // nextFreeFast returns the next free object if one is quickly available.
 // Otherwise it returns 0.
 func (c *mcache) nextFreeFast(sizeclass int8) gclinkptr {
@@ -564,7 +555,7 @@ func (c *mcache) nextFree(sizeclass int8) (v gclinkptr, shouldhelpgc bool) {
 // Allocate an object of size bytes.
 // Small objects are allocated from the per-P cache's free lists.
 // Large objects (> 32 kB) are allocated straight from the heap.
-func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
+func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 	if gcphase == _GCmarktermination {
 		throw("mallocgc called with gcphase == _GCmarktermination")
 	}
@@ -573,10 +564,6 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
 		return unsafe.Pointer(&zerobase)
 	}
 
-	if flags&flagNoScan == 0 && typ == nil {
-		throw("malloc missing type")
-	}
-
 	if debug.sbrk != 0 {
 		align := uintptr(16)
 		if typ != nil {
@@ -620,14 +607,15 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
 	dataSize := size
 	c := gomcache()
 	var x unsafe.Pointer
+	noscan := typ == nil || typ.kind&kindNoPointers != 0
 	if size <= maxSmallSize {
-		if flags&flagNoScan != 0 && size < maxTinySize {
+		if noscan && size < maxTinySize {
 			// Tiny allocator.
 			//
 			// Tiny allocator combines several tiny allocation requests
 			// into a single memory block. The resulting memory block
 			// is freed when all subobjects are unreachable. The subobjects
-			// must be FlagNoScan (don't have pointers), this ensures that
+			// must be noscan (don't have pointers); this ensures that
 			// the amount of potentially wasted memory is bounded.
 			//
 			// Size of the memory block used for combining (maxTinySize) is tunable.
@@ -699,7 +687,7 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
 				v, shouldhelpgc = c.nextFree(sizeclass)
 			}
 			x = unsafe.Pointer(v)
-			if flags&flagNoZero == 0 {
+			if needzero {
 				memclr(unsafe.Pointer(v), size)
 				// TODO:(rlh) Only clear if object is not known to be zeroed.
 			}
@@ -708,14 +696,15 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
 		var s *mspan
 		shouldhelpgc = true
 		systemstack(func() {
-			s = largeAlloc(size, flags)
+			s = largeAlloc(size, needzero)
 		})
 		s.freeindex = 1
 		x = unsafe.Pointer(s.base())
 		size = s.elemsize
 	}
 
-	if flags&flagNoScan != 0 {
+	var scanSize uintptr
+	if noscan {
 		heapBitsSetTypeNoScan(uintptr(x), size)
 	} else {
 		// If allocating a defer+arg block, now that we've picked a malloc size
@@ -733,11 +722,12 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
 			// pointers, GC has to scan to the last
 			// element.
 			if typ.ptrdata != 0 {
-				c.local_scan += dataSize - typ.size + typ.ptrdata
+				scanSize = dataSize - typ.size + typ.ptrdata
 			}
 		} else {
-			c.local_scan += typ.ptrdata
+			scanSize = typ.ptrdata
 		}
+		c.local_scan += scanSize
 
 		// Ensure that the stores above that initialize x to
 		// type-safe memory and set the heap bits occur before
@@ -748,14 +738,12 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
 		publicationBarrier()
 	}
 
-	// GCmarkterminate allocates black
+	// Allocate black during GC.
 	// All slots hold nil so no scanning is needed.
 	// This may be racing with GC so do it atomically if there can be
 	// a race marking the bit.
-	if gcphase == _GCmarktermination || gcBlackenPromptly {
-		systemstack(func() {
-			gcmarknewobject_m(uintptr(x), size)
-		})
+	if gcphase != _GCoff {
+		gcmarknewobject(uintptr(x), size, scanSize)
 	}
 
 	// The object x is about to be reused but tracefree and msanfree
@@ -813,7 +801,7 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
 	return x
 }
 
-func largeAlloc(size uintptr, flag uint32) *mspan {
+func largeAlloc(size uintptr, needzero bool) *mspan {
 	// print("largeAlloc size=", size, "\n")
 
 	if size+_PageSize < size {
@@ -829,7 +817,7 @@ func largeAlloc(size uintptr, flag uint32) *mspan {
 	// pays the debt down to npage pages.
 	deductSweepCredit(npages*_PageSize, npages)
 
-	s := mheap_.alloc(npages, 0, true, flag&_FlagNoZero == 0)
+	s := mheap_.alloc(npages, 0, true, needzero)
 	if s == nil {
 		throw("out of memory")
 	}
@@ -840,11 +828,7 @@ func largeAlloc(size uintptr, flag uint32) *mspan {
 
 // implementation of new builtin
 func newobject(typ *_type) unsafe.Pointer {
-	flags := uint32(0)
-	if typ.kind&kindNoPointers != 0 {
-		flags |= flagNoScan
-	}
-	return mallocgc(typ.size, typ, flags)
+	return mallocgc(typ.size, typ, true)
 }
 
 //go:linkname reflect_unsafe_New reflect.unsafe_New
@@ -852,29 +836,19 @@ func reflect_unsafe_New(typ *_type) unsafe.Pointer {
 	return newobject(typ)
 }
 
-// implementation of make builtin for slices
-func newarray(typ *_type, n uintptr) unsafe.Pointer {
-	flags := uint32(0)
-	if typ.kind&kindNoPointers != 0 {
-		flags |= flagNoScan
+// newarray allocates an array of n elements of type typ.
+func newarray(typ *_type, n int) unsafe.Pointer {
+	if n < 0 || uintptr(n) > maxSliceCap(typ.size) {
+		panic(plainError("runtime: allocation size out of range"))
 	}
-	if int(n) < 0 || (typ.size > 0 && n > _MaxMem/typ.size) {
-		panic("runtime: allocation size out of range")
-	}
-	return mallocgc(typ.size*n, typ, flags)
+	return mallocgc(typ.size*uintptr(n), typ, true)
 }
 
 //go:linkname reflect_unsafe_NewArray reflect.unsafe_NewArray
-func reflect_unsafe_NewArray(typ *_type, n uintptr) unsafe.Pointer {
+func reflect_unsafe_NewArray(typ *_type, n int) unsafe.Pointer {
 	return newarray(typ, n)
 }
 
-// rawmem returns a chunk of pointerless memory. It is
-// not zeroed.
-func rawmem(size uintptr) unsafe.Pointer {
-	return mallocgc(size, nil, flagNoScan|flagNoZero)
-}
-
 func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
 	mp.mcache.next_sample = nextSample()
 	mProf_Malloc(x, size)
diff --git a/src/runtime/map_test.go b/src/runtime/map_test.go
index 9d2894cb6f..496f8e8868 100644
--- a/src/runtime/map_test.go
+++ b/src/runtime/map_test.go
@@ -317,6 +317,22 @@ func TestBigItems(t *testing.T) {
 	}
 }
 
+func TestMapHugeZero(t *testing.T) {
+	type T [4000]byte
+	m := map[int]T{}
+	x := m[0]
+	if x != (T{}) {
+		t.Errorf("map value not zero")
+	}
+	y, ok := m[0]
+	if ok {
+		t.Errorf("map value should be missing")
+	}
+	if y != (T{}) {
+		t.Errorf("map value not zero")
+	}
+}
+
 type empty struct {
 }
 
diff --git a/src/runtime/mbarrier.go b/src/runtime/mbarrier.go
index f03bf18ebc..637d9b886a 100644
--- a/src/runtime/mbarrier.go
+++ b/src/runtime/mbarrier.go
@@ -87,6 +87,17 @@ import (
 // frames that have potentially been active since the concurrent scan,
 // so it depends on write barriers to track changes to pointers in
 // stack frames that have not been active.
+//
+//
+// Global writes:
+//
+// The Go garbage collector requires write barriers when heap pointers
+// are stored in globals. Many garbage collectors ignore writes to
+// globals and instead pick up global -> heap pointers during
+// termination. This increases pause time, so we instead rely on write
+// barriers for writes to globals so that we don't have to rescan
+// globals during mark termination.
+//
 //go:nowritebarrierrec
 func gcmarkwb_m(slot *uintptr, ptr uintptr) {
 	if writeBarrier.needed {
@@ -185,7 +196,7 @@ func reflect_typedmemmovepartial(typ *_type, dst, src unsafe.Pointer, off, size
 	if writeBarrier.cgo {
 		cgoCheckMemmove(typ, dst, src, off, size)
 	}
-	if !writeBarrier.needed || typ.kind&kindNoPointers != 0 || size < sys.PtrSize || !inheap(uintptr(dst)) {
+	if !writeBarrier.needed || typ.kind&kindNoPointers != 0 || size < sys.PtrSize {
 		return
 	}
 
@@ -201,11 +212,11 @@ func reflect_typedmemmovepartial(typ *_type, dst, src unsafe.Pointer, off, size
 // values have just been copied to frame, starting at retoffset
 // and continuing to framesize. The entire frame (not just the return
 // values) is described by typ. Because the copy has already
-// happened, we call writebarrierptr_nostore, and we must be careful
-// not to be preempted before the write barriers have been run.
+// happened, we call writebarrierptr_nostore, and this is nosplit so
+// the copy and write barrier appear atomic to GC.
 //go:nosplit
 func callwritebarrier(typ *_type, frame unsafe.Pointer, framesize, retoffset uintptr) {
-	if !writeBarrier.needed || typ == nil || typ.kind&kindNoPointers != 0 || framesize-retoffset < sys.PtrSize || !inheap(uintptr(frame)) {
+	if !writeBarrier.needed || typ == nil || typ.kind&kindNoPointers != 0 || framesize-retoffset < sys.PtrSize {
 		return
 	}
 	heapBitsBulkBarrier(uintptr(add(frame, retoffset)), framesize-retoffset)
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index b342de600e..af89577703 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -88,6 +88,7 @@ const (
 
 // addb returns the byte pointer p+n.
 //go:nowritebarrier
+//go:nosplit
 func addb(p *byte, n uintptr) *byte {
 	// Note: wrote out full expression instead of calling add(p, n)
 	// to reduce the number of temporaries generated by the
@@ -99,6 +100,7 @@ func addb(p *byte, n uintptr) *byte {
 // subtractb is typically used when traversing the pointer tables referred to by hbits
 // which are arranged in reverse order.
 //go:nowritebarrier
+//go:nosplit
 func subtractb(p *byte, n uintptr) *byte {
 	// Note: wrote out full expression instead of calling add(p, -n)
 	// to reduce the number of temporaries generated by the
@@ -108,6 +110,7 @@ func subtractb(p *byte, n uintptr) *byte {
 
 // add1 returns the byte pointer p+1.
 //go:nowritebarrier
+//go:nosplit
 func add1(p *byte) *byte {
 	// Note: wrote out full expression instead of calling addb(p, 1)
 	// to reduce the number of temporaries generated by the
@@ -596,10 +599,10 @@ func (h heapBits) setCheckmarked(size uintptr) {
 
 // heapBitsBulkBarrier executes writebarrierptr_nostore
 // for every pointer slot in the memory range [p, p+size),
-// using the heap bitmap to locate those pointer slots.
+// using the heap, data, or BSS bitmap to locate those pointer slots.
 // This executes the write barriers necessary after a memmove.
 // Both p and size must be pointer-aligned.
-// The range [p, p+size) must lie within a single allocation.
+// The range [p, p+size) must lie within a single object.
 //
 // Callers should call heapBitsBulkBarrier immediately after
 // calling memmove(p, src, size). This function is marked nosplit
@@ -643,6 +646,22 @@ func heapBitsBulkBarrier(p, size uintptr) {
 			systemstack(func() {
 				gcUnwindBarriers(gp, p)
 			})
+			return
+		}
+
+		// If p is a global, use the data or BSS bitmaps to
+		// execute write barriers.
+		for datap := &firstmoduledata; datap != nil; datap = datap.next {
+			if datap.data <= p && p < datap.edata {
+				bulkBarrierBitmap(p, size, p-datap.data, datap.gcdatamask.bytedata)
+				return
+			}
+		}
+		for datap := &firstmoduledata; datap != nil; datap = datap.next {
+			if datap.bss <= p && p < datap.ebss {
+				bulkBarrierBitmap(p, size, p-datap.bss, datap.gcbssmask.bytedata)
+				return
+			}
 		}
 		return
 	}
@@ -657,6 +676,36 @@ func heapBitsBulkBarrier(p, size uintptr) {
 	}
 }
 
+// bulkBarrierBitmap executes write barriers for [p, p+size) using a
+// 1-bit pointer bitmap. p is assumed to start maskOffset bytes into
+// the data covered by the bitmap in bits.
+//
+// This is used by heapBitsBulkBarrier for writes to data and BSS.
+//
+//go:nosplit
+func bulkBarrierBitmap(p, size, maskOffset uintptr, bits *uint8) {
+	word := maskOffset / sys.PtrSize
+	bits = addb(bits, word/8)
+	mask := uint8(1) << (word % 8)
+
+	for i := uintptr(0); i < size; i += sys.PtrSize {
+		if mask == 0 {
+			bits = addb(bits, 1)
+			if *bits == 0 {
+				// Skip 8 words.
+				i += 7 * sys.PtrSize
+				continue
+			}
+			mask = 1
+		}
+		if *bits&mask != 0 {
+			x := (*uintptr)(unsafe.Pointer(p + i))
+			writebarrierptr_nostore(x, *x)
+		}
+		mask <<= 1
+	}
+}
+
 // typeBitsBulkBarrier executes writebarrierptr_nostore
 // for every pointer slot in the memory range [p, p+size),
 // using the type bitmap to locate those pointer slots.
@@ -676,11 +725,11 @@ func typeBitsBulkBarrier(typ *_type, p, size uintptr) {
 		throw("runtime: typeBitsBulkBarrier without type")
 	}
 	if typ.size != size {
-		println("runtime: typeBitsBulkBarrier with type ", typ._string, " of size ", typ.size, " but memory size", size)
+		println("runtime: typeBitsBulkBarrier with type ", typ.string(), " of size ", typ.size, " but memory size", size)
 		throw("runtime: invalid typeBitsBulkBarrier")
 	}
 	if typ.kind&kindGCProg != 0 {
-		println("runtime: typeBitsBulkBarrier with type ", typ._string, " with GC prog")
+		println("runtime: typeBitsBulkBarrier with type ", typ.string(), " with GC prog")
 		throw("runtime: invalid typeBitsBulkBarrier")
 	}
 	if !writeBarrier.needed {
@@ -1128,7 +1177,7 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 	}
 	if nw == 0 {
 		// No pointers! Caller was supposed to check.
-		println("runtime: invalid type ", typ._string)
+		println("runtime: invalid type ", typ.string())
 		throw("heapBitsSetType: called with non-pointer type")
 		return
 	}
@@ -1314,7 +1363,7 @@ Phase4:
 	if doubleCheck {
 		end := heapBitsForAddr(x + size)
 		if typ.kind&kindGCProg == 0 && (hbitp != end.bitp || (w == nw+2) != (end.shift == 2)) {
-			println("ended at wrong bitmap byte for", typ._string, "x", dataSize/typ.size)
+			println("ended at wrong bitmap byte for", typ.string(), "x", dataSize/typ.size)
 			print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
 			print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
 			h0 := heapBitsForAddr(x)
@@ -1350,7 +1399,7 @@ Phase4:
 				}
 			}
 			if have != want {
-				println("mismatch writing bits for", typ._string, "x", dataSize/typ.size)
+				println("mismatch writing bits for", typ.string(), "x", dataSize/typ.size)
 				print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
 				print("kindGCProg=", typ.kind&kindGCProg != 0, "\n")
 				print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
diff --git a/src/runtime/mem_linux.go b/src/runtime/mem_linux.go
index 1ee13bd7e6..61fdcee543 100644
--- a/src/runtime/mem_linux.go
+++ b/src/runtime/mem_linux.go
@@ -132,6 +132,13 @@ func sysUnused(v unsafe.Pointer, n uintptr) {
 		}
 	}
 
+	if uintptr(v)&(sys.PhysPageSize-1) != 0 || n&(sys.PhysPageSize-1) != 0 {
+		// madvise will round this to any physical page
+		// *covered* by this range, so an unaligned madvise
+		// will release more memory than intended.
+		throw("unaligned sysUnused")
+	}
+
 	madvise(v, n, _MADV_DONTNEED)
 }
 
diff --git a/src/runtime/memclr_s390x.s b/src/runtime/memclr_s390x.s
new file mode 100644
index 0000000000..86eafec0a9
--- /dev/null
+++ b/src/runtime/memclr_s390x.s
@@ -0,0 +1,122 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// void runtime·memclr(void*, uintptr)
+TEXT runtime·memclr(SB),NOSPLIT|NOFRAME,$0-16
+	MOVD	ptr+0(FP), R4
+	MOVD	n+8(FP), R5
+
+start:
+	CMPBLE	R5, $3, clear0to3
+	CMPBLE	R5, $7, clear4to7
+	CMPBLE	R5, $11, clear8to11
+	CMPBLE	R5, $15, clear12to15
+	CMP	R5, $32
+	BGE	clearmt32
+	MOVD	R0, 0(R4)
+	MOVD	R0, 8(R4)
+	ADD	$16, R4
+	SUB	$16, R5
+	BR	start
+
+clear0to3:
+	CMPBEQ	R5, $0, done
+	CMPBNE	R5, $1, clear2
+	MOVB	R0, 0(R4)
+	RET
+clear2:
+	CMPBNE	R5, $2, clear3
+	MOVH	R0, 0(R4)
+	RET
+clear3:
+	MOVH	R0, 0(R4)
+	MOVB	R0, 2(R4)
+	RET
+
+clear4to7:
+	CMPBNE	R5, $4, clear5
+	MOVW	R0, 0(R4)
+	RET
+clear5:
+	CMPBNE	R5, $5, clear6
+	MOVW	R0, 0(R4)
+	MOVB	R0, 4(R4)
+	RET
+clear6:
+	CMPBNE	R5, $6, clear7
+	MOVW	R0, 0(R4)
+	MOVH	R0, 4(R4)
+	RET
+clear7:
+	MOVW	R0, 0(R4)
+	MOVH	R0, 4(R4)
+	MOVB	R0, 6(R4)
+	RET
+
+clear8to11:
+	CMPBNE	R5, $8, clear9
+	MOVD	R0, 0(R4)
+	RET
+clear9:
+	CMPBNE	R5, $9, clear10
+	MOVD	R0, 0(R4)
+	MOVB	R0, 8(R4)
+	RET
+clear10:
+	CMPBNE	R5, $10, clear11
+	MOVD	R0, 0(R4)
+	MOVH	R0, 8(R4)
+	RET
+clear11:
+	MOVD	R0, 0(R4)
+	MOVH	R0, 8(R4)
+	MOVB	R0, 10(R4)
+	RET
+
+clear12to15:
+	CMPBNE	R5, $12, clear13
+	MOVD	R0, 0(R4)
+	MOVW	R0, 8(R4)
+	RET
+clear13:
+	CMPBNE	R5, $13, clear14
+	MOVD	R0, 0(R4)
+	MOVW	R0, 8(R4)
+	MOVB	R0, 12(R4)
+	RET
+clear14:
+	CMPBNE	R5, $14, clear15
+	MOVD	R0, 0(R4)
+	MOVW	R0, 8(R4)
+	MOVH	R0, 12(R4)
+	RET
+clear15:
+	MOVD	R0, 0(R4)
+	MOVW	R0, 8(R4)
+	MOVH	R0, 12(R4)
+	MOVB	R0, 14(R4)
+	RET
+
+clearmt32:
+	CMP	R5, $256
+	BLT	clearlt256
+	XC	$256, 0(R4), 0(R4)
+	ADD	$256, R4
+	ADD	$-256, R5
+	BR	clearmt32
+clearlt256:
+	CMPBEQ	R5, $0, done
+	ADD	$-1, R5
+	EXRL	$runtime·memclr_s390x_exrl_xc(SB), R5
+done:
+	RET
+
+// DO NOT CALL - target for exrl (execute relative long) instruction.
+TEXT runtime·memclr_s390x_exrl_xc(SB),NOSPLIT|NOFRAME,$0-0
+	XC	$1, 0(R4), 0(R4)
+	MOVD	R0, 0(R0)
+	RET
+
diff --git a/src/runtime/memmove_ppc64x.s b/src/runtime/memmove_ppc64x.s
index ea73b455b4..26dabd9e69 100644
--- a/src/runtime/memmove_ppc64x.s
+++ b/src/runtime/memmove_ppc64x.s
@@ -11,78 +11,109 @@ TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
 	MOVD	to+0(FP), R3
 	MOVD	from+8(FP), R4
 	MOVD	n+16(FP), R5
-	CMP	R5, $0
-	BNE	check
-	RET
 
+	// Determine if there are doublewords to
+	// copy so a more efficient move can be done
 check:
-	ANDCC	$7, R5, R7	// R7 is the number of bytes to copy and CR0[EQ] is set if there are none.
-	SRAD	$3, R5, R6	// R6 is the number of words to copy
-	CMP	R6, $0, CR1	// CR1[EQ] is set if there are no words to copy.
+	ANDCC	$7, R5, R7	// R7: bytes to copy
+	SRAD	$3, R5, R6	// R6: double words to copy
+	CMP	R6, $0, CR1	// CR1[EQ] set if no double words to copy
+
+	// Determine overlap by subtracting dest - src and comparing against the
+	// length.  This catches the cases where src and dest are in different types
+	// of storage such as stack and static to avoid doing backward move when not
+	// necessary.
 
-	CMP	R3, R4, CR2
-	BC	12, 9, backward	// I think you should be able to write this as "BGT CR2, backward"
+	SUB	R4, R3, R8	// dest - src
+	CMPU	R8, R5, CR2	// < len?
+	BC	12, 8, backward // BLT CR2 backward
 
-	// Copying forward proceeds by copying R6 words then copying R7 bytes.
-	// R3 and R4 are advanced as we copy. Because PPC64 lacks post-increment
-	// load/store, R3 and R4 point before the bytes that are to be copied.
+	// Copying forward if no overlap.
 
 	BC	12, 6, noforwardlarge	// "BEQ CR1, noforwardlarge"
+	MOVD	R6,CTR			// R6 = number of double words
+	SRADCC	$2,R6,R8		// 32 byte chunks?
+	BNE	forward32setup		//
 
-	MOVD	R6, CTR
+	// Move double words
 
-	SUB	$8, R3
-	SUB	$8, R4
+forward8:
+	MOVD    0(R4), R8		// double word
+	ADD     $8,R4
+	MOVD    R8, 0(R3)		//
+	ADD     $8,R3
+	BC      16, 0, forward8
+	BR	noforwardlarge		// handle remainder
 
-forwardlargeloop:
-	MOVDU	8(R4), R8
-	MOVDU	R8, 8(R3)
-	BC	16, 0, forwardlargeloop // "BDNZ"
+	// Prepare for moves of 32 bytes at a time.
 
-	ADD	$8, R3
-	ADD	$8, R4
+forward32setup:
+	DCBTST	(R3)			// prepare data cache
+	DCBT	(R4)
+	MOVD	R8, CTR			// 32 byte chunk count
+
+forward32:
+	MOVD	0(R4), R8		// load 4 double words
+	MOVD	8(R4), R9
+	MOVD	16(R4), R14
+	MOVD	24(R4), R15
+	ADD	$32,R4
+	MOVD	R8, 0(R3)		// store those 4
+	MOVD	R9, 8(R3)
+	MOVD	R14,16(R3)
+	MOVD	R15,24(R3)
+	ADD	$32,R3			// bump up for next set
+	BC	16, 0, forward32	// continue
+	RLDCLCC	$61,R5,$3,R6		// remaining doublewords
+	BEQ	noforwardlarge
+	MOVD	R6,CTR			// set up the CTR
+	BR	forward8
 
 noforwardlarge:
-	BNE	forwardtail	// Tests the bit set by ANDCC above
-	RET
+	CMP	R7,$0			// any remaining bytes
+	BC	4, 1, LR
 
 forwardtail:
-	SUB	$1, R3
-	SUB	$1, R4
-	MOVD	R7, CTR
+	MOVD	R7, CTR			// move tail bytes
 
 forwardtailloop:
-	MOVBZU	1(R4), R8
-	MOVBZU	R8, 1(R3)
+	MOVBZ	0(R4), R8		// move single bytes
+	ADD	$1,R4
+	MOVBZ	R8, 0(R3)
+	ADD	$1,R3
 	BC	16, 0, forwardtailloop
 	RET
 
 backward:
-	// Copying backwards proceeds by copying R7 bytes then copying R6 words.
+	// Copying backwards proceeds by copying R7 bytes then copying R6 double words.
 	// R3 and R4 are advanced to the end of the destination/source buffers
 	// respectively and moved back as we copy.
 
-	ADD	R5, R4, R4
-	ADD	R3, R5, R3
+	ADD	R5, R4, R4		// end of source
+	ADD	R3, R5, R3		// end of dest
 
-	BEQ	nobackwardtail
+	BEQ	nobackwardtail		// earlier condition
 
-	MOVD	R7, CTR
+	MOVD	R7, CTR			// bytes to move
 
 backwardtailloop:
-	MOVBZU	-1(R4), R8
-	MOVBZU	R8, -1(R3)
+	MOVBZ 	-1(R4), R8		// point to last byte
+	SUB	$1,R4
+	MOVBZ 	R8, -1(R3)
+	SUB	$1,R3
 	BC	16, 0, backwardtailloop
 
 nobackwardtail:
-	BC	4, 6, backwardlarge		// "BNE CR1"
-	RET
+	CMP	R6,$0
+	BC	4, 5, LR
 
 backwardlarge:
 	MOVD	R6, CTR
 
 backwardlargeloop:
-	MOVDU	-8(R4), R8
-	MOVDU	R8, -8(R3)
-	BC	16, 0, backwardlargeloop	// "BDNZ"
+	MOVD 	-8(R4), R8
+	SUB	$8,R4
+	MOVD 	R8, -8(R3)
+	SUB	$8,R3
+	BC	16, 0, backwardlargeloop	//
 	RET
diff --git a/src/runtime/memmove_s390x.s b/src/runtime/memmove_s390x.s
new file mode 100644
index 0000000000..238f30891d
--- /dev/null
+++ b/src/runtime/memmove_s390x.s
@@ -0,0 +1,189 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// void runtime·memmove(void*, void*, uintptr)
+TEXT runtime·memmove(SB),NOSPLIT|NOFRAME,$0-24
+	MOVD	to+0(FP), R6
+	MOVD	from+8(FP), R4
+	MOVD	n+16(FP), R5
+
+	CMPBEQ	R6, R4, done
+
+start:
+	CMPBLE	R5, $3, move0to3
+	CMPBLE	R5, $7, move4to7
+	CMPBLE	R5, $11, move8to11
+	CMPBLE	R5, $15, move12to15
+	CMPBNE	R5, $16, movemt16
+	MOVD	0(R4), R7
+	MOVD	8(R4), R8
+	MOVD	R7, 0(R6)
+	MOVD	R8, 8(R6)
+	RET
+
+movemt16:
+	CMPBGT	R4, R6, forwards
+	ADD	R5, R4, R7
+	CMPBLE	R7, R6, forwards
+	ADD	R5, R6, R8
+backwards:
+	MOVD	-8(R7), R3
+	MOVD	R3, -8(R8)
+	MOVD	-16(R7), R3
+	MOVD	R3, -16(R8)
+	ADD	$-16, R5
+	ADD	$-16, R7
+	ADD	$-16, R8
+	CMP	R5, $16
+	BGE	backwards
+	BR	start
+
+forwards:
+	CMPBGT	R5, $64, forwards_fast
+	MOVD	0(R4), R3
+	MOVD	R3, 0(R6)
+	MOVD	8(R4), R3
+	MOVD	R3, 8(R6)
+	ADD	$16, R4
+	ADD	$16, R6
+	ADD	$-16, R5
+	CMP	R5, $16
+	BGE	forwards
+	BR	start
+
+forwards_fast:
+	CMP	R5, $256
+	BLE	forwards_small
+	MVC	$256, 0(R4), 0(R6)
+	ADD	$256, R4
+	ADD	$256, R6
+	ADD	$-256, R5
+	BR	forwards_fast
+
+forwards_small:
+	CMPBEQ	R5, $0, done
+	ADD	$-1, R5
+	EXRL	$runtime·memmove_s390x_exrl_mvc(SB), R5
+	RET
+
+move0to3:
+	CMPBEQ	R5, $0, done
+move1:
+	CMPBNE	R5, $1, move2
+	MOVB	0(R4), R3
+	MOVB	R3, 0(R6)
+	RET
+move2:
+	CMPBNE	R5, $2, move3
+	MOVH	0(R4), R3
+	MOVH	R3, 0(R6)
+	RET
+move3:
+	MOVH	0(R4), R3
+	MOVB	2(R4), R7
+	MOVH	R3, 0(R6)
+	MOVB	R7, 2(R6)
+	RET
+
+move4to7:
+	CMPBNE	R5, $4, move5
+	MOVW	0(R4), R3
+	MOVW	R3, 0(R6)
+	RET
+move5:
+	CMPBNE	R5, $5, move6
+	MOVW	0(R4), R3
+	MOVB	4(R4), R7
+	MOVW	R3, 0(R6)
+	MOVB	R7, 4(R6)
+	RET
+move6:
+	CMPBNE	R5, $6, move7
+	MOVW	0(R4), R3
+	MOVH	4(R4), R7
+	MOVW	R3, 0(R6)
+	MOVH	R7, 4(R6)
+	RET
+move7:
+	MOVW	0(R4), R3
+	MOVH	4(R4), R7
+	MOVB	6(R4), R8
+	MOVW	R3, 0(R6)
+	MOVH	R7, 4(R6)
+	MOVB	R8, 6(R6)
+	RET
+
+move8to11:
+	CMPBNE	R5, $8, move9
+	MOVD	0(R4), R3
+	MOVD	R3, 0(R6)
+	RET
+move9:
+	CMPBNE	R5, $9, move10
+	MOVD	0(R4), R3
+	MOVB	8(R4), R7
+	MOVD	R3, 0(R6)
+	MOVB	R7, 8(R6)
+	RET
+move10:
+	CMPBNE	R5, $10, move11
+	MOVD	0(R4), R3
+	MOVH	8(R4), R7
+	MOVD	R3, 0(R6)
+	MOVH	R7, 8(R6)
+	RET
+move11:
+	MOVD	0(R4), R3
+	MOVH	8(R4), R7
+	MOVB	10(R4), R8
+	MOVD	R3, 0(R6)
+	MOVH	R7, 8(R6)
+	MOVB	R8, 10(R6)
+	RET
+
+move12to15:
+	CMPBNE	R5, $12, move13
+	MOVD	0(R4), R3
+	MOVW	8(R4), R7
+	MOVD	R3, 0(R6)
+	MOVW	R7, 8(R6)
+	RET
+move13:
+	CMPBNE	R5, $13, move14
+	MOVD	0(R4), R3
+	MOVW	8(R4), R7
+	MOVB	12(R4), R8
+	MOVD	R3, 0(R6)
+	MOVW	R7, 8(R6)
+	MOVB	R8, 12(R6)
+	RET
+move14:
+	CMPBNE	R5, $14, move15
+	MOVD	0(R4), R3
+	MOVW	8(R4), R7
+	MOVH	12(R4), R8
+	MOVD	R3, 0(R6)
+	MOVW	R7, 8(R6)
+	MOVH	R8, 12(R6)
+	RET
+move15:
+	MOVD	0(R4), R3
+	MOVW	8(R4), R7
+	MOVH	12(R4), R8
+	MOVB	14(R4), R10
+	MOVD	R3, 0(R6)
+	MOVW	R7, 8(R6)
+	MOVH	R8, 12(R6)
+	MOVB	R10, 14(R6)
+done:
+	RET
+
+// DO NOT CALL - target for exrl (execute relative long) instruction.
+TEXT runtime·memmove_s390x_exrl_mvc(SB),NOSPLIT|NOFRAME,$0-0
+	MVC	$1, 0(R4), 0(R6)
+	MOVD	R0, 0(R0)
+	RET
+
diff --git a/src/runtime/mfinal.go b/src/runtime/mfinal.go
index b862f019b6..e81650d842 100644
--- a/src/runtime/mfinal.go
+++ b/src/runtime/mfinal.go
@@ -172,7 +172,7 @@ func runfinq() {
 					// all not yet finalized objects are stored in finq.
 					// If we do not mark it as FlagNoScan,
 					// the last finalized object is not collected.
-					frame = mallocgc(framesz, nil, flagNoScan)
+					frame = mallocgc(framesz, nil, true)
 					framecap = framesz
 				}
 
@@ -274,7 +274,7 @@ func SetFinalizer(obj interface{}, finalizer interface{}) {
 		throw("runtime.SetFinalizer: first argument is nil")
 	}
 	if etyp.kind&kindMask != kindPtr {
-		throw("runtime.SetFinalizer: first argument is " + etyp._string + ", not pointer")
+		throw("runtime.SetFinalizer: first argument is " + etyp.string() + ", not pointer")
 	}
 	ot := (*ptrtype)(unsafe.Pointer(etyp))
 	if ot.elem == nil {
@@ -328,14 +328,14 @@ func SetFinalizer(obj interface{}, finalizer interface{}) {
 	}
 
 	if ftyp.kind&kindMask != kindFunc {
-		throw("runtime.SetFinalizer: second argument is " + ftyp._string + ", not a function")
+		throw("runtime.SetFinalizer: second argument is " + ftyp.string() + ", not a function")
 	}
 	ft := (*functype)(unsafe.Pointer(ftyp))
 	if ft.dotdotdot() {
-		throw("runtime.SetFinalizer: cannot pass " + etyp._string + " to finalizer " + ftyp._string + " because dotdotdot")
+		throw("runtime.SetFinalizer: cannot pass " + etyp.string() + " to finalizer " + ftyp.string() + " because dotdotdot")
 	}
 	if ft.dotdotdot() || ft.inCount != 1 {
-		throw("runtime.SetFinalizer: cannot pass " + etyp._string + " to finalizer " + ftyp._string)
+		throw("runtime.SetFinalizer: cannot pass " + etyp.string() + " to finalizer " + ftyp.string())
 	}
 	fint := ft.in()[0]
 	switch {
@@ -358,7 +358,7 @@ func SetFinalizer(obj interface{}, finalizer interface{}) {
 			goto okarg
 		}
 	}
-	throw("runtime.SetFinalizer: cannot pass " + etyp._string + " to finalizer " + ftyp._string)
+	throw("runtime.SetFinalizer: cannot pass " + etyp.string() + " to finalizer " + ftyp.string())
 okarg:
 	// compute size needed for return parameters
 	nret := uintptr(0)
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index 1c184db10b..ae8338ac10 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -24,6 +24,10 @@
 // Hudson, R., and Moss, J.E.B. Copying Garbage Collection without stopping the world.
 // Concurrency and Computation: Practice and Experience 15(3-5), 2003.
 //
+// TODO(austin): The rest of this comment is woefully out of date and
+// needs to be rewritten. There is no distinct scan phase any more and
+// we allocate black during GC.
+//
 //  0. Set phase = GCscan from GCoff.
 //  1. Wait for all P's to acknowledge phase change.
 //         At this point all goroutines have passed through a GC safepoint and
@@ -244,7 +248,7 @@ var gcBlackenPromptly bool
 
 const (
 	_GCoff             = iota // GC not running; sweeping in background, write barrier disabled
-	_GCmark                   // GC marking roots and workbufs, write barrier ENABLED
+	_GCmark                   // GC marking roots and workbufs: allocate black, write barrier ENABLED
 	_GCmarktermination        // GC mark termination: allocate black, P's help GC, write barrier ENABLED
 )
 
@@ -304,7 +308,8 @@ type gcControllerState struct {
 	// scanWork is the total scan work performed this cycle. This
 	// is updated atomically during the cycle. Updates occur in
 	// bounded batches, since it is both written and read
-	// throughout the cycle.
+	// throughout the cycle. At the end of the cycle, this is how
+	// much of the retained heap is scannable.
 	//
 	// Currently this is the bytes of heap scanned. For most uses,
 	// this is an opaque unit of work, but for estimation the
@@ -466,14 +471,18 @@ func (c *gcControllerState) startCycle() {
 // It should only be called when gcBlackenEnabled != 0 (because this
 // is when assists are enabled and the necessary statistics are
 // available).
+//
+// TODO: Consider removing the periodic controller update altogether.
+// Since we switched to allocating black, in theory we shouldn't have
+// to change the assist ratio. However, this is still a useful hook
+// that we've found many uses for when experimenting.
 func (c *gcControllerState) revise() {
 	// Compute the expected scan work remaining.
 	//
-	// Note that the scannable heap size is likely to increase
-	// during the GC cycle. This is why it's important to revise
-	// the assist ratio throughout the cycle: if the scannable
-	// heap size increases, the assist ratio based on the initial
-	// scannable heap size may target too little scan work.
+	// Note that we currently count allocations during GC as both
+	// scannable heap (heap_scan) and scan work completed
+	// (scanWork), so this difference won't be changed by
+	// allocations during GC.
 	//
 	// This particular estimate is a strict upper bound on the
 	// possible remaining scan work for the current heap.
@@ -753,7 +762,7 @@ var work struct {
 	alldone note
 
 	// Number of roots of various root types. Set by gcMarkRootPrepare.
-	nDataRoots, nBSSRoots, nSpanRoots, nStackRoots int
+	nDataRoots, nBSSRoots, nSpanRoots, nStackRoots, nRescanRoots int
 
 	// markrootDone indicates that roots have been marked at least
 	// once during the current GC cycle. This is checked by root
@@ -821,6 +830,14 @@ var work struct {
 		head, tail guintptr
 	}
 
+	// rescan is a list of G's that need to be rescanned during
+	// mark termination. A G adds itself to this list when it
+	// first invalidates its stack scan.
+	rescan struct {
+		lock mutex
+		list []guintptr
+	}
+
 	// Timing/utilization stats for this cycle.
 	stwprocs, maxprocs                 int32
 	tSweepTerm, tMark, tMarkTerm, tEnd int64 // nanotime() of phase start
@@ -1069,13 +1086,6 @@ top:
 		// cached workbufs.
 		atomic.Xadd(&work.nwait, -1)
 
-		// Rescan global data and BSS. There may still work
-		// workers running at this point, so bump "jobs" down
-		// before "next" so they won't try running root jobs
-		// until we set next.
-		atomic.Store(&work.markrootJobs, uint32(fixedRootCount+work.nDataRoots+work.nBSSRoots))
-		atomic.Store(&work.markrootNext, fixedRootCount)
-
 		// GC is set up for mark 2. Let Gs blocked on the
 		// transition lock go while we flush caches.
 		semrelease(&work.markDoneSema)
@@ -1257,6 +1267,13 @@ func gcMarkTermination() {
 	// Free stack spans. This must be done between GC cycles.
 	systemstack(freeStackSpans)
 
+	// Best-effort remove stack barriers so they don't get in the
+	// way of things like GDB and perf.
+	lock(&allglock)
+	myallgs := allgs
+	unlock(&allglock)
+	gcTryRemoveAllStackBarriers(myallgs)
+
 	// Print gctrace before dropping worldsema. As soon as we drop
 	// worldsema another cycle could start and smash the stats
 	// we're trying to print.
@@ -1578,9 +1595,13 @@ func gcMark(start_time int64) {
 	work.markrootDone = true
 
 	for i := 0; i < int(gomaxprocs); i++ {
-		if !allp[i].gcw.empty() {
+		gcw := &allp[i].gcw
+		if !gcw.empty() {
 			throw("P has cached GC work at end of mark termination")
 		}
+		if gcw.scanWork != 0 || gcw.bytesMarked != 0 {
+			throw("P has unflushed stats at end of mark termination")
+		}
 	}
 
 	if trace.enabled {
@@ -1589,27 +1610,8 @@ func gcMark(start_time int64) {
 
 	cachestats()
 
-	// Compute the reachable heap size at the beginning of the
-	// cycle. This is approximately the marked heap size at the
-	// end (which we know) minus the amount of marked heap that
-	// was allocated after marking began (which we don't know, but
-	// is approximately the amount of heap that was allocated
-	// since marking began).
-	allocatedDuringCycle := memstats.heap_live - work.initialHeapLive
-	if memstats.heap_live < work.initialHeapLive {
-		// This can happen if mCentral_UncacheSpan tightens
-		// the heap_live approximation.
-		allocatedDuringCycle = 0
-	}
-	if work.bytesMarked >= allocatedDuringCycle {
-		memstats.heap_reachable = work.bytesMarked - allocatedDuringCycle
-	} else {
-		// This can happen if most of the allocation during
-		// the cycle never became reachable from the heap.
-		// Just set the reachable heap approximation to 0 and
-		// let the heapminimum kick in below.
-		memstats.heap_reachable = 0
-	}
+	// Update the reachable heap stat.
+	memstats.heap_reachable = work.bytesMarked
 
 	// Trigger the next GC cycle when the allocated heap has grown
 	// by triggerRatio over the reachable heap size. Assume that
@@ -1735,14 +1737,22 @@ func gcCopySpans() {
 func gcResetMarkState() {
 	// This may be called during a concurrent phase, so make sure
 	// allgs doesn't change.
+	if !(gcphase == _GCoff || gcphase == _GCmarktermination) {
+		// Accessing gcRescan is unsafe.
+		throw("bad GC phase")
+	}
 	lock(&allglock)
 	for _, gp := range allgs {
 		gp.gcscandone = false  // set to true in gcphasework
 		gp.gcscanvalid = false // stack has not been scanned
+		gp.gcRescan = -1
 		gp.gcAssistBytes = 0
 	}
 	unlock(&allglock)
 
+	// Clear rescan list.
+	work.rescan.list = work.rescan.list[:0]
+
 	work.bytesMarked = 0
 	work.initialHeapLive = memstats.heap_live
 	work.markrootDone = false
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index d05ad6549f..3704164527 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -15,6 +15,7 @@ import (
 const (
 	fixedRootFinalizers = iota
 	fixedRootFlushCaches
+	fixedRootFreeGStacks
 	fixedRootCount
 
 	// rootBlockBytes is the number of bytes to scan per data or
@@ -31,6 +32,8 @@ const (
 //
 // The caller must have call gcCopySpans().
 //
+// The world must be stopped.
+//
 //go:nowritebarrier
 func gcMarkRootPrepare() {
 	// Compute how many data and BSS root blocks there are.
@@ -39,36 +42,58 @@ func gcMarkRootPrepare() {
 	}
 
 	work.nDataRoots = 0
-	for datap := &firstmoduledata; datap != nil; datap = datap.next {
-		nDataRoots := nBlocks(datap.edata - datap.data)
-		if nDataRoots > work.nDataRoots {
-			work.nDataRoots = nDataRoots
+	work.nBSSRoots = 0
+
+	// Only scan globals once per cycle; preferably concurrently.
+	if !work.markrootDone {
+		for datap := &firstmoduledata; datap != nil; datap = datap.next {
+			nDataRoots := nBlocks(datap.edata - datap.data)
+			if nDataRoots > work.nDataRoots {
+				work.nDataRoots = nDataRoots
+			}
 		}
-	}
 
-	work.nBSSRoots = 0
-	for datap := &firstmoduledata; datap != nil; datap = datap.next {
-		nBSSRoots := nBlocks(datap.ebss - datap.bss)
-		if nBSSRoots > work.nBSSRoots {
-			work.nBSSRoots = nBSSRoots
+		for datap := &firstmoduledata; datap != nil; datap = datap.next {
+			nBSSRoots := nBlocks(datap.ebss - datap.bss)
+			if nBSSRoots > work.nBSSRoots {
+				work.nBSSRoots = nBSSRoots
+			}
 		}
 	}
 
-	// Compute number of span roots.
-	work.nSpanRoots = (len(work.spans) + rootBlockSpans - 1) / rootBlockSpans
+	if !work.markrootDone {
+		// On the first markroot, we need to scan span roots.
+		// In concurrent GC, this happens during concurrent
+		// mark and we depend on addfinalizer to ensure the
+		// above invariants for objects that get finalizers
+		// after concurrent mark. In STW GC, this will happen
+		// during mark termination.
+		work.nSpanRoots = (len(work.spans) + rootBlockSpans - 1) / rootBlockSpans
+
+		// On the first markroot, we need to scan all Gs. Gs
+		// may be created after this point, but it's okay that
+		// we ignore them because they begin life without any
+		// roots, so there's nothing to scan, and any roots
+		// they create during the concurrent phase will be
+		// scanned during mark termination. During mark
+		// termination, allglen isn't changing, so we'll scan
+		// all Gs.
+		work.nStackRoots = int(atomic.Loaduintptr(&allglen))
+		work.nRescanRoots = 0
+	} else {
+		// We've already scanned span roots and kept the scan
+		// up-to-date during concurrent mark.
+		work.nSpanRoots = 0
 
-	// Snapshot of allglen. During concurrent scan, we just need
-	// to be consistent about how many markroot jobs we create and
-	// how many Gs we check. Gs may be created after this point,
-	// but it's okay that we ignore them because they begin life
-	// without any roots, so there's nothing to scan, and any
-	// roots they create during the concurrent phase will be
-	// scanned during mark termination. During mark termination,
-	// allglen isn't changing, so we'll scan all Gs.
-	work.nStackRoots = int(atomic.Loaduintptr(&allglen))
+		// On the second pass of markroot, we're just scanning
+		// dirty stacks. It's safe to access rescan since the
+		// world is stopped.
+		work.nStackRoots = 0
+		work.nRescanRoots = len(work.rescan.list)
+	}
 
 	work.markrootNext = 0
-	work.markrootJobs = uint32(fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots)
+	work.markrootJobs = uint32(fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots + work.nRescanRoots)
 }
 
 // gcMarkRootCheck checks that all roots have been scanned. It is
@@ -80,11 +105,24 @@ func gcMarkRootCheck() {
 	}
 
 	lock(&allglock)
-	// Check that gc work is done.
-	for i := 0; i < work.nStackRoots; i++ {
-		gp := allgs[i]
-		if !gp.gcscandone {
-			throw("scan missed a g")
+	// Check that stacks have been scanned.
+	if gcphase == _GCmarktermination {
+		for i := 0; i < len(allgs); i++ {
+			gp := allgs[i]
+			if !(gp.gcscandone && gp.gcscanvalid) && readgstatus(gp) != _Gdead {
+				println("gp", gp, "goid", gp.goid,
+					"status", readgstatus(gp),
+					"gcscandone", gp.gcscandone,
+					"gcscanvalid", gp.gcscanvalid)
+				throw("scan missed a g")
+			}
+		}
+	} else {
+		for i := 0; i < work.nStackRoots; i++ {
+			gp := allgs[i]
+			if !gp.gcscandone {
+				throw("scan missed a g")
+			}
 		}
 	}
 	unlock(&allglock)
@@ -97,12 +135,18 @@ var oneptrmask = [...]uint8{1}
 //
 // Preemption must be disabled (because this uses a gcWork).
 //
+// nowritebarrier is only advisory here.
+//
 //go:nowritebarrier
 func markroot(gcw *gcWork, i uint32) {
+	// TODO(austin): This is a bit ridiculous. Compute and store
+	// the bases in gcMarkRootPrepare instead of the counts.
 	baseData := uint32(fixedRootCount)
 	baseBSS := baseData + uint32(work.nDataRoots)
 	baseSpans := baseBSS + uint32(work.nBSSRoots)
 	baseStacks := baseSpans + uint32(work.nSpanRoots)
+	baseRescan := baseStacks + uint32(work.nStackRoots)
+	end := baseRescan + uint32(work.nRescanRoots)
 
 	// Note: if you add a case here, please also update heapdump.go:dumproots.
 	switch {
@@ -126,16 +170,27 @@ func markroot(gcw *gcWork, i uint32) {
 			flushallmcaches()
 		}
 
+	case i == fixedRootFreeGStacks:
+		// Only do this once per GC cycle; preferably
+		// concurrently.
+		if !work.markrootDone {
+			markrootFreeGStacks()
+		}
+
 	case baseSpans <= i && i < baseStacks:
 		// mark MSpan.specials
 		markrootSpans(gcw, int(i-baseSpans))
 
 	default:
 		// the rest is scanning goroutine stacks
-		if uintptr(i-baseStacks) >= allglen {
+		var gp *g
+		if baseStacks <= i && i < baseRescan {
+			gp = allgs[i-baseStacks]
+		} else if baseRescan <= i && i < end {
+			gp = work.rescan.list[i-baseRescan].ptr()
+		} else {
 			throw("markroot: bad index")
 		}
-		gp := allgs[i-baseStacks]
 
 		// remember when we've first observed the G blocked
 		// needed only to output in traceback
@@ -144,20 +199,14 @@ func markroot(gcw *gcWork, i uint32) {
 			gp.waitsince = work.tstart
 		}
 
-		if gcphase == _GCmarktermination && status == _Gdead {
-			// Free gp's stack if necessary. Only do this
-			// during mark termination because otherwise
-			// _Gdead may be transient.
-			shrinkstack(gp)
-		}
-
-		if gcphase != _GCmarktermination && gp.startpc == gcBgMarkWorkerPC {
+		if gcphase != _GCmarktermination && gp.startpc == gcBgMarkWorkerPC && readgstatus(gp) != _Gdead {
 			// GC background workers may be
 			// non-preemptible, so we may deadlock if we
 			// try to scan them during a concurrent phase.
 			// They also have tiny stacks, so just ignore
 			// them until mark termination.
 			gp.gcscandone = true
+			queueRescan(gp)
 			break
 		}
 
@@ -215,6 +264,36 @@ func markrootBlock(b0, n0 uintptr, ptrmask0 *uint8, gcw *gcWork, shard int) {
 	scanblock(b, n, ptrmask, gcw)
 }
 
+// markrootFreeGStacks frees stacks of dead Gs: it takes every G on
+// sched.gfreeStack, calls shrinkstack on it, and moves it to sched.gfreeNoStack.
+//
+// This does not free stacks of dead Gs cached on Ps, but having a few
+// cached stacks around isn't a problem.
+//
+//TODO go:nowritebarrier
+func markrootFreeGStacks() {
+	// Detach the whole list of dead Gs with stacks while holding gflock.
+	lock(&sched.gflock)
+	list := sched.gfreeStack
+	sched.gfreeStack = nil
+	unlock(&sched.gflock)
+	if list == nil {
+		return
+	}
+
+	// Free stacks (gflock is not held here), remembering the last G of the list.
+	tail := list
+	for gp := list; gp != nil; gp = gp.schedlink.ptr() {
+		shrinkstack(gp)
+		tail = gp
+	}
+
+	// Put Gs back on the free list by prepending the detached list to gfreeNoStack.
+	lock(&sched.gflock)
+	tail.schedlink.set(sched.gfreeNoStack)
+	sched.gfreeNoStack = list
+	unlock(&sched.gflock)
+}
+
 // markrootSpans marks roots for one shard of work.spans.
 //
 //go:nowritebarrier
@@ -232,14 +311,8 @@ func markrootSpans(gcw *gcWork, shard int) {
 	// TODO(austin): There are several ideas for making this more
 	// efficient in issue #11485.
 
-	// We process objects with finalizers only during the first
-	// markroot pass. In concurrent GC, this happens during
-	// concurrent mark and we depend on addfinalizer to ensure the
-	// above invariants for objects that get finalizers after
-	// concurrent mark. In STW GC, this will happen during mark
-	// termination.
 	if work.markrootDone {
-		return
+		throw("markrootSpans during second markroot")
 	}
 
 	sg := mheap_.sweepgen
@@ -566,9 +639,6 @@ func gcFlushBgCredit(scanWork int64) {
 //go:nowritebarrier
 func scanstack(gp *g) {
 	if gp.gcscanvalid {
-		if gcphase == _GCmarktermination {
-			gcRemoveStackBarriers(gp)
-		}
 		return
 	}
 
@@ -611,6 +681,7 @@ func scanstack(gp *g) {
 	} else {
 		sp = gp.sched.sp
 	}
+	gcLockStackBarriers(gp) // Not necessary during mark term, but harmless.
 	switch gcphase {
 	case _GCmark:
 		// Install stack barriers during stack scan.
@@ -621,16 +692,18 @@ func scanstack(gp *g) {
 			nextBarrier = ^uintptr(0)
 		}
 
-		if gp.stkbarPos != 0 || len(gp.stkbar) != 0 {
-			// If this happens, it's probably because we
-			// scanned a stack twice in the same phase.
-			print("stkbarPos=", gp.stkbarPos, " len(stkbar)=", len(gp.stkbar), " goid=", gp.goid, " gcphase=", gcphase, "\n")
-			throw("g already has stack barriers")
-		}
-
-		gcLockStackBarriers(gp)
+		// Remove any existing stack barriers before we
+		// install new ones.
+		gcRemoveStackBarriers(gp)
 
 	case _GCmarktermination:
+		if !work.markrootDone {
+			// This is a STW GC. There may be stale stack
+			// barriers from an earlier cycle since we
+			// never passed through mark phase.
+			gcRemoveStackBarriers(gp)
+		}
+
 		if int(gp.stkbarPos) == len(gp.stkbar) {
 			// gp hit all of the stack barriers (or there
 			// were none). Re-scan the whole stack.
@@ -647,8 +720,6 @@ func scanstack(gp *g) {
 			}
 		}
 
-		gcRemoveStackBarriers(gp)
-
 	default:
 		throw("scanstack in wrong phase")
 	}
@@ -686,8 +757,14 @@ func scanstack(gp *g) {
 	if gcphase == _GCmarktermination {
 		gcw.dispose()
 	}
+	gcUnlockStackBarriers(gp)
 	if gcphase == _GCmark {
-		gcUnlockStackBarriers(gp)
+		// gp may have added itself to the rescan list between
+		// when GC started and now. It's clean now, so remove
+		// it. This isn't safe during mark termination because
+		// mark termination is consuming this list, but it's
+		// also not necessary.
+		dequeueRescan(gp)
 	}
 	gp.gcscanvalid = true
 }
@@ -719,8 +796,8 @@ func scanframeworker(frame *stkframe, cache *pcvalueCache, gcw *gcWork) {
 	// Scan local variables if stack frame has been allocated.
 	size := frame.varp - frame.sp
 	var minsize uintptr
-	switch sys.TheChar {
-	case '7':
+	switch sys.ArchFamily {
+	case sys.ARM64:
 		minsize = sys.SpAlign
 	default:
 		minsize = sys.MinFrameSize
@@ -765,6 +842,60 @@ func scanframeworker(frame *stkframe, cache *pcvalueCache, gcw *gcWork) {
 	}
 }
 
+// queueRescan clears gp.gcscanvalid and, unless gcphase is _GCoff,
+// appends gp to work.rescan.list and records its index in gp.gcRescan.
+// The caller must own gp and ensure that gp isn't already on the rescan list.
+func queueRescan(gp *g) {
+	if gcphase == _GCoff { // GC is off: only invalidate the scan, do not queue gp.
+		gp.gcscanvalid = false
+		return
+	}
+	if gp.gcRescan != -1 { // -1 means "not on the rescan list".
+		throw("g already on rescan list")
+	}
+
+	lock(&work.rescan.lock)
+	gp.gcscanvalid = false
+
+	// Recheck gcphase under the lock in case there was a phase change.
+	if gcphase == _GCoff {
+		unlock(&work.rescan.lock)
+		return
+	}
+	if len(work.rescan.list) == cap(work.rescan.list) { // The list is only ever grown within its existing capacity.
+		throw("rescan list overflow")
+	}
+	n := len(work.rescan.list)
+	gp.gcRescan = int32(n) // Remember gp's slot in the list.
+	work.rescan.list = work.rescan.list[:n+1]
+	work.rescan.list[n].set(gp)
+	unlock(&work.rescan.lock)
+}
+
+// dequeueRescan removes gp from the stack rescan list, if gp is on
+// the rescan list (gp.gcRescan != -1), and resets gp.gcRescan to -1. The caller must own gp.
+func dequeueRescan(gp *g) {
+	if gp.gcRescan == -1 { // Not on the list; nothing to do.
+		return
+	}
+	if gcphase == _GCoff { // GC is off: just forget the index without touching the list.
+		gp.gcRescan = -1
+		return
+	}
+
+	lock(&work.rescan.lock)
+	if work.rescan.list[gp.gcRescan].ptr() != gp { // gp's recorded slot must actually hold gp.
+		throw("bad dequeueRescan")
+	}
+	// Fill gp's slot with the last entry, then shrink the list by one. Careful: gp may itself be the last G on the list, so gp.gcRescan is reset only after last's index is updated.
+	last := work.rescan.list[len(work.rescan.list)-1]
+	work.rescan.list[gp.gcRescan] = last
+	last.ptr().gcRescan = gp.gcRescan
+	gp.gcRescan = -1
+	work.rescan.list = work.rescan.list[:len(work.rescan.list)-1]
+	unlock(&work.rescan.lock)
+}
+
 type gcDrainFlags int
 
 const (
@@ -1140,14 +1271,21 @@ func gcDumpObject(label string, obj, off uintptr) {
 	}
 }
 
-// If gcBlackenPromptly is true we are in the second mark phase phase so we allocate black.
+// gcmarknewobject marks a newly allocated object black and credits size to
+// bytesMarked and scanSize to scanWork on the current P's gcWork. obj must not contain any non-nil pointers.
+//
+// This is nosplit so it can manipulate a gcWork without preemption.
+//
 //go:nowritebarrier
-func gcmarknewobject_m(obj, size uintptr) {
+//go:nosplit
+func gcmarknewobject(obj, size, scanSize uintptr) {
 	if useCheckmark && !gcBlackenPromptly { // The world should be stopped so this should not happen.
 		throw("gcmarknewobject called while doing checkmark")
 	}
 	markBitsForAddr(obj).setMarked()
-	atomic.Xadd64(&work.bytesMarked, int64(size))
+	gcw := &getg().m.p.ptr().gcw // gcWork of the P this goroutine's M is running on.
+	gcw.bytesMarked += uint64(size)
+	gcw.scanWork += int64(scanSize)
 }
 
 // Checkmarking
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 9f07dfbb99..1333dd696b 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -870,15 +870,6 @@ func (h *mheap) busyList(npages uintptr) *mSpanList {
 }
 
 func scavengelist(list *mSpanList, now, limit uint64) uintptr {
-	if sys.PhysPageSize > _PageSize {
-		// golang.org/issue/9993
-		// If the physical page size of the machine is larger than
-		// our logical heap page size the kernel may round up the
-		// amount to be freed to its page size and corrupt the heap
-		// pages surrounding the unused block.
-		return 0
-	}
-
 	if list.isEmpty() {
 		return 0
 	}
@@ -886,11 +877,30 @@ func scavengelist(list *mSpanList, now, limit uint64) uintptr {
 	var sumreleased uintptr
 	for s := list.first; s != nil; s = s.next {
 		if (now-uint64(s.unusedsince)) > limit && s.npreleased != s.npages {
-			released := (s.npages - s.npreleased) << _PageShift
+			start := uintptr(s.start) << _PageShift
+			end := start + s.npages<<_PageShift
+			if sys.PhysPageSize > _PageSize {
+				// We can only release pages in
+				// PhysPageSize blocks, so round start
+				// and end in. (Otherwise, madvise
+				// will round them *out* and release
+				// more memory than we want.)
+				start = (start + sys.PhysPageSize - 1) &^ (sys.PhysPageSize - 1)
+				end &^= sys.PhysPageSize - 1
+				if start == end {
+					continue
+				}
+			}
+			len := end - start
+
+			released := len - (s.npreleased << _PageShift)
+			if sys.PhysPageSize > _PageSize && released == 0 {
+				continue
+			}
 			memstats.heap_released += uint64(released)
 			sumreleased += released
-			s.npreleased = s.npages
-			sysUnused(unsafe.Pointer(s.start<<_PageShift), s.npages<<_PageShift)
+			s.npreleased = len >> _PageShift
+			sysUnused(unsafe.Pointer(start), len)
 		}
 	}
 	return sumreleased
diff --git a/src/runtime/mmap.go b/src/runtime/mmap.go
index 6363a90242..53617e41e4 100644
--- a/src/runtime/mmap.go
+++ b/src/runtime/mmap.go
@@ -13,4 +13,7 @@ package runtime
 import "unsafe"
 
 // mmap calls the mmap system call. It is implemented in assembly.
+// We only pass the lower 32 bits of the file offset to the
+// assembly routine; the higher bits (if required) should be provided
+// by the assembly routine as 0.
 func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) unsafe.Pointer
diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go
index f3b9b4bc78..c3e4e2cb87 100644
--- a/src/runtime/mprof.go
+++ b/src/runtime/mprof.go
@@ -624,7 +624,7 @@ func tracealloc(p unsafe.Pointer, size uintptr, typ *_type) {
 	if typ == nil {
 		print("tracealloc(", p, ", ", hex(size), ")\n")
 	} else {
-		print("tracealloc(", p, ", ", hex(size), ", ", typ._string, ")\n")
+		print("tracealloc(", p, ", ", hex(size), ", ", typ.string(), ")\n")
 	}
 	if gp.m.curg == nil || gp == gp.m.curg {
 		goroutineheader(gp)
diff --git a/src/runtime/mstkbar.go b/src/runtime/mstkbar.go
index 016625ae92..1bf9d573b7 100644
--- a/src/runtime/mstkbar.go
+++ b/src/runtime/mstkbar.go
@@ -214,14 +214,15 @@ func gcInstallStackBarrier(gp *g, frame *stkframe) bool {
 }
 
 // gcRemoveStackBarriers removes all stack barriers installed in gp's stack.
+//
+// gp's stack barriers must be locked.
+//
 //go:nowritebarrier
 func gcRemoveStackBarriers(gp *g) {
 	if debugStackBarrier && gp.stkbarPos != 0 {
 		print("hit ", gp.stkbarPos, " stack barriers, goid=", gp.goid, "\n")
 	}
 
-	gcLockStackBarriers(gp)
-
 	// Remove stack barriers that we didn't hit.
 	for _, stkbar := range gp.stkbar[gp.stkbarPos:] {
 		gcRemoveStackBarrier(gp, stkbar)
@@ -231,8 +232,6 @@ func gcRemoveStackBarriers(gp *g) {
 	// adjust them.
 	gp.stkbarPos = 0
 	gp.stkbar = gp.stkbar[:0]
-
-	gcUnlockStackBarriers(gp)
 }
 
 // gcRemoveStackBarrier removes a single stack barrier. It is the
@@ -258,6 +257,31 @@ func gcRemoveStackBarrier(gp *g, stkbar stkbar) {
 	*lrPtr = sys.Uintreg(stkbar.savedLRVal)
 }
 
+// gcTryRemoveAllStackBarriers tries to remove stack barriers from all
+// Gs in gps. It is best-effort and efficient. If it can't remove barriers from a G immediately
+// (its status is not _Grunnable, _Gsyscall, or _Gwaiting), it will simply skip it.
+func gcTryRemoveAllStackBarriers(gps []*g) {
+	for _, gp := range gps {
+	retry:
+		for {
+			switch s := readgstatus(gp); s {
+			default: // Any other status: skip this G.
+				break retry
+
+			case _Grunnable, _Gsyscall, _Gwaiting:
+				if !castogscanstatus(gp, s, s|_Gscan) { // Could not move s to s|_Gscan; re-read the status and try again.
+					continue
+				}
+				gcLockStackBarriers(gp)
+				gcRemoveStackBarriers(gp)
+				gcUnlockStackBarriers(gp)
+				restartg(gp) // NOTE(review): presumably clears the _Gscan bit set above; confirm against restartg.
+				break retry
+			}
+		}
+	}
+}
+
 // gcPrintStkbars prints the stack barriers of gp for debugging. It
 // places a "@@@" marker at gp.stkbarPos. If marker >= 0, it will also
 // place a "==>" marker before the marker'th entry.
diff --git a/src/runtime/noasm.go b/src/runtime/noasm.go
index 351e325f4f..0a8f9e6f52 100644
--- a/src/runtime/noasm.go
+++ b/src/runtime/noasm.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// Routines that are implemented in assembly in asm_{amd64,386,arm,arm64,ppc64x}.s
+// Routines that are implemented in assembly in asm_{amd64,386,arm,arm64,ppc64x,s390x}.s
 
 // +build mips64 mips64le
 
diff --git a/src/runtime/os1_darwin.go b/src/runtime/os1_darwin.go
deleted file mode 100644
index 01dc90f97c..0000000000
--- a/src/runtime/os1_darwin.go
+++ /dev/null
@@ -1,538 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-//extern SigTabTT runtime·sigtab[];
-
-type sigset uint32
-
-var sigset_all = ^sigset(0)
-
-func unimplemented(name string) {
-	println(name, "not implemented")
-	*(*int)(unsafe.Pointer(uintptr(1231))) = 1231
-}
-
-//go:nosplit
-func semawakeup(mp *m) {
-	mach_semrelease(mp.waitsema)
-}
-
-//go:nosplit
-func semacreate(mp *m) {
-	if mp.waitsema != 0 {
-		return
-	}
-	systemstack(func() {
-		mp.waitsema = mach_semcreate()
-	})
-}
-
-// BSD interface for threading.
-func osinit() {
-	// bsdthread_register delayed until end of goenvs so that we
-	// can look at the environment first.
-
-	ncpu = getncpu()
-}
-
-func getncpu() int32 {
-	// Use sysctl to fetch hw.ncpu.
-	mib := [2]uint32{6, 3}
-	out := uint32(0)
-	nout := unsafe.Sizeof(out)
-	ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0)
-	if ret >= 0 && int32(out) > 0 {
-		return int32(out)
-	}
-	return 1
-}
-
-var urandom_dev = []byte("/dev/urandom\x00")
-
-//go:nosplit
-func getRandomData(r []byte) {
-	fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
-	n := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
-	closefd(fd)
-	extendRandom(r, int(n))
-}
-
-func goenvs() {
-	goenvs_unix()
-
-	// Register our thread-creation callback (see sys_darwin_{amd64,386}.s)
-	// but only if we're not using cgo. If we are using cgo we need
-	// to let the C pthread library install its own thread-creation callback.
-	if !iscgo {
-		if bsdthread_register() != 0 {
-			if gogetenv("DYLD_INSERT_LIBRARIES") != "" {
-				throw("runtime: bsdthread_register error (unset DYLD_INSERT_LIBRARIES)")
-			}
-			throw("runtime: bsdthread_register error")
-		}
-	}
-}
-
-// May run with m.p==nil, so write barriers are not allowed.
-//go:nowritebarrier
-func newosproc(mp *m, stk unsafe.Pointer) {
-	if false {
-		print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " id=", mp.id, " ostk=", &mp, "\n")
-	}
-
-	var oset sigset
-	sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
-	errno := bsdthread_create(stk, unsafe.Pointer(mp), funcPC(mstart))
-	sigprocmask(_SIG_SETMASK, &oset, nil)
-
-	if errno < 0 {
-		print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", -errno, ")\n")
-		throw("runtime.newosproc")
-	}
-}
-
-// newosproc0 is a version of newosproc that can be called before the runtime
-// is initialized.
-//
-// As Go uses bsdthread_register when running without cgo, this function is
-// not safe to use after initialization as it does not pass an M as fnarg.
-//
-//go:nosplit
-func newosproc0(stacksize uintptr, fn unsafe.Pointer, fnarg uintptr) {
-	stack := sysAlloc(stacksize, &memstats.stacks_sys)
-	if stack == nil {
-		write(2, unsafe.Pointer(&failallocatestack[0]), int32(len(failallocatestack)))
-		exit(1)
-	}
-	stk := unsafe.Pointer(uintptr(stack) + stacksize)
-
-	var oset sigset
-	sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
-	errno := bsdthread_create(stk, fn, fnarg)
-	sigprocmask(_SIG_SETMASK, &oset, nil)
-
-	if errno < 0 {
-		write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
-		exit(1)
-	}
-}
-
-var failallocatestack = []byte("runtime: failed to allocate stack for the new OS thread\n")
-var failthreadcreate = []byte("runtime: failed to create new OS thread\n")
-
-// Called to do synchronous initialization of Go code built with
-// -buildmode=c-archive or -buildmode=c-shared.
-// None of the Go runtime is initialized.
-//go:nosplit
-//go:nowritebarrierrec
-func libpreinit() {
-	initsig(true)
-}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
-func mpreinit(mp *m) {
-	mp.gsignal = malg(32 * 1024) // OS X wants >= 8K
-	mp.gsignal.m = mp
-}
-
-//go:nosplit
-func msigsave(mp *m) {
-	sigprocmask(_SIG_SETMASK, nil, &mp.sigmask)
-}
-
-//go:nosplit
-func msigrestore(sigmask sigset) {
-	sigprocmask(_SIG_SETMASK, &sigmask, nil)
-}
-
-//go:nosplit
-func sigblock() {
-	sigprocmask(_SIG_SETMASK, &sigset_all, nil)
-}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the new thread, cannot allocate memory.
-func minit() {
-	// Initialize signal handling.
-	_g_ := getg()
-
-	// The alternate signal stack is buggy on arm and arm64.
-	// The signal handler handles it directly.
-	// The sigaltstack assembly function does nothing.
-	if GOARCH != "arm" && GOARCH != "arm64" {
-		var st stackt
-		sigaltstack(nil, &st)
-		if st.ss_flags&_SS_DISABLE != 0 {
-			signalstack(&_g_.m.gsignal.stack)
-			_g_.m.newSigstack = true
-		} else {
-			// Use existing signal stack.
-			stsp := uintptr(unsafe.Pointer(st.ss_sp))
-			_g_.m.gsignal.stack.lo = stsp
-			_g_.m.gsignal.stack.hi = stsp + st.ss_size
-			_g_.m.gsignal.stackguard0 = stsp + _StackGuard
-			_g_.m.gsignal.stackguard1 = stsp + _StackGuard
-			_g_.m.gsignal.stackAlloc = st.ss_size
-			_g_.m.newSigstack = false
-		}
-	}
-
-	// restore signal mask from m.sigmask and unblock essential signals
-	nmask := _g_.m.sigmask
-	for i := range sigtable {
-		if sigtable[i].flags&_SigUnblock != 0 {
-			nmask &^= 1 << (uint32(i) - 1)
-		}
-	}
-	sigprocmask(_SIG_SETMASK, &nmask, nil)
-}
-
-// Called from dropm to undo the effect of an minit.
-//go:nosplit
-func unminit() {
-	if getg().m.newSigstack {
-		signalstack(nil)
-	}
-}
-
-// Mach IPC, to get at semaphores
-// Definitions are in /usr/include/mach on a Mac.
-
-func macherror(r int32, fn string) {
-	print("mach error ", fn, ": ", r, "\n")
-	throw("mach error")
-}
-
-const _DebugMach = false
-
-var zerondr machndr
-
-func mach_msgh_bits(a, b uint32) uint32 {
-	return a | b<<8
-}
-
-func mach_msg(h *machheader, op int32, send_size, rcv_size, rcv_name, timeout, notify uint32) int32 {
-	// TODO: Loop on interrupt.
-	return mach_msg_trap(unsafe.Pointer(h), op, send_size, rcv_size, rcv_name, timeout, notify)
-}
-
-// Mach RPC (MIG)
-const (
-	_MinMachMsg = 48
-	_MachReply  = 100
-)
-
-type codemsg struct {
-	h    machheader
-	ndr  machndr
-	code int32
-}
-
-func machcall(h *machheader, maxsize int32, rxsize int32) int32 {
-	_g_ := getg()
-	port := _g_.m.machport
-	if port == 0 {
-		port = mach_reply_port()
-		_g_.m.machport = port
-	}
-
-	h.msgh_bits |= mach_msgh_bits(_MACH_MSG_TYPE_COPY_SEND, _MACH_MSG_TYPE_MAKE_SEND_ONCE)
-	h.msgh_local_port = port
-	h.msgh_reserved = 0
-	id := h.msgh_id
-
-	if _DebugMach {
-		p := (*[10000]unsafe.Pointer)(unsafe.Pointer(h))
-		print("send:\t")
-		var i uint32
-		for i = 0; i < h.msgh_size/uint32(unsafe.Sizeof(p[0])); i++ {
-			print(" ", p[i])
-			if i%8 == 7 {
-				print("\n\t")
-			}
-		}
-		if i%8 != 0 {
-			print("\n")
-		}
-	}
-	ret := mach_msg(h, _MACH_SEND_MSG|_MACH_RCV_MSG, h.msgh_size, uint32(maxsize), port, 0, 0)
-	if ret != 0 {
-		if _DebugMach {
-			print("mach_msg error ", ret, "\n")
-		}
-		return ret
-	}
-	if _DebugMach {
-		p := (*[10000]unsafe.Pointer)(unsafe.Pointer(h))
-		var i uint32
-		for i = 0; i < h.msgh_size/uint32(unsafe.Sizeof(p[0])); i++ {
-			print(" ", p[i])
-			if i%8 == 7 {
-				print("\n\t")
-			}
-		}
-		if i%8 != 0 {
-			print("\n")
-		}
-	}
-	if h.msgh_id != id+_MachReply {
-		if _DebugMach {
-			print("mach_msg _MachReply id mismatch ", h.msgh_id, " != ", id+_MachReply, "\n")
-		}
-		return -303 // MIG_REPLY_MISMATCH
-	}
-	// Look for a response giving the return value.
-	// Any call can send this back with an error,
-	// and some calls only have return values so they
-	// send it back on success too. I don't quite see how
-	// you know it's one of these and not the full response
-	// format, so just look if the message is right.
-	c := (*codemsg)(unsafe.Pointer(h))
-	if uintptr(h.msgh_size) == unsafe.Sizeof(*c) && h.msgh_bits&_MACH_MSGH_BITS_COMPLEX == 0 {
-		if _DebugMach {
-			print("mig result ", c.code, "\n")
-		}
-		return c.code
-	}
-	if h.msgh_size != uint32(rxsize) {
-		if _DebugMach {
-			print("mach_msg _MachReply size mismatch ", h.msgh_size, " != ", rxsize, "\n")
-		}
-		return -307 // MIG_ARRAY_TOO_LARGE
-	}
-	return 0
-}
-
-// Semaphores!
-
-const (
-	tmach_semcreate = 3418
-	rmach_semcreate = tmach_semcreate + _MachReply
-
-	tmach_semdestroy = 3419
-	rmach_semdestroy = tmach_semdestroy + _MachReply
-
-	_KERN_ABORTED             = 14
-	_KERN_OPERATION_TIMED_OUT = 49
-)
-
-type tmach_semcreatemsg struct {
-	h      machheader
-	ndr    machndr
-	policy int32
-	value  int32
-}
-
-type rmach_semcreatemsg struct {
-	h         machheader
-	body      machbody
-	semaphore machport
-}
-
-type tmach_semdestroymsg struct {
-	h         machheader
-	body      machbody
-	semaphore machport
-}
-
-func mach_semcreate() uint32 {
-	var m [256]uint8
-	tx := (*tmach_semcreatemsg)(unsafe.Pointer(&m))
-	rx := (*rmach_semcreatemsg)(unsafe.Pointer(&m))
-
-	tx.h.msgh_bits = 0
-	tx.h.msgh_size = uint32(unsafe.Sizeof(*tx))
-	tx.h.msgh_remote_port = mach_task_self()
-	tx.h.msgh_id = tmach_semcreate
-	tx.ndr = zerondr
-
-	tx.policy = 0 // 0 = SYNC_POLICY_FIFO
-	tx.value = 0
-
-	for {
-		r := machcall(&tx.h, int32(unsafe.Sizeof(m)), int32(unsafe.Sizeof(*rx)))
-		if r == 0 {
-			break
-		}
-		if r == _KERN_ABORTED { // interrupted
-			continue
-		}
-		macherror(r, "semaphore_create")
-	}
-	if rx.body.msgh_descriptor_count != 1 {
-		unimplemented("mach_semcreate desc count")
-	}
-	return rx.semaphore.name
-}
-
-func mach_semdestroy(sem uint32) {
-	var m [256]uint8
-	tx := (*tmach_semdestroymsg)(unsafe.Pointer(&m))
-
-	tx.h.msgh_bits = _MACH_MSGH_BITS_COMPLEX
-	tx.h.msgh_size = uint32(unsafe.Sizeof(*tx))
-	tx.h.msgh_remote_port = mach_task_self()
-	tx.h.msgh_id = tmach_semdestroy
-	tx.body.msgh_descriptor_count = 1
-	tx.semaphore.name = sem
-	tx.semaphore.disposition = _MACH_MSG_TYPE_MOVE_SEND
-	tx.semaphore._type = 0
-
-	for {
-		r := machcall(&tx.h, int32(unsafe.Sizeof(m)), 0)
-		if r == 0 {
-			break
-		}
-		if r == _KERN_ABORTED { // interrupted
-			continue
-		}
-		macherror(r, "semaphore_destroy")
-	}
-}
-
-// The other calls have simple system call traps in sys_darwin_{amd64,386}.s
-
-func mach_semaphore_wait(sema uint32) int32
-func mach_semaphore_timedwait(sema, sec, nsec uint32) int32
-func mach_semaphore_signal(sema uint32) int32
-func mach_semaphore_signal_all(sema uint32) int32
-
-func semasleep1(ns int64) int32 {
-	_g_ := getg()
-
-	if ns >= 0 {
-		var nsecs int32
-		secs := timediv(ns, 1000000000, &nsecs)
-		r := mach_semaphore_timedwait(_g_.m.waitsema, uint32(secs), uint32(nsecs))
-		if r == _KERN_ABORTED || r == _KERN_OPERATION_TIMED_OUT {
-			return -1
-		}
-		if r != 0 {
-			macherror(r, "semaphore_wait")
-		}
-		return 0
-	}
-
-	for {
-		r := mach_semaphore_wait(_g_.m.waitsema)
-		if r == 0 {
-			break
-		}
-		if r == _KERN_ABORTED { // interrupted
-			continue
-		}
-		macherror(r, "semaphore_wait")
-	}
-	return 0
-}
-
-//go:nosplit
-func semasleep(ns int64) int32 {
-	var r int32
-	systemstack(func() {
-		r = semasleep1(ns)
-	})
-	return r
-}
-
-//go:nosplit
-func mach_semrelease(sem uint32) {
-	for {
-		r := mach_semaphore_signal(sem)
-		if r == 0 {
-			break
-		}
-		if r == _KERN_ABORTED { // interrupted
-			continue
-		}
-
-		// mach_semrelease must be completely nosplit,
-		// because it is called from Go code.
-		// If we're going to die, start that process on the system stack
-		// to avoid a Go stack split.
-		systemstack(func() { macherror(r, "semaphore_signal") })
-	}
-}
-
-//go:nosplit
-func osyield() {
-	usleep(1)
-}
-
-func memlimit() uintptr {
-	// NOTE(rsc): Could use getrlimit here,
-	// like on FreeBSD or Linux, but Darwin doesn't enforce
-	// ulimit -v, so it's unclear why we'd try to stay within
-	// the limit.
-	return 0
-}
-
-//go:nosplit
-//go:nowritebarrierrec
-func setsig(i int32, fn uintptr, restart bool) {
-	var sa sigactiont
-	sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK
-	if restart {
-		sa.sa_flags |= _SA_RESTART
-	}
-	sa.sa_mask = ^uint32(0)
-	sa.sa_tramp = unsafe.Pointer(funcPC(sigtramp)) // runtime·sigtramp's job is to call into real handler
-	*(*uintptr)(unsafe.Pointer(&sa.__sigaction_u)) = fn
-	sigaction(uint32(i), &sa, nil)
-}
-
-//go:nosplit
-//go:nowritebarrierrec
-func setsigstack(i int32) {
-	var osa usigactiont
-	sigaction(uint32(i), nil, &osa)
-	handler := *(*uintptr)(unsafe.Pointer(&osa.__sigaction_u))
-	if handler == 0 || handler == _SIG_DFL || handler == _SIG_IGN || osa.sa_flags&_SA_ONSTACK != 0 {
-		return
-	}
-	var sa sigactiont
-	*(*uintptr)(unsafe.Pointer(&sa.__sigaction_u)) = handler
-	sa.sa_tramp = unsafe.Pointer(funcPC(sigtramp))
-	sa.sa_mask = osa.sa_mask
-	sa.sa_flags = osa.sa_flags | _SA_ONSTACK
-	sigaction(uint32(i), &sa, nil)
-}
-
-//go:nosplit
-//go:nowritebarrierrec
-func getsig(i int32) uintptr {
-	var sa usigactiont
-	sigaction(uint32(i), nil, &sa)
-	return *(*uintptr)(unsafe.Pointer(&sa.__sigaction_u))
-}
-
-//go:nosplit
-func signalstack(s *stack) {
-	var st stackt
-	if s == nil {
-		st.ss_flags = _SS_DISABLE
-	} else {
-		st.ss_sp = (*byte)(unsafe.Pointer(s.lo))
-		st.ss_size = s.hi - s.lo
-		st.ss_flags = 0
-	}
-	sigaltstack(&st, nil)
-}
-
-//go:nosplit
-//go:nowritebarrierrec
-func updatesigmask(m sigmask) {
-	s := sigset(m[0])
-	sigprocmask(_SIG_SETMASK, &s, nil)
-}
-
-func unblocksig(sig int32) {
-	mask := sigset(1) << (uint32(sig) - 1)
-	sigprocmask(_SIG_UNBLOCK, &mask, nil)
-}
diff --git a/src/runtime/os1_dragonfly.go b/src/runtime/os1_dragonfly.go
deleted file mode 100644
index d7044ae4b0..0000000000
--- a/src/runtime/os1_dragonfly.go
+++ /dev/null
@@ -1,270 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-// From DragonFly's <sys/sysctl.h>
-const (
-	_CTL_HW  = 6
-	_HW_NCPU = 3
-)
-
-var sigset_all = sigset{[4]uint32{^uint32(0), ^uint32(0), ^uint32(0), ^uint32(0)}}
-
-func getncpu() int32 {
-	mib := [2]uint32{_CTL_HW, _HW_NCPU}
-	out := uint32(0)
-	nout := unsafe.Sizeof(out)
-	ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0)
-	if ret >= 0 {
-		return int32(out)
-	}
-	return 1
-}
-
-//go:nosplit
-func futexsleep(addr *uint32, val uint32, ns int64) {
-	systemstack(func() {
-		futexsleep1(addr, val, ns)
-	})
-}
-
-func futexsleep1(addr *uint32, val uint32, ns int64) {
-	var timeout int32
-	if ns >= 0 {
-		// The timeout is specified in microseconds - ensure that we
-		// do not end up dividing to zero, which would put us to sleep
-		// indefinitely...
-		timeout = timediv(ns, 1000, nil)
-		if timeout == 0 {
-			timeout = 1
-		}
-	}
-
-	// sys_umtx_sleep will return EWOULDBLOCK (EAGAIN) when the timeout
-	// expires or EBUSY if the mutex value does not match.
-	ret := sys_umtx_sleep(addr, int32(val), timeout)
-	if ret >= 0 || ret == -_EINTR || ret == -_EAGAIN || ret == -_EBUSY {
-		return
-	}
-
-	print("umtx_sleep addr=", addr, " val=", val, " ret=", ret, "\n")
-	*(*int32)(unsafe.Pointer(uintptr(0x1005))) = 0x1005
-}
-
-//go:nosplit
-func futexwakeup(addr *uint32, cnt uint32) {
-	ret := sys_umtx_wakeup(addr, int32(cnt))
-	if ret >= 0 {
-		return
-	}
-
-	systemstack(func() {
-		print("umtx_wake_addr=", addr, " ret=", ret, "\n")
-		*(*int32)(unsafe.Pointer(uintptr(0x1006))) = 0x1006
-	})
-}
-
-func lwp_start(uintptr)
-
-// May run with m.p==nil, so write barriers are not allowed.
-//go:nowritebarrier
-func newosproc(mp *m, stk unsafe.Pointer) {
-	if false {
-		print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " lwp_start=", funcPC(lwp_start), " id=", mp.id, " ostk=", &mp, "\n")
-	}
-
-	var oset sigset
-	sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
-
-	params := lwpparams{
-		start_func: funcPC(lwp_start),
-		arg:        unsafe.Pointer(mp),
-		stack:      uintptr(stk),
-		tid1:       unsafe.Pointer(&mp.procid),
-		tid2:       nil,
-	}
-
-	lwp_create(&params)
-	sigprocmask(_SIG_SETMASK, &oset, nil)
-}
-
-func osinit() {
-	ncpu = getncpu()
-}
-
-var urandom_dev = []byte("/dev/urandom\x00")
-
-//go:nosplit
-func getRandomData(r []byte) {
-	fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
-	n := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
-	closefd(fd)
-	extendRandom(r, int(n))
-}
-
-func goenvs() {
-	goenvs_unix()
-}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
-func mpreinit(mp *m) {
-	mp.gsignal = malg(32 * 1024)
-	mp.gsignal.m = mp
-}
-
-//go:nosplit
-func msigsave(mp *m) {
-	sigprocmask(_SIG_SETMASK, nil, &mp.sigmask)
-}
-
-//go:nosplit
-func msigrestore(sigmask sigset) {
-	sigprocmask(_SIG_SETMASK, &sigmask, nil)
-}
-
-//go:nosplit
-func sigblock() {
-	sigprocmask(_SIG_SETMASK, &sigset_all, nil)
-}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the new thread, cannot allocate memory.
-func minit() {
-	_g_ := getg()
-
-	// m.procid is a uint64, but lwp_start writes an int32. Fix it up.
-	_g_.m.procid = uint64(*(*int32)(unsafe.Pointer(&_g_.m.procid)))
-
-	// Initialize signal handling.
-
-	// On DragonFly a thread created by pthread_create inherits
-	// the signal stack of the creating thread. We always create
-	// a new signal stack here, to avoid having two Go threads
-	// using the same signal stack. This breaks the case of a
-	// thread created in C that calls sigaltstack and then calls a
-	// Go function, because we will lose track of the C code's
-	// sigaltstack, but it's the best we can do.
-	signalstack(&_g_.m.gsignal.stack)
-	_g_.m.newSigstack = true
-
-	// restore signal mask from m.sigmask and unblock essential signals
-	nmask := _g_.m.sigmask
-	for i := range sigtable {
-		if sigtable[i].flags&_SigUnblock != 0 {
-			nmask.__bits[(i-1)/32] &^= 1 << ((uint32(i) - 1) & 31)
-		}
-	}
-	sigprocmask(_SIG_SETMASK, &nmask, nil)
-}
-
-// Called from dropm to undo the effect of an minit.
-//go:nosplit
-func unminit() {
-	if getg().m.newSigstack {
-		signalstack(nil)
-	}
-}
-
-func memlimit() uintptr {
-	/*
-		                TODO: Convert to Go when something actually uses the result.
-
-				Rlimit rl;
-				extern byte runtime·text[], runtime·end[];
-				uintptr used;
-
-				if(runtime·getrlimit(RLIMIT_AS, &rl) != 0)
-					return 0;
-				if(rl.rlim_cur >= 0x7fffffff)
-					return 0;
-
-				// Estimate our VM footprint excluding the heap.
-				// Not an exact science: use size of binary plus
-				// some room for thread stacks.
-				used = runtime·end - runtime·text + (64<<20);
-				if(used >= rl.rlim_cur)
-					return 0;
-
-				// If there's not at least 16 MB left, we're probably
-				// not going to be able to do much. Treat as no limit.
-				rl.rlim_cur -= used;
-				if(rl.rlim_cur < (16<<20))
-					return 0;
-
-				return rl.rlim_cur - used;
-	*/
-	return 0
-}
-
-func sigtramp()
-
-type sigactiont struct {
-	sa_sigaction uintptr
-	sa_flags     int32
-	sa_mask      sigset
-}
-
-//go:nosplit
-//go:nowritebarrierrec
-func setsig(i int32, fn uintptr, restart bool) {
-	var sa sigactiont
-	sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK
-	if restart {
-		sa.sa_flags |= _SA_RESTART
-	}
-	sa.sa_mask = sigset_all
-	if fn == funcPC(sighandler) {
-		fn = funcPC(sigtramp)
-	}
-	sa.sa_sigaction = fn
-	sigaction(i, &sa, nil)
-}
-
-//go:nosplit
-//go:nowritebarrierrec
-func setsigstack(i int32) {
-	throw("setsigstack")
-}
-
-//go:nosplit
-//go:nowritebarrierrec
-func getsig(i int32) uintptr {
-	var sa sigactiont
-	sigaction(i, nil, &sa)
-	if sa.sa_sigaction == funcPC(sigtramp) {
-		return funcPC(sighandler)
-	}
-	return sa.sa_sigaction
-}
-
-//go:nosplit
-func signalstack(s *stack) {
-	var st sigaltstackt
-	if s == nil {
-		st.ss_flags = _SS_DISABLE
-	} else {
-		st.ss_sp = s.lo
-		st.ss_size = s.hi - s.lo
-		st.ss_flags = 0
-	}
-	sigaltstack(&st, nil)
-}
-
-//go:nosplit
-//go:nowritebarrierrec
-func updatesigmask(m sigmask) {
-	var mask sigset
-	copy(mask.__bits[:], m[:])
-	sigprocmask(_SIG_SETMASK, &mask, nil)
-}
-
-func unblocksig(sig int32) {
-	var mask sigset
-	mask.__bits[(sig-1)/32] |= 1 << ((uint32(sig) - 1) & 31)
-	sigprocmask(_SIG_UNBLOCK, &mask, nil)
-}
diff --git a/src/runtime/os1_linux.go b/src/runtime/os1_linux.go
deleted file mode 100644
index 726dd649fe..0000000000
--- a/src/runtime/os1_linux.go
+++ /dev/null
@@ -1,393 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import (
-	"runtime/internal/sys"
-	"unsafe"
-)
-
-// Linux futex.
-//
-//	futexsleep(uint32 *addr, uint32 val)
-//	futexwakeup(uint32 *addr)
-//
-// Futexsleep atomically checks if *addr == val and if so, sleeps on addr.
-// Futexwakeup wakes up threads sleeping on addr.
-// Futexsleep is allowed to wake up spuriously.
-
-const (
-	_FUTEX_WAIT = 0
-	_FUTEX_WAKE = 1
-)
-
-// Atomically,
-//	if(*addr == val) sleep
-// Might be woken up spuriously; that's allowed.
-// Don't sleep longer than ns; ns < 0 means forever.
-//go:nosplit
-func futexsleep(addr *uint32, val uint32, ns int64) {
-	var ts timespec
-
-	// Some Linux kernels have a bug where futex of
-	// FUTEX_WAIT returns an internal error code
-	// as an errno. Libpthread ignores the return value
-	// here, and so can we: as it says a few lines up,
-	// spurious wakeups are allowed.
-	if ns < 0 {
-		futex(unsafe.Pointer(addr), _FUTEX_WAIT, val, nil, nil, 0)
-		return
-	}
-
-	// It's difficult to live within the no-split stack limits here.
-	// On ARM and 386, a 64-bit divide invokes a general software routine
-	// that needs more stack than we can afford. So we use timediv instead.
-	// But on real 64-bit systems, where words are larger but the stack limit
-	// is not, even timediv is too heavy, and we really need to use just an
-	// ordinary machine instruction.
-	if sys.PtrSize == 8 {
-		ts.set_sec(ns / 1000000000)
-		ts.set_nsec(int32(ns % 1000000000))
-	} else {
-		ts.tv_nsec = 0
-		ts.set_sec(int64(timediv(ns, 1000000000, (*int32)(unsafe.Pointer(&ts.tv_nsec)))))
-	}
-	futex(unsafe.Pointer(addr), _FUTEX_WAIT, val, unsafe.Pointer(&ts), nil, 0)
-}
-
-// If any procs are sleeping on addr, wake up at most cnt.
-//go:nosplit
-func futexwakeup(addr *uint32, cnt uint32) {
-	ret := futex(unsafe.Pointer(addr), _FUTEX_WAKE, cnt, nil, nil, 0)
-	if ret >= 0 {
-		return
-	}
-
-	// I don't know that futex wakeup can return
-	// EAGAIN or EINTR, but if it does, it would be
-	// safe to loop and call futex again.
-	systemstack(func() {
-		print("futexwakeup addr=", addr, " returned ", ret, "\n")
-	})
-
-	*(*int32)(unsafe.Pointer(uintptr(0x1006))) = 0x1006
-}
-
-func getproccount() int32 {
-	// This buffer is huge (8 kB) but we are on the system stack
-	// and there should be plenty of space (64 kB).
-	// Also this is a leaf, so we're not holding up the memory for long.
-	// See golang.org/issue/11823.
-	// The suggested behavior here is to keep trying with ever-larger
-	// buffers, but we don't have a dynamic memory allocator at the
-	// moment, so that's a bit tricky and seems like overkill.
-	const maxCPUs = 64 * 1024
-	var buf [maxCPUs / (sys.PtrSize * 8)]uintptr
-	r := sched_getaffinity(0, unsafe.Sizeof(buf), &buf[0])
-	n := int32(0)
-	for _, v := range buf[:r/sys.PtrSize] {
-		for v != 0 {
-			n += int32(v & 1)
-			v >>= 1
-		}
-	}
-	if n == 0 {
-		n = 1
-	}
-	return n
-}
-
-// Clone, the Linux rfork.
-const (
-	_CLONE_VM             = 0x100
-	_CLONE_FS             = 0x200
-	_CLONE_FILES          = 0x400
-	_CLONE_SIGHAND        = 0x800
-	_CLONE_PTRACE         = 0x2000
-	_CLONE_VFORK          = 0x4000
-	_CLONE_PARENT         = 0x8000
-	_CLONE_THREAD         = 0x10000
-	_CLONE_NEWNS          = 0x20000
-	_CLONE_SYSVSEM        = 0x40000
-	_CLONE_SETTLS         = 0x80000
-	_CLONE_PARENT_SETTID  = 0x100000
-	_CLONE_CHILD_CLEARTID = 0x200000
-	_CLONE_UNTRACED       = 0x800000
-	_CLONE_CHILD_SETTID   = 0x1000000
-	_CLONE_STOPPED        = 0x2000000
-	_CLONE_NEWUTS         = 0x4000000
-	_CLONE_NEWIPC         = 0x8000000
-
-	cloneFlags = _CLONE_VM | /* share memory */
-		_CLONE_FS | /* share cwd, etc */
-		_CLONE_FILES | /* share fd table */
-		_CLONE_SIGHAND | /* share sig handler table */
-		_CLONE_THREAD /* revisit - okay for now */
-)
-
-// May run with m.p==nil, so write barriers are not allowed.
-//go:nowritebarrier
-func newosproc(mp *m, stk unsafe.Pointer) {
-	/*
-	 * note: strace gets confused if we use CLONE_PTRACE here.
-	 */
-	if false {
-		print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " clone=", funcPC(clone), " id=", mp.id, " ostk=", &mp, "\n")
-	}
-
-	// Disable signals during clone, so that the new thread starts
-	// with signals disabled. It will enable them in minit.
-	var oset sigset
-	rtsigprocmask(_SIG_SETMASK, &sigset_all, &oset, int32(unsafe.Sizeof(oset)))
-	ret := clone(cloneFlags, stk, unsafe.Pointer(mp), unsafe.Pointer(mp.g0), unsafe.Pointer(funcPC(mstart)))
-	rtsigprocmask(_SIG_SETMASK, &oset, nil, int32(unsafe.Sizeof(oset)))
-
-	if ret < 0 {
-		print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", -ret, ")\n")
-		throw("newosproc")
-	}
-}
-
-// Version of newosproc that doesn't require a valid G.
-//go:nosplit
-func newosproc0(stacksize uintptr, fn unsafe.Pointer) {
-	stack := sysAlloc(stacksize, &memstats.stacks_sys)
-	if stack == nil {
-		write(2, unsafe.Pointer(&failallocatestack[0]), int32(len(failallocatestack)))
-		exit(1)
-	}
-	ret := clone(cloneFlags, unsafe.Pointer(uintptr(stack)+stacksize), nil, nil, fn)
-	if ret < 0 {
-		write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
-		exit(1)
-	}
-}
-
-var failallocatestack = []byte("runtime: failed to allocate stack for the new OS thread\n")
-var failthreadcreate = []byte("runtime: failed to create new OS thread\n")
-
-func osinit() {
-	ncpu = getproccount()
-}
-
-var urandom_dev = []byte("/dev/urandom\x00")
-
-func getRandomData(r []byte) {
-	if startupRandomData != nil {
-		n := copy(r, startupRandomData)
-		extendRandom(r, n)
-		return
-	}
-	fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
-	n := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
-	closefd(fd)
-	extendRandom(r, int(n))
-}
-
-func goenvs() {
-	goenvs_unix()
-}
-
-// Called to do synchronous initialization of Go code built with
-// -buildmode=c-archive or -buildmode=c-shared.
-// None of the Go runtime is initialized.
-//go:nosplit
-//go:nowritebarrierrec
-func libpreinit() {
-	initsig(true)
-}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
-func mpreinit(mp *m) {
-	mp.gsignal = malg(32 * 1024) // Linux wants >= 2K
-	mp.gsignal.m = mp
-}
-
-//go:nosplit
-func msigsave(mp *m) {
-	smask := &mp.sigmask
-	rtsigprocmask(_SIG_SETMASK, nil, smask, int32(unsafe.Sizeof(*smask)))
-}
-
-//go:nosplit
-func msigrestore(sigmask sigset) {
-	rtsigprocmask(_SIG_SETMASK, &sigmask, nil, int32(unsafe.Sizeof(sigmask)))
-}
-
-//go:nosplit
-func sigblock() {
-	rtsigprocmask(_SIG_SETMASK, &sigset_all, nil, int32(unsafe.Sizeof(sigset_all)))
-}
-
-func gettid() uint32
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the new thread, cannot allocate memory.
-func minit() {
-	// Initialize signal handling.
-	_g_ := getg()
-
-	var st sigaltstackt
-	sigaltstack(nil, &st)
-	if st.ss_flags&_SS_DISABLE != 0 {
-		signalstack(&_g_.m.gsignal.stack)
-		_g_.m.newSigstack = true
-	} else {
-		// Use existing signal stack.
-		stsp := uintptr(unsafe.Pointer(st.ss_sp))
-		_g_.m.gsignal.stack.lo = stsp
-		_g_.m.gsignal.stack.hi = stsp + st.ss_size
-		_g_.m.gsignal.stackguard0 = stsp + _StackGuard
-		_g_.m.gsignal.stackguard1 = stsp + _StackGuard
-		_g_.m.gsignal.stackAlloc = st.ss_size
-		_g_.m.newSigstack = false
-	}
-
-	// for debuggers, in case cgo created the thread
-	_g_.m.procid = uint64(gettid())
-
-	// restore signal mask from m.sigmask and unblock essential signals
-	nmask := _g_.m.sigmask
-	for i := range sigtable {
-		if sigtable[i].flags&_SigUnblock != 0 {
-			sigdelset(&nmask, i)
-		}
-	}
-	rtsigprocmask(_SIG_SETMASK, &nmask, nil, int32(unsafe.Sizeof(nmask)))
-}
-
-// Called from dropm to undo the effect of an minit.
-//go:nosplit
-func unminit() {
-	if getg().m.newSigstack {
-		signalstack(nil)
-	}
-}
-
-func memlimit() uintptr {
-	/*
-		TODO: Convert to Go when something actually uses the result.
-
-		Rlimit rl;
-		extern byte runtime·text[], runtime·end[];
-		uintptr used;
-
-		if(runtime·getrlimit(RLIMIT_AS, &rl) != 0)
-			return 0;
-		if(rl.rlim_cur >= 0x7fffffff)
-			return 0;
-
-		// Estimate our VM footprint excluding the heap.
-		// Not an exact science: use size of binary plus
-		// some room for thread stacks.
-		used = runtime·end - runtime·text + (64<<20);
-		if(used >= rl.rlim_cur)
-			return 0;
-
-		// If there's not at least 16 MB left, we're probably
-		// not going to be able to do much. Treat as no limit.
-		rl.rlim_cur -= used;
-		if(rl.rlim_cur < (16<<20))
-			return 0;
-
-		return rl.rlim_cur - used;
-	*/
-
-	return 0
-}
-
-//#ifdef GOARCH_386
-//#define sa_handler k_sa_handler
-//#endif
-
-func sigreturn()
-func sigtramp()
-func cgoSigtramp()
-
-//go:nosplit
-//go:nowritebarrierrec
-func setsig(i int32, fn uintptr, restart bool) {
-	var sa sigactiont
-	memclr(unsafe.Pointer(&sa), unsafe.Sizeof(sa))
-	sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK | _SA_RESTORER
-	if restart {
-		sa.sa_flags |= _SA_RESTART
-	}
-	sigfillset(&sa.sa_mask)
-	// Although Linux manpage says "sa_restorer element is obsolete and
-	// should not be used". x86_64 kernel requires it. Only use it on
-	// x86.
-	if GOARCH == "386" || GOARCH == "amd64" {
-		sa.sa_restorer = funcPC(sigreturn)
-	}
-	if fn == funcPC(sighandler) {
-		if iscgo {
-			fn = funcPC(cgoSigtramp)
-		} else {
-			fn = funcPC(sigtramp)
-		}
-	}
-	sa.sa_handler = fn
-	rt_sigaction(uintptr(i), &sa, nil, unsafe.Sizeof(sa.sa_mask))
-}
-
-//go:nosplit
-//go:nowritebarrierrec
-func setsigstack(i int32) {
-	var sa sigactiont
-	if rt_sigaction(uintptr(i), nil, &sa, unsafe.Sizeof(sa.sa_mask)) != 0 {
-		throw("rt_sigaction failure")
-	}
-	if sa.sa_handler == 0 || sa.sa_handler == _SIG_DFL || sa.sa_handler == _SIG_IGN || sa.sa_flags&_SA_ONSTACK != 0 {
-		return
-	}
-	sa.sa_flags |= _SA_ONSTACK
-	if rt_sigaction(uintptr(i), &sa, nil, unsafe.Sizeof(sa.sa_mask)) != 0 {
-		throw("rt_sigaction failure")
-	}
-}
-
-//go:nosplit
-//go:nowritebarrierrec
-func getsig(i int32) uintptr {
-	var sa sigactiont
-
-	memclr(unsafe.Pointer(&sa), unsafe.Sizeof(sa))
-	if rt_sigaction(uintptr(i), nil, &sa, unsafe.Sizeof(sa.sa_mask)) != 0 {
-		throw("rt_sigaction read failure")
-	}
-	if sa.sa_handler == funcPC(sigtramp) || sa.sa_handler == funcPC(cgoSigtramp) {
-		return funcPC(sighandler)
-	}
-	return sa.sa_handler
-}
-
-//go:nosplit
-func signalstack(s *stack) {
-	var st sigaltstackt
-	if s == nil {
-		st.ss_flags = _SS_DISABLE
-	} else {
-		st.ss_sp = (*byte)(unsafe.Pointer(s.lo))
-		st.ss_size = s.hi - s.lo
-		st.ss_flags = 0
-	}
-	sigaltstack(&st, nil)
-}
-
-//go:nosplit
-//go:nowritebarrierrec
-func updatesigmask(m sigmask) {
-	var mask sigset
-	sigcopyset(&mask, m)
-	rtsigprocmask(_SIG_SETMASK, &mask, nil, int32(unsafe.Sizeof(mask)))
-}
-
-func unblocksig(sig int32) {
-	var mask sigset
-	sigaddset(&mask, int(sig))
-	rtsigprocmask(_SIG_UNBLOCK, &mask, nil, int32(unsafe.Sizeof(mask)))
-}
diff --git a/src/runtime/os1_linux_generic.go b/src/runtime/os1_linux_generic.go
deleted file mode 100644
index 2c8b743aeb..0000000000
--- a/src/runtime/os1_linux_generic.go
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !mips64
-// +build !mips64le
-// +build linux
-
-package runtime
-
-var sigset_all = sigset{^uint32(0), ^uint32(0)}
-
-func sigaddset(mask *sigset, i int) {
-	(*mask)[(i-1)/32] |= 1 << ((uint32(i) - 1) & 31)
-}
-
-func sigdelset(mask *sigset, i int) {
-	(*mask)[(i-1)/32] &^= 1 << ((uint32(i) - 1) & 31)
-}
-
-func sigfillset(mask *uint64) {
-	*mask = ^uint64(0)
-}
-
-func sigcopyset(mask *sigset, m sigmask) {
-	copy((*mask)[:], m[:])
-}
diff --git a/src/runtime/os1_linux_mips64x.go b/src/runtime/os1_linux_mips64x.go
deleted file mode 100644
index 701e979102..0000000000
--- a/src/runtime/os1_linux_mips64x.go
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build mips64 mips64le
-// +build linux
-
-package runtime
-
-var sigset_all = sigset{^uint64(0), ^uint64(0)}
-
-func sigaddset(mask *sigset, i int) {
-	(*mask)[(i-1)/64] |= 1 << ((uint32(i) - 1) & 63)
-}
-
-func sigdelset(mask *sigset, i int) {
-	(*mask)[(i-1)/64] &^= 1 << ((uint32(i) - 1) & 63)
-}
-
-func sigfillset(mask *[2]uint64) {
-	(*mask)[0], (*mask)[1] = ^uint64(0), ^uint64(0)
-}
-
-func sigcopyset(mask *sigset, m sigmask) {
-	(*mask)[0] = uint64(m[0]) | uint64(m[1])<<32
-}
diff --git a/src/runtime/os1_netbsd.go b/src/runtime/os1_netbsd.go
deleted file mode 100644
index 3c3b64186d..0000000000
--- a/src/runtime/os1_netbsd.go
+++ /dev/null
@@ -1,275 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import (
-	"runtime/internal/atomic"
-	"unsafe"
-)
-
-const (
-	_ESRCH     = 3
-	_ETIMEDOUT = 60
-
-	// From NetBSD's <sys/time.h>
-	_CLOCK_REALTIME  = 0
-	_CLOCK_VIRTUAL   = 1
-	_CLOCK_PROF      = 2
-	_CLOCK_MONOTONIC = 3
-)
-
-var sigset_all = sigset{[4]uint32{^uint32(0), ^uint32(0), ^uint32(0), ^uint32(0)}}
-
-// From NetBSD's <sys/sysctl.h>
-const (
-	_CTL_HW  = 6
-	_HW_NCPU = 3
-)
-
-func getncpu() int32 {
-	mib := [2]uint32{_CTL_HW, _HW_NCPU}
-	out := uint32(0)
-	nout := unsafe.Sizeof(out)
-	ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0)
-	if ret >= 0 {
-		return int32(out)
-	}
-	return 1
-}
-
-//go:nosplit
-func semacreate(mp *m) {
-}
-
-//go:nosplit
-func semasleep(ns int64) int32 {
-	_g_ := getg()
-
-	// Compute sleep deadline.
-	var tsp *timespec
-	if ns >= 0 {
-		var ts timespec
-		var nsec int32
-		ns += nanotime()
-		ts.set_sec(timediv(ns, 1000000000, &nsec))
-		ts.set_nsec(nsec)
-		tsp = &ts
-	}
-
-	for {
-		v := atomic.Load(&_g_.m.waitsemacount)
-		if v > 0 {
-			if atomic.Cas(&_g_.m.waitsemacount, v, v-1) {
-				return 0 // semaphore acquired
-			}
-			continue
-		}
-
-		// Sleep until unparked by semawakeup or timeout.
-		ret := lwp_park(tsp, 0, unsafe.Pointer(&_g_.m.waitsemacount), nil)
-		if ret == _ETIMEDOUT {
-			return -1
-		}
-	}
-}
-
-//go:nosplit
-func semawakeup(mp *m) {
-	atomic.Xadd(&mp.waitsemacount, 1)
-	// From NetBSD's _lwp_unpark(2) manual:
-	// "If the target LWP is not currently waiting, it will return
-	// immediately upon the next call to _lwp_park()."
-	ret := lwp_unpark(int32(mp.procid), unsafe.Pointer(&mp.waitsemacount))
-	if ret != 0 && ret != _ESRCH {
-		// semawakeup can be called on signal stack.
-		systemstack(func() {
-			print("thrwakeup addr=", &mp.waitsemacount, " sem=", mp.waitsemacount, " ret=", ret, "\n")
-		})
-	}
-}
-
-// May run with m.p==nil, so write barriers are not allowed.
-//go:nowritebarrier
-func newosproc(mp *m, stk unsafe.Pointer) {
-	if false {
-		print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " id=", mp.id, " ostk=", &mp, "\n")
-	}
-
-	var uc ucontextt
-	getcontext(unsafe.Pointer(&uc))
-
-	uc.uc_flags = _UC_SIGMASK | _UC_CPU
-	uc.uc_link = nil
-	uc.uc_sigmask = sigset_all
-
-	lwp_mcontext_init(&uc.uc_mcontext, stk, mp, mp.g0, funcPC(netbsdMstart))
-
-	ret := lwp_create(unsafe.Pointer(&uc), 0, unsafe.Pointer(&mp.procid))
-	if ret < 0 {
-		print("runtime: failed to create new OS thread (have ", mcount()-1, " already; errno=", -ret, ")\n")
-		throw("runtime.newosproc")
-	}
-}
-
-// netbsdMStart is the function call that starts executing a newly
-// created thread. On NetBSD, a new thread inherits the signal stack
-// of the creating thread. That confuses minit, so we remove that
-// signal stack here before calling the regular mstart. It's a bit
-// baroque to remove a signal stack here only to add one in minit, but
-// it's a simple change that keeps NetBSD working like other OS's.
-// At this point all signals are blocked, so there is no race.
-//go:nosplit
-func netbsdMstart() {
-	signalstack(nil)
-	mstart()
-}
-
-func osinit() {
-	ncpu = getncpu()
-}
-
-var urandom_dev = []byte("/dev/urandom\x00")
-
-//go:nosplit
-func getRandomData(r []byte) {
-	fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
-	n := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
-	closefd(fd)
-	extendRandom(r, int(n))
-}
-
-func goenvs() {
-	goenvs_unix()
-}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
-func mpreinit(mp *m) {
-	mp.gsignal = malg(32 * 1024)
-	mp.gsignal.m = mp
-}
-
-//go:nosplit
-func msigsave(mp *m) {
-	sigprocmask(_SIG_SETMASK, nil, &mp.sigmask)
-}
-
-//go:nosplit
-func msigrestore(sigmask sigset) {
-	sigprocmask(_SIG_SETMASK, &sigmask, nil)
-}
-
-//go:nosplit
-func sigblock() {
-	sigprocmask(_SIG_SETMASK, &sigset_all, nil)
-}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the new thread, cannot allocate memory.
-func minit() {
-	_g_ := getg()
-	_g_.m.procid = uint64(lwp_self())
-
-	// Initialize signal handling.
-
-	// On NetBSD a thread created by pthread_create inherits the
-	// signal stack of the creating thread. We always create a
-	// new signal stack here, to avoid having two Go threads using
-	// the same signal stack. This breaks the case of a thread
-	// created in C that calls sigaltstack and then calls a Go
-	// function, because we will lose track of the C code's
-	// sigaltstack, but it's the best we can do.
-	signalstack(&_g_.m.gsignal.stack)
-	_g_.m.newSigstack = true
-
-	// restore signal mask from m.sigmask and unblock essential signals
-	nmask := _g_.m.sigmask
-	for i := range sigtable {
-		if sigtable[i].flags&_SigUnblock != 0 {
-			nmask.__bits[(i-1)/32] &^= 1 << ((uint32(i) - 1) & 31)
-		}
-	}
-	sigprocmask(_SIG_SETMASK, &nmask, nil)
-}
-
-// Called from dropm to undo the effect of an minit.
-//go:nosplit
-func unminit() {
-	if getg().m.newSigstack {
-		signalstack(nil)
-	}
-}
-
-func memlimit() uintptr {
-	return 0
-}
-
-func sigtramp()
-
-type sigactiont struct {
-	sa_sigaction uintptr
-	sa_mask      sigset
-	sa_flags     int32
-}
-
-//go:nosplit
-//go:nowritebarrierrec
-func setsig(i int32, fn uintptr, restart bool) {
-	var sa sigactiont
-	sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK
-	if restart {
-		sa.sa_flags |= _SA_RESTART
-	}
-	sa.sa_mask = sigset_all
-	if fn == funcPC(sighandler) {
-		fn = funcPC(sigtramp)
-	}
-	sa.sa_sigaction = fn
-	sigaction(i, &sa, nil)
-}
-
-//go:nosplit
-//go:nowritebarrierrec
-func setsigstack(i int32) {
-	throw("setsigstack")
-}
-
-//go:nosplit
-//go:nowritebarrierrec
-func getsig(i int32) uintptr {
-	var sa sigactiont
-	sigaction(i, nil, &sa)
-	if sa.sa_sigaction == funcPC(sigtramp) {
-		return funcPC(sighandler)
-	}
-	return sa.sa_sigaction
-}
-
-//go:nosplit
-func signalstack(s *stack) {
-	var st sigaltstackt
-	if s == nil {
-		st.ss_flags = _SS_DISABLE
-	} else {
-		st.ss_sp = s.lo
-		st.ss_size = s.hi - s.lo
-		st.ss_flags = 0
-	}
-	sigaltstack(&st, nil)
-}
-
-//go:nosplit
-//go:nowritebarrierrec
-func updatesigmask(m sigmask) {
-	var mask sigset
-	copy(mask.__bits[:], m[:])
-	sigprocmask(_SIG_SETMASK, &mask, nil)
-}
-
-func unblocksig(sig int32) {
-	var mask sigset
-	mask.__bits[(sig-1)/32] |= 1 << ((uint32(sig) - 1) & 31)
-	sigprocmask(_SIG_UNBLOCK, &mask, nil)
-}
diff --git a/src/runtime/os1_plan9.go b/src/runtime/os1_plan9.go
index 2c257442ba..eb7a0c6481 100644
--- a/src/runtime/os1_plan9.go
+++ b/src/runtime/os1_plan9.go
@@ -17,10 +17,10 @@ func mpreinit(mp *m) {
 	// Initialize stack and goroutine for note handling.
 	mp.gsignal = malg(32 * 1024)
 	mp.gsignal.m = mp
-	mp.notesig = (*int8)(mallocgc(_ERRMAX, nil, _FlagNoScan))
+	mp.notesig = (*int8)(mallocgc(_ERRMAX, nil, true))
 	// Initialize stack for handling strings from the
 	// errstr system call, as used in package syscall.
-	mp.errstr = (*byte)(mallocgc(_ERRMAX, nil, _FlagNoScan))
+	mp.errstr = (*byte)(mallocgc(_ERRMAX, nil, true))
 }
 
 func msigsave(mp *m) {
diff --git a/src/runtime/os1_windows.go b/src/runtime/os1_windows.go
deleted file mode 100644
index 315dd9816a..0000000000
--- a/src/runtime/os1_windows.go
+++ /dev/null
@@ -1,703 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import (
-	"runtime/internal/atomic"
-	"unsafe"
-)
-
-//go:cgo_import_dynamic runtime._AddVectoredExceptionHandler AddVectoredExceptionHandler%2 "kernel32.dll"
-//go:cgo_import_dynamic runtime._CloseHandle CloseHandle%1 "kernel32.dll"
-//go:cgo_import_dynamic runtime._CreateEventA CreateEventA%4 "kernel32.dll"
-//go:cgo_import_dynamic runtime._CreateIoCompletionPort CreateIoCompletionPort%4 "kernel32.dll"
-//go:cgo_import_dynamic runtime._CreateThread CreateThread%6 "kernel32.dll"
-//go:cgo_import_dynamic runtime._CreateWaitableTimerA CreateWaitableTimerA%3 "kernel32.dll"
-//go:cgo_import_dynamic runtime._CryptAcquireContextW CryptAcquireContextW%5 "advapi32.dll"
-//go:cgo_import_dynamic runtime._CryptGenRandom CryptGenRandom%3 "advapi32.dll"
-//go:cgo_import_dynamic runtime._CryptReleaseContext CryptReleaseContext%2 "advapi32.dll"
-//go:cgo_import_dynamic runtime._DuplicateHandle DuplicateHandle%7 "kernel32.dll"
-//go:cgo_import_dynamic runtime._ExitProcess ExitProcess%1 "kernel32.dll"
-//go:cgo_import_dynamic runtime._FreeEnvironmentStringsW FreeEnvironmentStringsW%1 "kernel32.dll"
-//go:cgo_import_dynamic runtime._GetConsoleMode GetConsoleMode%2 "kernel32.dll"
-//go:cgo_import_dynamic runtime._GetEnvironmentStringsW GetEnvironmentStringsW%0 "kernel32.dll"
-//go:cgo_import_dynamic runtime._GetProcAddress GetProcAddress%2 "kernel32.dll"
-//go:cgo_import_dynamic runtime._GetProcessAffinityMask GetProcessAffinityMask%3 "kernel32.dll"
-//go:cgo_import_dynamic runtime._GetQueuedCompletionStatus GetQueuedCompletionStatus%5 "kernel32.dll"
-//go:cgo_import_dynamic runtime._GetStdHandle GetStdHandle%1 "kernel32.dll"
-//go:cgo_import_dynamic runtime._GetSystemInfo GetSystemInfo%1 "kernel32.dll"
-//go:cgo_import_dynamic runtime._GetThreadContext GetThreadContext%2 "kernel32.dll"
-//go:cgo_import_dynamic runtime._LoadLibraryW LoadLibraryW%1 "kernel32.dll"
-//go:cgo_import_dynamic runtime._LoadLibraryA LoadLibraryA%1 "kernel32.dll"
-//go:cgo_import_dynamic runtime._NtWaitForSingleObject NtWaitForSingleObject%3 "ntdll.dll"
-//go:cgo_import_dynamic runtime._ResumeThread ResumeThread%1 "kernel32.dll"
-//go:cgo_import_dynamic runtime._SetConsoleCtrlHandler SetConsoleCtrlHandler%2 "kernel32.dll"
-//go:cgo_import_dynamic runtime._SetErrorMode SetErrorMode%1 "kernel32.dll"
-//go:cgo_import_dynamic runtime._SetEvent SetEvent%1 "kernel32.dll"
-//go:cgo_import_dynamic runtime._SetProcessPriorityBoost SetProcessPriorityBoost%2 "kernel32.dll"
-//go:cgo_import_dynamic runtime._SetThreadPriority SetThreadPriority%2 "kernel32.dll"
-//go:cgo_import_dynamic runtime._SetUnhandledExceptionFilter SetUnhandledExceptionFilter%1 "kernel32.dll"
-//go:cgo_import_dynamic runtime._SetWaitableTimer SetWaitableTimer%6 "kernel32.dll"
-//go:cgo_import_dynamic runtime._SuspendThread SuspendThread%1 "kernel32.dll"
-//go:cgo_import_dynamic runtime._SwitchToThread SwitchToThread%0 "kernel32.dll"
-//go:cgo_import_dynamic runtime._VirtualAlloc VirtualAlloc%4 "kernel32.dll"
-//go:cgo_import_dynamic runtime._VirtualFree VirtualFree%3 "kernel32.dll"
-//go:cgo_import_dynamic runtime._WSAGetOverlappedResult WSAGetOverlappedResult%5 "ws2_32.dll"
-//go:cgo_import_dynamic runtime._WaitForSingleObject WaitForSingleObject%2 "kernel32.dll"
-//go:cgo_import_dynamic runtime._WriteConsoleW WriteConsoleW%5 "kernel32.dll"
-//go:cgo_import_dynamic runtime._WriteFile WriteFile%5 "kernel32.dll"
-
-var (
-	// Following syscalls are available on every Windows PC.
-	// All these variables are set by the Windows executable
-	// loader before the Go program starts.
-	_AddVectoredExceptionHandler,
-	_CloseHandle,
-	_CreateEventA,
-	_CreateIoCompletionPort,
-	_CreateThread,
-	_CreateWaitableTimerA,
-	_CryptAcquireContextW,
-	_CryptGenRandom,
-	_CryptReleaseContext,
-	_DuplicateHandle,
-	_ExitProcess,
-	_FreeEnvironmentStringsW,
-	_GetConsoleMode,
-	_GetEnvironmentStringsW,
-	_GetProcAddress,
-	_GetProcessAffinityMask,
-	_GetQueuedCompletionStatus,
-	_GetStdHandle,
-	_GetSystemInfo,
-	_GetThreadContext,
-	_LoadLibraryW,
-	_LoadLibraryA,
-	_NtWaitForSingleObject,
-	_ResumeThread,
-	_SetConsoleCtrlHandler,
-	_SetErrorMode,
-	_SetEvent,
-	_SetProcessPriorityBoost,
-	_SetThreadPriority,
-	_SetUnhandledExceptionFilter,
-	_SetWaitableTimer,
-	_SuspendThread,
-	_SwitchToThread,
-	_VirtualAlloc,
-	_VirtualFree,
-	_WSAGetOverlappedResult,
-	_WaitForSingleObject,
-	_WriteConsoleW,
-	_WriteFile stdFunction
-
-	// Following syscalls are only available on some Windows PCs.
-	// We will load syscalls, if available, before using them.
-	_AddDllDirectory,
-	_AddVectoredContinueHandler,
-	_GetQueuedCompletionStatusEx,
-	_LoadLibraryExW,
-	_ stdFunction
-)
-
-type sigset struct{}
-
-// Call a Windows function with stdcall conventions,
-// and switch to os stack during the call.
-func asmstdcall(fn unsafe.Pointer)
-
-var asmstdcallAddr unsafe.Pointer
-
-func windowsFindfunc(name []byte, lib uintptr) stdFunction {
-	f := stdcall2(_GetProcAddress, lib, uintptr(unsafe.Pointer(&name[0])))
-	return stdFunction(unsafe.Pointer(f))
-}
-
-func loadOptionalSyscalls() {
-	var (
-		kernel32dll                 = []byte("kernel32.dll\000")
-		addVectoredContinueHandler  = []byte("AddVectoredContinueHandler\000")
-		getQueuedCompletionStatusEx = []byte("GetQueuedCompletionStatusEx\000")
-		addDllDirectory             = []byte("AddDllDirectory\000")
-		loadLibraryExW              = []byte("LoadLibraryExW\000")
-	)
-
-	k32 := stdcall1(_LoadLibraryA, uintptr(unsafe.Pointer(&kernel32dll[0])))
-	if k32 == 0 {
-		throw("kernel32.dll not found")
-	}
-	_AddDllDirectory = windowsFindfunc(addDllDirectory, k32)
-	_AddVectoredContinueHandler = windowsFindfunc(addVectoredContinueHandler, k32)
-	_GetQueuedCompletionStatusEx = windowsFindfunc(getQueuedCompletionStatusEx, k32)
-	_LoadLibraryExW = windowsFindfunc(loadLibraryExW, k32)
-}
-
-//go:nosplit
-func getLoadLibrary() uintptr {
-	return uintptr(unsafe.Pointer(_LoadLibraryW))
-}
-
-//go:nosplit
-func getLoadLibraryEx() uintptr {
-	return uintptr(unsafe.Pointer(_LoadLibraryExW))
-}
-
-//go:nosplit
-func getGetProcAddress() uintptr {
-	return uintptr(unsafe.Pointer(_GetProcAddress))
-}
-
-func getproccount() int32 {
-	var mask, sysmask uintptr
-	ret := stdcall3(_GetProcessAffinityMask, currentProcess, uintptr(unsafe.Pointer(&mask)), uintptr(unsafe.Pointer(&sysmask)))
-	if ret != 0 {
-		n := 0
-		maskbits := int(unsafe.Sizeof(mask) * 8)
-		for i := 0; i < maskbits; i++ {
-			if mask&(1<<uint(i)) != 0 {
-				n++
-			}
-		}
-		if n != 0 {
-			return int32(n)
-		}
-	}
-	// use GetSystemInfo if GetProcessAffinityMask fails
-	var info systeminfo
-	stdcall1(_GetSystemInfo, uintptr(unsafe.Pointer(&info)))
-	return int32(info.dwnumberofprocessors)
-}
-
-const (
-	currentProcess = ^uintptr(0) // -1 = current process
-	currentThread  = ^uintptr(1) // -2 = current thread
-)
-
-// in sys_windows_386.s and sys_windows_amd64.s
-func externalthreadhandler()
-
-// When loading DLLs, we prefer to use LoadLibraryEx with
-// LOAD_LIBRARY_SEARCH_* flags, if available. LoadLibraryEx is not
-// available on old Windows, though, and the LOAD_LIBRARY_SEARCH_*
-// flags are not available on some versions of Windows without a
-// security patch.
-//
-// https://msdn.microsoft.com/en-us/library/ms684179(v=vs.85).aspx says:
-// "Windows 7, Windows Server 2008 R2, Windows Vista, and Windows
-// Server 2008: The LOAD_LIBRARY_SEARCH_* flags are available on
-// systems that have KB2533623 installed. To determine whether the
-// flags are available, use GetProcAddress to get the address of the
-// AddDllDirectory, RemoveDllDirectory, or SetDefaultDllDirectories
-// function. If GetProcAddress succeeds, the LOAD_LIBRARY_SEARCH_*
-// flags can be used with LoadLibraryEx."
-var useLoadLibraryEx bool
-
-func osinit() {
-	asmstdcallAddr = unsafe.Pointer(funcPC(asmstdcall))
-	usleep2Addr = unsafe.Pointer(funcPC(usleep2))
-	switchtothreadAddr = unsafe.Pointer(funcPC(switchtothread))
-
-	setBadSignalMsg()
-
-	loadOptionalSyscalls()
-
-	useLoadLibraryEx = (_LoadLibraryExW != nil && _AddDllDirectory != nil)
-
-	disableWER()
-
-	externalthreadhandlerp = funcPC(externalthreadhandler)
-
-	initExceptionHandler()
-
-	stdcall2(_SetConsoleCtrlHandler, funcPC(ctrlhandler), 1)
-
-	ncpu = getproccount()
-
-	// Windows dynamic priority boosting assumes that a process has different types
-	// of dedicated threads -- GUI, IO, computational, etc. Go processes use
-	// equivalent threads that all do a mix of GUI, IO, computations, etc.
-	// In such context dynamic priority boosting does nothing but harm, so we turn it off.
-	stdcall2(_SetProcessPriorityBoost, currentProcess, 1)
-}
-
-//go:nosplit
-func getRandomData(r []byte) {
-	const (
-		prov_rsa_full       = 1
-		crypt_verifycontext = 0xF0000000
-	)
-	var handle uintptr
-	n := 0
-	if stdcall5(_CryptAcquireContextW, uintptr(unsafe.Pointer(&handle)), 0, 0, prov_rsa_full, crypt_verifycontext) != 0 {
-		if stdcall3(_CryptGenRandom, handle, uintptr(len(r)), uintptr(unsafe.Pointer(&r[0]))) != 0 {
-			n = len(r)
-		}
-		stdcall2(_CryptReleaseContext, handle, 0)
-	}
-	extendRandom(r, n)
-}
-
-func goenvs() {
-	// strings is a pointer to environment variable pairs in the form:
-	//     "envA=valA\x00envB=valB\x00\x00" (in UTF-16)
-	// Two consecutive zero bytes end the list.
-	strings := unsafe.Pointer(stdcall0(_GetEnvironmentStringsW))
-	p := (*[1 << 24]uint16)(strings)[:]
-
-	n := 0
-	for from, i := 0, 0; true; i++ {
-		if p[i] == 0 {
-			// empty string marks the end
-			if i == from {
-				break
-			}
-			from = i + 1
-			n++
-		}
-	}
-	envs = make([]string, n)
-
-	for i := range envs {
-		envs[i] = gostringw(&p[0])
-		for p[0] != 0 {
-			p = p[1:]
-		}
-		p = p[1:] // skip nil byte
-	}
-
-	stdcall1(_FreeEnvironmentStringsW, uintptr(strings))
-}
-
-//go:nosplit
-func exit(code int32) {
-	stdcall1(_ExitProcess, uintptr(code))
-}
-
-//go:nosplit
-func write(fd uintptr, buf unsafe.Pointer, n int32) int32 {
-	const (
-		_STD_OUTPUT_HANDLE = ^uintptr(10) // -11
-		_STD_ERROR_HANDLE  = ^uintptr(11) // -12
-	)
-	var handle uintptr
-	switch fd {
-	case 1:
-		handle = stdcall1(_GetStdHandle, _STD_OUTPUT_HANDLE)
-	case 2:
-		handle = stdcall1(_GetStdHandle, _STD_ERROR_HANDLE)
-	default:
-		// assume fd is real windows handle.
-		handle = fd
-	}
-	isASCII := true
-	b := (*[1 << 30]byte)(buf)[:n]
-	for _, x := range b {
-		if x >= 0x80 {
-			isASCII = false
-			break
-		}
-	}
-
-	if !isASCII {
-		var m uint32
-		isConsole := stdcall2(_GetConsoleMode, handle, uintptr(unsafe.Pointer(&m))) != 0
-		// If this is a console output, various non-unicode code pages can be in use.
-		// Use the dedicated WriteConsole call to ensure unicode is printed correctly.
-		if isConsole {
-			return int32(writeConsole(handle, buf, n))
-		}
-	}
-	var written uint32
-	stdcall5(_WriteFile, handle, uintptr(buf), uintptr(n), uintptr(unsafe.Pointer(&written)), 0)
-	return int32(written)
-}
-
-var (
-	utf16ConsoleBack     [1000]uint16
-	utf16ConsoleBackLock mutex
-)
-
-// writeConsole writes bufLen bytes from buf to the console File.
-// It returns the number of bytes written.
-func writeConsole(handle uintptr, buf unsafe.Pointer, bufLen int32) int {
-	const surr2 = (surrogateMin + surrogateMax + 1) / 2
-
-	// Do not use defer for unlock. May cause issues when printing a panic.
-	lock(&utf16ConsoleBackLock)
-
-	b := (*[1 << 30]byte)(buf)[:bufLen]
-	s := *(*string)(unsafe.Pointer(&b))
-
-	utf16tmp := utf16ConsoleBack[:]
-
-	total := len(s)
-	w := 0
-	for len(s) > 0 {
-		if w >= len(utf16tmp)-2 {
-			writeConsoleUTF16(handle, utf16tmp[:w])
-			w = 0
-		}
-		r, n := charntorune(s)
-		s = s[n:]
-		if r < 0x10000 {
-			utf16tmp[w] = uint16(r)
-			w++
-		} else {
-			r -= 0x10000
-			utf16tmp[w] = surrogateMin + uint16(r>>10)&0x3ff
-			utf16tmp[w+1] = surr2 + uint16(r)&0x3ff
-			w += 2
-		}
-	}
-	writeConsoleUTF16(handle, utf16tmp[:w])
-	unlock(&utf16ConsoleBackLock)
-	return total
-}
-
-// writeConsoleUTF16 is the dedicated windows calls that correctly prints
-// to the console regardless of the current code page. Input is utf-16 code points.
-// The handle must be a console handle.
-func writeConsoleUTF16(handle uintptr, b []uint16) {
-	l := uint32(len(b))
-	if l == 0 {
-		return
-	}
-	var written uint32
-	stdcall5(_WriteConsoleW,
-		handle,
-		uintptr(unsafe.Pointer(&b[0])),
-		uintptr(l),
-		uintptr(unsafe.Pointer(&written)),
-		0,
-	)
-	return
-}
-
-//go:nosplit
-func semasleep(ns int64) int32 {
-	// store ms in ns to save stack space
-	if ns < 0 {
-		ns = _INFINITE
-	} else {
-		ns = int64(timediv(ns, 1000000, nil))
-		if ns == 0 {
-			ns = 1
-		}
-	}
-	if stdcall2(_WaitForSingleObject, getg().m.waitsema, uintptr(ns)) != 0 {
-		return -1 // timeout
-	}
-	return 0
-}
-
-//go:nosplit
-func semawakeup(mp *m) {
-	stdcall1(_SetEvent, mp.waitsema)
-}
-
-//go:nosplit
-func semacreate(mp *m) {
-	if mp.waitsema != 0 {
-		return
-	}
-	mp.waitsema = stdcall4(_CreateEventA, 0, 0, 0, 0)
-}
-
-// May run with m.p==nil, so write barriers are not allowed. This
-// function is called by newosproc0, so it is also required to
-// operate without stack guards.
-//go:nowritebarrierc
-//go:nosplit
-func newosproc(mp *m, stk unsafe.Pointer) {
-	const _STACK_SIZE_PARAM_IS_A_RESERVATION = 0x00010000
-	thandle := stdcall6(_CreateThread, 0, 0x20000,
-		funcPC(tstart_stdcall), uintptr(unsafe.Pointer(mp)),
-		_STACK_SIZE_PARAM_IS_A_RESERVATION, 0)
-	if thandle == 0 {
-		print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", getlasterror(), ")\n")
-		throw("runtime.newosproc")
-	}
-}
-
-// Used by the C library build mode. On Linux this function would allocate a
-// stack, but that's not necessary for Windows. No stack guards are present
-// and the GC has not been initialized, so write barriers will fail.
-//go:nowritebarrierc
-//go:nosplit
-func newosproc0(mp *m, stk unsafe.Pointer) {
-	newosproc(mp, stk)
-}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
-func mpreinit(mp *m) {
-}
-
-//go:nosplit
-func msigsave(mp *m) {
-}
-
-//go:nosplit
-func msigrestore(sigmask sigset) {
-}
-
-//go:nosplit
-func sigblock() {
-}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the new thread, cannot allocate memory.
-func minit() {
-	var thandle uintptr
-	stdcall7(_DuplicateHandle, currentProcess, currentThread, currentProcess, uintptr(unsafe.Pointer(&thandle)), 0, 0, _DUPLICATE_SAME_ACCESS)
-	atomic.Storeuintptr(&getg().m.thread, thandle)
-}
-
-// Called from dropm to undo the effect of an minit.
-//go:nosplit
-func unminit() {
-	tp := &getg().m.thread
-	stdcall1(_CloseHandle, *tp)
-	*tp = 0
-}
-
-// Described in http://www.dcl.hpi.uni-potsdam.de/research/WRK/2007/08/getting-os-information-the-kuser_shared_data-structure/
-type _KSYSTEM_TIME struct {
-	LowPart   uint32
-	High1Time int32
-	High2Time int32
-}
-
-const (
-	_INTERRUPT_TIME = 0x7ffe0008
-	_SYSTEM_TIME    = 0x7ffe0014
-)
-
-//go:nosplit
-func systime(addr uintptr) int64 {
-	timeaddr := (*_KSYSTEM_TIME)(unsafe.Pointer(addr))
-
-	var t _KSYSTEM_TIME
-	for i := 1; i < 10000; i++ {
-		// these fields must be read in that order (see URL above)
-		t.High1Time = timeaddr.High1Time
-		t.LowPart = timeaddr.LowPart
-		t.High2Time = timeaddr.High2Time
-		if t.High1Time == t.High2Time {
-			return int64(t.High1Time)<<32 | int64(t.LowPart)
-		}
-		if (i % 100) == 0 {
-			osyield()
-		}
-	}
-	systemstack(func() {
-		throw("interrupt/system time is changing too fast")
-	})
-	return 0
-}
-
-//go:nosplit
-func unixnano() int64 {
-	return (systime(_SYSTEM_TIME) - 116444736000000000) * 100
-}
-
-//go:nosplit
-func nanotime() int64 {
-	return systime(_INTERRUPT_TIME) * 100
-}
-
-// Calling stdcall on os stack.
-// May run during STW, so write barriers are not allowed.
-//go:nowritebarrier
-//go:nosplit
-func stdcall(fn stdFunction) uintptr {
-	gp := getg()
-	mp := gp.m
-	mp.libcall.fn = uintptr(unsafe.Pointer(fn))
-
-	if mp.profilehz != 0 {
-		// leave pc/sp for cpu profiler
-		mp.libcallg.set(gp)
-		mp.libcallpc = getcallerpc(unsafe.Pointer(&fn))
-		// sp must be the last, because once async cpu profiler finds
-		// all three values to be non-zero, it will use them
-		mp.libcallsp = getcallersp(unsafe.Pointer(&fn))
-	}
-	asmcgocall(asmstdcallAddr, unsafe.Pointer(&mp.libcall))
-	mp.libcallsp = 0
-	return mp.libcall.r1
-}
-
-//go:nosplit
-func stdcall0(fn stdFunction) uintptr {
-	mp := getg().m
-	mp.libcall.n = 0
-	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&fn))) // it's unused but must be non-nil, otherwise crashes
-	return stdcall(fn)
-}
-
-//go:nosplit
-func stdcall1(fn stdFunction, a0 uintptr) uintptr {
-	mp := getg().m
-	mp.libcall.n = 1
-	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
-	return stdcall(fn)
-}
-
-//go:nosplit
-func stdcall2(fn stdFunction, a0, a1 uintptr) uintptr {
-	mp := getg().m
-	mp.libcall.n = 2
-	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
-	return stdcall(fn)
-}
-
-//go:nosplit
-func stdcall3(fn stdFunction, a0, a1, a2 uintptr) uintptr {
-	mp := getg().m
-	mp.libcall.n = 3
-	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
-	return stdcall(fn)
-}
-
-//go:nosplit
-func stdcall4(fn stdFunction, a0, a1, a2, a3 uintptr) uintptr {
-	mp := getg().m
-	mp.libcall.n = 4
-	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
-	return stdcall(fn)
-}
-
-//go:nosplit
-func stdcall5(fn stdFunction, a0, a1, a2, a3, a4 uintptr) uintptr {
-	mp := getg().m
-	mp.libcall.n = 5
-	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
-	return stdcall(fn)
-}
-
-//go:nosplit
-func stdcall6(fn stdFunction, a0, a1, a2, a3, a4, a5 uintptr) uintptr {
-	mp := getg().m
-	mp.libcall.n = 6
-	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
-	return stdcall(fn)
-}
-
-//go:nosplit
-func stdcall7(fn stdFunction, a0, a1, a2, a3, a4, a5, a6 uintptr) uintptr {
-	mp := getg().m
-	mp.libcall.n = 7
-	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
-	return stdcall(fn)
-}
-
-// in sys_windows_386.s and sys_windows_amd64.s
-func onosstack(fn unsafe.Pointer, arg uint32)
-func usleep2(usec uint32)
-func switchtothread()
-
-var usleep2Addr unsafe.Pointer
-var switchtothreadAddr unsafe.Pointer
-
-//go:nosplit
-func osyield() {
-	onosstack(switchtothreadAddr, 0)
-}
-
-//go:nosplit
-func usleep(us uint32) {
-	// Have 1us units; want 100ns units.
-	onosstack(usleep2Addr, 10*us)
-}
-
-func ctrlhandler1(_type uint32) uint32 {
-	var s uint32
-
-	switch _type {
-	case _CTRL_C_EVENT, _CTRL_BREAK_EVENT:
-		s = _SIGINT
-	default:
-		return 0
-	}
-
-	if sigsend(s) {
-		return 1
-	}
-	exit(2) // SIGINT, SIGTERM, etc
-	return 0
-}
-
-// in sys_windows_386.s and sys_windows_amd64.s
-func profileloop()
-
-var profiletimer uintptr
-
-func profilem(mp *m) {
-	var r *context
-	rbuf := make([]byte, unsafe.Sizeof(*r)+15)
-
-	tls := &mp.tls[0]
-	gp := *((**g)(unsafe.Pointer(tls)))
-
-	// align Context to 16 bytes
-	r = (*context)(unsafe.Pointer((uintptr(unsafe.Pointer(&rbuf[15]))) &^ 15))
-	r.contextflags = _CONTEXT_CONTROL
-	stdcall2(_GetThreadContext, mp.thread, uintptr(unsafe.Pointer(r)))
-	sigprof(r.ip(), r.sp(), 0, gp, mp)
-}
-
-func profileloop1(param uintptr) uint32 {
-	stdcall2(_SetThreadPriority, currentThread, _THREAD_PRIORITY_HIGHEST)
-
-	for {
-		stdcall2(_WaitForSingleObject, profiletimer, _INFINITE)
-		first := (*m)(atomic.Loadp(unsafe.Pointer(&allm)))
-		for mp := first; mp != nil; mp = mp.alllink {
-			thread := atomic.Loaduintptr(&mp.thread)
-			// Do not profile threads blocked on Notes,
-			// this includes idle worker threads,
-			// idle timer thread, idle heap scavenger, etc.
-			if thread == 0 || mp.profilehz == 0 || mp.blocked {
-				continue
-			}
-			stdcall1(_SuspendThread, thread)
-			if mp.profilehz != 0 && !mp.blocked {
-				profilem(mp)
-			}
-			stdcall1(_ResumeThread, thread)
-		}
-	}
-}
-
-var cpuprofilerlock mutex
-
-func resetcpuprofiler(hz int32) {
-	lock(&cpuprofilerlock)
-	if profiletimer == 0 {
-		timer := stdcall3(_CreateWaitableTimerA, 0, 0, 0)
-		atomic.Storeuintptr(&profiletimer, timer)
-		thread := stdcall6(_CreateThread, 0, 0, funcPC(profileloop), 0, 0, 0)
-		stdcall2(_SetThreadPriority, thread, _THREAD_PRIORITY_HIGHEST)
-		stdcall1(_CloseHandle, thread)
-	}
-	unlock(&cpuprofilerlock)
-
-	ms := int32(0)
-	due := ^int64(^uint64(1 << 63))
-	if hz > 0 {
-		ms = 1000 / hz
-		if ms == 0 {
-			ms = 1
-		}
-		due = int64(ms) * -10000
-	}
-	stdcall6(_SetWaitableTimer, profiletimer, uintptr(unsafe.Pointer(&due)), uintptr(ms), 0, 0, 0)
-	atomic.Store((*uint32)(unsafe.Pointer(&getg().m.profilehz)), uint32(hz))
-}
-
-func memlimit() uintptr {
-	return 0
-}
diff --git a/src/runtime/os2_darwin.go b/src/runtime/os2_darwin.go
deleted file mode 100644
index 542bd74219..0000000000
--- a/src/runtime/os2_darwin.go
+++ /dev/null
@@ -1,14 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-const (
-	_NSIG        = 32
-	_SI_USER     = 0 /* empirically true, but not what headers say */
-	_SIG_BLOCK   = 1
-	_SIG_UNBLOCK = 2
-	_SIG_SETMASK = 3
-	_SS_DISABLE  = 4
-)
diff --git a/src/runtime/os2_dragonfly.go b/src/runtime/os2_dragonfly.go
deleted file mode 100644
index 6ea2da0393..0000000000
--- a/src/runtime/os2_dragonfly.go
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-const (
-	_NSIG        = 33
-	_SI_USER     = 0
-	_SS_DISABLE  = 4
-	_RLIMIT_AS   = 10
-	_SIG_BLOCK   = 1
-	_SIG_UNBLOCK = 2
-	_SIG_SETMASK = 3
-)
diff --git a/src/runtime/os2_linux_mips64x.go b/src/runtime/os2_linux_mips64x.go
deleted file mode 100644
index 9a6a92a87d..0000000000
--- a/src/runtime/os2_linux_mips64x.go
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build linux
-// +build mips64 mips64le
-
-package runtime
-
-const (
-	_SS_DISABLE  = 2
-	_NSIG        = 65
-	_SI_USER     = 0
-	_SIG_BLOCK   = 1
-	_SIG_UNBLOCK = 2
-	_SIG_SETMASK = 3
-	_RLIMIT_AS   = 6
-)
-
-type sigset [2]uint64
-
-type rlimit struct {
-	rlim_cur uintptr
-	rlim_max uintptr
-}
diff --git a/src/runtime/os2_netbsd.go b/src/runtime/os2_netbsd.go
deleted file mode 100644
index 405dd5e727..0000000000
--- a/src/runtime/os2_netbsd.go
+++ /dev/null
@@ -1,18 +0,0 @@
-// Copyright 2010 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-const (
-	_SS_DISABLE  = 4
-	_SIG_BLOCK   = 1
-	_SIG_UNBLOCK = 2
-	_SIG_SETMASK = 3
-	_NSIG        = 33
-	_SI_USER     = 0
-
-	// From NetBSD's <sys/ucontext.h>
-	_UC_SIGMASK = 0x01
-	_UC_CPU     = 0x04
-)
diff --git a/src/runtime/os2_windows.go b/src/runtime/os2_windows.go
deleted file mode 100644
index a867dfeb64..0000000000
--- a/src/runtime/os2_windows.go
+++ /dev/null
@@ -1,19 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-func getlasterror() uint32
-func setlasterror(err uint32)
-
-// Function to be called by windows CreateThread
-// to start new os thread.
-func tstart_stdcall(newm *m) uint32
-
-func ctrlhandler(_type uint32) uint32
-
-// TODO(brainman): should not need those
-const (
-	_NSIG = 65
-)
diff --git a/src/runtime/os_darwin.go b/src/runtime/os_darwin.go
index e9b8933fb9..a0e3d8ed6b 100644
--- a/src/runtime/os_darwin.go
+++ b/src/runtime/os_darwin.go
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors. All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -24,6 +24,476 @@ func mach_thread_self() uint32
 //go:noescape
 func sysctl(mib *uint32, miblen uint32, out *byte, size *uintptr, dst *byte, ndst uintptr) int32
 
+func unimplemented(name string) {
+	println(name, "not implemented")
+	*(*int)(unsafe.Pointer(uintptr(1231))) = 1231
+}
+
+//go:nosplit
+func semawakeup(mp *m) {
+	mach_semrelease(mp.waitsema)
+}
+
+//go:nosplit
+func semacreate(mp *m) {
+	if mp.waitsema != 0 {
+		return
+	}
+	systemstack(func() {
+		mp.waitsema = mach_semcreate()
+	})
+}
+
+// BSD interface for threading.
+func osinit() {
+	// bsdthread_register delayed until end of goenvs so that we
+	// can look at the environment first.
+
+	ncpu = getncpu()
+}
+
+func getncpu() int32 {
+	// Use sysctl to fetch hw.ncpu; the two-element mib below names that value.
+	mib := [2]uint32{6, 3}
+	out := uint32(0)
+	nout := unsafe.Sizeof(out)
+	ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0)
+	if ret >= 0 && int32(out) > 0 { // trust only a successful call that reports a positive count
+		return int32(out)
+	}
+	return 1 // sysctl failed or reported a non-positive count: fall back to one CPU
+}
+
+var urandom_dev = []byte("/dev/urandom\x00")
+
+//go:nosplit
+func getRandomData(r []byte) {
+	fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
+	n := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
+	closefd(fd)
+	extendRandom(r, int(n))
+}
+
+func goenvs() {
+	goenvs_unix()
+
+	// Register our thread-creation callback (see sys_darwin_{amd64,386}.s)
+	// but only if we're not using cgo. If we are using cgo we need
+	// to let the C pthread library install its own thread-creation callback.
+	if !iscgo {
+		if bsdthread_register() != 0 {
+			if gogetenv("DYLD_INSERT_LIBRARIES") != "" {
+				throw("runtime: bsdthread_register error (unset DYLD_INSERT_LIBRARIES)")
+			}
+			throw("runtime: bsdthread_register error")
+		}
+	}
+}
+
+// May run with m.p==nil, so write barriers are not allowed.
+//go:nowritebarrier
+func newosproc(mp *m, stk unsafe.Pointer) {
+	if false {
+		print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " id=", mp.id, " ostk=", &mp, "\n")
+	}
+
+	var oset sigset
+	sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
+	errno := bsdthread_create(stk, unsafe.Pointer(mp), funcPC(mstart))
+	sigprocmask(_SIG_SETMASK, &oset, nil)
+
+	if errno < 0 {
+		print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", -errno, ")\n")
+		throw("runtime.newosproc")
+	}
+}
+
+// newosproc0 is a version of newosproc that can be called before the runtime
+// is initialized.
+//
+// As Go uses bsdthread_register when running without cgo, this function is
+// not safe to use after initialization as it does not pass an M as fnarg.
+//
+//go:nosplit
+func newosproc0(stacksize uintptr, fn unsafe.Pointer, fnarg uintptr) {
+	stack := sysAlloc(stacksize, &memstats.stacks_sys)
+	if stack == nil {
+		write(2, unsafe.Pointer(&failallocatestack[0]), int32(len(failallocatestack)))
+		exit(1)
+	}
+	stk := unsafe.Pointer(uintptr(stack) + stacksize)
+
+	var oset sigset
+	sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
+	errno := bsdthread_create(stk, fn, fnarg)
+	sigprocmask(_SIG_SETMASK, &oset, nil)
+
+	if errno < 0 {
+		write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
+		exit(1)
+	}
+}
+
+var failallocatestack = []byte("runtime: failed to allocate stack for the new OS thread\n")
+var failthreadcreate = []byte("runtime: failed to create new OS thread\n")
+
+// Called to do synchronous initialization of Go code built with
+// -buildmode=c-archive or -buildmode=c-shared.
+// None of the Go runtime is initialized.
+//go:nosplit
+//go:nowritebarrierrec
+func libpreinit() {
+	initsig(true)
+}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
+func mpreinit(mp *m) {
+	mp.gsignal = malg(32 * 1024) // OS X wants >= 8K
+	mp.gsignal.m = mp
+}
+
+//go:nosplit
+func msigsave(mp *m) {
+	sigprocmask(_SIG_SETMASK, nil, &mp.sigmask)
+}
+
+//go:nosplit
+func msigrestore(sigmask sigset) {
+	sigprocmask(_SIG_SETMASK, &sigmask, nil)
+}
+
+//go:nosplit
+func sigblock() {
+	sigprocmask(_SIG_SETMASK, &sigset_all, nil)
+}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the new thread, cannot allocate memory.
+func minit() {
+	// Initialize signal handling.
+	_g_ := getg()
+
+	// The alternate signal stack is buggy on arm and arm64.
+	// The signal handler handles it directly.
+	// The sigaltstack assembly function does nothing.
+	if GOARCH != "arm" && GOARCH != "arm64" {
+		var st stackt
+		sigaltstack(nil, &st)
+		if st.ss_flags&_SS_DISABLE != 0 {
+			signalstack(&_g_.m.gsignal.stack)
+			_g_.m.newSigstack = true
+		} else {
+			// Use existing signal stack.
+			stsp := uintptr(unsafe.Pointer(st.ss_sp))
+			_g_.m.gsignal.stack.lo = stsp
+			_g_.m.gsignal.stack.hi = stsp + st.ss_size
+			_g_.m.gsignal.stackguard0 = stsp + _StackGuard
+			_g_.m.gsignal.stackguard1 = stsp + _StackGuard
+			_g_.m.gsignal.stackAlloc = st.ss_size
+			_g_.m.newSigstack = false
+		}
+	}
+
+	// restore signal mask from m.sigmask and unblock essential signals
+	nmask := _g_.m.sigmask
+	for i := range sigtable {
+		if sigtable[i].flags&_SigUnblock != 0 {
+			nmask &^= 1 << (uint32(i) - 1)
+		}
+	}
+	sigprocmask(_SIG_SETMASK, &nmask, nil)
+}
+
+// Called from dropm to undo the effect of an minit.
+//go:nosplit
+func unminit() {
+	if getg().m.newSigstack {
+		signalstack(nil)
+	}
+}
+
+// Mach IPC, to get at semaphores
+// Definitions are in /usr/include/mach on a Mac.
+
+func macherror(r int32, fn string) {
+	print("mach error ", fn, ": ", r, "\n")
+	throw("mach error")
+}
+
+const _DebugMach = false
+
+var zerondr machndr
+
+func mach_msgh_bits(a, b uint32) uint32 {
+	return a | b<<8
+}
+
+func mach_msg(h *machheader, op int32, send_size, rcv_size, rcv_name, timeout, notify uint32) int32 {
+	// TODO: Loop on interrupt.
+	return mach_msg_trap(unsafe.Pointer(h), op, send_size, rcv_size, rcv_name, timeout, notify)
+}
+
+// Mach RPC (MIG)
+const (
+	_MinMachMsg = 48
+	_MachReply  = 100
+)
+
+type codemsg struct {
+	h    machheader
+	ndr  machndr
+	code int32
+}
+
+func machcall(h *machheader, maxsize int32, rxsize int32) int32 {
+	_g_ := getg()
+	port := _g_.m.machport
+	if port == 0 {
+		port = mach_reply_port()
+		_g_.m.machport = port
+	}
+
+	h.msgh_bits |= mach_msgh_bits(_MACH_MSG_TYPE_COPY_SEND, _MACH_MSG_TYPE_MAKE_SEND_ONCE)
+	h.msgh_local_port = port
+	h.msgh_reserved = 0
+	id := h.msgh_id
+
+	if _DebugMach {
+		p := (*[10000]unsafe.Pointer)(unsafe.Pointer(h))
+		print("send:\t")
+		var i uint32
+		for i = 0; i < h.msgh_size/uint32(unsafe.Sizeof(p[0])); i++ {
+			print(" ", p[i])
+			if i%8 == 7 {
+				print("\n\t")
+			}
+		}
+		if i%8 != 0 {
+			print("\n")
+		}
+	}
+	ret := mach_msg(h, _MACH_SEND_MSG|_MACH_RCV_MSG, h.msgh_size, uint32(maxsize), port, 0, 0)
+	if ret != 0 {
+		if _DebugMach {
+			print("mach_msg error ", ret, "\n")
+		}
+		return ret
+	}
+	if _DebugMach {
+		p := (*[10000]unsafe.Pointer)(unsafe.Pointer(h))
+		var i uint32
+		for i = 0; i < h.msgh_size/uint32(unsafe.Sizeof(p[0])); i++ {
+			print(" ", p[i])
+			if i%8 == 7 {
+				print("\n\t")
+			}
+		}
+		if i%8 != 0 {
+			print("\n")
+		}
+	}
+	if h.msgh_id != id+_MachReply {
+		if _DebugMach {
+			print("mach_msg _MachReply id mismatch ", h.msgh_id, " != ", id+_MachReply, "\n")
+		}
+		return -303 // MIG_REPLY_MISMATCH
+	}
+	// Look for a response giving the return value.
+	// Any call can send this back with an error,
+	// and some calls only have return values so they
+	// send it back on success too. I don't quite see how
+	// you know it's one of these and not the full response
+	// format, so just look if the message is right.
+	c := (*codemsg)(unsafe.Pointer(h))
+	if uintptr(h.msgh_size) == unsafe.Sizeof(*c) && h.msgh_bits&_MACH_MSGH_BITS_COMPLEX == 0 {
+		if _DebugMach {
+			print("mig result ", c.code, "\n")
+		}
+		return c.code
+	}
+	if h.msgh_size != uint32(rxsize) {
+		if _DebugMach {
+			print("mach_msg _MachReply size mismatch ", h.msgh_size, " != ", rxsize, "\n")
+		}
+		return -307 // MIG_ARRAY_TOO_LARGE
+	}
+	return 0
+}
+
+// Semaphores!
+
+const (
+	tmach_semcreate = 3418
+	rmach_semcreate = tmach_semcreate + _MachReply
+
+	tmach_semdestroy = 3419
+	rmach_semdestroy = tmach_semdestroy + _MachReply
+
+	_KERN_ABORTED             = 14
+	_KERN_OPERATION_TIMED_OUT = 49
+)
+
+type tmach_semcreatemsg struct {
+	h      machheader
+	ndr    machndr
+	policy int32
+	value  int32
+}
+
+type rmach_semcreatemsg struct {
+	h         machheader
+	body      machbody
+	semaphore machport
+}
+
+type tmach_semdestroymsg struct {
+	h         machheader
+	body      machbody
+	semaphore machport
+}
+
+func mach_semcreate() uint32 {
+	var m [256]uint8
+	tx := (*tmach_semcreatemsg)(unsafe.Pointer(&m))
+	rx := (*rmach_semcreatemsg)(unsafe.Pointer(&m))
+
+	tx.h.msgh_bits = 0
+	tx.h.msgh_size = uint32(unsafe.Sizeof(*tx))
+	tx.h.msgh_remote_port = mach_task_self()
+	tx.h.msgh_id = tmach_semcreate
+	tx.ndr = zerondr
+
+	tx.policy = 0 // 0 = SYNC_POLICY_FIFO
+	tx.value = 0
+
+	for {
+		r := machcall(&tx.h, int32(unsafe.Sizeof(m)), int32(unsafe.Sizeof(*rx)))
+		if r == 0 {
+			break
+		}
+		if r == _KERN_ABORTED { // interrupted
+			continue
+		}
+		macherror(r, "semaphore_create")
+	}
+	if rx.body.msgh_descriptor_count != 1 {
+		unimplemented("mach_semcreate desc count")
+	}
+	return rx.semaphore.name
+}
+
+func mach_semdestroy(sem uint32) {
+	var m [256]uint8
+	tx := (*tmach_semdestroymsg)(unsafe.Pointer(&m))
+
+	tx.h.msgh_bits = _MACH_MSGH_BITS_COMPLEX
+	tx.h.msgh_size = uint32(unsafe.Sizeof(*tx))
+	tx.h.msgh_remote_port = mach_task_self()
+	tx.h.msgh_id = tmach_semdestroy
+	tx.body.msgh_descriptor_count = 1
+	tx.semaphore.name = sem
+	tx.semaphore.disposition = _MACH_MSG_TYPE_MOVE_SEND
+	tx.semaphore._type = 0
+
+	for {
+		r := machcall(&tx.h, int32(unsafe.Sizeof(m)), 0)
+		if r == 0 {
+			break
+		}
+		if r == _KERN_ABORTED { // interrupted
+			continue
+		}
+		macherror(r, "semaphore_destroy")
+	}
+}
+
+// The other calls have simple system call traps in sys_darwin_{amd64,386}.s
+
+func mach_semaphore_wait(sema uint32) int32
+func mach_semaphore_timedwait(sema, sec, nsec uint32) int32
+func mach_semaphore_signal(sema uint32) int32
+func mach_semaphore_signal_all(sema uint32) int32
+
+func semasleep1(ns int64) int32 {
+	_g_ := getg()
+
+	if ns >= 0 { // bounded wait on this m's semaphore
+		var nsecs int32
+		secs := timediv(ns, 1000000000, &nsecs)
+		r := mach_semaphore_timedwait(_g_.m.waitsema, uint32(secs), uint32(nsecs))
+		if r == _KERN_ABORTED || r == _KERN_OPERATION_TIMED_OUT {
+			return -1 // interrupted or timed out
+		}
+		if r != 0 {
+			macherror(r, "semaphore_wait") // NOTE(review): this is the timedwait path, yet the label reads semaphore_wait
+		}
+		return 0
+	}
+
+	for { // ns < 0: wait without a timeout, retrying whenever the wait is interrupted
+		r := mach_semaphore_wait(_g_.m.waitsema)
+		if r == 0 {
+			break
+		}
+		if r == _KERN_ABORTED { // interrupted
+			continue
+		}
+		macherror(r, "semaphore_wait")
+	}
+	return 0
+}
+
+//go:nosplit
+func semasleep(ns int64) int32 {
+	var r int32
+	systemstack(func() {
+		r = semasleep1(ns)
+	})
+	return r
+}
+
+//go:nosplit
+func mach_semrelease(sem uint32) {
+	for {
+		r := mach_semaphore_signal(sem)
+		if r == 0 {
+			break
+		}
+		if r == _KERN_ABORTED { // interrupted
+			continue
+		}
+
+		// mach_semrelease must be completely nosplit,
+		// because it is called from Go code.
+		// If we're going to die, start that process on the system stack
+		// to avoid a Go stack split.
+		systemstack(func() { macherror(r, "semaphore_signal") })
+	}
+}
+
+//go:nosplit
+func osyield() {
+	usleep(1)
+}
+
+func memlimit() uintptr {
+	// NOTE(rsc): Could use getrlimit here,
+	// like on FreeBSD or Linux, but Darwin doesn't enforce
+	// ulimit -v, so it's unclear why we'd try to stay within
+	// the limit.
+	return 0
+}
+
+const (
+	_NSIG        = 32
+	_SI_USER     = 0 /* empirically true, but not what headers say */
+	_SIG_BLOCK   = 1
+	_SIG_UNBLOCK = 2
+	_SIG_SETMASK = 3
+	_SS_DISABLE  = 4
+)
+
 //go:noescape
 func sigprocmask(how uint32, new, old *sigset)
 
@@ -40,3 +510,73 @@ func setitimer(mode int32, new, old *itimerval)
 
 func raise(sig int32)
 func raiseproc(int32)
+
+//extern SigTabTT runtime·sigtab[];
+
+type sigset uint32
+
+var sigset_all = ^sigset(0)
+
+//go:nosplit
+//go:nowritebarrierrec
+func setsig(i int32, fn uintptr, restart bool) {
+	var sa sigactiont
+	sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK
+	if restart {
+		sa.sa_flags |= _SA_RESTART
+	}
+	sa.sa_mask = ^uint32(0)
+	sa.sa_tramp = unsafe.Pointer(funcPC(sigtramp)) // runtime·sigtramp's job is to call into real handler
+	*(*uintptr)(unsafe.Pointer(&sa.__sigaction_u)) = fn
+	sigaction(uint32(i), &sa, nil)
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func setsigstack(i int32) {
+	var osa usigactiont
+	sigaction(uint32(i), nil, &osa)
+	handler := *(*uintptr)(unsafe.Pointer(&osa.__sigaction_u))
+	if handler == 0 || handler == _SIG_DFL || handler == _SIG_IGN || osa.sa_flags&_SA_ONSTACK != 0 {
+		return
+	}
+	var sa sigactiont
+	*(*uintptr)(unsafe.Pointer(&sa.__sigaction_u)) = handler
+	sa.sa_tramp = unsafe.Pointer(funcPC(sigtramp))
+	sa.sa_mask = osa.sa_mask
+	sa.sa_flags = osa.sa_flags | _SA_ONSTACK
+	sigaction(uint32(i), &sa, nil)
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func getsig(i int32) uintptr {
+	var sa usigactiont
+	sigaction(uint32(i), nil, &sa)
+	return *(*uintptr)(unsafe.Pointer(&sa.__sigaction_u))
+}
+
+//go:nosplit
+func signalstack(s *stack) {
+	var st stackt
+	if s == nil {
+		st.ss_flags = _SS_DISABLE
+	} else {
+		st.ss_sp = (*byte)(unsafe.Pointer(s.lo))
+		st.ss_size = s.hi - s.lo
+		st.ss_flags = 0
+	}
+	sigaltstack(&st, nil)
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func updatesigmask(m sigmask) {
+	s := sigset(m[0])
+	sigprocmask(_SIG_SETMASK, &s, nil)
+}
+
+func unblocksig(sig int32) {
+	mask := sigset(1) << (uint32(sig) - 1)
+	sigprocmask(_SIG_UNBLOCK, &mask, nil)
+}
diff --git a/src/runtime/os_dragonfly.go b/src/runtime/os_dragonfly.go
index c3833a397a..78a150eee5 100644
--- a/src/runtime/os_dragonfly.go
+++ b/src/runtime/os_dragonfly.go
@@ -6,6 +6,16 @@ package runtime
 
 import "unsafe"
 
+const (
+	_NSIG        = 33
+	_SI_USER     = 0
+	_SS_DISABLE  = 4
+	_RLIMIT_AS   = 10
+	_SIG_BLOCK   = 1
+	_SIG_UNBLOCK = 2
+	_SIG_SETMASK = 3
+)
+
 type mOS struct{}
 
 //go:noescape
@@ -41,3 +51,266 @@ func sys_umtx_wakeup(addr *uint32, val int32) int32
 func osyield()
 
 const stackSystem = 0
+
+// From DragonFly's <sys/sysctl.h>
+const (
+	_CTL_HW  = 6
+	_HW_NCPU = 3
+)
+
+var sigset_all = sigset{[4]uint32{^uint32(0), ^uint32(0), ^uint32(0), ^uint32(0)}}
+
+func getncpu() int32 {
+	mib := [2]uint32{_CTL_HW, _HW_NCPU}
+	out := uint32(0)
+	nout := unsafe.Sizeof(out)
+	ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0)
+	if ret >= 0 { // NOTE(review): out is returned as-is, so a successful call leaving out == 0 yields 0 - confirm that is acceptable
+		return int32(out)
+	}
+	return 1 // sysctl failed: fall back to one CPU
+}
+
+//go:nosplit
+func futexsleep(addr *uint32, val uint32, ns int64) {
+	systemstack(func() {
+		futexsleep1(addr, val, ns)
+	})
+}
+
+func futexsleep1(addr *uint32, val uint32, ns int64) {
+	var timeout int32 // stays 0 when ns < 0
+	if ns >= 0 {
+		// The timeout is specified in microseconds - ensure that a small
+		// ns does not divide down to zero, which would put us to sleep
+		// indefinitely...
+		timeout = timediv(ns, 1000, nil)
+		if timeout == 0 {
+			timeout = 1
+		}
+	}
+
+	// sys_umtx_sleep will return EWOULDBLOCK (EAGAIN) when the timeout
+	// expires or EBUSY if the mutex value does not match.
+	ret := sys_umtx_sleep(addr, int32(val), timeout)
+	if ret >= 0 || ret == -_EINTR || ret == -_EAGAIN || ret == -_EBUSY {
+		return
+	}
+
+	print("umtx_sleep addr=", addr, " val=", val, " ret=", ret, "\n")
+	*(*int32)(unsafe.Pointer(uintptr(0x1005))) = 0x1005 // store to fixed address 0x1005 (presumably a deliberate crash with a recognizable address)
+}
+
+//go:nosplit
+func futexwakeup(addr *uint32, cnt uint32) {
+	ret := sys_umtx_wakeup(addr, int32(cnt))
+	if ret >= 0 {
+		return
+	}
+
+	systemstack(func() {
+		print("umtx_wake_addr=", addr, " ret=", ret, "\n")
+		*(*int32)(unsafe.Pointer(uintptr(0x1006))) = 0x1006
+	})
+}
+
+func lwp_start(uintptr)
+
+// May run with m.p==nil, so write barriers are not allowed.
+//go:nowritebarrier
+func newosproc(mp *m, stk unsafe.Pointer) {
+	if false { // debug print, disabled by the constant condition
+		print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " lwp_start=", funcPC(lwp_start), " id=", mp.id, " ostk=", &mp, "\n")
+	}
+
+	var oset sigset
+	sigprocmask(_SIG_SETMASK, &sigset_all, &oset) // install sigset_all as the mask, saving the previous mask in oset
+
+	params := lwpparams{
+		start_func: funcPC(lwp_start),
+		arg:        unsafe.Pointer(mp),
+		stack:      uintptr(stk),
+		tid1:       unsafe.Pointer(&mp.procid),
+		tid2:       nil,
+	}
+
+	lwp_create(&params) // NOTE(review): nothing returned by lwp_create is checked here - confirm a failed creation cannot go unnoticed
+	sigprocmask(_SIG_SETMASK, &oset, nil) // restore the mask saved above
+}
+
+func osinit() {
+	ncpu = getncpu()
+}
+
+var urandom_dev = []byte("/dev/urandom\x00")
+
+//go:nosplit
+func getRandomData(r []byte) {
+	fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
+	n := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
+	closefd(fd)
+	extendRandom(r, int(n))
+}
+
+func goenvs() {
+	goenvs_unix()
+}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
+func mpreinit(mp *m) {
+	mp.gsignal = malg(32 * 1024)
+	mp.gsignal.m = mp
+}
+
+//go:nosplit
+func msigsave(mp *m) {
+	sigprocmask(_SIG_SETMASK, nil, &mp.sigmask)
+}
+
+//go:nosplit
+func msigrestore(sigmask sigset) {
+	sigprocmask(_SIG_SETMASK, &sigmask, nil)
+}
+
+//go:nosplit
+func sigblock() {
+	sigprocmask(_SIG_SETMASK, &sigset_all, nil)
+}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the new thread, cannot allocate memory.
+func minit() {
+	_g_ := getg()
+
+	// m.procid is a uint64, but lwp_start writes an int32. Fix it up.
+	_g_.m.procid = uint64(*(*int32)(unsafe.Pointer(&_g_.m.procid)))
+
+	// Initialize signal handling.
+
+	// On DragonFly a thread created by pthread_create inherits
+	// the signal stack of the creating thread. We always create
+	// a new signal stack here, to avoid having two Go threads
+	// using the same signal stack. This breaks the case of a
+	// thread created in C that calls sigaltstack and then calls a
+	// Go function, because we will lose track of the C code's
+	// sigaltstack, but it's the best we can do.
+	signalstack(&_g_.m.gsignal.stack)
+	_g_.m.newSigstack = true
+
+	// restore signal mask from m.sigmask and unblock essential signals
+	nmask := _g_.m.sigmask
+	for i := range sigtable {
+		if sigtable[i].flags&_SigUnblock != 0 {
+			nmask.__bits[(i-1)/32] &^= 1 << ((uint32(i) - 1) & 31)
+		}
+	}
+	sigprocmask(_SIG_SETMASK, &nmask, nil)
+}
+
+// Called from dropm to undo the effect of an minit.
+//go:nosplit
+func unminit() {
+	if getg().m.newSigstack {
+		signalstack(nil)
+	}
+}
+
+func memlimit() uintptr {
+	/*
+		                TODO: Convert to Go when something actually uses the result.
+
+				Rlimit rl;
+				extern byte runtime·text[], runtime·end[];
+				uintptr used;
+
+				if(runtime·getrlimit(RLIMIT_AS, &rl) != 0)
+					return 0;
+				if(rl.rlim_cur >= 0x7fffffff)
+					return 0;
+
+				// Estimate our VM footprint excluding the heap.
+				// Not an exact science: use size of binary plus
+				// some room for thread stacks.
+				used = runtime·end - runtime·text + (64<<20);
+				if(used >= rl.rlim_cur)
+					return 0;
+
+				// If there's not at least 16 MB left, we're probably
+				// not going to be able to do much. Treat as no limit.
+				rl.rlim_cur -= used;
+				if(rl.rlim_cur < (16<<20))
+					return 0;
+
+				return rl.rlim_cur - used;
+	*/
+	return 0
+}
+
+func sigtramp()
+
+type sigactiont struct {
+	sa_sigaction uintptr
+	sa_flags     int32
+	sa_mask      sigset
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func setsig(i int32, fn uintptr, restart bool) {
+	var sa sigactiont
+	sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK
+	if restart {
+		sa.sa_flags |= _SA_RESTART
+	}
+	sa.sa_mask = sigset_all
+	if fn == funcPC(sighandler) {
+		fn = funcPC(sigtramp)
+	}
+	sa.sa_sigaction = fn
+	sigaction(i, &sa, nil)
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func setsigstack(i int32) {
+	throw("setsigstack")
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func getsig(i int32) uintptr {
+	var sa sigactiont
+	sigaction(i, nil, &sa)
+	if sa.sa_sigaction == funcPC(sigtramp) {
+		return funcPC(sighandler)
+	}
+	return sa.sa_sigaction
+}
+
+//go:nosplit
+func signalstack(s *stack) {
+	var st sigaltstackt
+	if s == nil {
+		st.ss_flags = _SS_DISABLE
+	} else {
+		st.ss_sp = s.lo
+		st.ss_size = s.hi - s.lo
+		st.ss_flags = 0
+	}
+	sigaltstack(&st, nil)
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func updatesigmask(m sigmask) {
+	var mask sigset
+	copy(mask.__bits[:], m[:])
+	sigprocmask(_SIG_SETMASK, &mask, nil)
+}
+
+func unblocksig(sig int32) {
+	var mask sigset
+	mask.__bits[(sig-1)/32] |= 1 << ((uint32(sig) - 1) & 31)
+	sigprocmask(_SIG_UNBLOCK, &mask, nil)
+}
diff --git a/src/runtime/os_linux.go b/src/runtime/os_linux.go
index dd69743e10..35b57d8a23 100644
--- a/src/runtime/os_linux.go
+++ b/src/runtime/os_linux.go
@@ -1,19 +1,365 @@
-// Copyright 2014 The Go Authors. All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 type mOS struct{}
 
 //go:noescape
 func futex(addr unsafe.Pointer, op int32, val uint32, ts, addr2 unsafe.Pointer, val3 uint32) int32
 
+// Linux futex.
+//
+//	futexsleep(uint32 *addr, uint32 val)
+//	futexwakeup(uint32 *addr)
+//
+// Futexsleep atomically checks if *addr == val and if so, sleeps on addr.
+// Futexwakeup wakes up threads sleeping on addr.
+// Futexsleep is allowed to wake up spuriously.
+
+const (
+	_FUTEX_WAIT = 0
+	_FUTEX_WAKE = 1
+)
+
+// futexsleep atomically checks whether *addr == val and, if so,
+// sleeps on addr. It does not sleep longer than ns nanoseconds;
+// ns < 0 means forever. Spurious wakeups are allowed, which is why
+// the result of the futex call is never examined.
+//go:nosplit
+func futexsleep(addr *uint32, val uint32, ns int64) {
+	var ts timespec
+
+	// Some Linux kernels have a bug where futex of
+	// FUTEX_WAIT returns an internal error code
+	// as an errno. Libpthread ignores the return value
+	// here, and so can we: as the doc comment says,
+	// spurious wakeups are allowed.
+	if ns < 0 {
+		futex(unsafe.Pointer(addr), _FUTEX_WAIT, val, nil, nil, 0)
+		return
+	}
+
+	// It's difficult to live within the no-split stack limits here.
+	// On ARM and 386, a 64-bit divide invokes a general software routine
+	// that needs more stack than we can afford. So we use timediv instead.
+	// But on real 64-bit systems, where words are larger but the stack limit
+	// is not, even timediv is too heavy, and we really need to use just an
+	// ordinary machine instruction.
+	if sys.PtrSize == 8 {
+		ts.set_sec(ns / 1000000000)
+		ts.set_nsec(int32(ns % 1000000000))
+	} else {
+		ts.tv_nsec = 0
+		ts.set_sec(int64(timediv(ns, 1000000000, (*int32)(unsafe.Pointer(&ts.tv_nsec)))))
+	}
+	futex(unsafe.Pointer(addr), _FUTEX_WAIT, val, unsafe.Pointer(&ts), nil, 0)
+}
+
+// futexwakeup wakes up at most cnt procs sleeping on addr, if any.
+//go:nosplit
+func futexwakeup(addr *uint32, cnt uint32) {
+	ret := futex(unsafe.Pointer(addr), _FUTEX_WAKE, cnt, nil, nil, 0)
+	if ret >= 0 {
+		return
+	}
+
+	// A negative result is reported and not retried. I don't know that
+	// futex wakeup can return EAGAIN or EINTR, but if it does, it would
+	// be safe to loop and call futex again.
+	systemstack(func() {
+		print("futexwakeup addr=", addr, " returned ", ret, "\n")
+	})
+
+	*(*int32)(unsafe.Pointer(uintptr(0x1006))) = 0x1006 // NOTE(review): apparently a deliberate crash
+}
+
+func getproccount() int32 {
+	// Count the CPUs in the affinity mask, reporting at least 1.
+	// This buffer is huge (8 kB) but we are on the system stack (64 kB)
+	// and this is a leaf, so the memory is not held for long. A retry with
+	// a larger buffer would need an allocator; see golang.org/issue/11823.
+	const maxCPUs = 64 * 1024
+	var buf [maxCPUs / (sys.PtrSize * 8)]uintptr
+	r := sched_getaffinity(0, unsafe.Sizeof(buf), &buf[0])
+	if r < 0 {
+		return 1 // the call failed; a negative length cannot be sliced
+	}
+	n := int32(0)
+	for _, v := range buf[:r/sys.PtrSize] {
+		for v != 0 {
+			n += int32(v & 1)
+			v >>= 1
+		}
+	}
+	if n == 0 {
+		n = 1
+	}
+	return n
+}
+
+// Clone, the Linux rfork.
+const (
+	_CLONE_VM             = 0x100
+	_CLONE_FS             = 0x200
+	_CLONE_FILES          = 0x400
+	_CLONE_SIGHAND        = 0x800
+	_CLONE_PTRACE         = 0x2000
+	_CLONE_VFORK          = 0x4000
+	_CLONE_PARENT         = 0x8000
+	_CLONE_THREAD         = 0x10000
+	_CLONE_NEWNS          = 0x20000
+	_CLONE_SYSVSEM        = 0x40000
+	_CLONE_SETTLS         = 0x80000
+	_CLONE_PARENT_SETTID  = 0x100000
+	_CLONE_CHILD_CLEARTID = 0x200000
+	_CLONE_UNTRACED       = 0x800000
+	_CLONE_CHILD_SETTID   = 0x1000000
+	_CLONE_STOPPED        = 0x2000000
+	_CLONE_NEWUTS         = 0x4000000
+	_CLONE_NEWIPC         = 0x8000000
+
+	cloneFlags = _CLONE_VM | /* share memory */
+		_CLONE_FS | /* share cwd, etc */
+		_CLONE_FILES | /* share fd table */
+		_CLONE_SIGHAND | /* share sig handler table */
+		_CLONE_THREAD /* revisit - okay for now */
+)
+
 //go:noescape
 func clone(flags int32, stk, mm, gg, fn unsafe.Pointer) int32
 
+// newosproc starts a new OS thread that runs mstart on the stack stk.
+// May run with m.p==nil, so write barriers are not allowed.
+//go:nowritebarrier
+func newosproc(mp *m, stk unsafe.Pointer) {
+	// Note: strace gets confused if we use CLONE_PTRACE here.
+	if false {
+		print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " clone=", funcPC(clone), " id=", mp.id, " ostk=", &mp, "\n")
+	}
+
+	// Block every signal across the clone so that the new thread
+	// starts with signals disabled. It will enable them in minit.
+	var saved sigset
+	size := int32(unsafe.Sizeof(saved))
+	rtsigprocmask(_SIG_SETMASK, &sigset_all, &saved, size)
+	rc := clone(cloneFlags, stk, unsafe.Pointer(mp), unsafe.Pointer(mp.g0), unsafe.Pointer(funcPC(mstart)))
+	rtsigprocmask(_SIG_SETMASK, &saved, nil, size)
+
+	if rc < 0 {
+		print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", -rc, ")\n")
+		throw("newosproc")
+	}
+}
+
+// newosproc0 is a variant of newosproc that works without a valid G.
+//go:nosplit
+func newosproc0(stacksize uintptr, fn unsafe.Pointer) {
+	base := sysAlloc(stacksize, &memstats.stacks_sys)
+	if base == nil {
+		write(2, unsafe.Pointer(&failallocatestack[0]), int32(len(failallocatestack)))
+		exit(1)
+	}
+	// clone is handed the high end of the fresh allocation as the stack.
+	if clone(cloneFlags, unsafe.Pointer(uintptr(base)+stacksize), nil, nil, fn) < 0 {
+		write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
+		exit(1)
+	}
+}
+
+var failallocatestack = []byte("runtime: failed to allocate stack for the new OS thread\n")
+var failthreadcreate = []byte("runtime: failed to create new OS thread\n")
+
+const (
+	_AT_NULL   = 0  // End of vector
+	_AT_PAGESZ = 6  // System physical page size
+	_AT_RANDOM = 25 // introduced in 2.6.29
+)
+
+// sysargs scans the auxiliary vector that follows argv and envp,
+// recording the startup random data and checking the page size.
+func sysargs(argc int32, argv **byte) {
+	// Step over argv and envp, stopping at the NULL that ends envp.
+	n := argc + 1
+	for ; argv_index(argv, n) != nil; n++ {
+	}
+
+	// Step past that NULL as well; argv+n is now auxv.
+	n++
+
+	// auxv is read as (tag, value) pairs, two words at a time,
+	// until an _AT_NULL tag is reached.
+	auxv := (*[1 << 28]uintptr)(add(unsafe.Pointer(argv), uintptr(n)*sys.PtrSize))
+	for i := 0; auxv[i] != _AT_NULL; i += 2 {
+		tag, val := auxv[i], auxv[i+1]
+		if tag == _AT_RANDOM {
+			// The kernel provides a pointer to 16-bytes
+			// worth of random data.
+			startupRandomData = (*[16]byte)(unsafe.Pointer(val))[:]
+		} else if tag == _AT_PAGESZ {
+			// The runtime's assumed physical page size must be
+			// at least the true physical page size and a
+			// multiple of it; otherwise report it and exit.
+			if sys.PhysPageSize < val {
+				print("runtime: kernel page size (", val, ") is larger than runtime page size (", sys.PhysPageSize, ")\n")
+				exit(1)
+			}
+			if sys.PhysPageSize%val != 0 {
+				print("runtime: runtime page size (", sys.PhysPageSize, ") is not a multiple of kernel page size (", val, ")\n")
+				exit(1)
+			}
+		}
+
+		// Every entry is also offered to the architecture hook.
+		archauxv(tag, val)
+	}
+}
+
+func osinit() {
+	ncpu = getproccount()
+}
+
+var urandom_dev = []byte("/dev/urandom\x00")
+
+func getRandomData(r []byte) {
+	if startupRandomData == nil {
+		// No startup random data was recorded: read /dev/urandom instead.
+		fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
+		got := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
+		closefd(fd)
+		extendRandom(r, int(got))
+		return
+	}
+	extendRandom(r, copy(r, startupRandomData))
+}
+
+func goenvs() {
+	goenvs_unix()
+}
+
+// Called to do synchronous initialization of Go code built with
+// -buildmode=c-archive or -buildmode=c-shared.
+// None of the Go runtime is initialized.
+//go:nosplit
+//go:nowritebarrierrec
+func libpreinit() {
+	initsig(true)
+}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
+func mpreinit(mp *m) {
+	mp.gsignal = malg(32 * 1024) // Linux wants >= 2K
+	mp.gsignal.m = mp
+}
+
+// msigsave stores the calling thread's signal mask in mp.sigmask.
+//go:nosplit
+func msigsave(mp *m) {
+	rtsigprocmask(_SIG_SETMASK, nil, &mp.sigmask, int32(unsafe.Sizeof(mp.sigmask)))
+}
+
+//go:nosplit
+func msigrestore(sigmask sigset) {
+	rtsigprocmask(_SIG_SETMASK, &sigmask, nil, int32(unsafe.Sizeof(sigmask)))
+}
+
+//go:nosplit
+func sigblock() {
+	rtsigprocmask(_SIG_SETMASK, &sigset_all, nil, int32(unsafe.Sizeof(sigset_all)))
+}
+
+func gettid() uint32
+
+// minit initializes a new m (including the bootstrap m).
+// It runs on the new thread and cannot allocate memory.
+func minit() {
+	mp := getg().m
+
+	// Signal stack: install gsignal's unless one is already enabled.
+	var cur sigaltstackt
+	sigaltstack(nil, &cur)
+	if cur.ss_flags&_SS_DISABLE != 0 {
+		signalstack(&mp.gsignal.stack)
+		mp.newSigstack = true
+	} else {
+		base := uintptr(unsafe.Pointer(cur.ss_sp))
+		gs := mp.gsignal
+		gs.stack.lo = base
+		gs.stack.hi = base + cur.ss_size
+		gs.stackguard0 = base + _StackGuard
+		gs.stackguard1 = base + _StackGuard
+		gs.stackAlloc = cur.ss_size
+		mp.newSigstack = false
+	}
+
+	// for debuggers, in case cgo created the thread
+	mp.procid = uint64(gettid())
+
+	// Take m.sigmask, drop every _SigUnblock signal, install the result.
+	set := mp.sigmask
+	for sig := range sigtable {
+		if sigtable[sig].flags&_SigUnblock != 0 {
+			sigdelset(&set, sig)
+		}
+	}
+	rtsigprocmask(_SIG_SETMASK, &set, nil, int32(unsafe.Sizeof(set)))
+}
+
+// unminit undoes the effect of an minit. It is called from dropm.
+//go:nosplit
+func unminit() {
+	if getg().m.newSigstack {
+		signalstack(nil) // disable the signal stack that minit installed
+	}
+}
+
+func memlimit() uintptr {
+	/*
+		TODO: Convert to Go when something actually uses the result.
+
+		Rlimit rl;
+		extern byte runtime·text[], runtime·end[];
+		uintptr used;
+
+		if(runtime·getrlimit(RLIMIT_AS, &rl) != 0)
+			return 0;
+		if(rl.rlim_cur >= 0x7fffffff)
+			return 0;
+
+		// Estimate our VM footprint excluding the heap.
+		// Not an exact science: use size of binary plus
+		// some room for thread stacks.
+		used = runtime·end - runtime·text + (64<<20);
+		if(used >= rl.rlim_cur)
+			return 0;
+
+		// If there's not at least 16 MB left, we're probably
+		// not going to be able to do much. Treat as no limit.
+		rl.rlim_cur -= used;
+		if(rl.rlim_cur < (16<<20))
+			return 0;
+
+		return rl.rlim_cur - used; // NOTE(review): used was already subtracted above; confirm before porting
+	*/
+
+	return 0 // always 0 for now; the C code above uses 0 to mean "no limit"
+}
+
+//#ifdef GOARCH_386
+//#define sa_handler k_sa_handler
+//#endif
+
+func sigreturn()
+func sigtramp()
+func cgoSigtramp()
+
 //go:noescape
 func rt_sigaction(sig uintptr, new, old *sigactiont, size uintptr) int32
 
@@ -34,3 +380,88 @@ func raiseproc(sig int32)
 //go:noescape
 func sched_getaffinity(pid, len uintptr, buf *uintptr) int32
 func osyield()
+
+//go:nosplit
+//go:nowritebarrierrec
+func setsig(i int32, fn uintptr, restart bool) {
+	// The Go handler is installed through a trampoline; cgo programs
+	// use cgoSigtramp instead of sigtramp.
+	if fn == funcPC(sighandler) {
+		fn = funcPC(sigtramp)
+		if iscgo {
+			fn = funcPC(cgoSigtramp)
+		}
+	}
+	var act sigactiont
+	memclr(unsafe.Pointer(&act), unsafe.Sizeof(act))
+	act.sa_handler = fn
+	sigfillset(&act.sa_mask)
+	act.sa_flags = _SA_SIGINFO | _SA_ONSTACK | _SA_RESTORER
+	if restart {
+		act.sa_flags |= _SA_RESTART
+	}
+	// The Linux man page calls sa_restorer obsolete, but the x86_64
+	// kernel requires it, so it is filled in on x86 only.
+	if GOARCH == "amd64" || GOARCH == "386" {
+		act.sa_restorer = funcPC(sigreturn)
+	}
+	rt_sigaction(uintptr(i), &act, nil, unsafe.Sizeof(act.sa_mask))
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func setsigstack(i int32) {
+	var act sigactiont
+	if rt_sigaction(uintptr(i), nil, &act, unsafe.Sizeof(act.sa_mask)) != 0 {
+		throw("rt_sigaction failure")
+	}
+	h := act.sa_handler // only a real handler lacking _SA_ONSTACK is rewritten
+	if h != 0 && h != _SIG_DFL && h != _SIG_IGN && act.sa_flags&_SA_ONSTACK == 0 {
+		act.sa_flags |= _SA_ONSTACK
+		if rt_sigaction(uintptr(i), &act, nil, unsafe.Sizeof(act.sa_mask)) != 0 {
+			throw("rt_sigaction failure")
+		}
+	}
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func getsig(i int32) uintptr {
+	var cur sigactiont
+	memclr(unsafe.Pointer(&cur), unsafe.Sizeof(cur))
+	if rt_sigaction(uintptr(i), nil, &cur, unsafe.Sizeof(cur.sa_mask)) != 0 {
+		throw("rt_sigaction read failure")
+	}
+	switch cur.sa_handler {
+	case funcPC(sigtramp), funcPC(cgoSigtramp):
+		return funcPC(sighandler) // either trampoline stands for the Go handler
+	}
+	return cur.sa_handler
+}
+
+//go:nosplit
+func signalstack(s *stack) {
+	var alt sigaltstackt
+	alt.ss_flags = _SS_DISABLE // used as-is when s is nil
+	if s != nil {
+		// Otherwise describe the range from s.lo up to s.hi.
+		alt.ss_sp = (*byte)(unsafe.Pointer(s.lo))
+		alt.ss_size = s.hi - s.lo
+		alt.ss_flags = 0
+	}
+	sigaltstack(&alt, nil)
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func updatesigmask(m sigmask) {
+	var set sigset
+	sigcopyset(&set, m) // carry m over into a sigset
+	rtsigprocmask(_SIG_SETMASK, &set, nil, int32(unsafe.Sizeof(set)))
+}
+
+func unblocksig(sig int32) {
+	var set sigset // zero value; only sig is added below
+	sigaddset(&set, int(sig))
+	rtsigprocmask(_SIG_UNBLOCK, &set, nil, int32(unsafe.Sizeof(set)))
+}
diff --git a/src/runtime/os_linux_386.go b/src/runtime/os_linux_386.go
deleted file mode 100644
index 0f39cade3b..0000000000
--- a/src/runtime/os_linux_386.go
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import (
-	"runtime/internal/sys"
-	"unsafe"
-)
-
-const (
-	_AT_NULL    = 0
-	_AT_RANDOM  = 25
-	_AT_SYSINFO = 32
-)
-
-func sysargs(argc int32, argv **byte) {
-	// skip over argv, envv to get to auxv
-	n := argc + 1
-	for argv_index(argv, n) != nil {
-		n++
-	}
-	n++
-	auxv := (*[1 << 28]uint32)(add(unsafe.Pointer(argv), uintptr(n)*sys.PtrSize))
-
-	for i := 0; auxv[i] != _AT_NULL; i += 2 {
-		switch auxv[i] {
-		case _AT_RANDOM:
-			startupRandomData = (*[16]byte)(unsafe.Pointer(uintptr(auxv[i+1])))[:]
-		}
-	}
-}
diff --git a/src/runtime/os_linux_arm.go b/src/runtime/os_linux_arm.go
index 8fdfb585ba..8e2765a413 100644
--- a/src/runtime/os_linux_arm.go
+++ b/src/runtime/os_linux_arm.go
@@ -4,16 +4,11 @@
 
 package runtime
 
-import (
-	"runtime/internal/sys"
-	"unsafe"
-)
+import "unsafe"
 
 const (
-	_AT_NULL     = 0
 	_AT_PLATFORM = 15 //  introduced in at least 2.6.11
 	_AT_HWCAP    = 16 // introduced in at least 2.6.11
-	_AT_RANDOM   = 25 // introduced in 2.6.29
 
 	_HWCAP_VFP   = 1 << 6  // introduced in at least 2.6.11
 	_HWCAP_VFPv3 = 1 << 13 // introduced in 2.6.30
@@ -36,33 +31,23 @@ func checkgoarm() {
 	}
 }
 
-func sysargs(argc int32, argv **byte) {
-	// skip over argv, envv to get to auxv
-	n := argc + 1
-	for argv_index(argv, n) != nil {
-		n++
-	}
-	n++
-	auxv := (*[1 << 28]uint32)(add(unsafe.Pointer(argv), uintptr(n)*sys.PtrSize))
-
-	for i := 0; auxv[i] != _AT_NULL; i += 2 {
-		switch auxv[i] {
-		case _AT_RANDOM: // kernel provides a pointer to 16-bytes worth of random data
-			startupRandomData = (*[16]byte)(unsafe.Pointer(uintptr(auxv[i+1])))[:]
-			// the pointer provided may not be word aligned, so we must treat it
-			// as a byte array.
-			randomNumber = uint32(startupRandomData[4]) | uint32(startupRandomData[5])<<8 |
-				uint32(startupRandomData[6])<<16 | uint32(startupRandomData[7])<<24
+func archauxv(tag, val uintptr) {
+	switch tag {
+	case _AT_RANDOM:
+		// sysargs filled in startupRandomData, but that
+		// pointer may not be word aligned, so we must treat
+		// it as a byte array.
+		randomNumber = uint32(startupRandomData[4]) | uint32(startupRandomData[5])<<8 |
+			uint32(startupRandomData[6])<<16 | uint32(startupRandomData[7])<<24
 
-		case _AT_PLATFORM: // v5l, v6l, v7l
-			t := *(*uint8)(unsafe.Pointer(uintptr(auxv[i+1] + 1)))
-			if '5' <= t && t <= '7' {
-				armArch = t - '0'
-			}
-
-		case _AT_HWCAP: // CPU capability bit flags
-			hwcap = auxv[i+1]
+	case _AT_PLATFORM: // v5l, v6l, v7l
+		t := *(*uint8)(unsafe.Pointer(val + 1))
+		if '5' <= t && t <= '7' {
+			armArch = t - '0'
 		}
+
+	case _AT_HWCAP: // CPU capability bit flags
+		hwcap = uint32(val)
 	}
 }
 
diff --git a/src/runtime/os_linux_arm64.go b/src/runtime/os_linux_arm64.go
index 3f994f128b..43262aea14 100644
--- a/src/runtime/os_linux_arm64.go
+++ b/src/runtime/os_linux_arm64.go
@@ -4,13 +4,19 @@
 
 package runtime
 
-const (
-	_AT_NULL   = 0
-	_AT_RANDOM = 25 // introduced in 2.6.29
-)
-
 var randomNumber uint32
 
+func archauxv(tag, val uintptr) {
+	if tag != _AT_RANDOM {
+		return
+	}
+	// sysargs filled in startupRandomData, but that pointer may not
+	// be word aligned, so bytes 4..7 are combined one at a time,
+	// lowest byte first.
+	b := startupRandomData
+	randomNumber = uint32(b[4]) | uint32(b[5])<<8 | uint32(b[6])<<16 | uint32(b[7])<<24
+}
+
 //go:nosplit
 func cputicks() int64 {
 	// Currently cputicks() is used in blocking profiler and to seed fastrand1().
diff --git a/src/runtime/os2_linux_generic.go b/src/runtime/os_linux_generic.go
index 01e6c8a5ec..a16d140776 100644
--- a/src/runtime/os2_linux_generic.go
+++ b/src/runtime/os_linux_generic.go
@@ -4,6 +4,7 @@
 
 // +build !mips64
 // +build !mips64le
+// +build !s390x
 // +build linux
 
 package runtime
@@ -27,3 +28,21 @@ type rlimit struct {
 	rlim_cur uintptr
 	rlim_max uintptr
 }
+
+var sigset_all = sigset{^uint32(0), ^uint32(0)}
+
+func sigaddset(mask *sigset, i int) {
+	(*mask)[(i-1)/32] |= 1 << ((uint32(i) - 1) & 31)
+}
+
+func sigdelset(mask *sigset, i int) {
+	(*mask)[(i-1)/32] &^= 1 << ((uint32(i) - 1) & 31)
+}
+
+func sigfillset(mask *uint64) {
+	*mask = ^uint64(0)
+}
+
+func sigcopyset(mask *sigset, m sigmask) {
+	copy((*mask)[:], m[:])
+}
diff --git a/src/runtime/os_linux_mips64x.go b/src/runtime/os_linux_mips64x.go
index 4d2e9e8a20..92b5c82af7 100644
--- a/src/runtime/os_linux_mips64x.go
+++ b/src/runtime/os_linux_mips64x.go
@@ -2,8 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build mips64 mips64le
 // +build linux
+// +build mips64 mips64le
 
 package runtime
 
@@ -16,3 +16,38 @@ func cputicks() int64 {
 	// randomNumber provides better seeding of fastrand1.
 	return nanotime() + int64(randomNumber)
 }
+
+const (
+	_SS_DISABLE  = 2
+	_NSIG        = 65
+	_SI_USER     = 0
+	_SIG_BLOCK   = 1
+	_SIG_UNBLOCK = 2
+	_SIG_SETMASK = 3
+	_RLIMIT_AS   = 6
+)
+
+type sigset [2]uint64
+
+type rlimit struct {
+	rlim_cur uintptr
+	rlim_max uintptr
+}
+
+var sigset_all = sigset{^uint64(0), ^uint64(0)}
+
+func sigaddset(mask *sigset, i int) {
+	(*mask)[(i-1)/64] |= 1 << ((uint32(i) - 1) & 63)
+}
+
+func sigdelset(mask *sigset, i int) {
+	(*mask)[(i-1)/64] &^= 1 << ((uint32(i) - 1) & 63)
+}
+
+func sigfillset(mask *[2]uint64) {
+	(*mask)[0], (*mask)[1] = ^uint64(0), ^uint64(0)
+}
+
+func sigcopyset(mask *sigset, m sigmask) {
+	(*mask)[0] = uint64(m[0]) | uint64(m[1])<<32
+}
diff --git a/src/runtime/os_linux_noauxv.go b/src/runtime/os_linux_noauxv.go
new file mode 100644
index 0000000000..0b46f594ce
--- /dev/null
+++ b/src/runtime/os_linux_noauxv.go
@@ -0,0 +1,10 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64,!arm,!arm64
+
+package runtime
+
+func archauxv(tag, val uintptr) {
+}
diff --git a/src/runtime/os_linux_s390x.go b/src/runtime/os_linux_s390x.go
new file mode 100644
index 0000000000..e659dff716
--- /dev/null
+++ b/src/runtime/os_linux_s390x.go
@@ -0,0 +1,46 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+const (
+	_SS_DISABLE  = 2
+	_NSIG        = 65
+	_SI_USER     = 0
+	_SIG_BLOCK   = 0
+	_SIG_UNBLOCK = 1
+	_SIG_SETMASK = 2
+	_RLIMIT_AS   = 9
+)
+
+type sigset uint64
+
+type rlimit struct {
+	rlim_cur uintptr
+	rlim_max uintptr
+}
+
+var sigset_all = sigset(^uint64(0))
+
+func sigaddset(mask *sigset, i int) {
+	if i > 64 {
+		throw("unexpected signal greater than 64")
+	}
+	*mask |= 1 << (uint(i) - 1)
+}
+
+func sigdelset(mask *sigset, i int) {
+	if i > 64 {
+		throw("unexpected signal greater than 64")
+	}
+	*mask &^= 1 << (uint(i) - 1)
+}
+
+func sigfillset(mask *uint64) {
+	*mask = ^uint64(0)
+}
+
+func sigcopyset(mask *sigset, m sigmask) {
+	*mask = sigset(uint64(m[0]) | uint64(m[1])<<32)
+}
diff --git a/src/runtime/os_netbsd.go b/src/runtime/os_netbsd.go
index 0fba16d4f4..41f34f7132 100644
--- a/src/runtime/os_netbsd.go
+++ b/src/runtime/os_netbsd.go
@@ -4,7 +4,23 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
+
+const (
+	_SS_DISABLE  = 4
+	_SIG_BLOCK   = 1
+	_SIG_UNBLOCK = 2
+	_SIG_SETMASK = 3
+	_NSIG        = 33
+	_SI_USER     = 0
+
+	// From NetBSD's <sys/ucontext.h>
+	_UC_SIGMASK = 0x01
+	_UC_CPU     = 0x04
+)
 
 type mOS struct {
 	waitsemacount uint32
@@ -45,3 +61,268 @@ func lwp_unpark(lwp int32, hint unsafe.Pointer) int32
 func lwp_self() int32
 
 func osyield()
+
+const (
+	_ESRCH     = 3
+	_ETIMEDOUT = 60
+
+	// From NetBSD's <sys/time.h>
+	_CLOCK_REALTIME  = 0
+	_CLOCK_VIRTUAL   = 1
+	_CLOCK_PROF      = 2
+	_CLOCK_MONOTONIC = 3
+)
+
+var sigset_all = sigset{[4]uint32{^uint32(0), ^uint32(0), ^uint32(0), ^uint32(0)}}
+
+// From NetBSD's <sys/sysctl.h>
+const (
+	_CTL_HW  = 6
+	_HW_NCPU = 3
+)
+
+func getncpu() int32 {
+	// Query sysctl {_CTL_HW, _HW_NCPU}; report one CPU if that fails.
+	var count uint32
+	size := unsafe.Sizeof(count)
+	mib := [2]uint32{_CTL_HW, _HW_NCPU}
+	if sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&count)), &size, nil, 0) < 0 {
+		return 1
+	}
+	return int32(count)
+}
+
+//go:nosplit
+func semacreate(mp *m) {
+}
+
+//go:nosplit
+func semasleep(ns int64) int32 {
+	_g_ := getg()
+
+	// Compute sleep deadline; tsp stays nil when ns < 0.
+	var tsp *timespec
+	if ns >= 0 {
+		var ts timespec
+		var nsec int32
+		ns += nanotime() // relative ns becomes an absolute time
+		ts.set_sec(timediv(ns, 1000000000, &nsec))
+		ts.set_nsec(nsec)
+		tsp = &ts
+	}
+
+	for {
+		v := atomic.Load(&_g_.m.waitsemacount)
+		if v > 0 {
+			if atomic.Cas(&_g_.m.waitsemacount, v, v-1) {
+				return 0 // semaphore acquired
+			}
+			continue // lost the race for the count; reload and retry
+		}
+
+		// Sleep until unparked by semawakeup or timeout.
+		ret := lwp_park(tsp, 0, unsafe.Pointer(&_g_.m.waitsemacount), nil)
+		if ret == _ETIMEDOUT {
+			return -1 // timed out without acquiring the semaphore
+		}
+	}
+}
+
+//go:nosplit
+func semawakeup(mp *m) {
+	atomic.Xadd(&mp.waitsemacount, 1)
+	// From NetBSD's _lwp_unpark(2) manual: "If the target LWP is not currently
+	// waiting, it will return immediately upon the next call to _lwp_park()."
+	rc := lwp_unpark(int32(mp.procid), unsafe.Pointer(&mp.waitsemacount))
+	if rc == 0 || rc == _ESRCH {
+		return
+	}
+	// semawakeup can be called on signal stack.
+	systemstack(func() {
+		print("thrwakeup addr=", &mp.waitsemacount, " sem=", mp.waitsemacount, " ret=", rc, "\n")
+	})
+}
+
+// newosproc creates a new LWP that starts in netbsdMstart on stk.
+// May run with m.p==nil, so write barriers are not allowed.
+//go:nowritebarrier
+func newosproc(mp *m, stk unsafe.Pointer) {
+	if false {
+		print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " id=", mp.id, " ostk=", &mp, "\n")
+	}
+
+	// Start from the current context and give the new thread a
+	// signal mask of sigset_all before filling in its machine state.
+	var ctx ucontextt
+	getcontext(unsafe.Pointer(&ctx))
+	ctx.uc_sigmask = sigset_all
+	ctx.uc_link = nil
+	ctx.uc_flags = _UC_SIGMASK | _UC_CPU
+	lwp_mcontext_init(&ctx.uc_mcontext, stk, mp, mp.g0, funcPC(netbsdMstart))
+
+	if rc := lwp_create(unsafe.Pointer(&ctx), 0, unsafe.Pointer(&mp.procid)); rc < 0 {
+		print("runtime: failed to create new OS thread (have ", mcount()-1, " already; errno=", -rc, ")\n")
+		throw("runtime.newosproc")
+	}
+}
+
+// netbsdMstart is the function call that starts executing a newly
+// created thread. On NetBSD, a new thread inherits the signal stack
+// of the creating thread. That confuses minit, so we remove that
+// signal stack here before calling the regular mstart. It's a bit
+// baroque to remove a signal stack here only to add one in minit, but
+// it's a simple change that keeps NetBSD working like other OS's.
+// At this point all signals are blocked, so there is no race.
+//go:nosplit
+func netbsdMstart() {
+	signalstack(nil)
+	mstart()
+}
+
+func osinit() {
+	ncpu = getncpu()
+}
+
+var urandom_dev = []byte("/dev/urandom\x00")
+
+//go:nosplit
+func getRandomData(r []byte) {
+	fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
+	n := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
+	closefd(fd)
+	extendRandom(r, int(n))
+}
+
+func goenvs() {
+	goenvs_unix()
+}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
+func mpreinit(mp *m) {
+	mp.gsignal = malg(32 * 1024)
+	mp.gsignal.m = mp
+}
+
+//go:nosplit
+func msigsave(mp *m) {
+	sigprocmask(_SIG_SETMASK, nil, &mp.sigmask)
+}
+
+//go:nosplit
+func msigrestore(sigmask sigset) {
+	sigprocmask(_SIG_SETMASK, &sigmask, nil)
+}
+
+//go:nosplit
+func sigblock() {
+	sigprocmask(_SIG_SETMASK, &sigset_all, nil)
+}
+
+// minit initializes a new m (including the bootstrap m).
+// It runs on the new thread and cannot allocate memory.
+func minit() {
+	mp := getg().m
+	mp.procid = uint64(lwp_self())
+
+	// Initialize signal handling.
+
+	// On NetBSD a thread created by pthread_create inherits the
+	// signal stack of the creating thread. We always create a
+	// new signal stack here, to avoid having two Go threads using
+	// the same signal stack. This breaks the case of a thread
+	// created in C that calls sigaltstack and then calls a Go
+	// function, because we will lose track of the C code's
+	// sigaltstack, but it's the best we can do.
+	signalstack(&mp.gsignal.stack)
+	mp.newSigstack = true
+
+	// Take m.sigmask, drop every _SigUnblock signal, install the result.
+	set := mp.sigmask
+	for sig := range sigtable {
+		if sigtable[sig].flags&_SigUnblock != 0 {
+			set.__bits[(sig-1)/32] &^= 1 << ((uint32(sig) - 1) & 31)
+		}
+	}
+	sigprocmask(_SIG_SETMASK, &set, nil)
+}
+
+// unminit undoes the effect of an minit. It is called from dropm.
+//go:nosplit
+func unminit() {
+	if getg().m.newSigstack {
+		signalstack(nil) // disable the signal stack that minit installed
+	}
+}
+
+func memlimit() uintptr {
+	return 0
+}
+
+func sigtramp()
+
+type sigactiont struct {
+	sa_sigaction uintptr // handler address; setsig stores fn here
+	sa_mask      sigset  // setsig fills this with sigset_all
+	sa_flags     int32   // _SA_* flag bits
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func setsig(i int32, fn uintptr, restart bool) {
+	if fn == funcPC(sighandler) {
+		fn = funcPC(sigtramp) // sighandler is always registered as sigtramp
+	}
+	var act sigactiont
+	act.sa_sigaction = fn
+	act.sa_mask = sigset_all
+	act.sa_flags = _SA_SIGINFO | _SA_ONSTACK
+	if restart {
+		act.sa_flags |= _SA_RESTART
+	}
+	sigaction(i, &act, nil)
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func setsigstack(i int32) {
+	throw("setsigstack") // unconditional: nothing else is implemented here
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func getsig(i int32) uintptr {
+	var cur sigactiont
+	sigaction(i, nil, &cur) // nil new action: only the current one is read
+	if fn := cur.sa_sigaction; fn != funcPC(sigtramp) {
+		return fn
+	}
+	return funcPC(sighandler) // sigtramp stands for the Go handler
+}
+
+//go:nosplit
+func signalstack(s *stack) {
+	var alt sigaltstackt
+	alt.ss_flags = _SS_DISABLE // used as-is when s is nil
+	if s != nil {
+		// Otherwise describe the range from s.lo up to s.hi.
+		alt.ss_sp = s.lo
+		alt.ss_size = s.hi - s.lo
+		alt.ss_flags = 0
+	}
+	sigaltstack(&alt, nil)
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func updatesigmask(m sigmask) {
+	var set sigset
+	copy(set.__bits[:], m[:]) // carry the words of m over into a sigset
+	sigprocmask(_SIG_SETMASK, &set, nil)
+}
+
+func unblocksig(sig int32) {
+	var set sigset // starts all zero; only the bit for sig is set below
+	set.__bits[(sig-1)/32] |= 1 << ((uint32(sig) - 1) & 31)
+	sigprocmask(_SIG_UNBLOCK, &set, nil)
+}
diff --git a/src/runtime/os1_netbsd_386.go b/src/runtime/os_netbsd_386.go
index 037f7e36dc..037f7e36dc 100644
--- a/src/runtime/os1_netbsd_386.go
+++ b/src/runtime/os_netbsd_386.go
diff --git a/src/runtime/os1_netbsd_amd64.go b/src/runtime/os_netbsd_amd64.go
index 5118b0c4ff..5118b0c4ff 100644
--- a/src/runtime/os1_netbsd_amd64.go
+++ b/src/runtime/os_netbsd_amd64.go
diff --git a/src/runtime/os_windows.go b/src/runtime/os_windows.go
index 8bdf5a271f..9147091a49 100644
--- a/src/runtime/os_windows.go
+++ b/src/runtime/os_windows.go
@@ -1,17 +1,127 @@
-// Copyright 2014 The Go Authors. All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
+
+// TODO(brainman): should not need those
+const (
+	_NSIG = 65
+)
+
+//go:cgo_import_dynamic runtime._AddVectoredExceptionHandler AddVectoredExceptionHandler%2 "kernel32.dll"
+//go:cgo_import_dynamic runtime._CloseHandle CloseHandle%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._CreateEventA CreateEventA%4 "kernel32.dll"
+//go:cgo_import_dynamic runtime._CreateIoCompletionPort CreateIoCompletionPort%4 "kernel32.dll"
+//go:cgo_import_dynamic runtime._CreateThread CreateThread%6 "kernel32.dll"
+//go:cgo_import_dynamic runtime._CreateWaitableTimerA CreateWaitableTimerA%3 "kernel32.dll"
+//go:cgo_import_dynamic runtime._CryptAcquireContextW CryptAcquireContextW%5 "advapi32.dll"
+//go:cgo_import_dynamic runtime._CryptGenRandom CryptGenRandom%3 "advapi32.dll"
+//go:cgo_import_dynamic runtime._CryptReleaseContext CryptReleaseContext%2 "advapi32.dll"
+//go:cgo_import_dynamic runtime._DuplicateHandle DuplicateHandle%7 "kernel32.dll"
+//go:cgo_import_dynamic runtime._ExitProcess ExitProcess%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._FreeEnvironmentStringsW FreeEnvironmentStringsW%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._GetConsoleMode GetConsoleMode%2 "kernel32.dll"
+//go:cgo_import_dynamic runtime._GetEnvironmentStringsW GetEnvironmentStringsW%0 "kernel32.dll"
+//go:cgo_import_dynamic runtime._GetProcAddress GetProcAddress%2 "kernel32.dll"
+//go:cgo_import_dynamic runtime._GetProcessAffinityMask GetProcessAffinityMask%3 "kernel32.dll"
+//go:cgo_import_dynamic runtime._GetQueuedCompletionStatus GetQueuedCompletionStatus%5 "kernel32.dll"
+//go:cgo_import_dynamic runtime._GetStdHandle GetStdHandle%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._GetSystemInfo GetSystemInfo%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._GetThreadContext GetThreadContext%2 "kernel32.dll"
+//go:cgo_import_dynamic runtime._LoadLibraryW LoadLibraryW%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._LoadLibraryA LoadLibraryA%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._NtWaitForSingleObject NtWaitForSingleObject%3 "ntdll.dll"
+//go:cgo_import_dynamic runtime._ResumeThread ResumeThread%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._SetConsoleCtrlHandler SetConsoleCtrlHandler%2 "kernel32.dll"
+//go:cgo_import_dynamic runtime._SetErrorMode SetErrorMode%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._SetEvent SetEvent%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._SetProcessPriorityBoost SetProcessPriorityBoost%2 "kernel32.dll"
+//go:cgo_import_dynamic runtime._SetThreadPriority SetThreadPriority%2 "kernel32.dll"
+//go:cgo_import_dynamic runtime._SetUnhandledExceptionFilter SetUnhandledExceptionFilter%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._SetWaitableTimer SetWaitableTimer%6 "kernel32.dll"
+//go:cgo_import_dynamic runtime._SuspendThread SuspendThread%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._SwitchToThread SwitchToThread%0 "kernel32.dll"
+//go:cgo_import_dynamic runtime._VirtualAlloc VirtualAlloc%4 "kernel32.dll"
+//go:cgo_import_dynamic runtime._VirtualFree VirtualFree%3 "kernel32.dll"
+//go:cgo_import_dynamic runtime._WSAGetOverlappedResult WSAGetOverlappedResult%5 "ws2_32.dll"
+//go:cgo_import_dynamic runtime._WaitForSingleObject WaitForSingleObject%2 "kernel32.dll"
+//go:cgo_import_dynamic runtime._WriteConsoleW WriteConsoleW%5 "kernel32.dll"
+//go:cgo_import_dynamic runtime._WriteFile WriteFile%5 "kernel32.dll"
+//go:cgo_import_dynamic runtime._timeBeginPeriod timeBeginPeriod%1 "winmm.dll"
+
+type stdFunction unsafe.Pointer
+
+var (
+	// Following syscalls are available on every Windows PC.
+	// All these variables are set by the Windows executable
+	// loader before the Go program starts.
+	_AddVectoredExceptionHandler,
+	_CloseHandle,
+	_CreateEventA,
+	_CreateIoCompletionPort,
+	_CreateThread,
+	_CreateWaitableTimerA,
+	_CryptAcquireContextW,
+	_CryptGenRandom,
+	_CryptReleaseContext,
+	_DuplicateHandle,
+	_ExitProcess,
+	_FreeEnvironmentStringsW,
+	_GetConsoleMode,
+	_GetEnvironmentStringsW,
+	_GetProcAddress,
+	_GetProcessAffinityMask,
+	_GetQueuedCompletionStatus,
+	_GetStdHandle,
+	_GetSystemInfo,
+	_GetThreadContext,
+	_LoadLibraryW,
+	_LoadLibraryA,
+	_NtWaitForSingleObject,
+	_ResumeThread,
+	_SetConsoleCtrlHandler,
+	_SetErrorMode,
+	_SetEvent,
+	_SetProcessPriorityBoost,
+	_SetThreadPriority,
+	_SetUnhandledExceptionFilter,
+	_SetWaitableTimer,
+	_SuspendThread,
+	_SwitchToThread,
+	_VirtualAlloc,
+	_VirtualFree,
+	_WSAGetOverlappedResult,
+	_WaitForSingleObject,
+	_WriteConsoleW,
+	_WriteFile,
+	_timeBeginPeriod,
+	_ stdFunction
+
+	// Following syscalls are only available on some Windows PCs.
+	// We will load syscalls, if available, before using them.
+	_AddDllDirectory,
+	_AddVectoredContinueHandler,
+	_GetQueuedCompletionStatusEx,
+	_LoadLibraryExW,
+	_ stdFunction
+)
+
+// Function to be called by windows CreateThread
+// to start new os thread.
+func tstart_stdcall(newm *m) uint32
+
+func ctrlhandler(_type uint32) uint32
 
 type mOS struct {
 	waitsema uintptr // semaphore for parking on locks
 }
 
-type stdFunction *byte
-
 //go:linkname os_sigpipe os.sigpipe
 func os_sigpipe() {
 	throw("too many writes on closed pipe")
@@ -30,3 +140,605 @@ func read(fd int32, p unsafe.Pointer, n int32) int32 {
 	throw("unimplemented")
 	return -1
 }
+
+type sigset struct{}
+
+// Call a Windows function with stdcall conventions,
+// and switch to os stack during the call.
+func asmstdcall(fn unsafe.Pointer)
+
+var asmstdcallAddr unsafe.Pointer
+
+func windowsFindfunc(lib uintptr, name []byte) stdFunction {
+	if name[len(name)-1] != 0 {
+		throw("usage")
+	}
+	f := stdcall2(_GetProcAddress, lib, uintptr(unsafe.Pointer(&name[0])))
+	return stdFunction(unsafe.Pointer(f))
+}
+
+func loadOptionalSyscalls() {
+	var kernel32dll = []byte("kernel32.dll\000")
+	k32 := stdcall1(_LoadLibraryA, uintptr(unsafe.Pointer(&kernel32dll[0])))
+	if k32 == 0 {
+		throw("kernel32.dll not found")
+	}
+	_AddDllDirectory = windowsFindfunc(k32, []byte("AddDllDirectory\000"))
+	_AddVectoredContinueHandler = windowsFindfunc(k32, []byte("AddVectoredContinueHandler\000"))
+	_GetQueuedCompletionStatusEx = windowsFindfunc(k32, []byte("GetQueuedCompletionStatusEx\000"))
+	_LoadLibraryExW = windowsFindfunc(k32, []byte("LoadLibraryExW\000"))
+}
+
+//go:nosplit
+func getLoadLibrary() uintptr {
+	return uintptr(unsafe.Pointer(_LoadLibraryW))
+}
+
+//go:nosplit
+func getLoadLibraryEx() uintptr {
+	return uintptr(unsafe.Pointer(_LoadLibraryExW))
+}
+
+//go:nosplit
+func getGetProcAddress() uintptr {
+	return uintptr(unsafe.Pointer(_GetProcAddress))
+}
+
+func getproccount() int32 {
+	var mask, sysmask uintptr
+	ret := stdcall3(_GetProcessAffinityMask, currentProcess, uintptr(unsafe.Pointer(&mask)), uintptr(unsafe.Pointer(&sysmask)))
+	if ret != 0 {
+		n := 0
+		maskbits := int(unsafe.Sizeof(mask) * 8)
+		for i := 0; i < maskbits; i++ {
+			if mask&(1<<uint(i)) != 0 {
+				n++
+			}
+		}
+		if n != 0 {
+			return int32(n)
+		}
+	}
+	// use GetSystemInfo if GetProcessAffinityMask fails or reports an empty mask
+	var info systeminfo
+	stdcall1(_GetSystemInfo, uintptr(unsafe.Pointer(&info)))
+	return int32(info.dwnumberofprocessors)
+}
+
+const (
+	currentProcess = ^uintptr(0) // -1 = current process
+	currentThread  = ^uintptr(1) // -2 = current thread
+)
+
+// in sys_windows_386.s and sys_windows_amd64.s:
+func externalthreadhandler()
+func getlasterror() uint32
+func setlasterror(err uint32)
+
+// When loading DLLs, we prefer to use LoadLibraryEx with
+// LOAD_LIBRARY_SEARCH_* flags, if available. LoadLibraryEx is not
+// available on old Windows, though, and the LOAD_LIBRARY_SEARCH_*
+// flags are not available on some versions of Windows without a
+// security patch.
+//
+// https://msdn.microsoft.com/en-us/library/ms684179(v=vs.85).aspx says:
+// "Windows 7, Windows Server 2008 R2, Windows Vista, and Windows
+// Server 2008: The LOAD_LIBRARY_SEARCH_* flags are available on
+// systems that have KB2533623 installed. To determine whether the
+// flags are available, use GetProcAddress to get the address of the
+// AddDllDirectory, RemoveDllDirectory, or SetDefaultDllDirectories
+// function. If GetProcAddress succeeds, the LOAD_LIBRARY_SEARCH_*
+// flags can be used with LoadLibraryEx."
+var useLoadLibraryEx bool
+
+var timeBeginPeriodRetValue uint32
+
+func osinit() {
+	asmstdcallAddr = unsafe.Pointer(funcPC(asmstdcall))
+	usleep2Addr = unsafe.Pointer(funcPC(usleep2))
+	switchtothreadAddr = unsafe.Pointer(funcPC(switchtothread))
+
+	setBadSignalMsg()
+
+	loadOptionalSyscalls()
+
+	useLoadLibraryEx = (_LoadLibraryExW != nil && _AddDllDirectory != nil)
+
+	disableWER()
+
+	externalthreadhandlerp = funcPC(externalthreadhandler)
+
+	initExceptionHandler()
+
+	stdcall2(_SetConsoleCtrlHandler, funcPC(ctrlhandler), 1)
+
+	timeBeginPeriodRetValue = uint32(stdcall1(_timeBeginPeriod, 1))
+
+	ncpu = getproccount()
+
+	// Windows dynamic priority boosting assumes that a process has different types
+	// of dedicated threads -- GUI, IO, computational, etc. Go processes use
+	// equivalent threads that all do a mix of GUI, IO, computations, etc.
+	// In such context dynamic priority boosting does nothing but harm, so we turn it off.
+	stdcall2(_SetProcessPriorityBoost, currentProcess, 1)
+}
+
+//go:nosplit
+func getRandomData(r []byte) {
+	const (
+		prov_rsa_full       = 1
+		crypt_verifycontext = 0xF0000000
+	)
+	var handle uintptr
+	n := 0
+	if stdcall5(_CryptAcquireContextW, uintptr(unsafe.Pointer(&handle)), 0, 0, prov_rsa_full, crypt_verifycontext) != 0 {
+		if stdcall3(_CryptGenRandom, handle, uintptr(len(r)), uintptr(unsafe.Pointer(&r[0]))) != 0 {
+			n = len(r)
+		}
+		stdcall2(_CryptReleaseContext, handle, 0)
+	}
+	extendRandom(r, n)
+}
+
+func goenvs() {
+	// strings is a pointer to environment variable pairs in the form:
+	//     "envA=valA\x00envB=valB\x00\x00" (in UTF-16)
+	// Two consecutive zero UTF-16 code units (i.e. an empty string) end the list.
+	strings := unsafe.Pointer(stdcall0(_GetEnvironmentStringsW))
+	p := (*[1 << 24]uint16)(strings)[:]
+
+	n := 0
+	for from, i := 0, 0; true; i++ {
+		if p[i] == 0 {
+			// empty string marks the end
+			if i == from {
+				break
+			}
+			from = i + 1
+			n++
+		}
+	}
+	envs = make([]string, n)
+
+	for i := range envs {
+		envs[i] = gostringw(&p[0])
+		for p[0] != 0 {
+			p = p[1:]
+		}
+		p = p[1:] // skip nil byte
+	}
+
+	stdcall1(_FreeEnvironmentStringsW, uintptr(strings))
+}
+
+//go:nosplit
+func exit(code int32) {
+	stdcall1(_ExitProcess, uintptr(code))
+}
+
+//go:nosplit
+func write(fd uintptr, buf unsafe.Pointer, n int32) int32 {
+	const (
+		_STD_OUTPUT_HANDLE = ^uintptr(10) // -11
+		_STD_ERROR_HANDLE  = ^uintptr(11) // -12
+	)
+	var handle uintptr
+	switch fd {
+	case 1:
+		handle = stdcall1(_GetStdHandle, _STD_OUTPUT_HANDLE)
+	case 2:
+		handle = stdcall1(_GetStdHandle, _STD_ERROR_HANDLE)
+	default:
+		// assume fd is real windows handle.
+		handle = fd
+	}
+	isASCII := true
+	b := (*[1 << 30]byte)(buf)[:n]
+	for _, x := range b {
+		if x >= 0x80 {
+			isASCII = false
+			break
+		}
+	}
+
+	if !isASCII {
+		var m uint32
+		isConsole := stdcall2(_GetConsoleMode, handle, uintptr(unsafe.Pointer(&m))) != 0
+		// If this is a console output, various non-unicode code pages can be in use.
+		// Use the dedicated WriteConsole call to ensure unicode is printed correctly.
+		if isConsole {
+			return int32(writeConsole(handle, buf, n))
+		}
+	}
+	var written uint32
+	stdcall5(_WriteFile, handle, uintptr(buf), uintptr(n), uintptr(unsafe.Pointer(&written)), 0)
+	return int32(written)
+}
+
+var (
+	utf16ConsoleBack     [1000]uint16
+	utf16ConsoleBackLock mutex
+)
+
+// writeConsole writes bufLen bytes from buf to the console File.
+// It returns the number of bytes written.
+func writeConsole(handle uintptr, buf unsafe.Pointer, bufLen int32) int {
+	const surr2 = (surrogateMin + surrogateMax + 1) / 2
+
+	// Do not use defer for unlock. May cause issues when printing a panic.
+	lock(&utf16ConsoleBackLock)
+
+	b := (*[1 << 30]byte)(buf)[:bufLen]
+	s := *(*string)(unsafe.Pointer(&b))
+
+	utf16tmp := utf16ConsoleBack[:]
+
+	total := len(s)
+	w := 0
+	for len(s) > 0 {
+		if w >= len(utf16tmp)-2 {
+			writeConsoleUTF16(handle, utf16tmp[:w])
+			w = 0
+		}
+		r, n := charntorune(s)
+		s = s[n:]
+		if r < 0x10000 {
+			utf16tmp[w] = uint16(r)
+			w++
+		} else {
+			r -= 0x10000
+			utf16tmp[w] = surrogateMin + uint16(r>>10)&0x3ff
+			utf16tmp[w+1] = surr2 + uint16(r)&0x3ff
+			w += 2
+		}
+	}
+	writeConsoleUTF16(handle, utf16tmp[:w])
+	unlock(&utf16ConsoleBackLock)
+	return total
+}
+
+// writeConsoleUTF16 is the dedicated Windows call that correctly prints
+// to the console regardless of the current code page. Input is UTF-16 code units.
+// The handle must be a console handle.
+func writeConsoleUTF16(handle uintptr, b []uint16) {
+	l := uint32(len(b))
+	if l == 0 {
+		return
+	}
+	var written uint32
+	stdcall5(_WriteConsoleW,
+		handle,
+		uintptr(unsafe.Pointer(&b[0])),
+		uintptr(l),
+		uintptr(unsafe.Pointer(&written)),
+		0,
+	)
+	return
+}
+
+//go:nosplit
+func semasleep(ns int64) int32 {
+	// store ms in ns to save stack space
+	if ns < 0 {
+		ns = _INFINITE
+	} else {
+		ns = int64(timediv(ns, 1000000, nil))
+		if ns == 0 {
+			ns = 1
+		}
+	}
+	if stdcall2(_WaitForSingleObject, getg().m.waitsema, uintptr(ns)) != 0 {
+		return -1 // timeout
+	}
+	return 0
+}
+
+//go:nosplit
+func semawakeup(mp *m) {
+	stdcall1(_SetEvent, mp.waitsema)
+}
+
+//go:nosplit
+func semacreate(mp *m) {
+	if mp.waitsema != 0 {
+		return
+	}
+	mp.waitsema = stdcall4(_CreateEventA, 0, 0, 0, 0)
+}
+
+// May run with m.p==nil, so write barriers are not allowed. This
+// function is called by newosproc0, so it is also required to
+// operate without stack guards.
+//go:nowritebarrierc
+//go:nosplit
+func newosproc(mp *m, stk unsafe.Pointer) {
+	const _STACK_SIZE_PARAM_IS_A_RESERVATION = 0x00010000
+	thandle := stdcall6(_CreateThread, 0, 0x20000,
+		funcPC(tstart_stdcall), uintptr(unsafe.Pointer(mp)),
+		_STACK_SIZE_PARAM_IS_A_RESERVATION, 0)
+	if thandle == 0 {
+		print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", getlasterror(), ")\n")
+		throw("runtime.newosproc")
+	}
+}
+
+// Used by the C library build mode. On Linux this function would allocate a
+// stack, but that's not necessary for Windows. No stack guards are present
+// and the GC has not been initialized, so write barriers will fail.
+//go:nowritebarrierc
+//go:nosplit
+func newosproc0(mp *m, stk unsafe.Pointer) {
+	newosproc(mp, stk)
+}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
+func mpreinit(mp *m) {
+}
+
+//go:nosplit
+func msigsave(mp *m) {
+}
+
+//go:nosplit
+func msigrestore(sigmask sigset) {
+}
+
+//go:nosplit
+func sigblock() {
+}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the new thread, cannot allocate memory.
+func minit() {
+	var thandle uintptr
+	stdcall7(_DuplicateHandle, currentProcess, currentThread, currentProcess, uintptr(unsafe.Pointer(&thandle)), 0, 0, _DUPLICATE_SAME_ACCESS)
+	atomic.Storeuintptr(&getg().m.thread, thandle)
+}
+
+// Called from dropm to undo the effect of an minit.
+//go:nosplit
+func unminit() {
+	tp := &getg().m.thread
+	stdcall1(_CloseHandle, *tp)
+	*tp = 0
+}
+
+// Described in http://www.dcl.hpi.uni-potsdam.de/research/WRK/2007/08/getting-os-information-the-kuser_shared_data-structure/
+type _KSYSTEM_TIME struct {
+	LowPart   uint32
+	High1Time int32
+	High2Time int32
+}
+
+const (
+	_INTERRUPT_TIME = 0x7ffe0008
+	_SYSTEM_TIME    = 0x7ffe0014
+)
+
+//go:nosplit
+func systime(addr uintptr) int64 {
+	timeaddr := (*_KSYSTEM_TIME)(unsafe.Pointer(addr))
+
+	var t _KSYSTEM_TIME
+	for i := 1; i < 10000; i++ {
+		// these fields must be read in this order (see URL above)
+		t.High1Time = timeaddr.High1Time
+		t.LowPart = timeaddr.LowPart
+		t.High2Time = timeaddr.High2Time
+		if t.High1Time == t.High2Time {
+			return int64(t.High1Time)<<32 | int64(t.LowPart)
+		}
+		if (i % 100) == 0 {
+			osyield()
+		}
+	}
+	systemstack(func() {
+		throw("interrupt/system time is changing too fast")
+	})
+	return 0
+}
+
+//go:nosplit
+func unixnano() int64 {
+	return (systime(_SYSTEM_TIME) - 116444736000000000) * 100
+}
+
+//go:nosplit
+func nanotime() int64 {
+	return systime(_INTERRUPT_TIME) * 100
+}
+
+// Calling stdcall on os stack.
+// May run during STW, so write barriers are not allowed.
+//go:nowritebarrier
+//go:nosplit
+func stdcall(fn stdFunction) uintptr {
+	gp := getg()
+	mp := gp.m
+	mp.libcall.fn = uintptr(unsafe.Pointer(fn))
+
+	if mp.profilehz != 0 {
+		// leave pc/sp for cpu profiler
+		mp.libcallg.set(gp)
+		mp.libcallpc = getcallerpc(unsafe.Pointer(&fn))
+		// sp must be the last, because once async cpu profiler finds
+		// all three values to be non-zero, it will use them
+		mp.libcallsp = getcallersp(unsafe.Pointer(&fn))
+	}
+	asmcgocall(asmstdcallAddr, unsafe.Pointer(&mp.libcall))
+	mp.libcallsp = 0
+	return mp.libcall.r1
+}
+
+//go:nosplit
+func stdcall0(fn stdFunction) uintptr {
+	mp := getg().m
+	mp.libcall.n = 0
+	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&fn))) // it's unused but must be non-nil, otherwise crashes
+	return stdcall(fn)
+}
+
+//go:nosplit
+func stdcall1(fn stdFunction, a0 uintptr) uintptr {
+	mp := getg().m
+	mp.libcall.n = 1
+	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
+	return stdcall(fn)
+}
+
+//go:nosplit
+func stdcall2(fn stdFunction, a0, a1 uintptr) uintptr {
+	mp := getg().m
+	mp.libcall.n = 2
+	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
+	return stdcall(fn)
+}
+
+//go:nosplit
+func stdcall3(fn stdFunction, a0, a1, a2 uintptr) uintptr {
+	mp := getg().m
+	mp.libcall.n = 3
+	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
+	return stdcall(fn)
+}
+
+//go:nosplit
+func stdcall4(fn stdFunction, a0, a1, a2, a3 uintptr) uintptr {
+	mp := getg().m
+	mp.libcall.n = 4
+	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
+	return stdcall(fn)
+}
+
+//go:nosplit
+func stdcall5(fn stdFunction, a0, a1, a2, a3, a4 uintptr) uintptr {
+	mp := getg().m
+	mp.libcall.n = 5
+	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
+	return stdcall(fn)
+}
+
+//go:nosplit
+func stdcall6(fn stdFunction, a0, a1, a2, a3, a4, a5 uintptr) uintptr {
+	mp := getg().m
+	mp.libcall.n = 6
+	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
+	return stdcall(fn)
+}
+
+//go:nosplit
+func stdcall7(fn stdFunction, a0, a1, a2, a3, a4, a5, a6 uintptr) uintptr {
+	mp := getg().m
+	mp.libcall.n = 7
+	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
+	return stdcall(fn)
+}
+
+// in sys_windows_386.s and sys_windows_amd64.s
+func onosstack(fn unsafe.Pointer, arg uint32)
+func usleep2(usec uint32)
+func switchtothread()
+
+var usleep2Addr unsafe.Pointer
+var switchtothreadAddr unsafe.Pointer
+
+//go:nosplit
+func osyield() {
+	onosstack(switchtothreadAddr, 0)
+}
+
+//go:nosplit
+func usleep(us uint32) {
+	// Have 1us units; want 100ns units.
+	onosstack(usleep2Addr, 10*us)
+}
+
+func ctrlhandler1(_type uint32) uint32 {
+	var s uint32
+
+	switch _type {
+	case _CTRL_C_EVENT, _CTRL_BREAK_EVENT:
+		s = _SIGINT
+	default:
+		return 0
+	}
+
+	if sigsend(s) {
+		return 1
+	}
+	exit(2) // SIGINT, SIGTERM, etc
+	return 0
+}
+
+// in sys_windows_386.s and sys_windows_amd64.s
+func profileloop()
+
+var profiletimer uintptr
+
+func profilem(mp *m) {
+	var r *context
+	rbuf := make([]byte, unsafe.Sizeof(*r)+15)
+
+	tls := &mp.tls[0]
+	gp := *((**g)(unsafe.Pointer(tls)))
+
+	// align Context to 16 bytes
+	r = (*context)(unsafe.Pointer((uintptr(unsafe.Pointer(&rbuf[15]))) &^ 15))
+	r.contextflags = _CONTEXT_CONTROL
+	stdcall2(_GetThreadContext, mp.thread, uintptr(unsafe.Pointer(r)))
+	sigprof(r.ip(), r.sp(), 0, gp, mp)
+}
+
+func profileloop1(param uintptr) uint32 {
+	stdcall2(_SetThreadPriority, currentThread, _THREAD_PRIORITY_HIGHEST)
+
+	for {
+		stdcall2(_WaitForSingleObject, profiletimer, _INFINITE)
+		first := (*m)(atomic.Loadp(unsafe.Pointer(&allm)))
+		for mp := first; mp != nil; mp = mp.alllink {
+			thread := atomic.Loaduintptr(&mp.thread)
+			// Do not profile threads blocked on Notes,
+			// this includes idle worker threads,
+			// idle timer thread, idle heap scavenger, etc.
+			if thread == 0 || mp.profilehz == 0 || mp.blocked {
+				continue
+			}
+			stdcall1(_SuspendThread, thread)
+			if mp.profilehz != 0 && !mp.blocked {
+				profilem(mp)
+			}
+			stdcall1(_ResumeThread, thread)
+		}
+	}
+}
+
+var cpuprofilerlock mutex
+
+func resetcpuprofiler(hz int32) {
+	lock(&cpuprofilerlock)
+	if profiletimer == 0 {
+		timer := stdcall3(_CreateWaitableTimerA, 0, 0, 0)
+		atomic.Storeuintptr(&profiletimer, timer)
+		thread := stdcall6(_CreateThread, 0, 0, funcPC(profileloop), 0, 0, 0)
+		stdcall2(_SetThreadPriority, thread, _THREAD_PRIORITY_HIGHEST)
+		stdcall1(_CloseHandle, thread)
+	}
+	unlock(&cpuprofilerlock)
+
+	ms := int32(0)
+	due := ^int64(^uint64(1 << 63))
+	if hz > 0 {
+		ms = 1000 / hz
+		if ms == 0 {
+			ms = 1
+		}
+		due = int64(ms) * -10000
+	}
+	stdcall6(_SetWaitableTimer, profiletimer, uintptr(unsafe.Pointer(&due)), uintptr(ms), 0, 0, 0)
+	atomic.Store((*uint32)(unsafe.Pointer(&getg().m.profilehz)), uint32(hz))
+}
+
+func memlimit() uintptr {
+	return 0
+}
diff --git a/src/runtime/panic.go b/src/runtime/panic.go
index 59fbc83369..382a20e4e7 100644
--- a/src/runtime/panic.go
+++ b/src/runtime/panic.go
@@ -205,7 +205,7 @@ func newdefer(siz int32) *_defer {
 	if d == nil {
 		// Allocate new defer+args.
 		total := roundupsize(totaldefersize(uintptr(siz)))
-		d = (*_defer)(mallocgc(total, deferType, 0))
+		d = (*_defer)(mallocgc(total, deferType, true))
 	}
 	d.siz = siz
 	gp := mp.curg
diff --git a/src/runtime/pprof/mprof_test.go b/src/runtime/pprof/mprof_test.go
index d15102c703..0fff9d46d9 100644
--- a/src/runtime/pprof/mprof_test.go
+++ b/src/runtime/pprof/mprof_test.go
@@ -82,7 +82,7 @@ func TestMemoryProfiler(t *testing.T) {
 #	0x[0-9,a-f]+	runtime/pprof_test\.TestMemoryProfiler\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:61
 `, (1<<10)*memoryProfilerRun, (1<<20)*memoryProfilerRun),
 
-		fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
+		fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
 #	0x[0-9,a-f]+	runtime/pprof_test\.allocateTransient2M\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:27
 #	0x[0-9,a-f]+	runtime/pprof_test\.TestMemoryProfiler\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:62
 `, memoryProfilerRun, (2<<20)*memoryProfilerRun),
diff --git a/src/runtime/pprof/pprof_test.go b/src/runtime/pprof/pprof_test.go
index fa0af59b37..8b2f3d5291 100644
--- a/src/runtime/pprof/pprof_test.go
+++ b/src/runtime/pprof/pprof_test.go
@@ -530,15 +530,20 @@ func blockChanClose() {
 }
 
 func blockSelectRecvAsync() {
+	const numTries = 3
 	c := make(chan bool, 1)
 	c2 := make(chan bool, 1)
 	go func() {
-		time.Sleep(blockDelay)
-		c <- true
+		for i := 0; i < numTries; i++ {
+			time.Sleep(blockDelay)
+			c <- true
+		}
 	}()
-	select {
-	case <-c:
-	case <-c2:
+	for i := 0; i < numTries; i++ {
+		select {
+		case <-c:
+		case <-c2:
+		}
 	}
 }
 
@@ -585,6 +590,9 @@ func func3(c chan int) { <-c }
 func func4(c chan int) { <-c }
 
 func TestGoroutineCounts(t *testing.T) {
+	if runtime.GOOS == "openbsd" {
+		testenv.SkipFlaky(t, 15156)
+	}
 	c := make(chan int)
 	for i := 0; i < 100; i++ {
 		if i%10 == 0 {
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 5145c84aea..ee732e3cf7 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -381,7 +381,7 @@ func badmcall2(fn func(*g)) {
 }
 
 func badreflectcall() {
-	panic("runtime: arg size to reflect.call more than 1GB")
+	panic(plainError("arg size to reflect.call more than 1GB"))
 }
 
 func lockedOSThread() bool {
@@ -402,6 +402,16 @@ func allgadd(gp *g) {
 	lock(&allglock)
 	allgs = append(allgs, gp)
 	allglen = uintptr(len(allgs))
+
+	// Grow GC rescan list if necessary.
+	if len(allgs) > cap(work.rescan.list) {
+		lock(&work.rescan.lock)
+		l := work.rescan.list
+		// Let append do the heavy lifting, but keep the
+		// length the same.
+		work.rescan.list = append(l[:cap(l)], 0)[:len(l)]
+		unlock(&work.rescan.lock)
+	}
 	unlock(&allglock)
 }
 
@@ -435,9 +445,10 @@ func schedinit() {
 	tracebackinit()
 	moduledataverify()
 	stackinit()
-	itabsinit()
 	mallocinit()
 	mcommoninit(_g_.m)
+	typelinksinit()
+	itabsinit()
 
 	msigsave(_g_.m)
 	initSigmask = _g_.m.sigmask
@@ -449,6 +460,9 @@ func schedinit() {
 
 	sched.lastpoll = uint64(nanotime())
 	procs := int(ncpu)
+	if procs > _MaxGomaxprocs {
+		procs = _MaxGomaxprocs
+	}
 	if n := atoi(gogetenv("GOMAXPROCS")); n > 0 {
 		if n > _MaxGomaxprocs {
 			n = _MaxGomaxprocs
@@ -639,17 +653,17 @@ func readgstatus(gp *g) uint32 {
 	return atomic.Load(&gp.atomicstatus)
 }
 
-// Ownership of gscanvalid:
+// Ownership of gcscanvalid:
 //
 // If gp is running (meaning status == _Grunning or _Grunning|_Gscan),
-// then gp owns gp.gscanvalid, and other goroutines must not modify it.
+// then gp owns gp.gcscanvalid, and other goroutines must not modify it.
 //
 // Otherwise, a second goroutine can lock the scan state by setting _Gscan
-// in the status bit and then modify gscanvalid, and then unlock the scan state.
+// in the status bit and then modify gcscanvalid, and then unlock the scan state.
 //
 // Note that the first condition implies an exception to the second:
 // if a second goroutine changes gp's status to _Grunning|_Gscan,
-// that second goroutine still does not have the right to modify gscanvalid.
+// that second goroutine still does not have the right to modify gcscanvalid.
 
 // The Gscanstatuses are acting like locks and this releases them.
 // If it proves to be a performance hit we should be able to make these
@@ -677,9 +691,6 @@ func casfrom_Gscanstatus(gp *g, oldval, newval uint32) {
 		dumpgstatus(gp)
 		throw("casfrom_Gscanstatus: gp->status is not in scan state")
 	}
-	if newval == _Grunning {
-		gp.gcscanvalid = false
-	}
 }
 
 // This will return false if the gp is not in the expected status and the cas fails.
@@ -753,8 +764,9 @@ func casgstatus(gp *g, oldval, newval uint32) {
 			nextYield = nanotime() + yieldDelay/2
 		}
 	}
-	if newval == _Grunning {
-		gp.gcscanvalid = false
+	if newval == _Grunning && gp.gcscanvalid {
+		// Run queueRescan on the system stack so it has more space.
+		systemstack(func() { queueRescan(gp) })
 	}
 }
 
@@ -1404,6 +1416,8 @@ func newextram() {
 	gp.syscallpc = gp.sched.pc
 	gp.syscallsp = gp.sched.sp
 	gp.stktopsp = gp.sched.sp
+	gp.gcscanvalid = true // fresh G, so no dequeueRescan necessary
+	gp.gcRescan = -1
 	// malg returns status as Gidle, change to Gsyscall before adding to allg
 	// where GC will see it.
 	casgstatus(gp, _Gidle, _Gsyscall)
@@ -1792,23 +1806,7 @@ func execute(gp *g, inheritTime bool) {
 		// GoSysExit has to happen when we have a P, but before GoStart.
 		// So we emit it here.
 		if gp.syscallsp != 0 && gp.sysblocktraced {
-			// Since gp.sysblocktraced is true, we must emit an event.
-			// There is a race between the code that initializes sysexitseq
-			// and sysexitticks (in exitsyscall, which runs without a P,
-			// and therefore is not stopped with the rest of the world)
-			// and the code that initializes a new trace.
-			// The recorded sysexitseq and sysexitticks must therefore
-			// be treated as "best effort". If they are valid for this trace,
-			// then great, use them for greater accuracy.
-			// But if they're not valid for this trace, assume that the
-			// trace was started after the actual syscall exit (but before
-			// we actually managed to start the goroutine, aka right now),
-			// and assign a fresh time stamp to keep the log consistent.
-			seq, ts := gp.sysexitseq, gp.sysexitticks
-			if seq == 0 || int64(seq)-int64(trace.seqStart) < 0 {
-				seq, ts = tracestamp()
-			}
-			traceGoSysExit(seq, ts)
+			traceGoSysExit(gp.sysexitticks)
 		}
 		traceGoStart()
 	}
@@ -2225,6 +2223,10 @@ func goexit0(gp *g) {
 	gp.waitreason = ""
 	gp.param = nil
 
+	// Note that gp's stack scan is now "valid" because it has no
+	// stack. We could dequeueRescan, but that takes a lock and
+	// isn't really necessary.
+	gp.gcscanvalid = true
 	dropg()
 
 	if _g_.m.locked&^_LockExternal != 0 {
@@ -2477,7 +2479,6 @@ func exitsyscall(dummy int32) {
 	}
 
 	_g_.sysexitticks = 0
-	_g_.sysexitseq = 0
 	if trace.enabled {
 		// Wait till traceGoSysBlock event is emitted.
 		// This ensures consistency of the trace (the goroutine is started after it is blocked).
@@ -2488,7 +2489,7 @@ func exitsyscall(dummy int32) {
 		// Tracing code can invoke write barriers that cannot run without a P.
 		// So instead we remember the syscall exit time and emit the event
 		// in execute when we have a P.
-		_g_.sysexitseq, _g_.sysexitticks = tracestamp()
+		_g_.sysexitticks = cputicks()
 	}
 
 	_g_.m.locks--
@@ -2536,7 +2537,7 @@ func exitsyscallfast() bool {
 					// Denote blocking of the new syscall.
 					traceGoSysBlock(_g_.m.p.ptr())
 					// Denote completion of the current syscall.
-					traceGoSysExit(tracestamp())
+					traceGoSysExit(0)
 				})
 			}
 			_g_.m.p.ptr().syscalltick++
@@ -2560,7 +2561,7 @@ func exitsyscallfast() bool {
 						osyield()
 					}
 				}
-				traceGoSysExit(tracestamp())
+				traceGoSysExit(0)
 			}
 		})
 		if ok {
@@ -2716,6 +2717,7 @@ func newproc1(fn *funcval, argp *uint8, narg int32, nret int32, callerpc uintptr
 	if newg == nil {
 		newg = malg(_StackMin)
 		casgstatus(newg, _Gidle, _Gdead)
+		newg.gcRescan = -1
 		allgadd(newg) // publishes with a g->status of Gdead so GC scanner doesn't look at uninitialized stack.
 	}
 	if newg.stack.hi == 0 {
@@ -2749,6 +2751,17 @@ func newproc1(fn *funcval, argp *uint8, narg int32, nret int32, callerpc uintptr
 	if isSystemGoroutine(newg) {
 		atomic.Xadd(&sched.ngsys, +1)
 	}
+	// The stack is dirty from the argument frame, so queue it for
+	// scanning. Do this before setting it to runnable so we still
+	// own the G. If we're recycling a G, it may already be on the
+	// rescan list.
+	if newg.gcRescan == -1 {
+		queueRescan(newg)
+	} else {
+		// The recycled G is already on the rescan list. Just
+		// mark the stack dirty.
+		newg.gcscanvalid = false
+	}
 	casgstatus(newg, _Gdead, _Grunnable)
 
 	if _p_.goidcache == _p_.goidcacheend {
@@ -2811,8 +2824,13 @@ func gfput(_p_ *p, gp *g) {
 			_p_.gfreecnt--
 			gp = _p_.gfree
 			_p_.gfree = gp.schedlink.ptr()
-			gp.schedlink.set(sched.gfree)
-			sched.gfree = gp
+			if gp.stack.lo == 0 {
+				gp.schedlink.set(sched.gfreeNoStack)
+				sched.gfreeNoStack = gp
+			} else {
+				gp.schedlink.set(sched.gfreeStack)
+				sched.gfreeStack = gp
+			}
 			sched.ngfree++
 		}
 		unlock(&sched.gflock)
@@ -2824,12 +2842,20 @@ func gfput(_p_ *p, gp *g) {
 func gfget(_p_ *p) *g {
 retry:
 	gp := _p_.gfree
-	if gp == nil && sched.gfree != nil {
+	if gp == nil && (sched.gfreeStack != nil || sched.gfreeNoStack != nil) {
 		lock(&sched.gflock)
-		for _p_.gfreecnt < 32 && sched.gfree != nil {
+		for _p_.gfreecnt < 32 {
+			if sched.gfreeStack != nil {
+				// Prefer Gs with stacks.
+				gp = sched.gfreeStack
+				sched.gfreeStack = gp.schedlink.ptr()
+			} else if sched.gfreeNoStack != nil {
+				gp = sched.gfreeNoStack
+				sched.gfreeNoStack = gp.schedlink.ptr()
+			} else {
+				break
+			}
 			_p_.gfreecnt++
-			gp = sched.gfree
-			sched.gfree = gp.schedlink.ptr()
 			sched.ngfree--
 			gp.schedlink.set(_p_.gfree)
 			_p_.gfree = gp
@@ -2866,8 +2892,13 @@ func gfpurge(_p_ *p) {
 		_p_.gfreecnt--
 		gp := _p_.gfree
 		_p_.gfree = gp.schedlink.ptr()
-		gp.schedlink.set(sched.gfree)
-		sched.gfree = gp
+		if gp.stack.lo == 0 {
+			gp.schedlink.set(sched.gfreeNoStack)
+			sched.gfreeNoStack = gp
+		} else {
+			gp.schedlink.set(sched.gfreeStack)
+			sched.gfreeStack = gp
+		}
 		sched.ngfree++
 	}
 	unlock(&sched.gflock)
diff --git a/src/runtime/race/testdata/io_test.go b/src/runtime/race/testdata/io_test.go
index 1b3ee3822b..30a121bee4 100644
--- a/src/runtime/race/testdata/io_test.go
+++ b/src/runtime/race/testdata/io_test.go
@@ -7,9 +7,11 @@ package race_test
 import (
 	"fmt"
 	"io/ioutil"
+	"net"
 	"net/http"
 	"os"
 	"path/filepath"
+	"sync"
 	"testing"
 	"time"
 )
@@ -41,29 +43,34 @@ func TestNoRaceIOFile(t *testing.T) {
 	_ = x
 }
 
+var (
+	regHandler  sync.Once
+	handlerData int
+)
+
 func TestNoRaceIOHttp(t *testing.T) {
-	x := 0
-	go func() {
+	regHandler.Do(func() {
 		http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
-			x = 41
+			handlerData++
 			fmt.Fprintf(w, "test")
-			x = 42
+			handlerData++
 		})
-		err := http.ListenAndServe("127.0.0.1:23651", nil)
-		if err != nil {
-			t.Fatalf("http.ListenAndServe: %v", err)
-		}
-	}()
-	time.Sleep(1e7)
-	x = 1
-	_, err := http.Get("http://127.0.0.1:23651")
+	})
+	ln, err := net.Listen("tcp", "127.0.0.1:0")
+	if err != nil {
+		t.Fatalf("net.Listen: %v", err)
+	}
+	defer ln.Close()
+	go http.Serve(ln, nil)
+	handlerData++
+	_, err = http.Get("http://" + ln.Addr().String())
 	if err != nil {
 		t.Fatalf("http.Get: %v", err)
 	}
-	x = 2
-	_, err = http.Get("http://127.0.0.1:23651")
+	handlerData++
+	_, err = http.Get("http://" + ln.Addr().String())
 	if err != nil {
 		t.Fatalf("http.Get: %v", err)
 	}
-	x = 3
+	handlerData++
 }
diff --git a/src/runtime/rdebug.go b/src/runtime/rdebug.go
index d966734813..1b213f1934 100644
--- a/src/runtime/rdebug.go
+++ b/src/runtime/rdebug.go
@@ -15,9 +15,8 @@ func setMaxStack(in int) (out int) {
 
 //go:linkname setPanicOnFault runtime/debug.setPanicOnFault
 func setPanicOnFault(new bool) (old bool) {
-	mp := acquirem()
-	old = mp.curg.paniconfault
-	mp.curg.paniconfault = new
-	releasem(mp)
+	_g_ := getg()
+	old = _g_.paniconfault
+	_g_.paniconfault = new
 	return old
 }
diff --git a/src/runtime/rt0_linux_s390x.s b/src/runtime/rt0_linux_s390x.s
new file mode 100644
index 0000000000..aedd6c7ef2
--- /dev/null
+++ b/src/runtime/rt0_linux_s390x.s
@@ -0,0 +1,20 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+TEXT _rt0_s390x_linux(SB),NOSPLIT|NOFRAME,$0
+	// In a statically linked binary, the stack contains argc,
+	// argv as argc string pointers followed by a NULL, envv as a
+	// sequence of string pointers followed by a NULL, and auxv.
+	// There is no TLS base pointer.
+	//
+	// TODO: Support dynamic linking entry point
+	MOVD 0(R15), R2 // argc
+	ADD $8, R15, R3 // argv
+	BR main(SB)
+
+TEXT main(SB),NOSPLIT|NOFRAME,$0
+	MOVD	$runtime·rt0_go(SB), R11
+	BR	R11
diff --git a/src/runtime/runtime-gdb_test.go b/src/runtime/runtime-gdb_test.go
index 110d99064f..4f82646dbb 100644
--- a/src/runtime/runtime-gdb_test.go
+++ b/src/runtime/runtime-gdb_test.go
@@ -3,6 +3,7 @@ package runtime_test
 import (
 	"bytes"
 	"fmt"
+	"internal/testenv"
 	"io/ioutil"
 	"os"
 	"os/exec"
@@ -13,19 +14,22 @@ import (
 	"testing"
 )
 
-func checkGdbPython(t *testing.T) {
-	cmd := exec.Command("gdb", "-nx", "-q", "--batch", "-iex", "python import sys; print('go gdb python support')")
-	out, err := cmd.CombinedOutput()
-
-	if err != nil {
-		t.Skipf("skipping due to issue running gdb: %v", err)
+func checkGdbEnvironment(t *testing.T) {
+	testenv.MustHaveGoBuild(t)
+	if runtime.GOOS == "darwin" {
+		t.Skip("gdb does not work on darwin")
 	}
-	if string(out) != "go gdb python support\n" {
-		t.Skipf("skipping due to lack of python gdb support: %s", out)
+	if final := os.Getenv("GOROOT_FINAL"); final != "" && runtime.GOROOT() != final {
+		t.Skip("gdb test can fail with GOROOT_FINAL pending")
 	}
+}
 
+func checkGdbVersion(t *testing.T) {
 	// Issue 11214 reports various failures with older versions of gdb.
-	out, err = exec.Command("gdb", "--version").CombinedOutput()
+	out, err := exec.Command("gdb", "--version").CombinedOutput()
+	if err != nil {
+		t.Skipf("skipping: error executing gdb: %v", err)
+	}
 	re := regexp.MustCompile(`([0-9]+)\.([0-9]+)`)
 	matches := re.FindSubmatch(out)
 	if len(matches) < 3 {
@@ -42,6 +46,18 @@ func checkGdbPython(t *testing.T) {
 	t.Logf("gdb version %d.%d", major, minor)
 }
 
+func checkGdbPython(t *testing.T) {
+	cmd := exec.Command("gdb", "-nx", "-q", "--batch", "-iex", "python import sys; print('go gdb python support')")
+	out, err := cmd.CombinedOutput()
+
+	if err != nil {
+		t.Skipf("skipping due to issue running gdb: %v", err)
+	}
+	if string(out) != "go gdb python support\n" {
+		t.Skipf("skipping due to lack of python gdb support: %s", out)
+	}
+}
+
 const helloSource = `
 package main
 import "fmt"
@@ -57,13 +73,8 @@ func main() {
 `
 
 func TestGdbPython(t *testing.T) {
-	if runtime.GOOS == "darwin" {
-		t.Skip("gdb does not work on darwin")
-	}
-	if final := os.Getenv("GOROOT_FINAL"); final != "" && runtime.GOROOT() != final {
-		t.Skip("gdb test can fail with GOROOT_FINAL pending")
-	}
-
+	checkGdbEnvironment(t)
+	checkGdbVersion(t)
 	checkGdbPython(t)
 
 	dir, err := ioutil.TempDir("", "go-build")
@@ -104,7 +115,7 @@ func TestGdbPython(t *testing.T) {
 	// stack frames on RISC architectures.
 	canBackTrace := false
 	switch runtime.GOARCH {
-	case "amd64", "386", "ppc64", "ppc64le", "arm", "arm64", "mips64", "mips64le":
+	case "amd64", "386", "ppc64", "ppc64le", "arm", "arm64", "mips64", "mips64le", "s390x":
 		canBackTrace = true
 		args = append(args,
 			"-ex", "echo BEGIN goroutine 2 bt\n",
@@ -162,3 +173,82 @@ func TestGdbPython(t *testing.T) {
 		t.Logf("gdb cannot backtrace for GOARCH=%s, skipped goroutine backtrace test", runtime.GOARCH)
 	}
 }
+
+const backtraceSource = `
+package main
+
+//go:noinline
+func aaa() bool { return bbb() }
+
+//go:noinline
+func bbb() bool { return ccc() }
+
+//go:noinline
+func ccc() bool { return ddd() }
+
+//go:noinline
+func ddd() bool { return f() }
+
+//go:noinline
+func eee() bool { return true }
+
+var f = eee
+
+func main() {
+	_ = aaa()
+}
+`
+
+// TestGdbBacktrace tests that gdb can unwind the stack correctly
+// using only the DWARF debug info.
+func TestGdbBacktrace(t *testing.T) {
+	checkGdbEnvironment(t)
+	checkGdbVersion(t)
+
+	dir, err := ioutil.TempDir("", "go-build")
+	if err != nil {
+		t.Fatalf("failed to create temp directory: %v", err)
+	}
+	defer os.RemoveAll(dir)
+
+	// Build the source code.
+	src := filepath.Join(dir, "main.go")
+	err = ioutil.WriteFile(src, []byte(backtraceSource), 0644)
+	if err != nil {
+		t.Fatalf("failed to create file: %v", err)
+	}
+	cmd := exec.Command("go", "build", "-o", "a.exe")
+	cmd.Dir = dir
+	out, err := testEnv(cmd).CombinedOutput()
+	if err != nil {
+		t.Fatalf("building source %v\n%s", err, out)
+	}
+
+	// Execute gdb commands.
+	args := []string{"-nx", "-batch",
+		"-ex", "break main.eee",
+		"-ex", "run",
+		"-ex", "backtrace",
+		"-ex", "continue",
+		filepath.Join(dir, "a.exe"),
+	}
+	got, _ := exec.Command("gdb", args...).CombinedOutput()
+
+	// Check that the backtrace matches the source code.
+	bt := []string{
+		"eee",
+		"ddd",
+		"ccc",
+		"bbb",
+		"aaa",
+		"main",
+	}
+	for i, name := range bt {
+		s := fmt.Sprintf("#%v.*main\\.%v", i, name)
+		re := regexp.MustCompile(s)
+		if found := re.Find(got) != nil; !found {
+			t.Errorf("could not find '%v' in backtrace", s)
+			t.Fatalf("gdb output:\n%v", string(got))
+		}
+	}
+}
diff --git a/src/runtime/runtime1.go b/src/runtime/runtime1.go
index 95bebac593..9089383904 100644
--- a/src/runtime/runtime1.go
+++ b/src/runtime/runtime1.go
@@ -477,10 +477,51 @@ func gomcache() *mcache {
 }
 
 //go:linkname reflect_typelinks reflect.typelinks
-func reflect_typelinks() [][]*_type {
-	ret := [][]*_type{firstmoduledata.typelinks}
+func reflect_typelinks() ([]unsafe.Pointer, [][]int32) {
+	sections := []unsafe.Pointer{unsafe.Pointer(firstmoduledata.types)}
+	ret := [][]int32{firstmoduledata.typelinks}
 	for datap := firstmoduledata.next; datap != nil; datap = datap.next {
+		sections = append(sections, unsafe.Pointer(datap.types))
 		ret = append(ret, datap.typelinks)
 	}
-	return ret
+	return sections, ret
+}
+
+// reflect_resolveNameOff resolves a name offset from a base pointer.
+//go:linkname reflect_resolveNameOff reflect.resolveNameOff
+func reflect_resolveNameOff(ptrInModule unsafe.Pointer, off int32) unsafe.Pointer {
+	return unsafe.Pointer(resolveNameOff(ptrInModule, nameOff(off)).bytes)
+}
+
+// reflect_resolveTypeOff resolves an *rtype offset from a base type.
+//go:linkname reflect_resolveTypeOff reflect.resolveTypeOff
+func reflect_resolveTypeOff(rtype unsafe.Pointer, off int32) unsafe.Pointer {
+	return unsafe.Pointer((*_type)(rtype).typeOff(typeOff(off)))
+}
+
+// reflect_resolveTextOff resolves a function pointer offset from a base type.
+//go:linkname reflect_resolveTextOff reflect.resolveTextOff
+func reflect_resolveTextOff(rtype unsafe.Pointer, off int32) unsafe.Pointer {
+	return (*_type)(rtype).textOff(textOff(off))
+
+}
+
+// reflect_addReflectOff adds a pointer to the reflection offset lookup map.
+//go:linkname reflect_addReflectOff reflect.addReflectOff
+func reflect_addReflectOff(ptr unsafe.Pointer) int32 {
+	lock(&reflectOffs.lock)
+	if reflectOffs.m == nil {
+		reflectOffs.m = make(map[int32]unsafe.Pointer)
+		reflectOffs.minv = make(map[unsafe.Pointer]int32)
+		reflectOffs.next = -1
+	}
+	id, found := reflectOffs.minv[ptr]
+	if !found {
+		id = reflectOffs.next
+		reflectOffs.next-- // use negative offsets as IDs to aid debugging
+		reflectOffs.m[id] = ptr
+		reflectOffs.minv[ptr] = id
+	}
+	unlock(&reflectOffs.lock)
+	return id
 }
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index e0137f7e97..d35b897c3e 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -10,9 +10,7 @@ import (
 	"unsafe"
 )
 
-/*
- * defined constants
- */
+// defined constants
 const (
 	// G status
 	//
@@ -99,6 +97,10 @@ const (
 	_Pdead
 )
 
+// Mutual exclusion locks.  In the uncontended case,
+// as fast as spin locks (just a few user-level instructions),
+// but on the contention path they sleep in the kernel.
+// A zeroed Mutex is unlocked (no need to initialize each lock).
 type mutex struct {
 	// Futex-based impl treats it as uint32 key,
 	// while sema-based impl as M* waitm.
@@ -106,6 +108,26 @@ type mutex struct {
 	key uintptr
 }
 
+// sleep and wakeup on one-time events.
+// before any calls to notesleep or notewakeup,
+// must call noteclear to initialize the Note.
+// then, exactly one thread can call notesleep
+// and exactly one thread can call notewakeup (once).
+// once notewakeup has been called, the notesleep
+// will return.  future notesleep will return immediately.
+// subsequent noteclear must be called only after
+// previous notesleep has returned, e.g. it's disallowed
+// to call noteclear straight after notewakeup.
+//
+// notetsleep is like notesleep but wakes up after
+// a given number of nanoseconds even if the event
+// has not yet happened.  if a goroutine uses notetsleep to
+// wake up early, it must wait to call noteclear until it
+// can be sure that no other goroutine is calling
+// notewakeup.
+//
+// notesleep/notetsleep are generally called on g0,
+// notetsleepg is similar to notetsleep but is called on user g.
 type note struct {
 	// Futex-based impl treats it as uint32 key,
 	// while sema-based impl as M* waitm.
@@ -310,16 +332,17 @@ type g struct {
 	waitsince      int64  // approx time when the g become blocked
 	waitreason     string // if status==Gwaiting
 	schedlink      guintptr
-	preempt        bool   // preemption signal, duplicates stackguard0 = stackpreempt
-	paniconfault   bool   // panic (instead of crash) on unexpected fault address
-	preemptscan    bool   // preempted g does scan for gc
-	gcscandone     bool   // g has scanned stack; protected by _Gscan bit in status
-	gcscanvalid    bool   // false at start of gc cycle, true if G has not run since last scan
-	throwsplit     bool   // must not split stack
-	raceignore     int8   // ignore race detection events
-	sysblocktraced bool   // StartTrace has emitted EvGoInSyscall about this goroutine
-	sysexitticks   int64  // cputicks when syscall has returned (for tracing)
-	sysexitseq     uint64 // trace seq when syscall has returned (for tracing)
+	preempt        bool     // preemption signal, duplicates stackguard0 = stackpreempt
+	paniconfault   bool     // panic (instead of crash) on unexpected fault address
+	preemptscan    bool     // preempted g does scan for gc
+	gcscandone     bool     // g has scanned stack; protected by _Gscan bit in status
+	gcscanvalid    bool     // false at start of gc cycle, true if G has not run since last scan; transition from true to false by calling queueRescan and false to true by calling dequeueRescan
+	throwsplit     bool     // must not split stack
+	raceignore     int8     // ignore race detection events
+	sysblocktraced bool     // StartTrace has emitted EvGoInSyscall about this goroutine
+	sysexitticks   int64    // cputicks when syscall has returned (for tracing)
+	traceseq       uint64   // trace event sequencer
+	tracelastp     puintptr // last P emitted an event for this goroutine
 	lockedm        *m
 	sig            uint32
 	writebuf       []byte
@@ -331,7 +354,14 @@ type g struct {
 	racectx        uintptr
 	waiting        *sudog // sudog structures this g is waiting on (that have a valid elem ptr); in lock order
 
-	// Per-G gcController state
+	// Per-G GC state
+
+	// gcRescan is this G's index in work.rescan.list. If this is
+	// -1, this G is not on the rescan list.
+	//
+	// If gcphase != _GCoff and this G is visible to the garbage
+	// collector, writes to this are protected by work.rescan.lock.
+	gcRescan int32
 
 	// gcAssistBytes is this G's GC assist credit in terms of
 	// bytes allocated. If this is positive, then the G has credit
@@ -397,8 +427,8 @@ type m struct {
 	waittraceskip int
 	startingtrace bool
 	syscalltick   uint32
-	//#ifdef GOOS_windows
-	thread uintptr // thread handle
+	thread        uintptr // thread handle
+
 	// these are here because they are too large to be on the stack
 	// of low-level NOSPLIT functions.
 	libcall   libcall
@@ -406,7 +436,7 @@ type m struct {
 	libcallsp uintptr
 	libcallg  guintptr
 	syscall   libcall // stores syscall parameters on windows
-	//#endif
+
 	mOS
 }
 
@@ -500,9 +530,10 @@ type schedt struct {
 	runqsize int32
 
 	// Global cache of dead G's.
-	gflock mutex
-	gfree  *g
-	ngfree int32
+	gflock       mutex
+	gfreeStack   *g
+	gfreeNoStack *g
+	ngfree       int32
 
 	// Central cache of sudog structs.
 	sudoglock  mutex
@@ -530,10 +561,10 @@ type schedt struct {
 	totaltime      int64 // ∫gomaxprocs dt up to procresizetime
 }
 
-// The m->locked word holds two pieces of state counting active calls to LockOSThread/lockOSThread.
+// The m.locked word holds two pieces of state counting active calls to LockOSThread/lockOSThread.
 // The low bit (LockExternal) is a boolean reporting whether any LockOSThread call is active.
 // External locks are not recursive; a second lock is silently ignored.
-// The upper bits of m->locked record the nesting depth of calls to lockOSThread
+// The upper bits of m.locked record the nesting depth of calls to lockOSThread
 // (counting up by LockInternal), popped by unlockOSThread (counting down by LockInternal).
 // Internal locks can be recursive. For instance, a lock for cgo can occur while the main
 // goroutine is holding the lock during the initialization phase.
@@ -603,13 +634,6 @@ type forcegcstate struct {
 	idle uint32
 }
 
-/*
- * known to compiler
- */
-const (
-	_Structrnd = sys.RegSize
-)
-
 // startup_random_data holds random bytes initialized at startup. These come from
 // the ELF AT_RANDOM auxiliary vector (vdso_linux_amd64.go or os_linux_386.go).
 var startupRandomData []byte
@@ -635,9 +659,7 @@ func extendRandom(r []byte, n int) {
 	}
 }
 
-/*
- * deferred subroutine calls
- */
+// deferred subroutine calls
 type _defer struct {
 	siz     int32
 	started bool
@@ -648,9 +670,7 @@ type _defer struct {
 	link    *_defer
 }
 
-/*
- * panics
- */
+// panics
 type _panic struct {
 	argp      unsafe.Pointer // pointer to arguments of deferred call run during panic; cannot move - known to liblink
 	arg       interface{}    // argument to panic
@@ -659,10 +679,7 @@ type _panic struct {
 	aborted   bool           // the panic was aborted
 }
 
-/*
- * stack traces
- */
-
+// stack traces
 type stkframe struct {
 	fn       *_func     // function being run
 	pc       uintptr    // program counter within fn
@@ -682,10 +699,8 @@ const (
 	_TraceJumpStack                 // if traceback is on a systemstack, resume trace at g that called into it
 )
 
-const (
-	// The maximum number of frames we print for a traceback
-	_TracebackMaxFrames = 100
-)
+// The maximum number of frames we print for a traceback
+const _TracebackMaxFrames = 100
 
 var (
 	emptystring string
@@ -716,46 +731,3 @@ var (
 	islibrary bool // -buildmode=c-shared
 	isarchive bool // -buildmode=c-archive
 )
-
-/*
- * mutual exclusion locks.  in the uncontended case,
- * as fast as spin locks (just a few user-level instructions),
- * but on the contention path they sleep in the kernel.
- * a zeroed Mutex is unlocked (no need to initialize each lock).
- */
-
-/*
- * sleep and wakeup on one-time events.
- * before any calls to notesleep or notewakeup,
- * must call noteclear to initialize the Note.
- * then, exactly one thread can call notesleep
- * and exactly one thread can call notewakeup (once).
- * once notewakeup has been called, the notesleep
- * will return.  future notesleep will return immediately.
- * subsequent noteclear must be called only after
- * previous notesleep has returned, e.g. it's disallowed
- * to call noteclear straight after notewakeup.
- *
- * notetsleep is like notesleep but wakes up after
- * a given number of nanoseconds even if the event
- * has not yet happened.  if a goroutine uses notetsleep to
- * wake up early, it must wait to call noteclear until it
- * can be sure that no other goroutine is calling
- * notewakeup.
- *
- * notesleep/notetsleep are generally called on g0,
- * notetsleepg is similar to notetsleep but is called on user g.
- */
-// bool	runtime·notetsleep(Note*, int64);  // false - timeout
-// bool	runtime·notetsleepg(Note*, int64);  // false - timeout
-
-/*
- * Lock-free stack.
- * Initialize uint64 head to 0, compare with 0 to test for emptiness.
- * The stack does not keep pointers to nodes,
- * so they can be garbage collected if there are no other pointers to nodes.
- */
-
-// for mmap, we only pass the lower 32 bits of file offset to the
-// assembly routine; the higher bits (if required), should be provided
-// by the assembly routine as 0.
diff --git a/src/runtime/select.go b/src/runtime/select.go
index c80c833b15..433048fb79 100644
--- a/src/runtime/select.go
+++ b/src/runtime/select.go
@@ -594,7 +594,7 @@ retc:
 sclose:
 	// send on closed channel
 	selunlock(scases, lockorder)
-	panic("send on closed channel")
+	panic(plainError("send on closed channel"))
 }
 
 func (c *hchan) sortkey() uintptr {
@@ -626,7 +626,7 @@ const (
 func reflect_rselect(cases []runtimeSelect) (chosen int, recvOK bool) {
 	// flagNoScan is safe here, because all objects are also referenced from cases.
 	size := selectsize(uintptr(len(cases)))
-	sel := (*hselect)(mallocgc(size, nil, flagNoScan))
+	sel := (*hselect)(mallocgc(size, nil, true))
 	newselect(sel, int64(size), int32(len(cases)))
 	r := new(bool)
 	for i := range cases {
diff --git a/src/runtime/signal_dragonfly.go b/src/runtime/signal_dragonfly.go
index f507a07233..8e9ce17c86 100644
--- a/src/runtime/signal_dragonfly.go
+++ b/src/runtime/signal_dragonfly.go
@@ -14,14 +14,14 @@ var sigtable = [...]sigTabT{
 	/* 1 */ {_SigNotify + _SigKill, "SIGHUP: terminal line hangup"},
 	/* 2 */ {_SigNotify + _SigKill, "SIGINT: interrupt"},
 	/* 3 */ {_SigNotify + _SigThrow, "SIGQUIT: quit"},
-	/* 4 */ {_SigThrow, "SIGILL: illegal instruction"},
-	/* 5 */ {_SigThrow, "SIGTRAP: trace trap"},
+	/* 4 */ {_SigThrow + _SigUnblock, "SIGILL: illegal instruction"},
+	/* 5 */ {_SigThrow + _SigUnblock, "SIGTRAP: trace trap"},
 	/* 6 */ {_SigNotify + _SigThrow, "SIGABRT: abort"},
 	/* 7 */ {_SigThrow, "SIGEMT: emulate instruction executed"},
-	/* 8 */ {_SigPanic, "SIGFPE: floating-point exception"},
+	/* 8 */ {_SigPanic + _SigUnblock, "SIGFPE: floating-point exception"},
 	/* 9 */ {0, "SIGKILL: kill"},
-	/* 10 */ {_SigPanic, "SIGBUS: bus error"},
-	/* 11 */ {_SigPanic, "SIGSEGV: segmentation violation"},
+	/* 10 */ {_SigPanic + _SigUnblock, "SIGBUS: bus error"},
+	/* 11 */ {_SigPanic + _SigUnblock, "SIGSEGV: segmentation violation"},
 	/* 12 */ {_SigThrow, "SIGSYS: bad system call"},
 	/* 13 */ {_SigNotify, "SIGPIPE: write to broken pipe"},
 	/* 14 */ {_SigNotify, "SIGALRM: alarm clock"},
@@ -30,14 +30,14 @@ var sigtable = [...]sigTabT{
 	/* 17 */ {0, "SIGSTOP: stop"},
 	/* 18 */ {_SigNotify + _SigDefault, "SIGTSTP: keyboard stop"},
 	/* 19 */ {_SigNotify + _SigDefault, "SIGCONT: continue after stop"},
-	/* 20 */ {_SigNotify, "SIGCHLD: child status has changed"},
+	/* 20 */ {_SigNotify + _SigUnblock, "SIGCHLD: child status has changed"},
 	/* 21 */ {_SigNotify + _SigDefault, "SIGTTIN: background read from tty"},
 	/* 22 */ {_SigNotify + _SigDefault, "SIGTTOU: background write to tty"},
 	/* 23 */ {_SigNotify, "SIGIO: i/o now possible"},
 	/* 24 */ {_SigNotify, "SIGXCPU: cpu limit exceeded"},
 	/* 25 */ {_SigNotify, "SIGXFSZ: file size limit exceeded"},
 	/* 26 */ {_SigNotify, "SIGVTALRM: virtual alarm clock"},
-	/* 27 */ {_SigNotify, "SIGPROF: profiling alarm clock"},
+	/* 27 */ {_SigNotify + _SigUnblock, "SIGPROF: profiling alarm clock"},
 	/* 28 */ {_SigNotify, "SIGWINCH: window size change"},
 	/* 29 */ {_SigNotify, "SIGINFO: status request from keyboard"},
 	/* 30 */ {_SigNotify, "SIGUSR1: user-defined signal 1"},
diff --git a/src/runtime/signal_freebsd.go b/src/runtime/signal_freebsd.go
index cd2068a62c..c8b09e92d9 100644
--- a/src/runtime/signal_freebsd.go
+++ b/src/runtime/signal_freebsd.go
@@ -16,14 +16,14 @@ var sigtable = [...]sigTabT{
 	/* 1 */ {_SigNotify + _SigKill, "SIGHUP: terminal line hangup"},
 	/* 2 */ {_SigNotify + _SigKill, "SIGINT: interrupt"},
 	/* 3 */ {_SigNotify + _SigThrow, "SIGQUIT: quit"},
-	/* 4 */ {_SigThrow, "SIGILL: illegal instruction"},
-	/* 5 */ {_SigThrow, "SIGTRAP: trace trap"},
+	/* 4 */ {_SigThrow + _SigUnblock, "SIGILL: illegal instruction"},
+	/* 5 */ {_SigThrow + _SigUnblock, "SIGTRAP: trace trap"},
 	/* 6 */ {_SigNotify + _SigThrow, "SIGABRT: abort"},
 	/* 7 */ {_SigThrow, "SIGEMT: emulate instruction executed"},
-	/* 8 */ {_SigPanic, "SIGFPE: floating-point exception"},
+	/* 8 */ {_SigPanic + _SigUnblock, "SIGFPE: floating-point exception"},
 	/* 9 */ {0, "SIGKILL: kill"},
-	/* 10 */ {_SigPanic, "SIGBUS: bus error"},
-	/* 11 */ {_SigPanic, "SIGSEGV: segmentation violation"},
+	/* 10 */ {_SigPanic + _SigUnblock, "SIGBUS: bus error"},
+	/* 11 */ {_SigPanic + _SigUnblock, "SIGSEGV: segmentation violation"},
 	/* 12 */ {_SigNotify, "SIGSYS: bad system call"},
 	/* 13 */ {_SigNotify, "SIGPIPE: write to broken pipe"},
 	/* 14 */ {_SigNotify, "SIGALRM: alarm clock"},
@@ -32,14 +32,14 @@ var sigtable = [...]sigTabT{
 	/* 17 */ {0, "SIGSTOP: stop"},
 	/* 18 */ {_SigNotify + _SigDefault, "SIGTSTP: keyboard stop"},
 	/* 19 */ {_SigNotify + _SigDefault, "SIGCONT: continue after stop"},
-	/* 20 */ {_SigNotify, "SIGCHLD: child status has changed"},
+	/* 20 */ {_SigNotify + _SigUnblock, "SIGCHLD: child status has changed"},
 	/* 21 */ {_SigNotify + _SigDefault, "SIGTTIN: background read from tty"},
 	/* 22 */ {_SigNotify + _SigDefault, "SIGTTOU: background write to tty"},
 	/* 23 */ {_SigNotify, "SIGIO: i/o now possible"},
 	/* 24 */ {_SigNotify, "SIGXCPU: cpu limit exceeded"},
 	/* 25 */ {_SigNotify, "SIGXFSZ: file size limit exceeded"},
 	/* 26 */ {_SigNotify, "SIGVTALRM: virtual alarm clock"},
-	/* 27 */ {_SigNotify, "SIGPROF: profiling alarm clock"},
+	/* 27 */ {_SigNotify + _SigUnblock, "SIGPROF: profiling alarm clock"},
 	/* 28 */ {_SigNotify, "SIGWINCH: window size change"},
 	/* 29 */ {_SigNotify, "SIGINFO: status request from keyboard"},
 	/* 30 */ {_SigNotify, "SIGUSR1: user-defined signal 1"},
diff --git a/src/runtime/signal_linux_s390x.go b/src/runtime/signal_linux_s390x.go
new file mode 100644
index 0000000000..155d3a326f
--- /dev/null
+++ b/src/runtime/signal_linux_s390x.go
@@ -0,0 +1,208 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+type sigctxt struct {
+	info *siginfo
+	ctxt unsafe.Pointer
+}
+
+func (c *sigctxt) regs() *sigcontext {
+	return (*sigcontext)(unsafe.Pointer(&(*ucontext)(c.ctxt).uc_mcontext))
+}
+func (c *sigctxt) r0() uint64      { return c.regs().gregs[0] }
+func (c *sigctxt) r1() uint64      { return c.regs().gregs[1] }
+func (c *sigctxt) r2() uint64      { return c.regs().gregs[2] }
+func (c *sigctxt) r3() uint64      { return c.regs().gregs[3] }
+func (c *sigctxt) r4() uint64      { return c.regs().gregs[4] }
+func (c *sigctxt) r5() uint64      { return c.regs().gregs[5] }
+func (c *sigctxt) r6() uint64      { return c.regs().gregs[6] }
+func (c *sigctxt) r7() uint64      { return c.regs().gregs[7] }
+func (c *sigctxt) r8() uint64      { return c.regs().gregs[8] }
+func (c *sigctxt) r9() uint64      { return c.regs().gregs[9] }
+func (c *sigctxt) r10() uint64     { return c.regs().gregs[10] }
+func (c *sigctxt) r11() uint64     { return c.regs().gregs[11] }
+func (c *sigctxt) r12() uint64     { return c.regs().gregs[12] }
+func (c *sigctxt) r13() uint64     { return c.regs().gregs[13] }
+func (c *sigctxt) r14() uint64     { return c.regs().gregs[14] }
+func (c *sigctxt) r15() uint64     { return c.regs().gregs[15] }
+func (c *sigctxt) link() uint64    { return c.regs().gregs[14] }
+func (c *sigctxt) sp() uint64      { return c.regs().gregs[15] }
+func (c *sigctxt) pc() uint64      { return c.regs().psw_addr }
+func (c *sigctxt) sigcode() uint32 { return uint32(c.info.si_code) }
+func (c *sigctxt) sigaddr() uint64 { return c.info.si_addr }
+
+func (c *sigctxt) set_r0(x uint64)      { c.regs().gregs[0] = x }
+func (c *sigctxt) set_r13(x uint64)     { c.regs().gregs[13] = x }
+func (c *sigctxt) set_link(x uint64)    { c.regs().gregs[14] = x }
+func (c *sigctxt) set_sp(x uint64)      { c.regs().gregs[15] = x }
+func (c *sigctxt) set_pc(x uint64)      { c.regs().psw_addr = x }
+func (c *sigctxt) set_sigcode(x uint32) { c.info.si_code = int32(x) }
+func (c *sigctxt) set_sigaddr(x uint64) {
+	*(*uintptr)(add(unsafe.Pointer(c.info), 2*sys.PtrSize)) = uintptr(x)
+}
+
+func dumpregs(c *sigctxt) {
+	print("r0   ", hex(c.r0()), "\t")
+	print("r1   ", hex(c.r1()), "\n")
+	print("r2   ", hex(c.r2()), "\t")
+	print("r3   ", hex(c.r3()), "\n")
+	print("r4   ", hex(c.r4()), "\t")
+	print("r5   ", hex(c.r5()), "\n")
+	print("r6   ", hex(c.r6()), "\t")
+	print("r7   ", hex(c.r7()), "\n")
+	print("r8   ", hex(c.r8()), "\t")
+	print("r9   ", hex(c.r9()), "\n")
+	print("r10  ", hex(c.r10()), "\t")
+	print("r11  ", hex(c.r11()), "\n")
+	print("r12  ", hex(c.r12()), "\t")
+	print("r13  ", hex(c.r13()), "\n")
+	print("r14  ", hex(c.r14()), "\t")
+	print("r15  ", hex(c.r15()), "\n")
+	print("pc   ", hex(c.pc()), "\t")
+	print("link ", hex(c.link()), "\n")
+}
+
+var crashing int32
+
+// May run during STW, so write barriers are not allowed.
+//
+//go:nowritebarrierrec
+func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) {
+	_g_ := getg()
+	c := &sigctxt{info, ctxt}
+
+	if sig == _SIGPROF {
+		sigprof(uintptr(c.pc()), uintptr(c.sp()), uintptr(c.link()), gp, _g_.m)
+		return
+	}
+	flags := int32(_SigThrow)
+	if sig < uint32(len(sigtable)) {
+		flags = sigtable[sig].flags
+	}
+	if c.sigcode() != _SI_USER && flags&_SigPanic != 0 {
+		// Make it look like a call to the signal func.
+		// Have to pass arguments out of band since
+		// augmenting the stack frame would break
+		// the unwinding code.
+		gp.sig = sig
+		gp.sigcode0 = uintptr(c.sigcode())
+		gp.sigcode1 = uintptr(c.sigaddr())
+		gp.sigpc = uintptr(c.pc())
+
+		// We arrange link and pc to pretend the panicking
+		// function calls sigpanic directly.
+		// Always save LINK to stack so that panics in leaf
+		// functions are correctly handled. This smashes
+		// the stack frame but we're not going back there
+		// anyway.
+		sp := c.sp() - sys.MinFrameSize
+		c.set_sp(sp)
+		*(*uint64)(unsafe.Pointer(uintptr(sp))) = c.link()
+
+		pc := uintptr(gp.sigpc)
+
+		// If we don't recognize the PC as code
+		// but we do recognize the link register as code,
+		// then assume this was a call to non-code and treat like
+		// pc == 0, to make unwinding show the context.
+		if pc != 0 && findfunc(pc) == nil && findfunc(uintptr(c.link())) != nil {
+			pc = 0
+		}
+
+		// Don't bother saving PC if it's zero, which is
+		// probably a call to a nil func: the old link register
+		// is more useful in the stack trace.
+		if pc != 0 {
+			c.set_link(uint64(pc))
+		}
+
+		// In case we are panicking from external C code
+		c.set_r0(0)
+		c.set_r13(uint64(uintptr(unsafe.Pointer(gp))))
+		c.set_pc(uint64(funcPC(sigpanic)))
+		return
+	}
+
+	if c.sigcode() == _SI_USER || flags&_SigNotify != 0 {
+		if sigsend(sig) {
+			return
+		}
+	}
+
+	if c.sigcode() == _SI_USER && signal_ignored(sig) {
+		return
+	}
+
+	if flags&_SigKill != 0 {
+		dieFromSignal(int32(sig))
+	}
+
+	if flags&_SigThrow == 0 {
+		return
+	}
+
+	_g_.m.throwing = 1
+	_g_.m.caughtsig.set(gp)
+
+	if crashing == 0 {
+		startpanic()
+	}
+
+	if sig < uint32(len(sigtable)) {
+		print(sigtable[sig].name, "\n")
+	} else {
+		print("Signal ", sig, "\n")
+	}
+
+	print("PC=", hex(c.pc()), " m=", _g_.m.id, "\n")
+	if _g_.m.lockedg != nil && _g_.m.ncgo > 0 && gp == _g_.m.g0 {
+		print("signal arrived during cgo execution\n")
+		gp = _g_.m.lockedg
+	}
+	print("\n")
+
+	level, _, docrash := gotraceback()
+	if level > 0 {
+		goroutineheader(gp)
+		tracebacktrap(uintptr(c.pc()), uintptr(c.sp()), uintptr(c.link()), gp)
+		if crashing > 0 && gp != _g_.m.curg && _g_.m.curg != nil && readgstatus(_g_.m.curg)&^_Gscan == _Grunning {
+			// tracebackothers on original m skipped this one; trace it now.
+			goroutineheader(_g_.m.curg)
+			traceback(^uintptr(0), ^uintptr(0), 0, gp)
+		} else if crashing == 0 {
+			tracebackothers(gp)
+			print("\n")
+		}
+		dumpregs(c)
+	}
+
+	if docrash {
+		crashing++
+		if crashing < sched.mcount {
+			// There are other m's that need to dump their stacks.
+			// Relay SIGQUIT to the next m by sending it to the current process.
+			// All m's that have already received SIGQUIT have signal masks blocking
+			// receipt of any signals, so the SIGQUIT will go to an m that hasn't seen it yet.
+			// When the last m receives the SIGQUIT, it will fall through to the call to
+			// crash below. Just in case the relaying gets botched, each m involved in
+			// the relay sleeps for 5 seconds and then does the crash/exit itself.
+			// In expected operation, the last m has received the SIGQUIT and run
+			// crash/exit and the process is gone, all long before any of the
+			// 5-second sleeps have finished.
+			print("\n-----\n\n")
+			raiseproc(_SIGQUIT)
+			usleep(5 * 1000 * 1000)
+		}
+		crash()
+	}
+
+	exit(2)
+}
diff --git a/src/runtime/signal_openbsd.go b/src/runtime/signal_openbsd.go
index 3c50190da4..9275279860 100644
--- a/src/runtime/signal_openbsd.go
+++ b/src/runtime/signal_openbsd.go
@@ -16,14 +16,14 @@ var sigtable = [...]sigTabT{
 	/*  1 */ {_SigNotify + _SigKill, "SIGHUP: terminal line hangup"},
 	/*  2 */ {_SigNotify + _SigKill, "SIGINT: interrupt"},
 	/*  3 */ {_SigNotify + _SigThrow, "SIGQUIT: quit"},
-	/*  4 */ {_SigThrow, "SIGILL: illegal instruction"},
-	/*  5 */ {_SigThrow, "SIGTRAP: trace trap"},
+	/*  4 */ {_SigThrow + _SigUnblock, "SIGILL: illegal instruction"},
+	/*  5 */ {_SigThrow + _SigUnblock, "SIGTRAP: trace trap"},
 	/*  6 */ {_SigNotify + _SigThrow, "SIGABRT: abort"},
 	/*  7 */ {_SigThrow, "SIGEMT: emulate instruction executed"},
-	/*  8 */ {_SigPanic, "SIGFPE: floating-point exception"},
+	/*  8 */ {_SigPanic + _SigUnblock, "SIGFPE: floating-point exception"},
 	/*  9 */ {0, "SIGKILL: kill"},
-	/* 10 */ {_SigPanic, "SIGBUS: bus error"},
-	/* 11 */ {_SigPanic, "SIGSEGV: segmentation violation"},
+	/* 10 */ {_SigPanic + _SigUnblock, "SIGBUS: bus error"},
+	/* 11 */ {_SigPanic + _SigUnblock, "SIGSEGV: segmentation violation"},
 	/* 12 */ {_SigThrow, "SIGSYS: bad system call"},
 	/* 13 */ {_SigNotify, "SIGPIPE: write to broken pipe"},
 	/* 14 */ {_SigNotify, "SIGALRM: alarm clock"},
@@ -32,14 +32,14 @@ var sigtable = [...]sigTabT{
 	/* 17 */ {0, "SIGSTOP: stop"},
 	/* 18 */ {_SigNotify + _SigDefault, "SIGTSTP: keyboard stop"},
 	/* 19 */ {_SigNotify + _SigDefault, "SIGCONT: continue after stop"},
-	/* 20 */ {_SigNotify, "SIGCHLD: child status has changed"},
+	/* 20 */ {_SigNotify + _SigUnblock, "SIGCHLD: child status has changed"},
 	/* 21 */ {_SigNotify + _SigDefault, "SIGTTIN: background read from tty"},
 	/* 22 */ {_SigNotify + _SigDefault, "SIGTTOU: background write to tty"},
 	/* 23 */ {_SigNotify, "SIGIO: i/o now possible"},
 	/* 24 */ {_SigNotify, "SIGXCPU: cpu limit exceeded"},
 	/* 25 */ {_SigNotify, "SIGXFSZ: file size limit exceeded"},
 	/* 26 */ {_SigNotify, "SIGVTALRM: virtual alarm clock"},
-	/* 27 */ {_SigNotify, "SIGPROF: profiling alarm clock"},
+	/* 27 */ {_SigNotify + _SigUnblock, "SIGPROF: profiling alarm clock"},
 	/* 28 */ {_SigNotify, "SIGWINCH: window size change"},
 	/* 29 */ {_SigNotify, "SIGINFO: status request from keyboard"},
 	/* 30 */ {_SigNotify, "SIGUSR1: user-defined signal 1"},
diff --git a/src/runtime/sigtab_linux_generic.go b/src/runtime/sigtab_linux_generic.go
index 32c40c4768..e97497f18c 100644
--- a/src/runtime/sigtab_linux_generic.go
+++ b/src/runtime/sigtab_linux_generic.go
@@ -45,7 +45,7 @@ var sigtable = [...]sigTabT{
 	/* 28 */ {_SigNotify, "SIGWINCH: window size change"},
 	/* 29 */ {_SigNotify, "SIGIO: i/o now possible"},
 	/* 30 */ {_SigNotify, "SIGPWR: power failure restart"},
-	/* 31 */ {_SigNotify, "SIGSYS: bad system call"},
+	/* 31 */ {_SigThrow + _SigUnblock, "SIGSYS: bad system call"},
 	/* 32 */ {_SigSetStack + _SigUnblock, "signal 32"}, /* SIGCANCEL; see issue 6997 */
 	/* 33 */ {_SigSetStack + _SigUnblock, "signal 33"}, /* SIGSETXID; see issues 3871, 9400, 12498 */
 	/* 34 */ {_SigNotify, "signal 34"},
diff --git a/src/runtime/sigtab_linux_mips64x.go b/src/runtime/sigtab_linux_mips64x.go
index dbd50f7b1f..f7d81811ba 100644
--- a/src/runtime/sigtab_linux_mips64x.go
+++ b/src/runtime/sigtab_linux_mips64x.go
@@ -25,7 +25,7 @@ var sigtable = [...]sigTabT{
 	/* 9 */ {0, "SIGKILL: kill"},
 	/* 10 */ {_SigPanic + _SigUnblock, "SIGBUS: bus error"},
 	/* 11 */ {_SigPanic + _SigUnblock, "SIGSEGV: segmentation violation"},
-	/* 12 */ {_SigNotify, "SIGSYS: bad system call"},
+	/* 12 */ {_SigThrow + _SigUnblock, "SIGSYS: bad system call"},
 	/* 13 */ {_SigNotify, "SIGPIPE: write to broken pipe"},
 	/* 14 */ {_SigNotify, "SIGALRM: alarm clock"},
 	/* 15 */ {_SigNotify + _SigKill, "SIGTERM: termination"},
diff --git a/src/runtime/slice.go b/src/runtime/slice.go
index 0bc0299f72..e15e6c4dc6 100644
--- a/src/runtime/slice.go
+++ b/src/runtime/slice.go
@@ -14,39 +14,69 @@ type slice struct {
 	cap   int
 }
 
+// maxElems is a lookup table containing the maximum capacity for a slice.
+// The index is the size of the slice element.
+var maxElems = [...]uintptr{
+	^uintptr(0),
+	_MaxMem / 1, _MaxMem / 2, _MaxMem / 3, _MaxMem / 4,
+	_MaxMem / 5, _MaxMem / 6, _MaxMem / 7, _MaxMem / 8,
+	_MaxMem / 9, _MaxMem / 10, _MaxMem / 11, _MaxMem / 12,
+	_MaxMem / 13, _MaxMem / 14, _MaxMem / 15, _MaxMem / 16,
+	_MaxMem / 17, _MaxMem / 18, _MaxMem / 19, _MaxMem / 20,
+	_MaxMem / 21, _MaxMem / 22, _MaxMem / 23, _MaxMem / 24,
+	_MaxMem / 25, _MaxMem / 26, _MaxMem / 27, _MaxMem / 28,
+	_MaxMem / 29, _MaxMem / 30, _MaxMem / 31, _MaxMem / 32,
+}
+
+// maxSliceCap returns the maximum capacity for a slice.
+func maxSliceCap(elemsize uintptr) uintptr {
+	if elemsize < uintptr(len(maxElems)) {
+		return maxElems[elemsize]
+	}
+	return _MaxMem / elemsize
+}
+
 // TODO: take uintptrs instead of int64s?
-func makeslice(t *slicetype, len64, cap64 int64) slice {
-	// NOTE: The len > MaxMem/elemsize check here is not strictly necessary,
+func makeslice(et *_type, len64, cap64 int64) slice {
+	// NOTE: The len > maxElements check here is not strictly necessary,
 	// but it produces a 'len out of range' error instead of a 'cap out of range' error
 	// when someone does make([]T, bignumber). 'cap out of range' is true too,
 	// but since the cap is only being supplied implicitly, saying len is clearer.
 	// See issue 4085.
+	maxElements := maxSliceCap(et.size)
 	len := int(len64)
-	if len64 < 0 || int64(len) != len64 || t.elem.size > 0 && uintptr(len) > _MaxMem/t.elem.size {
+	if len64 < 0 || int64(len) != len64 || uintptr(len) > maxElements {
 		panic(errorString("makeslice: len out of range"))
 	}
+
 	cap := int(cap64)
-	if cap < len || int64(cap) != cap64 || t.elem.size > 0 && uintptr(cap) > _MaxMem/t.elem.size {
+	if cap < len || int64(cap) != cap64 || uintptr(cap) > maxElements {
 		panic(errorString("makeslice: cap out of range"))
 	}
-	p := newarray(t.elem, uintptr(cap))
+
+	p := mallocgc(et.size*uintptr(cap), et, true)
 	return slice{p, len, cap}
 }
 
 // growslice handles slice growth during append.
-// It is passed the slice type, the old slice, and the desired new minimum capacity,
+// It is passed the slice element type, the old slice, and the desired new minimum capacity,
 // and it returns a new slice with at least that capacity, with the old data
 // copied into it.
-func growslice(t *slicetype, old slice, cap int) slice {
+// The new slice's length is set to the old slice's length,
+// NOT to the new requested capacity.
+// This is for codegen convenience. The old slice's length is used immediately
+// to calculate where to write new values during an append.
+// TODO: When the old backend is gone, reconsider this decision.
+// The SSA backend might prefer the new length or to return only ptr/cap and save stack space.
+func growslice(et *_type, old slice, cap int) slice {
 	if raceenabled {
-		callerpc := getcallerpc(unsafe.Pointer(&t))
-		racereadrangepc(old.array, uintptr(old.len*int(t.elem.size)), callerpc, funcPC(growslice))
+		callerpc := getcallerpc(unsafe.Pointer(&et))
+		racereadrangepc(old.array, uintptr(old.len*int(et.size)), callerpc, funcPC(growslice))
 	}
 	if msanenabled {
-		msanread(old.array, uintptr(old.len*int(t.elem.size)))
+		msanread(old.array, uintptr(old.len*int(et.size)))
 	}
 
-	et := t.elem
 	if et.size == 0 {
 		if cap < old.cap {
 			panic(errorString("growslice: cap out of range"))
@@ -70,38 +100,35 @@ func growslice(t *slicetype, old slice, cap int) slice {
 		}
 	}
 
-	var lenmem, capmem, maxcap uintptr
+	var lenmem, capmem uintptr
 	const ptrSize = unsafe.Sizeof((*byte)(nil))
 	switch et.size {
 	case 1:
 		lenmem = uintptr(old.len)
 		capmem = roundupsize(uintptr(newcap))
 		newcap = int(capmem)
-		maxcap = _MaxMem
 	case ptrSize:
 		lenmem = uintptr(old.len) * ptrSize
 		capmem = roundupsize(uintptr(newcap) * ptrSize)
 		newcap = int(capmem / ptrSize)
-		maxcap = _MaxMem / ptrSize
 	default:
 		lenmem = uintptr(old.len) * et.size
 		capmem = roundupsize(uintptr(newcap) * et.size)
 		newcap = int(capmem / et.size)
-		maxcap = _MaxMem / et.size
 	}
 
-	if cap < old.cap || uintptr(newcap) > maxcap {
+	if cap < old.cap || uintptr(newcap) > maxSliceCap(et.size) {
 		panic(errorString("growslice: cap out of range"))
 	}
 
 	var p unsafe.Pointer
 	if et.kind&kindNoPointers != 0 {
-		p = rawmem(capmem)
+		p = mallocgc(capmem, nil, false)
 		memmove(p, old.array, lenmem)
 		memclr(add(p, lenmem), capmem-lenmem)
 	} else {
 		// Note: can't use rawmem (which avoids zeroing of memory), because then GC can scan uninitialized memory.
-		p = newarray(et, uintptr(newcap))
+		p = mallocgc(capmem, et, true)
 		if !writeBarrier.enabled {
 			memmove(p, old.array, lenmem)
 		} else {
diff --git a/src/runtime/softfloat_arm.go b/src/runtime/softfloat_arm.go
index b1f1a72925..648b2e1169 100644
--- a/src/runtime/softfloat_arm.go
+++ b/src/runtime/softfloat_arm.go
@@ -168,14 +168,15 @@ execute:
 		}
 		return 1
 	}
-	if i == 0xe08bb00d {
-		// add sp to r11.
-		// might be part of a large stack offset address
+	if i&0xfffffff0 == 0xe08bb000 {
+		r := i & 0xf
+		// add r to r11.
+		// might be part of a large offset address calculation
 		// (or might not, but again no harm done).
-		regs[11] += regs[13]
+		regs[11] += regs[r]
 
 		if fptrace > 0 {
-			print("*** cpu R[11] += R[13] ", hex(regs[11]), "\n")
+			print("*** cpu R[11] += R[", r, "] ", hex(regs[11]), "\n")
 		}
 		return 1
 	}
diff --git a/src/runtime/stack.go b/src/runtime/stack.go
index 1ca737e920..ac4efc114b 100644
--- a/src/runtime/stack.go
+++ b/src/runtime/stack.go
@@ -634,8 +634,8 @@ func adjustframe(frame *stkframe, arg unsafe.Pointer) bool {
 	// Adjust local variables if stack frame has been allocated.
 	size := frame.varp - frame.sp
 	var minsize uintptr
-	switch sys.TheChar {
-	case '7':
+	switch sys.ArchFamily {
+	case sys.ARM64:
 		minsize = sys.SpAlign
 	default:
 		minsize = sys.MinFrameSize
@@ -662,7 +662,7 @@ func adjustframe(frame *stkframe, arg unsafe.Pointer) bool {
 	}
 
 	// Adjust saved base pointer if there is one.
-	if sys.TheChar == '6' && frame.argp-frame.varp == 2*sys.RegSize {
+	if sys.ArchFamily == sys.AMD64 && frame.argp-frame.varp == 2*sys.RegSize {
 		if !framepointer_enabled {
 			print("runtime: found space for saved base pointer, but no framepointer experiment\n")
 			print("argp=", hex(frame.argp), " varp=", hex(frame.varp), "\n")
@@ -969,7 +969,7 @@ func newstack() {
 		throw("missing stack in newstack")
 	}
 	sp := gp.sched.sp
-	if sys.TheChar == '6' || sys.TheChar == '8' {
+	if sys.ArchFamily == sys.AMD64 || sys.ArchFamily == sys.I386 {
 		// The call to morestack cost a word.
 		sp -= sys.PtrSize
 	}
@@ -1016,6 +1016,7 @@ func newstack() {
 			gp.preemptscan = false
 			gp.preempt = false
 			casfrom_Gscanstatus(gp, _Gscanwaiting, _Gwaiting)
+			// This clears gcscanvalid.
 			casgstatus(gp, _Gwaiting, _Grunning)
 			gp.stackguard0 = gp.stack.lo + _StackGuard
 			gogo(&gp.sched) // never return
diff --git a/src/runtime/string.go b/src/runtime/string.go
index 2d20e0a9c3..ef28ba9828 100644
--- a/src/runtime/string.go
+++ b/src/runtime/string.go
@@ -139,7 +139,8 @@ func slicebytetostringtmp(b []byte) string {
 func stringtoslicebyte(buf *tmpBuf, s string) []byte {
 	var b []byte
 	if buf != nil && len(s) <= len(buf) {
-		b = buf[:len(s):len(s)]
+		*buf = tmpBuf{}
+		b = buf[:len(s)]
 	} else {
 		b = rawbyteslice(len(s))
 	}
@@ -171,7 +172,8 @@ func stringtoslicerune(buf *[tmpStringBufSize]rune, s string) []rune {
 	}
 	var a []rune
 	if buf != nil && n <= len(buf) {
-		a = buf[:n:n]
+		*buf = [tmpStringBufSize]rune{}
+		a = buf[:n]
 	} else {
 		a = rawruneslice(n)
 	}
@@ -284,7 +286,7 @@ func stringiter2(s string, k int) (int, rune) {
 // The storage is not zeroed. Callers should use
 // b to set the string contents and then drop b.
 func rawstring(size int) (s string, b []byte) {
-	p := mallocgc(uintptr(size), nil, flagNoScan|flagNoZero)
+	p := mallocgc(uintptr(size), nil, false)
 
 	stringStructOf(&s).str = p
 	stringStructOf(&s).len = size
@@ -302,7 +304,7 @@ func rawstring(size int) (s string, b []byte) {
 // rawbyteslice allocates a new byte slice. The byte slice is not zeroed.
 func rawbyteslice(size int) (b []byte) {
 	cap := roundupsize(uintptr(size))
-	p := mallocgc(cap, nil, flagNoScan|flagNoZero)
+	p := mallocgc(cap, nil, false)
 	if cap != uintptr(size) {
 		memclr(add(p, uintptr(size)), cap-uintptr(size))
 	}
@@ -317,7 +319,7 @@ func rawruneslice(size int) (b []rune) {
 		throw("out of memory")
 	}
 	mem := roundupsize(uintptr(size) * 4)
-	p := mallocgc(mem, nil, flagNoScan|flagNoZero)
+	p := mallocgc(mem, nil, false)
 	if mem != uintptr(size)*4 {
 		memclr(add(p, uintptr(size)*4), mem-uintptr(size)*4)
 	}
diff --git a/src/runtime/string_test.go b/src/runtime/string_test.go
index ee9709e87d..0f1d82a481 100644
--- a/src/runtime/string_test.go
+++ b/src/runtime/string_test.go
@@ -238,17 +238,35 @@ func TestRangeStringCast(t *testing.T) {
 	}
 }
 
+func isZeroed(b []byte) bool {
+	for _, x := range b {
+		if x != 0 {
+			return false
+		}
+	}
+	return true
+}
+
+func isZeroedR(r []rune) bool {
+	for _, x := range r {
+		if x != 0 {
+			return false
+		}
+	}
+	return true
+}
+
 func TestString2Slice(t *testing.T) {
 	// Make sure we don't return slices that expose
 	// an unzeroed section of stack-allocated temp buf
 	// between len and cap. See issue 14232.
 	s := "foož"
 	b := ([]byte)(s)
-	if cap(b) != 5 {
-		t.Errorf("want cap of 5, got %d", cap(b))
+	if !isZeroed(b[len(b):cap(b)]) {
+		t.Errorf("extra bytes not zeroed")
 	}
 	r := ([]rune)(s)
-	if cap(r) != 4 {
-		t.Errorf("want cap of 4, got %d", cap(r))
+	if !isZeroedR(r[len(r):cap(r)]) {
+		t.Errorf("extra runes not zeroed")
 	}
 }
diff --git a/src/runtime/symtab.go b/src/runtime/symtab.go
index 158bdcea0d..2df390253a 100644
--- a/src/runtime/symtab.go
+++ b/src/runtime/symtab.go
@@ -127,8 +127,9 @@ type moduledata struct {
 	bss, ebss             uintptr
 	noptrbss, enoptrbss   uintptr
 	end, gcdata, gcbss    uintptr
+	types, etypes         uintptr
 
-	typelinks []*_type
+	typelinks []int32 // offsets from types
 	itablinks []*itab
 
 	modulename   string
@@ -136,6 +137,8 @@ type moduledata struct {
 
 	gcdatamask, gcbssmask bitvector
 
+	typemap map[typeOff]*_type // offset to *_rtype in previous module
+
 	next *moduledata
 }
 
diff --git a/src/runtime/sys_linux_s390x.s b/src/runtime/sys_linux_s390x.s
new file mode 100644
index 0000000000..f43792bd51
--- /dev/null
+++ b/src/runtime/sys_linux_s390x.s
@@ -0,0 +1,440 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// System calls and other system stuff for Linux s390x; see
+// /usr/include/asm/unistd.h for the syscall number definitions.
+
+#include "go_asm.h"
+#include "go_tls.h"
+#include "textflag.h"
+
+#define SYS_exit                  1
+#define SYS_read                  3
+#define SYS_write                 4
+#define SYS_open                  5
+#define SYS_close                 6
+#define SYS_getpid               20
+#define SYS_kill                 37
+#define SYS_fcntl                55
+#define SYS_gettimeofday         78
+#define SYS_mmap                 90
+#define SYS_munmap               91
+#define SYS_setitimer           104
+#define SYS_clone               120
+#define SYS_select              142
+#define SYS_sched_yield         158
+#define SYS_rt_sigreturn        173
+#define SYS_rt_sigaction        174
+#define SYS_rt_sigprocmask      175
+#define SYS_sigaltstack         186
+#define SYS_ugetrlimit          191
+#define SYS_madvise             219
+#define SYS_mincore             218
+#define SYS_gettid              236
+#define SYS_tkill               237
+#define SYS_futex               238
+#define SYS_sched_getaffinity   240
+#define SYS_exit_group          248
+#define SYS_epoll_create        249
+#define SYS_epoll_ctl           250
+#define SYS_epoll_wait          251
+#define SYS_clock_gettime       260
+#define SYS_epoll_create1       327
+
+TEXT runtime·exit(SB),NOSPLIT|NOFRAME,$0-4
+	MOVW	code+0(FP), R2
+	MOVW	$SYS_exit_group, R1
+	SYSCALL
+	RET
+
+TEXT runtime·exit1(SB),NOSPLIT|NOFRAME,$0-4
+	MOVW	code+0(FP), R2
+	MOVW	$SYS_exit, R1
+	SYSCALL
+	RET
+
+TEXT runtime·open(SB),NOSPLIT|NOFRAME,$0-20
+	MOVD	name+0(FP), R2
+	MOVW	mode+8(FP), R3
+	MOVW	perm+12(FP), R4
+	MOVW	$SYS_open, R1
+	SYSCALL
+	MOVD	$-4095, R3
+	CMPUBLT	R2, R3, 2(PC)
+	MOVW	$-1, R2
+	MOVW	R2, ret+16(FP)
+	RET
+
+TEXT runtime·closefd(SB),NOSPLIT|NOFRAME,$0-12
+	MOVW	fd+0(FP), R2
+	MOVW	$SYS_close, R1
+	SYSCALL
+	MOVD	$-4095, R3
+	CMPUBLT	R2, R3, 2(PC)
+	MOVW	$-1, R2
+	MOVW	R2, ret+8(FP)
+	RET
+
+TEXT runtime·write(SB),NOSPLIT|NOFRAME,$0-28
+	MOVD	fd+0(FP), R2
+	MOVD	p+8(FP), R3
+	MOVW	n+16(FP), R4
+	MOVW	$SYS_write, R1
+	SYSCALL
+	MOVD	$-4095, R3
+	CMPUBLT	R2, R3, 2(PC)
+	MOVW	$-1, R2
+	MOVW	R2, ret+24(FP)
+	RET
+
+TEXT runtime·read(SB),NOSPLIT|NOFRAME,$0-28
+	MOVW	fd+0(FP), R2
+	MOVD	p+8(FP), R3
+	MOVW	n+16(FP), R4
+	MOVW	$SYS_read, R1
+	SYSCALL
+	MOVD	$-4095, R3
+	CMPUBLT	R2, R3, 2(PC)
+	MOVW	$-1, R2
+	MOVW	R2, ret+24(FP)
+	RET
+
+TEXT runtime·getrlimit(SB),NOSPLIT|NOFRAME,$0-20
+	MOVW	kind+0(FP), R2
+	MOVD	limit+8(FP), R3
+	MOVW	$SYS_ugetrlimit, R1
+	SYSCALL
+	MOVW	R2, ret+16(FP)
+	RET
+
+TEXT runtime·usleep(SB),NOSPLIT,$16-4
+	MOVW	usec+0(FP), R2
+	MOVD	R2, R4
+	MOVW	$1000000, R3
+	DIVD	R3, R2
+	MOVD	R2, 8(R15)
+	MULLD	R2, R3
+	SUB	R3, R4
+	MOVD	R4, 16(R15)
+
+	// select(0, 0, 0, 0, &tv)
+	MOVW	$0, R2
+	MOVW	$0, R3
+	MOVW	$0, R4
+	MOVW	$0, R5
+	ADD	$8, R15, R6
+	MOVW	$SYS_select, R1
+	SYSCALL
+	RET
+
+TEXT runtime·gettid(SB),NOSPLIT,$0-4
+	MOVW	$SYS_gettid, R1
+	SYSCALL
+	MOVW	R2, ret+0(FP)
+	RET
+
+TEXT runtime·raise(SB),NOSPLIT|NOFRAME,$0
+	MOVW	$SYS_gettid, R1
+	SYSCALL
+	MOVW	R2, R2	// arg 1 tid
+	MOVW	sig+0(FP), R3	// arg 2
+	MOVW	$SYS_tkill, R1
+	SYSCALL
+	RET
+
+TEXT runtime·raiseproc(SB),NOSPLIT|NOFRAME,$0
+	MOVW	$SYS_getpid, R1
+	SYSCALL
+	MOVW	R2, R2	// arg 1 pid
+	MOVW	sig+0(FP), R3	// arg 2
+	MOVW	$SYS_kill, R1
+	SYSCALL
+	RET
+
+TEXT runtime·setitimer(SB),NOSPLIT|NOFRAME,$0-24
+	MOVW	mode+0(FP), R2
+	MOVD	new+8(FP), R3
+	MOVD	old+16(FP), R4
+	MOVW	$SYS_setitimer, R1
+	SYSCALL
+	RET
+
+TEXT runtime·mincore(SB),NOSPLIT|NOFRAME,$0-28
+	MOVD	addr+0(FP), R2
+	MOVD	n+8(FP), R3
+	MOVD	dst+16(FP), R4
+	MOVW	$SYS_mincore, R1
+	SYSCALL
+	MOVW	R2, ret+24(FP)
+	RET
+
+// func now() (sec int64, nsec int32)
+TEXT time·now(SB),NOSPLIT,$16
+	MOVD	$0(R15), R2
+	MOVD	$0, R3
+	MOVW	$SYS_gettimeofday, R1
+	SYSCALL
+	MOVD	0(R15), R2	// sec
+	MOVD	8(R15), R4	// usec
+	MOVD	$1000, R3
+	MULLD	R3, R4
+	MOVD	R2, sec+0(FP)
+	MOVW	R4, nsec+8(FP)
+	RET
+
+TEXT runtime·nanotime(SB),NOSPLIT,$16
+	MOVW	$1, R2 // CLOCK_MONOTONIC
+	MOVD	$0(R15), R3
+	MOVW	$SYS_clock_gettime, R1
+	SYSCALL
+	MOVD	0(R15), R2	// sec
+	MOVD	8(R15), R4	// nsec
+	// sec is in R2, nsec in R4
+	// return nsec in R2
+	MOVD	$1000000000, R3
+	MULLD	R3, R2
+	ADD	R4, R2
+	MOVD	R2, ret+0(FP)
+	RET
+
+TEXT runtime·rtsigprocmask(SB),NOSPLIT|NOFRAME,$0-28
+	MOVW	sig+0(FP), R2
+	MOVD	new+8(FP), R3
+	MOVD	old+16(FP), R4
+	MOVW	size+24(FP), R5
+	MOVW	$SYS_rt_sigprocmask, R1
+	SYSCALL
+	MOVD	$-4095, R3
+	CMPUBLT	R2, R3, 2(PC)
+	MOVD	R0, 0(R0) // crash
+	RET
+
+TEXT runtime·rt_sigaction(SB),NOSPLIT|NOFRAME,$0-36
+	MOVD	sig+0(FP), R2
+	MOVD	new+8(FP), R3
+	MOVD	old+16(FP), R4
+	MOVD	size+24(FP), R5
+	MOVW	$SYS_rt_sigaction, R1
+	SYSCALL
+	MOVW	R2, ret+32(FP)
+	RET
+
+TEXT runtime·sigfwd(SB),NOSPLIT,$0-32
+	MOVW	sig+8(FP), R2
+	MOVD	info+16(FP), R3
+	MOVD	ctx+24(FP), R4
+	MOVD	fn+0(FP), R5
+	BL	R5
+	RET
+
+TEXT runtime·sigtramp(SB),NOSPLIT,$64
+	// initialize essential registers (just in case)
+	XOR	R0, R0
+
+	// this might be called in external code context,
+	// where g is not set.
+	MOVB	runtime·iscgo(SB), R6
+	CMPBEQ	R6, $0, 2(PC)
+	BL	runtime·load_g(SB)
+
+	MOVW	R2, 8(R15)
+	MOVD	R3, 16(R15)
+	MOVD	R4, 24(R15)
+	MOVD	$runtime·sigtrampgo(SB), R5
+	BL	R5
+	RET
+
+TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0
+	BR	runtime·sigtramp(SB)
+
+// func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) unsafe.Pointer
+TEXT runtime·mmap(SB),NOSPLIT,$48-40
+	MOVD	addr+0(FP), R2
+	MOVD	n+8(FP), R3
+	MOVW	prot+16(FP), R4
+	MOVW	flags+20(FP), R5
+	MOVW	fd+24(FP), R6
+	MOVWZ	off+28(FP), R7
+
+	// s390x uses old_mmap, so the arguments need to be placed into
+	// a struct and a pointer to the struct passed to mmap.
+	MOVD	R2, addr-48(SP)
+	MOVD	R3, n-40(SP)
+	MOVD	R4, prot-32(SP)
+	MOVD	R5, flags-24(SP)
+	MOVD	R6, fd-16(SP)
+	MOVD	R7, off-8(SP)
+
+	MOVD	$addr-48(SP), R2
+	MOVW	$SYS_mmap, R1
+	SYSCALL
+	MOVD	$-4095, R3
+	CMPUBLT	R2, R3, 2(PC)
+	NEG	R2
+	MOVD	R2, ret+32(FP)
+	RET
+
+TEXT runtime·munmap(SB),NOSPLIT|NOFRAME,$0
+	MOVD	addr+0(FP), R2
+	MOVD	n+8(FP), R3
+	MOVW	$SYS_munmap, R1
+	SYSCALL
+	MOVD	$-4095, R3
+	CMPUBLT	R2, R3, 2(PC)
+	MOVD	R0, 0(R0) // crash
+	RET
+
+TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0
+	MOVD	addr+0(FP), R2
+	MOVD	n+8(FP), R3
+	MOVW	flags+16(FP), R4
+	MOVW	$SYS_madvise, R1
+	SYSCALL
+	// ignore failure - maybe pages are locked
+	RET
+
+// int64 futex(int32 *uaddr, int32 op, int32 val,
+//	struct timespec *timeout, int32 *uaddr2, int32 val2);
+TEXT runtime·futex(SB),NOSPLIT|NOFRAME,$0
+	MOVD	addr+0(FP), R2
+	MOVW	op+8(FP), R3
+	MOVW	val+12(FP), R4
+	MOVD	ts+16(FP), R5
+	MOVD	addr2+24(FP), R6
+	MOVW	val3+32(FP),  R7
+	MOVW	$SYS_futex, R1
+	SYSCALL
+	MOVW	R2, ret+40(FP)
+	RET
+
+// int32 clone(int32 flags, void *stk, M *mp, G *gp, void (*fn)(void));
+TEXT runtime·clone(SB),NOSPLIT|NOFRAME,$0
+	MOVW	flags+0(FP), R3
+	MOVD	stk+8(FP), R2
+
+	// Copy mp, gp, fn off parent stack for use by child.
+	// Careful: Linux system call clobbers ???.
+	MOVD	mm+16(FP), R7
+	MOVD	gg+24(FP), R8
+	MOVD	fn+32(FP), R9
+
+	MOVD	R7, -8(R2)
+	MOVD	R8, -16(R2)
+	MOVD	R9, -24(R2)
+	MOVD	$1234, R7
+	MOVD	R7, -32(R2)
+
+	SYSCALL $SYS_clone
+
+	// In parent, return.
+	CMPBEQ	R2, $0, 3(PC)
+	MOVW	R2, ret+40(FP)
+	RET
+
+	// In child, on new stack.
+	// initialize essential registers
+	XOR	R0, R0
+	MOVD	-32(R15), R7
+	CMP	R7, $1234
+	BEQ	2(PC)
+	MOVD	R0, 0(R0)
+
+	// Initialize m->procid to Linux tid
+	SYSCALL $SYS_gettid
+
+	MOVD	-24(R15), R9        // fn
+	MOVD	-16(R15), R8        // g
+	MOVD	-8(R15), R7         // m
+
+	CMPBEQ	R7, $0, nog
+	CMP	R8, $0
+	BEQ	nog
+
+	MOVD	R2, m_procid(R7)
+
+	// In child, set up new stack
+	MOVD	R7, g_m(R8)
+	MOVD	R8, g
+	//CALL	runtime·stackcheck(SB)
+
+nog:
+	// Call fn
+	BL	R9
+
+	// It shouldn't return. If it does, exit that thread.
+	MOVW	$111, R2
+	MOVW	$SYS_exit, R1
+	SYSCALL
+	BR	-2(PC)	// keep exiting
+
+TEXT runtime·sigaltstack(SB),NOSPLIT|NOFRAME,$0
+	MOVD	new+0(FP), R2
+	MOVD	old+8(FP), R3
+	MOVW	$SYS_sigaltstack, R1
+	SYSCALL
+	MOVD	$-4095, R3
+	CMPUBLT	R2, R3, 2(PC)
+	MOVD	R0, 0(R0) // crash
+	RET
+
+TEXT runtime·osyield(SB),NOSPLIT|NOFRAME,$0
+	MOVW	$SYS_sched_yield, R1
+	SYSCALL
+	RET
+
+TEXT runtime·sched_getaffinity(SB),NOSPLIT|NOFRAME,$0
+	MOVD	pid+0(FP), R2
+	MOVD	len+8(FP), R3
+	MOVD	buf+16(FP), R4
+	MOVW	$SYS_sched_getaffinity, R1
+	SYSCALL
+	MOVW	R2, ret+24(FP)
+	RET
+
+// int32 runtime·epollcreate(int32 size);
+TEXT runtime·epollcreate(SB),NOSPLIT|NOFRAME,$0
+	MOVW    size+0(FP), R2
+	MOVW	$SYS_epoll_create, R1
+	SYSCALL
+	MOVW	R2, ret+8(FP)
+	RET
+
+// int32 runtime·epollcreate1(int32 flags);
+TEXT runtime·epollcreate1(SB),NOSPLIT|NOFRAME,$0
+	MOVW	flags+0(FP), R2
+	MOVW	$SYS_epoll_create1, R1
+	SYSCALL
+	MOVW	R2, ret+8(FP)
+	RET
+
+// func epollctl(epfd, op, fd int32, ev *epollEvent) int
+TEXT runtime·epollctl(SB),NOSPLIT|NOFRAME,$0
+	MOVW	epfd+0(FP), R2
+	MOVW	op+4(FP), R3
+	MOVW	fd+8(FP), R4
+	MOVD	ev+16(FP), R5
+	MOVW	$SYS_epoll_ctl, R1
+	SYSCALL
+	MOVW	R2, ret+24(FP)
+	RET
+
+// int32 runtime·epollwait(int32 epfd, EpollEvent *ev, int32 nev, int32 timeout);
+TEXT runtime·epollwait(SB),NOSPLIT|NOFRAME,$0
+	MOVW	epfd+0(FP), R2
+	MOVD	ev+8(FP), R3
+	MOVW	nev+16(FP), R4
+	MOVW	timeout+20(FP), R5
+	MOVW	$SYS_epoll_wait, R1
+	SYSCALL
+	MOVW	R2, ret+24(FP)
+	RET
+
+// void runtime·closeonexec(int32 fd);
+TEXT runtime·closeonexec(SB),NOSPLIT|NOFRAME,$0
+	MOVW    fd+0(FP), R2  // fd
+	MOVD    $2, R3  // F_SETFD
+	MOVD    $1, R4  // FD_CLOEXEC
+	MOVW	$SYS_fcntl, R1
+	SYSCALL
+	RET
diff --git a/src/runtime/sys_s390x.go b/src/runtime/sys_s390x.go
new file mode 100644
index 0000000000..2aa81e75c0
--- /dev/null
+++ b/src/runtime/sys_s390x.go
@@ -0,0 +1,45 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import "unsafe"
+
+// adjust Gobuf as if it executed a call to fn with context ctxt
+// and then did an immediate Gosave.
+func gostartcall(buf *gobuf, fn, ctxt unsafe.Pointer) {
+	if buf.lr != 0 {
+		throw("invalid use of gostartcall")
+	}
+	buf.lr = buf.pc
+	buf.pc = uintptr(fn)
+	buf.ctxt = ctxt
+}
+
+// Called to rewind context saved during morestack back to beginning of function.
+// To help us, the linker emits a jmp back to the beginning right after the
+// call to morestack. We just have to decode and apply that jump.
+func rewindmorestack(buf *gobuf) {
+	var inst uint64
+	if buf.pc&1 == 0 && buf.pc != 0 {
+		inst = *(*uint64)(unsafe.Pointer(buf.pc))
+		switch inst >> 48 {
+		case 0xa7f4: // BRC (branch relative on condition) instruction.
+			inst >>= 32
+			inst &= 0xFFFF
+			offset := int64(int16(inst))
+			offset <<= 1
+			buf.pc += uintptr(offset)
+			return
+		case 0xc0f4: // BRCL (branch relative on condition long) instruction.
+			inst >>= 16
+			inst = inst & 0xFFFFFFFF
+			inst = (inst << 1) & 0xFFFFFFFF
+			buf.pc += uintptr(int32(inst))
+			return
+		}
+	}
+	print("runtime: pc=", hex(buf.pc), " ", hex(inst), "\n")
+	throw("runtime: misuse of rewindmorestack")
+}
diff --git a/src/runtime/syscall_windows_test.go b/src/runtime/syscall_windows_test.go
index ff045338c1..4a10749682 100644
--- a/src/runtime/syscall_windows_test.go
+++ b/src/runtime/syscall_windows_test.go
@@ -622,6 +622,13 @@ uintptr_t cfunc(callback f, uintptr_t n) {
 	}
 }
 
+func TestTimeBeginPeriod(t *testing.T) {
+	const TIMERR_NOERROR = 0
+	if *runtime.TimeBeginPeriodRetValue != TIMERR_NOERROR {
+		t.Fatalf("timeBeginPeriod failed: it returned %d", *runtime.TimeBeginPeriodRetValue)
+	}
+}
+
 // removeOneCPU removes one (any) cpu from affinity mask.
 // It returns new affinity mask.
 func removeOneCPU(mask uintptr) (uintptr, error) {
@@ -874,21 +881,10 @@ var (
 	modwinmm    = syscall.NewLazyDLL("winmm.dll")
 	modkernel32 = syscall.NewLazyDLL("kernel32.dll")
 
-	proctimeBeginPeriod = modwinmm.NewProc("timeBeginPeriod")
-	proctimeEndPeriod   = modwinmm.NewProc("timeEndPeriod")
-
 	procCreateEvent = modkernel32.NewProc("CreateEventW")
 	procSetEvent    = modkernel32.NewProc("SetEvent")
 )
 
-func timeBeginPeriod(period uint32) {
-	syscall.Syscall(proctimeBeginPeriod.Addr(), 1, uintptr(period), 0, 0)
-}
-
-func timeEndPeriod(period uint32) {
-	syscall.Syscall(proctimeEndPeriod.Addr(), 1, uintptr(period), 0, 0)
-}
-
 func createEvent() (syscall.Handle, error) {
 	r0, _, e0 := syscall.Syscall6(procCreateEvent.Addr(), 4, 0, 0, 0, 0, 0, 0)
 	if r0 == 0 {
@@ -905,7 +901,7 @@ func setEvent(h syscall.Handle) error {
 	return nil
 }
 
-func benchChanToSyscallPing(b *testing.B) {
+func BenchmarkChanToSyscallPing(b *testing.B) {
 	n := b.N
 	ch := make(chan int)
 	event, err := createEvent()
@@ -927,17 +923,7 @@ func benchChanToSyscallPing(b *testing.B) {
 	}
 }
 
-func BenchmarkChanToSyscallPing1ms(b *testing.B) {
-	timeBeginPeriod(1)
-	benchChanToSyscallPing(b)
-	timeEndPeriod(1)
-}
-
-func BenchmarkChanToSyscallPing15ms(b *testing.B) {
-	benchChanToSyscallPing(b)
-}
-
-func benchSyscallToSyscallPing(b *testing.B) {
+func BenchmarkSyscallToSyscallPing(b *testing.B) {
 	n := b.N
 	event1, err := createEvent()
 	if err != nil {
@@ -965,17 +951,7 @@ func benchSyscallToSyscallPing(b *testing.B) {
 	}
 }
 
-func BenchmarkSyscallToSyscallPing1ms(b *testing.B) {
-	timeBeginPeriod(1)
-	benchSyscallToSyscallPing(b)
-	timeEndPeriod(1)
-}
-
-func BenchmarkSyscallToSyscallPing15ms(b *testing.B) {
-	benchSyscallToSyscallPing(b)
-}
-
-func benchChanToChanPing(b *testing.B) {
+func BenchmarkChanToChanPing(b *testing.B) {
 	n := b.N
 	ch1 := make(chan int)
 	ch2 := make(chan int)
@@ -991,28 +967,8 @@ func benchChanToChanPing(b *testing.B) {
 	}
 }
 
-func BenchmarkChanToChanPing1ms(b *testing.B) {
-	timeBeginPeriod(1)
-	benchChanToChanPing(b)
-	timeEndPeriod(1)
-}
-
-func BenchmarkChanToChanPing15ms(b *testing.B) {
-	benchChanToChanPing(b)
-}
-
-func benchOsYield(b *testing.B) {
+func BenchmarkOsYield(b *testing.B) {
 	for i := 0; i < b.N; i++ {
 		runtime.OsYield()
 	}
 }
-
-func BenchmarkOsYield1ms(b *testing.B) {
-	timeBeginPeriod(1)
-	benchOsYield(b)
-	timeEndPeriod(1)
-}
-
-func BenchmarkOsYield15ms(b *testing.B) {
-	benchOsYield(b)
-}
diff --git a/src/runtime/tls_s390x.s b/src/runtime/tls_s390x.s
new file mode 100644
index 0000000000..cb6a21c114
--- /dev/null
+++ b/src/runtime/tls_s390x.s
@@ -0,0 +1,51 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "go_tls.h"
+#include "funcdata.h"
+#include "textflag.h"
+
+// We have to resort to a TLS variable to save g (R13).
+// One reason is that external code might trigger
+// SIGSEGV, and our runtime.sigtramp doesn't even know we
+// are in external code and will continue to use R13;
+// this might well result in another SIGSEGV.
+
+// save_g saves the g register into pthread-provided
+// thread-local memory, so that we can call externally compiled
+// s390x code that will overwrite this register.
+//
+// If !iscgo, this is a no-op.
+//
+// NOTE: setg_gcc<> assumes this clobbers only R10 and R11.
+TEXT runtime·save_g(SB),NOSPLIT|NOFRAME,$0-0
+	MOVB	runtime·iscgo(SB),  R10
+	CMPBEQ	R10, $0, nocgo
+	MOVW	AR0, R11
+	SLD	$32, R11
+	MOVW	AR1, R11
+	MOVD	runtime·tls_g(SB), R10
+	MOVD	g, 0(R10)(R11*1)
+nocgo:
+	RET
+
+// load_g loads the g register from pthread-provided
+// thread-local memory, for use after calling externally compiled
+// s390x code that overwrote those registers.
+//
+// This is never called directly from C code (it doesn't have to
+// follow the C ABI), but it may be called from a C context, where the
+// usual Go registers aren't set up.
+//
+// NOTE: _cgo_topofstack assumes this only clobbers g (R13), R10 and R11.
+TEXT runtime·load_g(SB),NOSPLIT|NOFRAME,$0-0
+	MOVW	AR0, R11
+	SLD	$32, R11
+	MOVW	AR1, R11
+	MOVD	runtime·tls_g(SB), R10
+	MOVD	0(R10)(R11*1), g
+	RET
+
+GLOBL runtime·tls_g+0(SB),TLSBSS,$8
diff --git a/src/runtime/trace.go b/src/runtime/trace.go
index 805c34f483..092f941f0c 100644
--- a/src/runtime/trace.go
+++ b/src/runtime/trace.go
@@ -13,7 +13,6 @@
 package runtime
 
 import (
-	"runtime/internal/atomic"
 	"runtime/internal/sys"
 	"unsafe"
 )
@@ -23,25 +22,25 @@ const (
 	traceEvNone           = 0  // unused
 	traceEvBatch          = 1  // start of per-P batch of events [pid, timestamp]
 	traceEvFrequency      = 2  // contains tracer timer frequency [frequency (ticks per second)]
-	traceEvStack          = 3  // stack [stack id, number of PCs, array of PCs]
+	traceEvStack          = 3  // stack [stack id, number of PCs, array of {PC, func string ID, file string ID, line}]
 	traceEvGomaxprocs     = 4  // current value of GOMAXPROCS [timestamp, GOMAXPROCS, stack id]
 	traceEvProcStart      = 5  // start of P [timestamp, thread id]
 	traceEvProcStop       = 6  // stop of P [timestamp]
-	traceEvGCStart        = 7  // GC start [timestamp, stack id]
+	traceEvGCStart        = 7  // GC start [timestamp, seq, stack id]
 	traceEvGCDone         = 8  // GC done [timestamp]
 	traceEvGCScanStart    = 9  // GC scan start [timestamp]
 	traceEvGCScanDone     = 10 // GC scan done [timestamp]
 	traceEvGCSweepStart   = 11 // GC sweep start [timestamp, stack id]
 	traceEvGCSweepDone    = 12 // GC sweep done [timestamp]
-	traceEvGoCreate       = 13 // goroutine creation [timestamp, new goroutine id, start PC, stack id]
-	traceEvGoStart        = 14 // goroutine starts running [timestamp, goroutine id]
+	traceEvGoCreate       = 13 // goroutine creation [timestamp, new goroutine id, new stack id, stack id]
+	traceEvGoStart        = 14 // goroutine starts running [timestamp, goroutine id, seq]
 	traceEvGoEnd          = 15 // goroutine ends [timestamp]
 	traceEvGoStop         = 16 // goroutine stops (like in select{}) [timestamp, stack]
 	traceEvGoSched        = 17 // goroutine calls Gosched [timestamp, stack]
 	traceEvGoPreempt      = 18 // goroutine is preempted [timestamp, stack]
 	traceEvGoSleep        = 19 // goroutine calls Sleep [timestamp, stack]
 	traceEvGoBlock        = 20 // goroutine blocks [timestamp, stack]
-	traceEvGoUnblock      = 21 // goroutine is unblocked [timestamp, goroutine id, stack]
+	traceEvGoUnblock      = 21 // goroutine is unblocked [timestamp, goroutine id, seq, stack]
 	traceEvGoBlockSend    = 22 // goroutine blocks on chan send [timestamp, stack]
 	traceEvGoBlockRecv    = 23 // goroutine blocks on chan recv [timestamp, stack]
 	traceEvGoBlockSelect  = 24 // goroutine blocks on select [timestamp, stack]
@@ -49,15 +48,19 @@ const (
 	traceEvGoBlockCond    = 26 // goroutine blocks on Cond [timestamp, stack]
 	traceEvGoBlockNet     = 27 // goroutine blocks on network [timestamp, stack]
 	traceEvGoSysCall      = 28 // syscall enter [timestamp, stack]
-	traceEvGoSysExit      = 29 // syscall exit [timestamp, goroutine id, real timestamp]
+	traceEvGoSysExit      = 29 // syscall exit [timestamp, goroutine id, seq, real timestamp]
 	traceEvGoSysBlock     = 30 // syscall blocks [timestamp]
-	traceEvGoWaiting      = 31 // denotes that goroutine is blocked when tracing starts [goroutine id]
-	traceEvGoInSyscall    = 32 // denotes that goroutine is in syscall when tracing starts [goroutine id]
+	traceEvGoWaiting      = 31 // denotes that goroutine is blocked when tracing starts [timestamp, goroutine id]
+	traceEvGoInSyscall    = 32 // denotes that goroutine is in syscall when tracing starts [timestamp, goroutine id]
 	traceEvHeapAlloc      = 33 // memstats.heap_live change [timestamp, heap_alloc]
 	traceEvNextGC         = 34 // memstats.next_gc change [timestamp, next_gc]
 	traceEvTimerGoroutine = 35 // denotes timer goroutine [timer goroutine id]
 	traceEvFutileWakeup   = 36 // denotes that the previous wakeup of this goroutine was futile [timestamp]
-	traceEvCount          = 37
+	traceEvString         = 37 // string dictionary entry [ID, length, string]
+	traceEvGoStartLocal   = 38 // goroutine starts running on the same P as the last event [timestamp, goroutine id]
+	traceEvGoUnblockLocal = 39 // goroutine is unblocked on the same P as the last event [timestamp, goroutine id, stack]
+	traceEvGoSysExitLocal = 40 // syscall exit on the same P as the last event [timestamp, goroutine id, real timestamp]
+	traceEvCount          = 41
 )
 
 const (
@@ -104,6 +107,7 @@ var trace struct {
 	ticksEnd      int64       // cputicks when tracing was stopped
 	timeStart     int64       // nanotime when tracing was started
 	timeEnd       int64       // nanotime when tracing was stopped
+	seqGC         uint64      // GC start/done sequencer
 	reading       traceBufPtr // buffer currently handed off to user
 	empty         traceBufPtr // stack of empty buffers
 	fullHead      traceBufPtr // queue of full buffers
@@ -111,35 +115,19 @@ var trace struct {
 	reader        *g              // goroutine that called ReadTrace, or nil
 	stackTab      traceStackTable // maps stack traces to unique ids
 
+	// Dictionary for traceEvString.
+	// Currently this is used only for func/file:line info after the tracing session,
+	// so we assume single-threaded access.
+	strings   map[string]uint64
+	stringSeq uint64
+
 	bufLock mutex       // protects buf
 	buf     traceBufPtr // global trace buffer, used when running without a p
 }
 
-var traceseq uint64 // global trace sequence number
-
-// tracestamp returns a consistent sequence number, time stamp pair
-// for use in a trace. We need to make sure that time stamp ordering
-// (assuming synchronized CPUs) and sequence ordering match.
-// To do that, we increment traceseq, grab ticks, and increment traceseq again.
-// We treat odd traceseq as a sign that another thread is in the middle
-// of the sequence and spin until it is done.
-// Not splitting stack to avoid preemption, just in case the call sites
-// that used to call xadd64 and cputicks are sensitive to that.
-//go:nosplit
-func tracestamp() (seq uint64, ts int64) {
-	seq = atomic.Load64(&traceseq)
-	for seq&1 != 0 || !atomic.Cas64(&traceseq, seq, seq+1) {
-		seq = atomic.Load64(&traceseq)
-	}
-	ts = cputicks()
-	atomic.Store64(&traceseq, seq+2)
-	return seq >> 1, ts
-}
-
 // traceBufHeader is per-P tracing buffer.
 type traceBufHeader struct {
 	link      traceBufPtr             // in trace.empty/full
-	lastSeq   uint64                  // sequence number of last event
 	lastTicks uint64                  // when we wrote the last event
 	pos       int                     // next write offset in arr
 	stk       [traceStackSize]uintptr // scratch buffer for traceback
@@ -187,11 +175,6 @@ func StartTrace() error {
 		return errorString("tracing is already enabled")
 	}
 
-	trace.seqStart, trace.ticksStart = tracestamp()
-	trace.timeStart = nanotime()
-	trace.headerWritten = false
-	trace.footerWritten = false
-
 	// Can't set trace.enabled yet. While the world is stopped, exitsyscall could
 	// already emit a delayed event (see exitTicks in exitsyscall) if we set trace.enabled here.
 	// That would lead to an inconsistent trace:
@@ -204,12 +187,15 @@ func StartTrace() error {
 	for _, gp := range allgs {
 		status := readgstatus(gp)
 		if status != _Gdead {
-			traceGoCreate(gp, gp.startpc)
+			traceGoCreate(gp, gp.startpc) // also resets gp.traceseq/tracelastp
 		}
 		if status == _Gwaiting {
+			// traceEvGoWaiting is implied to have seq=1.
+			gp.traceseq++
 			traceEvent(traceEvGoWaiting, -1, uint64(gp.goid))
 		}
 		if status == _Gsyscall {
+			gp.traceseq++
 			traceEvent(traceEvGoInSyscall, -1, uint64(gp.goid))
 		} else {
 			gp.sysblocktraced = false
@@ -217,6 +203,17 @@ func StartTrace() error {
 	}
 	traceProcStart()
 	traceGoStart()
+	// Note: ticksStart needs to be set after we emit the traceEvGoInSyscall events.
+	// If we do it the other way around, it is possible that exitsyscall will
+	// query sysexitticks after ticksStart but before the traceEvGoInSyscall timestamp.
+	// That would lead to the false conclusion that cputicks is broken.
+	trace.ticksStart = cputicks()
+	trace.timeStart = nanotime()
+	trace.headerWritten = false
+	trace.footerWritten = false
+	trace.strings = make(map[string]uint64)
+	trace.stringSeq = 0
+	trace.seqGC = 0
 	_g_.m.startingtrace = false
 	trace.enabled = true
 
@@ -272,8 +269,6 @@ func StopTrace() {
 
 	trace.enabled = false
 	trace.shutdown = true
-	trace.stackTab.dump()
-
 	unlock(&trace.bufLock)
 
 	startTheWorld()
@@ -309,6 +304,7 @@ func StopTrace() {
 		trace.empty = buf.ptr().link
 		sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf.ptr()), &memstats.other_sys)
 	}
+	trace.strings = nil
 	trace.shutdown = false
 	unlock(&trace.lock)
 }
@@ -348,7 +344,7 @@ func ReadTrace() []byte {
 		trace.headerWritten = true
 		trace.lockOwner = nil
 		unlock(&trace.lock)
-		return []byte("go 1.5 trace\x00\x00\x00\x00")
+		return []byte("go 1.7 trace\x00\x00\x00\x00")
 	}
 	// Wait for new data.
 	if trace.fullHead == 0 && !trace.shutdown {
@@ -374,12 +370,13 @@ func ReadTrace() []byte {
 		var data []byte
 		data = append(data, traceEvFrequency|0<<traceArgCountShift)
 		data = traceAppend(data, uint64(freq))
-		data = traceAppend(data, 0)
 		if timers.gp != nil {
 			data = append(data, traceEvTimerGoroutine|0<<traceArgCountShift)
 			data = traceAppend(data, uint64(timers.gp.goid))
-			data = traceAppend(data, 0)
 		}
+		// This will emit a bunch of full buffers; we will pick them up
+		// on the next iteration.
+		trace.stackTab.dump()
 		return data
 	}
 	// Done.
@@ -483,19 +480,14 @@ func traceEvent(ev byte, skip int, args ...uint64) {
 		(*bufp).set(buf)
 	}
 
-	seq, ticksraw := tracestamp()
-	seqDiff := seq - buf.lastSeq
-	ticks := uint64(ticksraw) / traceTickDiv
+	ticks := uint64(cputicks()) / traceTickDiv
 	tickDiff := ticks - buf.lastTicks
 	if buf.pos == 0 {
 		buf.byte(traceEvBatch | 1<<traceArgCountShift)
 		buf.varint(uint64(pid))
-		buf.varint(seq)
 		buf.varint(ticks)
-		seqDiff = 0
 		tickDiff = 0
 	}
-	buf.lastSeq = seq
 	buf.lastTicks = ticks
 	narg := byte(len(args))
 	if skip >= 0 {
@@ -514,7 +506,6 @@ func traceEvent(ev byte, skip int, args ...uint64) {
 		buf.varint(0)
 		lenp = &buf.arr[buf.pos-1]
 	}
-	buf.varint(seqDiff)
 	buf.varint(tickDiff)
 	for _, a := range args {
 		buf.varint(a)
@@ -603,6 +594,29 @@ func traceFlush(buf traceBufPtr) traceBufPtr {
 	return buf
 }
 
+func traceString(buf *traceBuf, s string) (uint64, *traceBuf) {
+	if s == "" {
+		return 0, buf
+	}
+	if id, ok := trace.strings[s]; ok {
+		return id, buf
+	}
+
+	trace.stringSeq++
+	id := trace.stringSeq
+	trace.strings[s] = id
+
+	size := 1 + 2*traceBytesPerNumber + len(s)
+	if len(buf.arr)-buf.pos < size {
+		buf = traceFlush(traceBufPtrOf(buf)).ptr()
+	}
+	buf.byte(traceEvString)
+	buf.varint(id)
+	buf.varint(uint64(len(s)))
+	buf.pos += copy(buf.arr[buf.pos:], s)
+	return id, buf
+}
+
 // traceAppend appends v to buf in little-endian-base-128 encoding.
 func traceAppend(buf []byte, v uint64) []byte {
 	for ; v >= 0x80; v >>= 7 {
@@ -716,23 +730,28 @@ func (tab *traceStackTable) newStack(n int) *traceStack {
 // dump writes all previously cached stacks to trace buffers,
 // releases all memory and resets state.
 func (tab *traceStackTable) dump() {
-	var tmp [(2 + traceStackSize) * traceBytesPerNumber]byte
+	frames := make(map[uintptr]traceFrame)
+	var tmp [(2 + 4*traceStackSize) * traceBytesPerNumber]byte
 	buf := traceFlush(0).ptr()
 	for _, stk := range tab.tab {
 		stk := stk.ptr()
 		for ; stk != nil; stk = stk.link.ptr() {
-			maxSize := 1 + (3+stk.n)*traceBytesPerNumber
-			if len(buf.arr)-buf.pos < maxSize {
-				buf = traceFlush(traceBufPtrOf(buf)).ptr()
-			}
-			// Form the event in the temp buffer, we need to know the actual length.
 			tmpbuf := tmp[:0]
 			tmpbuf = traceAppend(tmpbuf, uint64(stk.id))
 			tmpbuf = traceAppend(tmpbuf, uint64(stk.n))
 			for _, pc := range stk.stack() {
+				var frame traceFrame
+				frame, buf = traceFrameForPC(buf, frames, pc)
 				tmpbuf = traceAppend(tmpbuf, uint64(pc))
+				tmpbuf = traceAppend(tmpbuf, uint64(frame.funcID))
+				tmpbuf = traceAppend(tmpbuf, uint64(frame.fileID))
+				tmpbuf = traceAppend(tmpbuf, uint64(frame.line))
 			}
 			// Now copy to the buffer.
+			size := 1 + traceBytesPerNumber + len(tmpbuf)
+			if len(buf.arr)-buf.pos < size {
+				buf = traceFlush(traceBufPtrOf(buf)).ptr()
+			}
 			buf.byte(traceEvStack | 3<<traceArgCountShift)
 			buf.varint(uint64(len(tmpbuf)))
 			buf.pos += copy(buf.arr[buf.pos:], tmpbuf)
@@ -747,6 +766,39 @@ func (tab *traceStackTable) dump() {
 	*tab = traceStackTable{}
 }
 
+type traceFrame struct {
+	funcID uint64
+	fileID uint64
+	line   uint64
+}
+
+func traceFrameForPC(buf *traceBuf, frames map[uintptr]traceFrame, pc uintptr) (traceFrame, *traceBuf) {
+	if frame, ok := frames[pc]; ok {
+		return frame, buf
+	}
+
+	var frame traceFrame
+	f := findfunc(pc)
+	if f == nil {
+		frames[pc] = frame
+		return frame, buf
+	}
+
+	fn := funcname(f)
+	const maxLen = 1 << 10
+	if len(fn) > maxLen {
+		fn = fn[len(fn)-maxLen:]
+	}
+	frame.funcID, buf = traceString(buf, fn)
+	file, line := funcline(f, pc-sys.PCQuantum)
+	frame.line = uint64(line)
+	if len(file) > maxLen {
+		file = file[len(file)-maxLen:]
+	}
+	frame.fileID, buf = traceString(buf, file)
+	return frame, buf
+}
+
 // traceAlloc is a non-thread-safe region allocator.
 // It holds a linked list of traceAllocBlock.
 type traceAlloc struct {
@@ -820,7 +872,8 @@ func traceProcStop(pp *p) {
 }
 
 func traceGCStart() {
-	traceEvent(traceEvGCStart, 3)
+	traceEvent(traceEvGCStart, 3, trace.seqGC)
+	trace.seqGC++
 }
 
 func traceGCDone() {
@@ -844,11 +897,23 @@ func traceGCSweepDone() {
 }
 
 func traceGoCreate(newg *g, pc uintptr) {
-	traceEvent(traceEvGoCreate, 2, uint64(newg.goid), uint64(pc))
+	newg.traceseq = 0
+	newg.tracelastp = getg().m.p
+	// +PCQuantum because traceFrameForPC expects return PCs and subtracts PCQuantum.
+	id := trace.stackTab.put([]uintptr{pc + sys.PCQuantum})
+	traceEvent(traceEvGoCreate, 2, uint64(newg.goid), uint64(id))
 }
 
 func traceGoStart() {
-	traceEvent(traceEvGoStart, -1, uint64(getg().m.curg.goid))
+	_g_ := getg().m.curg
+	_p_ := _g_.m.p
+	_g_.traceseq++
+	if _g_.tracelastp == _p_ {
+		traceEvent(traceEvGoStartLocal, -1, uint64(_g_.goid))
+	} else {
+		_g_.tracelastp = _p_
+		traceEvent(traceEvGoStart, -1, uint64(_g_.goid), _g_.traceseq)
+	}
 }
 
 func traceGoEnd() {
@@ -856,10 +921,14 @@ func traceGoEnd() {
 }
 
 func traceGoSched() {
+	_g_ := getg()
+	_g_.tracelastp = _g_.m.p
 	traceEvent(traceEvGoSched, 1)
 }
 
 func traceGoPreempt() {
+	_g_ := getg()
+	_g_.tracelastp = _g_.m.p
 	traceEvent(traceEvGoPreempt, 1)
 }
 
@@ -871,19 +940,37 @@ func traceGoPark(traceEv byte, skip int, gp *g) {
 }
 
 func traceGoUnpark(gp *g, skip int) {
-	traceEvent(traceEvGoUnblock, skip, uint64(gp.goid))
+	_p_ := getg().m.p
+	gp.traceseq++
+	if gp.tracelastp == _p_ {
+		traceEvent(traceEvGoUnblockLocal, skip, uint64(gp.goid))
+	} else {
+		gp.tracelastp = _p_
+		traceEvent(traceEvGoUnblock, skip, uint64(gp.goid), gp.traceseq)
+	}
 }
 
 func traceGoSysCall() {
 	traceEvent(traceEvGoSysCall, 1)
 }
 
-func traceGoSysExit(seq uint64, ts int64) {
-	if int64(seq)-int64(trace.seqStart) < 0 {
-		// The timestamp was obtained during a previous tracing session, ignore.
-		return
+func traceGoSysExit(ts int64) {
+	if ts != 0 && ts < trace.ticksStart {
+		// There is a race between the code that initializes sysexitticks
+		// (in exitsyscall, which runs without a P, and therefore is not
+		// stopped with the rest of the world) and the code that initializes
+		// a new trace. The recorded sysexitticks must therefore be treated
+		// as "best effort". If they are valid for this trace, then great,
+		// use them for greater accuracy. But if they're not valid for this
+		// trace, assume that the trace was started after the actual syscall
+		// exit (but before we actually managed to start the goroutine,
+		// aka right now), and assign a fresh time stamp to keep the log consistent.
+		ts = 0
 	}
-	traceEvent(traceEvGoSysExit, -1, uint64(getg().m.curg.goid), seq, uint64(ts)/traceTickDiv)
+	_g_ := getg().m.curg
+	_g_.traceseq++
+	_g_.tracelastp = _g_.m.p
+	traceEvent(traceEvGoSysExit, -1, uint64(_g_.goid), _g_.traceseq, uint64(ts)/traceTickDiv)
 }
 
 func traceGoSysBlock(pp *p) {
diff --git a/src/runtime/trace/trace_stack_test.go b/src/runtime/trace/trace_stack_test.go
index b99ec687d5..52a71bfb94 100644
--- a/src/runtime/trace/trace_stack_test.go
+++ b/src/runtime/trace/trace_stack_test.go
@@ -125,14 +125,7 @@ func TestTraceSymbolize(t *testing.T) {
 	<-pipeReadDone
 
 	Stop()
-	events, _, err := parseTrace(t, buf)
-	if err != nil {
-		t.Fatalf("failed to parse trace: %v", err)
-	}
-	err = trace.Symbolize(events, os.Args[0])
-	if err != nil {
-		t.Fatalf("failed to symbolize trace: %v", err)
-	}
+	events, _ := parseTrace(t, buf)
 
 	// Now check that the stacks are correct.
 	type frame struct {
@@ -149,6 +142,9 @@ func TestTraceSymbolize(t *testing.T) {
 			{"runtime/trace_test.TestTraceSymbolize", 106},
 			{"testing.tRunner", 0},
 		}},
+		{trace.EvGoStart, []frame{
+			{"runtime/trace_test.TestTraceSymbolize.func1", 37},
+		}},
 		{trace.EvGoSched, []frame{
 			{"runtime/trace_test.TestTraceSymbolize", 107},
 			{"testing.tRunner", 0},
diff --git a/src/runtime/trace/trace_test.go b/src/runtime/trace/trace_test.go
index b787a2fc27..5fad3fb7f0 100644
--- a/src/runtime/trace/trace_test.go
+++ b/src/runtime/trace/trace_test.go
@@ -52,7 +52,7 @@ func TestTrace(t *testing.T) {
 		t.Fatalf("failed to start tracing: %v", err)
 	}
 	Stop()
-	_, err := trace.Parse(buf)
+	_, err := trace.Parse(buf, "")
 	if err == trace.ErrTimeOrder {
 		t.Skipf("skipping trace: %v", err)
 	}
@@ -61,13 +61,13 @@ func TestTrace(t *testing.T) {
 	}
 }
 
-func parseTrace(t *testing.T, r io.Reader) ([]*trace.Event, map[uint64]*trace.GDesc, error) {
-	events, err := trace.Parse(r)
+func parseTrace(t *testing.T, r io.Reader) ([]*trace.Event, map[uint64]*trace.GDesc) {
+	events, err := trace.Parse(r, "")
 	if err == trace.ErrTimeOrder {
 		t.Skipf("skipping trace: %v", err)
 	}
 	if err != nil {
-		return nil, nil, err
+		t.Fatalf("failed to parse trace: %v", err)
 	}
 	gs := trace.GoroutineStats(events)
 	for goid := range gs {
@@ -75,7 +75,31 @@ func parseTrace(t *testing.T, r io.Reader) ([]*trace.Event, map[uint64]*trace.GD
 		// But still check that RelatedGoroutines does not crash, hang, etc.
 		_ = trace.RelatedGoroutines(events, goid)
 	}
-	return events, gs, nil
+	return events, gs
+}
+
+func testBrokenTimestamps(t *testing.T, data []byte) {
+	// On some processors cputicks (used to generate trace timestamps)
+	// produces non-monotonic timestamps. It is important that the parser
+	// distinguishes logically inconsistent traces (e.g. missing, excessive
+	// or misordered events) from broken timestamps. The former is a bug
+	// in the tracer, the latter is a machine issue.
+	// So now that we have a consistent trace, test that (1) the parser does
+	// not return a logical error in case of broken timestamps
+	// and (2) broken timestamps are eventually detected and reported.
+	trace.BreakTimestampsForTesting = true
+	defer func() {
+		trace.BreakTimestampsForTesting = false
+	}()
+	for i := 0; i < 1e4; i++ {
+		_, err := trace.Parse(bytes.NewReader(data), "")
+		if err == trace.ErrTimeOrder {
+			return
+		}
+		if err != nil {
+			t.Fatalf("failed to parse trace: %v", err)
+		}
+	}
 }
 
 func TestTraceStress(t *testing.T) {
@@ -209,10 +233,9 @@ func TestTraceStress(t *testing.T) {
 	runtime.GOMAXPROCS(procs)
 
 	Stop()
-	_, _, err = parseTrace(t, buf)
-	if err != nil {
-		t.Fatalf("failed to parse trace: %v", err)
-	}
+	trace := buf.Bytes()
+	parseTrace(t, buf)
+	testBrokenTimestamps(t, trace)
 }
 
 // Do a bunch of various stuff (timers, GC, network, etc) in a separate goroutine.
@@ -353,9 +376,9 @@ func TestTraceStressStartStop(t *testing.T) {
 		}
 		time.Sleep(time.Millisecond)
 		Stop()
-		if _, _, err := parseTrace(t, buf); err != nil {
-			t.Fatalf("failed to parse trace: %v", err)
-		}
+		trace := buf.Bytes()
+		parseTrace(t, buf)
+		testBrokenTimestamps(t, trace)
 	}
 	<-outerDone
 }
@@ -413,10 +436,7 @@ func TestTraceFutileWakeup(t *testing.T) {
 	done.Wait()
 
 	Stop()
-	events, _, err := parseTrace(t, buf)
-	if err != nil {
-		t.Fatalf("failed to parse trace: %v", err)
-	}
+	events, _ := parseTrace(t, buf)
 	// Check that (1) trace does not contain EvFutileWakeup events and
 	// (2) there are no consecutive EvGoBlock/EvGCStart/EvGoBlock events
 	// (we call runtime.Gosched between all operations, so these would be futile wakeups).
diff --git a/src/runtime/type.go b/src/runtime/type.go
index fbf6f9973c..608c601abd 100644
--- a/src/runtime/type.go
+++ b/src/runtime/type.go
@@ -6,15 +6,20 @@
 
 package runtime
 
-import (
-	"runtime/internal/sys"
-	"unsafe"
-)
+import "unsafe"
 
-// tflag is documented in ../reflect/type.go.
+// tflag is documented in reflect/type.go.
+//
+// tflag values must be kept in sync with copies in:
+//	cmd/compile/internal/gc/reflect.go
+//	cmd/link/internal/ld/decodesym.go
+//	reflect/type.go
 type tflag uint8
 
-const tflagUncommon tflag = 1
+const (
+	tflagUncommon  tflag = 1 << 0
+	tflagExtraStar tflag = 1 << 1
+)
 
 // Needs to be in sync with ../cmd/compile/internal/ld/decodesym.go:/^func.commonsize,
 // ../cmd/compile/internal/gc/reflect.go:/^func.dcommontype and
@@ -31,8 +36,17 @@ type _type struct {
 	// gcdata stores the GC type data for the garbage collector.
 	// If the KindGCProg bit is set in kind, gcdata is a GC program.
 	// Otherwise it is a ptrmask bitmap. See mbitmap.go for details.
-	gcdata  *byte
-	_string string
+	gcdata *byte
+	str    nameOff
+	_      int32
+}
+
+func (t *_type) string() string {
+	s := t.nameOff(t.str).name()
+	if t.tflag&tflagExtraStar != 0 {
+		return s[1:]
+	}
+	return s
 }
 
 func (t *_type) uncommon() *uncommontype {
@@ -102,33 +116,160 @@ func hasPrefix(s, prefix string) bool {
 }
 
 func (t *_type) name() string {
-	if hasPrefix(t._string, "map[") {
+	s := t.string()
+	if hasPrefix(s, "map[") {
 		return ""
 	}
-	if hasPrefix(t._string, "struct {") {
+	if hasPrefix(s, "struct {") {
 		return ""
 	}
-	if hasPrefix(t._string, "chan ") {
+	if hasPrefix(s, "chan ") {
 		return ""
 	}
-	if hasPrefix(t._string, "chan<-") {
+	if hasPrefix(s, "chan<-") {
 		return ""
 	}
-	if hasPrefix(t._string, "func(") {
+	if hasPrefix(s, "func(") {
 		return ""
 	}
-	switch t._string[0] {
+	if hasPrefix(s, "interface {") {
+		return ""
+	}
+	switch s[0] {
 	case '[', '*', '<':
 		return ""
 	}
-	i := len(t._string) - 1
+	i := len(s) - 1
 	for i >= 0 {
-		if t._string[i] == '.' {
+		if s[i] == '.' {
 			break
 		}
 		i--
 	}
-	return t._string[i+1:]
+	return s[i+1:]
+}
+
+// reflectOffs holds type offsets defined at run time by the reflect package.
+//
+// When a type is defined at run time, its *rtype data lives on the heap.
+// The heap may use a wide range of possible addresses, some of which
+// may not be representable as a 32-bit offset. Moreover the GC may
+// one day start moving heap memory, in which case there is no stable
+// offset that can be defined.
+//
+// To provide stable offsets, we pin *rtype objects in a global map
+// and treat the offset as an identifier. We use negative offsets that
+// do not overlap with any compile-time module offsets.
+//
+// Entries are created by reflect.addReflectOff.
+var reflectOffs struct {
+	lock mutex
+	next int32
+	m    map[int32]unsafe.Pointer
+	minv map[unsafe.Pointer]int32
+}
+
+func resolveNameOff(ptrInModule unsafe.Pointer, off nameOff) name {
+	if off == 0 {
+		return name{}
+	}
+	base := uintptr(ptrInModule)
+	var md *moduledata
+	for next := &firstmoduledata; next != nil; next = next.next {
+		if base >= next.types && base < next.etypes {
+			md = next
+			break
+		}
+	}
+	if md == nil {
+		lock(&reflectOffs.lock)
+		res, found := reflectOffs.m[int32(off)]
+		unlock(&reflectOffs.lock)
+		if !found {
+			println("runtime: nameOff", hex(off), "base", hex(base), "not in ranges:")
+			for next := &firstmoduledata; next != nil; next = next.next {
+				println("\ttypes", hex(next.types), "etypes", hex(next.etypes))
+			}
+			throw("runtime: name offset base pointer out of range")
+		}
+		return name{(*byte)(res)}
+	}
+	res := md.types + uintptr(off)
+	if res > md.etypes {
+		println("runtime: nameOff", hex(off), "out of range", hex(md.types), "-", hex(md.etypes))
+		throw("runtime: name offset out of range")
+	}
+	return name{(*byte)(unsafe.Pointer(res))}
+}
+
+func (t *_type) nameOff(off nameOff) name {
+	return resolveNameOff(unsafe.Pointer(t), off)
+}
+
+func (t *_type) typeOff(off typeOff) *_type {
+	if off == 0 {
+		return nil
+	}
+	base := uintptr(unsafe.Pointer(t))
+	var md *moduledata
+	for next := &firstmoduledata; next != nil; next = next.next {
+		if base >= next.types && base < next.etypes {
+			md = next
+			break
+		}
+	}
+	if md == nil {
+		lock(&reflectOffs.lock)
+		res := reflectOffs.m[int32(off)]
+		unlock(&reflectOffs.lock)
+		if res == nil {
+			println("runtime: typeOff", hex(off), "base", hex(base), "not in ranges:")
+			for next := &firstmoduledata; next != nil; next = next.next {
+				println("\ttypes", hex(next.types), "etypes", hex(next.etypes))
+			}
+			throw("runtime: type offset base pointer out of range")
+		}
+		return (*_type)(res)
+	}
+	if t := md.typemap[off]; t != nil {
+		return t
+	}
+	res := md.types + uintptr(off)
+	if res > md.etypes {
+		println("runtime: typeOff", hex(off), "out of range", hex(md.types), "-", hex(md.etypes))
+		throw("runtime: type offset out of range")
+	}
+	return (*_type)(unsafe.Pointer(res))
+}
+
+func (t *_type) textOff(off textOff) unsafe.Pointer {
+	base := uintptr(unsafe.Pointer(t))
+	var md *moduledata
+	for next := &firstmoduledata; next != nil; next = next.next {
+		if base >= next.types && base < next.etypes {
+			md = next
+			break
+		}
+	}
+	if md == nil {
+		lock(&reflectOffs.lock)
+		res := reflectOffs.m[int32(off)]
+		unlock(&reflectOffs.lock)
+		if res == nil {
+			println("runtime: textOff", hex(off), "base", hex(base), "not in ranges:")
+			for next := &firstmoduledata; next != nil; next = next.next {
+				println("\ttypes", hex(next.types), "etypes", hex(next.etypes))
+			}
+			throw("runtime: text offset base pointer out of range")
+		}
+		return res
+	}
+	res := md.text + uintptr(off)
+	if res > md.etext {
+		println("runtime: textOff", hex(off), "out of range", hex(md.text), "-", hex(md.etext))
+		throw("runtime: text offset out of range")
+	}
+	return unsafe.Pointer(res)
 }
 
 func (t *functype) in() []*_type {
@@ -154,26 +295,31 @@ func (t *functype) dotdotdot() bool {
 	return t.outCount&(1<<15) != 0
 }
 
+type nameOff int32
+type typeOff int32
+type textOff int32
+
 type method struct {
-	name name
-	mtyp *_type
-	ifn  unsafe.Pointer
-	tfn  unsafe.Pointer
+	name nameOff
+	mtyp typeOff
+	ifn  textOff
+	tfn  textOff
 }
 
 type uncommontype struct {
-	pkgpath *string
-	mhdr    []method
+	pkgpath nameOff
+	mcount  uint16 // number of methods
+	moff    uint16 // offset from this uncommontype to [mcount]method
 }
 
 type imethod struct {
-	name  name
-	_type *_type
+	name nameOff
+	ityp typeOff
 }
 
 type interfacetype struct {
 	typ     _type
-	pkgpath *string
+	pkgpath name
 	mhdr    []imethod
 }
 
@@ -229,7 +375,7 @@ type structfield struct {
 
 type structtype struct {
 	typ     _type
-	pkgPath *string
+	pkgPath name
 	fields  []structfield
 }
 
@@ -239,19 +385,19 @@ type name struct {
 	bytes *byte
 }
 
-func (n *name) data(off int) *byte {
+func (n name) data(off int) *byte {
 	return (*byte)(add(unsafe.Pointer(n.bytes), uintptr(off)))
 }
 
-func (n *name) isExported() bool {
+func (n name) isExported() bool {
 	return (*n.bytes)&(1<<0) != 0
 }
 
-func (n *name) nameLen() int {
+func (n name) nameLen() int {
 	return int(uint16(*n.data(1))<<8 | uint16(*n.data(2)))
 }
 
-func (n *name) tagLen() int {
+func (n name) tagLen() int {
 	if *n.data(0)&(1<<1) == 0 {
 		return 0
 	}
@@ -259,7 +405,10 @@ func (n *name) tagLen() int {
 	return int(uint16(*n.data(off))<<8 | uint16(*n.data(off + 1)))
 }
 
-func (n *name) name() (s string) {
+func (n name) name() (s string) {
+	if n.bytes == nil {
+		return ""
+	}
 	nl := n.nameLen()
 	if nl == 0 {
 		return ""
@@ -270,14 +419,219 @@ func (n *name) name() (s string) {
 	return s
 }
 
-func (n *name) pkgPath() *string {
-	if *n.data(0)&(1<<2) == 0 {
-		return nil
+func (n name) tag() (s string) {
+	tl := n.tagLen()
+	if tl == 0 {
+		return ""
+	}
+	nl := n.nameLen()
+	hdr := (*stringStruct)(unsafe.Pointer(&s))
+	hdr.str = unsafe.Pointer(n.data(3 + nl + 2))
+	hdr.len = tl
+	return s
+}
+
+func (n name) pkgPath() string {
+	if n.bytes == nil || *n.data(0)&(1<<2) == 0 {
+		return ""
 	}
 	off := 3 + n.nameLen()
 	if tl := n.tagLen(); tl > 0 {
 		off += 2 + tl
 	}
-	off = int(round(uintptr(off), sys.PtrSize))
-	return *(**string)(unsafe.Pointer(n.data(off)))
+	var nameOff nameOff
+	copy((*[4]byte)(unsafe.Pointer(&nameOff))[:], (*[4]byte)(unsafe.Pointer(n.data(off)))[:])
+	pkgPathName := resolveNameOff(unsafe.Pointer(n.bytes), nameOff)
+	return pkgPathName.name()
+}
+
+// typelinksinit scans the types from extra modules and builds the
+// moduledata typemap used to de-duplicate type pointers.
+func typelinksinit() {
+	if firstmoduledata.next == nil {
+		return
+	}
+	typehash := make(map[uint32][]*_type)
+
+	modules := []*moduledata{}
+	for md := &firstmoduledata; md != nil; md = md.next {
+		modules = append(modules, md)
+	}
+	prev, modules := modules[len(modules)-1], modules[:len(modules)-1]
+	for len(modules) > 0 {
+		// Collect types from the previous module into typehash.
+	collect:
+		for _, tl := range prev.typelinks {
+			var t *_type
+			if prev.typemap == nil {
+				t = (*_type)(unsafe.Pointer(prev.types + uintptr(tl)))
+			} else {
+				t = prev.typemap[typeOff(tl)]
+			}
+			// Add to typehash if not seen before.
+			tlist := typehash[t.hash]
+			for _, tcur := range tlist {
+				if tcur == t {
+					continue collect
+				}
+			}
+			typehash[t.hash] = append(tlist, t)
+		}
+
+		// If any of this module's typelinks match a type from a
+		// prior module, prefer that prior type by adding the offset
+		// to this module's typemap.
+		md := modules[len(modules)-1]
+		md.typemap = make(map[typeOff]*_type, len(md.typelinks))
+		for _, tl := range md.typelinks {
+			t := (*_type)(unsafe.Pointer(md.types + uintptr(tl)))
+			for _, candidate := range typehash[t.hash] {
+				if typesEqual(t, candidate) {
+					t = candidate
+					break
+				}
+			}
+			md.typemap[typeOff(tl)] = t
+		}
+
+		prev, modules = md, modules[:len(modules)-1]
+	}
+}
+
+// typesEqual reports whether two types are equal.
+//
+// Everywhere in the runtime and reflect packages, it is assumed that
+// there is exactly one *_type per Go type, so that pointer equality
+// can be used to test if types are equal. There is one place that
+// breaks this assumption: buildmode=shared. In this case a type can
+// appear as two different pieces of memory. This is hidden from the
+// runtime and reflect package by the per-module typemap built in
+// typelinksinit. It uses typesEqual to map types from later modules
+// back into earlier ones.
+//
+// Only typelinksinit needs this function.
+func typesEqual(t, v *_type) bool {
+	if t == v {
+		return true
+	}
+	kind := t.kind & kindMask
+	if kind != v.kind&kindMask {
+		return false
+	}
+	if t.string() != v.string() {
+		return false
+	}
+	ut := t.uncommon()
+	uv := v.uncommon()
+	if ut != nil || uv != nil {
+		if ut == nil || uv == nil {
+			return false
+		}
+		pkgpatht := t.nameOff(ut.pkgpath).name()
+		pkgpathv := v.nameOff(uv.pkgpath).name()
+		if pkgpatht != pkgpathv {
+			return false
+		}
+	}
+	if kindBool <= kind && kind <= kindComplex128 {
+		return true
+	}
+	switch kind {
+	case kindString, kindUnsafePointer:
+		return true
+	case kindArray:
+		at := (*arraytype)(unsafe.Pointer(t))
+		av := (*arraytype)(unsafe.Pointer(v))
+		return typesEqual(at.elem, av.elem) && at.len == av.len
+	case kindChan:
+		ct := (*chantype)(unsafe.Pointer(t))
+		cv := (*chantype)(unsafe.Pointer(v))
+		return ct.dir == cv.dir && typesEqual(ct.elem, cv.elem)
+	case kindFunc:
+		ft := (*functype)(unsafe.Pointer(t))
+		fv := (*functype)(unsafe.Pointer(v))
+		if ft.outCount != fv.outCount || ft.inCount != fv.inCount {
+			return false
+		}
+		tin, vin := ft.in(), fv.in()
+		for i := 0; i < len(tin); i++ {
+			if !typesEqual(tin[i], vin[i]) {
+				return false
+			}
+		}
+		tout, vout := ft.out(), fv.out()
+		for i := 0; i < len(tout); i++ {
+			if !typesEqual(tout[i], vout[i]) {
+				return false
+			}
+		}
+		return true
+	case kindInterface:
+		it := (*interfacetype)(unsafe.Pointer(t))
+		iv := (*interfacetype)(unsafe.Pointer(v))
+		if it.pkgpath.name() != iv.pkgpath.name() {
+			return false
+		}
+		if len(it.mhdr) != len(iv.mhdr) {
+			return false
+		}
+		for i := range it.mhdr {
+			tm := &it.mhdr[i]
+			vm := &iv.mhdr[i]
+			tname := it.typ.nameOff(tm.name)
+			vname := iv.typ.nameOff(vm.name)
+			if tname.name() != vname.name() {
+				return false
+			}
+			if tname.pkgPath() != vname.pkgPath() {
+				return false
+			}
+			if !typesEqual(it.typ.typeOff(tm.ityp), iv.typ.typeOff(vm.ityp)) {
+				return false
+			}
+		}
+		return true
+	case kindMap:
+		mt := (*maptype)(unsafe.Pointer(t))
+		mv := (*maptype)(unsafe.Pointer(v))
+		return typesEqual(mt.key, mv.key) && typesEqual(mt.elem, mv.elem)
+	case kindPtr:
+		pt := (*ptrtype)(unsafe.Pointer(t))
+		pv := (*ptrtype)(unsafe.Pointer(v))
+		return typesEqual(pt.elem, pv.elem)
+	case kindSlice:
+		st := (*slicetype)(unsafe.Pointer(t))
+		sv := (*slicetype)(unsafe.Pointer(v))
+		return typesEqual(st.elem, sv.elem)
+	case kindStruct:
+		st := (*structtype)(unsafe.Pointer(t))
+		sv := (*structtype)(unsafe.Pointer(v))
+		if len(st.fields) != len(sv.fields) {
+			return false
+		}
+		for i := range st.fields {
+			tf := &st.fields[i]
+			vf := &sv.fields[i]
+			if tf.name.name() != vf.name.name() {
+				return false
+			}
+			if tf.name.pkgPath() != vf.name.pkgPath() {
+				return false
+			}
+			if !typesEqual(tf.typ, vf.typ) {
+				return false
+			}
+			if tf.name.tag() != vf.name.tag() {
+				return false
+			}
+			if tf.offset != vf.offset {
+				return false
+			}
+		}
+		return true
+	default:
+		println("runtime: impossible type kind", kind)
+		throw("runtime: impossible type kind")
+		return false
+	}
 }
diff --git a/src/runtime/unaligned1.go b/src/runtime/unaligned1.go
index 6bd9018352..754d63b417 100644
--- a/src/runtime/unaligned1.go
+++ b/src/runtime/unaligned1.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build 386 amd64 amd64p32 arm64 ppc64 ppc64le
+// +build 386 amd64 amd64p32 arm64 ppc64 ppc64le s390x
 
 package runtime
 
diff --git a/src/runtime/vdso_linux_amd64.go b/src/runtime/vdso_linux_amd64.go
index 42571e063c..8a970dfbe6 100644
--- a/src/runtime/vdso_linux_amd64.go
+++ b/src/runtime/vdso_linux_amd64.go
@@ -4,10 +4,7 @@
 
 package runtime
 
-import (
-	"runtime/internal/sys"
-	"unsafe"
-)
+import "unsafe"
 
 // Look up symbols in the Linux vDSO.
 
@@ -21,9 +18,7 @@ import (
 // http://refspecs.linuxfoundation.org/LSB_3.2.0/LSB-Core-generic/LSB-Core-generic/symversion.html
 
 const (
-	_AT_RANDOM       = 25
 	_AT_SYSINFO_EHDR = 33
-	_AT_NULL         = 0 /* End of vector */
 
 	_PT_LOAD    = 1 /* Loadable program segment */
 	_PT_DYNAMIC = 2 /* Dynamic linking information */
@@ -294,37 +289,18 @@ func vdso_parse_symbols(info *vdso_info, version int32) {
 	}
 }
 
-func sysargs(argc int32, argv **byte) {
-	n := argc + 1
-
-	// skip envp to get to ELF auxiliary vector.
-	for argv_index(argv, n) != nil {
-		n++
-	}
-
-	// skip NULL separator
-	n++
-
-	// now argv+n is auxv
-	auxv := (*[1 << 32]elf64Auxv)(add(unsafe.Pointer(argv), uintptr(n)*sys.PtrSize))
-
-	for i := 0; auxv[i].a_type != _AT_NULL; i++ {
-		av := &auxv[i]
-		switch av.a_type {
-		case _AT_SYSINFO_EHDR:
-			if av.a_val == 0 {
-				// Something went wrong
-				continue
-			}
-			var info vdso_info
-			// TODO(rsc): I don't understand why the compiler thinks info escapes
-			// when passed to the three functions below.
-			info1 := (*vdso_info)(noescape(unsafe.Pointer(&info)))
-			vdso_init_from_sysinfo_ehdr(info1, (*elf64Ehdr)(unsafe.Pointer(uintptr(av.a_val))))
-			vdso_parse_symbols(info1, vdso_find_version(info1, &linux26))
-
-		case _AT_RANDOM:
-			startupRandomData = (*[16]byte)(unsafe.Pointer(uintptr(av.a_val)))[:]
+func archauxv(tag, val uintptr) {
+	switch tag {
+	case _AT_SYSINFO_EHDR:
+		if val == 0 {
+			// Something went wrong
+			return
 		}
+		var info vdso_info
+		// TODO(rsc): I don't understand why the compiler thinks info escapes
+		// when passed to the three functions below.
+		info1 := (*vdso_info)(noescape(unsafe.Pointer(&info)))
+		vdso_init_from_sysinfo_ehdr(info1, (*elf64Ehdr)(unsafe.Pointer(val)))
+		vdso_parse_symbols(info1, vdso_find_version(info1, &linux26))
 	}
 }
diff --git a/src/runtime/vdso_none.go b/src/runtime/vdso_none.go
index b4e0a0e349..efae23f6ee 100644
--- a/src/runtime/vdso_none.go
+++ b/src/runtime/vdso_none.go
@@ -2,9 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !linux !amd64
-// +build !linux !386
-// +build !linux !arm
+// +build !linux
 
 package runtime
 
diff --git a/src/runtime/vlop_arm_test.go b/src/runtime/vlop_arm_test.go
index 1a211196f2..85cea923a9 100644
--- a/src/runtime/vlop_arm_test.go
+++ b/src/runtime/vlop_arm_test.go
@@ -82,3 +82,47 @@ func TestUsplit(t *testing.T) {
 		}
 	}
 }
+
+//go:noinline
+func armFloatWrite(a *[129]float64) { // stores 128.0 into a[128], the last element of the array
+	// This used to miscompile on arm5.
+	// The offset is too big to fit in a load.
+	// So the generated code does:
+	//   ldr     r0, [sp, #8]
+	//   bl      6f690 <_sfloat>
+	//   ldr     fp, [pc, #32]   ; (address of 128.0)
+	//   vldr    d0, [fp]
+	//   ldr     fp, [pc, #28]   ; (1024)
+	//   add     fp, fp, r0
+	//   vstr    d0, [fp]
+	// The software floating-point emulator gives up on the add
+	// that sits between the vldr and the vstr, so the store does not happen.
+	// See issue 15440.
+	a[128] = 128.0 // element 128 of a float64 array: byte offset 128*8 = 1024, the constant in the listing above
+}
+func TestArmFloatBigOffsetWrite(t *testing.T) { // checks that the store done by armFloatWrite actually lands in a[128]
+	var a [129]float64
+	for i := 0; i < 128; i++ { // fills a[0..127] only; a[128] stays zero until armFloatWrite runs
+		a[i] = float64(i)
+	}
+	armFloatWrite(&a)
+	for i, x := range a {
+		if x != float64(i) { // every entry must equal its index, including a[128] == 128.0
+			t.Errorf("bad entry %d:%f\n", i, x)
+		}
+	}
+}
+
+//go:noinline
+func armFloatRead(a *[129]float64) float64 { // returns the last element of the array
+	return a[128] // element 128 of a float64 array, i.e. byte offset 128*8 = 1024
+}
+func TestArmFloatBigOffsetRead(t *testing.T) { // checks that armFloatRead returns the value stored in a[128]
+	var a [129]float64
+	for i := 0; i < 129; i++ { // fills all 129 entries, so a[128] holds 128.0
+		a[i] = float64(i)
+	}
+	if x := armFloatRead(&a); x != 128.0 { // a[128] was set to float64(128) by the loop above
+		t.Errorf("bad value %f\n", x)
+	}
+}
diff --git a/src/runtime/vlrt.go b/src/runtime/vlrt.go
index 2419f78ce2..cd37828ae4 100644
--- a/src/runtime/vlrt.go
+++ b/src/runtime/vlrt.go
@@ -195,7 +195,6 @@ func dodiv(n, d uint64) (q, r uint64) {
 	if GOARCH == "arm" {
 		// arm doesn't have a division instruction, so
 		// slowdodiv is the best that we can do.
-		// TODO: revisit for arm64.
 		return slowdodiv(n, d)
 	}
author	Rick Hudson <rlh@golang.org>	2016-04-27 18:19:16 -0400
committer	Rick Hudson <rlh@golang.org>	2016-04-27 18:46:52 -0400
commit	23aeb34df172b17b7bfaa85fb59ca64bef9073bb (patch)
tree	a8ab866f1e50f0059856ce628f036d93ab620155 /src/runtime
parent	1354b32cd70f2702381764fd595dd2faa996840c (diff)
parent	d3c79d324acd7300b6f705e66af8ca711af00d9f (diff)
download	go-23aeb34df172b17b7bfaa85fb59ca64bef9073bb.tar.xz