54 files changed, 681 insertions, 1672 deletions
diff --git a/src/runtime/asm.s b/src/runtime/asm.s
index 95a3424de2..27d8df9e06 100644
--- a/src/runtime/asm.s
+++ b/src/runtime/asm.s
@@ -11,24 +11,3 @@
 DATA runtime·no_pointers_stackmap+0x00(SB)/4, $2
 DATA runtime·no_pointers_stackmap+0x04(SB)/4, $0
 GLOBL runtime·no_pointers_stackmap(SB),RODATA, $8
-
-// NaCl requires that these skips be verifiable machine code.
-#ifdef GOARCH_amd64
-#define SKIP4 BYTE $0x90; BYTE $0x90; BYTE $0x90; BYTE $0x90
-#endif
-#ifdef GOARCH_386
-#define SKIP4 BYTE $0x90; BYTE $0x90; BYTE $0x90; BYTE $0x90
-#endif
-#ifdef GOARCH_wasm
-#define SKIP4 UNDEF; UNDEF; UNDEF; UNDEF
-#endif
-#ifndef SKIP4
-#define SKIP4 WORD $0
-#endif
-
-#define SKIP16 SKIP4; SKIP4; SKIP4; SKIP4
-#define SKIP64 SKIP16; SKIP16; SKIP16; SKIP16
-
-// This function must be sizeofSkipFunction bytes.
-TEXT runtime·skipPleaseUseCallersFrames(SB),NOSPLIT,$0-0
-	SKIP64; SKIP64; SKIP64; SKIP64
diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s
index 11d2f2f51a..23387a2165 100644
--- a/src/runtime/asm_ppc64x.s
+++ b/src/runtime/asm_ppc64x.s
@@ -916,23 +916,23 @@ TEXT ·checkASM(SB),NOSPLIT,$0-1
 // - R20 is the destination of the write
 // - R21 is the value being written at R20.
 // It clobbers condition codes.
-// It does not clobber R0 through R15,
+// It does not clobber R0 through R17 (except special registers),
 // but may clobber any other register, *including* R31.
 TEXT runtime·gcWriteBarrier(SB),NOSPLIT,$112
 	// The standard prologue clobbers R31.
-	// We use R16 and R17 as scratch registers.
-	MOVD	g_m(g), R16
-	MOVD	m_p(R16), R16
-	MOVD	(p_wbBuf+wbBuf_next)(R16), R17
+	// We use R18 and R19 as scratch registers.
+	MOVD	g_m(g), R18
+	MOVD	m_p(R18), R18
+	MOVD	(p_wbBuf+wbBuf_next)(R18), R19
 	// Increment wbBuf.next position.
-	ADD	$16, R17
-	MOVD	R17, (p_wbBuf+wbBuf_next)(R16)
-	MOVD	(p_wbBuf+wbBuf_end)(R16), R16
-	CMP	R16, R17
+	ADD	$16, R19
+	MOVD	R19, (p_wbBuf+wbBuf_next)(R18)
+	MOVD	(p_wbBuf+wbBuf_end)(R18), R18
+	CMP	R18, R19
 	// Record the write.
-	MOVD	R21, -16(R17)	// Record value
-	MOVD	(R20), R16	// TODO: This turns bad writes into bad reads.
-	MOVD	R16, -8(R17)	// Record *slot
+	MOVD	R21, -16(R19)	// Record value
+	MOVD	(R20), R18	// TODO: This turns bad writes into bad reads.
+	MOVD	R18, -8(R19)	// Record *slot
 	// Is the buffer full? (flags set in CMP above)
 	BEQ	flush
 ret:
@@ -956,11 +956,12 @@ flush:
 	MOVD	R8, (FIXED_FRAME+56)(R1)
 	MOVD	R9, (FIXED_FRAME+64)(R1)
 	MOVD	R10, (FIXED_FRAME+72)(R1)
-	MOVD	R11, (FIXED_FRAME+80)(R1)
-	MOVD	R12, (FIXED_FRAME+88)(R1)
+	// R11, R12 may be clobbered by external-linker-inserted trampoline
 	// R13 is REGTLS
-	MOVD	R14, (FIXED_FRAME+96)(R1)
-	MOVD	R15, (FIXED_FRAME+104)(R1)
+	MOVD	R14, (FIXED_FRAME+80)(R1)
+	MOVD	R15, (FIXED_FRAME+88)(R1)
+	MOVD	R16, (FIXED_FRAME+96)(R1)
+	MOVD	R17, (FIXED_FRAME+104)(R1)
 
 	// This takes arguments R20 and R21.
 	CALL	runtime·wbBufFlush(SB)
@@ -975,10 +976,10 @@ flush:
 	MOVD	(FIXED_FRAME+56)(R1), R8
 	MOVD	(FIXED_FRAME+64)(R1), R9
 	MOVD	(FIXED_FRAME+72)(R1), R10
-	MOVD	(FIXED_FRAME+80)(R1), R11
-	MOVD	(FIXED_FRAME+88)(R1), R12
-	MOVD	(FIXED_FRAME+96)(R1), R14
-	MOVD	(FIXED_FRAME+104)(R1), R15
+	MOVD	(FIXED_FRAME+80)(R1), R14
+	MOVD	(FIXED_FRAME+88)(R1), R15
+	MOVD	(FIXED_FRAME+96)(R1), R16
+	MOVD	(FIXED_FRAME+104)(R1), R17
 	JMP	ret
 
 // Note: these functions use a special calling convention to save generated code space.
diff --git a/src/runtime/asm_riscv64.s b/src/runtime/asm_riscv64.s
index d7c45a183d..8f6c8773eb 100644
--- a/src/runtime/asm_riscv64.s
+++ b/src/runtime/asm_riscv64.s
@@ -79,7 +79,7 @@ TEXT setg_gcc<>(SB),NOSPLIT,$0-0
 
 // func cputicks() int64
 TEXT runtime·cputicks(SB),NOSPLIT,$0-8
-	WORD	$0xc0102573	// rdtime a0
+	RDTIME	A0
 	MOV	A0, ret+0(FP)
 	RET
 
diff --git a/src/runtime/cgocall.go b/src/runtime/cgocall.go
index a4e64b00cc..099aa540e0 100644
--- a/src/runtime/cgocall.go
+++ b/src/runtime/cgocall.go
@@ -605,7 +605,7 @@ func cgoCheckUnknownPointer(p unsafe.Pointer, msg string) (base, i uintptr) {
 		hbits := heapBitsForAddr(base)
 		n := span.elemsize
 		for i = uintptr(0); i < n; i += sys.PtrSize {
-			if i != 1*sys.PtrSize && !hbits.morePointers() {
+			if !hbits.morePointers() {
 				// No more possible pointers.
 				break
 			}
diff --git a/src/runtime/chan.go b/src/runtime/chan.go
index f6f4ffd02e..0afe5d962b 100644
--- a/src/runtime/chan.go
+++ b/src/runtime/chan.go
@@ -263,18 +263,19 @@ func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool {
 	}
 	gp.waiting = nil
 	gp.activeStackChans = false
-	if gp.param == nil {
-		if c.closed == 0 {
-			throw("chansend: spurious wakeup")
-		}
-		panic(plainError("send on closed channel"))
-	}
+	closed := !mysg.success
 	gp.param = nil
 	if mysg.releasetime > 0 {
 		blockevent(mysg.releasetime-t0, 2)
 	}
 	mysg.c = nil
 	releaseSudog(mysg)
+	if closed {
+		if c.closed == 0 {
+			throw("chansend: spurious wakeup")
+		}
+		panic(plainError("send on closed channel"))
+	}
 	return true
 }
 
@@ -311,6 +312,7 @@ func send(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) {
 	gp := sg.g
 	unlockf()
 	gp.param = unsafe.Pointer(sg)
+	sg.success = true
 	if sg.releasetime != 0 {
 		sg.releasetime = cputicks()
 	}
@@ -384,7 +386,8 @@ func closechan(c *hchan) {
 			sg.releasetime = cputicks()
 		}
 		gp := sg.g
-		gp.param = nil
+		gp.param = unsafe.Pointer(sg)
+		sg.success = false
 		if raceenabled {
 			raceacquireg(gp, c.raceaddr())
 		}
@@ -402,7 +405,8 @@ func closechan(c *hchan) {
 			sg.releasetime = cputicks()
 		}
 		gp := sg.g
-		gp.param = nil
+		gp.param = unsafe.Pointer(sg)
+		sg.success = false
 		if raceenabled {
 			raceacquireg(gp, c.raceaddr())
 		}
@@ -575,11 +579,11 @@ func chanrecv(c *hchan, ep unsafe.Pointer, block bool) (selected, received bool)
 	if mysg.releasetime > 0 {
 		blockevent(mysg.releasetime-t0, 2)
 	}
-	closed := gp.param == nil
+	success := mysg.success
 	gp.param = nil
 	mysg.c = nil
 	releaseSudog(mysg)
-	return true, !closed
+	return true, success
 }
 
 // recv processes a receive operation on a full channel c.
@@ -632,6 +636,7 @@ func recv(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) {
 	gp := sg.g
 	unlockf()
 	gp.param = unsafe.Pointer(sg)
+	sg.success = true
 	if sg.releasetime != 0 {
 		sg.releasetime = cputicks()
 	}
diff --git a/src/runtime/checkptr_test.go b/src/runtime/checkptr_test.go
index 8ab8a4937c..194cc1243a 100644
--- a/src/runtime/checkptr_test.go
+++ b/src/runtime/checkptr_test.go
@@ -27,6 +27,7 @@ func TestCheckPtr(t *testing.T) {
 		{"CheckPtrAlignmentPtr", "fatal error: checkptr: misaligned pointer conversion\n"},
 		{"CheckPtrAlignmentNoPtr", ""},
 		{"CheckPtrArithmetic", "fatal error: checkptr: pointer arithmetic result points to invalid allocation\n"},
+		{"CheckPtrArithmetic2", "fatal error: checkptr: pointer arithmetic result points to invalid allocation\n"},
 		{"CheckPtrSize", "fatal error: checkptr: converted pointer straddles multiple allocations\n"},
 		{"CheckPtrSmall", "fatal error: checkptr: pointer arithmetic computed bad pointer value\n"},
 	}
diff --git a/src/runtime/closure_test.go b/src/runtime/closure_test.go
index ea65fbd5f5..741c932eab 100644
--- a/src/runtime/closure_test.go
+++ b/src/runtime/closure_test.go
@@ -1,6 +1,7 @@
 // Copyright 2011 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
+
 package runtime_test
 
 import "testing"
diff --git a/src/runtime/defs_linux_386.go b/src/runtime/defs_linux_386.go
index f4db8cf927..64a0fbcaaa 100644
--- a/src/runtime/defs_linux_386.go
+++ b/src/runtime/defs_linux_386.go
@@ -226,14 +226,3 @@ type sockaddr_un struct {
 	family uint16
 	path   [108]byte
 }
-
-const __NEW_UTS_LEN = 64
-
-type new_utsname struct {
-	sysname    [__NEW_UTS_LEN + 1]byte
-	nodename   [__NEW_UTS_LEN + 1]byte
-	release    [__NEW_UTS_LEN + 1]byte
-	version    [__NEW_UTS_LEN + 1]byte
-	machine    [__NEW_UTS_LEN + 1]byte
-	domainname [__NEW_UTS_LEN + 1]byte
-}
diff --git a/src/runtime/defs_linux_amd64.go b/src/runtime/defs_linux_amd64.go
index 8480d85219..1ae18a309b 100644
--- a/src/runtime/defs_linux_amd64.go
+++ b/src/runtime/defs_linux_amd64.go
@@ -262,14 +262,3 @@ type sockaddr_un struct {
 	family uint16
 	path   [108]byte
 }
-
-const __NEW_UTS_LEN = 64
-
-type new_utsname struct {
-	sysname    [__NEW_UTS_LEN + 1]byte
-	nodename   [__NEW_UTS_LEN + 1]byte
-	release    [__NEW_UTS_LEN + 1]byte
-	version    [__NEW_UTS_LEN + 1]byte
-	machine    [__NEW_UTS_LEN + 1]byte
-	domainname [__NEW_UTS_LEN + 1]byte
-}
diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index 5ab03f3f99..d591fdc4e9 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -43,8 +43,6 @@ var PhysHugePageSize = physHugePageSize
 
 var NetpollGenericInit = netpollGenericInit
 
-var ParseRelease = parseRelease
-
 var Memmove = memmove
 var MemclrNoHeapPointers = memclrNoHeapPointers
 
diff --git a/src/runtime/gcinfo_test.go b/src/runtime/gcinfo_test.go
index ec1ba90c2e..0808b416f0 100644
--- a/src/runtime/gcinfo_test.go
+++ b/src/runtime/gcinfo_test.go
@@ -77,7 +77,7 @@ func TestGCInfo(t *testing.T) {
 	}
 
 	for i := 0; i < 10; i++ {
-		verifyGCInfo(t, "heap Ptr", escape(new(Ptr)), trimDead(padDead(infoPtr)))
+		verifyGCInfo(t, "heap Ptr", escape(new(Ptr)), trimDead(infoPtr))
 		verifyGCInfo(t, "heap PtrSlice", escape(&make([]*byte, 10)[0]), trimDead(infoPtr10))
 		verifyGCInfo(t, "heap ScalarPtr", escape(new(ScalarPtr)), trimDead(infoScalarPtr))
 		verifyGCInfo(t, "heap ScalarPtrSlice", escape(&make([]ScalarPtr, 4)[0]), trimDead(infoScalarPtr4))
@@ -97,25 +97,10 @@ func verifyGCInfo(t *testing.T, name string, p interface{}, mask0 []byte) {
 	}
 }
 
-func padDead(mask []byte) []byte {
-	// Because the dead bit isn't encoded in the second word,
-	// and because on 32-bit systems a one-word allocation
-	// uses a two-word block, the pointer info for a one-word
-	// object needs to be expanded to include an extra scalar
-	// on 32-bit systems to match the heap bitmap.
-	if runtime.PtrSize == 4 && len(mask) == 1 {
-		return []byte{mask[0], 0}
-	}
-	return mask
-}
-
 func trimDead(mask []byte) []byte {
-	for len(mask) > 2 && mask[len(mask)-1] == typeScalar {
+	for len(mask) > 0 && mask[len(mask)-1] == typeScalar {
 		mask = mask[:len(mask)-1]
 	}
-	if len(mask) == 2 && mask[0] == typeScalar && mask[1] == typeScalar {
-		mask = mask[:0]
-	}
 	return mask
 }
 
diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go
index cfd5c251b4..4c35309211 100644
--- a/src/runtime/heapdump.go
+++ b/src/runtime/heapdump.go
@@ -713,7 +713,7 @@ func makeheapobjbv(p uintptr, size uintptr) bitvector {
 	i := uintptr(0)
 	hbits := heapBitsForAddr(p)
 	for ; i < nptr; i++ {
-		if i != 1 && !hbits.morePointers() {
+		if !hbits.morePointers() {
 			break // end of object
 		}
 		if hbits.isPointer() {
diff --git a/src/runtime/lockrank.go b/src/runtime/lockrank.go
index 000193585d..b23cf767be 100644
--- a/src/runtime/lockrank.go
+++ b/src/runtime/lockrank.go
@@ -67,8 +67,6 @@ const (
 	lockRankRwmutexW
 	lockRankRwmutexR
 
-	lockRankMcentral // For !go115NewMCentralImpl
-	lockRankSpine    // For !go115NewMCentralImpl
 	lockRankSpanSetSpine
 	lockRankGscan
 	lockRankStackpool
@@ -149,8 +147,6 @@ var lockNames = []string{
 	lockRankRwmutexW: "rwmutexW",
 	lockRankRwmutexR: "rwmutexR",
 
-	lockRankMcentral:     "mcentral",
-	lockRankSpine:        "spine",
 	lockRankSpanSetSpine: "spanSetSpine",
 	lockRankGscan:        "gscan",
 	lockRankStackpool:    "stackpool",
@@ -228,18 +224,16 @@ var lockPartialOrder [][]lockRank = [][]lockRank{
 	lockRankRwmutexW: {},
 	lockRankRwmutexR: {lockRankRwmutexW},
 
-	lockRankMcentral:     {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
-	lockRankSpine:        {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
 	lockRankSpanSetSpine: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
-	lockRankGscan:        {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot, lockRankNotifyList, lockRankProf, lockRankGcBitsArenas, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankMcentral, lockRankSpine, lockRankSpanSetSpine},
-	lockRankStackpool:    {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankRwmutexR, lockRankMcentral, lockRankSpine, lockRankSpanSetSpine, lockRankGscan},
-	lockRankStackLarge:   {lockRankSysmon, lockRankAssistQueue, lockRankSched, lockRankItab, lockRankHchan, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankMcentral, lockRankSpanSetSpine, lockRankGscan},
+	lockRankGscan:        {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot, lockRankNotifyList, lockRankProf, lockRankGcBitsArenas, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankSpanSetSpine},
+	lockRankStackpool:    {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankRwmutexR, lockRankSpanSetSpine, lockRankGscan},
+	lockRankStackLarge:   {lockRankSysmon, lockRankAssistQueue, lockRankSched, lockRankItab, lockRankHchan, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankSpanSetSpine, lockRankGscan},
 	lockRankDefer:        {},
 	lockRankSudog:        {lockRankNotifyList, lockRankHchan},
 	lockRankWbufSpans:    {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankAllg, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankRoot, lockRankGscan, lockRankDefer, lockRankSudog},
-	lockRankMheap:        {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan, lockRankMspanSpecial, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankMcentral, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans, lockRankSpanSetSpine},
+	lockRankMheap:        {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan, lockRankMspanSpecial, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans, lockRankSpanSetSpine},
 	lockRankMheapSpecial: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
-	lockRankGlobalAlloc:  {lockRankProf, lockRankSpine, lockRankSpanSetSpine, lockRankMheap, lockRankMheapSpecial},
+	lockRankGlobalAlloc:  {lockRankProf, lockRankSpanSetSpine, lockRankMheap, lockRankMheapSpecial},
 
 	lockRankGFree:     {lockRankSched},
 	lockRankHchanLeaf: {lockRankGscan, lockRankHchanLeaf},
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index b3fac3de24..e46327f9ce 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -1178,11 +1178,9 @@ func largeAlloc(size uintptr, needzero bool, noscan bool) *mspan {
 	if s == nil {
 		throw("out of memory")
 	}
-	if go115NewMCentralImpl {
-		// Put the large span in the mcentral swept list so that it's
-		// visible to the background sweeper.
-		mheap_.central[spc].mcentral.fullSwept(mheap_.sweepgen).push(s)
-	}
+	// Put the large span in the mcentral swept list so that it's
+	// visible to the background sweeper.
+	mheap_.central[spc].mcentral.fullSwept(mheap_.sweepgen).push(s)
 	s.limit = s.base() + size
 	heapBitsForAddr(s.base()).initSpan(s)
 	return s
diff --git a/src/runtime/map_benchmark_test.go b/src/runtime/map_benchmark_test.go
index 893cb6c5b6..d0becc9ddb 100644
--- a/src/runtime/map_benchmark_test.go
+++ b/src/runtime/map_benchmark_test.go
@@ -1,6 +1,7 @@
 // Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
+
 package runtime_test
 
 import (
diff --git a/src/runtime/map_fast32.go b/src/runtime/map_fast32.go
index 534454f3ad..d035ed0386 100644
--- a/src/runtime/map_fast32.go
+++ b/src/runtime/map_fast32.go
@@ -299,8 +299,12 @@ search:
 				continue
 			}
 			// Only clear key if there are pointers in it.
-			if t.key.ptrdata != 0 {
-				memclrHasPointers(k, t.key.size)
+			// This can only happen if pointers are 32 bit
+			// wide as 64 bit pointers do not fit into a 32 bit key.
+			if sys.PtrSize == 4 && t.key.ptrdata != 0 {
+				// The key must be a pointer as we checked pointers are
+				// 32 bits wide and the key is 32 bits wide also.
+				*(*unsafe.Pointer)(k) = nil
 			}
 			e := add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.elemsize))
 			if t.elem.ptrdata != 0 {
diff --git a/src/runtime/map_fast64.go b/src/runtime/map_fast64.go
index 1669c7cfe9..f1f3927598 100644
--- a/src/runtime/map_fast64.go
+++ b/src/runtime/map_fast64.go
@@ -300,7 +300,13 @@ search:
 			}
 			// Only clear key if there are pointers in it.
 			if t.key.ptrdata != 0 {
-				memclrHasPointers(k, t.key.size)
+				if sys.PtrSize == 8 {
+					*(*unsafe.Pointer)(k) = nil
+				} else {
+					// There are three ways to squeeze at one ore more 32 bit pointers into 64 bits.
+					// Just call memclrHasPointers instead of trying to handle all cases here.
+					memclrHasPointers(k, 8)
+				}
 			}
 			e := add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.elemsize))
 			if t.elem.ptrdata != 0 {
diff --git a/src/runtime/map_test.go b/src/runtime/map_test.go
index 1b7ccad6ed..302b3c23c1 100644
--- a/src/runtime/map_test.go
+++ b/src/runtime/map_test.go
@@ -993,6 +993,27 @@ func benchmarkMapDeleteStr(b *testing.B, n int) {
 	}
 }
 
+func benchmarkMapDeletePointer(b *testing.B, n int) {
+	i2p := make([]*int, n)
+	for i := 0; i < n; i++ {
+		i2p[i] = new(int)
+	}
+	a := make(map[*int]int, n)
+	b.ResetTimer()
+	k := 0
+	for i := 0; i < b.N; i++ {
+		if len(a) == 0 {
+			b.StopTimer()
+			for j := 0; j < n; j++ {
+				a[i2p[j]] = j
+			}
+			k = i
+			b.StartTimer()
+		}
+		delete(a, i2p[i-k])
+	}
+}
+
 func runWith(f func(*testing.B, int), v ...int) func(*testing.B) {
 	return func(b *testing.B) {
 		for _, n := range v {
@@ -1023,6 +1044,7 @@ func BenchmarkMapDelete(b *testing.B) {
 	b.Run("Int32", runWith(benchmarkMapDeleteInt32, 100, 1000, 10000))
 	b.Run("Int64", runWith(benchmarkMapDeleteInt64, 100, 1000, 10000))
 	b.Run("Str", runWith(benchmarkMapDeleteStr, 100, 1000, 10000))
+	b.Run("Pointer", runWith(benchmarkMapDeletePointer, 100, 1000, 10000))
 }
 
 func TestDeferDeleteSlow(t *testing.T) {
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index 35332c91c4..8de44c14b9 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -6,10 +6,11 @@
 //
 // Stack, data, and bss bitmaps
 //
-// Stack frames and global variables in the data and bss sections are described
-// by 1-bit bitmaps in which 0 means uninteresting and 1 means live pointer
-// to be visited during GC. The bits in each byte are consumed starting with
-// the low bit: 1<<0, 1<<1, and so on.
+// Stack frames and global variables in the data and bss sections are
+// described by bitmaps with 1 bit per pointer-sized word. A "1" bit
+// means the word is a live pointer to be visited by the GC (referred to
+// as "pointer"). A "0" bit means the word should be ignored by GC
+// (referred to as "scalar", though it could be a dead pointer value).
 //
 // Heap bitmap
 //
@@ -20,18 +21,13 @@
 // through start+3*ptrSize, ha.bitmap[1] holds the entries for
 // start+4*ptrSize through start+7*ptrSize, and so on.
 //
-// In each 2-bit entry, the lower bit holds the same information as in the 1-bit
-// bitmaps: 0 means uninteresting and 1 means live pointer to be visited during GC.
-// The meaning of the high bit depends on the position of the word being described
-// in its allocated object. In all words *except* the second word, the
-// high bit indicates that the object is still being described. In
-// these words, if a bit pair with a high bit 0 is encountered, the
-// low bit can also be assumed to be 0, and the object description is
-// over. This 00 is called the ``dead'' encoding: it signals that the
-// rest of the words in the object are uninteresting to the garbage
-// collector.
-//
-// In the second word, the high bit is the GC ``checkmarked'' bit (see below).
+// In each 2-bit entry, the lower bit is a pointer/scalar bit, just
+// like in the stack/data bitmaps described above. The upper bit
+// indicates scan/dead: a "1" value ("scan") indicates that there may
+// be pointers in later words of the allocation, and a "0" value
+// ("dead") indicates there are no more pointers in the allocation. If
+// the upper bit is 0, the lower bit must also be 0, and this
+// indicates scanning can ignore the rest of the allocation.
 //
 // The 2-bit entries are split when written into the byte, so that the top half
 // of the byte contains 4 high bits and the bottom half contains 4 low (pointer)
@@ -39,38 +35,14 @@
 // This form allows a copy from the 1-bit to the 4-bit form to keep the
 // pointer bits contiguous, instead of having to space them out.
 //
-// The code makes use of the fact that the zero value for a heap bitmap
-// has no live pointer bit set and is (depending on position), not used,
-// not checkmarked, and is the dead encoding.
-// These properties must be preserved when modifying the encoding.
+// The code makes use of the fact that the zero value for a heap
+// bitmap means scalar/dead. This property must be preserved when
+// modifying the encoding.
 //
 // The bitmap for noscan spans is not maintained. Code must ensure
 // that an object is scannable before consulting its bitmap by
 // checking either the noscan bit in the span or by consulting its
 // type's information.
-//
-// Checkmarks
-//
-// In a concurrent garbage collector, one worries about failing to mark
-// a live object due to mutations without write barriers or bugs in the
-// collector implementation. As a sanity check, the GC has a 'checkmark'
-// mode that retraverses the object graph with the world stopped, to make
-// sure that everything that should be marked is marked.
-// In checkmark mode, in the heap bitmap, the high bit of the 2-bit entry
-// for the second word of the object holds the checkmark bit.
-// When not in checkmark mode, this bit is set to 1.
-//
-// The smallest possible allocation is 8 bytes. On a 32-bit machine, that
-// means every allocated object has two words, so there is room for the
-// checkmark bit. On a 64-bit machine, however, the 8-byte allocation is
-// just one word, so the second bit pair is not available for encoding the
-// checkmark. However, because non-pointer allocations are combined
-// into larger 16-byte (maxTinySize) allocations, a plain 8-byte allocation
-// must be a pointer, so the type bit in the first word is not actually needed.
-// It is still used in general, except in checkmark the type bit is repurposed
-// as the checkmark bit and then reinitialized (to 1) as the type bit when
-// finished.
-//
 
 package runtime
 
@@ -551,33 +523,6 @@ func (h heapBits) isPointer() bool {
 	return h.bits()&bitPointer != 0
 }
 
-// isCheckmarked reports whether the heap bits have the checkmarked bit set.
-// It must be told how large the object at h is, because the encoding of the
-// checkmark bit varies by size.
-// h must describe the initial word of the object.
-func (h heapBits) isCheckmarked(size uintptr) bool {
-	if size == sys.PtrSize {
-		return (*h.bitp>>h.shift)&bitPointer != 0
-	}
-	// All multiword objects are 2-word aligned,
-	// so we know that the initial word's 2-bit pair
-	// and the second word's 2-bit pair are in the
-	// same heap bitmap byte, *h.bitp.
-	return (*h.bitp>>(heapBitsShift+h.shift))&bitScan != 0
-}
-
-// setCheckmarked sets the checkmarked bit.
-// It must be told how large the object at h is, because the encoding of the
-// checkmark bit varies by size.
-// h must describe the initial word of the object.
-func (h heapBits) setCheckmarked(size uintptr) {
-	if size == sys.PtrSize {
-		atomic.Or8(h.bitp, bitPointer<<h.shift)
-		return
-	}
-	atomic.Or8(h.bitp, bitScan<<(heapBitsShift+h.shift))
-}
-
 // bulkBarrierPreWrite executes a write barrier
 // for every pointer slot in the memory range [src, src+size),
 // using pointer/scalar information from [dst, dst+size).
@@ -795,7 +740,6 @@ func typeBitsBulkBarrier(typ *_type, dst, src, size uintptr) {
 // TODO(rsc): Perhaps introduce a different heapBitsSpan type.
 
 // initSpan initializes the heap bitmap for a span.
-// It clears all checkmark bits.
 // If this is a span of pointer-sized objects, it initializes all
 // words to pointer/scan.
 // Otherwise, it initializes all words to scalar/dead.
@@ -826,45 +770,6 @@ func (h heapBits) initSpan(s *mspan) {
 	}
 }
 
-// initCheckmarkSpan initializes a span for being checkmarked.
-// It clears the checkmark bits, which are set to 1 in normal operation.
-func (h heapBits) initCheckmarkSpan(size, n, total uintptr) {
-	// The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
-	if sys.PtrSize == 8 && size == sys.PtrSize {
-		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
-		// Only possible on 64-bit system, since minimum size is 8.
-		// Must clear type bit (checkmark bit) of every word.
-		// The type bit is the lower of every two-bit pair.
-		for i := uintptr(0); i < n; i += wordsPerBitmapByte {
-			*h.bitp &^= bitPointerAll
-			h = h.forward(wordsPerBitmapByte)
-		}
-		return
-	}
-	for i := uintptr(0); i < n; i++ {
-		*h.bitp &^= bitScan << (heapBitsShift + h.shift)
-		h = h.forward(size / sys.PtrSize)
-	}
-}
-
-// clearCheckmarkSpan undoes all the checkmarking in a span.
-// The actual checkmark bits are ignored, so the only work to do
-// is to fix the pointer bits. (Pointer bits are ignored by scanobject
-// but consulted by typedmemmove.)
-func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
-	// The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
-	if sys.PtrSize == 8 && size == sys.PtrSize {
-		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
-		// Only possible on 64-bit system, since minimum size is 8.
-		// Must clear type bit (checkmark bit) of every word.
-		// The type bit is the lower of every two-bit pair.
-		for i := uintptr(0); i < n; i += wordsPerBitmapByte {
-			*h.bitp |= bitPointerAll
-			h = h.forward(wordsPerBitmapByte)
-		}
-	}
-}
-
 // countAlloc returns the number of objects allocated in span s by
 // scanning the allocation bitmap.
 func (s *mspan) countAlloc() int {
@@ -957,11 +862,11 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 			if sys.PtrSize == 4 && dataSize == sys.PtrSize {
 				// 1 pointer object. On 32-bit machines clear the bit for the
 				// unused second word.
-				*h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift
+				*h.bitp &^= (bitPointer | bitScan | (bitPointer|bitScan)<<heapBitsShift) << h.shift
 				*h.bitp |= (bitPointer | bitScan) << h.shift
 			} else {
 				// 2-element slice of pointer.
-				*h.bitp |= (bitPointer | bitScan | bitPointer<<heapBitsShift) << h.shift
+				*h.bitp |= (bitPointer | bitScan | (bitPointer|bitScan)<<heapBitsShift) << h.shift
 			}
 			return
 		}
@@ -974,11 +879,10 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 			}
 		}
 		b := uint32(*ptrmask)
-		hb := (b & 3) | bitScan
-		// bitPointer == 1, bitScan is 1 << 4, heapBitsShift is 1.
-		// 110011 is shifted h.shift and complemented.
-		// This clears out the bits that are about to be
-		// ored into *h.hbitp in the next instructions.
+		hb := b & 3
+		hb |= bitScanAll & ((bitScan << (typ.ptrdata / sys.PtrSize)) - 1)
+		// Clear the bits for this object so we can set the
+		// appropriate ones.
 		*h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift
 		*h.bitp |= uint8(hb << h.shift)
 		return
@@ -1155,11 +1059,6 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 		throw("heapBitsSetType: called with non-pointer type")
 		return
 	}
-	if nw < 2 {
-		// Must write at least 2 words, because the "no scan"
-		// encoding doesn't take effect until the third word.
-		nw = 2
-	}
 
 	// Phase 1: Special case for leading byte (shift==0) or half-byte (shift==2).
 	// The leading byte is special because it contains the bits for word 1,
@@ -1172,21 +1071,22 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 
 	case h.shift == 0:
 		// Ptrmask and heap bitmap are aligned.
-		// Handle first byte of bitmap specially.
+		//
+		// This is a fast path for small objects.
 		//
 		// The first byte we write out covers the first four
 		// words of the object. The scan/dead bit on the first
 		// word must be set to scan since there are pointers
-		// somewhere in the object. The scan/dead bit on the
-		// second word is the checkmark, so we don't set it.
+		// somewhere in the object.
 		// In all following words, we set the scan/dead
 		// appropriately to indicate that the object contains
 		// to the next 2-bit entry in the bitmap.
 		//
-		// TODO: It doesn't matter if we set the checkmark, so
-		// maybe this case isn't needed any more.
+		// We set four bits at a time here, but if the object
+		// is fewer than four words, phase 3 will clear
+		// unnecessary bits.
 		hb = b & bitPointerAll
-		hb |= bitScan | bitScan<<(2*heapBitsShift) | bitScan<<(3*heapBitsShift)
+		hb |= bitScanAll
 		if w += 4; w >= nw {
 			goto Phase3
 		}
@@ -1203,14 +1103,13 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 		// We took care of 1-word and 2-word objects above,
 		// so this is at least a 6-word object.
 		hb = (b & (bitPointer | bitPointer<<heapBitsShift)) << (2 * heapBitsShift)
-		// This is not noscan, so set the scan bit in the
-		// first word.
 		hb |= bitScan << (2 * heapBitsShift)
+		if nw > 1 {
+			hb |= bitScan << (3 * heapBitsShift)
+		}
 		b >>= 2
 		nb -= 2
-		// Note: no bitScan for second word because that's
-		// the checkmark.
-		*hbitp &^= uint8((bitPointer | bitScan | (bitPointer << heapBitsShift)) << (2 * heapBitsShift))
+		*hbitp &^= uint8((bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << (2 * heapBitsShift))
 		*hbitp |= uint8(hb)
 		hbitp = add1(hbitp)
 		if w += 2; w >= nw {
@@ -1403,17 +1302,20 @@ Phase4:
 	// Double check the whole bitmap.
 	if doubleCheck {
 		// x+size may not point to the heap, so back up one
-		// word and then call next().
-		end := heapBitsForAddr(x + size - sys.PtrSize).next()
-		endAI := arenaIdx(end.arena)
-		if !outOfPlace && (end.bitp == nil || (end.shift == 0 && end.bitp == &mheap_.arenas[endAI.l1()][endAI.l2()].bitmap[0])) {
-			// The unrolling code above walks hbitp just
-			// past the bitmap without moving to the next
-			// arena. Synthesize this for end.bitp.
-			end.arena--
-			endAI = arenaIdx(end.arena)
-			end.bitp = addb(&mheap_.arenas[endAI.l1()][endAI.l2()].bitmap[0], heapArenaBitmapBytes)
-			end.last = nil
+		// word and then advance it the way we do above.
+		end := heapBitsForAddr(x + size - sys.PtrSize)
+		if outOfPlace {
+			// In out-of-place copying, we just advance
+			// using next.
+			end = end.next()
+		} else {
+			// Don't use next because that may advance to
+			// the next arena and the in-place logic
+			// doesn't do that.
+			end.shift += heapBitsShift
+			if end.shift == 4*heapBitsShift {
+				end.bitp, end.shift = add1(end.bitp), 0
+			}
 		}
 		if typ.kind&kindGCProg == 0 && (hbitp != end.bitp || (w == nw+2) != (end.shift == 2)) {
 			println("ended at wrong bitmap byte for", typ.string(), "x", dataSize/typ.size)
@@ -1437,19 +1339,16 @@ Phase4:
 			var have, want uint8
 			have = (*h.bitp >> h.shift) & (bitPointer | bitScan)
 			if i >= totalptr {
-				want = 0 // deadmarker
 				if typ.kind&kindGCProg != 0 && i < (totalptr+3)/4*4 {
+					// heapBitsSetTypeGCProg always fills
+					// in full nibbles of bitScan.
 					want = bitScan
 				}
 			} else {
 				if j < nptr && (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
 					want |= bitPointer
 				}
-				if i != 1 {
-					want |= bitScan
-				} else {
-					have &^= bitScan
-				}
+				want |= bitScan
 			}
 			if have != want {
 				println("mismatch writing bits for", typ.string(), "x", dataSize/typ.size)
@@ -2009,7 +1908,7 @@ func getgcmask(ep interface{}) (mask []byte) {
 			if hbits.isPointer() {
 				mask[i/sys.PtrSize] = 1
 			}
-			if i != 1*sys.PtrSize && !hbits.morePointers() {
+			if !hbits.morePointers() {
 				mask = mask[:i/sys.PtrSize]
 				break
 			}
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index 5bceb51ac9..7a7d33ccae 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -131,11 +131,7 @@ func (c *mcache) refill(spc spanClass) {
 		if s.sweepgen != mheap_.sweepgen+3 {
 			throw("bad sweepgen in refill")
 		}
-		if go115NewMCentralImpl {
-			mheap_.central[spc].mcentral.uncacheSpan(s)
-		} else {
-			atomic.Store(&s.sweepgen, mheap_.sweepgen)
-		}
+		mheap_.central[spc].mcentral.uncacheSpan(s)
 	}
 
 	// Get a new cached span from the central lists.
diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go
index ed49d86d0c..ed49e01677 100644
--- a/src/runtime/mcentral.go
+++ b/src/runtime/mcentral.go
@@ -18,7 +18,6 @@ import "runtime/internal/atomic"
 //
 //go:notinheap
 type mcentral struct {
-	lock      mutex
 	spanclass spanClass
 
 	// For !go115NewMCentralImpl.
@@ -55,16 +54,10 @@ type mcentral struct {
 // Initialize a single central free list.
 func (c *mcentral) init(spc spanClass) {
 	c.spanclass = spc
-	if go115NewMCentralImpl {
-		lockInit(&c.partial[0].spineLock, lockRankSpanSetSpine)
-		lockInit(&c.partial[1].spineLock, lockRankSpanSetSpine)
-		lockInit(&c.full[0].spineLock, lockRankSpanSetSpine)
-		lockInit(&c.full[1].spineLock, lockRankSpanSetSpine)
-	} else {
-		c.nonempty.init()
-		c.empty.init()
-		lockInit(&c.lock, lockRankMcentral)
-	}
+	lockInit(&c.partial[0].spineLock, lockRankSpanSetSpine)
+	lockInit(&c.partial[1].spineLock, lockRankSpanSetSpine)
+	lockInit(&c.full[0].spineLock, lockRankSpanSetSpine)
+	lockInit(&c.full[1].spineLock, lockRankSpanSetSpine)
 }
 
 // partialUnswept returns the spanSet which holds partially-filled
@@ -93,9 +86,6 @@ func (c *mcentral) fullSwept(sweepgen uint32) *spanSet {
 
 // Allocate a span to use in an mcache.
 func (c *mcentral) cacheSpan() *mspan {
-	if !go115NewMCentralImpl {
-		return c.oldCacheSpan()
-	}
 	// Deduct credit for this span allocation and sweep if necessary.
 	spanBytes := uintptr(class_to_allocnpages[c.spanclass.sizeclass()]) * _PageSize
 	deductSweepCredit(spanBytes, 0)
@@ -213,127 +203,11 @@ havespan:
 	return s
 }
 
-// Allocate a span to use in an mcache.
-//
-// For !go115NewMCentralImpl.
-func (c *mcentral) oldCacheSpan() *mspan {
-	// Deduct credit for this span allocation and sweep if necessary.
-	spanBytes := uintptr(class_to_allocnpages[c.spanclass.sizeclass()]) * _PageSize
-	deductSweepCredit(spanBytes, 0)
-
-	lock(&c.lock)
-	traceDone := false
-	if trace.enabled {
-		traceGCSweepStart()
-	}
-	sg := mheap_.sweepgen
-retry:
-	var s *mspan
-	for s = c.nonempty.first; s != nil; s = s.next {
-		if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
-			c.nonempty.remove(s)
-			c.empty.insertBack(s)
-			unlock(&c.lock)
-			s.sweep(true)
-			goto havespan
-		}
-		if s.sweepgen == sg-1 {
-			// the span is being swept by background sweeper, skip
-			continue
-		}
-		// we have a nonempty span that does not require sweeping, allocate from it
-		c.nonempty.remove(s)
-		c.empty.insertBack(s)
-		unlock(&c.lock)
-		goto havespan
-	}
-
-	for s = c.empty.first; s != nil; s = s.next {
-		if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
-			// we have an empty span that requires sweeping,
-			// sweep it and see if we can free some space in it
-			c.empty.remove(s)
-			// swept spans are at the end of the list
-			c.empty.insertBack(s)
-			unlock(&c.lock)
-			s.sweep(true)
-			freeIndex := s.nextFreeIndex()
-			if freeIndex != s.nelems {
-				s.freeindex = freeIndex
-				goto havespan
-			}
-			lock(&c.lock)
-			// the span is still empty after sweep
-			// it is already in the empty list, so just retry
-			goto retry
-		}
-		if s.sweepgen == sg-1 {
-			// the span is being swept by background sweeper, skip
-			continue
-		}
-		// already swept empty span,
-		// all subsequent ones must also be either swept or in process of sweeping
-		break
-	}
-	if trace.enabled {
-		traceGCSweepDone()
-		traceDone = true
-	}
-	unlock(&c.lock)
-
-	// Replenish central list if empty.
-	s = c.grow()
-	if s == nil {
-		return nil
-	}
-	lock(&c.lock)
-	c.empty.insertBack(s)
-	unlock(&c.lock)
-
-	// At this point s is a non-empty span, queued at the end of the empty list,
-	// c is unlocked.
-havespan:
-	if trace.enabled && !traceDone {
-		traceGCSweepDone()
-	}
-	n := int(s.nelems) - int(s.allocCount)
-	if n == 0 || s.freeindex == s.nelems || uintptr(s.allocCount) == s.nelems {
-		throw("span has no free objects")
-	}
-	// Assume all objects from this span will be allocated in the
-	// mcache. If it gets uncached, we'll adjust this.
-	atomic.Xadd64(&c.nmalloc, int64(n))
-	usedBytes := uintptr(s.allocCount) * s.elemsize
-	atomic.Xadd64(&memstats.heap_live, int64(spanBytes)-int64(usedBytes))
-	if trace.enabled {
-		// heap_live changed.
-		traceHeapAlloc()
-	}
-	if gcBlackenEnabled != 0 {
-		// heap_live changed.
-		gcController.revise()
-	}
-	freeByteBase := s.freeindex &^ (64 - 1)
-	whichByte := freeByteBase / 8
-	// Init alloc bits cache.
-	s.refillAllocCache(whichByte)
-
-	// Adjust the allocCache so that s.freeindex corresponds to the low bit in
-	// s.allocCache.
-	s.allocCache >>= s.freeindex % 64
-
-	return s
-}
-
 // Return span from an mcache.
 //
 // s must have a span class corresponding to this
 // mcentral and it must not be empty.
 func (c *mcentral) uncacheSpan(s *mspan) {
-	if !go115NewMCentralImpl {
-		c.oldUncacheSpan(s)
-		return
-	}
 	if s.allocCount == 0 {
 		throw("uncaching span but s.allocCount == 0")
 	}
@@ -393,111 +267,6 @@ func (c *mcentral) uncacheSpan(s *mspan) {
 	}
 }
 
-// Return span from an mcache.
-//
-// For !go115NewMCentralImpl.
-func (c *mcentral) oldUncacheSpan(s *mspan) {
-	if s.allocCount == 0 {
-		throw("uncaching span but s.allocCount == 0")
-	}
-
-	sg := mheap_.sweepgen
-	stale := s.sweepgen == sg+1
-	if stale {
-		// Span was cached before sweep began. It's our
-		// responsibility to sweep it.
-		//
-		// Set sweepgen to indicate it's not cached but needs
-		// sweeping and can't be allocated from. sweep will
-		// set s.sweepgen to indicate s is swept.
-		atomic.Store(&s.sweepgen, sg-1)
-	} else {
-		// Indicate that s is no longer cached.
-		atomic.Store(&s.sweepgen, sg)
-	}
-
-	n := int(s.nelems) - int(s.allocCount)
-	if n > 0 {
-		// cacheSpan updated alloc assuming all objects on s
-		// were going to be allocated. Adjust for any that
-		// weren't. We must do this before potentially
-		// sweeping the span.
-		atomic.Xadd64(&c.nmalloc, -int64(n))
-
-		lock(&c.lock)
-		c.empty.remove(s)
-		c.nonempty.insert(s)
-		if !stale {
-			// mCentral_CacheSpan conservatively counted
-			// unallocated slots in heap_live. Undo this.
-			//
-			// If this span was cached before sweep, then
-			// heap_live was totally recomputed since
-			// caching this span, so we don't do this for
-			// stale spans.
-			atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize))
-		}
-		unlock(&c.lock)
-	}
-
-	if stale {
-		// Now that s is in the right mcentral list, we can
-		// sweep it.
-		s.sweep(false)
-	}
-}
-
-// freeSpan updates c and s after sweeping s.
-// It sets s's sweepgen to the latest generation,
-// and, based on the number of free objects in s,
-// moves s to the appropriate list of c or returns it
-// to the heap.
-// freeSpan reports whether s was returned to the heap.
-// If preserve=true, it does not move s (the caller
-// must take care of it).
-//
-// For !go115NewMCentralImpl.
-func (c *mcentral) freeSpan(s *mspan, preserve bool, wasempty bool) bool {
-	if sg := mheap_.sweepgen; s.sweepgen == sg+1 || s.sweepgen == sg+3 {
-		throw("freeSpan given cached span")
-	}
-	s.needzero = 1
-
-	if preserve {
-		// preserve is set only when called from (un)cacheSpan above,
-		// the span must be in the empty list.
-		if !s.inList() {
-			throw("can't preserve unlinked span")
-		}
-		atomic.Store(&s.sweepgen, mheap_.sweepgen)
-		return false
-	}
-
-	lock(&c.lock)
-
-	// Move to nonempty if necessary.
-	if wasempty {
-		c.empty.remove(s)
-		c.nonempty.insert(s)
-	}
-
-	// delay updating sweepgen until here. This is the signal that
-	// the span may be used in an mcache, so it must come after the
-	// linked list operations above (actually, just after the
-	// lock of c above.)
-	atomic.Store(&s.sweepgen, mheap_.sweepgen)
-
-	if s.allocCount != 0 {
-		unlock(&c.lock)
-		return false
-	}
-
-	c.nonempty.remove(s)
-	unlock(&c.lock)
-	mheap_.freeSpan(s)
-	return true
-}
-
 // grow allocates a new empty span from the heap and initializes it for c's size class.
 func (c *mcentral) grow() *mspan {
 	npages := uintptr(class_to_allocnpages[c.spanclass.sizeclass()])
diff --git a/src/runtime/mcheckmark.go b/src/runtime/mcheckmark.go
new file mode 100644
index 0000000000..1fd8e4e78f
--- /dev/null
+++ b/src/runtime/mcheckmark.go
@@ -0,0 +1,100 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// GC checkmarks
+//
+// In a concurrent garbage collector, one worries about failing to mark
+// a live object due to mutations without write barriers or bugs in the
+// collector implementation. As a sanity check, the GC has a 'checkmark'
+// mode that retraverses the object graph with the world stopped, to make
+// sure that everything that should be marked is marked.
+
+package runtime
+
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+// A checkmarksMap stores the GC marks in "checkmarks" mode. It is a
+// per-arena bitmap with a bit for every word in the arena. The mark
+// is stored on the bit corresponding to the first word of the marked
+// allocation.
+//
+//go:notinheap
+type checkmarksMap [heapArenaBytes / sys.PtrSize / 8]uint8
+
+// If useCheckmark is true, marking of an object uses the checkmark
+// bits instead of the standard mark bits.
+var useCheckmark = false
+
+// startCheckmarks prepares for the checkmarks phase.
+//
+// The world must be stopped.
+func startCheckmarks() {
+	// Clear all checkmarks.
+	for _, ai := range mheap_.allArenas {
+		arena := mheap_.arenas[ai.l1()][ai.l2()]
+		bitmap := arena.checkmarks
+
+		if bitmap == nil {
+			// Allocate bitmap on first use.
+			bitmap = (*checkmarksMap)(persistentalloc(unsafe.Sizeof(*bitmap), 0, &memstats.gc_sys))
+			if bitmap == nil {
+				throw("out of memory allocating checkmarks bitmap")
+			}
+			arena.checkmarks = bitmap
+		} else {
+			// Otherwise clear the existing bitmap.
+			for i := range bitmap {
+				bitmap[i] = 0
+			}
+		}
+	}
+	// Enable checkmarking.
+	useCheckmark = true
+}
+
+// endCheckmarks ends the checkmarks phase.
+func endCheckmarks() {
+	if gcMarkWorkAvailable(nil) {
+		throw("GC work not flushed")
+	}
+	useCheckmark = false
+}
+
+// setCheckmark throws if marking object is a checkmarks violation,
+// and otherwise sets obj's checkmark. It returns true if obj was
+// already checkmarked.
+func setCheckmark(obj, base, off uintptr, mbits markBits) bool {
+	if !mbits.isMarked() {
+		printlock()
+		print("runtime: checkmarks found unexpected unmarked object obj=", hex(obj), "\n")
+		print("runtime: found obj at *(", hex(base), "+", hex(off), ")\n")
+
+		// Dump the source (base) object
+		gcDumpObject("base", base, off)
+
+		// Dump the object
+		gcDumpObject("obj", obj, ^uintptr(0))
+
+		getg().m.traceback = 2
+		throw("checkmark found unmarked object")
+	}
+
+	ai := arenaIndex(obj)
+	arena := mheap_.arenas[ai.l1()][ai.l2()]
+	arenaWord := (obj / heapArenaBytes / 8) % uintptr(len(arena.checkmarks))
+	mask := byte(1 << ((obj / heapArenaBytes) % 8))
+	bytep := &arena.checkmarks[arenaWord]
+
+	if atomic.Load8(bytep)&mask != 0 {
+		// Already checkmarked.
+		return true
+	}
+
+	atomic.Or8(bytep, mask)
+	return false
+}
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index b3499516f6..bd87144355 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -1670,13 +1670,13 @@ func gcMarkTermination(nextTriggerRatio float64) {
 			// mark using checkmark bits, to check that we
 			// didn't forget to mark anything during the
 			// concurrent mark process.
+			startCheckmarks()
 			gcResetMarkState()
-			initCheckmarks()
 			gcw := &getg().m.p.ptr().gcw
 			gcDrain(gcw, 0)
 			wbBufFlush1(getg().m.p.ptr())
 			gcw.dispose()
-			clearCheckmarks()
+			endCheckmarks()
 		}
 
 		// marking is complete so we can turn the write barrier off
@@ -2149,21 +2149,13 @@ func gcSweep(mode gcMode) {
 	lock(&mheap_.lock)
 	mheap_.sweepgen += 2
 	mheap_.sweepdone = 0
-	if !go115NewMCentralImpl && mheap_.sweepSpans[mheap_.sweepgen/2%2].index != 0 {
-		// We should have drained this list during the last
-		// sweep phase. We certainly need to start this phase
-		// with an empty swept list.
-		throw("non-empty swept list")
-	}
 	mheap_.pagesSwept = 0
 	mheap_.sweepArenas = mheap_.allArenas
 	mheap_.reclaimIndex = 0
 	mheap_.reclaimCredit = 0
 	unlock(&mheap_.lock)
 
-	if go115NewMCentralImpl {
-		sweep.centralIndex.clear()
-	}
+	sweep.centralIndex.clear()
 
 	if !_ConcurrentSweep || mode == gcForceBlockMode {
 		// Special case synchronous sweep.
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index fe988c46d9..2b84945471 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -47,10 +47,6 @@ const (
 	// Must be a multiple of the pageInUse bitmap element size and
 	// must also evenly divide pagesPerArena.
 	pagesPerSpanRoot = 512
-
-	// go115NewMarkrootSpans is a feature flag that indicates whether
-	// to use the new bitmap-based markrootSpans implementation.
-	go115NewMarkrootSpans = true
 )
 
 // gcMarkRootPrepare queues root scanning jobs (stacks, globals, and
@@ -87,24 +83,16 @@ func gcMarkRootPrepare() {
 	//
 	// We depend on addfinalizer to mark objects that get
 	// finalizers after root marking.
-	if go115NewMarkrootSpans {
-		// We're going to scan the whole heap (that was available at the time the
-		// mark phase started, i.e. markArenas) for in-use spans which have specials.
-		//
-		// Break up the work into arenas, and further into chunks.
-		//
-		// Snapshot allArenas as markArenas. This snapshot is safe because allArenas
-		// is append-only.
-		mheap_.markArenas = mheap_.allArenas[:len(mheap_.allArenas):len(mheap_.allArenas)]
-		work.nSpanRoots = len(mheap_.markArenas) * (pagesPerArena / pagesPerSpanRoot)
-	} else {
-		// We're only interested in scanning the in-use spans,
-		// which will all be swept at this point. More spans
-		// may be added to this list during concurrent GC, but
-		// we only care about spans that were allocated before
-		// this mark phase.
-		work.nSpanRoots = mheap_.sweepSpans[mheap_.sweepgen/2%2].numBlocks()
-	}
+	//
+	// We're going to scan the whole heap (that was available at the time the
+	// mark phase started, i.e. markArenas) for in-use spans which have specials.
+	//
+	// Break up the work into arenas, and further into chunks.
+	//
+	// Snapshot allArenas as markArenas. This snapshot is safe because allArenas
+	// is append-only.
+	mheap_.markArenas = mheap_.allArenas[:len(mheap_.allArenas):len(mheap_.allArenas)]
+	work.nSpanRoots = len(mheap_.markArenas) * (pagesPerArena / pagesPerSpanRoot)
 
 	// Scan stacks.
 	//
@@ -316,10 +304,6 @@ func markrootFreeGStacks() {
 //
 //go:nowritebarrier
 func markrootSpans(gcw *gcWork, shard int) {
-	if !go115NewMarkrootSpans {
-		oldMarkrootSpans(gcw, shard)
-		return
-	}
 	// Objects with finalizers have two GC-related invariants:
 	//
 	// 1) Everything reachable from the object must be marked.
@@ -396,90 +380,6 @@ func markrootSpans(gcw *gcWork, shard int) {
 	}
 }
 
-// oldMarkrootSpans marks roots for one shard of work.spans.
-//
-// For go115NewMarkrootSpans = false.
-//
-//go:nowritebarrier
-func oldMarkrootSpans(gcw *gcWork, shard int) {
-	// Objects with finalizers have two GC-related invariants:
-	//
-	// 1) Everything reachable from the object must be marked.
-	// This ensures that when we pass the object to its finalizer,
-	// everything the finalizer can reach will be retained.
-	//
-	// 2) Finalizer specials (which are not in the garbage
-	// collected heap) are roots. In practice, this means the fn
-	// field must be scanned.
-	//
-	// TODO(austin): There are several ideas for making this more
-	// efficient in issue #11485.
-
-	sg := mheap_.sweepgen
-	spans := mheap_.sweepSpans[mheap_.sweepgen/2%2].block(shard)
-	// Note that work.spans may not include spans that were
-	// allocated between entering the scan phase and now. We may
-	// also race with spans being added into sweepSpans when they're
-	// just created, and as a result we may see nil pointers in the
-	// spans slice. This is okay because any objects with finalizers
-	// in those spans must have been allocated and given finalizers
-	// after we entered the scan phase, so addfinalizer will have
-	// ensured the above invariants for them.
-	for i := 0; i < len(spans); i++ {
-		// sweepBuf.block requires that we read pointers from the block atomically.
-		// It also requires that we ignore nil pointers.
-		s := (*mspan)(atomic.Loadp(unsafe.Pointer(&spans[i])))
-
-		// This is racing with spans being initialized, so
-		// check the state carefully.
-		if s == nil || s.state.get() != mSpanInUse {
-			continue
-		}
-		// Check that this span was swept (it may be cached or uncached).
-		if !useCheckmark && !(s.sweepgen == sg || s.sweepgen == sg+3) {
-			// sweepgen was updated (+2) during non-checkmark GC pass
-			print("sweep ", s.sweepgen, " ", sg, "\n")
-			throw("gc: unswept span")
-		}
-
-		// Speculatively check if there are any specials
-		// without acquiring the span lock. This may race with
-		// adding the first special to a span, but in that
-		// case addfinalizer will observe that the GC is
-		// active (which is globally synchronized) and ensure
-		// the above invariants. We may also ensure the
-		// invariants, but it's okay to scan an object twice.
-		if s.specials == nil {
-			continue
-		}
-
-		// Lock the specials to prevent a special from being
-		// removed from the list while we're traversing it.
-		lock(&s.speciallock)
-
-		for sp := s.specials; sp != nil; sp = sp.next {
-			if sp.kind != _KindSpecialFinalizer {
-				continue
-			}
-			// don't mark finalized object, but scan it so we
-			// retain everything it points to.
-			spf := (*specialfinalizer)(unsafe.Pointer(sp))
-			// A finalizer can be set for an inner byte of an object, find object beginning.
-			p := s.base() + uintptr(spf.special.offset)/s.elemsize*s.elemsize
-
-			// Mark everything that can be reached from
-			// the object (but *not* the object itself or
-			// we'll never collect it).
-			scanobject(p, gcw)
-
-			// The special itself is a root.
-			scanblock(uintptr(unsafe.Pointer(&spf.fn)), sys.PtrSize, &oneptrmask[0], gcw, nil)
-		}
-
-		unlock(&s.speciallock)
-	}
-}
-
 // gcAssistAlloc performs GC work to make gp's assist debt positive.
 // gp must be the calling user gorountine.
 //
@@ -1354,11 +1254,7 @@ func scanobject(b uintptr, gcw *gcWork) {
 		}
 		// Load bits once. See CL 22712 and issue 16973 for discussion.
 		bits := hbits.bits()
-		// During checkmarking, 1-word objects store the checkmark
-		// in the type bit for the one word. The only one-word objects
-		// are pointers, or else they'd be merged with other non-pointer
-		// data into larger allocations.
-		if i != 1*sys.PtrSize && bits&bitScan == 0 {
+		if bits&bitScan == 0 {
 			break // no more pointers in this object
 		}
 		if bits&bitPointer == 0 {
@@ -1511,28 +1407,10 @@ func greyobject(obj, base, off uintptr, span *mspan, gcw *gcWork, objIndex uintp
 	mbits := span.markBitsForIndex(objIndex)
 
 	if useCheckmark {
-		if !mbits.isMarked() {
-			printlock()
-			print("runtime:greyobject: checkmarks finds unexpected unmarked object obj=", hex(obj), "\n")
-			print("runtime: found obj at *(", hex(base), "+", hex(off), ")\n")
-
-			// Dump the source (base) object
-			gcDumpObject("base", base, off)
-
-			// Dump the object
-			gcDumpObject("obj", obj, ^uintptr(0))
-
-			getg().m.traceback = 2
-			throw("checkmark found unmarked object")
-		}
-		hbits := heapBitsForAddr(obj)
-		if hbits.isCheckmarked(span.elemsize) {
+		if setCheckmark(obj, base, off, mbits) {
+			// Already marked.
 			return
 		}
-		hbits.setCheckmarked(span.elemsize)
-		if !hbits.isCheckmarked(span.elemsize) {
-			throw("setCheckmarked and isCheckmarked disagree")
-		}
 	} else {
 		if debug.gccheckmark > 0 && span.isFree(objIndex) {
 			print("runtime: marking free object ", hex(obj), " found at *(", hex(base), "+", hex(off), ")\n")
@@ -1661,45 +1539,3 @@ func gcMarkTinyAllocs() {
 		greyobject(c.tiny, 0, 0, span, gcw, objIndex)
 	}
 }
-
-// Checkmarking
-
-// To help debug the concurrent GC we remark with the world
-// stopped ensuring that any object encountered has their normal
-// mark bit set. To do this we use an orthogonal bit
-// pattern to indicate the object is marked. The following pattern
-// uses the upper two bits in the object's boundary nibble.
-// 01: scalar  not marked
-// 10: pointer not marked
-// 11: pointer     marked
-// 00: scalar      marked
-// Xoring with 01 will flip the pattern from marked to unmarked and vica versa.
-// The higher bit is 1 for pointers and 0 for scalars, whether the object
-// is marked or not.
-// The first nibble no longer holds the typeDead pattern indicating that the
-// there are no more pointers in the object. This information is held
-// in the second nibble.
-
-// If useCheckmark is true, marking of an object uses the
-// checkmark bits (encoding above) instead of the standard
-// mark bits.
-var useCheckmark = false
-
-//go:nowritebarrier
-func initCheckmarks() {
-	useCheckmark = true
-	for _, s := range mheap_.allspans {
-		if s.state.get() == mSpanInUse {
-			heapBitsForAddr(s.base()).initCheckmarkSpan(s.layout())
-		}
-	}
-}
-
-func clearCheckmarks() {
-	useCheckmark = false
-	for _, s := range mheap_.allspans {
-		if s.state.get() == mSpanInUse {
-			heapBitsForAddr(s.base()).clearCheckmarkSpan(s.layout())
-		}
-	}
-}
diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go
index 3aa3afc028..6b8c56ce35 100644
--- a/src/runtime/mgcsweep.go
+++ b/src/runtime/mgcsweep.go
@@ -132,17 +132,15 @@ func finishsweep_m() {
 		sweep.npausesweep++
 	}
 
-	if go115NewMCentralImpl {
-		// Reset all the unswept buffers, which should be empty.
-		// Do this in sweep termination as opposed to mark termination
-		// so that we can catch unswept spans and reclaim blocks as
-		// soon as possible.
-		sg := mheap_.sweepgen
-		for i := range mheap_.central {
-			c := &mheap_.central[i].mcentral
-			c.partialUnswept(sg).reset()
-			c.fullUnswept(sg).reset()
-		}
+	// Reset all the unswept buffers, which should be empty.
+	// Do this in sweep termination as opposed to mark termination
+	// so that we can catch unswept spans and reclaim blocks as
+	// soon as possible.
+	sg := mheap_.sweepgen
+	for i := range mheap_.central {
+		c := &mheap_.central[i].mcentral
+		c.partialUnswept(sg).reset()
+		c.fullUnswept(sg).reset()
 	}
 
 	// Sweeping is done, so if the scavenger isn't already awake,
@@ -202,11 +200,7 @@ func sweepone() uintptr {
 	var s *mspan
 	sg := mheap_.sweepgen
 	for {
-		if go115NewMCentralImpl {
-			s = mheap_.nextSpanForSweep()
-		} else {
-			s = mheap_.sweepSpans[1-sg/2%2].pop()
-		}
+		s = mheap_.nextSpanForSweep()
 		if s == nil {
 			atomic.Store(&mheap_.sweepdone, 1)
 			break
@@ -322,9 +316,6 @@ func (s *mspan) ensureSwept() {
 // If preserve=true, don't return it to heap nor relink in mcentral lists;
 // caller takes care of it.
 func (s *mspan) sweep(preserve bool) bool {
-	if !go115NewMCentralImpl {
-		return s.oldSweep(preserve)
-	}
 	// It's critical that we enter this function with preemption disabled,
 	// GC must not start while we are in the middle of this function.
 	_g_ := getg()
@@ -568,214 +559,6 @@ func (s *mspan) sweep(preserve bool) bool {
 	return false
 }
 
-// Sweep frees or collects finalizers for blocks not marked in the mark phase.
-// It clears the mark bits in preparation for the next GC round.
-// Returns true if the span was returned to heap.
-// If preserve=true, don't return it to heap nor relink in mcentral lists;
-// caller takes care of it.
-//
-// For !go115NewMCentralImpl.
-func (s *mspan) oldSweep(preserve bool) bool {
-	// It's critical that we enter this function with preemption disabled,
-	// GC must not start while we are in the middle of this function.
-	_g_ := getg()
-	if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 {
-		throw("mspan.sweep: m is not locked")
-	}
-	sweepgen := mheap_.sweepgen
-	if state := s.state.get(); state != mSpanInUse || s.sweepgen != sweepgen-1 {
-		print("mspan.sweep: state=", state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
-		throw("mspan.sweep: bad span state")
-	}
-
-	if trace.enabled {
-		traceGCSweepSpan(s.npages * _PageSize)
-	}
-
-	atomic.Xadd64(&mheap_.pagesSwept, int64(s.npages))
-
-	spc := s.spanclass
-	size := s.elemsize
-	res := false
-
-	c := _g_.m.p.ptr().mcache
-	freeToHeap := false
-
-	// The allocBits indicate which unmarked objects don't need to be
-	// processed since they were free at the end of the last GC cycle
-	// and were not allocated since then.
-	// If the allocBits index is >= s.freeindex and the bit
-	// is not marked then the object remains unallocated
-	// since the last GC.
-	// This situation is analogous to being on a freelist.
-
-	// Unlink & free special records for any objects we're about to free.
-	// Two complications here:
-	// 1. An object can have both finalizer and profile special records.
-	//    In such case we need to queue finalizer for execution,
-	//    mark the object as live and preserve the profile special.
-	// 2. A tiny object can have several finalizers setup for different offsets.
-	//    If such object is not marked, we need to queue all finalizers at once.
-	// Both 1 and 2 are possible at the same time.
-	hadSpecials := s.specials != nil
-	specialp := &s.specials
-	special := *specialp
-	for special != nil {
-		// A finalizer can be set for an inner byte of an object, find object beginning.
-		objIndex := uintptr(special.offset) / size
-		p := s.base() + objIndex*size
-		mbits := s.markBitsForIndex(objIndex)
-		if !mbits.isMarked() {
-			// This object is not marked and has at least one special record.
-			// Pass 1: see if it has at least one finalizer.
-			hasFin := false
-			endOffset := p - s.base() + size
-			for tmp := special; tmp != nil && uintptr(tmp.offset) < endOffset; tmp = tmp.next {
-				if tmp.kind == _KindSpecialFinalizer {
-					// Stop freeing of object if it has a finalizer.
-					mbits.setMarkedNonAtomic()
-					hasFin = true
-					break
-				}
-			}
-			// Pass 2: queue all finalizers _or_ handle profile record.
-			for special != nil && uintptr(special.offset) < endOffset {
-				// Find the exact byte for which the special was setup
-				// (as opposed to object beginning).
-				p := s.base() + uintptr(special.offset)
-				if special.kind == _KindSpecialFinalizer || !hasFin {
-					// Splice out special record.
-					y := special
-					special = special.next
-					*specialp = special
-					freespecial(y, unsafe.Pointer(p), size)
-				} else {
-					// This is profile record, but the object has finalizers (so kept alive).
-					// Keep special record.
-					specialp = &special.next
-					special = *specialp
-				}
-			}
-		} else {
-			// object is still live: keep special record
-			specialp = &special.next
-			special = *specialp
-		}
-	}
-	if go115NewMarkrootSpans && hadSpecials && s.specials == nil {
-		spanHasNoSpecials(s)
-	}
-
-	if debug.allocfreetrace != 0 || debug.clobberfree != 0 || raceenabled || msanenabled {
-		// Find all newly freed objects. This doesn't have to
-		// efficient; allocfreetrace has massive overhead.
-		mbits := s.markBitsForBase()
-		abits := s.allocBitsForIndex(0)
-		for i := uintptr(0); i < s.nelems; i++ {
-			if !mbits.isMarked() && (abits.index < s.freeindex || abits.isMarked()) {
-				x := s.base() + i*s.elemsize
-				if debug.allocfreetrace != 0 {
-					tracefree(unsafe.Pointer(x), size)
-				}
-				if debug.clobberfree != 0 {
-					clobberfree(unsafe.Pointer(x), size)
-				}
-				if raceenabled {
-					racefree(unsafe.Pointer(x), size)
-				}
-				if msanenabled {
-					msanfree(unsafe.Pointer(x), size)
-				}
-			}
-			mbits.advance()
-			abits.advance()
-		}
-	}
-
-	// Count the number of free objects in this span.
-	nalloc := uint16(s.countAlloc())
-	if spc.sizeclass() == 0 && nalloc == 0 {
-		s.needzero = 1
-		freeToHeap = true
-	}
-	nfreed := s.allocCount - nalloc
-	if nalloc > s.allocCount {
-		print("runtime: nelems=", s.nelems, " nalloc=", nalloc, " previous allocCount=", s.allocCount, " nfreed=", nfreed, "\n")
-		throw("sweep increased allocation count")
-	}
-
-	s.allocCount = nalloc
-	wasempty := s.nextFreeIndex() == s.nelems
-	s.freeindex = 0 // reset allocation index to start of span.
-	if trace.enabled {
-		getg().m.p.ptr().traceReclaimed += uintptr(nfreed) * s.elemsize
-	}
-
-	// gcmarkBits becomes the allocBits.
-	// get a fresh cleared gcmarkBits in preparation for next GC
-	s.allocBits = s.gcmarkBits
-	s.gcmarkBits = newMarkBits(s.nelems)
-
-	// Initialize alloc bits cache.
-	s.refillAllocCache(0)
-
-	// We need to set s.sweepgen = h.sweepgen only when all blocks are swept,
-	// because of the potential for a concurrent free/SetFinalizer.
-	// But we need to set it before we make the span available for allocation
-	// (return it to heap or mcentral), because allocation code assumes that a
-	// span is already swept if available for allocation.
-	if freeToHeap || nfreed == 0 {
-		// The span must be in our exclusive ownership until we update sweepgen,
-		// check for potential races.
-		if state := s.state.get(); state != mSpanInUse || s.sweepgen != sweepgen-1 {
-			print("mspan.sweep: state=", state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
-			throw("mspan.sweep: bad span state after sweep")
-		}
-		// Serialization point.
-		// At this point the mark bits are cleared and allocation ready
-		// to go so release the span.
-		atomic.Store(&s.sweepgen, sweepgen)
-	}
-
-	if nfreed > 0 && spc.sizeclass() != 0 {
-		c.local_nsmallfree[spc.sizeclass()] += uintptr(nfreed)
-		res = mheap_.central[spc].mcentral.freeSpan(s, preserve, wasempty)
-		// mcentral.freeSpan updates sweepgen
-	} else if freeToHeap {
-		// Free large span to heap
-
-		// NOTE(rsc,dvyukov): The original implementation of efence
-		// in CL 22060046 used sysFree instead of sysFault, so that
-		// the operating system would eventually give the memory
-		// back to us again, so that an efence program could run
-		// longer without running out of memory. Unfortunately,
-		// calling sysFree here without any kind of adjustment of the
-		// heap data structures means that when the memory does
-		// come back to us, we have the wrong metadata for it, either in
-		// the mspan structures or in the garbage collection bitmap.
-		// Using sysFault here means that the program will run out of
-		// memory fairly quickly in efence mode, but at least it won't
-		// have mysterious crashes due to confused memory reuse.
-		// It should be possible to switch back to sysFree if we also
-		// implement and then call some kind of mheap.deleteSpan.
-		if debug.efence > 0 {
-			s.limit = 0 // prevent mlookup from finding this span
-			sysFault(unsafe.Pointer(s.base()), size)
-		} else {
-			mheap_.freeSpan(s)
-		}
-		c.local_nlargefree++
-		c.local_largefree += size
-		res = true
-	}
-	if !res {
-		// The span has been swept and is still in-use, so put
-		// it on the swept in-use list.
-		mheap_.sweepSpans[sweepgen/2%2].push(s)
-	}
-	return res
-}
-
 // reportZombies reports any marked but free objects in s and throws.
 //
 // This generally means one of the following:
diff --git a/src/runtime/mgcsweepbuf.go b/src/runtime/mgcsweepbuf.go
deleted file mode 100644
index 1f722c3d58..0000000000
--- a/src/runtime/mgcsweepbuf.go
+++ /dev/null
@@ -1,176 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import (
-	"internal/cpu"
-	"runtime/internal/atomic"
-	"runtime/internal/sys"
-	"unsafe"
-)
-
-// A gcSweepBuf is a set of *mspans.
-//
-// gcSweepBuf is safe for concurrent push operations *or* concurrent
-// pop operations, but not both simultaneously.
-type gcSweepBuf struct {
-	// A gcSweepBuf is a two-level data structure consisting of a
-	// growable spine that points to fixed-sized blocks. The spine
-	// can be accessed without locks, but adding a block or
-	// growing it requires taking the spine lock.
-	//
-	// Because each mspan covers at least 8K of heap and takes at
-	// most 8 bytes in the gcSweepBuf, the growth of the spine is
-	// quite limited.
-	//
-	// The spine and all blocks are allocated off-heap, which
-	// allows this to be used in the memory manager and avoids the
-	// need for write barriers on all of these. We never release
-	// this memory because there could be concurrent lock-free
-	// access and we're likely to reuse it anyway. (In principle,
-	// we could do this during STW.)
-
-	spineLock mutex
-	spine     unsafe.Pointer // *[N]*gcSweepBlock, accessed atomically
-	spineLen  uintptr        // Spine array length, accessed atomically
-	spineCap  uintptr        // Spine array cap, accessed under lock
-
-	// index is the first unused slot in the logical concatenation
-	// of all blocks. It is accessed atomically.
-	index uint32
-}
-
-const (
-	gcSweepBlockEntries    = 512 // 4KB on 64-bit
-	gcSweepBufInitSpineCap = 256 // Enough for 1GB heap on 64-bit
-)
-
-type gcSweepBlock struct {
-	spans [gcSweepBlockEntries]*mspan
-}
-
-// push adds span s to buffer b. push is safe to call concurrently
-// with other push operations, but NOT to call concurrently with pop.
-func (b *gcSweepBuf) push(s *mspan) {
-	// Obtain our slot.
-	cursor := uintptr(atomic.Xadd(&b.index, +1) - 1)
-	top, bottom := cursor/gcSweepBlockEntries, cursor%gcSweepBlockEntries
-
-	// Do we need to add a block?
-	spineLen := atomic.Loaduintptr(&b.spineLen)
-	var block *gcSweepBlock
-retry:
-	if top < spineLen {
-		spine := atomic.Loadp(unsafe.Pointer(&b.spine))
-		blockp := add(spine, sys.PtrSize*top)
-		block = (*gcSweepBlock)(atomic.Loadp(blockp))
-	} else {
-		// Add a new block to the spine, potentially growing
-		// the spine.
-		lock(&b.spineLock)
-		// spineLen cannot change until we release the lock,
-		// but may have changed while we were waiting.
-		spineLen = atomic.Loaduintptr(&b.spineLen)
-		if top < spineLen {
-			unlock(&b.spineLock)
-			goto retry
-		}
-
-		if spineLen == b.spineCap {
-			// Grow the spine.
-			newCap := b.spineCap * 2
-			if newCap == 0 {
-				newCap = gcSweepBufInitSpineCap
-			}
-			newSpine := persistentalloc(newCap*sys.PtrSize, cpu.CacheLineSize, &memstats.gc_sys)
-			if b.spineCap != 0 {
-				// Blocks are allocated off-heap, so
-				// no write barriers.
-				memmove(newSpine, b.spine, b.spineCap*sys.PtrSize)
-			}
-			// Spine is allocated off-heap, so no write barrier.
-			atomic.StorepNoWB(unsafe.Pointer(&b.spine), newSpine)
-			b.spineCap = newCap
-			// We can't immediately free the old spine
-			// since a concurrent push with a lower index
-			// could still be reading from it. We let it
-			// leak because even a 1TB heap would waste
-			// less than 2MB of memory on old spines. If
-			// this is a problem, we could free old spines
-			// during STW.
-		}
-
-		// Allocate a new block and add it to the spine.
-		block = (*gcSweepBlock)(persistentalloc(unsafe.Sizeof(gcSweepBlock{}), cpu.CacheLineSize, &memstats.gc_sys))
-		blockp := add(b.spine, sys.PtrSize*top)
-		// Blocks are allocated off-heap, so no write barrier.
-		atomic.StorepNoWB(blockp, unsafe.Pointer(block))
-		atomic.Storeuintptr(&b.spineLen, spineLen+1)
-		unlock(&b.spineLock)
-	}
-
-	// We have a block. Insert the span atomically, since there may be
-	// concurrent readers via the block API.
-	atomic.StorepNoWB(unsafe.Pointer(&block.spans[bottom]), unsafe.Pointer(s))
-}
-
-// pop removes and returns a span from buffer b, or nil if b is empty.
-// pop is safe to call concurrently with other pop operations, but NOT
-// to call concurrently with push.
-func (b *gcSweepBuf) pop() *mspan {
-	cursor := atomic.Xadd(&b.index, -1)
-	if int32(cursor) < 0 {
-		atomic.Xadd(&b.index, +1)
-		return nil
-	}
-
-	// There are no concurrent spine or block modifications during
-	// pop, so we can omit the atomics.
-	top, bottom := cursor/gcSweepBlockEntries, cursor%gcSweepBlockEntries
-	blockp := (**gcSweepBlock)(add(b.spine, sys.PtrSize*uintptr(top)))
-	block := *blockp
-	s := block.spans[bottom]
-	// Clear the pointer for block(i).
-	block.spans[bottom] = nil
-	return s
-}
-
-// numBlocks returns the number of blocks in buffer b. numBlocks is
-// safe to call concurrently with any other operation. Spans that have
-// been pushed prior to the call to numBlocks are guaranteed to appear
-// in some block in the range [0, numBlocks()), assuming there are no
-// intervening pops. Spans that are pushed after the call may also
-// appear in these blocks.
-func (b *gcSweepBuf) numBlocks() int {
-	return int(divRoundUp(uintptr(atomic.Load(&b.index)), gcSweepBlockEntries))
-}
-
-// block returns the spans in the i'th block of buffer b. block is
-// safe to call concurrently with push. The block may contain nil
-// pointers that must be ignored, and each entry in the block must be
-// loaded atomically.
-func (b *gcSweepBuf) block(i int) []*mspan {
-	// Perform bounds check before loading spine address since
-	// push ensures the allocated length is at least spineLen.
-	if i < 0 || uintptr(i) >= atomic.Loaduintptr(&b.spineLen) {
-		throw("block index out of range")
-	}
-
-	// Get block i.
-	spine := atomic.Loadp(unsafe.Pointer(&b.spine))
-	blockp := add(spine, sys.PtrSize*uintptr(i))
-	block := (*gcSweepBlock)(atomic.Loadp(blockp))
-
-	// Slice the block if necessary.
-	cursor := uintptr(atomic.Load(&b.index))
-	top, bottom := cursor/gcSweepBlockEntries, cursor%gcSweepBlockEntries
-	var spans []*mspan
-	if uintptr(i) < top {
-		spans = block.spans[:]
-	} else {
-		spans = block.spans[:bottom]
-	}
-	return spans
-}
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 2c7bfd8a59..1a57bcd66e 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -42,17 +42,8 @@ const (
 	// roughly 100µs.
 	//
 	// Must be a multiple of the pageInUse bitmap element size and
-	// must also evenly divid pagesPerArena.
+	// must also evenly divide pagesPerArena.
 	pagesPerReclaimerChunk = 512
-
-	// go115NewMCentralImpl is a feature flag for the new mcentral implementation.
-	//
-	// This flag depends on go115NewMarkrootSpans because the new mcentral
-	// implementation requires that markroot spans no longer rely on mgcsweepbufs.
-	// The definition of this flag helps ensure that if there's a problem with
-	// the new markroot spans implementation and it gets turned off, that the new
-	// mcentral implementation also gets turned off so the runtime isn't broken.
-	go115NewMCentralImpl = true && go115NewMarkrootSpans
 )
 
 // Main malloc heap.
@@ -85,19 +76,6 @@ type mheap struct {
 	// access (since that may free the backing store).
 	allspans []*mspan // all spans out there
 
-	// sweepSpans contains two mspan stacks: one of swept in-use
-	// spans, and one of unswept in-use spans. These two trade
-	// roles on each GC cycle. Since the sweepgen increases by 2
-	// on each cycle, this means the swept spans are in
-	// sweepSpans[sweepgen/2%2] and the unswept spans are in
-	// sweepSpans[1-sweepgen/2%2]. Sweeping pops spans from the
-	// unswept stack and pushes spans that are still in-use on the
-	// swept stack. Likewise, allocating an in-use span pushes it
-	// on the swept stack.
-	//
-	// For !go115NewMCentralImpl.
-	sweepSpans [2]gcSweepBuf
-
 	_ uint32 // align uint64 fields on 32-bit for atomics
 
 	// Proportional sweep
@@ -220,7 +198,7 @@ type mheap struct {
 		base, end uintptr
 	}
 
-	// _ uint32 // ensure 64-bit alignment of central
+	_ uint32 // ensure 64-bit alignment of central
 
 	// central free lists for small size classes.
 	// the padding makes sure that the mcentrals are
@@ -300,6 +278,10 @@ type heapArena struct {
 	// during marking.
 	pageSpecials [pagesPerArena / 8]uint8
 
+	// checkmarks stores the debug.gccheckmark state. It is only
+	// used if debug.gccheckmark > 0.
+	checkmarks *checkmarksMap
+
 	// zeroedBase marks the first byte of the first page in this
 	// arena which hasn't been used yet and is therefore already
 	// zero. zeroedBase is relative to the arena base.
@@ -715,8 +697,6 @@ func pageIndexOf(p uintptr) (arena *heapArena, pageIdx uintptr, pageMask uint8)
 // Initialize the heap.
 func (h *mheap) init() {
 	lockInit(&h.lock, lockRankMheap)
-	lockInit(&h.sweepSpans[0].spineLock, lockRankSpine)
-	lockInit(&h.sweepSpans[1].spineLock, lockRankSpine)
 	lockInit(&h.speciallock, lockRankMheapSpecial)
 
 	h.spanalloc.init(unsafe.Sizeof(mspan{}), recordspan, unsafe.Pointer(h), &memstats.mspan_sys)
@@ -1290,16 +1270,6 @@ HaveSpan:
 	h.setSpans(s.base(), npages, s)
 
 	if !manual {
-		if !go115NewMCentralImpl {
-			// Add to swept in-use list.
-			//
-			// This publishes the span to root marking.
-			//
-			// h.sweepgen is guaranteed to only change during STW,
-			// and preemption is disabled in the page allocator.
-			h.sweepSpans[h.sweepgen/2%2].push(s)
-		}
-
 		// Mark in-use span in arena page bitmap.
 		//
 		// This publishes the span to the page sweeper, so
@@ -1701,9 +1671,7 @@ func addspecial(p unsafe.Pointer, s *special) bool {
 	s.offset = uint16(offset)
 	s.next = *t
 	*t = s
-	if go115NewMarkrootSpans {
-		spanHasSpecials(span)
-	}
+	spanHasSpecials(span)
 	unlock(&span.speciallock)
 	releasem(mp)
 
@@ -1744,7 +1712,7 @@ func removespecial(p unsafe.Pointer, kind uint8) *special {
 		}
 		t = &s.next
 	}
-	if go115NewMarkrootSpans && span.specials == nil {
+	if span.specials == nil {
 		spanHasNoSpecials(span)
 	}
 	unlock(&span.speciallock)
diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go
index 1fe77663b9..44dea22ef3 100644
--- a/src/runtime/mkpreempt.go
+++ b/src/runtime/mkpreempt.go
@@ -131,7 +131,7 @@ func header(arch string) {
 
 func p(f string, args ...interface{}) {
 	fmted := fmt.Sprintf(f, args...)
-	fmt.Fprintf(out, "\t%s\n", strings.Replace(fmted, "\n", "\n\t", -1))
+	fmt.Fprintf(out, "\t%s\n", strings.ReplaceAll(fmted, "\n", "\n\t"))
 }
 
 func label(l string) {
diff --git a/src/runtime/mpallocbits.go b/src/runtime/mpallocbits.go
index a8011341bc..ff112300c3 100644
--- a/src/runtime/mpallocbits.go
+++ b/src/runtime/mpallocbits.go
@@ -120,84 +120,105 @@ func (b *pageBits) popcntRange(i, n uint) (s uint) {
 // sake of documentation, 0s are free pages and 1s are allocated pages.
 type pallocBits pageBits
 
-// consec8tab is a table containing the number of consecutive
-// zero bits for any uint8 value.
-//
-// The table is generated by calling consec8(i) for each
-// possible uint8 value, which is defined as:
-//
-// // consec8 counts the maximum number of consecutive 0 bits
-// // in a uint8.
-// func consec8(n uint8) int {
-// 	n = ^n
-// 	i := 0
-// 	for n != 0 {
-// 		n &= (n << 1)
-// 		i++
-// 	}
-// 	return i
-// }
-var consec8tab = [256]uint{
-	8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
-	4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-	5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2,
-	4, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2,
-	6, 5, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2,
-	4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
-	5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1,
-	4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
-	7, 6, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3,
-	4, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2,
-	5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1,
-	4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
-	6, 5, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2,
-	4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
-	5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1,
-	4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 0,
-}
-
 // summarize returns a packed summary of the bitmap in pallocBits.
 func (b *pallocBits) summarize() pallocSum {
-	// TODO(mknyszek): There may be something more clever to be done
-	// here to make the summarize operation more efficient. For example,
-	// we can compute start and end with 64-bit wide operations easily,
-	// but max is a bit more complex. Perhaps there exists some way to
-	// leverage the 64-bit start and end to our advantage?
-	var start, max, end uint
+	var start, max, cur uint
+	const notSetYet = ^uint(0) // sentinel for start value
+	start = notSetYet
 	for i := 0; i < len(b); i++ {
-		a := b[i]
-		for j := 0; j < 64; j += 8 {
-			k := uint8(a >> j)
+		x := b[i]
+		if x == 0 {
+			cur += 64
+			continue
+		}
+		t := uint(sys.TrailingZeros64(x))
+		l := uint(sys.LeadingZeros64(x))
 
-			// Compute start.
-			si := uint(sys.TrailingZeros8(k))
-			if start == uint(i*64+j) {
-				start += si
-			}
+		// Finish any region spanning the uint64s
+		cur += t
+		if start == notSetYet {
+			start = cur
+		}
+		if cur > max {
+			max = cur
+		}
+		// Final region that might span to next uint64
+		cur = l
+	}
+	if start == notSetYet {
+		// Made it all the way through without finding a single 1 bit.
+		const n = uint(64 * len(b))
+		return packPallocSum(n, n, n)
+	}
+	if cur > max {
+		max = cur
+	}
+	if max >= 64-2 {
+		// There is no way an internal run of zeros could beat max.
+		return packPallocSum(start, max, cur)
+	}
+	// Now look inside each uint64 for runs of zeros.
+	// All uint64s must be nonzero, or we would have aborted above.
+outer:
+	for i := 0; i < len(b); i++ {
+		x := b[i]
 
-			// Compute max.
-			if end+si > max {
-				max = end + si
-			}
-			if mi := consec8tab[k]; mi > max {
-				max = mi
+		// Look inside this uint64. We have a pattern like
+		// 000000 1xxxxx1 000000
+		// We need to look inside the 1xxxxx1 for any contiguous
+		// region of zeros.
+
+		// We already know the trailing zeros are no larger than max. Remove them.
+		x >>= sys.TrailingZeros64(x) & 63
+		if x&(x+1) == 0 { // no more zeros (except at the top).
+			continue
+		}
+
+		// Strategy: shrink all runs of zeros by max. If any runs of zero
+		// remain, then we've identified a larger maxiumum zero run.
+		p := max     // number of zeros we still need to shrink by.
+		k := uint(1) // current minimum length of runs of ones in x.
+		for {
+			// Shrink all runs of zeros by p places (except the top zeros).
+			for p > 0 {
+				if p <= k {
+					// Shift p ones down into the top of each run of zeros.
+					x |= x >> (p & 63)
+					if x&(x+1) == 0 { // no more zeros (except at the top).
+						continue outer
+					}
+					break
+				}
+				// Shift k ones down into the top of each run of zeros.
+				x |= x >> (k & 63)
+				if x&(x+1) == 0 { // no more zeros (except at the top).
+					continue outer
+				}
+				p -= k
+				// We've just doubled the minimum length of 1-runs.
+				// This allows us to shift farther in the next iteration.
+				k *= 2
 			}
 
-			// Compute end.
-			if k == 0 {
-				end += 8
-			} else {
-				end = uint(sys.LeadingZeros8(k))
+			// The length of the lowest-order zero run is an increment to our maximum.
+			j := uint(sys.TrailingZeros64(^x)) // count contiguous trailing ones
+			x >>= j & 63                       // remove trailing ones
+			j = uint(sys.TrailingZeros64(x))   // count contiguous trailing zeros
+			x >>= j & 63                       // remove zeros
+			max += j                           // we have a new maximum!
+			if x&(x+1) == 0 {                  // no more zeros (except at the top).
+				continue outer
 			}
+			p = j // remove j more zeros from each zero run.
 		}
 	}
-	return packPallocSum(start, max, end)
+	return packPallocSum(start, max, cur)
 }
 
 // find searches for npages contiguous free pages in pallocBits and returns
 // the index where that run starts, as well as the index of the first free page
 // it found in the search. searchIdx represents the first known free page and
-// where to begin the search from.
+// where to begin the next search from.
 //
 // If find fails to find any free space, it returns an index of ^uint(0) and
 // the new searchIdx should be ignored.
@@ -218,9 +239,10 @@ func (b *pallocBits) find(npages uintptr, searchIdx uint) (uint, uint) {
 //
 // See find for an explanation of the searchIdx parameter.
 func (b *pallocBits) find1(searchIdx uint) uint {
+	_ = b[0] // lift nil check out of loop
 	for i := searchIdx / 64; i < uint(len(b)); i++ {
 		x := b[i]
-		if x == ^uint64(0) {
+		if ^x == 0 {
 			continue
 		}
 		return i*64 + uint(sys.TrailingZeros64(^x))
@@ -242,18 +264,18 @@ func (b *pallocBits) findSmallN(npages uintptr, searchIdx uint) (uint, uint) {
 	end, newSearchIdx := uint(0), ^uint(0)
 	for i := searchIdx / 64; i < uint(len(b)); i++ {
 		bi := b[i]
-		if bi == ^uint64(0) {
+		if ^bi == 0 {
 			end = 0
 			continue
 		}
 		// First see if we can pack our allocation in the trailing
 		// zeros plus the end of the last 64 bits.
-		start := uint(sys.TrailingZeros64(bi))
 		if newSearchIdx == ^uint(0) {
 			// The new searchIdx is going to be at these 64 bits after any
 			// 1s we file, so count trailing 1s.
 			newSearchIdx = i*64 + uint(sys.TrailingZeros64(^bi))
 		}
+		start := uint(sys.TrailingZeros64(bi))
 		if end+start >= uint(npages) {
 			return i*64 - end, newSearchIdx
 		}
@@ -348,15 +370,33 @@ func (b *pallocBits) pages64(i uint) uint64 {
 // findBitRange64 returns the bit index of the first set of
 // n consecutive 1 bits. If no consecutive set of 1 bits of
 // size n may be found in c, then it returns an integer >= 64.
+// n must be > 0.
 func findBitRange64(c uint64, n uint) uint {
-	i := uint(0)
-	cont := uint(sys.TrailingZeros64(^c))
-	for cont < n && i < 64 {
-		i += cont
-		i += uint(sys.TrailingZeros64(c >> i))
-		cont = uint(sys.TrailingZeros64(^(c >> i)))
+	// This implementation is based on shrinking the length of
+	// runs of contiguous 1 bits. We remove the top n-1 1 bits
+	// from each run of 1s, then look for the first remaining 1 bit.
+	p := n - 1   // number of 1s we want to remove.
+	k := uint(1) // current minimum width of runs of 0 in c.
+	for p > 0 {
+		if p <= k {
+			// Shift p 0s down into the top of each run of 1s.
+			c &= c >> (p & 63)
+			break
+		}
+		// Shift k 0s down into the top of each run of 1s.
+		c &= c >> (k & 63)
+		if c == 0 {
+			return 64
+		}
+		p -= k
+		// We've just doubled the minimum length of 0-runs.
+		// This allows us to shift farther in the next iteration.
+		k *= 2
 	}
-	return i
+	// Find first remaining 1.
+	// Since we shrunk from the top down, the first 1 is in
+	// its correct original position.
+	return uint(sys.TrailingZeros64(c))
 }
 
 // pallocData encapsulates pallocBits and a bitmap for
diff --git a/src/runtime/mpallocbits_test.go b/src/runtime/mpallocbits_test.go
index 71a29f3b3a..5095e24220 100644
--- a/src/runtime/mpallocbits_test.go
+++ b/src/runtime/mpallocbits_test.go
@@ -101,7 +101,7 @@ func invertPallocBits(b *PallocBits) {
 
 // Ensures two packed summaries are identical, and reports a detailed description
 // of the difference if they're not.
-func checkPallocSum(t *testing.T, got, want PallocSum) {
+func checkPallocSum(t testing.TB, got, want PallocSum) {
 	if got.Start() != want.Start() {
 		t.Errorf("inconsistent start: got %d, want %d", got.Start(), want.Start())
 	}
@@ -297,17 +297,29 @@ func TestPallocBitsSummarize(t *testing.T) {
 
 // Benchmarks how quickly we can summarize a PallocBits.
 func BenchmarkPallocBitsSummarize(b *testing.B) {
-	buf0 := new(PallocBits)
-	buf1 := new(PallocBits)
-	for i := 0; i < len(buf1); i++ {
-		buf1[i] = ^uint64(0)
+	patterns := []uint64{
+		0,
+		^uint64(0),
+		0xaa,
+		0xaaaaaaaaaaaaaaaa,
+		0x80000000aaaaaaaa,
+		0xaaaaaaaa00000001,
+		0xbbbbbbbbbbbbbbbb,
+		0x80000000bbbbbbbb,
+		0xbbbbbbbb00000001,
+		0xcccccccccccccccc,
+		0x4444444444444444,
+		0x4040404040404040,
+		0x4000400040004000,
+		0x1000404044ccaaff,
 	}
-	bufa := new(PallocBits)
-	for i := 0; i < len(bufa); i++ {
-		bufa[i] = 0xaa
-	}
-	for _, buf := range []*PallocBits{buf0, buf1, bufa} {
-		b.Run(fmt.Sprintf("Unpacked%02X", buf[0]), func(b *testing.B) {
+	for _, p := range patterns {
+		buf := new(PallocBits)
+		for i := 0; i < len(buf); i++ {
+			buf[i] = p
+		}
+		b.Run(fmt.Sprintf("Unpacked%02X", p), func(b *testing.B) {
+			checkPallocSum(b, buf.Summarize(), SummarizeSlow(buf))
 			for i := 0; i < b.N; i++ {
 				buf.Summarize()
 			}
@@ -492,10 +504,9 @@ func TestFindBitRange64(t *testing.T) {
 			t.Errorf("case (%016x, %d): got %d, want %d", x, n, i, result)
 		}
 	}
-	for i := uint(0); i <= 64; i++ {
+	for i := uint(1); i <= 64; i++ {
 		check(^uint64(0), i, 0)
 	}
-	check(0, 0, 0)
 	for i := uint(1); i <= 64; i++ {
 		check(0, i, ^uint(0))
 	}
@@ -508,3 +519,33 @@ func TestFindBitRange64(t *testing.T) {
 	check(0xffff03ff0107ffff, 16, 0)
 	check(0x0fff03ff01079fff, 16, ^uint(0))
 }
+
+func BenchmarkFindBitRange64(b *testing.B) {
+	patterns := []uint64{
+		0,
+		^uint64(0),
+		0xaa,
+		0xaaaaaaaaaaaaaaaa,
+		0x80000000aaaaaaaa,
+		0xaaaaaaaa00000001,
+		0xbbbbbbbbbbbbbbbb,
+		0x80000000bbbbbbbb,
+		0xbbbbbbbb00000001,
+		0xcccccccccccccccc,
+		0x4444444444444444,
+		0x4040404040404040,
+		0x4000400040004000,
+	}
+	sizes := []uint{
+		2, 8, 32,
+	}
+	for _, pattern := range patterns {
+		for _, size := range sizes {
+			b.Run(fmt.Sprintf("Pattern%02XSize%d", pattern, size), func(b *testing.B) {
+				for i := 0; i < b.N; i++ {
+					FindBitRange64(pattern, size)
+				}
+			})
+		}
+	}
+}
diff --git a/src/runtime/os_linux.go b/src/runtime/os_linux.go
index 7b95ff2428..9702920bcf 100644
--- a/src/runtime/os_linux.go
+++ b/src/runtime/os_linux.go
@@ -249,6 +249,10 @@ func sysargs(argc int32, argv **byte) {
 	sysauxv(buf[:])
 }
 
+// startupRandomData holds random bytes initialized at startup. These come from
+// the ELF AT_RANDOM auxiliary vector.
+var startupRandomData []byte
+
 func sysauxv(auxv []uintptr) int {
 	var i int
 	for ; auxv[i] != _AT_NULL; i += 2 {
@@ -328,20 +332,11 @@ func libpreinit() {
 	initsig(true)
 }
 
-// gsignalInitQuirk, if non-nil, is called for every allocated gsignal G.
-//
-// TODO(austin): Remove this after Go 1.15 when we remove the
-// mlockGsignal workaround.
-var gsignalInitQuirk func(gsignal *g)
-
 // Called to initialize a new m (including the bootstrap m).
 // Called on the parent thread (main thread in case of bootstrap), can allocate memory.
 func mpreinit(mp *m) {
 	mp.gsignal = malg(32 * 1024) // Linux wants >= 2K
 	mp.gsignal.m = mp
-	if gsignalInitQuirk != nil {
-		gsignalInitQuirk(mp.gsignal)
-	}
 }
 
 func gettid() uint32
diff --git a/src/runtime/os_linux_x86.go b/src/runtime/os_linux_x86.go
index 97f870707d..d91fa1a0d1 100644
--- a/src/runtime/os_linux_x86.go
+++ b/src/runtime/os_linux_x86.go
@@ -7,120 +7,4 @@
 
 package runtime
 
-import (
-	"runtime/internal/atomic"
-	"unsafe"
-)
-
-//go:noescape
-func uname(utsname *new_utsname) int
-
-func mlock(addr, len uintptr) int
-
-func osArchInit() {
-	// Linux 5.2 introduced a bug that can corrupt vector
-	// registers on return from a signal if the signal stack isn't
-	// faulted in:
-	// https://bugzilla.kernel.org/show_bug.cgi?id=205663
-	//
-	// It was fixed in 5.3.15, 5.4.2, and all 5.5 and later
-	// kernels.
-	//
-	// If we're on an affected kernel, work around this issue by
-	// mlocking the top page of every signal stack. This doesn't
-	// help for signal stacks created in C, but there's not much
-	// we can do about that.
-	//
-	// TODO(austin): Remove this in Go 1.15, at which point it
-	// will be unlikely to encounter any of the affected kernels
-	// in the wild.
-
-	var uts new_utsname
-	if uname(&uts) < 0 {
-		throw("uname failed")
-	}
-	// Check for null terminator to ensure gostringnocopy doesn't
-	// walk off the end of the release string.
-	found := false
-	for _, b := range uts.release {
-		if b == 0 {
-			found = true
-			break
-		}
-	}
-	if !found {
-		return
-	}
-	rel := gostringnocopy(&uts.release[0])
-
-	major, minor, patch, ok := parseRelease(rel)
-	if !ok {
-		return
-	}
-
-	if major == 5 && minor == 4 && patch < 2 {
-		// All 5.4 versions of Ubuntu are patched.
-		procVersion := []byte("/proc/version\000")
-		f := open(&procVersion[0], _O_RDONLY, 0)
-		if f >= 0 {
-			var buf [512]byte
-			p := noescape(unsafe.Pointer(&buf[0]))
-			n := read(f, p, int32(len(buf)))
-			closefd(f)
-
-			needle := []byte("Ubuntu")
-		contains:
-			for i, c := range buf[:n] {
-				if c != needle[0] {
-					continue
-				}
-				if int(n)-i < len(needle) {
-					break
-				}
-				for j, c2 := range needle {
-					if c2 != buf[i+j] {
-						continue contains
-					}
-				}
-				// This is an Ubuntu system.
-				return
-			}
-		}
-	}
-
-	if major == 5 && (minor == 2 || minor == 3 && patch < 15 || minor == 4 && patch < 2) {
-		gsignalInitQuirk = mlockGsignal
-		if m0.gsignal != nil {
-			throw("gsignal quirk too late")
-		}
-		throwReportQuirk = throwBadKernel
-	}
-}
-
-func mlockGsignal(gsignal *g) {
-	if atomic.Load(&touchStackBeforeSignal) != 0 {
-		// mlock has already failed, don't try again.
-		return
-	}
-
-	// This mlock call may fail, but we don't report the failure.
-	// Instead, if something goes badly wrong, we rely on prepareSignalM
-	// and throwBadKernel to do further mitigation and to report a problem
-	// to the user if mitigation fails. This is because many
-	// systems have a limit on the total mlock size, and many kernels
-	// that appear to have bad versions are actually patched to avoid the
-	// bug described above. We want Go 1.14 to run on those systems.
-	// See #37436.
-	if errno := mlock(gsignal.stack.hi-physPageSize, physPageSize); errno < 0 {
-		atomic.Store(&touchStackBeforeSignal, uint32(-errno))
-	}
-}
-
-// throwBadKernel is called, via throwReportQuirk, by throw.
-func throwBadKernel() {
-	if errno := atomic.Load(&touchStackBeforeSignal); errno != 0 {
-		println("runtime: note: your Linux kernel may be buggy")
-		println("runtime: note: see https://golang.org/wiki/LinuxKernelSignalVectorBug")
-		println("runtime: note: mlock workaround for kernel bug failed with errno", errno)
-	}
-}
+func osArchInit() {}
diff --git a/src/runtime/os_netbsd.go b/src/runtime/os_netbsd.go
index 97106c7b9d..f7f90cedc1 100644
--- a/src/runtime/os_netbsd.go
+++ b/src/runtime/os_netbsd.go
@@ -95,18 +95,28 @@ var sigset_all = sigset{[4]uint32{^uint32(0), ^uint32(0), ^uint32(0), ^uint32(0)
 
 // From NetBSD's <sys/sysctl.h>
 const (
-	_CTL_HW      = 6
-	_HW_NCPU     = 3
-	_HW_PAGESIZE = 7
+	_CTL_HW        = 6
+	_HW_NCPU       = 3
+	_HW_PAGESIZE   = 7
+	_HW_NCPUONLINE = 16
 )
 
-func getncpu() int32 {
-	mib := [2]uint32{_CTL_HW, _HW_NCPU}
-	out := uint32(0)
+func sysctlInt(mib []uint32) (int32, bool) {
+	var out int32
 	nout := unsafe.Sizeof(out)
-	ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0)
-	if ret >= 0 {
-		return int32(out)
+	ret := sysctl(&mib[0], uint32(len(mib)), (*byte)(unsafe.Pointer(&out)), &nout, nil, 0)
+	if ret < 0 {
+		return 0, false
+	}
+	return out, true
+}
+
+func getncpu() int32 {
+	if n, ok := sysctlInt([]uint32{_CTL_HW, _HW_NCPUONLINE}); ok {
+		return int32(n)
+	}
+	if n, ok := sysctlInt([]uint32{_CTL_HW, _HW_NCPU}); ok {
+		return int32(n)
 	}
 	return 1
 }
diff --git a/src/runtime/os_plan9.go b/src/runtime/os_plan9.go
index 9e187d2220..128c30adeb 100644
--- a/src/runtime/os_plan9.go
+++ b/src/runtime/os_plan9.go
@@ -82,10 +82,10 @@ func sigpanic() {
 	note := gostringnocopy((*byte)(unsafe.Pointer(g.m.notesig)))
 	switch g.sig {
 	case _SIGRFAULT, _SIGWFAULT:
-		i := index(note, "addr=")
+		i := indexNoFloat(note, "addr=")
 		if i >= 0 {
 			i += 5
-		} else if i = index(note, "va="); i >= 0 {
+		} else if i = indexNoFloat(note, "va="); i >= 0 {
 			i += 3
 		} else {
 			panicmem()
@@ -111,6 +111,20 @@ func sigpanic() {
 	}
 }
 
+// indexNoFloat is bytealg.IndexString but safe to use in a note
+// handler.
+func indexNoFloat(s, t string) int {
+	if len(t) == 0 {
+		return 0
+	}
+	for i := 0; i < len(s); i++ {
+		if s[i] == t[0] && hasPrefix(s[i:], t) {
+			return i
+		}
+	}
+	return -1
+}
+
 func atolwhex(p string) int64 {
 	for hasPrefix(p, " ") || hasPrefix(p, "\t") {
 		p = p[1:]
diff --git a/src/runtime/os_windows.go b/src/runtime/os_windows.go
index a584ada702..a62e941229 100644
--- a/src/runtime/os_windows.go
+++ b/src/runtime/os_windows.go
@@ -1010,11 +1010,6 @@ func ctrlhandler1(_type uint32) uint32 {
 	if sigsend(s) {
 		return 1
 	}
-	if !islibrary && !isarchive {
-		// Only exit the program if we don't have a DLL.
-		// See https://golang.org/issues/35965.
-		exit(2) // SIGINT, SIGTERM, etc
-	}
 	return 0
 }
 
diff --git a/src/runtime/panic.go b/src/runtime/panic.go
index 615249f33c..127843b081 100644
--- a/src/runtime/panic.go
+++ b/src/runtime/panic.go
@@ -1283,12 +1283,6 @@ func startpanic_m() bool {
 	}
 }
 
-// throwReportQuirk, if non-nil, is called by throw after dumping the stacks.
-//
-// TODO(austin): Remove this after Go 1.15 when we remove the
-// mlockGsignal workaround.
-var throwReportQuirk func()
-
 var didothers bool
 var deadlock mutex
 
@@ -1335,10 +1329,6 @@ func dopanic_m(gp *g, pc, sp uintptr) bool {
 
 	printDebugLog()
 
-	if throwReportQuirk != nil {
-		throwReportQuirk()
-	}
-
 	return docrash
 }
 
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 035822216d..5e38b3194c 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -5,6 +5,7 @@
 package runtime
 
 import (
+	"internal/bytealg"
 	"internal/cpu"
 	"runtime/internal/atomic"
 	"runtime/internal/sys"
@@ -557,7 +558,6 @@ func schedinit() {
 
 	sched.maxmcount = 10000
 
-	tracebackinit()
 	moduledataverify()
 	stackinit()
 	mallocinit()
@@ -2575,15 +2575,20 @@ func injectglist(glist *gList) {
 		return
 	}
 
-	lock(&sched.lock)
-	npidle := int(sched.npidle)
+	npidle := int(atomic.Load(&sched.npidle))
+	var globq gQueue
 	var n int
 	for n = 0; n < npidle && !q.empty(); n++ {
-		globrunqput(q.pop())
+		g := q.pop()
+		globq.pushBack(g)
+	}
+	if n > 0 {
+		lock(&sched.lock)
+		globrunqputbatch(&globq, int32(n))
+		unlock(&sched.lock)
+		startIdle(n)
+		qsize -= n
 	}
-	unlock(&sched.lock)
-	startIdle(n)
-	qsize -= n
 
 	if !q.empty() {
 		runqputbatch(pp, &q, qsize)
@@ -5460,7 +5465,7 @@ func haveexperiment(name string) bool {
 	x := sys.Goexperiment
 	for x != "" {
 		xname := ""
-		i := index(x, ",")
+		i := bytealg.IndexByteString(x, ',')
 		if i < 0 {
 			xname, x = x, ""
 		} else {
diff --git a/src/runtime/rt0_freebsd_arm64.s b/src/runtime/rt0_freebsd_arm64.s
index 3a348c33e2..a938d98262 100644
--- a/src/runtime/rt0_freebsd_arm64.s
+++ b/src/runtime/rt0_freebsd_arm64.s
@@ -45,8 +45,7 @@ TEXT _rt0_arm64_freebsd_lib(SB),NOSPLIT,$184
 
 	// Create a new thread to do the runtime initialization and return.
 	MOVD	_cgo_sys_thread_create(SB), R4
-	CMP	$0, R4
-	BEQ	nocgo
+	CBZ	R4, nocgo
 	MOVD	$_rt0_arm64_freebsd_lib_go(SB), R0
 	MOVD	$0, R1
 	SUB	$16, RSP	// reserve 16 bytes for sp-8 where fp may be saved.
diff --git a/src/runtime/rt0_netbsd_arm64.s b/src/runtime/rt0_netbsd_arm64.s
index 75ecbe5176..2f3b5a5a87 100644
--- a/src/runtime/rt0_netbsd_arm64.s
+++ b/src/runtime/rt0_netbsd_arm64.s
@@ -44,8 +44,7 @@ TEXT _rt0_arm64_netbsd_lib(SB),NOSPLIT,$184
 
 	// Create a new thread to do the runtime initialization and return.
 	MOVD	_cgo_sys_thread_create(SB), R4
-	CMP	$0, R4
-	BEQ	nocgo
+	CBZ	R4, nocgo
 	MOVD	$_rt0_arm64_netbsd_lib_go(SB), R0
 	MOVD	$0, R1
 	SUB	$16, RSP		// reserve 16 bytes for sp-8 where fp may be saved.
diff --git a/src/runtime/rt0_openbsd_arm64.s b/src/runtime/rt0_openbsd_arm64.s
index 12408f2eec..722fab6129 100644
--- a/src/runtime/rt0_openbsd_arm64.s
+++ b/src/runtime/rt0_openbsd_arm64.s
@@ -50,8 +50,7 @@ TEXT _rt0_arm64_openbsd_lib(SB),NOSPLIT,$184
 
 	// Create a new thread to do the runtime initialization and return.
 	MOVD	_cgo_sys_thread_create(SB), R4
-	CMP	$0, R4
-	BEQ	nocgo
+	CBZ	R4, nocgo
 	MOVD	$_rt0_arm64_openbsd_lib_go(SB), R0
 	MOVD	$0, R1
 	SUB	$16, RSP		// reserve 16 bytes for sp-8 where fp may be saved.
diff --git a/src/runtime/runtime1.go b/src/runtime/runtime1.go
index c65a534ef6..7c893aa25c 100644
--- a/src/runtime/runtime1.go
+++ b/src/runtime/runtime1.go
@@ -5,6 +5,7 @@
 package runtime
 
 import (
+	"internal/bytealg"
 	"runtime/internal/atomic"
 	"runtime/internal/sys"
 	"unsafe"
@@ -347,13 +348,13 @@ func parsedebugvars() {
 
 	for p := gogetenv("GODEBUG"); p != ""; {
 		field := ""
-		i := index(p, ",")
+		i := bytealg.IndexByteString(p, ',')
 		if i < 0 {
 			field, p = p, ""
 		} else {
 			field, p = p[:i], p[i+1:]
 		}
-		i = index(field, "=")
+		i = bytealg.IndexByteString(field, '=')
 		if i < 0 {
 			continue
 		}
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index 755c409078..eba68da624 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -366,6 +366,12 @@ type sudog struct {
 	// g.selectDone must be CAS'd to win the wake-up race.
 	isSelect bool
 
+	// success indicates whether communication over channel c
+	// succeeded. It is true if the goroutine was awoken because a
+	// value was delivered over channel c, and false if awoken
+	// because c was closed.
+	success bool
+
 	parent   *sudog // semaRoot binary tree
 	waitlink *sudog // g.waiting list or semaRoot
 	waittail *sudog // semaRoot
@@ -846,10 +852,6 @@ type forcegcstate struct {
 	idle uint32
 }
 
-// startup_random_data holds random bytes initialized at startup. These come from
-// the ELF AT_RANDOM auxiliary vector (vdso_linux_amd64.go or os_linux_386.go).
-var startupRandomData []byte
-
 // extendRandom extends the random numbers in r[:n] to the whole slice r.
 // Treats n<0 as n==0.
 func extendRandom(r []byte, n int) {
diff --git a/src/runtime/select.go b/src/runtime/select.go
index a069e3e050..80768b285b 100644
--- a/src/runtime/select.go
+++ b/src/runtime/select.go
@@ -12,25 +12,12 @@ import (
 
 const debugSelect = false
 
-// scase.kind values.
-// Known to compiler.
-// Changes here must also be made in src/cmd/compile/internal/gc/select.go's walkselectcases.
-const (
-	caseNil = iota
-	caseRecv
-	caseSend
-	caseDefault
-)
-
 // Select case descriptor.
 // Known to compiler.
 // Changes here must also be made in src/cmd/internal/gc/select.go's scasetype.
 type scase struct {
-	c           *hchan         // chan
-	elem        unsafe.Pointer // data element
-	kind        uint16
-	pc          uintptr // race pc (for race detector / msan)
-	releasetime int64
+	c    *hchan         // chan
+	elem unsafe.Pointer // data element
 }
 
 var (
@@ -38,15 +25,15 @@ var (
 	chanrecvpc = funcPC(chanrecv)
 )
 
-func selectsetpc(cas *scase) {
-	cas.pc = getcallerpc()
+func selectsetpc(pc *uintptr) {
+	*pc = getcallerpc()
 }
 
 func sellock(scases []scase, lockorder []uint16) {
 	var c *hchan
 	for _, o := range lockorder {
 		c0 := scases[o].c
-		if c0 != nil && c0 != c {
+		if c0 != c {
 			c = c0
 			lock(&c.lock)
 		}
@@ -62,11 +49,8 @@ func selunlock(scases []scase, lockorder []uint16) {
 	// the G that calls select runnable again and schedules it for execution.
 	// When the G runs on another M, it locks all the locks and frees sel.
 	// Now if the first M touches sel, it will access freed memory.
-	for i := len(scases) - 1; i >= 0; i-- {
+	for i := len(lockorder) - 1; i >= 0; i-- {
 		c := scases[lockorder[i]].c
-		if c == nil {
-			break
-		}
 		if i > 0 && c == scases[lockorder[i-1]].c {
 			continue // will unlock it on the next iteration
 		}
@@ -112,11 +96,15 @@ func block() {
 // Both reside on the goroutine's stack (regardless of any escaping in
 // selectgo).
 //
+// For race detector builds, pc0 points to an array of type
+// [ncases]uintptr (also on the stack); for other builds, it's set to
+// nil.
+//
 // selectgo returns the index of the chosen scase, which matches the
 // ordinal position of its respective select{recv,send,default} call.
 // Also, if the chosen scase was a receive operation, it reports whether
 // a value was received.
-func selectgo(cas0 *scase, order0 *uint16, ncases int) (int, bool) {
+func selectgo(cas0 *scase, order0 *uint16, pc0 *uintptr, nsends, nrecvs int, block bool) (int, bool) {
 	if debugSelect {
 		print("select: cas0=", cas0, "\n")
 	}
@@ -126,25 +114,29 @@ func selectgo(cas0 *scase, order0 *uint16, ncases int) (int, bool) {
 	cas1 := (*[1 << 16]scase)(unsafe.Pointer(cas0))
 	order1 := (*[1 << 17]uint16)(unsafe.Pointer(order0))
 
+	ncases := nsends + nrecvs
 	scases := cas1[:ncases:ncases]
 	pollorder := order1[:ncases:ncases]
 	lockorder := order1[ncases:][:ncases:ncases]
 
-	// Replace send/receive cases involving nil channels with
-	// caseNil so logic below can assume non-nil channel.
-	for i := range scases {
-		cas := &scases[i]
-		if cas.c == nil && cas.kind != caseDefault {
-			*cas = scase{}
+	// Even when raceenabled is true, there might be select
+	// statements in packages compiled without -race (e.g.,
+	// ensureSigM in runtime/signal_unix.go).
+	var pcs []uintptr
+	if raceenabled && pc0 != nil {
+		pc1 := (*[1 << 16]uintptr)(unsafe.Pointer(pc0))
+		pcs = pc1[:ncases:ncases]
+	}
+	casePC := func(casi int) uintptr {
+		if pcs == nil {
+			return 0
 		}
+		return pcs[casi]
 	}
 
 	var t0 int64
 	if blockprofilerate > 0 {
 		t0 = cputicks()
-		for i := 0; i < ncases; i++ {
-			scases[i].releasetime = -1
-		}
 	}
 
 	// The compiler rewrites selects that statically have
@@ -156,15 +148,27 @@ func selectgo(cas0 *scase, order0 *uint16, ncases int) (int, bool) {
 	// optimizing (and needing to test).
 
 	// generate permuted order
-	for i := 1; i < ncases; i++ {
-		j := fastrandn(uint32(i + 1))
-		pollorder[i] = pollorder[j]
+	norder := 0
+	for i := range scases {
+		cas := &scases[i]
+
+		// Omit cases without channels from the poll and lock orders.
+		if cas.c == nil {
+			cas.elem = nil // allow GC
+			continue
+		}
+
+		j := fastrandn(uint32(norder + 1))
+		pollorder[norder] = pollorder[j]
 		pollorder[j] = uint16(i)
+		norder++
 	}
+	pollorder = pollorder[:norder]
+	lockorder = lockorder[:norder]
 
 	// sort the cases by Hchan address to get the locking order.
 	// simple heap sort, to guarantee n log n time and constant stack footprint.
-	for i := 0; i < ncases; i++ {
+	for i := range lockorder {
 		j := i
 		// Start with the pollorder to permute cases on the same channel.
 		c := scases[pollorder[i]].c
@@ -175,7 +179,7 @@ func selectgo(cas0 *scase, order0 *uint16, ncases int) (int, bool) {
 		}
 		lockorder[j] = pollorder[i]
 	}
-	for i := ncases - 1; i >= 0; i-- {
+	for i := len(lockorder) - 1; i >= 0; i-- {
 		o := lockorder[i]
 		c := scases[o].c
 		lockorder[i] = lockorder[0]
@@ -199,7 +203,7 @@ func selectgo(cas0 *scase, order0 *uint16, ncases int) (int, bool) {
 	}
 
 	if debugSelect {
-		for i := 0; i+1 < ncases; i++ {
+		for i := 0; i+1 < len(lockorder); i++ {
 			if scases[lockorder[i]].c.sortkey() > scases[lockorder[i+1]].c.sortkey() {
 				print("i=", i, " x=", lockorder[i], " y=", lockorder[i+1], "\n")
 				throw("select: broken sort")
@@ -221,23 +225,18 @@ func selectgo(cas0 *scase, order0 *uint16, ncases int) (int, bool) {
 		nextp  **sudog
 	)
 
-loop:
 	// pass 1 - look for something already waiting
-	var dfli int
-	var dfl *scase
 	var casi int
 	var cas *scase
+	var caseSuccess bool
+	var caseReleaseTime int64 = -1
 	var recvOK bool
-	for i := 0; i < ncases; i++ {
-		casi = int(pollorder[i])
+	for _, casei := range pollorder {
+		casi = int(casei)
 		cas = &scases[casi]
 		c = cas.c
 
-		switch cas.kind {
-		case caseNil:
-			continue
-
-		case caseRecv:
+		if casi >= nsends {
 			sg = c.sendq.dequeue()
 			if sg != nil {
 				goto recv
@@ -248,10 +247,9 @@ loop:
 			if c.closed != 0 {
 				goto rclose
 			}
-
-		case caseSend:
+		} else {
 			if raceenabled {
-				racereadpc(c.raceaddr(), cas.pc, chansendpc)
+				racereadpc(c.raceaddr(), casePC(casi), chansendpc)
 			}
 			if c.closed != 0 {
 				goto sclose
@@ -263,17 +261,12 @@ loop:
 			if c.qcount < c.dataqsiz {
 				goto bufsend
 			}
-
-		case caseDefault:
-			dfli = casi
-			dfl = cas
 		}
 	}
 
-	if dfl != nil {
+	if !block {
 		selunlock(scases, lockorder)
-		casi = dfli
-		cas = dfl
+		casi = -1
 		goto retc
 	}
 
@@ -286,9 +279,6 @@ loop:
 	for _, casei := range lockorder {
 		casi = int(casei)
 		cas = &scases[casi]
-		if cas.kind == caseNil {
-			continue
-		}
 		c = cas.c
 		sg := acquireSudog()
 		sg.g = gp
@@ -305,12 +295,10 @@ loop:
 		*nextp = sg
 		nextp = &sg.waitlink
 
-		switch cas.kind {
-		case caseRecv:
-			c.recvq.enqueue(sg)
-
-		case caseSend:
+		if casi < nsends {
 			c.sendq.enqueue(sg)
+		} else {
+			c.recvq.enqueue(sg)
 		}
 	}
 
@@ -331,6 +319,7 @@ loop:
 	// We singly-linked up the SudoGs in lock order.
 	casi = -1
 	cas = nil
+	caseSuccess = false
 	sglist = gp.waiting
 	// Clear all elem before unlinking from gp.waiting.
 	for sg1 := gp.waiting; sg1 != nil; sg1 = sg1.waitlink {
@@ -342,19 +331,17 @@ loop:
 
 	for _, casei := range lockorder {
 		k = &scases[casei]
-		if k.kind == caseNil {
-			continue
-		}
-		if sglist.releasetime > 0 {
-			k.releasetime = sglist.releasetime
-		}
 		if sg == sglist {
 			// sg has already been dequeued by the G that woke us up.
 			casi = int(casei)
 			cas = k
+			caseSuccess = sglist.success
+			if sglist.releasetime > 0 {
+				caseReleaseTime = sglist.releasetime
+			}
 		} else {
 			c = k.c
-			if k.kind == caseSend {
+			if int(casei) < nsends {
 				c.sendq.dequeueSudoG(sglist)
 			} else {
 				c.recvq.dequeueSudoG(sglist)
@@ -367,40 +354,35 @@ loop:
 	}
 
 	if cas == nil {
-		// We can wake up with gp.param == nil (so cas == nil)
-		// when a channel involved in the select has been closed.
-		// It is easiest to loop and re-run the operation;
-		// we'll see that it's now closed.
-		// Maybe some day we can signal the close explicitly,
-		// but we'd have to distinguish close-on-reader from close-on-writer.
-		// It's easiest not to duplicate the code and just recheck above.
-		// We know that something closed, and things never un-close,
-		// so we won't block again.
-		goto loop
+		throw("selectgo: bad wakeup")
 	}
 
 	c = cas.c
 
 	if debugSelect {
-		print("wait-return: cas0=", cas0, " c=", c, " cas=", cas, " kind=", cas.kind, "\n")
+		print("wait-return: cas0=", cas0, " c=", c, " cas=", cas, " send=", casi < nsends, "\n")
 	}
 
-	if cas.kind == caseRecv {
-		recvOK = true
+	if casi < nsends {
+		if !caseSuccess {
+			goto sclose
+		}
+	} else {
+		recvOK = caseSuccess
 	}
 
 	if raceenabled {
-		if cas.kind == caseRecv && cas.elem != nil {
-			raceWriteObjectPC(c.elemtype, cas.elem, cas.pc, chanrecvpc)
-		} else if cas.kind == caseSend {
-			raceReadObjectPC(c.elemtype, cas.elem, cas.pc, chansendpc)
+		if casi < nsends {
+			raceReadObjectPC(c.elemtype, cas.elem, casePC(casi), chansendpc)
+		} else if cas.elem != nil {
+			raceWriteObjectPC(c.elemtype, cas.elem, casePC(casi), chanrecvpc)
 		}
 	}
 	if msanenabled {
-		if cas.kind == caseRecv && cas.elem != nil {
-			msanwrite(cas.elem, c.elemtype.size)
-		} else if cas.kind == caseSend {
+		if casi < nsends {
 			msanread(cas.elem, c.elemtype.size)
+		} else if cas.elem != nil {
+			msanwrite(cas.elem, c.elemtype.size)
 		}
 	}
 
@@ -411,7 +393,7 @@ bufrecv:
 	// can receive from buffer
 	if raceenabled {
 		if cas.elem != nil {
-			raceWriteObjectPC(c.elemtype, cas.elem, cas.pc, chanrecvpc)
+			raceWriteObjectPC(c.elemtype, cas.elem, casePC(casi), chanrecvpc)
 		}
 		raceacquire(chanbuf(c, c.recvx))
 		racerelease(chanbuf(c, c.recvx))
@@ -438,7 +420,7 @@ bufsend:
 	if raceenabled {
 		raceacquire(chanbuf(c, c.sendx))
 		racerelease(chanbuf(c, c.sendx))
-		raceReadObjectPC(c.elemtype, cas.elem, cas.pc, chansendpc)
+		raceReadObjectPC(c.elemtype, cas.elem, casePC(casi), chansendpc)
 	}
 	if msanenabled {
 		msanread(cas.elem, c.elemtype.size)
@@ -476,7 +458,7 @@ rclose:
 send:
 	// can send to a sleeping receiver (sg)
 	if raceenabled {
-		raceReadObjectPC(c.elemtype, cas.elem, cas.pc, chansendpc)
+		raceReadObjectPC(c.elemtype, cas.elem, casePC(casi), chansendpc)
 	}
 	if msanenabled {
 		msanread(cas.elem, c.elemtype.size)
@@ -488,8 +470,8 @@ send:
 	goto retc
 
 retc:
-	if cas.releasetime > 0 {
-		blockevent(cas.releasetime-t0, 1)
+	if caseReleaseTime > 0 {
+		blockevent(caseReleaseTime-t0, 1)
 	}
 	return casi, recvOK
 
@@ -528,23 +510,57 @@ func reflect_rselect(cases []runtimeSelect) (int, bool) {
 		block()
 	}
 	sel := make([]scase, len(cases))
-	order := make([]uint16, 2*len(cases))
-	for i := range cases {
-		rc := &cases[i]
+	orig := make([]int, len(cases))
+	nsends, nrecvs := 0, 0
+	dflt := -1
+	for i, rc := range cases {
+		var j int
 		switch rc.dir {
 		case selectDefault:
-			sel[i] = scase{kind: caseDefault}
+			dflt = i
+			continue
 		case selectSend:
-			sel[i] = scase{kind: caseSend, c: rc.ch, elem: rc.val}
+			j = nsends
+			nsends++
 		case selectRecv:
-			sel[i] = scase{kind: caseRecv, c: rc.ch, elem: rc.val}
+			nrecvs++
+			j = len(cases) - nrecvs
 		}
-		if raceenabled || msanenabled {
-			selectsetpc(&sel[i])
+
+		sel[j] = scase{c: rc.ch, elem: rc.val}
+		orig[j] = i
+	}
+
+	// Only a default case.
+	if nsends+nrecvs == 0 {
+		return dflt, false
+	}
+
+	// Compact sel and orig if necessary.
+	if nsends+nrecvs < len(cases) {
+		copy(sel[nsends:], sel[len(cases)-nrecvs:])
+		copy(orig[nsends:], orig[len(cases)-nrecvs:])
+	}
+
+	order := make([]uint16, 2*(nsends+nrecvs))
+	var pc0 *uintptr
+	if raceenabled {
+		pcs := make([]uintptr, nsends+nrecvs)
+		for i := range pcs {
+			selectsetpc(&pcs[i])
 		}
+		pc0 = &pcs[0]
 	}
 
-	return selectgo(&sel[0], &order[0], len(cases))
+	chosen, recvOK := selectgo(&sel[0], &order[0], pc0, nsends, nrecvs, dflt == -1)
+
+	// Translate chosen back to caller's ordering.
+	if chosen < 0 {
+		chosen = dflt
+	} else {
+		chosen = orig[chosen]
+	}
+	return chosen, recvOK
 }
 
 func (q *waitq) dequeueSudoG(sgp *sudog) {
diff --git a/src/runtime/signal_unix.go b/src/runtime/signal_unix.go
index dd6d79f8ec..064a0ea100 100644
--- a/src/runtime/signal_unix.go
+++ b/src/runtime/signal_unix.go
@@ -272,6 +272,12 @@ func setProcessCPUProfiler(hz int32) {
 			atomic.Storeuintptr(&fwdSig[_SIGPROF], getsig(_SIGPROF))
 			setsig(_SIGPROF, funcPC(sighandler))
 		}
+
+		var it itimerval
+		it.it_interval.tv_sec = 0
+		it.it_interval.set_usec(1000000 / hz)
+		it.it_value = it.it_interval
+		setitimer(_ITIMER_PROF, &it, nil)
 	} else {
 		// If the Go signal handler should be disabled by default,
 		// switch back to the signal handler that was installed
@@ -296,23 +302,16 @@ func setProcessCPUProfiler(hz int32) {
 				setsig(_SIGPROF, h)
 			}
 		}
+
+		setitimer(_ITIMER_PROF, &itimerval{}, nil)
 	}
 }
 
 // setThreadCPUProfiler makes any thread-specific changes required to
 // implement profiling at a rate of hz.
+// No changes required on Unix systems.
 func setThreadCPUProfiler(hz int32) {
-	var it itimerval
-	if hz == 0 {
-		setitimer(_ITIMER_PROF, &it, nil)
-	} else {
-		it.it_interval.tv_sec = 0
-		it.it_interval.set_usec(1000000 / hz)
-		it.it_value = it.it_interval
-		setitimer(_ITIMER_PROF, &it, nil)
-	}
-	_g_ := getg()
-	_g_.m.profilehz = hz
+	getg().m.profilehz = hz
 }
 
 func sigpipe() {
@@ -616,7 +615,7 @@ func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) {
 		print("signal arrived during cgo execution\n")
 		gp = _g_.m.lockedg.ptr()
 	}
-	if sig == _SIGILL {
+	if sig == _SIGILL || sig == _SIGFPE {
 		// It would be nice to know how long the instruction is.
 		// Unfortunately, that's complicated to do in general (mostly for x86
 		// and s930x, but other archs have non-standard instruction lengths also).
diff --git a/src/runtime/slice_test.go b/src/runtime/slice_test.go
index e963a43dd3..cd2bc26d1e 100644
--- a/src/runtime/slice_test.go
+++ b/src/runtime/slice_test.go
@@ -1,6 +1,7 @@
 // Copyright 2011 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
+
 package runtime_test
 
 import (
diff --git a/src/runtime/string.go b/src/runtime/string.go
index 0515b56573..9a601f0094 100644
--- a/src/runtime/string.go
+++ b/src/runtime/string.go
@@ -335,22 +335,6 @@ func gostringn(p *byte, l int) string {
 	return s
 }
 
-func index(s, t string) int {
-	if len(t) == 0 {
-		return 0
-	}
-	for i := 0; i < len(s); i++ {
-		if s[i] == t[0] && hasPrefix(s[i:], t) {
-			return i
-		}
-	}
-	return -1
-}
-
-func contains(s, t string) bool {
-	return index(s, t) >= 0
-}
-
 func hasPrefix(s, prefix string) bool {
 	return len(s) >= len(prefix) && s[:len(prefix)] == prefix
 }
@@ -499,37 +483,3 @@ func gostringw(strw *uint16) string {
 	b[n2] = 0 // for luck
 	return s[:n2]
 }
-
-// parseRelease parses a dot-separated version number. It follows the
-// semver syntax, but allows the minor and patch versions to be
-// elided.
-func parseRelease(rel string) (major, minor, patch int, ok bool) {
-	// Strip anything after a dash or plus.
-	for i := 0; i < len(rel); i++ {
-		if rel[i] == '-' || rel[i] == '+' {
-			rel = rel[:i]
-			break
-		}
-	}
-
-	next := func() (int, bool) {
-		for i := 0; i < len(rel); i++ {
-			if rel[i] == '.' {
-				ver, ok := atoi(rel[:i])
-				rel = rel[i+1:]
-				return ver, ok
-			}
-		}
-		ver, ok := atoi(rel)
-		rel = ""
-		return ver, ok
-	}
-	if major, ok = next(); !ok || rel == "" {
-		return
-	}
-	if minor, ok = next(); !ok || rel == "" {
-		return
-	}
-	patch, ok = next()
-	return
-}
diff --git a/src/runtime/string_test.go b/src/runtime/string_test.go
index b9ac667533..4eda12c35d 100644
--- a/src/runtime/string_test.go
+++ b/src/runtime/string_test.go
@@ -454,34 +454,3 @@ func TestAtoi32(t *testing.T) {
 		}
 	}
 }
-
-type parseReleaseTest struct {
-	in                  string
-	major, minor, patch int
-}
-
-var parseReleaseTests = []parseReleaseTest{
-	{"", -1, -1, -1},
-	{"x", -1, -1, -1},
-	{"5", 5, 0, 0},
-	{"5.12", 5, 12, 0},
-	{"5.12-x", 5, 12, 0},
-	{"5.12.1", 5, 12, 1},
-	{"5.12.1-x", 5, 12, 1},
-	{"5.12.1.0", 5, 12, 1},
-	{"5.20496382327982653440", -1, -1, -1},
-}
-
-func TestParseRelease(t *testing.T) {
-	for _, test := range parseReleaseTests {
-		major, minor, patch, ok := runtime.ParseRelease(test.in)
-		if !ok {
-			major, minor, patch = -1, -1, -1
-		}
-		if test.major != major || test.minor != minor || test.patch != patch {
-			t.Errorf("parseRelease(%q) = (%v, %v, %v) want (%v, %v, %v)",
-				test.in, major, minor, patch,
-				test.major, test.minor, test.patch)
-		}
-	}
-}
diff --git a/src/runtime/sys_darwin.go b/src/runtime/sys_darwin.go
index 06474434c9..e4f19bbf41 100644
--- a/src/runtime/sys_darwin.go
+++ b/src/runtime/sys_darwin.go
@@ -489,9 +489,3 @@ func setNonblock(fd int32) {
 //go:cgo_import_dynamic libc_pthread_cond_wait pthread_cond_wait "/usr/lib/libSystem.B.dylib"
 //go:cgo_import_dynamic libc_pthread_cond_timedwait_relative_np pthread_cond_timedwait_relative_np "/usr/lib/libSystem.B.dylib"
 //go:cgo_import_dynamic libc_pthread_cond_signal pthread_cond_signal "/usr/lib/libSystem.B.dylib"
-
-// Magic incantation to get libSystem and friends actually dynamically linked.
-// TODO: Why does the code require this?  See cmd/link/internal/ld/go.go
-//go:cgo_import_dynamic _ _ "/usr/lib/libSystem.B.dylib"
-//go:cgo_import_dynamic _ _ "/System/Library/Frameworks/Security.framework/Versions/A/Security"
-//go:cgo_import_dynamic _ _ "/System/Library/Frameworks/CoreFoundation.framework/Versions/A/CoreFoundation"
diff --git a/src/runtime/sys_linux_386.s b/src/runtime/sys_linux_386.s
index 5b9b638ad7..1e3a834812 100644
--- a/src/runtime/sys_linux_386.s
+++ b/src/runtime/sys_linux_386.s
@@ -39,8 +39,6 @@
 #define SYS_socketcall		102
 #define SYS_setittimer		104
 #define SYS_clone		120
-#define SYS_uname		122
-#define SYS_mlock		150
 #define SYS_sched_yield 	158
 #define SYS_nanosleep		162
 #define SYS_rt_sigreturn	173
@@ -808,20 +806,3 @@ TEXT runtime·sbrk0(SB),NOSPLIT,$0-4
 	INVOKE_SYSCALL
 	MOVL	AX, ret+0(FP)
 	RET
-
-// func uname(utsname *new_utsname) int
-TEXT ·uname(SB),NOSPLIT,$0-8
-	MOVL    $SYS_uname, AX
-	MOVL    utsname+0(FP), BX
-	INVOKE_SYSCALL
-	MOVL	AX, ret+4(FP)
-	RET
-
-// func mlock(addr, len uintptr) int
-TEXT ·mlock(SB),NOSPLIT,$0-12
-	MOVL    $SYS_mlock, AX
-	MOVL    addr+0(FP), BX
-	MOVL    len+4(FP), CX
-	INVOKE_SYSCALL
-	MOVL	AX, ret+8(FP)
-	RET
diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s
index fe9c6bce85..8d90813589 100644
--- a/src/runtime/sys_linux_amd64.s
+++ b/src/runtime/sys_linux_amd64.s
@@ -33,10 +33,8 @@
 #define SYS_clone		56
 #define SYS_exit		60
 #define SYS_kill		62
-#define SYS_uname		63
 #define SYS_fcntl		72
 #define SYS_sigaltstack 	131
-#define SYS_mlock		149
 #define SYS_arch_prctl		158
 #define SYS_gettid		186
 #define SYS_futex		202
@@ -214,7 +212,7 @@ TEXT runtime·walltime1(SB),NOSPLIT,$16-12
 	// due to stack probes inserted to avoid stack/heap collisions.
 	// See issue #20427.
 
-	MOVQ	SP, BP	// Save old SP; BP unchanged by C code.
+	MOVQ	SP, R12	// Save old SP; R12 unchanged by C code.
 
 	get_tls(CX)
 	MOVQ	g(CX), AX
@@ -252,7 +250,7 @@ noswitch:
 	MOVQ	0(SP), AX	// sec
 	MOVQ	8(SP), DX	// nsec
 ret:
-	MOVQ	BP, SP		// Restore real SP
+	MOVQ	R12, SP		// Restore real SP
 	// Restore vdsoPC, vdsoSP
 	// We don't worry about being signaled between the two stores.
 	// If we are not in a signal handler, we'll restore vdsoSP to 0,
@@ -279,7 +277,7 @@ fallback:
 TEXT runtime·nanotime1(SB),NOSPLIT,$16-8
 	// Switch to g0 stack. See comment above in runtime·walltime.
 
-	MOVQ	SP, BP	// Save old SP; BP unchanged by C code.
+	MOVQ	SP, R12	// Save old SP; R12 unchanged by C code.
 
 	get_tls(CX)
 	MOVQ	g(CX), AX
@@ -317,7 +315,7 @@ noswitch:
 	MOVQ	0(SP), AX	// sec
 	MOVQ	8(SP), DX	// nsec
 ret:
-	MOVQ	BP, SP		// Restore real SP
+	MOVQ	R12, SP		// Restore real SP
 	// Restore vdsoPC, vdsoSP
 	// We don't worry about being signaled between the two stores.
 	// If we are not in a signal handler, we'll restore vdsoSP to 0,
@@ -594,13 +592,25 @@ TEXT runtime·clone(SB),NOSPLIT,$0
 	MOVQ	stk+8(FP), SI
 	MOVQ	$0, DX
 	MOVQ	$0, R10
-
+	MOVQ    $0, R8
 	// Copy mp, gp, fn off parent stack for use by child.
 	// Careful: Linux system call clobbers CX and R11.
-	MOVQ	mp+16(FP), R8
+	MOVQ	mp+16(FP), R13
 	MOVQ	gp+24(FP), R9
 	MOVQ	fn+32(FP), R12
-
+	CMPQ	R13, $0    // m
+	JEQ	nog1
+	CMPQ	R9, $0    // g
+	JEQ	nog1
+	LEAQ	m_tls(R13), R8
+#ifdef GOOS_android
+	// Android stores the TLS offset in runtime·tls_g.
+	SUBQ	runtime·tls_g(SB), R8
+#else
+	ADDQ	$8, R8	// ELF wants to use -8(FS)
+#endif
+	ORQ 	$0x00080000, DI //add flag CLONE_SETTLS(0x00080000) to call clone
+nog1:
 	MOVL	$SYS_clone, AX
 	SYSCALL
 
@@ -614,27 +624,23 @@ TEXT runtime·clone(SB),NOSPLIT,$0
 	MOVQ	SI, SP
 
 	// If g or m are nil, skip Go-related setup.
-	CMPQ	R8, $0    // m
-	JEQ	nog
+	CMPQ	R13, $0    // m
+	JEQ	nog2
 	CMPQ	R9, $0    // g
-	JEQ	nog
+	JEQ	nog2
 
 	// Initialize m->procid to Linux tid
 	MOVL	$SYS_gettid, AX
 	SYSCALL
-	MOVQ	AX, m_procid(R8)
-
-	// Set FS to point at m->tls.
-	LEAQ	m_tls(R8), DI
-	CALL	runtime·settls(SB)
+	MOVQ	AX, m_procid(R13)
 
 	// In child, set up new stack
 	get_tls(CX)
-	MOVQ	R8, g_m(R9)
+	MOVQ	R13, g_m(R9)
 	MOVQ	R9, g(CX)
 	CALL	runtime·stackcheck(SB)
 
-nog:
+nog2:
 	// Call fn
 	CALL	R12
 
@@ -789,20 +795,3 @@ TEXT runtime·sbrk0(SB),NOSPLIT,$0-8
 	SYSCALL
 	MOVQ	AX, ret+0(FP)
 	RET
-
-// func uname(utsname *new_utsname) int
-TEXT ·uname(SB),NOSPLIT,$0-16
-	MOVQ    utsname+0(FP), DI
-	MOVL    $SYS_uname, AX
-	SYSCALL
-	MOVQ	AX, ret+8(FP)
-	RET
-
-// func mlock(addr, len uintptr) int
-TEXT ·mlock(SB),NOSPLIT,$0-24
-	MOVQ    addr+0(FP), DI
-	MOVQ    len+8(FP), SI
-	MOVL    $SYS_mlock, AX
-	SYSCALL
-	MOVQ	AX, ret+16(FP)
-	RET
diff --git a/src/runtime/testdata/testprog/checkptr.go b/src/runtime/testdata/testprog/checkptr.go
index 45e6fb1aa5..e0a2794f4c 100644
--- a/src/runtime/testdata/testprog/checkptr.go
+++ b/src/runtime/testdata/testprog/checkptr.go
@@ -10,6 +10,7 @@ func init() {
 	register("CheckPtrAlignmentNoPtr", CheckPtrAlignmentNoPtr)
 	register("CheckPtrAlignmentPtr", CheckPtrAlignmentPtr)
 	register("CheckPtrArithmetic", CheckPtrArithmetic)
+	register("CheckPtrArithmetic2", CheckPtrArithmetic2)
 	register("CheckPtrSize", CheckPtrSize)
 	register("CheckPtrSmall", CheckPtrSmall)
 }
@@ -32,6 +33,13 @@ func CheckPtrArithmetic() {
 	sink2 = (*int)(unsafe.Pointer(i))
 }
 
+func CheckPtrArithmetic2() {
+	var x [2]int64
+	p := unsafe.Pointer(&x[1])
+	var one uintptr = 1
+	sink2 = unsafe.Pointer(uintptr(p) & ^one)
+}
+
 func CheckPtrSize() {
 	p := new(int64)
 	sink2 = p
diff --git a/src/runtime/time.go b/src/runtime/time.go
index fdb5066b24..f895bf8443 100644
--- a/src/runtime/time.go
+++ b/src/runtime/time.go
@@ -403,7 +403,7 @@ func dodeltimer0(pp *p) {
 }
 
 // modtimer modifies an existing timer.
-// This is called by the netpoll code or time.Ticker.Reset.
+// This is called by the netpoll code or time.Ticker.Reset or time.Timer.Reset.
 // Reports whether the timer was modified before it was run.
 func modtimer(t *timer, when, period int64, f func(interface{}, uintptr), arg interface{}, seq uintptr) bool {
 	if when < 0 {
diff --git a/src/runtime/trace/trace_stack_test.go b/src/runtime/trace/trace_stack_test.go
index cfc0419b72..be3adc9801 100644
--- a/src/runtime/trace/trace_stack_test.go
+++ b/src/runtime/trace/trace_stack_test.go
@@ -252,7 +252,7 @@ func TestTraceSymbolize(t *testing.T) {
 			{trace.EvGoSysCall, []frame{
 				{"syscall.read", 0},
 				{"syscall.Read", 0},
-				{"internal/poll.ignoringEINTR", 0},
+				{"internal/poll.ignoringEINTRIO", 0},
 				{"internal/poll.(*FD).Read", 0},
 				{"os.(*File).read", 0},
 				{"os.(*File).Read", 0},
diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go
index 944c8473d2..7850eceafa 100644
--- a/src/runtime/traceback.go
+++ b/src/runtime/traceback.go
@@ -5,6 +5,7 @@
 package runtime
 
 import (
+	"internal/bytealg"
 	"runtime/internal/atomic"
 	"runtime/internal/sys"
 	"unsafe"
@@ -35,16 +36,6 @@ import (
 
 const usesLR = sys.MinFrameSize > 0
 
-var skipPC uintptr
-
-func tracebackinit() {
-	// Go variable initialization happens late during runtime startup.
-	// Instead of initializing the variables above in the declarations,
-	// schedinit calls this function so that the variables are
-	// initialized and available earlier in the startup sequence.
-	skipPC = funcPC(skipPleaseUseCallersFrames)
-}
-
 // Traceback over the deferred function calls.
 // Report them like calls that have been invoked but not started executing yet.
 func tracebackdefers(gp *g, callback func(*stkframe, unsafe.Pointer) bool, v unsafe.Pointer) {
@@ -82,9 +73,6 @@ func tracebackdefers(gp *g, callback func(*stkframe, unsafe.Pointer) bool, v uns
 
 const sizeofSkipFunction = 256
 
-// This function is defined in asm.s to be sizeofSkipFunction bytes long.
-func skipPleaseUseCallersFrames()
-
 // Generic traceback. Handles runtime stack prints (pcbuf == nil),
 // the runtime.Callers function (pcbuf != nil), as well as the garbage
 // collector (callback != nil).  A little clunky to merge these, but avoids
@@ -848,7 +836,7 @@ func showfuncinfo(f funcInfo, firstFrame bool, funcID, childID funcID) bool {
 		return true
 	}
 
-	return contains(name, ".") && (!hasPrefix(name, "runtime.") || isExportedRuntime(name))
+	return bytealg.IndexByteString(name, '.') >= 0 && (!hasPrefix(name, "runtime.") || isExportedRuntime(name))
 }
 
 // isExportedRuntime reports whether name is an exported runtime function.