aboutsummaryrefslogtreecommitdiff
path: root/src/runtime
diff options
context:
space:
mode:
Diffstat (limited to 'src/runtime')
-rw-r--r--src/runtime/arch1_386.go15
-rw-r--r--src/runtime/arch1_amd64.go15
-rw-r--r--src/runtime/arch1_amd64p32.go15
-rw-r--r--src/runtime/arch1_arm.go15
-rw-r--r--src/runtime/arch1_arm64.go15
-rw-r--r--src/runtime/arch1_ppc64.go15
-rw-r--r--src/runtime/arch1_ppc64le.go15
-rw-r--r--src/runtime/asm_amd64.s4
-rw-r--r--src/runtime/atomic_pointer.go18
-rw-r--r--src/runtime/debug.go11
-rw-r--r--src/runtime/export_test.go49
-rw-r--r--src/runtime/extern.go12
-rw-r--r--src/runtime/gc_test.go192
-rw-r--r--src/runtime/gcinfo_test.go74
-rw-r--r--src/runtime/hashmap.go60
-rw-r--r--src/runtime/heapdump.go18
-rw-r--r--src/runtime/lfstack_test.go4
-rw-r--r--src/runtime/malloc.go10
-rw-r--r--src/runtime/mbarrier.go298
-rw-r--r--src/runtime/mbitmap.go1733
-rw-r--r--src/runtime/mgc.go59
-rw-r--r--src/runtime/mgcmark.go43
-rw-r--r--src/runtime/mgcwork.go9
-rw-r--r--src/runtime/mheap.go28
-rw-r--r--src/runtime/mprof.go18
-rw-r--r--src/runtime/mstats.go15
-rw-r--r--src/runtime/os1_darwin.go26
-rw-r--r--src/runtime/os1_dragonfly.go28
-rw-r--r--src/runtime/os1_freebsd.go28
-rw-r--r--src/runtime/os1_linux.go28
-rw-r--r--src/runtime/os1_nacl.go3
-rw-r--r--src/runtime/os1_netbsd.go29
-rw-r--r--src/runtime/os1_openbsd.go25
-rw-r--r--src/runtime/os1_plan9.go5
-rw-r--r--src/runtime/os1_windows.go3
-rw-r--r--src/runtime/os3_solaris.go29
-rw-r--r--src/runtime/panic.go16
-rw-r--r--src/runtime/pprof/pprof.go50
-rw-r--r--src/runtime/proc.go2
-rw-r--r--src/runtime/proc1.go294
-rw-r--r--src/runtime/proc_test.go5
-rw-r--r--src/runtime/runtime-gdb_test.go4
-rw-r--r--src/runtime/runtime2.go29
-rw-r--r--src/runtime/runtime_test.go52
-rw-r--r--src/runtime/signal1_unix.go92
-rw-r--r--src/runtime/signal_darwin.go14
-rw-r--r--src/runtime/signal_linux.go16
-rw-r--r--src/runtime/signal_netbsd.go14
-rw-r--r--src/runtime/signal_solaris.go14
-rw-r--r--src/runtime/signal_windows.go4
-rw-r--r--src/runtime/sigqueue_plan9.go23
-rw-r--r--src/runtime/slice.go9
-rw-r--r--src/runtime/stack1.go18
-rw-r--r--src/runtime/symtab.go29
-rw-r--r--src/runtime/trace.go34
-rw-r--r--src/runtime/traceback.go21
-rw-r--r--src/runtime/type.go15
57 files changed, 2294 insertions, 1425 deletions
diff --git a/src/runtime/arch1_386.go b/src/runtime/arch1_386.go
index b024d7a51f..d41696a6d6 100644
--- a/src/runtime/arch1_386.go
+++ b/src/runtime/arch1_386.go
@@ -5,12 +5,11 @@
package runtime
const (
- thechar = '8'
- _BigEndian = 0
- _CacheLineSize = 64
- _RuntimeGogoBytes = 64
- _PhysPageSize = goos_nacl*65536 + (1-goos_nacl)*4096 // 4k normally; 64k on NaCl
- _PCQuantum = 1
- _Int64Align = 4
- hugePageSize = 1 << 21
+ thechar = '8'
+ _BigEndian = 0
+ _CacheLineSize = 64
+ _PhysPageSize = goos_nacl*65536 + (1-goos_nacl)*4096 // 4k normally; 64k on NaCl
+ _PCQuantum = 1
+ _Int64Align = 4
+ hugePageSize = 1 << 21
)
diff --git a/src/runtime/arch1_amd64.go b/src/runtime/arch1_amd64.go
index 932b2b7c55..15f4cc65fe 100644
--- a/src/runtime/arch1_amd64.go
+++ b/src/runtime/arch1_amd64.go
@@ -5,12 +5,11 @@
package runtime
const (
- thechar = '6'
- _BigEndian = 0
- _CacheLineSize = 64
- _RuntimeGogoBytes = 80 + (goos_solaris)*16
- _PhysPageSize = 4096
- _PCQuantum = 1
- _Int64Align = 8
- hugePageSize = 1 << 21
+ thechar = '6'
+ _BigEndian = 0
+ _CacheLineSize = 64
+ _PhysPageSize = 4096
+ _PCQuantum = 1
+ _Int64Align = 8
+ hugePageSize = 1 << 21
)
diff --git a/src/runtime/arch1_amd64p32.go b/src/runtime/arch1_amd64p32.go
index 79421e848a..3c5456f933 100644
--- a/src/runtime/arch1_amd64p32.go
+++ b/src/runtime/arch1_amd64p32.go
@@ -5,12 +5,11 @@
package runtime
const (
- thechar = '6'
- _BigEndian = 0
- _CacheLineSize = 64
- _RuntimeGogoBytes = 64
- _PhysPageSize = 65536*goos_nacl + 4096*(1-goos_nacl)
- _PCQuantum = 1
- _Int64Align = 8
- hugePageSize = 1 << 21
+ thechar = '6'
+ _BigEndian = 0
+ _CacheLineSize = 64
+ _PhysPageSize = 65536*goos_nacl + 4096*(1-goos_nacl)
+ _PCQuantum = 1
+ _Int64Align = 8
+ hugePageSize = 1 << 21
)
diff --git a/src/runtime/arch1_arm.go b/src/runtime/arch1_arm.go
index c3fe4f0cb3..0ec2093881 100644
--- a/src/runtime/arch1_arm.go
+++ b/src/runtime/arch1_arm.go
@@ -5,12 +5,11 @@
package runtime
const (
- thechar = '5'
- _BigEndian = 0
- _CacheLineSize = 32
- _RuntimeGogoBytes = 60
- _PhysPageSize = 65536*goos_nacl + 4096*(1-goos_nacl)
- _PCQuantum = 4
- _Int64Align = 4
- hugePageSize = 0
+ thechar = '5'
+ _BigEndian = 0
+ _CacheLineSize = 32
+ _PhysPageSize = 65536*goos_nacl + 4096*(1-goos_nacl)
+ _PCQuantum = 4
+ _Int64Align = 4
+ hugePageSize = 0
)
diff --git a/src/runtime/arch1_arm64.go b/src/runtime/arch1_arm64.go
index 549a635ca4..1a3165c8b7 100644
--- a/src/runtime/arch1_arm64.go
+++ b/src/runtime/arch1_arm64.go
@@ -5,12 +5,11 @@
package runtime
const (
- thechar = '7'
- _BigEndian = 0
- _CacheLineSize = 32
- _RuntimeGogoBytes = 64
- _PhysPageSize = 4096*(1-goos_darwin) + 16384*goos_darwin
- _PCQuantum = 4
- _Int64Align = 8
- hugePageSize = 0
+ thechar = '7'
+ _BigEndian = 0
+ _CacheLineSize = 32
+ _PhysPageSize = 4096*(1-goos_darwin) + 16384*goos_darwin
+ _PCQuantum = 4
+ _Int64Align = 8
+ hugePageSize = 0
)
diff --git a/src/runtime/arch1_ppc64.go b/src/runtime/arch1_ppc64.go
index ee453c09f2..de6dd91401 100644
--- a/src/runtime/arch1_ppc64.go
+++ b/src/runtime/arch1_ppc64.go
@@ -5,12 +5,11 @@
package runtime
const (
- thechar = '9'
- _BigEndian = 1
- _CacheLineSize = 64
- _RuntimeGogoBytes = 72
- _PhysPageSize = 65536
- _PCQuantum = 4
- _Int64Align = 8
- hugePageSize = 0
+ thechar = '9'
+ _BigEndian = 1
+ _CacheLineSize = 64
+ _PhysPageSize = 65536
+ _PCQuantum = 4
+ _Int64Align = 8
+ hugePageSize = 0
)
diff --git a/src/runtime/arch1_ppc64le.go b/src/runtime/arch1_ppc64le.go
index aa028a10f3..9a55c71101 100644
--- a/src/runtime/arch1_ppc64le.go
+++ b/src/runtime/arch1_ppc64le.go
@@ -5,12 +5,11 @@
package runtime
const (
- thechar = '9'
- _BigEndian = 0
- _CacheLineSize = 64
- _RuntimeGogoBytes = 72
- _PhysPageSize = 65536
- _PCQuantum = 4
- _Int64Align = 8
- hugePageSize = 0
+ thechar = '9'
+ _BigEndian = 0
+ _CacheLineSize = 64
+ _PhysPageSize = 65536
+ _PCQuantum = 4
+ _Int64Align = 8
+ hugePageSize = 0
)
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index 36353d108f..0f9aeb8f37 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -1693,8 +1693,10 @@ TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
RET
// This is called from .init_array and follows the platform, not Go, ABI.
-TEXT runtime·addmoduledata(SB),NOSPLIT,$0-8
+TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
+ PUSHQ R15 // The access to global variables below implicitly uses R15, which is callee-save
MOVQ runtime·lastmoduledatap(SB), AX
MOVQ DI, moduledata_next(AX)
MOVQ DI, runtime·lastmoduledatap(SB)
+ POPQ R15
RET
diff --git a/src/runtime/atomic_pointer.go b/src/runtime/atomic_pointer.go
index 50a30242d9..f84afe0362 100644
--- a/src/runtime/atomic_pointer.go
+++ b/src/runtime/atomic_pointer.go
@@ -20,18 +20,12 @@ import "unsafe"
func atomicstorep(ptr unsafe.Pointer, new unsafe.Pointer) {
atomicstorep1(noescape(ptr), new)
writebarrierptr_nostore((*uintptr)(ptr), uintptr(new))
- if mheap_.shadow_enabled {
- writebarrierptr_noshadow((*uintptr)(noescape(ptr)))
- }
}
//go:nosplit
func xchgp(ptr unsafe.Pointer, new unsafe.Pointer) unsafe.Pointer {
old := xchgp1(noescape(ptr), new)
writebarrierptr_nostore((*uintptr)(ptr), uintptr(new))
- if mheap_.shadow_enabled {
- writebarrierptr_noshadow((*uintptr)(noescape(ptr)))
- }
return old
}
@@ -41,9 +35,6 @@ func casp(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool {
return false
}
writebarrierptr_nostore((*uintptr)(unsafe.Pointer(ptr)), uintptr(new))
- if mheap_.shadow_enabled {
- writebarrierptr_noshadow((*uintptr)(noescape(unsafe.Pointer(ptr))))
- }
return true
}
@@ -60,9 +51,6 @@ func sync_atomic_StorePointer(ptr *unsafe.Pointer, new unsafe.Pointer) {
sync_atomic_StoreUintptr((*uintptr)(unsafe.Pointer(ptr)), uintptr(new))
atomicstorep1(noescape(unsafe.Pointer(ptr)), new)
writebarrierptr_nostore((*uintptr)(unsafe.Pointer(ptr)), uintptr(new))
- if mheap_.shadow_enabled {
- writebarrierptr_noshadow((*uintptr)(noescape(unsafe.Pointer(ptr))))
- }
}
//go:linkname sync_atomic_SwapUintptr sync/atomic.SwapUintptr
@@ -73,9 +61,6 @@ func sync_atomic_SwapUintptr(ptr *uintptr, new uintptr) uintptr
func sync_atomic_SwapPointer(ptr unsafe.Pointer, new unsafe.Pointer) unsafe.Pointer {
old := unsafe.Pointer(sync_atomic_SwapUintptr((*uintptr)(noescape(ptr)), uintptr(new)))
writebarrierptr_nostore((*uintptr)(ptr), uintptr(new))
- if mheap_.shadow_enabled {
- writebarrierptr_noshadow((*uintptr)(noescape(ptr)))
- }
return old
}
@@ -89,8 +74,5 @@ func sync_atomic_CompareAndSwapPointer(ptr *unsafe.Pointer, old, new unsafe.Poin
return false
}
writebarrierptr_nostore((*uintptr)(unsafe.Pointer(ptr)), uintptr(new))
- if mheap_.shadow_enabled {
- writebarrierptr_noshadow((*uintptr)(noescape(unsafe.Pointer(ptr))))
- }
return true
}
diff --git a/src/runtime/debug.go b/src/runtime/debug.go
index 3ecaac10bc..9aec3b03e0 100644
--- a/src/runtime/debug.go
+++ b/src/runtime/debug.go
@@ -22,17 +22,12 @@ func GOMAXPROCS(n int) int {
return ret
}
- semacquire(&worldsema, false)
- gp := getg()
- gp.m.preemptoff = "GOMAXPROCS"
- systemstack(stoptheworld)
+ stopTheWorld("GOMAXPROCS")
- // newprocs will be processed by starttheworld
+ // newprocs will be processed by startTheWorld
newprocs = int32(n)
- gp.m.preemptoff = ""
- semrelease(&worldsema)
- systemstack(starttheworld)
+ startTheWorld()
return ret
}
diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index e0c8b17bd3..3fddcc868f 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -76,24 +76,17 @@ func ParForIters(desc *ParFor, tid uint32) (uint32, uint32) {
}
func GCMask(x interface{}) (ret []byte) {
- e := (*eface)(unsafe.Pointer(&x))
- s := (*slice)(unsafe.Pointer(&ret))
systemstack(func() {
- var len uintptr
- var a *byte
- getgcmask(e.data, e._type, &a, &len)
- s.array = unsafe.Pointer(a)
- s.len = int(len)
- s.cap = s.len
+ ret = getgcmask(x)
})
return
}
func RunSchedLocalQueueTest() {
- systemstack(testSchedLocalQueue)
+ testSchedLocalQueue()
}
func RunSchedLocalQueueStealTest() {
- systemstack(testSchedLocalQueueSteal)
+ testSchedLocalQueueSteal()
}
var StringHash = stringHash
@@ -106,11 +99,6 @@ var MemclrBytes = memclrBytes
var HashLoad = &hashLoad
-// For testing.
-func GogoBytes() int32 {
- return _RuntimeGogoBytes
-}
-
// entry point for testing
func GostringW(w []uint16) (s string) {
systemstack(func() {
@@ -133,3 +121,34 @@ func Envs() []string { return envs }
func SetEnvs(e []string) { envs = e }
var BigEndian = _BigEndian
+
+// For benchmarking.
+
+func BenchSetType(n int, x interface{}) {
+ e := *(*eface)(unsafe.Pointer(&x))
+ t := e._type
+ var size uintptr
+ var p unsafe.Pointer
+ switch t.kind & kindMask {
+ case _KindPtr:
+ t = (*ptrtype)(unsafe.Pointer(t)).elem
+ size = t.size
+ p = e.data
+ case _KindSlice:
+ slice := *(*struct {
+ ptr unsafe.Pointer
+ len, cap uintptr
+ })(e.data)
+ t = (*slicetype)(unsafe.Pointer(t)).elem
+ size = t.size * slice.len
+ p = slice.ptr
+ }
+ allocSize := roundupsize(size)
+ systemstack(func() {
+ for i := 0; i < n; i++ {
+ heapBitsSetType(uintptr(p), allocSize, size, t)
+ }
+ })
+}
+
+const PtrSize = ptrSize
diff --git a/src/runtime/extern.go b/src/runtime/extern.go
index 540d7b5124..476c3c5ae3 100644
--- a/src/runtime/extern.go
+++ b/src/runtime/extern.go
@@ -58,18 +58,6 @@ a comma-separated list of name=val pairs. Supported names are:
scavenge: scavenge=1 enables debugging mode of heap scavenger.
- wbshadow: setting wbshadow=1 enables a shadow copy of the heap
- used to detect missing write barriers at the next write to a
- given location. If a bug can be detected in this mode it is
- typically easy to understand, since the crash says quite
- clearly what kind of word has missed a write barrier.
- Setting wbshadow=2 checks the shadow copy during garbage
- collection as well. Bugs detected at garbage collection can be
- difficult to understand, because there is no context for what
- the found word means. Typically you have to reproduce the
- problem with allocfreetrace=1 in order to understand the type
- of the badly updated word.
-
gccheckmark: setting gccheckmark=1 enables verification of the
garbage collector's concurrent mark phase by performing a
second mark pass while the world is stopped. If the second
diff --git a/src/runtime/gc_test.go b/src/runtime/gc_test.go
index 6abec4cca7..e3e0c3a583 100644
--- a/src/runtime/gc_test.go
+++ b/src/runtime/gc_test.go
@@ -6,6 +6,7 @@ package runtime_test
import (
"os"
+ "reflect"
"runtime"
"runtime/debug"
"testing"
@@ -197,45 +198,166 @@ func TestHugeGCInfo(t *testing.T) {
}
}
-func BenchmarkSetTypeNoPtr1(b *testing.B) {
- type NoPtr1 struct {
- p uintptr
- }
- var p *NoPtr1
- for i := 0; i < b.N; i++ {
- p = &NoPtr1{}
- }
- _ = p
+func BenchmarkSetTypePtr(b *testing.B) {
+ benchSetType(b, new(*byte))
}
-func BenchmarkSetTypeNoPtr2(b *testing.B) {
- type NoPtr2 struct {
- p, q uintptr
- }
- var p *NoPtr2
- for i := 0; i < b.N; i++ {
- p = &NoPtr2{}
- }
- _ = p
+
+func BenchmarkSetTypePtr8(b *testing.B) {
+ benchSetType(b, new([8]*byte))
}
-func BenchmarkSetTypePtr1(b *testing.B) {
- type Ptr1 struct {
- p *byte
- }
- var p *Ptr1
- for i := 0; i < b.N; i++ {
- p = &Ptr1{}
- }
- _ = p
+
+func BenchmarkSetTypePtr16(b *testing.B) {
+ benchSetType(b, new([16]*byte))
}
-func BenchmarkSetTypePtr2(b *testing.B) {
- type Ptr2 struct {
- p, q *byte
- }
- var p *Ptr2
- for i := 0; i < b.N; i++ {
- p = &Ptr2{}
+
+func BenchmarkSetTypePtr32(b *testing.B) {
+ benchSetType(b, new([32]*byte))
+}
+
+func BenchmarkSetTypePtr64(b *testing.B) {
+ benchSetType(b, new([64]*byte))
+}
+
+func BenchmarkSetTypePtr126(b *testing.B) {
+ benchSetType(b, new([126]*byte))
+}
+
+func BenchmarkSetTypePtr128(b *testing.B) {
+ benchSetType(b, new([128]*byte))
+}
+
+func BenchmarkSetTypePtrSlice(b *testing.B) {
+ benchSetType(b, make([]*byte, 1<<10))
+}
+
+type Node1 struct {
+ Value [1]uintptr
+ Left, Right *byte
+}
+
+func BenchmarkSetTypeNode1(b *testing.B) {
+ benchSetType(b, new(Node1))
+}
+
+func BenchmarkSetTypeNode1Slice(b *testing.B) {
+ benchSetType(b, make([]Node1, 32))
+}
+
+type Node8 struct {
+ Value [8]uintptr
+ Left, Right *byte
+}
+
+func BenchmarkSetTypeNode8(b *testing.B) {
+ benchSetType(b, new(Node8))
+}
+
+func BenchmarkSetTypeNode8Slice(b *testing.B) {
+ benchSetType(b, make([]Node8, 32))
+}
+
+type Node64 struct {
+ Value [64]uintptr
+ Left, Right *byte
+}
+
+func BenchmarkSetTypeNode64(b *testing.B) {
+ benchSetType(b, new(Node64))
+}
+
+func BenchmarkSetTypeNode64Slice(b *testing.B) {
+ benchSetType(b, make([]Node64, 32))
+}
+
+type Node64Dead struct {
+ Left, Right *byte
+ Value [64]uintptr
+}
+
+func BenchmarkSetTypeNode64Dead(b *testing.B) {
+ benchSetType(b, new(Node64Dead))
+}
+
+func BenchmarkSetTypeNode64DeadSlice(b *testing.B) {
+ benchSetType(b, make([]Node64Dead, 32))
+}
+
+type Node124 struct {
+ Value [124]uintptr
+ Left, Right *byte
+}
+
+func BenchmarkSetTypeNode124(b *testing.B) {
+ benchSetType(b, new(Node124))
+}
+
+func BenchmarkSetTypeNode124Slice(b *testing.B) {
+ benchSetType(b, make([]Node124, 32))
+}
+
+type Node126 struct {
+ Value [126]uintptr
+ Left, Right *byte
+}
+
+func BenchmarkSetTypeNode126(b *testing.B) {
+ benchSetType(b, new(Node126))
+}
+
+func BenchmarkSetTypeNode126Slice(b *testing.B) {
+ benchSetType(b, make([]Node126, 32))
+}
+
+type Node128 struct {
+ Value [128]uintptr
+ Left, Right *byte
+}
+
+func BenchmarkSetTypeNode128(b *testing.B) {
+ benchSetType(b, new(Node128))
+}
+
+func BenchmarkSetTypeNode128Slice(b *testing.B) {
+ benchSetType(b, make([]Node128, 32))
+}
+
+type Node130 struct {
+ Value [130]uintptr
+ Left, Right *byte
+}
+
+func BenchmarkSetTypeNode130(b *testing.B) {
+ benchSetType(b, new(Node130))
+}
+
+func BenchmarkSetTypeNode130Slice(b *testing.B) {
+ benchSetType(b, make([]Node130, 32))
+}
+
+type Node1024 struct {
+ Value [1024]uintptr
+ Left, Right *byte
+}
+
+func BenchmarkSetTypeNode1024(b *testing.B) {
+ benchSetType(b, new(Node1024))
+}
+
+func BenchmarkSetTypeNode1024Slice(b *testing.B) {
+ benchSetType(b, make([]Node1024, 32))
+}
+
+func benchSetType(b *testing.B, x interface{}) {
+ v := reflect.ValueOf(x)
+ t := v.Type()
+ switch t.Kind() {
+ case reflect.Ptr:
+ b.SetBytes(int64(t.Elem().Size()))
+ case reflect.Slice:
+ b.SetBytes(int64(t.Elem().Size()) * int64(v.Len()))
}
- _ = p
+ b.ResetTimer()
+ runtime.BenchSetType(b.N, x)
}
func BenchmarkAllocation(b *testing.B) {
diff --git a/src/runtime/gcinfo_test.go b/src/runtime/gcinfo_test.go
index 66b0353f08..f330bf2430 100644
--- a/src/runtime/gcinfo_test.go
+++ b/src/runtime/gcinfo_test.go
@@ -10,8 +10,14 @@ import (
"testing"
)
+const (
+ typeScalar = 0
+ typePointer = 1
+)
+
// TestGCInfo tests that various objects in heap, data and bss receive correct GC pointer type info.
func TestGCInfo(t *testing.T) {
+ verifyGCInfo(t, "bss Ptr", &bssPtr, infoPtr)
verifyGCInfo(t, "bss ScalarPtr", &bssScalarPtr, infoScalarPtr)
verifyGCInfo(t, "bss PtrScalar", &bssPtrScalar, infoPtrScalar)
verifyGCInfo(t, "bss BigStruct", &bssBigStruct, infoBigStruct())
@@ -20,6 +26,7 @@ func TestGCInfo(t *testing.T) {
verifyGCInfo(t, "bss eface", &bssEface, infoEface)
verifyGCInfo(t, "bss iface", &bssIface, infoIface)
+ verifyGCInfo(t, "data Ptr", &dataPtr, infoPtr)
verifyGCInfo(t, "data ScalarPtr", &dataScalarPtr, infoScalarPtr)
verifyGCInfo(t, "data PtrScalar", &dataPtrScalar, infoPtrScalar)
verifyGCInfo(t, "data BigStruct", &dataBigStruct, infoBigStruct())
@@ -28,6 +35,7 @@ func TestGCInfo(t *testing.T) {
verifyGCInfo(t, "data eface", &dataEface, infoEface)
verifyGCInfo(t, "data iface", &dataIface, infoIface)
+ verifyGCInfo(t, "stack Ptr", new(Ptr), infoPtr)
verifyGCInfo(t, "stack ScalarPtr", new(ScalarPtr), infoScalarPtr)
verifyGCInfo(t, "stack PtrScalar", new(PtrScalar), infoPtrScalar)
verifyGCInfo(t, "stack BigStruct", new(BigStruct), infoBigStruct())
@@ -37,38 +45,43 @@ func TestGCInfo(t *testing.T) {
verifyGCInfo(t, "stack iface", new(Iface), infoIface)
for i := 0; i < 10; i++ {
- verifyGCInfo(t, "heap ScalarPtr", escape(new(ScalarPtr)), infoScalarPtr)
- verifyGCInfo(t, "heap PtrScalar", escape(new(PtrScalar)), infoPtrScalar)
- verifyGCInfo(t, "heap BigStruct", escape(new(BigStruct)), infoBigStruct())
- verifyGCInfo(t, "heap string", escape(new(string)), infoString)
- verifyGCInfo(t, "heap eface", escape(new(interface{})), infoEface)
- verifyGCInfo(t, "heap iface", escape(new(Iface)), infoIface)
+ verifyGCInfo(t, "heap Ptr", escape(new(Ptr)), trimDead(padDead(infoPtr)))
+ verifyGCInfo(t, "heap PtrSlice", escape(&make([]*byte, 10)[0]), trimDead(infoPtr10))
+ verifyGCInfo(t, "heap ScalarPtr", escape(new(ScalarPtr)), trimDead(infoScalarPtr))
+ verifyGCInfo(t, "heap ScalarPtrSlice", escape(&make([]ScalarPtr, 4)[0]), trimDead(infoScalarPtr4))
+ verifyGCInfo(t, "heap PtrScalar", escape(new(PtrScalar)), trimDead(infoPtrScalar))
+ verifyGCInfo(t, "heap BigStruct", escape(new(BigStruct)), trimDead(infoBigStruct()))
+ verifyGCInfo(t, "heap string", escape(new(string)), trimDead(infoString))
+ verifyGCInfo(t, "heap eface", escape(new(interface{})), trimDead(infoEface))
+ verifyGCInfo(t, "heap iface", escape(new(Iface)), trimDead(infoIface))
}
-
}
func verifyGCInfo(t *testing.T, name string, p interface{}, mask0 []byte) {
mask := runtime.GCMask(p)
- if len(mask) > len(mask0) {
- mask0 = append(mask0, typeDead)
- mask = mask[:len(mask0)]
- }
if bytes.Compare(mask, mask0) != 0 {
t.Errorf("bad GC program for %v:\nwant %+v\ngot %+v", name, mask0, mask)
return
}
}
-func nonStackInfo(mask []byte) []byte {
- // typeDead is replaced with typeScalar everywhere except stacks.
- mask1 := make([]byte, len(mask))
- for i, v := range mask {
- if v == typeDead {
- v = typeScalar
- }
- mask1[i] = v
+func padDead(mask []byte) []byte {
+ // Because the dead bit isn't encoded until the third word,
+ // and because on 32-bit systems a one-word allocation
+ // uses a two-word block, the pointer info for a one-word
+ // object needs to be expanded to include an extra scalar
+ // on 32-bit systems to match the heap bitmap.
+ if runtime.PtrSize == 4 && len(mask) == 1 {
+ return []byte{mask[0], 0}
+ }
+ return mask
+}
+
+func trimDead(mask []byte) []byte {
+ for len(mask) > 2 && mask[len(mask)-1] == typeScalar {
+ mask = mask[:len(mask)-1]
}
- return mask1
+ return mask
}
var gcinfoSink interface{}
@@ -78,18 +91,13 @@ func escape(p interface{}) interface{} {
return p
}
-const (
- typeDead = iota
- typeScalar
- typePointer
-)
+var infoPtr = []byte{typePointer}
-const (
- BitsString = iota // unused
- BitsSlice // unused
- BitsIface
- BitsEface
-)
+type Ptr struct {
+ *byte
+}
+
+var infoPtr10 = []byte{typePointer, typePointer, typePointer, typePointer, typePointer, typePointer, typePointer, typePointer, typePointer, typePointer}
type ScalarPtr struct {
q int
@@ -102,6 +110,8 @@ type ScalarPtr struct {
var infoScalarPtr = []byte{typeScalar, typePointer, typeScalar, typePointer, typeScalar, typePointer}
+var infoScalarPtr4 = append(append(append(append([]byte(nil), infoScalarPtr...), infoScalarPtr...), infoScalarPtr...), infoScalarPtr...)
+
type PtrScalar struct {
q *int
w int
@@ -166,6 +176,7 @@ func (IfaceImpl) f() {
var (
// BSS
+ bssPtr Ptr
bssScalarPtr ScalarPtr
bssPtrScalar PtrScalar
bssBigStruct BigStruct
@@ -175,6 +186,7 @@ var (
bssIface Iface
// DATA
+ dataPtr = Ptr{new(byte)}
dataScalarPtr = ScalarPtr{q: 1}
dataPtrScalar = PtrScalar{w: 1}
dataBigStruct = BigStruct{w: 1}
diff --git a/src/runtime/hashmap.go b/src/runtime/hashmap.go
index 9ca33992bb..b199330a1e 100644
--- a/src/runtime/hashmap.go
+++ b/src/runtime/hashmap.go
@@ -233,6 +233,9 @@ func makemap(t *maptype, hint int64, h *hmap, bucket unsafe.Pointer) *hmap {
throw("need padding in bucket (value)")
}
+ // make sure zero of element type is available.
+ mapzero(t.elem)
+
// find size parameter which will hold the requested # of elements
B := uint8(0)
for ; hint > bucketCnt && float32(hint) > loadFactor*float32(uintptr(1)<<B); B++ {
@@ -990,3 +993,60 @@ func reflect_maplen(h *hmap) int {
func reflect_ismapkey(t *_type) bool {
return ismapkey(t)
}
+
+var zerobuf struct {
+ lock mutex
+ p *byte
+ size uintptr
+}
+
+var zerotiny [1024]byte
+
+// mapzero ensures that t.zero points at a zero value for type t.
+// Types known to the compiler are in read-only memory and all point
+// to a single zero in the bss of a large enough size.
+// Types allocated by package reflect are in writable memory and
+// start out with zero set to nil; we initialize those on demand.
+func mapzero(t *_type) {
+ // On ARM, atomicloadp is implemented as xadd(p, 0),
+ // so we cannot use atomicloadp on read-only memory.
+ // Check whether the pointer is in the heap; if not, it's not writable
+ // so the zero value must already be set.
+ if GOARCH == "arm" && !inheap(uintptr(unsafe.Pointer(t))) {
+ if t.zero == nil {
+ print("runtime: map element ", *t._string, " missing zero value\n")
+ throw("mapzero")
+ }
+ return
+ }
+
+ // Already done?
+ // Check without lock, so must use atomicload to sync with atomicstore in allocation case below.
+ if atomicloadp(unsafe.Pointer(&t.zero)) != nil {
+ return
+ }
+
+ // Small enough for static buffer?
+ if t.size <= uintptr(len(zerotiny)) {
+ atomicstorep(unsafe.Pointer(&t.zero), unsafe.Pointer(&zerotiny[0]))
+ return
+ }
+
+ // Use allocated buffer.
+ lock(&zerobuf.lock)
+ if zerobuf.size < t.size {
+ if zerobuf.size == 0 {
+ zerobuf.size = 4 * 1024
+ }
+ for zerobuf.size < t.size {
+ zerobuf.size *= 2
+ if zerobuf.size == 0 {
+ // need >2GB zero on 32-bit machine
+ throw("map element too large")
+ }
+ }
+ zerobuf.p = (*byte)(persistentalloc(zerobuf.size, 64, &memstats.other_sys))
+ }
+ atomicstorep(unsafe.Pointer(&t.zero), unsafe.Pointer(zerobuf.p))
+ unlock(&zerobuf.lock)
+}
diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go
index e18aa79164..c0fff3f1ce 100644
--- a/src/runtime/heapdump.go
+++ b/src/runtime/heapdump.go
@@ -15,20 +15,13 @@ import "unsafe"
//go:linkname runtime_debug_WriteHeapDump runtime/debug.WriteHeapDump
func runtime_debug_WriteHeapDump(fd uintptr) {
- semacquire(&worldsema, false)
- gp := getg()
- gp.m.preemptoff = "write heap dump"
- systemstack(stoptheworld)
+ stopTheWorld("write heap dump")
systemstack(func() {
writeheapdump_m(fd)
})
- gp.m.preemptoff = ""
- gp.m.locks++
- semrelease(&worldsema)
- systemstack(starttheworld)
- gp.m.locks--
+ startTheWorld()
}
const (
@@ -730,14 +723,13 @@ func makeheapobjbv(p uintptr, size uintptr) bitvector {
i := uintptr(0)
hbits := heapBitsForAddr(p)
for ; i < nptr; i++ {
- bits := hbits.typeBits()
- if bits == typeDead {
+ if i >= 2 && !hbits.isMarked() {
break // end of object
}
- hbits = hbits.next()
- if bits == typePointer {
+ if hbits.isPointer() {
tmpbuf[i/8] |= 1 << (i % 8)
}
+ hbits = hbits.next()
}
return bitvector{int32(i), &tmpbuf[0]}
}
diff --git a/src/runtime/lfstack_test.go b/src/runtime/lfstack_test.go
index 68f221d6ef..4da4d88619 100644
--- a/src/runtime/lfstack_test.go
+++ b/src/runtime/lfstack_test.go
@@ -24,8 +24,12 @@ func toMyNode(node *LFNode) *MyNode {
return (*MyNode)(unsafe.Pointer(node))
}
+var global interface{}
+
func TestLFStack(t *testing.T) {
stack := new(uint64)
+ global = stack // force heap allocation
+
// Need to keep additional referenfces to nodes, the stack is not all that type-safe.
var nodes []*MyNode
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 1619ccb9f4..2d7e55643f 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -424,9 +424,6 @@ func mHeap_SysAlloc(h *mheap, n uintptr) unsafe.Pointer {
if raceenabled {
racemapshadow((unsafe.Pointer)(p), n)
}
- if mheap_.shadow_enabled {
- sysMap(unsafe.Pointer(p+mheap_.shadow_heap), n, h.shadow_reserved, &memstats.other_sys)
- }
if uintptr(p)&(_PageSize-1) != 0 {
throw("misrounded allocation in MHeap_SysAlloc")
@@ -512,6 +509,9 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
if mp.mallocing != 0 {
throw("malloc deadlock")
}
+ if mp.gsignal == getg() {
+ throw("malloc during signal")
+ }
mp.mallocing = 1
shouldhelpgc := false
@@ -669,10 +669,6 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
})
}
- if mheap_.shadow_enabled {
- clearshadow(uintptr(x), size)
- }
-
if raceenabled {
racemalloc(x, size)
}
diff --git a/src/runtime/mbarrier.go b/src/runtime/mbarrier.go
index eb5881707b..53a0a00ae7 100644
--- a/src/runtime/mbarrier.go
+++ b/src/runtime/mbarrier.go
@@ -10,12 +10,6 @@
// implementation, markwb, and the various wrappers called by the
// compiler to implement pointer assignment, slice assignment,
// typed memmove, and so on.
-//
-// To check for missed write barriers, the GODEBUG=wbshadow debugging
-// mode allocates a second copy of the heap. Write barrier-based pointer
-// updates make changes to both the real heap and the shadow, and both
-// the pointer updates and the GC look for inconsistencies between the two,
-// indicating pointer writes that bypassed the barrier.
package runtime
@@ -66,7 +60,7 @@ func gcmarkwb_m(slot *uintptr, ptr uintptr) {
default:
throw("gcphasework in bad gcphase")
- case _GCoff, _GCquiesce, _GCstw, _GCsweep, _GCscan:
+ case _GCoff, _GCstw, _GCsweep, _GCscan:
// ok
case _GCmark, _GCmarktermination:
@@ -107,43 +101,19 @@ func writebarrierptr_nostore1(dst *uintptr, src uintptr) {
// but if we do that, Go inserts a write barrier on *dst = src.
//go:nosplit
func writebarrierptr(dst *uintptr, src uintptr) {
+ *dst = src
if !writeBarrierEnabled {
- *dst = src
return
}
-
if src != 0 && (src < _PhysPageSize || src == poisonStack) {
- systemstack(func() { throw("bad pointer in write barrier") })
- }
-
- if mheap_.shadow_enabled {
- writebarrierptr_shadow(dst, src)
+ systemstack(func() {
+ print("runtime: writebarrierptr *", dst, " = ", hex(src), "\n")
+ throw("bad pointer in write barrier")
+ })
}
-
- *dst = src
writebarrierptr_nostore1(dst, src)
}
-//go:nosplit
-func writebarrierptr_shadow(dst *uintptr, src uintptr) {
- systemstack(func() {
- addr := uintptr(unsafe.Pointer(dst))
- shadow := shadowptr(addr)
- if shadow == nil {
- return
- }
- // There is a race here but only if the program is using
- // racy writes instead of sync/atomic. In that case we
- // don't mind crashing.
- if *shadow != *dst && *shadow != noShadow && istrackedptr(*dst) {
- mheap_.shadow_enabled = false
- print("runtime: write barrier dst=", dst, " old=", hex(*dst), " shadow=", shadow, " old=", hex(*shadow), " new=", hex(src), "\n")
- throw("missed write barrier")
- }
- *shadow = src
- })
-}
-
// Like writebarrierptr, but the store has already been applied.
// Do not reapply.
//go:nosplit
@@ -151,44 +121,12 @@ func writebarrierptr_nostore(dst *uintptr, src uintptr) {
if !writeBarrierEnabled {
return
}
-
if src != 0 && (src < _PhysPageSize || src == poisonStack) {
systemstack(func() { throw("bad pointer in write barrier") })
}
-
- // Apply changes to shadow.
- // Since *dst has been overwritten already, we cannot check
- // whether there were any missed updates, but writebarrierptr_nostore
- // is only rarely used.
- if mheap_.shadow_enabled {
- systemstack(func() {
- addr := uintptr(unsafe.Pointer(dst))
- shadow := shadowptr(addr)
- if shadow == nil {
- return
- }
- *shadow = src
- })
- }
-
writebarrierptr_nostore1(dst, src)
}
-// writebarrierptr_noshadow records that the value in *dst
-// has been written to using an atomic operation and the shadow
-// has not been updated. (In general if dst must be manipulated
-// atomically we cannot get the right bits for use in the shadow.)
-//go:nosplit
-func writebarrierptr_noshadow(dst *uintptr) {
- addr := uintptr(unsafe.Pointer(dst))
- shadow := shadowptr(addr)
- if shadow == nil {
- return
- }
-
- *shadow = noShadow
-}
-
//go:nosplit
func writebarrierstring(dst *[2]uintptr, src [2]uintptr) {
writebarrierptr(&dst[0], src[0])
@@ -217,37 +155,11 @@ func writebarrieriface(dst *[2]uintptr, src [2]uintptr) {
// typedmemmove copies a value of type t to dst from src.
//go:nosplit
func typedmemmove(typ *_type, dst, src unsafe.Pointer) {
- if !writeBarrierEnabled || (typ.kind&kindNoPointers) != 0 {
- memmove(dst, src, typ.size)
+ memmove(dst, src, typ.size)
+ if typ.kind&kindNoPointers != 0 {
return
}
-
- systemstack(func() {
- mask := typeBitmapInHeapBitmapFormat(typ)
- nptr := typ.size / ptrSize
- for i := uintptr(0); i < nptr; i += 2 {
- bits := mask[i/2]
- if (bits>>2)&typeMask == typePointer {
- writebarrierptr((*uintptr)(dst), *(*uintptr)(src))
- } else {
- *(*uintptr)(dst) = *(*uintptr)(src)
- }
- // TODO(rsc): The noescape calls should be unnecessary.
- dst = add(noescape(dst), ptrSize)
- src = add(noescape(src), ptrSize)
- if i+1 == nptr {
- break
- }
- bits >>= 4
- if (bits>>2)&typeMask == typePointer {
- writebarrierptr((*uintptr)(dst), *(*uintptr)(src))
- } else {
- *(*uintptr)(dst) = *(*uintptr)(src)
- }
- dst = add(noescape(dst), ptrSize)
- src = add(noescape(src), ptrSize)
- }
- })
+ heapBitsBulkBarrier(uintptr(dst), typ.size)
}
//go:linkname reflect_typedmemmove reflect.typedmemmove
@@ -259,38 +171,16 @@ func reflect_typedmemmove(typ *_type, dst, src unsafe.Pointer) {
// dst and src point off bytes into the value and only copies size bytes.
//go:linkname reflect_typedmemmovepartial reflect.typedmemmovepartial
func reflect_typedmemmovepartial(typ *_type, dst, src unsafe.Pointer, off, size uintptr) {
- if !writeBarrierEnabled || (typ.kind&kindNoPointers) != 0 || size < ptrSize {
- memmove(dst, src, size)
+ memmove(dst, src, size)
+ if !writeBarrierEnabled || typ.kind&kindNoPointers != 0 || size < ptrSize || !inheap(uintptr(dst)) {
return
}
- if off&(ptrSize-1) != 0 {
- frag := -off & (ptrSize - 1)
- // frag < size, because size >= ptrSize, checked above.
- memmove(dst, src, frag)
+ if frag := -off & (ptrSize - 1); frag != 0 {
+ dst = add(dst, frag)
size -= frag
- dst = add(noescape(dst), frag)
- src = add(noescape(src), frag)
- off += frag
- }
-
- mask := typeBitmapInHeapBitmapFormat(typ)
- nptr := (off + size) / ptrSize
- for i := uintptr(off / ptrSize); i < nptr; i++ {
- bits := mask[i/2] >> ((i & 1) << 2)
- if (bits>>2)&typeMask == typePointer {
- writebarrierptr((*uintptr)(dst), *(*uintptr)(src))
- } else {
- *(*uintptr)(dst) = *(*uintptr)(src)
- }
- // TODO(rsc): The noescape calls should be unnecessary.
- dst = add(noescape(dst), ptrSize)
- src = add(noescape(src), ptrSize)
- }
- size &= ptrSize - 1
- if size > 0 {
- memmove(dst, src, size)
}
+ heapBitsBulkBarrier(uintptr(dst), size&^(ptrSize-1))
}
// callwritebarrier is invoked at the end of reflectcall, to execute
@@ -302,29 +192,16 @@ func reflect_typedmemmovepartial(typ *_type, dst, src unsafe.Pointer, off, size
// not to be preempted before the write barriers have been run.
//go:nosplit
func callwritebarrier(typ *_type, frame unsafe.Pointer, framesize, retoffset uintptr) {
- if !writeBarrierEnabled || typ == nil || (typ.kind&kindNoPointers) != 0 || framesize-retoffset < ptrSize {
+ if !writeBarrierEnabled || typ == nil || typ.kind&kindNoPointers != 0 || framesize-retoffset < ptrSize || !inheap(uintptr(frame)) {
return
}
-
- systemstack(func() {
- mask := typeBitmapInHeapBitmapFormat(typ)
- // retoffset is known to be pointer-aligned (at least).
- // TODO(rsc): The noescape call should be unnecessary.
- dst := add(noescape(frame), retoffset)
- nptr := framesize / ptrSize
- for i := uintptr(retoffset / ptrSize); i < nptr; i++ {
- bits := mask[i/2] >> ((i & 1) << 2)
- if (bits>>2)&typeMask == typePointer {
- writebarrierptr_nostore((*uintptr)(dst), *(*uintptr)(dst))
- }
- // TODO(rsc): The noescape call should be unnecessary.
- dst = add(noescape(dst), ptrSize)
- }
- })
+ heapBitsBulkBarrier(uintptr(add(frame, retoffset)), framesize-retoffset)
}
//go:nosplit
func typedslicecopy(typ *_type, dst, src slice) int {
+ // TODO(rsc): If typedslicecopy becomes faster than calling
+ // typedmemmove repeatedly, consider using during func growslice.
n := dst.len
if n > src.len {
n = src.len
@@ -342,6 +219,10 @@ func typedslicecopy(typ *_type, dst, src slice) int {
racereadrangepc(srcp, uintptr(n)*typ.size, callerpc, pc)
}
+ // Note: No point in checking typ.kind&kindNoPointers here:
+ // compiler only emits calls to typedslicecopy for types with pointers,
+ // and growslice and reflect_typedslicecopy check for pointers
+ // before calling typedslicecopy.
if !writeBarrierEnabled {
memmove(dstp, srcp, uintptr(n)*typ.size)
return n
@@ -382,134 +263,13 @@ func typedslicecopy(typ *_type, dst, src slice) int {
//go:linkname reflect_typedslicecopy reflect.typedslicecopy
func reflect_typedslicecopy(elemType *_type, dst, src slice) int {
- return typedslicecopy(elemType, dst, src)
-}
-
-// Shadow heap for detecting missed write barriers.
-
-// noShadow is stored in as the shadow pointer to mark that there is no
-// shadow word recorded. It matches any actual pointer word.
-// noShadow is used when it is impossible to know the right word
-// to store in the shadow heap, such as when the real heap word
-// is being manipulated atomically.
-const noShadow uintptr = 1
-
-func wbshadowinit() {
- // Initialize write barrier shadow heap if we were asked for it
- // and we have enough address space (not on 32-bit).
- if debug.wbshadow == 0 {
- return
- }
- if ptrSize != 8 {
- print("runtime: GODEBUG=wbshadow=1 disabled on 32-bit system\n")
- return
- }
-
- var reserved bool
- p1 := sysReserveHigh(mheap_.arena_end-mheap_.arena_start, &reserved)
- if p1 == nil {
- throw("cannot map shadow heap")
- }
- mheap_.shadow_heap = uintptr(p1) - mheap_.arena_start
- sysMap(p1, mheap_.arena_used-mheap_.arena_start, reserved, &memstats.other_sys)
- memmove(p1, unsafe.Pointer(mheap_.arena_start), mheap_.arena_used-mheap_.arena_start)
-
- mheap_.shadow_reserved = reserved
-
- for datap := &firstmoduledata; datap != nil; datap = datap.next {
- start := ^uintptr(0)
- end := uintptr(0)
- if start > datap.noptrdata {
- start = datap.noptrdata
- }
- if start > datap.data {
- start = datap.data
- }
- if start > datap.noptrbss {
- start = datap.noptrbss
- }
- if start > datap.bss {
- start = datap.bss
- }
- if end < datap.enoptrdata {
- end = datap.enoptrdata
- }
- if end < datap.edata {
- end = datap.edata
- }
- if end < datap.enoptrbss {
- end = datap.enoptrbss
- }
- if end < datap.ebss {
- end = datap.ebss
- }
- start &^= _PhysPageSize - 1
- end = round(end, _PhysPageSize)
- datap.data_start = start
- datap.data_end = end
- reserved = false
- p1 = sysReserveHigh(end-start, &reserved)
- if p1 == nil {
- throw("cannot map shadow data")
+ if elemType.kind&kindNoPointers != 0 {
+ n := dst.len
+ if n > src.len {
+ n = src.len
}
- datap.shadow_data = uintptr(p1) - start
- sysMap(p1, end-start, reserved, &memstats.other_sys)
- memmove(p1, unsafe.Pointer(start), end-start)
- }
-
- mheap_.shadow_enabled = true
- writeBarrierEnabled = true
-}
-
-// shadowptr returns a pointer to the shadow value for addr.
-//go:nosplit
-func shadowptr(addr uintptr) *uintptr {
- for datap := &firstmoduledata; datap != nil; datap = datap.next {
- if datap.data_start <= addr && addr < datap.data_end {
- return (*uintptr)(unsafe.Pointer(addr + datap.shadow_data))
- }
- }
- if inheap(addr) {
- return (*uintptr)(unsafe.Pointer(addr + mheap_.shadow_heap))
- }
- return nil
-}
-
-// istrackedptr reports whether the pointer value p requires a write barrier
-// when stored into the heap.
-func istrackedptr(p uintptr) bool {
- return inheap(p)
-}
-
-// checkwbshadow checks that p matches its shadow word.
-// The garbage collector calls checkwbshadow for each pointer during the checkmark phase.
-// It is only called when mheap_.shadow_enabled is true.
-func checkwbshadow(p *uintptr) {
- addr := uintptr(unsafe.Pointer(p))
- shadow := shadowptr(addr)
- if shadow == nil {
- return
- }
- // There is no race on the accesses here, because the world is stopped,
- // but there may be racy writes that lead to the shadow and the
- // heap being inconsistent. If so, we will detect that here as a
- // missed write barrier and crash. We don't mind.
- // Code should use sync/atomic instead of racy pointer writes.
- if *shadow != *p && *shadow != noShadow && istrackedptr(*p) {
- mheap_.shadow_enabled = false
- print("runtime: checkwritebarrier p=", p, " *p=", hex(*p), " shadow=", shadow, " *shadow=", hex(*shadow), "\n")
- throw("missed write barrier")
- }
-}
-
-// clearshadow clears the shadow copy associated with the n bytes of memory at addr.
-func clearshadow(addr, n uintptr) {
- if !mheap_.shadow_enabled {
- return
- }
- p := shadowptr(addr)
- if p == nil || n <= ptrSize {
- return
+ memmove(dst.array, src.array, uintptr(n)*elemType.size)
+ return n
}
- memclr(unsafe.Pointer(p), n)
+ return typedslicecopy(elemType, dst, src)
}
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index f0c7520e38..b20908fb49 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -6,48 +6,40 @@
//
// Stack, data, and bss bitmaps
//
-// Not handled in this file, but worth mentioning: stack frames and global data
-// in the data and bss sections are described by 1-bit bitmaps in which 0 means
-// scalar or uninitialized or dead and 1 means pointer to visit during GC.
-//
-// Comparing this 1-bit form with the 2-bit form described below, 0 represents
-// both the 2-bit 00 and 01, while 1 represents the 2-bit 10.
-// Therefore conversions between the two (until the 2-bit form is gone)
-// can be done by x>>1 for 2-bit to 1-bit and x+1 for 1-bit to 2-bit.
-//
-// Type bitmaps
-//
-// Types that aren't too large
-// record information about the layout of their memory words using a type bitmap.
-// The bitmap holds two bits for each pointer-sized word. The two-bit values are:
-//
-// 00 - typeDead: not a pointer, and no pointers in the rest of the object
-// 01 - typeScalar: not a pointer
-// 10 - typePointer: a pointer that GC should trace
-// 11 - unused
-//
-// typeDead only appears in type bitmaps in Go type descriptors
-// and in type bitmaps embedded in the heap bitmap (see below).
+// Stack frames and global variables in the data and bss sections are described
+// by 1-bit bitmaps in which 0 means uninteresting and 1 means live pointer
+// to be visited during GC. The bits in each byte are consumed starting with
+// the low bit: 1<<0, 1<<1, and so on.
//
// Heap bitmap
//
// The allocated heap comes from a subset of the memory in the range [start, used),
// where start == mheap_.arena_start and used == mheap_.arena_used.
-// The heap bitmap comprises 4 bits for each pointer-sized word in that range,
+// The heap bitmap comprises 2 bits for each pointer-sized word in that range,
// stored in bytes indexed backward in memory from start.
-// That is, the byte at address start-1 holds the 4-bit entries for the two words
-// start, start+ptrSize, the byte at start-2 holds the entries for start+2*ptrSize,
-// start+3*ptrSize, and so on.
-// In the byte holding the entries for addresses p and p+ptrSize, the low 4 bits
-// describe p and the high 4 bits describe p+ptrSize.
+// That is, the byte at address start-1 holds the 2-bit entries for the four words
+// start through start+3*ptrSize, the byte at start-2 holds the entries for
+// start+4*ptrSize through start+7*ptrSize, and so on.
//
-// The 4 bits for each word are:
-// 0001 - not used
-// 0010 - bitMarked: this object has been marked by GC
-// tt00 - word type bits, as in a type bitmap.
+// In each 2-bit entry, the lower bit holds the same information as in the 1-bit
+// bitmaps: 0 means uninteresting and 1 means live pointer to be visited during GC.
+// The meaning of the high bit depends on the position of the word being described
+// in its allocated object. In the first word, the high bit is the GC ``marked'' bit.
+// In the second word, the high bit is the GC ``checkmarked'' bit (see below).
+// In the third and later words, the high bit indicates that the object is still
+// being described. In these words, if a bit pair with a high bit 0 is encountered,
+// the low bit can also be assumed to be 0, and the object description is over.
+// This 00 is called the ``dead'' encoding: it signals that the rest of the words
+// in the object are uninteresting to the garbage collector.
//
-// The code makes use of the fact that the zero value for a heap bitmap nibble
-// has no boundary bit set, no marked bit set, and type bits == typeDead.
+// The 2-bit entries are split when written into the byte, so that the top half
+// of the byte contains 4 mark bits and the bottom half contains 4 pointer bits.
+// This form allows a copy from the 1-bit to the 4-bit form to keep the
+// pointer bits contiguous, instead of having to space them out.
+//
+// The code makes use of the fact that the zero value for a heap bitmap
+// has no live pointer bit set and is (depending on position), not marked,
+// not checkmarked, and is the dead encoding.
// These properties must be preserved when modifying the encoding.
//
// Checkmarks
@@ -57,55 +49,71 @@
// collector implementation. As a sanity check, the GC has a 'checkmark'
// mode that retraverses the object graph with the world stopped, to make
// sure that everything that should be marked is marked.
-// In checkmark mode, in the heap bitmap, the type bits for the first word
-// of an object are redefined:
-//
-// 00 - typeScalarCheckmarked // typeScalar, checkmarked
-// 01 - typeScalar // typeScalar, not checkmarked
-// 10 - typePointer // typePointer, not checkmarked
-// 11 - typePointerCheckmarked // typePointer, checkmarked
+// In checkmark mode, in the heap bitmap, the high bit of the 2-bit entry
+// for the second word of the object holds the checkmark bit.
+// When not in checkmark mode, this bit is set to 1.
//
-// That is, typeDead is redefined to be typeScalar + a checkmark, and the
-// previously unused 11 pattern is redefined to be typePointer + a checkmark.
-// To prepare for this mode, we must move any typeDead in the first word of
-// a multiword object to the second word.
+// The smallest possible allocation is 8 bytes. On a 32-bit machine, that
+// means every allocated object has two words, so there is room for the
+// checkmark bit. On a 64-bit machine, however, the 8-byte allocation is
+// just one word, so the second bit pair is not available for encoding the
+// checkmark. However, because non-pointer allocations are combined
+// into larger 16-byte (maxTinySize) allocations, a plain 8-byte allocation
+// must be a pointer, so the type bit in the first word is not actually needed.
+// It is still used in general, except in checkmark the type bit is repurposed
+// as the checkmark bit and then reinitialized (to 1) as the type bit when
+// finished.
package runtime
import "unsafe"
const (
- typeDead = 0
- typeScalarCheckmarked = 0
- typeScalar = 1
- typePointer = 2
- typePointerCheckmarked = 3
+ bitPointer = 1 << 0
+ bitMarked = 1 << 4
- typeBitsWidth = 2 // # of type bits per pointer-sized word
- typeMask = 1<<typeBitsWidth - 1
+ heapBitsShift = 1 // shift offset between successive bitPointer or bitMarked entries
+ heapBitmapScale = ptrSize * (8 / 2) // number of data bytes described by one heap bitmap byte
- heapBitsWidth = 4
- heapBitmapScale = ptrSize * (8 / heapBitsWidth) // number of data bytes per heap bitmap byte
- bitMarked = 2
- typeShift = 2
+ // all mark/pointer bits in a byte
+ bitMarkedAll = bitMarked | bitMarked<<heapBitsShift | bitMarked<<(2*heapBitsShift) | bitMarked<<(3*heapBitsShift)
+ bitPointerAll = bitPointer | bitPointer<<heapBitsShift | bitPointer<<(2*heapBitsShift) | bitPointer<<(3*heapBitsShift)
)
-// Information from the compiler about the layout of stack frames.
-type bitvector struct {
- n int32 // # of bits
- bytedata *uint8
-}
-
// addb returns the byte pointer p+n.
//go:nowritebarrier
func addb(p *byte, n uintptr) *byte {
- return (*byte)(add(unsafe.Pointer(p), n))
+ // Note: wrote out full expression instead of calling add(p, n)
+ // to reduce the number of temporaries generated by the
+ // compiler for this trivial expression during inlining.
+ return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + n))
}
// subtractb returns the byte pointer p-n.
//go:nowritebarrier
func subtractb(p *byte, n uintptr) *byte {
- return (*byte)(add(unsafe.Pointer(p), -n))
+ // Note: wrote out full expression instead of calling add(p, -n)
+ // to reduce the number of temporaries generated by the
+ // compiler for this trivial expression during inlining.
+ return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - n))
+}
+
+// add1 returns the byte pointer p+1.
+//go:nowritebarrier
+func add1(p *byte) *byte {
+ // Note: wrote out full expression instead of calling addb(p, 1)
+ // to reduce the number of temporaries generated by the
+ // compiler for this trivial expression during inlining.
+ return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + 1))
+}
+
+// subtract1 returns the byte pointer p-1.
+//go:nowritebarrier
+func subtract1(p *byte) *byte {
+ // Note: wrote out full expression instead of calling subtractb(p, 1)
+ // to reduce the number of temporaries generated by the
+ // compiler for this trivial expression during inlining.
+ return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - 1))
}
// mHeap_MapBits is called each time arena_used is extended.
@@ -140,9 +148,13 @@ type heapBits struct {
// heapBitsForAddr returns the heapBits for the address addr.
// The caller must have already checked that addr is in the range [mheap_.arena_start, mheap_.arena_used).
+//
+// nosplit because it is used during write barriers and must not be preempted.
+//go:nosplit
func heapBitsForAddr(addr uintptr) heapBits {
+	// 2 bits per word, 4 pairs per byte, and a mask is hard coded.
off := (addr - mheap_.arena_start) / ptrSize
- return heapBits{(*uint8)(unsafe.Pointer(mheap_.arena_start - off/2 - 1)), uint32(4 * (off & 1))}
+ return heapBits{(*uint8)(unsafe.Pointer(mheap_.arena_start - off/4 - 1)), uint32(off & 3)}
}
// heapBitsForSpan returns the heapBits for the span base address base.
@@ -229,20 +241,39 @@ func (h heapBits) prefetch() {
// That is, if h describes address p, h.next() describes p+ptrSize.
// Note that next does not modify h. The caller must record the result.
func (h heapBits) next() heapBits {
- if h.shift == 0 {
- return heapBits{h.bitp, 4}
+ if h.shift < 3*heapBitsShift {
+ return heapBits{h.bitp, h.shift + heapBitsShift}
}
- return heapBits{subtractb(h.bitp, 1), 0}
+ return heapBits{subtract1(h.bitp), 0}
+}
+
+// forward returns the heapBits describing n pointer-sized words ahead of h in memory.
+// That is, if h describes address p, h.forward(n) describes p+n*ptrSize.
+// h.forward(1) is equivalent to h.next(), just slower.
+// Note that forward does not modify h. The caller must record the result.
+func (h heapBits) forward(n uintptr) heapBits {
+	n += uintptr(h.shift) / heapBitsShift
+	return heapBits{subtractb(h.bitp, n/4), uint32(n%4) * heapBitsShift}
+}
+
+// bits returns the heap bits for the current word.
+// The caller can test isMarked and isPointer by &-ing with bitMarked and bitPointer.
+// The result includes in its higher bits the bits for subsequent words
+// described by the same bitmap byte.
+func (h heapBits) bits() uint32 {
+ return uint32(*h.bitp) >> h.shift
}
// isMarked reports whether the heap bits have the marked bit set.
+// h must describe the initial word of the object.
func (h heapBits) isMarked() bool {
return *h.bitp&(bitMarked<<h.shift) != 0
}
// setMarked sets the marked bit in the heap bits, atomically.
+// h must describe the initial word of the object.
func (h heapBits) setMarked() {
- // Each byte of GC bitmap holds info for two words.
+ // Each byte of GC bitmap holds info for four words.
// Might be racing with other updates, so use atomic update always.
// We used to be clever here and use a non-atomic update in certain
// cases, but it's not worth the risk.
@@ -250,30 +281,103 @@ func (h heapBits) setMarked() {
}
// setMarkedNonAtomic sets the marked bit in the heap bits, non-atomically.
+// h must describe the initial word of the object.
func (h heapBits) setMarkedNonAtomic() {
*h.bitp |= bitMarked << h.shift
}
-// typeBits returns the heap bits' type bits.
-func (h heapBits) typeBits() uint8 {
- return (*h.bitp >> (h.shift + typeShift)) & typeMask
+// isPointer reports whether the heap bits describe a pointer word.
+// h must describe the initial word of the object.
+func (h heapBits) isPointer() bool {
+ return (*h.bitp>>h.shift)&bitPointer != 0
+}
+
+// hasPointers reports whether the given object has any pointers.
+// It must be told how large the object at h is, so that it does not read too
+// far into the bitmap.
+// h must describe the initial word of the object.
+func (h heapBits) hasPointers(size uintptr) bool {
+ if size == ptrSize { // 1-word objects are always pointers
+ return true
+ }
+ // Otherwise, at least a 2-word object, and at least 2-word aligned,
+ // so h.shift is either 0 or 4, so we know we can get the bits for the
+ // first two words out of *h.bitp.
+ // If either of the first two words is a pointer, not pointer free.
+ b := uint32(*h.bitp >> h.shift)
+ if b&(bitPointer|bitPointer<<heapBitsShift) != 0 {
+ return true
+ }
+ if size == 2*ptrSize {
+ return false
+ }
+ // At least a 4-word object. Check scan bit (aka marked bit) in third word.
+ if h.shift == 0 {
+ return b&(bitMarked<<(2*heapBitsShift)) != 0
+ }
+ return uint32(*subtract1(h.bitp))&bitMarked != 0
}
// isCheckmarked reports whether the heap bits have the checkmarked bit set.
-func (h heapBits) isCheckmarked() bool {
- typ := h.typeBits()
- return typ == typeScalarCheckmarked || typ == typePointerCheckmarked
+// It must be told how large the object at h is, because the encoding of the
+// checkmark bit varies by size.
+// h must describe the initial word of the object.
+func (h heapBits) isCheckmarked(size uintptr) bool {
+ if size == ptrSize {
+ return (*h.bitp>>h.shift)&bitPointer != 0
+ }
+ // All multiword objects are 2-word aligned,
+ // so we know that the initial word's 2-bit pair
+ // and the second word's 2-bit pair are in the
+ // same heap bitmap byte, *h.bitp.
+ return (*h.bitp>>(heapBitsShift+h.shift))&bitMarked != 0
}
// setCheckmarked sets the checkmarked bit.
-func (h heapBits) setCheckmarked() {
- typ := h.typeBits()
- if typ == typeScalar {
- // Clear low type bit to turn 01 into 00.
- atomicand8(h.bitp, ^((1 << typeShift) << h.shift))
- } else if typ == typePointer {
- // Set low type bit to turn 10 into 11.
- atomicor8(h.bitp, (1<<typeShift)<<h.shift)
+// It must be told how large the object at h is, because the encoding of the
+// checkmark bit varies by size.
+// h must describe the initial word of the object.
+func (h heapBits) setCheckmarked(size uintptr) {
+ if size == ptrSize {
+ atomicor8(h.bitp, bitPointer<<h.shift)
+ return
+ }
+ atomicor8(h.bitp, bitMarked<<(heapBitsShift+h.shift))
+}
+
+// heapBitsBulkBarrier executes writebarrierptr_nostore
+// for every pointer slot in the memory range [p, p+size),
+// using the heap bitmap to locate those pointer slots.
+// This executes the write barriers necessary after a memmove.
+// Both p and size must be pointer-aligned.
+// The range [p, p+size) must lie within a single allocation.
+//
+// Callers should call heapBitsBulkBarrier immediately after
+// calling memmove(p, src, size). This function is marked nosplit
+// to avoid being preempted; the GC must not stop the goroutine
+// between the memmove and the execution of the barriers.
+//
+// The heap bitmap is not maintained for allocations containing
+// no pointers at all; any caller of heapBitsBulkBarrier must first
+// make sure the underlying allocation contains pointers, usually
+// by checking typ.kind&kindNoPointers.
+//
+//go:nosplit
+func heapBitsBulkBarrier(p, size uintptr) {
+ if (p|size)&(ptrSize-1) != 0 {
+ throw("heapBitsBulkBarrier: unaligned arguments")
+ }
+ if !writeBarrierEnabled || !inheap(p) {
+ return
+ }
+
+ h := heapBitsForAddr(p)
+ for i := uintptr(0); i < size; i += ptrSize {
+ if h.isPointer() {
+ x := (*uintptr)(unsafe.Pointer(p + i))
+ writebarrierptr_nostore(x, *x)
+ }
+ h = h.next()
}
}
@@ -291,99 +395,59 @@ func (h heapBits) initSpan(size, n, total uintptr) {
throw("initSpan: unaligned length")
}
nbyte := total / heapBitmapScale
+ if ptrSize == 8 && size == ptrSize {
+ end := h.bitp
+ bitp := subtractb(end, nbyte-1)
+ for {
+ *bitp = bitPointerAll
+ if bitp == end {
+ break
+ }
+ bitp = add1(bitp)
+ }
+ return
+ }
memclr(unsafe.Pointer(subtractb(h.bitp, nbyte-1)), nbyte)
}
// initCheckmarkSpan initializes a span for being checkmarked.
-// This would be a no-op except that we need to rewrite any
-// typeDead bits in the first word of the object into typeScalar
-// followed by a typeDead in the second word of the object.
+// It clears the checkmark bits, which are set to 1 in normal operation.
func (h heapBits) initCheckmarkSpan(size, n, total uintptr) {
- if size == ptrSize {
+ // The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
+ if ptrSize == 8 && size == ptrSize {
+ // Checkmark bit is type bit, bottom bit of every 2-bit entry.
// Only possible on 64-bit system, since minimum size is 8.
- // Must update both top and bottom nibble of each byte.
- // There is no second word in these objects, so all we have
- // to do is rewrite typeDead to typeScalar by adding the 1<<typeShift bit.
+ // Must clear type bit (checkmark bit) of every word.
+ // The type bit is the lower of every two-bit pair.
bitp := h.bitp
- for i := uintptr(0); i < n; i += 2 {
- x := int(*bitp)
-
- if (x>>typeShift)&typeMask == typeDead {
- x += (typeScalar - typeDead) << typeShift
- }
- if (x>>(4+typeShift))&typeMask == typeDead {
- x += (typeScalar - typeDead) << (4 + typeShift)
- }
- *bitp = uint8(x)
- bitp = subtractb(bitp, 1)
+ for i := uintptr(0); i < n; i += 4 {
+ *bitp &^= bitPointerAll
+ bitp = subtract1(bitp)
}
return
}
-
- // Update bottom nibble for first word of each object.
- // If the bottom nibble says typeDead, change to typeScalar
- // and clear top nibble to mark as typeDead.
- bitp := h.bitp
- step := size / heapBitmapScale
for i := uintptr(0); i < n; i++ {
- x := *bitp
- if (x>>typeShift)&typeMask == typeDead {
- x += (typeScalar - typeDead) << typeShift
- x &= 0x0f // clear top nibble to typeDead
- }
- bitp = subtractb(bitp, step)
+ *h.bitp &^= bitMarked << (heapBitsShift + h.shift)
+ h = h.forward(size / ptrSize)
}
}
-// clearCheckmarkSpan removes all the checkmarks from a span.
-// If it finds a multiword object starting with typeScalar typeDead,
-// it rewrites the heap bits to the simpler typeDead typeDead.
+// clearCheckmarkSpan undoes all the checkmarking in a span.
+// The actual checkmark bits are ignored, so the only work to do
+// is to fix the pointer bits. (Pointer bits are ignored by scanobject
+// but consulted by typedmemmove.)
func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
- if size == ptrSize {
+ // The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
+ if ptrSize == 8 && size == ptrSize {
+ // Checkmark bit is type bit, bottom bit of every 2-bit entry.
// Only possible on 64-bit system, since minimum size is 8.
- // Must update both top and bottom nibble of each byte.
- // typeScalarCheckmarked can be left as typeDead,
- // but we want to change typeScalar back to typeDead.
+ // Must clear type bit (checkmark bit) of every word.
+ // The type bit is the lower of every two-bit pair.
bitp := h.bitp
- for i := uintptr(0); i < n; i += 2 {
- x := int(*bitp)
- switch typ := (x >> typeShift) & typeMask; typ {
- case typeScalar:
- x += (typeDead - typeScalar) << typeShift
- case typePointerCheckmarked:
- x += (typePointer - typePointerCheckmarked) << typeShift
- }
-
- switch typ := (x >> (4 + typeShift)) & typeMask; typ {
- case typeScalar:
- x += (typeDead - typeScalar) << (4 + typeShift)
- case typePointerCheckmarked:
- x += (typePointer - typePointerCheckmarked) << (4 + typeShift)
- }
-
- *bitp = uint8(x)
- bitp = subtractb(bitp, 1)
- }
- return
- }
-
- // Update bottom nibble for first word of each object.
- // If the bottom nibble says typeScalarCheckmarked and the top is not typeDead,
- // change to typeScalar. Otherwise leave, since typeScalarCheckmarked == typeDead.
- // If the bottom nibble says typePointerCheckmarked, change to typePointer.
- bitp := h.bitp
- step := size / heapBitmapScale
- for i := uintptr(0); i < n; i++ {
- x := int(*bitp)
- switch typ := (x >> typeShift) & typeMask; {
- case typ == typeScalarCheckmarked && (x>>(4+typeShift))&typeMask != typeDead:
- x += (typeScalar - typeScalarCheckmarked) << typeShift
- case typ == typePointerCheckmarked:
- x += (typePointer - typePointerCheckmarked) << typeShift
+ for i := uintptr(0); i < n; i += 4 {
+ *bitp |= bitPointerAll
+ bitp = subtract1(bitp)
}
-
- *bitp = uint8(x)
- bitp = subtractb(bitp, step)
}
}
@@ -393,348 +457,1046 @@ func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
// bits for the first two words (or one for single-word objects) to typeDead
// and then calls f(p), where p is the object's base address.
// f is expected to add the object to a free list.
+// For non-free objects, heapBitsSweepSpan turns off the marked bit.
func heapBitsSweepSpan(base, size, n uintptr, f func(uintptr)) {
h := heapBitsForSpan(base)
- if size == ptrSize {
- // Only possible on 64-bit system, since minimum size is 8.
- // Must read and update both top and bottom nibble of each byte.
+ switch {
+ default:
+ throw("heapBitsSweepSpan")
+ case ptrSize == 8 && size == ptrSize:
+ // Consider mark bits in all four 2-bit entries of each bitmap byte.
bitp := h.bitp
- for i := uintptr(0); i < n; i += 2 {
- x := int(*bitp)
+ for i := uintptr(0); i < n; i += 4 {
+ x := uint32(*bitp)
+ // Note that unlike the other size cases, we leave the pointer bits set here.
+ // These are initialized during initSpan when the span is created and left
+ // in place the whole time the span is used for pointer-sized objects.
+ // That lets heapBitsSetType avoid an atomic update to set the pointer bit
+ // during allocation.
if x&bitMarked != 0 {
x &^= bitMarked
} else {
- x &^= typeMask << typeShift
f(base + i*ptrSize)
}
- if x&(bitMarked<<4) != 0 {
- x &^= bitMarked << 4
+ if x&(bitMarked<<heapBitsShift) != 0 {
+ x &^= bitMarked << heapBitsShift
} else {
- x &^= typeMask << (4 + typeShift)
f(base + (i+1)*ptrSize)
}
+ if x&(bitMarked<<(2*heapBitsShift)) != 0 {
+ x &^= bitMarked << (2 * heapBitsShift)
+ } else {
+ f(base + (i+2)*ptrSize)
+ }
+ if x&(bitMarked<<(3*heapBitsShift)) != 0 {
+ x &^= bitMarked << (3 * heapBitsShift)
+ } else {
+ f(base + (i+3)*ptrSize)
+ }
*bitp = uint8(x)
- bitp = subtractb(bitp, 1)
+ bitp = subtract1(bitp)
}
- return
- }
- bitp := h.bitp
- step := size / heapBitmapScale
- for i := uintptr(0); i < n; i++ {
- x := int(*bitp)
- if x&bitMarked != 0 {
- x &^= bitMarked
- } else {
- x = 0
- f(base + i*size)
+ case size%(4*ptrSize) == 0:
+ // Mark bit is in first word of each object.
+ // Each object starts at bit 0 of a heap bitmap byte.
+ bitp := h.bitp
+ step := size / heapBitmapScale
+ for i := uintptr(0); i < n; i++ {
+ x := uint32(*bitp)
+ if x&bitMarked != 0 {
+ x &^= bitMarked
+ } else {
+ x = 0
+ f(base + i*size)
+ }
+ *bitp = uint8(x)
+ bitp = subtractb(bitp, step)
+ }
+
+ case size%(4*ptrSize) == 2*ptrSize:
+ // Mark bit is in first word of each object,
+ // but every other object starts halfway through a heap bitmap byte.
+ // Unroll loop 2x to handle alternating shift count and step size.
+ bitp := h.bitp
+ step := size / heapBitmapScale
+ var i uintptr
+ for i = uintptr(0); i < n; i += 2 {
+ x := uint32(*bitp)
+ if x&bitMarked != 0 {
+ x &^= bitMarked
+ } else {
+ x &^= bitMarked | bitPointer | (bitMarked|bitPointer)<<heapBitsShift
+ f(base + i*size)
+ if size > 2*ptrSize {
+ x = 0
+ }
+ }
+ *bitp = uint8(x)
+ if i+1 >= n {
+ break
+ }
+ bitp = subtractb(bitp, step)
+ x = uint32(*bitp)
+ if x&(bitMarked<<(2*heapBitsShift)) != 0 {
+ x &^= bitMarked << (2 * heapBitsShift)
+ } else {
+ x &^= (bitMarked|bitPointer)<<(2*heapBitsShift) | (bitMarked|bitPointer)<<(3*heapBitsShift)
+ f(base + (i+1)*size)
+ if size > 2*ptrSize {
+ *subtract1(bitp) = 0
+ }
+ }
+ *bitp = uint8(x)
+ bitp = subtractb(bitp, step+1)
}
- *bitp = uint8(x)
- bitp = subtractb(bitp, step)
}
}
-// TODO(rsc): Clean up the next two functions.
-
// heapBitsSetType records that the new allocation [x, x+size)
// holds in [x, x+dataSize) one or more values of type typ.
// (The number of values is given by dataSize / typ.size.)
// If dataSize < size, the fragment [x+dataSize, x+size) is
// recorded as non-pointer data.
+// It is known that the type has pointers somewhere;
+// malloc does not call heapBitsSetType when there are no pointers,
+// because all free objects are marked as noscan during
+// heapBitsSweepSpan.
+// There can only be one allocation from a given span active at a time,
+// so this code is not racing with other instances of itself,
+// and we don't allocate from a span until it has been swept,
+// so this code is not racing with heapBitsSweepSpan.
+// It is, however, racing with the concurrent GC mark phase,
+// which can be setting the mark bit in the leading 2-bit entry
+// of an allocated block. The block we are modifying is not quite
+// allocated yet, so the GC marker is not racing with updates to x's bits,
+// but if the start or end of x shares a bitmap byte with an adjacent
+// object, the GC marker is racing with updates to those object's mark bits.
func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
- // From here till marked label marking the object as allocated
- // and storing type info in the GC bitmap.
- h := heapBitsForAddr(x)
+ const doubleCheck = false // slow but helpful; enable to test modifications to this code
- var ti, te uintptr
- var ptrmask *uint8
- if size == ptrSize {
+ // dataSize is always size rounded up to the next malloc size class,
+ // except in the case of allocating a defer block, in which case
+ // size is sizeof(_defer{}) (at least 6 words) and dataSize may be
+ // arbitrarily larger.
+ //
+ // The checks for size == ptrSize and size == 2*ptrSize can therefore
+ // assume that dataSize == size without checking it explicitly.
+
+ if ptrSize == 8 && size == ptrSize {
// It's one word and it has pointers, it must be a pointer.
- // The bitmap byte is shared with the one-word object
- // next to it, and concurrent GC might be marking that
- // object, so we must use an atomic update.
- atomicor8(h.bitp, typePointer<<(typeShift+h.shift))
+ // In general we'd need an atomic update here if the
+ // concurrent GC were marking objects in this span,
+ // because each bitmap byte describes 3 other objects
+ // in addition to the one being allocated.
+ // However, since all allocated one-word objects are pointers
+ // (non-pointers are aggregated into tinySize allocations),
+ // initSpan sets the pointer bits for us. Nothing to do here.
+ if doubleCheck {
+ h := heapBitsForAddr(x)
+ if !h.isPointer() {
+ throw("heapBitsSetType: pointer bit missing")
+ }
+ }
return
}
- if typ.kind&kindGCProg != 0 {
- nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize
- masksize := nptr
- if masksize%2 != 0 {
- masksize *= 2 // repeated
- }
- const typeBitsPerByte = 8 / typeBitsWidth
- masksize = masksize * typeBitsPerByte / 8 // 4 bits per word
- masksize++ // unroll flag in the beginning
- if masksize > maxGCMask && typ.gc[1] != 0 {
- // write barriers have not been updated to deal with this case yet.
- throw("maxGCMask too small for now")
- // If the mask is too large, unroll the program directly
- // into the GC bitmap. It's 7 times slower than copying
- // from the pre-unrolled mask, but saves 1/16 of type size
- // memory for the mask.
- systemstack(func() {
- unrollgcproginplace_m(unsafe.Pointer(x), typ, size, dataSize)
- })
+
+ h := heapBitsForAddr(x)
+ ptrmask := typ.gcdata // start of 1-bit pointer mask (or GC program, handled below)
+
+ // Heap bitmap bits for 2-word object are only 4 bits,
+ // so also shared with objects next to it; use atomic updates.
+ // This is called out as a special case primarily for 32-bit systems,
+ // so that on 32-bit systems the code below can assume all objects
+ // are 4-word aligned (because they're all 16-byte aligned).
+ if size == 2*ptrSize {
+ if typ.size == ptrSize {
+ // We're allocating a block big enough to hold two pointers.
+ // On 64-bit, that means the actual object must be two pointers,
+ // or else we'd have used the one-pointer-sized block.
+ // On 32-bit, however, this is the 8-byte block, the smallest one.
+ // So it could be that we're allocating one pointer and this was
+ // just the smallest block available. Distinguish by checking dataSize.
+ // (In general the number of instances of typ being allocated is
+ // dataSize/typ.size.)
+ if ptrSize == 4 && dataSize == ptrSize {
+ // 1 pointer.
+ if gcphase == _GCoff {
+ *h.bitp |= bitPointer << h.shift
+ } else {
+ atomicor8(h.bitp, bitPointer<<h.shift)
+ }
+ } else {
+ // 2-element slice of pointer.
+ if gcphase == _GCoff {
+ *h.bitp |= (bitPointer | bitPointer<<heapBitsShift) << h.shift
+ } else {
+ atomicor8(h.bitp, (bitPointer|bitPointer<<heapBitsShift)<<h.shift)
+ }
+ }
return
}
- ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0])))
- // Check whether the program is already unrolled
- // by checking if the unroll flag byte is set
- maskword := uintptr(atomicloadp(unsafe.Pointer(ptrmask)))
- if *(*uint8)(unsafe.Pointer(&maskword)) == 0 {
- systemstack(func() {
- unrollgcprog_m(typ)
- })
+ // Otherwise typ.size must be 2*ptrSize, and typ.kind&kindGCProg == 0.
+ if doubleCheck {
+ if typ.size != 2*ptrSize || typ.kind&kindGCProg != 0 {
+ print("runtime: heapBitsSetType size=", size, " but typ.size=", typ.size, " gcprog=", typ.kind&kindGCProg != 0, "\n")
+ throw("heapBitsSetType")
+ }
+ }
+ b := uint32(*ptrmask)
+ hb := b & 3
+ if gcphase == _GCoff {
+ *h.bitp |= uint8(hb << h.shift)
+ } else {
+ atomicor8(h.bitp, uint8(hb<<h.shift))
}
- ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte
+ return
+ }
+
+ // Copy from 1-bit ptrmask into 2-bit bitmap.
+ // The basic approach is to use a single uintptr as a bit buffer,
+ // alternating between reloading the buffer and writing bitmap bytes.
+ // In general, one load can supply two bitmap byte writes.
+ // This is a lot of lines of code, but it compiles into relatively few
+ // machine instructions.
+
+ var (
+ // Ptrmask input.
+ p *byte // last ptrmask byte read
+ b uintptr // ptrmask bits already loaded
+ nb uintptr // number of bits in b at next read
+ endp *byte // final ptrmask byte to read (then repeat)
+ endnb uintptr // number of valid bits in *endp
+ pbits uintptr // alternate source of bits
+
+ // Heap bitmap output.
+ w uintptr // words processed
+ nw uintptr // number of words to process
+ hbitp *byte // next heap bitmap byte to write
+ hb uintptr // bits being prepared for *hbitp
+ )
+
+ hbitp = h.bitp
+
+ // Handle GC program. Delayed until this part of the code
+ // so that we can use the same double-checking mechanism
+ // as the 1-bit case. Nothing above could have encountered
+ // GC programs: the cases were all too small.
+ if typ.kind&kindGCProg != 0 {
+ heapBitsSetTypeGCProg(h, typ.ptrdata, typ.size, dataSize, size, addb(typ.gcdata, 4))
+ if doubleCheck {
+ // Double-check the heap bits written by GC program
+ // by running the GC program to create a 1-bit pointer mask
+ // and then jumping to the double-check code below.
+ // This doesn't catch bugs shared between the 1-bit and 4-bit
+ // GC program execution, but it does catch mistakes specific
+ // to just one of those and bugs in heapBitsSetTypeGCProg's
+ // implementation of arrays.
+ lock(&debugPtrmask.lock)
+ if debugPtrmask.data == nil {
+ debugPtrmask.data = (*byte)(persistentalloc(1<<20, 1, &memstats.other_sys))
+ }
+ ptrmask = debugPtrmask.data
+ runGCProg(addb(typ.gcdata, 4), nil, ptrmask, 1)
+ goto Phase4
+ }
+ return
+ }
+
+ // Note about sizes:
+ //
+ // typ.size is the number of words in the object,
+ // and typ.ptrdata is the number of words in the prefix
+ // of the object that contains pointers. That is, the final
+ // typ.size - typ.ptrdata words contain no pointers.
+ // This allows optimization of a common pattern where
+ // an object has a small header followed by a large scalar
+ // buffer. If we know the pointers are over, we don't have
+ // to scan the buffer's heap bitmap at all.
+ // The 1-bit ptrmasks are sized to contain only bits for
+ // the typ.ptrdata prefix, zero padded out to a full byte
+ // of bitmap. This code sets nw (below) so that heap bitmap
+ // bits are only written for the typ.ptrdata prefix; if there is
+ // more room in the allocated object, the next heap bitmap
+ // entry is a 00, indicating that there are no more pointers
+ // to scan. So only the ptrmask for the ptrdata bytes is needed.
+ //
+ // Replicated copies are not as nice: if there is an array of
+ // objects with scalar tails, all but the last tail does have to
+ // be initialized, because there is no way to say "skip forward".
+ // However, because of the possibility of a repeated type with
+ // size not a multiple of 4 pointers (one heap bitmap byte),
+ // the code already must handle the last ptrmask byte specially
+ // by treating it as containing only the bits for endnb pointers,
+ // where endnb <= 4. We represent large scalar tails that must
+ // be expanded in the replication by setting endnb larger than 4.
+ // This will have the effect of reading many bits out of b,
+ // but once the real bits are shifted out, b will supply as many
+ // zero bits as we try to read, which is exactly what we need.
+
+ p = ptrmask
+ if typ.size < dataSize {
+ // Filling in bits for an array of typ.
+ // Set up for repetition of ptrmask during main loop.
+ // Note that ptrmask describes only a prefix of the element
+ // (the typ.ptrdata words); bits past it read as zero (see note above).
+ const maxBits = ptrSize*8 - 7
+ if typ.ptrdata/ptrSize <= maxBits {
+ // Entire ptrmask fits in uintptr with room for a byte fragment.
+ // Load into pbits and never read from ptrmask again.
+ // This is especially important when the ptrmask has
+ // fewer than 8 bits in it; otherwise the reload in the middle
+ // of the Phase 2 loop would itself need to loop to gather
+ // at least 8 bits.
+
+ // Accumulate ptrmask into b.
+ // ptrmask is sized to describe only typ.ptrdata, but we record
+ // it as describing typ.size bytes, since all the high bits are zero.
+ nb = typ.ptrdata / ptrSize
+ for i := uintptr(0); i < nb; i += 8 {
+ b |= uintptr(*p) << i
+ p = add1(p)
+ }
+ nb = typ.size / ptrSize
+
+ // Replicate ptrmask to fill entire pbits uintptr.
+ // Doubling and truncating is fewer steps than
+ // iterating by nb each time. (nb could be 1.)
+ // Since we loaded typ.ptrdata/ptrSize bits
+ // but are pretending to have typ.size/ptrSize,
+ // there might be no replication necessary/possible.
+ pbits = b
+ endnb = nb
+ if nb+nb <= maxBits {
+ for endnb <= ptrSize*8 {
+ pbits |= pbits << endnb
+ endnb += endnb
+ }
+ // Truncate to a multiple of original ptrmask.
+ endnb = maxBits / nb * nb
+ pbits &= 1<<endnb - 1
+ b = pbits
+ nb = endnb
+ }
+
+ // Clear p and endp as sentinel for using pbits.
+ // Checked during Phase 2 loop.
+ p = nil
+ endp = nil
+ } else {
+ // Ptrmask is larger. Read it multiple times.
+ n := (typ.ptrdata/ptrSize+7)/8 - 1
+ endp = addb(ptrmask, n)
+ endnb = typ.size/ptrSize - n*8
+ }
+ }
+ if p != nil {
+ b = uintptr(*p)
+ p = add1(p)
+ nb = 8
+ }
+
+ if typ.size == dataSize {
+ // Single entry: can stop once we reach the non-pointer data.
+ nw = typ.ptrdata / ptrSize
} else {
- ptrmask = (*uint8)(unsafe.Pointer(typ.gc[0])) // pointer to unrolled mask
+ // Repeated instances of typ in an array.
+ // Have to process first N-1 entries in full, but can stop
+ // once we reach the non-pointer data in the final entry.
+ nw = ((dataSize/typ.size-1)*typ.size + typ.ptrdata) / ptrSize
}
- if size == 2*ptrSize {
- // h.shift is 0 for all sizes > ptrSize.
- *h.bitp = *ptrmask
+ if nw == 0 {
+ // No pointers! Caller was supposed to check.
+ println("runtime: invalid type ", *typ._string)
+ throw("heapBitsSetType: called with non-pointer type")
return
}
- te = uintptr(typ.size) / ptrSize
- // If the type occupies odd number of words, its mask is repeated.
- if te%2 == 0 {
- te /= 2
+ if nw < 2 {
+ // Must write at least 2 words, because the "no scan"
+ // encoding doesn't take effect until the third word.
+ nw = 2
}
- // Copy pointer bitmask into the bitmap.
- // TODO(rlh): add comment addressing the following concerns:
- // If size > 2*ptrSize, is x guaranteed to be at least 2*ptrSize-aligned?
- // And if type occupies and odd number of words, why are we only going through half
- // of ptrmask and why don't we have to shift everything by 4 on odd iterations?
- for i := uintptr(0); i < dataSize; i += 2 * ptrSize {
- v := *(*uint8)(add(unsafe.Pointer(ptrmask), ti))
- ti++
- if ti == te {
- ti = 0
+ // Phase 1: Special case for leading byte (shift==0) or half-byte (shift==4).
+ // The leading byte is special because it contains the bits for words 0 and 1,
+ // which do not have the marked bits set.
+ // The leading half-byte is special because it's half a byte and must be
+ // manipulated atomically.
+ switch {
+ default:
+ throw("heapBitsSetType: unexpected shift")
+
+ case h.shift == 0:
+ // Ptrmask and heap bitmap are aligned.
+ // Handle first byte of bitmap specially.
+ // The first byte we write out contains the first two words of the object.
+ // In those words, the mark bits are mark and checkmark, respectively,
+ // and must not be set. In all following words, we want to set the mark bit
+ // as a signal that the object continues to the next 2-bit entry in the bitmap.
+ hb = b & bitPointerAll
+ hb |= bitMarked<<(2*heapBitsShift) | bitMarked<<(3*heapBitsShift)
+ if w += 4; w >= nw {
+ goto Phase3
}
- if i+ptrSize == dataSize {
- v &^= typeMask << (4 + typeShift)
+ *hbitp = uint8(hb)
+ hbitp = subtract1(hbitp)
+ b >>= 4
+ nb -= 4
+
+ case ptrSize == 8 && h.shift == 2:
+ // Ptrmask and heap bitmap are misaligned.
+ // The bits for the first two words are in a byte shared with another object
+ // and must be updated atomically.
+ // NOTE(rsc): The atomic here may not be necessary.
+ // We took care of 1-word and 2-word objects above,
+ // so this is at least a 6-word object, so our start bits
+ // are shared only with the type bits of another object,
+ // not with its mark bit. Since there is only one allocation
+ // from a given span at a time, we should be able to set
+ // these bits non-atomically. Not worth the risk right now.
+ hb = (b & 3) << (2 * heapBitsShift)
+ b >>= 2
+ nb -= 2
+ // Note: no bitMarker in hb because the first two words don't get markers from us.
+ if gcphase == _GCoff {
+ *hbitp |= uint8(hb)
+ } else {
+ atomicor8(hbitp, uint8(hb))
+ }
+ hbitp = subtract1(hbitp)
+ if w += 2; w >= nw {
+ // We know that there is more data, because we handled 2-word objects above.
+ // This must be at least a 6-word object. If we're out of pointer words,
+ // mark no scan in next bitmap byte and finish.
+ hb = 0
+ w += 4
+ goto Phase3
+ }
+ }
+
+ // Phase 2: Full bytes in bitmap, up to but not including write to last byte (full or partial) in bitmap.
+ // The loop computes the bits for that last write but does not execute the write;
+ // it leaves the bits in hb for processing by phase 3.
+ // To avoid repeated adjustment of nb, we subtract out the 4 bits we're going to
+ // use in the first half of the loop right now, and then we only adjust nb explicitly
+ // if the 8 bits used by each iteration isn't balanced by 8 bits loaded mid-loop.
+ nb -= 4
+ for {
+ // Emit bitmap byte.
+ // b has at least nb+4 bits, with one exception:
+ // if w+4 >= nw, then b has only nw-w bits,
+ // but we'll stop at the break and then truncate
+ // appropriately in Phase 3.
+ hb = b & bitPointerAll
+ hb |= bitMarkedAll
+ if w += 4; w >= nw {
+ break
}
+ *hbitp = uint8(hb)
+ hbitp = subtract1(hbitp)
+ b >>= 4
- *h.bitp = v
- h.bitp = subtractb(h.bitp, 1)
+ // Load more bits. b has nb right now.
+ if p != endp {
+ // Fast path: keep reading from ptrmask.
+ // nb unmodified: we just loaded 8 bits,
+ // and the next iteration will consume 8 bits,
+ // leaving us with the same nb the next time we're here.
+ b |= uintptr(*p) << nb
+ p = add1(p)
+ } else if p == nil {
+ // Almost as fast path: track bit count and refill from pbits.
+ // For short repetitions.
+ if nb < 8 {
+ b |= pbits << nb
+ nb += endnb
+ }
+ nb -= 8 // for next iteration
+ } else {
+ // Slow path: reached end of ptrmask.
+ // Process final partial byte and rewind to start.
+ b |= uintptr(*p) << nb
+ nb += endnb
+ if nb < 8 {
+ b |= uintptr(*ptrmask) << nb
+ p = add1(ptrmask)
+ } else {
+ nb -= 8
+ p = ptrmask
+ }
+ }
+
+ // Emit bitmap byte.
+ hb = b & bitPointerAll
+ hb |= bitMarkedAll
+ if w += 4; w >= nw {
+ break
+ }
+ *hbitp = uint8(hb)
+ hbitp = subtract1(hbitp)
+ b >>= 4
}
- if dataSize%(2*ptrSize) == 0 && dataSize < size {
- // Mark the word after last object's word as typeDead.
- *h.bitp = 0
+
+Phase3:
+ // Phase 3: Write last byte or partial byte and zero the rest of the bitmap entries.
+ if w > nw {
+ // Counting the 4 entries in hb not yet written to memory,
+ // there are more entries than possible pointer slots.
+ // Discard the excess entries (can't be more than 3).
+ mask := uintptr(1)<<(4-(w-nw)) - 1
+ hb &= mask | mask<<4 // apply mask to both pointer bits and mark bits
+ }
+
+ // Change nw from counting possibly-pointer words to total words in allocation.
+ nw = size / ptrSize
+
+ // Write whole bitmap bytes.
+ // The first is hb, the rest are zero.
+ if w <= nw {
+ *hbitp = uint8(hb)
+ hbitp = subtract1(hbitp)
+ hb = 0 // for possible final half-byte below
+ for w += 4; w <= nw; w += 4 {
+ *hbitp = 0
+ hbitp = subtract1(hbitp)
+ }
+ }
+
+ // Write final partial bitmap byte if any.
+ // We know w > nw, or else we'd still be in the loop above.
+ // It can be bigger only due to the 4 entries in hb that it counts.
+ // If w == nw+4 then there's nothing left to do: we wrote all nw entries
+ // and can discard the 4 sitting in hb.
+ // But if w == nw+2, we need to write first two in hb.
+ // The byte is shared with the next object so we may need an atomic.
+ if w == nw+2 {
+ if gcphase == _GCoff {
+ *hbitp = *hbitp&^(bitPointer|bitMarked|(bitPointer|bitMarked)<<heapBitsShift) | uint8(hb)
+ } else {
+ atomicand8(hbitp, ^uint8(bitPointer|bitMarked|(bitPointer|bitMarked)<<heapBitsShift))
+ atomicor8(hbitp, uint8(hb))
+ }
+ }
+
+Phase4:
+ // Phase 4: all done, but perhaps double check.
+ if doubleCheck {
+ end := heapBitsForAddr(x + size)
+ if typ.kind&kindGCProg == 0 && (hbitp != end.bitp || (w == nw+2) != (end.shift == 2)) {
+ println("ended at wrong bitmap byte for", *typ._string, "x", dataSize/typ.size)
+ print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
+ print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
+ h0 := heapBitsForAddr(x)
+ print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
+ print("ended at hbitp=", hbitp, " but next starts at bitp=", end.bitp, " shift=", end.shift, "\n")
+ throw("bad heapBitsSetType")
+ }
+
+ // Double-check that bits to be written were written correctly.
+ // Does not check that other bits were not written, unfortunately.
+ h := heapBitsForAddr(x)
+ nptr := typ.ptrdata / ptrSize
+ ndata := typ.size / ptrSize
+ count := dataSize / typ.size
+ totalptr := ((count-1)*typ.size + typ.ptrdata) / ptrSize
+ for i := uintptr(0); i < size/ptrSize; i++ {
+ j := i % ndata
+ var have, want uint8
+ have = (*h.bitp >> h.shift) & (bitPointer | bitMarked)
+ if i >= totalptr {
+ want = 0 // deadmarker
+ if typ.kind&kindGCProg != 0 && i < (totalptr+3)/4*4 {
+ want = bitMarked
+ }
+ } else {
+ if j < nptr && (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
+ want |= bitPointer
+ }
+ if i >= 2 {
+ want |= bitMarked
+ } else {
+ have &^= bitMarked
+ }
+ }
+ if have != want {
+ println("mismatch writing bits for", *typ._string, "x", dataSize/typ.size)
+ print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
+ print("kindGCProg=", typ.kind&kindGCProg != 0, "\n")
+ print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
+ h0 := heapBitsForAddr(x)
+ print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
+ print("current bits h.bitp=", h.bitp, " h.shift=", h.shift, " *h.bitp=", hex(*h.bitp), "\n")
+ print("ptrmask=", ptrmask, " p=", p, " endp=", endp, " endnb=", endnb, " pbits=", hex(pbits), " b=", hex(b), " nb=", nb, "\n")
+ println("at word", i, "offset", i*ptrSize, "have", have, "want", want)
+ if typ.kind&kindGCProg != 0 {
+ println("GC program:")
+ dumpGCProg(addb(typ.gcdata, 4))
+ }
+ throw("bad heapBitsSetType")
+ }
+ h = h.next()
+ }
+ if ptrmask == debugPtrmask.data {
+ unlock(&debugPtrmask.lock)
+ }
+ }
}
-// typeBitmapInHeapBitmapFormat returns a bitmap holding
-// the type bits for the type typ, but expanded into heap bitmap format
-// to make it easier to copy them into the heap bitmap.
-// TODO(rsc): Change clients to use the type bitmap format instead,
-// which can be stored more densely (especially if we drop to 1 bit per pointer).
+// debugPtrmask is scratch state for heapBitsSetType's doubleCheck mode:
+// data is a lazily persistentalloc'd (1<<20-byte) buffer that holds a
+// 1-bit pointer mask produced by runGCProg for comparison against the
+// heap bitmap; lock serializes its allocation and use.
+var debugPtrmask struct {
+ lock mutex
+ data *byte
+}
+
+// heapBitsSetTypeGCProg implements heapBitsSetType using a GC program.
+// progSize is the size of the memory described by the program.
+// elemSize is the size of the element that the GC program describes (a prefix of).
+// dataSize is the total size of the intended data, a multiple of elemSize.
+// allocSize is the total size of the allocated memory.
//
-// To make it easier to replicate the bits when filling out the heap
-// bitmap for an array of typ, if typ holds an odd number of words
-// (meaning the heap bitmap would stop halfway through a byte),
-// typeBitmapInHeapBitmapFormat returns the bitmap for two instances
-// of typ in a row.
-// TODO(rsc): Remove doubling.
-func typeBitmapInHeapBitmapFormat(typ *_type) []uint8 {
- var ptrmask *uint8
- nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize
- if typ.kind&kindGCProg != 0 {
- masksize := nptr
- if masksize%2 != 0 {
- masksize *= 2 // repeated
+// GC programs are only used for large allocations.
+// heapBitsSetType requires that allocSize is a multiple of 4 words,
+// so that the relevant bitmap bytes are not shared with surrounding
+// objects and need not be accessed with atomic instructions.
+func heapBitsSetTypeGCProg(h heapBits, progSize, elemSize, dataSize, allocSize uintptr, prog *byte) {
+ if ptrSize == 8 && allocSize%(4*ptrSize) != 0 {
+ // Alignment will be wrong.
+ throw("heapBitsSetTypeGCProg: small allocation")
+ }
+ var totalBits uintptr
+ if elemSize == dataSize {
+ // Single element: run the program once, directly into the heap bitmap.
+ totalBits = runGCProg(prog, nil, h.bitp, 2)
+ if totalBits*ptrSize != progSize {
+ println("runtime: heapBitsSetTypeGCProg: total bits", totalBits, "but progSize", progSize)
+ throw("heapBitsSetTypeGCProg: unexpected bit count")
+ }
+ } else {
+ count := dataSize / elemSize
+
+ // Piece together program trailer to run after prog that does:
+ // literal(0)
+ // repeat(1, elemSize-progSize-1) // zeros to fill element size
+ // repeat(elemSize, count-1) // repeat that element for count
+ // This zero-pads the data remaining in the first element and then
+ // repeats that first element to fill the array.
+ var trailer [40]byte // 3 varints (max 10 each) + some bytes
+ i := 0
+ if n := elemSize/ptrSize - progSize/ptrSize; n > 0 {
+ // literal(0)
+ trailer[i] = 0x01
+ i++
+ trailer[i] = 0
+ i++
+ if n > 1 {
+ // repeat(1, n-1)
+ trailer[i] = 0x81
+ i++
+ n--
+ for ; n >= 0x80; n >>= 7 {
+ trailer[i] = byte(n | 0x80)
+ i++
+ }
+ trailer[i] = byte(n)
+ i++
+ }
}
- const typeBitsPerByte = 8 / typeBitsWidth
- masksize = masksize * typeBitsPerByte / 8 // 4 bits per word
- masksize++ // unroll flag in the beginning
- if masksize > maxGCMask && typ.gc[1] != 0 {
- // write barriers have not been updated to deal with this case yet.
- throw("maxGCMask too small for now")
+ // repeat(elemSize/ptrSize, count-1)
+ trailer[i] = 0x80
+ i++
+ n := elemSize / ptrSize
+ for ; n >= 0x80; n >>= 7 {
+ trailer[i] = byte(n | 0x80)
+ i++
}
- ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0])))
- // Check whether the program is already unrolled
- // by checking if the unroll flag byte is set
- maskword := uintptr(atomicloadp(unsafe.Pointer(ptrmask)))
- if *(*uint8)(unsafe.Pointer(&maskword)) == 0 {
- systemstack(func() {
- unrollgcprog_m(typ)
- })
+ trailer[i] = byte(n)
+ i++
+ n = count
+ for ; n >= 0x80; n >>= 7 {
+ trailer[i] = byte(n | 0x80)
+ i++
}
- ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte
- } else {
- ptrmask = (*uint8)(unsafe.Pointer(typ.gc[0])) // pointer to unrolled mask
+ trailer[i] = byte(n)
+ i++
+ // Terminate the trailer with the stop instruction (0x00).
+ trailer[i] = 0
+ i++
+
+ runGCProg(prog, &trailer[0], h.bitp, 2)
+
+ // Even though we filled in the full array just now,
+ // record that we only filled in up to the ptrdata of the
+ // last element. This will cause the code below to
+ // memclr the dead section of the final array element,
+ // so that scanobject can stop early in the final element.
+ totalBits = (elemSize*(count-1) + progSize) / ptrSize
+ }
+ // Zero the bitmap bytes between the end of the program's output
+ // (endProg) and the end of the allocation (endAlloc), marking the
+ // tail of the allocation as containing no pointers.
+ endProg := unsafe.Pointer(subtractb(h.bitp, (totalBits+3)/4))
+ endAlloc := unsafe.Pointer(subtractb(h.bitp, allocSize/heapBitmapScale))
+ memclr(add(endAlloc, 1), uintptr(endProg)-uintptr(endAlloc))
+}
+
+// progToPointerMask returns the 1-bit pointer mask output by the GC program prog.
+// size is the size of the region described by prog, in bytes.
+// The resulting bitvector will have no more than size/ptrSize bits.
+func progToPointerMask(prog *byte, size uintptr) bitvector {
+ n := (size/ptrSize + 7) / 8
+ x := (*[1 << 30]byte)(persistentalloc(n+1, 1, &memstats.buckhash_sys))[:n+1]
+ x[len(x)-1] = 0xa1 // overflow check sentinel
+ n = runGCProg(prog, nil, &x[0], 1)
+ if x[len(x)-1] != 0xa1 {
+ throw("progToPointerMask: overflow")
}
- return (*[1 << 30]byte)(unsafe.Pointer(ptrmask))[:(nptr+1)/2]
+ // n is now the number of 1-bit entries the program actually emitted.
+ return bitvector{int32(n), &x[0]}
}
-// GC type info programs
+// Packed GC pointer bitmaps, aka GC programs.
//
-// TODO(rsc): Clean up and enable.
+// For large types containing arrays, the type information has a
+// natural repetition that can be encoded to save space in the
+// binary and in the memory representation of the type information.
+//
+// The encoding is a simple Lempel-Ziv style bytecode machine
+// with the following instructions:
+//
+// 00000000: stop
+// 0nnnnnnn: emit n bits copied from the next (n+7)/8 bytes
+// 10000000 n c: repeat the previous n bits c times; n, c are varints
+// 1nnnnnnn c: repeat the previous n bits c times; c is a varint
-const (
- // GC type info programs.
- // The programs allow to store type info required for GC in a compact form.
- // Most importantly arrays take O(1) space instead of O(n).
- // The program grammar is:
- //
- // Program = {Block} "insEnd"
- // Block = Data | Array
- // Data = "insData" DataSize DataBlock
- // DataSize = int // size of the DataBlock in bit pairs, 1 byte
- // DataBlock = binary // dense GC mask (2 bits per word) of size ]DataSize/4[ bytes
- // Array = "insArray" ArrayLen Block "insArrayEnd"
- // ArrayLen = int // length of the array, 8 bytes (4 bytes for 32-bit arch)
- //
- // Each instruction (insData, insArray, etc) is 1 byte.
- // For example, for type struct { x []byte; y [20]struct{ z int; w *byte }; }
- // the program looks as:
- //
- // insData 3 (typePointer typeScalar typeScalar)
- // insArray 20 insData 2 (typeScalar typePointer) insArrayEnd insEnd
- //
- // Total size of the program is 17 bytes (13 bytes on 32-bits).
- // The corresponding GC mask would take 43 bytes (it would be repeated
- // because the type has odd number of words).
- insData = 1 + iota
- insArray
- insArrayEnd
- insEnd
+// runGCProg executes the GC program prog, and then trailer if non-nil,
+// writing to dst with entries of the given size.
+// If size == 1, dst is a 1-bit pointer mask laid out moving forward from dst.
+// If size == 2, dst is the 2-bit heap bitmap, and writes move backward
+// starting at dst (because the heap bitmap does). In this case, the caller guarantees
+// that only whole bytes in dst need to be written.
+//
+// runGCProg returns the number of 1- or 2-bit entries written to memory.
+func runGCProg(prog, trailer, dst *byte, size int) uintptr {
+ dstStart := dst
- // 64 bytes cover objects of size 1024/512 on 64/32 bits, respectively.
- maxGCMask = 65536 // TODO(rsc): change back to 64
-)
+ // Bits waiting to be written to memory.
+ var bits uintptr
+ var nbits uintptr
-// Recursively unrolls GC program in prog.
-// mask is where to store the result.
-// If inplace is true, store the result not in mask but in the heap bitmap for mask.
-// ppos is a pointer to position in mask, in bits.
-// sparse says to generate 4-bits per word mask for heap (1-bit for data/bss otherwise).
-//go:nowritebarrier
-func unrollgcprog1(maskp *byte, prog *byte, ppos *uintptr, inplace, sparse bool) *byte {
- pos := *ppos
- mask := (*[1 << 30]byte)(unsafe.Pointer(maskp))
+ p := prog
+Run:
for {
- switch *prog {
- default:
- throw("unrollgcprog: unknown instruction")
+ // Flush accumulated full bytes.
+ // The rest of the loop assumes that nbits <= 7.
+ for ; nbits >= 8; nbits -= 8 {
+ if size == 1 {
+ *dst = uint8(bits)
+ dst = add1(dst)
+ bits >>= 8
+ } else {
+ v := bits&bitPointerAll | bitMarkedAll
+ *dst = uint8(v)
+ dst = subtract1(dst)
+ bits >>= 4
+ v = bits&bitPointerAll | bitMarkedAll
+ *dst = uint8(v)
+ dst = subtract1(dst)
+ bits >>= 4
+ }
+ }
- case insData:
- prog = addb(prog, 1)
- siz := int(*prog)
- prog = addb(prog, 1)
- p := (*[1 << 30]byte)(unsafe.Pointer(prog))
- for i := 0; i < siz; i++ {
- const typeBitsPerByte = 8 / typeBitsWidth
- v := p[i/typeBitsPerByte]
- v >>= (uint(i) % typeBitsPerByte) * typeBitsWidth
- v &= typeMask
- if inplace {
- // Store directly into GC bitmap.
- h := heapBitsForAddr(uintptr(unsafe.Pointer(&mask[pos])))
- if h.shift == 0 {
- *h.bitp = v << typeShift
- } else {
- *h.bitp |= v << (4 + typeShift)
- }
- pos += ptrSize
- } else if sparse {
- // 4-bits per word, type bits in high bits
- v <<= (pos % 8) + typeShift
- mask[pos/8] |= v
- pos += heapBitsWidth
+ // Process one instruction.
+ inst := uintptr(*p)
+ p = add1(p)
+ n := inst & 0x7F
+ if inst&0x80 == 0 {
+ // Literal bits; n == 0 means end of program.
+ if n == 0 {
+ // Program is over; continue in trailer if present.
+ if trailer != nil {
+ //println("trailer")
+ p = trailer
+ trailer = nil
+ continue
+ }
+ //println("done")
+ break Run
+ }
+ //println("lit", n, dst)
+ nbyte := n / 8
+ for i := uintptr(0); i < nbyte; i++ {
+ bits |= uintptr(*p) << nbits
+ p = add1(p)
+ if size == 1 {
+ *dst = uint8(bits)
+ dst = add1(dst)
+ bits >>= 8
} else {
- // 1 bit per word, for data/bss bitmap
- v >>= 1 // convert typePointer to 1, others to 0
- mask[pos/8] |= v << (pos % 8)
- pos++
+ v := bits&0xf | bitMarkedAll
+ *dst = uint8(v)
+ dst = subtract1(dst)
+ bits >>= 4
+ v = bits&0xf | bitMarkedAll
+ *dst = uint8(v)
+ dst = subtract1(dst)
+ bits >>= 4
+ }
+ }
+ if n %= 8; n > 0 {
+ bits |= uintptr(*p) << nbits
+ p = add1(p)
+ nbits += n
+ }
+ continue Run
+ }
+
+ // Repeat. If n == 0, it is encoded in a varint in the next bytes.
+ if n == 0 {
+ for off := uint(0); ; off += 7 {
+ x := uintptr(*p)
+ p = add1(p)
+ n |= (x & 0x7F) << off
+ if x&0x80 == 0 {
+ break
+ }
+ }
+ }
+
+ // Count is encoded in a varint in the next bytes.
+ c := uintptr(0)
+ for off := uint(0); ; off += 7 {
+ x := uintptr(*p)
+ p = add1(p)
+ c |= (x & 0x7F) << off
+ if x&0x80 == 0 {
+ break
+ }
+ }
+ c *= n // now total number of bits to copy
+
+ // If the number of bits being repeated is small, load them
+ // into a register and use that register for the entire loop
+ // instead of repeatedly reading from memory.
+ // Handling fewer than 8 bits here makes the general loop simpler.
+ // The cutoff is ptrSize*8 - 7 to guarantee that when we add
+ // the pattern to a bit buffer holding at most 7 bits (a partial byte)
+ // it will not overflow.
+ src := dst
+ const maxBits = ptrSize*8 - 7
+ if n <= maxBits {
+ // Start with bits in output buffer.
+ pattern := bits
+ npattern := nbits
+
+ // If we need more bits, fetch them from memory.
+ if size == 1 {
+ src = subtract1(src)
+ for npattern < n {
+ pattern <<= 8
+ pattern |= uintptr(*src)
+ src = subtract1(src)
+ npattern += 8
+ }
+ } else {
+ src = add1(src)
+ for npattern < n {
+ pattern <<= 4
+ pattern |= uintptr(*src) & 0xf
+ src = add1(src)
+ npattern += 4
}
}
- prog = addb(prog, round(uintptr(siz)*typeBitsWidth, 8)/8)
- case insArray:
- prog = (*byte)(add(unsafe.Pointer(prog), 1))
- siz := uintptr(0)
- for i := uintptr(0); i < ptrSize; i++ {
- siz = (siz << 8) + uintptr(*(*byte)(add(unsafe.Pointer(prog), ptrSize-i-1)))
+ // We started with the whole bit output buffer,
+ // and then we loaded bits from whole bytes.
+ // Either way, we might now have too many instead of too few.
+ // Discard the extra.
+ if npattern > n {
+ pattern >>= npattern - n
+ npattern = n
}
- prog = (*byte)(add(unsafe.Pointer(prog), ptrSize))
- var prog1 *byte
- for i := uintptr(0); i < siz; i++ {
- prog1 = unrollgcprog1(&mask[0], prog, &pos, inplace, sparse)
+
+ // Replicate pattern to at most maxBits.
+ if npattern == 1 {
+ // One bit being repeated.
+ // If the bit is 1, make the pattern all 1s.
+ // If the bit is 0, the pattern is already all 0s,
+ // but we can claim that the number of bits
+ // in the word is equal to the number we need (c),
+ // because right shift of bits will zero fill.
+ if pattern == 1 {
+ pattern = 1<<maxBits - 1
+ npattern = maxBits
+ } else {
+ npattern = c
+ }
+ } else {
+ b := pattern
+ nb := npattern
+ if nb+nb <= maxBits {
+ // Double pattern until the whole uintptr is filled.
+ for nb <= ptrSize*8 {
+ b |= b << nb
+ nb += nb
+ }
+ // Trim away incomplete copy of original pattern in high bits.
+ // TODO(rsc): Replace with table lookup or loop on systems without divide?
+ nb = maxBits / npattern * npattern
+ b &= 1<<nb - 1
+ pattern = b
+ npattern = nb
+ }
}
- if *prog1 != insArrayEnd {
- throw("unrollgcprog: array does not end with insArrayEnd")
+
+ // Add pattern to bit buffer and flush bit buffer, c/npattern times.
+ // Since pattern contains >8 bits, there will be full bytes to flush
+ // on each iteration.
+ for ; c >= npattern; c -= npattern {
+ bits |= pattern << nbits
+ nbits += npattern
+ if size == 1 {
+ for nbits >= 8 {
+ *dst = uint8(bits)
+ dst = add1(dst)
+ bits >>= 8
+ nbits -= 8
+ }
+ } else {
+ for nbits >= 4 {
+ *dst = uint8(bits&0xf | bitMarkedAll)
+ dst = subtract1(dst)
+ bits >>= 4
+ nbits -= 4
+ }
+ }
}
- prog = (*byte)(add(unsafe.Pointer(prog1), 1))
- case insArrayEnd, insEnd:
- *ppos = pos
- return prog
+ // Add final fragment to bit buffer.
+ if c > 0 {
+ pattern &= 1<<c - 1
+ bits |= pattern << nbits
+ nbits += c
+ }
+ continue Run
}
- }
-}
-
-// Unrolls GC program prog for data/bss, returns dense GC mask.
-func unrollglobgcprog(prog *byte, size uintptr) bitvector {
- masksize := round(round(size, ptrSize)/ptrSize, 8) / 8
- mask := (*[1 << 30]byte)(persistentalloc(masksize+1, 0, &memstats.gc_sys))
- mask[masksize] = 0xa1
- pos := uintptr(0)
- prog = unrollgcprog1(&mask[0], prog, &pos, false, false)
- if pos != size/ptrSize {
- print("unrollglobgcprog: bad program size, got ", pos, ", expect ", size/ptrSize, "\n")
- throw("unrollglobgcprog: bad program size")
- }
- if *prog != insEnd {
- throw("unrollglobgcprog: program does not end with insEnd")
- }
- if mask[masksize] != 0xa1 {
- throw("unrollglobgcprog: overflow")
- }
- return bitvector{int32(masksize * 8), &mask[0]}
-}
-func unrollgcproginplace_m(v unsafe.Pointer, typ *_type, size, size0 uintptr) {
- // TODO(rsc): Explain why these non-atomic updates are okay.
- pos := uintptr(0)
- prog := (*byte)(unsafe.Pointer(uintptr(typ.gc[1])))
- for pos != size0 {
- unrollgcprog1((*byte)(v), prog, &pos, true, true)
+ // Repeat; n too large to fit in a register.
+ // Since nbits <= 7, we know the first few bytes of repeated data
+ // are already written to memory.
+ off := n - nbits // n > nbits because n > maxBits and nbits <= 7
+ if size == 1 {
+ // Leading src fragment.
+ src = subtractb(src, (off+7)/8)
+ if frag := off & 7; frag != 0 {
+ bits |= uintptr(*src) >> (8 - frag) << nbits
+ src = add1(src)
+ nbits += frag
+ c -= frag
+ }
+ // Main loop: load one byte, write another.
+ // The bits are rotating through the bit buffer.
+ for i := c / 8; i > 0; i-- {
+ bits |= uintptr(*src) << nbits
+ src = add1(src)
+ *dst = uint8(bits)
+ dst = add1(dst)
+ bits >>= 8
+ }
+ // Final src fragment.
+ if c %= 8; c > 0 {
+ bits |= (uintptr(*src) & (1<<c - 1)) << nbits
+ nbits += c
+ }
+ } else {
+ // Leading src fragment.
+ src = addb(src, (off+3)/4)
+ if frag := off & 3; frag != 0 {
+ bits |= (uintptr(*src) & 0xf) >> (4 - frag) << nbits
+ src = subtract1(src)
+ nbits += frag
+ c -= frag
+ }
+ // Main loop: load one byte, write another.
+ // The bits are rotating through the bit buffer.
+ for i := c / 4; i > 0; i-- {
+ bits |= (uintptr(*src) & 0xf) << nbits
+ src = subtract1(src)
+ *dst = uint8(bits&0xf | bitMarkedAll)
+ dst = subtract1(dst)
+ bits >>= 4
+ }
+ // Final src fragment.
+ if c %= 4; c > 0 {
+ bits |= (uintptr(*src) & (1<<c - 1)) << nbits
+ nbits += c
+ }
+ }
}
- // Mark first word as bitAllocated.
- // Mark word after last as typeDead.
- if size0 < size {
- h := heapBitsForAddr(uintptr(v) + size0)
- *h.bitp &^= typeMask << typeShift
+ // Write any final bits out, using full-byte writes, even for the final byte.
+ var totalBits uintptr
+ if size == 1 {
+ totalBits = (uintptr(unsafe.Pointer(dst))-uintptr(unsafe.Pointer(dstStart)))*8 + nbits
+ nbits += -nbits & 7
+ for ; nbits > 0; nbits -= 8 {
+ *dst = uint8(bits)
+ dst = add1(dst)
+ bits >>= 8
+ }
+ } else {
+ totalBits = (uintptr(unsafe.Pointer(dstStart))-uintptr(unsafe.Pointer(dst)))*4 + nbits
+ nbits += -nbits & 3
+ for ; nbits > 0; nbits -= 4 {
+ v := bits&0xf | bitMarkedAll
+ *dst = uint8(v)
+ dst = subtract1(dst)
+ bits >>= 4
+ }
+ // Clear the mark bits in the first two entries.
+ // They are the actual mark and checkmark bits,
+ // not non-dead markers. It simplified the code
+ // above to set the marker in every bit written and
+ // then clear these two as a special case at the end.
+ *dstStart &^= bitMarked | bitMarked<<heapBitsShift
}
+ return totalBits
}
-var unroll mutex
-
-// Unrolls GC program in typ.gc[1] into typ.gc[0]
-//go:nowritebarrier
-func unrollgcprog_m(typ *_type) {
- lock(&unroll)
- mask := (*byte)(unsafe.Pointer(uintptr(typ.gc[0])))
- if *mask == 0 {
- pos := uintptr(8) // skip the unroll flag
- prog := (*byte)(unsafe.Pointer(uintptr(typ.gc[1])))
- prog = unrollgcprog1(mask, prog, &pos, false, true)
- if *prog != insEnd {
- throw("unrollgcprog: program does not end with insEnd")
+func dumpGCProg(p *byte) {
+ nptr := 0
+ for {
+ x := *p
+ p = add1(p)
+ if x == 0 {
+ print("\t", nptr, " end\n")
+ break
}
- if typ.size/ptrSize%2 != 0 {
- // repeat the program
- prog := (*byte)(unsafe.Pointer(uintptr(typ.gc[1])))
- unrollgcprog1(mask, prog, &pos, false, true)
+ if x&0x80 == 0 {
+ print("\t", nptr, " lit ", x, ":")
+ n := int(x+7) / 8
+ for i := 0; i < n; i++ {
+ print(" ", hex(*p))
+ p = add1(p)
+ }
+ print("\n")
+ nptr += int(x)
+ } else {
+ nbit := int(x &^ 0x80)
+ if nbit == 0 {
+ for nb := uint(0); ; nb += 7 {
+ x := *p
+ p = add1(p)
+ nbit |= int(x&0x7f) << nb
+ if x&0x80 == 0 {
+ break
+ }
+ }
+ }
+ count := 0
+ for nb := uint(0); ; nb += 7 {
+ x := *p
+ p = add1(p)
+ count |= int(x&0x7f) << nb
+ if x&0x80 == 0 {
+ break
+ }
+ }
+ print("\t", nptr, " repeat ", nbit, " × ", count, "\n")
+ nptr += nbit * count
}
-
- // atomic way to say mask[0] = 1
- atomicor8(mask, 1)
}
- unlock(&unroll)
}
// Testing.
@@ -748,36 +1510,46 @@ func getgcmaskcb(frame *stkframe, ctxt unsafe.Pointer) bool {
return true
}
-// Returns GC type info for object p for testing.
-func getgcmask(p unsafe.Pointer, t *_type, mask **byte, len *uintptr) {
- *mask = nil
- *len = 0
+// gcbits returns the GC type info for x, for testing.
+// The result is the bitmap entries (0 or 1), one entry per byte.
+//go:linkname reflect_gcbits reflect.gcbits
+func reflect_gcbits(x interface{}) []byte {
+ ret := getgcmask(x)
+ typ := (*ptrtype)(unsafe.Pointer((*eface)(unsafe.Pointer(&x))._type)).elem
+ nptr := typ.ptrdata / ptrSize
+ for uintptr(len(ret)) > nptr && ret[len(ret)-1] == 0 {
+ ret = ret[:len(ret)-1]
+ }
+ return ret
+}
- // data
+// Returns GC type info for object p for testing.
+func getgcmask(ep interface{}) (mask []byte) {
+ e := *(*eface)(unsafe.Pointer(&ep))
+ p := e.data
+ t := e._type
+ // data or bss
for datap := &firstmoduledata; datap != nil; datap = datap.next {
+ // data
if datap.data <= uintptr(p) && uintptr(p) < datap.edata {
+ bitmap := datap.gcdatamask.bytedata
n := (*ptrtype)(unsafe.Pointer(t)).elem.size
- *len = n / ptrSize
- *mask = &make([]byte, *len)[0]
+ mask = make([]byte, n/ptrSize)
for i := uintptr(0); i < n; i += ptrSize {
off := (uintptr(p) + i - datap.data) / ptrSize
- bits := (*addb(datap.gcdatamask.bytedata, off/8) >> (off % 8)) & 1
- bits += 1 // convert 1-bit to 2-bit
- *addb(*mask, i/ptrSize) = bits
+ mask[i/ptrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
}
return
}
// bss
if datap.bss <= uintptr(p) && uintptr(p) < datap.ebss {
+ bitmap := datap.gcbssmask.bytedata
n := (*ptrtype)(unsafe.Pointer(t)).elem.size
- *len = n / ptrSize
- *mask = &make([]byte, *len)[0]
+ mask = make([]byte, n/ptrSize)
for i := uintptr(0); i < n; i += ptrSize {
off := (uintptr(p) + i - datap.bss) / ptrSize
- bits := (*addb(datap.gcbssmask.bytedata, off/8) >> (off % 8)) & 1
- bits += 1 // convert 1-bit to 2-bit
- *addb(*mask, i/ptrSize) = bits
+ mask[i/ptrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
}
return
}
@@ -787,47 +1559,58 @@ func getgcmask(p unsafe.Pointer, t *_type, mask **byte, len *uintptr) {
var n uintptr
var base uintptr
if mlookup(uintptr(p), &base, &n, nil) != 0 {
- *len = n / ptrSize
- *mask = &make([]byte, *len)[0]
+ mask = make([]byte, n/ptrSize)
for i := uintptr(0); i < n; i += ptrSize {
- bits := heapBitsForAddr(base + i).typeBits()
- *addb(*mask, i/ptrSize) = bits
+ hbits := heapBitsForAddr(base + i)
+ if hbits.isPointer() {
+ mask[i/ptrSize] = 1
+ }
+ if i >= 2*ptrSize && !hbits.isMarked() {
+ mask = mask[:i/ptrSize]
+ break
+ }
}
return
}
// stack
- var frame stkframe
- frame.sp = uintptr(p)
- _g_ := getg()
- gentraceback(_g_.m.curg.sched.pc, _g_.m.curg.sched.sp, 0, _g_.m.curg, 0, nil, 1000, getgcmaskcb, noescape(unsafe.Pointer(&frame)), 0)
- if frame.fn != nil {
- f := frame.fn
- targetpc := frame.continpc
- if targetpc == 0 {
- return
- }
- if targetpc != f.entry {
- targetpc--
- }
- pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
- if pcdata == -1 {
- return
- }
- stkmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps))
- if stkmap == nil || stkmap.n <= 0 {
- return
- }
- bv := stackmapdata(stkmap, pcdata)
- size := uintptr(bv.n) * ptrSize
- n := (*ptrtype)(unsafe.Pointer(t)).elem.size
- *len = n / ptrSize
- *mask = &make([]byte, *len)[0]
- for i := uintptr(0); i < n; i += ptrSize {
- off := (uintptr(p) + i - frame.varp + size) / ptrSize
- bits := (*addb(bv.bytedata, off/8) >> (off % 8)) & 1
- bits += 1 // convert 1-bit to 2-bit
- *addb(*mask, i/ptrSize) = bits
+ if _g_ := getg(); _g_.m.curg.stack.lo <= uintptr(p) && uintptr(p) < _g_.m.curg.stack.hi {
+ var frame stkframe
+ frame.sp = uintptr(p)
+ _g_ := getg()
+ gentraceback(_g_.m.curg.sched.pc, _g_.m.curg.sched.sp, 0, _g_.m.curg, 0, nil, 1000, getgcmaskcb, noescape(unsafe.Pointer(&frame)), 0)
+ if frame.fn != nil {
+ f := frame.fn
+ targetpc := frame.continpc
+ if targetpc == 0 {
+ return
+ }
+ if targetpc != f.entry {
+ targetpc--
+ }
+ pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
+ if pcdata == -1 {
+ return
+ }
+ stkmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps))
+ if stkmap == nil || stkmap.n <= 0 {
+ return
+ }
+ bv := stackmapdata(stkmap, pcdata)
+ size := uintptr(bv.n) * ptrSize
+ n := (*ptrtype)(unsafe.Pointer(t)).elem.size
+ mask = make([]byte, n/ptrSize)
+ for i := uintptr(0); i < n; i += ptrSize {
+ bitmap := bv.bytedata
+ off := (uintptr(p) + i - frame.varp + size) / ptrSize
+ mask[i/ptrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
+ }
}
+ return
}
+
+ // otherwise, not something the GC knows about.
+ // possibly read-only data, like malloc(0).
+ // must not have pointers
+ return
}
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index 9bd36d1a5e..db5b2dcd36 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -127,13 +127,22 @@ const (
_RootCount = 5
)
-// heapminimum is the minimum number of bytes in the heap.
-// This cleans up the corner case of where we have a very small live set but a lot
-// of allocations and collecting every GOGC * live set is expensive.
-// heapminimum is adjust by multiplying it by GOGC/100. In
-// the special case of GOGC==0 this will set heapminimum to 0 resulting
-// collecting at every allocation even when the heap size is small.
-var heapminimum = uint64(4 << 20)
+// heapminimum is the minimum heap size at which to trigger GC.
+// For small heaps, this overrides the usual GOGC*live set rule.
+//
+// When there is a very small live set but a lot of allocation, simply
+// collecting when the heap reaches GOGC*live results in many GC
+// cycles and high total per-GC overhead. This minimum amortizes this
+// per-GC overhead while keeping the heap reasonably small.
+//
+// During initialization this is set to 4MB*GOGC/100. In the case of
+// GOGC==0, this will set heapminimum to 0, resulting in constant
+// collection even when the heap size is small, which is useful for
+// debugging.
+var heapminimum uint64 = defaultHeapMinimum
+
+// defaultHeapMinimum is the value of heapminimum for GOGC==100.
+const defaultHeapMinimum = 4 << 20
// Initialized from $GOGC. GOGC=off means no GC.
var gcpercent int32
@@ -146,8 +155,8 @@ func gcinit() {
work.markfor = parforalloc(_MaxGcproc)
_ = setGCPercent(readgogc())
for datap := &firstmoduledata; datap != nil; datap = datap.next {
- datap.gcdatamask = unrollglobgcprog((*byte)(unsafe.Pointer(datap.gcdata)), datap.edata-datap.data)
- datap.gcbssmask = unrollglobgcprog((*byte)(unsafe.Pointer(datap.gcbss)), datap.ebss-datap.bss)
+ datap.gcdatamask = progToPointerMask((*byte)(unsafe.Pointer(datap.gcdata)), datap.edata-datap.data)
+ datap.gcbssmask = progToPointerMask((*byte)(unsafe.Pointer(datap.gcbss)), datap.ebss-datap.bss)
}
memstats.next_gc = heapminimum
}
@@ -180,7 +189,7 @@ func setGCPercent(in int32) (out int32) {
in = -1
}
gcpercent = in
- heapminimum = heapminimum * uint64(gcpercent) / 100
+ heapminimum = defaultHeapMinimum * uint64(gcpercent) / 100
unlock(&mheap_.lock)
return out
}
@@ -197,7 +206,6 @@ var gcBlackenEnabled uint32
const (
_GCoff = iota // GC not running, write barrier disabled
- _GCquiesce // unused state
_GCstw // unused state
_GCscan // GC collecting roots into workbufs, write barrier disabled
_GCmark // GC marking from workbufs, write barrier ENABLED
@@ -208,7 +216,7 @@ const (
//go:nosplit
func setGCPhase(x uint32) {
atomicstore(&gcphase, x)
- writeBarrierEnabled = gcphase == _GCmark || gcphase == _GCmarktermination || mheap_.shadow_enabled
+ writeBarrierEnabled = gcphase == _GCmark || gcphase == _GCmarktermination
}
// gcMarkWorkerMode represents the mode that a concurrent mark worker
@@ -699,11 +707,11 @@ const (
func startGC(mode int) {
// The gc is turned off (via enablegc) until the bootstrap has completed.
// Also, malloc gets called in the guts of a number of libraries that might be
- // holding locks. To avoid deadlocks during stoptheworld, don't bother
+ // holding locks. To avoid deadlocks during stop-the-world, don't bother
// trying to run gc while holding a lock. The next mallocgc without a lock
// will do the gc instead.
mp := acquirem()
- if gp := getg(); gp == mp.g0 || mp.locks > 1 || !memstats.enablegc || panicking != 0 || gcpercent < 0 {
+ if gp := getg(); gp == mp.g0 || mp.locks > 1 || mp.preemptoff != "" || !memstats.enablegc || panicking != 0 || gcpercent < 0 {
releasem(mp)
return
}
@@ -797,7 +805,7 @@ func gc(mode int) {
traceGCStart()
}
- systemstack(stoptheworld)
+ systemstack(stopTheWorldWithSema)
systemstack(finishsweep_m) // finish sweep before we start concurrent scan.
// clearpools before we start the GC. If we wait they memory will not be
// reclaimed until the next GC cycle.
@@ -814,7 +822,7 @@ func gc(mode int) {
setGCPhase(_GCscan)
// Concurrent scan.
- starttheworld()
+ startTheWorldWithSema()
if debug.gctrace > 0 {
tScan = nanotime()
}
@@ -858,7 +866,7 @@ func gc(mode int) {
if debug.gctrace > 0 {
tMarkTerm = nanotime()
}
- systemstack(stoptheworld)
+ systemstack(stopTheWorldWithSema)
// The gcphase is _GCmark, it will transition to _GCmarktermination
// below. The important thing is that the wb remains active until
// all marking is complete. This includes writes made by the GC.
@@ -952,13 +960,12 @@ func gc(mode int) {
// all done
mp.preemptoff = ""
- semrelease(&worldsema)
-
if gcphase != _GCoff {
throw("gc done but gcphase != _GCoff")
}
- systemstack(starttheworld)
+ systemstack(startTheWorldWithSema)
+ semrelease(&worldsema)
releasem(mp)
mp = nil
@@ -1160,6 +1167,18 @@ func gcBgMarkDone() {
}
}
+// gcMarkWorkAvailable determines if mark work is readily available.
+// It is used by the scheduler to decide if this p run a mark work.
+func gcMarkWorkAvailable(p *p) bool {
+ if !p.gcw.empty() {
+ return true
+ }
+ if atomicload64(&work.full) != 0 || atomicload64(&work.partial) != 0 {
+ return true // global work available
+ }
+ return false
+}
+
// gcFlushGCWork disposes the gcWork caches of all Ps. The world must
// be stopped.
//go:nowritebarrier
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index 9d78ddecae..62fa33895b 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -261,7 +261,7 @@ func gcphasework(gp *g) {
switch gcphase {
default:
throw("gcphasework in bad gcphase")
- case _GCoff, _GCquiesce, _GCstw, _GCsweep:
+ case _GCoff, _GCstw, _GCsweep:
// No work.
case _GCscan:
// scan the stack, mark the objects, put pointers in work buffers
@@ -557,9 +557,6 @@ func scanblock(b0, n0 uintptr, ptrmask *uint8, gcw *gcWork) {
// Same work as in scanobject; see comments there.
obj := *(*uintptr)(unsafe.Pointer(b + i))
if obj != 0 && arena_start <= obj && obj < arena_used {
- if mheap_.shadow_enabled && debug.wbshadow >= 2 && debug.gccheckmark > 0 && useCheckmark {
- checkwbshadow((*uintptr)(unsafe.Pointer(b + i)))
- }
if obj, hbits, span := heapBitsForObject(obj); obj != 0 {
greyobject(obj, b, i, hbits, span, gcw)
}
@@ -597,32 +594,25 @@ func scanobject(b uintptr, gcw *gcWork) {
// Avoid needless hbits.next() on last iteration.
hbits = hbits.next()
}
- bits := uintptr(hbits.typeBits())
- if bits == typeDead {
+ // During checkmarking, 1-word objects store the checkmark
+ // in the type bit for the one word. The only one-word objects
+ // are pointers, or else they'd be merged with other non-pointer
+ // data into larger allocations.
+ bits := hbits.bits()
+ if i >= 2*ptrSize && bits&bitMarked == 0 {
break // no more pointers in this object
}
-
- if bits <= typeScalar { // typeScalar, typeDead, typeScalarMarked
- continue
- }
-
- if bits&typePointer != typePointer {
- print("gc useCheckmark=", useCheckmark, " b=", hex(b), "\n")
- throw("unexpected garbage collection bits")
+ if bits&bitPointer == 0 {
+ continue // not a pointer
}
- // Work here is duplicated in scanblock.
+ // Work here is duplicated in scanblock and above.
// If you make changes here, make changes there too.
-
obj := *(*uintptr)(unsafe.Pointer(b + i))
// At this point we have extracted the next potential pointer.
- // Check if it points into heap.
- if obj != 0 && arena_start <= obj && obj < arena_used {
- if mheap_.shadow_enabled && debug.wbshadow >= 2 && debug.gccheckmark > 0 && useCheckmark {
- checkwbshadow((*uintptr)(unsafe.Pointer(b + i)))
- }
-
+ // Check if it points into heap and not back at the current object.
+ if obj != 0 && arena_start <= obj && obj < arena_used && obj-b >= n {
// Mark the object.
if obj, hbits, span := heapBitsForObject(obj); obj != 0 {
greyobject(obj, b, i, hbits, span, gcw)
@@ -673,11 +663,11 @@ func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork
throw("checkmark found unmarked object")
}
- if hbits.isCheckmarked() {
+ if hbits.isCheckmarked(span.elemsize) {
return
}
- hbits.setCheckmarked()
- if !hbits.isCheckmarked() {
+ hbits.setCheckmarked(span.elemsize)
+ if !hbits.isCheckmarked(span.elemsize) {
throw("setCheckmarked and isCheckmarked disagree")
}
} else {
@@ -685,12 +675,11 @@ func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork
if hbits.isMarked() {
return
}
-
hbits.setMarked()
// If this is a noscan object, fast-track it to black
// instead of greying it.
- if hbits.typeBits() == typeDead {
+ if !hbits.hasPointers(span.elemsize) {
gcw.bytesMarked += uint64(span.elemsize)
return
}
diff --git a/src/runtime/mgcwork.go b/src/runtime/mgcwork.go
index 9c32ae8880..b7feb847b4 100644
--- a/src/runtime/mgcwork.go
+++ b/src/runtime/mgcwork.go
@@ -7,7 +7,7 @@ package runtime
import "unsafe"
const (
- _Debugwbufs = true // if true check wbufs consistency
+ _Debugwbufs = false // if true check wbufs consistency
_WorkbufSize = 1 * 256 // in bytes - if small wbufs are passed to GC in a timely fashion.
)
@@ -182,6 +182,13 @@ func (w *gcWork) balance() {
}
}
+// empty returns true if w has no mark work available.
+//go:nowritebarrier
+func (w *gcWork) empty() bool {
+ wbuf := w.wbuf
+ return wbuf == 0 || wbuf.ptr().nobj == 0
+}
+
// Internally, the GC work pool is kept in arrays in work buffers.
// The gcWork interface caches a work buffer until full (or empty) to
// avoid contending on the global work buffer lists.
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 10878ee5cf..04fa050bc5 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -28,6 +28,15 @@ type mheap struct {
spans **mspan
spans_mapped uintptr
+ // Proportional sweep
+ pagesSwept uint64 // pages swept this cycle; updated atomically
+ sweepPagesPerByte float64 // proportional sweep ratio; written with lock, read without
+
+ // Malloc stats.
+ largefree uint64 // bytes freed for large objects (>maxsmallsize)
+ nlargefree uint64 // number of frees for large objects (>maxsmallsize)
+ nsmallfree [_NumSizeClasses]uint64 // number of frees for small objects (<=maxsmallsize)
+
// range of addresses we might see in the heap
bitmap uintptr
bitmap_mapped uintptr
@@ -36,14 +45,6 @@ type mheap struct {
arena_end uintptr
arena_reserved bool
- // write barrier shadow heap.
- // 64-bit systems only, enabled by GODEBUG=wbshadow=1.
- // See also shadow_data, data_start, data_end fields on moduledata in
- // symtab.go.
- shadow_enabled bool // shadow should be updated and checked
- shadow_reserved bool // shadow memory is reserved
- shadow_heap uintptr // heap-addr + shadow_heap = shadow heap addr
-
// central free lists for small size classes.
// the padding makes sure that the MCentrals are
// spaced CacheLineSize bytes apart, so that each MCentral.lock
@@ -58,15 +59,6 @@ type mheap struct {
specialfinalizeralloc fixalloc // allocator for specialfinalizer*
specialprofilealloc fixalloc // allocator for specialprofile*
speciallock mutex // lock for sepcial record allocators.
-
- // Proportional sweep
- pagesSwept uint64 // pages swept this cycle; updated atomically
- sweepPagesPerByte float64 // proportional sweep ratio; written with lock, read without
-
- // Malloc stats.
- largefree uint64 // bytes freed for large objects (>maxsmallsize)
- nlargefree uint64 // number of frees for large objects (>maxsmallsize)
- nsmallfree [_NumSizeClasses]uint64 // number of frees for small objects (<=maxsmallsize)
}
var mheap_ mheap
@@ -176,7 +168,9 @@ func recordspan(vh unsafe.Pointer, p unsafe.Pointer) {
// inheap reports whether b is a pointer into a (potentially dead) heap object.
// It returns false for pointers into stack spans.
+// Non-preemptible because it is used by write barriers.
//go:nowritebarrier
+//go:nosplit
func inheap(b uintptr) bool {
if b == 0 || b < mheap_.arena_start || b >= mheap_.arena_used {
return false
diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go
index 4544344780..a618bd5e81 100644
--- a/src/runtime/mprof.go
+++ b/src/runtime/mprof.go
@@ -521,9 +521,7 @@ func GoroutineProfile(p []StackRecord) (n int, ok bool) {
n = NumGoroutine()
if n <= len(p) {
gp := getg()
- semacquire(&worldsema, false)
- gp.m.preemptoff = "profile"
- systemstack(stoptheworld)
+ stopTheWorld("profile")
n = NumGoroutine()
if n <= len(p) {
@@ -544,9 +542,7 @@ func GoroutineProfile(p []StackRecord) (n int, ok bool) {
}
}
- gp.m.preemptoff = ""
- semrelease(&worldsema)
- systemstack(starttheworld)
+ startTheWorld()
}
return n, ok
@@ -565,10 +561,7 @@ func saveg(pc, sp uintptr, gp *g, r *StackRecord) {
// into buf after the trace for the current goroutine.
func Stack(buf []byte, all bool) int {
if all {
- semacquire(&worldsema, false)
- gp := getg()
- gp.m.preemptoff = "stack trace"
- systemstack(stoptheworld)
+ stopTheWorld("stack trace")
}
n := 0
@@ -590,10 +583,7 @@ func Stack(buf []byte, all bool) int {
}
if all {
- gp := getg()
- gp.m.preemptoff = ""
- semrelease(&worldsema)
- systemstack(starttheworld)
+ startTheWorld()
}
return n
}
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index c8e5249156..3eff7f6b3e 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -153,24 +153,13 @@ func init() {
// ReadMemStats populates m with memory allocator statistics.
func ReadMemStats(m *MemStats) {
- // Have to acquire worldsema to stop the world,
- // because stoptheworld can only be used by
- // one goroutine at a time, and there might be
- // a pending garbage collection already calling it.
- semacquire(&worldsema, false)
- gp := getg()
- gp.m.preemptoff = "read mem stats"
- systemstack(stoptheworld)
+ stopTheWorld("read mem stats")
systemstack(func() {
readmemstats_m(m)
})
- gp.m.preemptoff = ""
- gp.m.locks++
- semrelease(&worldsema)
- systemstack(starttheworld)
- gp.m.locks--
+ startTheWorld()
}
func readmemstats_m(stats *MemStats) {
diff --git a/src/runtime/os1_darwin.go b/src/runtime/os1_darwin.go
index 10cf460f7f..1b74e3e653 100644
--- a/src/runtime/os1_darwin.go
+++ b/src/runtime/os1_darwin.go
@@ -8,7 +8,6 @@ import "unsafe"
//extern SigTabTT runtime·sigtab[];
-var sigset_none = uint32(0)
var sigset_all = ^uint32(0)
func unimplemented(name string) {
@@ -126,17 +125,36 @@ func mpreinit(mp *m) {
mp.gsignal.m = mp
}
+func msigsave(mp *m) {
+ smask := (*uint32)(unsafe.Pointer(&mp.sigmask))
+ if unsafe.Sizeof(*smask) > unsafe.Sizeof(mp.sigmask) {
+ throw("insufficient storage for signal mask")
+ }
+ sigprocmask(_SIG_SETMASK, nil, smask)
+}
+
// Called to initialize a new m (including the bootstrap m).
// Called on the new thread, can not allocate memory.
func minit() {
// Initialize signal handling.
_g_ := getg()
signalstack((*byte)(unsafe.Pointer(_g_.m.gsignal.stack.lo)), 32*1024)
- sigprocmask(_SIG_SETMASK, &sigset_none, nil)
+
+ // restore signal mask from m.sigmask and unblock essential signals
+ nmask := *(*uint32)(unsafe.Pointer(&_g_.m.sigmask))
+ for i := range sigtable {
+ if sigtable[i].flags&_SigUnblock != 0 {
+ nmask &^= 1 << (uint32(i) - 1)
+ }
+ }
+ sigprocmask(_SIG_SETMASK, &nmask, nil)
}
// Called from dropm to undo the effect of an minit.
func unminit() {
+ _g_ := getg()
+ smask := (*uint32)(unsafe.Pointer(&_g_.m.sigmask))
+ sigprocmask(_SIG_SETMASK, smask, nil)
signalstack(nil, 0)
}
@@ -447,6 +465,6 @@ func signalstack(p *byte, n int32) {
sigaltstack(&st, nil)
}
-func unblocksignals() {
- sigprocmask(_SIG_SETMASK, &sigset_none, nil)
+func updatesigmask(m sigmask) {
+ sigprocmask(_SIG_SETMASK, &m[0], nil)
}
diff --git a/src/runtime/os1_dragonfly.go b/src/runtime/os1_dragonfly.go
index a590aea39b..eb42b54e2b 100644
--- a/src/runtime/os1_dragonfly.go
+++ b/src/runtime/os1_dragonfly.go
@@ -12,7 +12,6 @@ const (
_HW_NCPU = 3
)
-var sigset_none = sigset{}
var sigset_all = sigset{[4]uint32{^uint32(0), ^uint32(0), ^uint32(0), ^uint32(0)}}
func getncpu() int32 {
@@ -120,6 +119,14 @@ func mpreinit(mp *m) {
mp.gsignal.m = mp
}
+func msigsave(mp *m) {
+ smask := (*sigset)(unsafe.Pointer(&mp.sigmask))
+ if unsafe.Sizeof(*smask) > unsafe.Sizeof(mp.sigmask) {
+ throw("insufficient storage for signal mask")
+ }
+ sigprocmask(nil, smask)
+}
+
// Called to initialize a new m (including the bootstrap m).
// Called on the new thread, can not allocate memory.
func minit() {
@@ -130,11 +137,22 @@ func minit() {
// Initialize signal handling
signalstack((*byte)(unsafe.Pointer(_g_.m.gsignal.stack.lo)), 32*1024)
- sigprocmask(&sigset_none, nil)
+
+ // restore signal mask from m.sigmask and unblock essential signals
+ nmask := *(*sigset)(unsafe.Pointer(&_g_.m.sigmask))
+ for i := range sigtable {
+ if sigtable[i].flags&_SigUnblock != 0 {
+ nmask.__bits[(i-1)/32] &^= 1 << ((uint32(i) - 1) & 31)
+ }
+ }
+ sigprocmask(&nmask, nil)
}
// Called from dropm to undo the effect of an minit.
func unminit() {
+ _g_ := getg()
+ smask := (*sigset)(unsafe.Pointer(&_g_.m.sigmask))
+ sigprocmask(smask, nil)
signalstack(nil, 0)
}
@@ -215,6 +233,8 @@ func signalstack(p *byte, n int32) {
sigaltstack(&st, nil)
}
-func unblocksignals() {
- sigprocmask(&sigset_none, nil)
+func updatesigmask(m sigmask) {
+ var mask sigset
+ copy(mask.__bits[:], m[:])
+ sigprocmask(&mask, nil)
}
diff --git a/src/runtime/os1_freebsd.go b/src/runtime/os1_freebsd.go
index 8719a49286..f7f34bd386 100644
--- a/src/runtime/os1_freebsd.go
+++ b/src/runtime/os1_freebsd.go
@@ -12,7 +12,6 @@ const (
_HW_NCPU = 3
)
-var sigset_none = sigset{}
var sigset_all = sigset{[4]uint32{^uint32(0), ^uint32(0), ^uint32(0), ^uint32(0)}}
func getncpu() int32 {
@@ -119,6 +118,14 @@ func mpreinit(mp *m) {
mp.gsignal.m = mp
}
+func msigsave(mp *m) {
+ smask := (*sigset)(unsafe.Pointer(&mp.sigmask))
+ if unsafe.Sizeof(*smask) > unsafe.Sizeof(mp.sigmask) {
+ throw("insufficient storage for signal mask")
+ }
+ sigprocmask(nil, smask)
+}
+
// Called to initialize a new m (including the bootstrap m).
// Called on the new thread, can not allocate memory.
func minit() {
@@ -132,11 +139,22 @@ func minit() {
// Initialize signal handling.
signalstack((*byte)(unsafe.Pointer(_g_.m.gsignal.stack.lo)), 32*1024)
- sigprocmask(&sigset_none, nil)
+
+ // restore signal mask from m.sigmask and unblock essential signals
+ nmask := *(*sigset)(unsafe.Pointer(&_g_.m.sigmask))
+ for i := range sigtable {
+ if sigtable[i].flags&_SigUnblock != 0 {
+ nmask.__bits[(i-1)/32] &^= 1 << ((uint32(i) - 1) & 31)
+ }
+ }
+ sigprocmask(&nmask, nil)
}
// Called from dropm to undo the effect of an minit.
func unminit() {
+ _g_ := getg()
+ smask := (*sigset)(unsafe.Pointer(&_g_.m.sigmask))
+ sigprocmask(smask, nil)
signalstack(nil, 0)
}
@@ -217,6 +235,8 @@ func signalstack(p *byte, n int32) {
sigaltstack(&st, nil)
}
-func unblocksignals() {
- sigprocmask(&sigset_none, nil)
+func updatesigmask(m [(_NSIG + 31) / 32]uint32) {
+ var mask sigset
+ copy(mask.__bits[:], m[:])
+ sigprocmask(&mask, nil)
}
diff --git a/src/runtime/os1_linux.go b/src/runtime/os1_linux.go
index e4b18c79b3..02f98d7c5f 100644
--- a/src/runtime/os1_linux.go
+++ b/src/runtime/os1_linux.go
@@ -6,7 +6,6 @@ package runtime
import "unsafe"
-var sigset_none sigset
var sigset_all sigset = sigset{^uint32(0), ^uint32(0)}
// Linux futex.
@@ -190,17 +189,36 @@ func mpreinit(mp *m) {
mp.gsignal.m = mp
}
+func msigsave(mp *m) {
+ smask := (*sigset)(unsafe.Pointer(&mp.sigmask))
+ if unsafe.Sizeof(*smask) > unsafe.Sizeof(mp.sigmask) {
+ throw("insufficient storage for signal mask")
+ }
+ rtsigprocmask(_SIG_SETMASK, nil, smask, int32(unsafe.Sizeof(*smask)))
+}
+
// Called to initialize a new m (including the bootstrap m).
// Called on the new thread, can not allocate memory.
func minit() {
// Initialize signal handling.
_g_ := getg()
signalstack((*byte)(unsafe.Pointer(_g_.m.gsignal.stack.lo)), 32*1024)
- rtsigprocmask(_SIG_SETMASK, &sigset_none, nil, int32(unsafe.Sizeof(sigset_none)))
+
+ // restore signal mask from m.sigmask and unblock essential signals
+ nmask := *(*sigset)(unsafe.Pointer(&_g_.m.sigmask))
+ for i := range sigtable {
+ if sigtable[i].flags&_SigUnblock != 0 {
+ nmask[(i-1)/32] &^= 1 << ((uint32(i) - 1) & 31)
+ }
+ }
+ rtsigprocmask(_SIG_SETMASK, &nmask, nil, int32(unsafe.Sizeof(nmask)))
}
// Called from dropm to undo the effect of an minit.
func unminit() {
+ _g_ := getg()
+ smask := (*sigset)(unsafe.Pointer(&_g_.m.sigmask))
+ rtsigprocmask(_SIG_SETMASK, smask, nil, int32(unsafe.Sizeof(*smask)))
signalstack(nil, 0)
}
@@ -304,6 +322,8 @@ func signalstack(p *byte, n int32) {
sigaltstack(&st, nil)
}
-func unblocksignals() {
- rtsigprocmask(_SIG_SETMASK, &sigset_none, nil, int32(unsafe.Sizeof(sigset_none)))
+func updatesigmask(m sigmask) {
+ var mask sigset
+ copy(mask[:], m[:])
+ rtsigprocmask(_SIG_SETMASK, &mask, nil, int32(unsafe.Sizeof(mask)))
}
diff --git a/src/runtime/os1_nacl.go b/src/runtime/os1_nacl.go
index dbb5dec2fd..66e60f8b12 100644
--- a/src/runtime/os1_nacl.go
+++ b/src/runtime/os1_nacl.go
@@ -15,6 +15,9 @@ func mpreinit(mp *m) {
func sigtramp()
+func msigsave(mp *m) {
+}
+
// Called to initialize a new m (including the bootstrap m).
// Called on the new thread, can not allocate memory.
func minit() {
diff --git a/src/runtime/os1_netbsd.go b/src/runtime/os1_netbsd.go
index 8df74b5593..3fb05989e7 100644
--- a/src/runtime/os1_netbsd.go
+++ b/src/runtime/os1_netbsd.go
@@ -17,7 +17,6 @@ const (
_CLOCK_MONOTONIC = 3
)
-var sigset_none = sigset{}
var sigset_all = sigset{[4]uint32{^uint32(0), ^uint32(0), ^uint32(0), ^uint32(0)}}
// From NetBSD's <sys/sysctl.h>
@@ -139,6 +138,14 @@ func mpreinit(mp *m) {
mp.gsignal.m = mp
}
+func msigsave(mp *m) {
+ smask := (*sigset)(unsafe.Pointer(&mp.sigmask))
+ if unsafe.Sizeof(*smask) > unsafe.Sizeof(mp.sigmask) {
+ throw("insufficient storage for signal mask")
+ }
+ sigprocmask(_SIG_SETMASK, nil, smask)
+}
+
// Called to initialize a new m (including the bootstrap m).
// Called on the new thread, can not allocate memory.
func minit() {
@@ -147,11 +154,23 @@ func minit() {
// Initialize signal handling
signalstack((*byte)(unsafe.Pointer(_g_.m.gsignal.stack.lo)), 32*1024)
- sigprocmask(_SIG_SETMASK, &sigset_none, nil)
+
+ // restore signal mask from m.sigmask and unblock essential signals
+ nmask := *(*sigset)(unsafe.Pointer(&_g_.m.sigmask))
+ for i := range sigtable {
+ if sigtable[i].flags&_SigUnblock != 0 {
+ nmask.__bits[(i-1)/32] &^= 1 << ((uint32(i) - 1) & 31)
+ }
+ }
+ sigprocmask(_SIG_SETMASK, &nmask, nil)
}
// Called from dropm to undo the effect of an minit.
func unminit() {
+ _g_ := getg()
+ smask := (*sigset)(unsafe.Pointer(&_g_.m.sigmask))
+ sigprocmask(_SIG_SETMASK, smask, nil)
+
signalstack(nil, 0)
}
@@ -206,6 +225,8 @@ func signalstack(p *byte, n int32) {
sigaltstack(&st, nil)
}
-func unblocksignals() {
- sigprocmask(_SIG_SETMASK, &sigset_none, nil)
+func updatesigmask(m sigmask) {
+ var mask sigset
+ copy(mask.__bits[:], m[:])
+ sigprocmask(_SIG_SETMASK, &mask, nil)
}
diff --git a/src/runtime/os1_openbsd.go b/src/runtime/os1_openbsd.go
index 95729a56df..5ccf642468 100644
--- a/src/runtime/os1_openbsd.go
+++ b/src/runtime/os1_openbsd.go
@@ -148,6 +148,14 @@ func mpreinit(mp *m) {
mp.gsignal.m = mp
}
+func msigsave(mp *m) {
+ smask := (*uint32)(unsafe.Pointer(&mp.sigmask))
+ if unsafe.Sizeof(*smask) > unsafe.Sizeof(mp.sigmask) {
+ throw("insufficient storage for signal mask")
+ }
+ *smask = sigprocmask(_SIG_BLOCK, 0)
+}
+
// Called to initialize a new m (including the bootstrap m).
// Called on the new thread, can not allocate memory.
func minit() {
@@ -158,11 +166,22 @@ func minit() {
// Initialize signal handling
signalstack((*byte)(unsafe.Pointer(_g_.m.gsignal.stack.lo)), 32*1024)
- sigprocmask(_SIG_SETMASK, sigset_none)
+
+ // restore signal mask from m.sigmask and unblock essential signals
+ nmask := *(*uint32)(unsafe.Pointer(&_g_.m.sigmask))
+ for i := range sigtable {
+ if sigtable[i].flags&_SigUnblock != 0 {
+ nmask &^= 1 << (uint32(i) - 1)
+ }
+ }
+ sigprocmask(_SIG_SETMASK, nmask)
}
// Called from dropm to undo the effect of an minit.
func unminit() {
+ _g_ := getg()
+ smask := *(*uint32)(unsafe.Pointer(&_g_.m.sigmask))
+ sigprocmask(_SIG_SETMASK, smask)
signalstack(nil, 0)
}
@@ -217,6 +236,6 @@ func signalstack(p *byte, n int32) {
sigaltstack(&st, nil)
}
-func unblocksignals() {
- sigprocmask(_SIG_SETMASK, sigset_none)
+func updatesigmask(m sigmask) {
+ sigprocmask(_SIG_SETMASK, m[0])
}
diff --git a/src/runtime/os1_plan9.go b/src/runtime/os1_plan9.go
index c026218241..bda7057f44 100644
--- a/src/runtime/os1_plan9.go
+++ b/src/runtime/os1_plan9.go
@@ -18,6 +18,9 @@ func mpreinit(mp *m) {
mp.errstr = (*byte)(mallocgc(_ERRMAX, nil, _FlagNoScan))
}
+func msigsave(mp *m) {
+}
+
// Called to initialize a new m (including the bootstrap m).
// Called on the new thread, can not allocate memory.
func minit() {
@@ -177,7 +180,7 @@ func exit(e int) {
} else {
// build error string
var tmp [32]byte
- status = []byte(gostringnocopy(&itoa(tmp[:len(tmp)-1], uint64(e))[0]))
+ status = append(itoa(tmp[:len(tmp)-1], uint64(e)), 0)
}
goexitsall(&status[0])
exits(&status[0])
diff --git a/src/runtime/os1_windows.go b/src/runtime/os1_windows.go
index 5719b320f5..bc472d0de9 100644
--- a/src/runtime/os1_windows.go
+++ b/src/runtime/os1_windows.go
@@ -292,6 +292,9 @@ func newosproc(mp *m, stk unsafe.Pointer) {
func mpreinit(mp *m) {
}
+func msigsave(mp *m) {
+}
+
// Called to initialize a new m (including the bootstrap m).
// Called on the new thread, can not allocate memory.
func minit() {
diff --git a/src/runtime/os3_solaris.go b/src/runtime/os3_solaris.go
index 69ac5b4970..e4fe92de41 100644
--- a/src/runtime/os3_solaris.go
+++ b/src/runtime/os3_solaris.go
@@ -114,7 +114,6 @@ var (
libc_write libcFunc
)
-var sigset_none = sigset{}
var sigset_all = sigset{[4]uint32{^uint32(0), ^uint32(0), ^uint32(0), ^uint32(0)}}
func getncpu() int32 {
@@ -190,6 +189,14 @@ func mpreinit(mp *m) {
func miniterrno()
+func msigsave(mp *m) {
+ smask := (*sigset)(unsafe.Pointer(&mp.sigmask))
+ if unsafe.Sizeof(*smask) > unsafe.Sizeof(mp.sigmask) {
+ throw("insufficient storage for signal mask")
+ }
+ sigprocmask(_SIG_SETMASK, nil, smask)
+}
+
// Called to initialize a new m (including the bootstrap m).
// Called on the new thread, can not allocate memory.
func minit() {
@@ -197,11 +204,23 @@ func minit() {
asmcgocall(unsafe.Pointer(funcPC(miniterrno)), unsafe.Pointer(&libc____errno))
// Initialize signal handling
signalstack((*byte)(unsafe.Pointer(_g_.m.gsignal.stack.lo)), 32*1024)
- sigprocmask(_SIG_SETMASK, &sigset_none, nil)
+
+ // restore signal mask from m.sigmask and unblock essential signals
+ nmask := *(*sigset)(unsafe.Pointer(&_g_.m.sigmask))
+ for i := range sigtable {
+ if sigtable[i].flags&_SigUnblock != 0 {
+ nmask.__sigbits[(i-1)/32] &^= 1 << ((uint32(i) - 1) & 31)
+ }
+ }
+ sigprocmask(_SIG_SETMASK, &nmask, nil)
}
// Called from dropm to undo the effect of an minit.
func unminit() {
+ _g_ := getg()
+ smask := (*sigset)(unsafe.Pointer(&_g_.m.sigmask))
+ sigprocmask(_SIG_SETMASK, smask, nil)
+
signalstack(nil, 0)
}
@@ -278,8 +297,10 @@ func signalstack(p *byte, n int32) {
sigaltstack(&st, nil)
}
-func unblocksignals() {
- sigprocmask(_SIG_SETMASK, &sigset_none, nil)
+func updatesigmask(m sigmask) {
+ var mask sigset
+ copy(mask.__sigbits[:], m[:])
+ sigprocmask(_SIG_SETMASK, &mask, nil)
}
//go:nosplit
diff --git a/src/runtime/panic.go b/src/runtime/panic.go
index 0e4086c7ef..47563f450e 100644
--- a/src/runtime/panic.go
+++ b/src/runtime/panic.go
@@ -188,16 +188,6 @@ func newdefer(siz int32) *_defer {
d = (*_defer)(mallocgc(total, deferType, 0))
}
d.siz = siz
- if mheap_.shadow_enabled {
- // This memory will be written directly, with no write barrier,
- // and then scanned like stacks during collection.
- // Unlike real stacks, it is from heap spans, so mark the
- // shadow as explicitly unusable.
- p := deferArgs(d)
- for i := uintptr(0); i+ptrSize <= uintptr(siz); i += ptrSize {
- writebarrierptr_noshadow((*uintptr)(add(p, i)))
- }
- }
gp := mp.curg
d.link = gp._defer
gp._defer = d
@@ -214,12 +204,6 @@ func freedefer(d *_defer) {
if d.fn != nil {
freedeferfn()
}
- if mheap_.shadow_enabled {
- // Undo the marking in newdefer.
- systemstack(func() {
- clearshadow(uintptr(deferArgs(d)), uintptr(d.siz))
- })
- }
sc := deferclass(uintptr(d.siz))
if sc < uintptr(len(p{}.deferpool)) {
mp := acquirem()
diff --git a/src/runtime/pprof/pprof.go b/src/runtime/pprof/pprof.go
index b3d0ae9b64..4290edb7be 100644
--- a/src/runtime/pprof/pprof.go
+++ b/src/runtime/pprof/pprof.go
@@ -442,35 +442,33 @@ func writeHeap(w io.Writer, debug int) error {
// Print memstats information too.
// Pprof will ignore, but useful for people
- if debug > 0 {
- s := new(runtime.MemStats)
- runtime.ReadMemStats(s)
- fmt.Fprintf(w, "\n# runtime.MemStats\n")
- fmt.Fprintf(w, "# Alloc = %d\n", s.Alloc)
- fmt.Fprintf(w, "# TotalAlloc = %d\n", s.TotalAlloc)
- fmt.Fprintf(w, "# Sys = %d\n", s.Sys)
- fmt.Fprintf(w, "# Lookups = %d\n", s.Lookups)
- fmt.Fprintf(w, "# Mallocs = %d\n", s.Mallocs)
- fmt.Fprintf(w, "# Frees = %d\n", s.Frees)
+ s := new(runtime.MemStats)
+ runtime.ReadMemStats(s)
+ fmt.Fprintf(w, "\n# runtime.MemStats\n")
+ fmt.Fprintf(w, "# Alloc = %d\n", s.Alloc)
+ fmt.Fprintf(w, "# TotalAlloc = %d\n", s.TotalAlloc)
+ fmt.Fprintf(w, "# Sys = %d\n", s.Sys)
+ fmt.Fprintf(w, "# Lookups = %d\n", s.Lookups)
+ fmt.Fprintf(w, "# Mallocs = %d\n", s.Mallocs)
+ fmt.Fprintf(w, "# Frees = %d\n", s.Frees)
- fmt.Fprintf(w, "# HeapAlloc = %d\n", s.HeapAlloc)
- fmt.Fprintf(w, "# HeapSys = %d\n", s.HeapSys)
- fmt.Fprintf(w, "# HeapIdle = %d\n", s.HeapIdle)
- fmt.Fprintf(w, "# HeapInuse = %d\n", s.HeapInuse)
- fmt.Fprintf(w, "# HeapReleased = %d\n", s.HeapReleased)
- fmt.Fprintf(w, "# HeapObjects = %d\n", s.HeapObjects)
+ fmt.Fprintf(w, "# HeapAlloc = %d\n", s.HeapAlloc)
+ fmt.Fprintf(w, "# HeapSys = %d\n", s.HeapSys)
+ fmt.Fprintf(w, "# HeapIdle = %d\n", s.HeapIdle)
+ fmt.Fprintf(w, "# HeapInuse = %d\n", s.HeapInuse)
+ fmt.Fprintf(w, "# HeapReleased = %d\n", s.HeapReleased)
+ fmt.Fprintf(w, "# HeapObjects = %d\n", s.HeapObjects)
- fmt.Fprintf(w, "# Stack = %d / %d\n", s.StackInuse, s.StackSys)
- fmt.Fprintf(w, "# MSpan = %d / %d\n", s.MSpanInuse, s.MSpanSys)
- fmt.Fprintf(w, "# MCache = %d / %d\n", s.MCacheInuse, s.MCacheSys)
- fmt.Fprintf(w, "# BuckHashSys = %d\n", s.BuckHashSys)
+ fmt.Fprintf(w, "# Stack = %d / %d\n", s.StackInuse, s.StackSys)
+ fmt.Fprintf(w, "# MSpan = %d / %d\n", s.MSpanInuse, s.MSpanSys)
+ fmt.Fprintf(w, "# MCache = %d / %d\n", s.MCacheInuse, s.MCacheSys)
+ fmt.Fprintf(w, "# BuckHashSys = %d\n", s.BuckHashSys)
- fmt.Fprintf(w, "# NextGC = %d\n", s.NextGC)
- fmt.Fprintf(w, "# PauseNs = %d\n", s.PauseNs)
- fmt.Fprintf(w, "# NumGC = %d\n", s.NumGC)
- fmt.Fprintf(w, "# EnableGC = %v\n", s.EnableGC)
- fmt.Fprintf(w, "# DebugGC = %v\n", s.DebugGC)
- }
+ fmt.Fprintf(w, "# NextGC = %d\n", s.NextGC)
+ fmt.Fprintf(w, "# PauseNs = %d\n", s.PauseNs)
+ fmt.Fprintf(w, "# NumGC = %d\n", s.NumGC)
+ fmt.Fprintf(w, "# EnableGC = %v\n", s.EnableGC)
+ fmt.Fprintf(w, "# DebugGC = %v\n", s.DebugGC)
if tw != nil {
tw.Flush()
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index f725fc890b..805b96e627 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -203,7 +203,7 @@ func acquireSudog() *sudog {
// acquireSudog, acquireSudog calls new(sudog),
// new calls malloc, malloc can call the garbage collector,
// and the garbage collector calls the semaphore implementation
- // in stoptheworld.
+ // in stopTheWorld.
// Break the cycle by doing acquirem/releasem around new(sudog).
// The acquirem/releasem increments m.locks during new(sudog),
// which keeps the garbage collector from being invoked.
diff --git a/src/runtime/proc1.go b/src/runtime/proc1.go
index 00535da77d..c070f7d773 100644
--- a/src/runtime/proc1.go
+++ b/src/runtime/proc1.go
@@ -59,7 +59,6 @@ func schedinit() {
goargs()
goenvs()
parsedebugvars()
- wbshadowinit()
gcinit()
sched.lastpoll = uint64(nanotime())
@@ -212,7 +211,7 @@ func helpgc(nproc int32) {
// sched.stopwait to in order to request that all Gs permanently stop.
const freezeStopWait = 0x7fffffff
-// Similar to stoptheworld but best-effort and can be called several times.
+// Similar to stopTheWorld but best-effort and can be called several times.
// There is no reverse operation, used during crashing.
// This function must not lock any mutexes.
func freezetheworld() {
@@ -466,94 +465,68 @@ func stopscanstart(gp *g) {
}
}
-// Runs on g0 and does the actual work after putting the g back on the run queue.
-func mquiesce(gpmaster *g) {
- // enqueue the calling goroutine.
- restartg(gpmaster)
-
- activeglen := len(allgs)
- for i := 0; i < activeglen; i++ {
- gp := allgs[i]
- if readgstatus(gp) == _Gdead {
- gp.gcworkdone = true // noop scan.
- } else {
- gp.gcworkdone = false
- }
- stopscanstart(gp)
- }
-
- // Check that the G's gcwork (such as scanning) has been done. If not do it now.
- // You can end up doing work here if the page trap on a Grunning Goroutine has
- // not been sprung or in some race situations. For example a runnable goes dead
- // and is started up again with a gp->gcworkdone set to false.
- for i := 0; i < activeglen; i++ {
- gp := allgs[i]
- for !gp.gcworkdone {
- status := readgstatus(gp)
- if status == _Gdead {
- //do nothing, scan not needed.
- gp.gcworkdone = true // scan is a noop
- break
- }
- if status == _Grunning && gp.stackguard0 == uintptr(stackPreempt) && notetsleep(&sched.stopnote, 100*1000) { // nanosecond arg
- noteclear(&sched.stopnote)
- } else {
- stopscanstart(gp)
- }
- }
- }
-
- for i := 0; i < activeglen; i++ {
- gp := allgs[i]
- status := readgstatus(gp)
- if isscanstatus(status) {
- print("mstopandscang:bottom: post scan bad status gp=", gp, " has status ", hex(status), "\n")
- dumpgstatus(gp)
- }
- if !gp.gcworkdone && status != _Gdead {
- print("mstopandscang:bottom: post scan gp=", gp, "->gcworkdone still false\n")
- dumpgstatus(gp)
- }
- }
-
- schedule() // Never returns.
+// stopTheWorld stops all P's from executing goroutines, interrupting
+// all goroutines at GC safe points and records reason as the reason
+// for the stop. On return, only the current goroutine's P is running.
+// stopTheWorld must not be called from a system stack and the caller
+// must not hold worldsema. The caller must call startTheWorld when
+// other P's should resume execution.
+//
+// stopTheWorld is safe for multiple goroutines to call at the
+// same time. Each will execute its own stop, and the stops will
+// be serialized.
+//
+// This is also used by routines that do stack dumps. If the system is
+// in panic or being exited, this may not reliably stop all
+// goroutines.
+func stopTheWorld(reason string) {
+ semacquire(&worldsema, false)
+ getg().m.preemptoff = reason
+ systemstack(stopTheWorldWithSema)
}
-// quiesce moves all the goroutines to a GC safepoint which for now is a at preemption point.
-// If the global gcphase is GCmark quiesce will ensure that all of the goroutine's stacks
-// have been scanned before it returns.
-func quiesce(mastergp *g) {
- castogscanstatus(mastergp, _Grunning, _Gscanenqueue)
- // Now move this to the g0 (aka m) stack.
- // g0 will potentially scan this thread and put mastergp on the runqueue
- mcall(mquiesce)
+// startTheWorld undoes the effects of stopTheWorld.
+func startTheWorld() {
+ systemstack(startTheWorldWithSema)
+ // worldsema must be held over startTheWorldWithSema to ensure
+ // gomaxprocs cannot change while worldsema is held.
+ semrelease(&worldsema)
+ getg().m.preemptoff = ""
}
-// Holding worldsema grants an M the right to try to stop the world.
-// The procedure is:
+// Holding worldsema grants an M the right to try to stop the world
+// and prevents gomaxprocs from changing concurrently.
+var worldsema uint32 = 1
+
+// stopTheWorldWithSema is the core implementation of stopTheWorld.
+// The caller is responsible for acquiring worldsema and disabling
+// preemption first and then should call stopTheWorldWithSema on the system
+// stack:
//
-// semacquire(&worldsema);
-// m.preemptoff = "reason";
-// stoptheworld();
+// semacquire(&worldsema, false)
+// m.preemptoff = "reason"
+// systemstack(stopTheWorldWithSema)
//
-// ... do stuff ...
+// When finished, the caller must either call startTheWorld or undo
+// these three operations separately:
//
-// m.preemptoff = "";
-// semrelease(&worldsema);
-// starttheworld();
+// m.preemptoff = ""
+// systemstack(startTheWorldWithSema)
+// semrelease(&worldsema)
//
-var worldsema uint32 = 1
-
-// This is used by the GC as well as the routines that do stack dumps. In the case
-// of GC all the routines can be reliably stopped. This is not always the case
-// when the system is in panic or being exited.
-func stoptheworld() {
+// It is allowed to acquire worldsema once and then execute multiple
+// startTheWorldWithSema/stopTheWorldWithSema pairs.
+// Other P's are able to execute between successive calls to
+// startTheWorldWithSema and stopTheWorldWithSema.
+// Holding worldsema causes any other goroutines invoking
+// stopTheWorld to block.
+func stopTheWorldWithSema() {
_g_ := getg()
// If we hold a lock, then we won't be able to stop another M
// that is blocked trying to acquire the lock.
if _g_.m.locks > 0 {
- throw("stoptheworld: holding locks")
+ throw("stopTheWorld: holding locks")
}
lock(&sched.lock)
@@ -600,12 +573,12 @@ func stoptheworld() {
}
}
if sched.stopwait != 0 {
- throw("stoptheworld: not stopped")
+ throw("stopTheWorld: not stopped")
}
for i := 0; i < int(gomaxprocs); i++ {
p := allp[i]
if p.status != _Pgcstop {
- throw("stoptheworld: not stopped")
+ throw("stopTheWorld: not stopped")
}
}
}
@@ -615,7 +588,7 @@ func mhelpgc() {
_g_.m.helpgc = -1
}
-func starttheworld() {
+func startTheWorldWithSema() {
_g_ := getg()
_g_.m.locks++ // disable preemption because it can be holding p in a local var
@@ -644,7 +617,7 @@ func starttheworld() {
mp := p.m.ptr()
p.m = 0
if mp.nextp != 0 {
- throw("starttheworld: inconsistent mp->nextp")
+ throw("startTheWorld: inconsistent mp->nextp")
}
mp.nextp.set(p)
notewakeup(&mp.park)
@@ -754,10 +727,10 @@ func forEachP(fn func(*p)) {
_p_ := getg().m.p.ptr()
lock(&sched.lock)
- if sched.stopwait != 0 {
- throw("forEachP: sched.stopwait != 0")
+ if sched.safePointWait != 0 {
+ throw("forEachP: sched.safePointWait != 0")
}
- sched.stopwait = gomaxprocs - 1
+ sched.safePointWait = gomaxprocs - 1
sched.safePointFn = fn
// Ask all Ps to run the safe point function.
@@ -777,11 +750,11 @@ func forEachP(fn func(*p)) {
for p := sched.pidle.ptr(); p != nil; p = p.link.ptr() {
if cas(&p.runSafePointFn, 1, 0) {
fn(p)
- sched.stopwait--
+ sched.safePointWait--
}
}
- wait := sched.stopwait > 0
+ wait := sched.safePointWait > 0
unlock(&sched.lock)
// Run fn for the current P.
@@ -807,15 +780,15 @@ func forEachP(fn func(*p)) {
for {
// Wait for 100us, then try to re-preempt in
// case of any races.
- if notetsleep(&sched.stopnote, 100*1000) {
- noteclear(&sched.stopnote)
+ if notetsleep(&sched.safePointNote, 100*1000) {
+ noteclear(&sched.safePointNote)
break
}
preemptall()
}
}
- if sched.stopwait != 0 {
- throw("forEachP: not stopped")
+ if sched.safePointWait != 0 {
+ throw("forEachP: not done")
}
for i := 0; i < int(gomaxprocs); i++ {
p := allp[i]
@@ -851,9 +824,9 @@ func runSafePointFn() {
}
sched.safePointFn(p)
lock(&sched.lock)
- sched.stopwait--
- if sched.stopwait == 0 {
- notewakeup(&sched.stopnote)
+ sched.safePointWait--
+ if sched.safePointWait == 0 {
+ notewakeup(&sched.safePointNote)
}
unlock(&sched.lock)
}
@@ -971,6 +944,7 @@ func needm(x byte) {
_g_.stack.lo = uintptr(noescape(unsafe.Pointer(&x))) - 32*1024
_g_.stackguard0 = _g_.stack.lo + _StackGuard
+ msigsave(mp)
// Initialize this thread to use the m.
asminit()
minit()
@@ -1098,6 +1072,7 @@ func unlockextra(mp *m) {
func newm(fn func(), _p_ *p) {
mp := allocm(_p_, fn)
mp.nextp.set(_p_)
+ msigsave(mp)
if iscgo {
var ts cgothreadstart
if _cgo_thread_start == nil {
@@ -1226,9 +1201,9 @@ func handoffp(_p_ *p) {
}
if _p_.runSafePointFn != 0 && cas(&_p_.runSafePointFn, 1, 0) {
sched.safePointFn(_p_)
- sched.stopwait--
- if sched.stopwait == 0 {
- notewakeup(&sched.stopnote)
+ sched.safePointWait--
+ if sched.safePointWait == 0 {
+ notewakeup(&sched.safePointNote)
}
}
if sched.runqsize != 0 {
@@ -1305,7 +1280,7 @@ func startlockedm(gp *g) {
stopm()
}
-// Stops the current m for stoptheworld.
+// Stops the current m for stopTheWorld.
// Returns when the world is restarted.
func gcstopm() {
_g_ := getg()
@@ -1421,7 +1396,7 @@ top:
xadd(&sched.nmspinning, 1)
}
// random steal from other P's
- for i := 0; i < int(2*gomaxprocs); i++ {
+ for i := 0; i < int(4*gomaxprocs); i++ {
if sched.gcwaiting != 0 {
goto top
}
@@ -1430,18 +1405,20 @@ top:
if _p_ == _g_.m.p.ptr() {
gp, _ = runqget(_p_)
} else {
- gp = runqsteal(_g_.m.p.ptr(), _p_)
+ stealRunNextG := i > 2*int(gomaxprocs) // first look for ready queues with more than 1 g
+ gp = runqsteal(_g_.m.p.ptr(), _p_, stealRunNextG)
}
if gp != nil {
return gp, false
}
}
+
stop:
- // We have nothing to do. If we're in the GC mark phaseand can
+ // We have nothing to do. If we're in the GC mark phase and can
// safely scan and blacken objects, run idle-time marking
// rather than give up the P.
- if _p_ := _g_.m.p.ptr(); gcBlackenEnabled != 0 && _p_.gcBgMarkWorker != nil {
+ if _p_ := _g_.m.p.ptr(); gcBlackenEnabled != 0 && _p_.gcBgMarkWorker != nil && gcMarkWorkAvailable(_p_) {
_p_.gcMarkWorkerMode = gcMarkWorkerIdleMode
gp := _p_.gcBgMarkWorker
casgstatus(gp, _Gwaiting, _Grunnable)
@@ -2484,11 +2461,9 @@ func sigprof(pc, sp, lr uintptr, gp *g, mp *m) {
mp.mallocing++
// Define that a "user g" is a user-created goroutine, and a "system g"
- // is one that is m->g0 or m->gsignal. We've only made sure that we
- // can unwind user g's, so exclude the system g's.
+ // is one that is m->g0 or m->gsignal.
//
- // It is not quite as easy as testing gp == m->curg (the current user g)
- // because we might be interrupted for profiling halfway through a
+ // We might be interrupted for profiling halfway through a
// goroutine switch. The switch involves updating three (or four) values:
// g, PC, SP, and (on arm) LR. The PC must be the last to be updated,
// because once it gets updated the new g is running.
@@ -2497,8 +2472,7 @@ func sigprof(pc, sp, lr uintptr, gp *g, mp *m) {
// so the update only affects g, SP, and PC. Since PC must be last, there
// the possible partial transitions in ordinary execution are (1) g alone is updated,
// (2) both g and SP are updated, and (3) SP alone is updated.
- // If g is updated, we'll see a system g and not look closer.
- // If SP alone is updated, we can detect the partial transition by checking
+ // If SP or g alone is updated, we can detect the partial transition by checking
// whether the SP is within g's stack bounds. (We could also require that SP
// be changed only after g, but the stack bounds check is needed by other
// cases, so there is no need to impose an additional requirement.)
@@ -2527,15 +2501,11 @@ func sigprof(pc, sp, lr uintptr, gp *g, mp *m) {
// disabled, so a profiling signal cannot arrive then anyway.
//
// Third, the common case: it may be that the switch updates g, SP, and PC
- // separately, as in gogo.
- //
- // Because gogo is the only instance, we check whether the PC lies
- // within that function, and if so, not ask for a traceback. This approach
- // requires knowing the size of the gogo function, which we
- // record in arch_*.h and check in runtime_test.go.
+ // separately. If the PC is within any of the functions that do this,
+ // we don't ask for a traceback. See the function setsSP for more about this.
//
// There is another apparently viable approach, recorded here in case
- // the "PC within gogo" check turns out not to be usable.
+ // the "PC within setsSP function" check turns out not to be usable.
// It would be possible to delay the update of either g or SP until immediately
// before the PC update instruction. Then, because of the stack bounds check,
// the only problematic interrupt point is just before that PC update instruction,
@@ -2556,28 +2526,23 @@ func sigprof(pc, sp, lr uintptr, gp *g, mp *m) {
// transition. We simply require that g and SP match and that the PC is not
// in gogo.
traceback := true
- gogo := funcPC(gogo)
- if gp == nil || gp != mp.curg ||
- sp < gp.stack.lo || gp.stack.hi < sp ||
- (gogo <= pc && pc < gogo+_RuntimeGogoBytes) {
+ if gp == nil || sp < gp.stack.lo || gp.stack.hi < sp || setsSP(pc) {
traceback = false
}
-
var stk [maxCPUProfStack]uintptr
n := 0
- if traceback {
- n = gentraceback(pc, sp, lr, gp, 0, &stk[0], len(stk), nil, nil, _TraceTrap)
+ if mp.ncgo > 0 && mp.curg != nil && mp.curg.syscallpc != 0 && mp.curg.syscallsp != 0 {
+ // Cgo, we can't unwind and symbolize arbitrary C code,
+ // so instead collect Go stack that leads to the cgo call.
+ // This is especially important on windows, since all syscalls are cgo calls.
+ n = gentraceback(mp.curg.syscallpc, mp.curg.syscallsp, 0, mp.curg, 0, &stk[0], len(stk), nil, nil, 0)
+ } else if traceback {
+ n = gentraceback(pc, sp, lr, gp, 0, &stk[0], len(stk), nil, nil, _TraceTrap|_TraceJumpStack)
}
if !traceback || n <= 0 {
// Normal traceback is impossible or has failed.
// See if it falls into several common cases.
n = 0
- if mp.ncgo > 0 && mp.curg != nil && mp.curg.syscallpc != 0 && mp.curg.syscallsp != 0 {
- // Cgo, we can't unwind and symbolize arbitrary C code,
- // so instead collect Go stack that leads to the cgo call.
- // This is especially important on windows, since all syscalls are cgo calls.
- n = gentraceback(mp.curg.syscallpc, mp.curg.syscallsp, 0, mp.curg, 0, &stk[0], len(stk), nil, nil, 0)
- }
if GOOS == "windows" && n == 0 && mp.libcallg != 0 && mp.libcallpc != 0 && mp.libcallsp != 0 {
// Libcall, i.e. runtime syscall on windows.
// Collect Go stack that leads to the call.
@@ -2612,6 +2577,30 @@ func sigprof(pc, sp, lr uintptr, gp *g, mp *m) {
mp.mallocing--
}
+// Reports whether a function will set the SP
+// to an absolute value. Important that
+// we don't traceback when these are at the bottom
+// of the stack since we can't be sure that we will
+// find the caller.
+//
+// If the function is not on the bottom of the stack
+// we assume that it will have set it up so that traceback will be consistent,
+// either by being a traceback terminating function
+// or putting one on the stack at the right offset.
+func setsSP(pc uintptr) bool {
+ f := findfunc(pc)
+ if f == nil {
+ // couldn't find the function for this PC,
+ // so assume the worst and stop traceback
+ return true
+ }
+ switch f.entry {
+ case gogoPC, systemstackPC, mcallPC, morestackPC:
+ return true
+ }
+ return false
+}
+
// Arrange to call fn with a traceback hz times a second.
func setcpuprofilerate_m(hz int32) {
// Force sane arguments.
@@ -3447,23 +3436,34 @@ func runqget(_p_ *p) (gp *g, inheritTime bool) {
}
}
-// Grabs a batch of goroutines from local runnable queue.
-// batch array must be of size len(p->runq)/2. Returns number of grabbed goroutines.
+// Grabs a batch of goroutines from _p_'s runnable queue into batch.
+// Batch is a ring buffer starting at batchHead.
+// Returns number of grabbed goroutines.
// Can be executed by any P.
-func runqgrab(_p_ *p, batch []*g) uint32 {
+func runqgrab(_p_ *p, batch *[256]*g, batchHead uint32, stealRunNextG bool) uint32 {
for {
h := atomicload(&_p_.runqhead) // load-acquire, synchronize with other consumers
t := atomicload(&_p_.runqtail) // load-acquire, synchronize with the producer
n := t - h
n = n - n/2
if n == 0 {
- // Try to steal from _p_.runnext.
- if next := _p_.runnext; next != 0 {
- if !_p_.runnext.cas(next, 0) {
- continue
+ if stealRunNextG {
+ // Try to steal from _p_.runnext.
+ if next := _p_.runnext; next != 0 {
+ // Sleep to ensure that _p_ isn't about to run the g we
+ // are about to steal.
+ // The important use case here is when the g running on _p_
+ // ready()s another g and then almost immediately blocks.
+ // Instead of stealing runnext in this window, back off
+ // to give _p_ a chance to schedule runnext. This will avoid
+ // thrashing gs between different Ps.
+ usleep(100)
+ if !_p_.runnext.cas(next, 0) {
+ continue
+ }
+ batch[batchHead%uint32(len(batch))] = next.ptr()
+ return 1
}
- batch[0] = next.ptr()
- return 1
}
return 0
}
@@ -3471,7 +3471,8 @@ func runqgrab(_p_ *p, batch []*g) uint32 {
continue
}
for i := uint32(0); i < n; i++ {
- batch[i] = _p_.runq[(h+i)%uint32(len(_p_.runq))]
+ g := _p_.runq[(h+i)%uint32(len(_p_.runq))]
+ batch[(batchHead+i)%uint32(len(batch))] = g
}
if cas(&_p_.runqhead, h, h+n) { // cas-release, commits consume
return n
@@ -3482,26 +3483,21 @@ func runqgrab(_p_ *p, batch []*g) uint32 {
// Steal half of elements from local runnable queue of p2
// and put onto local runnable queue of p.
// Returns one of the stolen elements (or nil if failed).
-func runqsteal(_p_, p2 *p) *g {
- var batch [len(_p_.runq) / 2]*g
-
- n := runqgrab(p2, batch[:])
+func runqsteal(_p_, p2 *p, stealRunNextG bool) *g {
+ t := _p_.runqtail
+ n := runqgrab(p2, &_p_.runq, t, stealRunNextG)
if n == 0 {
return nil
}
n--
- gp := batch[n]
+ gp := _p_.runq[(t+n)%uint32(len(_p_.runq))]
if n == 0 {
return gp
}
h := atomicload(&_p_.runqhead) // load-acquire, synchronize with consumers
- t := _p_.runqtail
if t-h+n >= uint32(len(_p_.runq)) {
throw("runqsteal: runq overflow")
}
- for i := uint32(0); i < n; i++ {
- _p_.runq[(t+i)%uint32(len(_p_.runq))] = batch[i]
- }
atomicstore(&_p_.runqtail, t+n) // store-release, makes the item available for consumption
return gp
}
@@ -3528,20 +3524,16 @@ func testSchedLocalQueue() {
}
}
-var pSink *p
-
func testSchedLocalQueueSteal() {
p1 := new(p)
p2 := new(p)
- pSink = p1 // Force to heap, too large to allocate on system stack ("G0 stack")
- pSink = p2 // Force to heap, too large to allocate on system stack ("G0 stack")
gs := make([]g, len(p1.runq))
for i := 0; i < len(p1.runq); i++ {
for j := 0; j < i; j++ {
gs[j].sig = 0
runqput(p1, &gs[j], false)
}
- gp := runqsteal(p2, p1)
+ gp := runqsteal(p2, p1, true)
s := 0
if gp != nil {
s++
diff --git a/src/runtime/proc_test.go b/src/runtime/proc_test.go
index 4c5712d32f..4471ee5afb 100644
--- a/src/runtime/proc_test.go
+++ b/src/runtime/proc_test.go
@@ -7,6 +7,7 @@ package runtime_test
import (
"math"
"runtime"
+ "runtime/debug"
"sync"
"sync/atomic"
"syscall"
@@ -104,8 +105,8 @@ func TestGoroutineParallelism(t *testing.T) {
defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(P))
// If runtime triggers a forced GC during this test then it will deadlock,
// since the goroutines can't be stopped/preempted.
- // So give this test as much time as possible.
- runtime.GC()
+ // Disable GC for this test (see issue #10958).
+ defer debug.SetGCPercent(debug.SetGCPercent(-1))
for try := 0; try < N; try++ {
done := make(chan bool)
x := uint32(0)
diff --git a/src/runtime/runtime-gdb_test.go b/src/runtime/runtime-gdb_test.go
index fe7d38a39c..f4014b2e05 100644
--- a/src/runtime/runtime-gdb_test.go
+++ b/src/runtime/runtime-gdb_test.go
@@ -59,7 +59,7 @@ func TestGdbPython(t *testing.T) {
cmd := exec.Command("go", "build", "-o", "a.exe")
cmd.Dir = dir
- out, err := cmd.CombinedOutput()
+ out, err := testEnv(cmd).CombinedOutput()
if err != nil {
t.Fatalf("building source %v\n%s", err, out)
}
@@ -85,7 +85,7 @@ func TestGdbPython(t *testing.T) {
// stack frames on RISC architectures.
canBackTrace := false
switch runtime.GOARCH {
- case "amd64", "386":
+ case "amd64", "386", "ppc64", "ppc64le", "arm", "arm64":
canBackTrace = true
args = append(args,
"-ex", "echo BEGIN goroutine 2 bt\n",
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index ac539b9a9d..3ee5d5d29d 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -266,6 +266,7 @@ type m struct {
// Fields not known to debuggers.
procid uint64 // for debuggers, but offset not hard-coded
gsignal *g // signal-handling g
+ sigmask [4]uintptr // storage for saved signal mask
tls [4]uintptr // thread-local storage (for x86 extern register)
mstartfn func()
curg *g // current running goroutine
@@ -441,7 +442,9 @@ type schedt struct {
// safepointFn should be called on each P at the next GC
// safepoint if p.runSafePointFn is set.
- safePointFn func(*p)
+ safePointFn func(*p)
+ safePointWait int32
+ safePointNote note
profilehz int32 // cpu profiling rate
@@ -467,15 +470,16 @@ type sigtabtt struct {
}
const (
- _SigNotify = 1 << 0 // let signal.Notify have signal, even if from kernel
- _SigKill = 1 << 1 // if signal.Notify doesn't take it, exit quietly
- _SigThrow = 1 << 2 // if signal.Notify doesn't take it, exit loudly
- _SigPanic = 1 << 3 // if the signal is from the kernel, panic
- _SigDefault = 1 << 4 // if the signal isn't explicitly requested, don't monitor it
- _SigHandling = 1 << 5 // our signal handler is registered
- _SigIgnored = 1 << 6 // the signal was ignored before we registered for it
- _SigGoExit = 1 << 7 // cause all runtime procs to exit (only used on Plan 9).
- _SigSetStack = 1 << 8 // add SA_ONSTACK to libc handler
+ _SigNotify = 1 << iota // let signal.Notify have signal, even if from kernel
+ _SigKill // if signal.Notify doesn't take it, exit quietly
+ _SigThrow // if signal.Notify doesn't take it, exit loudly
+ _SigPanic // if the signal is from the kernel, panic
+ _SigDefault // if the signal isn't explicitly requested, don't monitor it
+ _SigHandling // our signal handler is registered
+ _SigIgnored // the signal was ignored before we registered for it
+ _SigGoExit // cause all runtime procs to exit (only used on Plan 9).
+ _SigSetStack // add SA_ONSTACK to libc handler
+ _SigUnblock // unblocked in minit
)
// Layout of in-memory per-function information prepared by linker
@@ -594,8 +598,9 @@ type stkframe struct {
}
const (
- _TraceRuntimeFrames = 1 << 0 // include frames for internal runtime functions.
- _TraceTrap = 1 << 1 // the initial PC, SP are from a trap, not a return PC from a call
+ _TraceRuntimeFrames = 1 << iota // include frames for internal runtime functions.
+ _TraceTrap // the initial PC, SP are from a trap, not a return PC from a call
+ _TraceJumpStack // if traceback is on a systemstack, resume trace at g that called into it
)
const (
diff --git a/src/runtime/runtime_test.go b/src/runtime/runtime_test.go
index d4cccbf084..f65562ab91 100644
--- a/src/runtime/runtime_test.go
+++ b/src/runtime/runtime_test.go
@@ -6,13 +6,8 @@ package runtime_test
import (
"io"
- "io/ioutil"
- "os"
- "os/exec"
. "runtime"
"runtime/debug"
- "strconv"
- "strings"
"testing"
"unsafe"
)
@@ -88,53 +83,6 @@ func BenchmarkDeferMany(b *testing.B) {
}
}
-// The profiling signal handler needs to know whether it is executing runtime.gogo.
-// The constant RuntimeGogoBytes in arch_*.h gives the size of the function;
-// we don't have a way to obtain it from the linker (perhaps someday).
-// Test that the constant matches the size determined by 'go tool nm -S'.
-// The value reported will include the padding between runtime.gogo and the
-// next function in memory. That's fine.
-func TestRuntimeGogoBytes(t *testing.T) {
- switch GOOS {
- case "android", "nacl":
- t.Skipf("skipping on %s", GOOS)
- case "darwin":
- switch GOARCH {
- case "arm", "arm64":
- t.Skipf("skipping on %s/%s, no fork", GOOS, GOARCH)
- }
- }
-
- dir, err := ioutil.TempDir("", "go-build")
- if err != nil {
- t.Fatalf("failed to create temp directory: %v", err)
- }
- defer os.RemoveAll(dir)
-
- out, err := exec.Command("go", "build", "-o", dir+"/hello", "../../test/helloworld.go").CombinedOutput()
- if err != nil {
- t.Fatalf("building hello world: %v\n%s", err, out)
- }
-
- out, err = exec.Command("go", "tool", "nm", "-size", dir+"/hello").CombinedOutput()
- if err != nil {
- t.Fatalf("go tool nm: %v\n%s", err, out)
- }
-
- for _, line := range strings.Split(string(out), "\n") {
- f := strings.Fields(line)
- if len(f) == 4 && f[3] == "runtime.gogo" {
- size, _ := strconv.Atoi(f[1])
- if GogoBytes() != int32(size) {
- t.Fatalf("RuntimeGogoBytes = %d, should be %d", GogoBytes(), size)
- }
- return
- }
- }
-
- t.Fatalf("go tool nm did not report size for runtime.gogo")
-}
-
// golang.org/issue/7063
func TestStopCPUProfilingWithProfilerOff(t *testing.T) {
SetCPUProfileRate(0)
diff --git a/src/runtime/signal1_unix.go b/src/runtime/signal1_unix.go
index 7577d43a64..d3e9dac097 100644
--- a/src/runtime/signal1_unix.go
+++ b/src/runtime/signal1_unix.go
@@ -19,6 +19,19 @@ const (
// Signal forwarding is currently available only on Linux.
var fwdSig [_NSIG]uintptr
+// sigmask represents a general signal mask compatible with the GOOS
+// specific sigset types: the signal numbered x is represented by bit x-1
+// to match the representation expected by sigprocmask.
+type sigmask [(_NSIG + 31) / 32]uint32
+
+// channels for synchronizing signal mask updates with the signal mask
+// thread
+var (
+ disableSigChan chan uint32
+ enableSigChan chan uint32
+ maskUpdatedChan chan struct{}
+)
+
func initsig() {
// _NSIG is the number of signals on this operating system.
// sigtable should describe what to do for all the possible signals.
@@ -61,12 +74,17 @@ func sigenable(sig uint32) {
}
t := &sigtable[sig]
- if t.flags&_SigNotify != 0 && t.flags&_SigHandling == 0 {
- t.flags |= _SigHandling
- if getsig(int32(sig)) == _SIG_IGN {
- t.flags |= _SigIgnored
+ if t.flags&_SigNotify != 0 {
+ ensureSigM()
+ enableSigChan <- sig
+ <-maskUpdatedChan
+ if t.flags&_SigHandling == 0 {
+ t.flags |= _SigHandling
+ if getsig(int32(sig)) == _SIG_IGN {
+ t.flags |= _SigIgnored
+ }
+ setsig(int32(sig), funcPC(sighandler), true)
}
- setsig(int32(sig), funcPC(sighandler), true)
}
}
@@ -76,12 +94,17 @@ func sigdisable(sig uint32) {
}
t := &sigtable[sig]
- if t.flags&_SigNotify != 0 && t.flags&_SigHandling != 0 {
- t.flags &^= _SigHandling
- if t.flags&_SigIgnored != 0 {
- setsig(int32(sig), _SIG_IGN, true)
- } else {
- setsig(int32(sig), _SIG_DFL, true)
+ if t.flags&_SigNotify != 0 {
+ ensureSigM()
+ disableSigChan <- sig
+ <-maskUpdatedChan
+ if t.flags&_SigHandling != 0 {
+ t.flags &^= _SigHandling
+ if t.flags&_SigIgnored != 0 {
+ setsig(int32(sig), _SIG_IGN, true)
+ } else {
+ setsig(int32(sig), _SIG_DFL, true)
+ }
}
}
}
@@ -130,7 +153,52 @@ func crash() {
}
}
- unblocksignals()
+ updatesigmask(sigmask{})
setsig(_SIGABRT, _SIG_DFL, false)
raise(_SIGABRT)
}
+
+// ensureSigM starts one global, sleeping thread to make sure at least one thread
+// is available to catch signals enabled for os/signal.
+func ensureSigM() {
+ if maskUpdatedChan != nil {
+ return
+ }
+ maskUpdatedChan = make(chan struct{})
+ disableSigChan = make(chan uint32)
+ enableSigChan = make(chan uint32)
+ go func() {
+ // Signal masks are per-thread, so make sure this goroutine stays on one
+ // thread.
+ LockOSThread()
+ defer UnlockOSThread()
+ // The sigBlocked mask contains the signals not active for os/signal,
+ // initially all signals except the essential. When signal.Notify()/Stop is called,
+ // sigenable/sigdisable in turn notify this thread to update its signal
+ // mask accordingly.
+ var sigBlocked sigmask
+ for i := range sigBlocked {
+ sigBlocked[i] = ^uint32(0)
+ }
+ for i := range sigtable {
+ if sigtable[i].flags&_SigUnblock != 0 {
+ sigBlocked[(i-1)/32] &^= 1 << ((uint32(i) - 1) & 31)
+ }
+ }
+ updatesigmask(sigBlocked)
+ for {
+ select {
+ case sig := <-enableSigChan:
+ if b := sig - 1; b >= 0 {
+ sigBlocked[b/32] &^= (1 << (b & 31))
+ }
+ case sig := <-disableSigChan:
+ if b := sig - 1; b >= 0 {
+ sigBlocked[b/32] |= (1 << (b & 31))
+ }
+ }
+ updatesigmask(sigBlocked)
+ maskUpdatedChan <- struct{}{}
+ }
+ }()
+}
diff --git a/src/runtime/signal_darwin.go b/src/runtime/signal_darwin.go
index 32ecce0d7d..6cd18653d5 100644
--- a/src/runtime/signal_darwin.go
+++ b/src/runtime/signal_darwin.go
@@ -16,14 +16,14 @@ var sigtable = [...]sigTabT{
/* 1 */ {_SigNotify + _SigKill, "SIGHUP: terminal line hangup"},
/* 2 */ {_SigNotify + _SigKill, "SIGINT: interrupt"},
/* 3 */ {_SigNotify + _SigThrow, "SIGQUIT: quit"},
- /* 4 */ {_SigThrow, "SIGILL: illegal instruction"},
- /* 5 */ {_SigThrow, "SIGTRAP: trace trap"},
+ /* 4 */ {_SigThrow + _SigUnblock, "SIGILL: illegal instruction"},
+ /* 5 */ {_SigThrow + _SigUnblock, "SIGTRAP: trace trap"},
/* 6 */ {_SigNotify + _SigThrow, "SIGABRT: abort"},
/* 7 */ {_SigThrow, "SIGEMT: emulate instruction executed"},
- /* 8 */ {_SigPanic, "SIGFPE: floating-point exception"},
+ /* 8 */ {_SigPanic + _SigUnblock, "SIGFPE: floating-point exception"},
/* 9 */ {0, "SIGKILL: kill"},
- /* 10 */ {_SigPanic, "SIGBUS: bus error"},
- /* 11 */ {_SigPanic, "SIGSEGV: segmentation violation"},
+ /* 10 */ {_SigPanic + _SigUnblock, "SIGBUS: bus error"},
+ /* 11 */ {_SigPanic + _SigUnblock, "SIGSEGV: segmentation violation"},
/* 12 */ {_SigThrow, "SIGSYS: bad system call"},
/* 13 */ {_SigNotify, "SIGPIPE: write to broken pipe"},
/* 14 */ {_SigNotify, "SIGALRM: alarm clock"},
@@ -32,14 +32,14 @@ var sigtable = [...]sigTabT{
/* 17 */ {0, "SIGSTOP: stop"},
/* 18 */ {_SigNotify + _SigDefault, "SIGTSTP: keyboard stop"},
/* 19 */ {0, "SIGCONT: continue after stop"},
- /* 20 */ {_SigNotify, "SIGCHLD: child status has changed"},
+ /* 20 */ {_SigNotify + _SigUnblock, "SIGCHLD: child status has changed"},
/* 21 */ {_SigNotify + _SigDefault, "SIGTTIN: background read from tty"},
/* 22 */ {_SigNotify + _SigDefault, "SIGTTOU: background write to tty"},
/* 23 */ {_SigNotify, "SIGIO: i/o now possible"},
/* 24 */ {_SigNotify, "SIGXCPU: cpu limit exceeded"},
/* 25 */ {_SigNotify, "SIGXFSZ: file size limit exceeded"},
/* 26 */ {_SigNotify, "SIGVTALRM: virtual alarm clock"},
- /* 27 */ {_SigNotify, "SIGPROF: profiling alarm clock"},
+ /* 27 */ {_SigNotify + _SigUnblock, "SIGPROF: profiling alarm clock"},
/* 28 */ {_SigNotify, "SIGWINCH: window size change"},
/* 29 */ {_SigNotify, "SIGINFO: status request from keyboard"},
/* 30 */ {_SigNotify, "SIGUSR1: user-defined signal 1"},
diff --git a/src/runtime/signal_linux.go b/src/runtime/signal_linux.go
index f8250b9fa1..2f25b59663 100644
--- a/src/runtime/signal_linux.go
+++ b/src/runtime/signal_linux.go
@@ -16,20 +16,20 @@ var sigtable = [...]sigTabT{
/* 1 */ {_SigNotify + _SigKill, "SIGHUP: terminal line hangup"},
/* 2 */ {_SigNotify + _SigKill, "SIGINT: interrupt"},
/* 3 */ {_SigNotify + _SigThrow, "SIGQUIT: quit"},
- /* 4 */ {_SigThrow, "SIGILL: illegal instruction"},
- /* 5 */ {_SigThrow, "SIGTRAP: trace trap"},
+ /* 4 */ {_SigThrow + _SigUnblock, "SIGILL: illegal instruction"},
+ /* 5 */ {_SigThrow + _SigUnblock, "SIGTRAP: trace trap"},
/* 6 */ {_SigNotify + _SigThrow, "SIGABRT: abort"},
- /* 7 */ {_SigPanic, "SIGBUS: bus error"},
- /* 8 */ {_SigPanic, "SIGFPE: floating-point exception"},
+ /* 7 */ {_SigPanic + _SigUnblock, "SIGBUS: bus error"},
+ /* 8 */ {_SigPanic + _SigUnblock, "SIGFPE: floating-point exception"},
/* 9 */ {0, "SIGKILL: kill"},
/* 10 */ {_SigNotify, "SIGUSR1: user-defined signal 1"},
- /* 11 */ {_SigPanic, "SIGSEGV: segmentation violation"},
+ /* 11 */ {_SigPanic + _SigUnblock, "SIGSEGV: segmentation violation"},
/* 12 */ {_SigNotify, "SIGUSR2: user-defined signal 2"},
/* 13 */ {_SigNotify, "SIGPIPE: write to broken pipe"},
/* 14 */ {_SigNotify, "SIGALRM: alarm clock"},
/* 15 */ {_SigNotify + _SigKill, "SIGTERM: termination"},
- /* 16 */ {_SigThrow, "SIGSTKFLT: stack fault"},
- /* 17 */ {_SigNotify, "SIGCHLD: child status has changed"},
+ /* 16 */ {_SigThrow + _SigUnblock, "SIGSTKFLT: stack fault"},
+ /* 17 */ {_SigNotify + _SigUnblock, "SIGCHLD: child status has changed"},
/* 18 */ {0, "SIGCONT: continue"},
/* 19 */ {0, "SIGSTOP: stop, unblockable"},
/* 20 */ {_SigNotify + _SigDefault, "SIGTSTP: keyboard stop"},
@@ -39,7 +39,7 @@ var sigtable = [...]sigTabT{
/* 24 */ {_SigNotify, "SIGXCPU: cpu limit exceeded"},
/* 25 */ {_SigNotify, "SIGXFSZ: file size limit exceeded"},
/* 26 */ {_SigNotify, "SIGVTALRM: virtual alarm clock"},
- /* 27 */ {_SigNotify, "SIGPROF: profiling alarm clock"},
+ /* 27 */ {_SigNotify + _SigUnblock, "SIGPROF: profiling alarm clock"},
/* 28 */ {_SigNotify, "SIGWINCH: window size change"},
/* 29 */ {_SigNotify, "SIGIO: i/o now possible"},
/* 30 */ {_SigNotify, "SIGPWR: power failure restart"},
diff --git a/src/runtime/signal_netbsd.go b/src/runtime/signal_netbsd.go
index 78afc59efa..d93a450d98 100644
--- a/src/runtime/signal_netbsd.go
+++ b/src/runtime/signal_netbsd.go
@@ -14,14 +14,14 @@ var sigtable = [...]sigTabT{
/* 1 */ {_SigNotify + _SigKill, "SIGHUP: terminal line hangup"},
/* 2 */ {_SigNotify + _SigKill, "SIGINT: interrupt"},
/* 3 */ {_SigNotify + _SigThrow, "SIGQUIT: quit"},
- /* 4 */ {_SigThrow, "SIGILL: illegal instruction"},
- /* 5 */ {_SigThrow, "SIGTRAP: trace trap"},
+ /* 4 */ {_SigThrow + _SigUnblock, "SIGILL: illegal instruction"},
+ /* 5 */ {_SigThrow + _SigUnblock, "SIGTRAP: trace trap"},
/* 6 */ {_SigNotify + _SigThrow, "SIGABRT: abort"},
/* 7 */ {_SigThrow, "SIGEMT: emulate instruction executed"},
- /* 8 */ {_SigPanic, "SIGFPE: floating-point exception"},
+ /* 8 */ {_SigPanic + _SigUnblock, "SIGFPE: floating-point exception"},
/* 9 */ {0, "SIGKILL: kill"},
- /* 10 */ {_SigPanic, "SIGBUS: bus error"},
- /* 11 */ {_SigPanic, "SIGSEGV: segmentation violation"},
+ /* 10 */ {_SigPanic + _SigUnblock, "SIGBUS: bus error"},
+ /* 11 */ {_SigPanic + _SigUnblock, "SIGSEGV: segmentation violation"},
/* 12 */ {_SigThrow, "SIGSYS: bad system call"},
/* 13 */ {_SigNotify, "SIGPIPE: write to broken pipe"},
/* 14 */ {_SigNotify, "SIGALRM: alarm clock"},
@@ -30,14 +30,14 @@ var sigtable = [...]sigTabT{
/* 17 */ {0, "SIGSTOP: stop"},
/* 18 */ {_SigNotify + _SigDefault, "SIGTSTP: keyboard stop"},
/* 19 */ {0, "SIGCONT: continue after stop"},
- /* 20 */ {_SigNotify, "SIGCHLD: child status has changed"},
+ /* 20 */ {_SigNotify + _SigUnblock, "SIGCHLD: child status has changed"},
/* 21 */ {_SigNotify + _SigDefault, "SIGTTIN: background read from tty"},
/* 22 */ {_SigNotify + _SigDefault, "SIGTTOU: background write to tty"},
/* 23 */ {_SigNotify, "SIGIO: i/o now possible"},
/* 24 */ {_SigNotify, "SIGXCPU: cpu limit exceeded"},
/* 25 */ {_SigNotify, "SIGXFSZ: file size limit exceeded"},
/* 26 */ {_SigNotify, "SIGVTALRM: virtual alarm clock"},
- /* 27 */ {_SigNotify, "SIGPROF: profiling alarm clock"},
+ /* 27 */ {_SigNotify + _SigUnblock, "SIGPROF: profiling alarm clock"},
/* 28 */ {_SigNotify, "SIGWINCH: window size change"},
/* 29 */ {_SigNotify, "SIGINFO: status request from keyboard"},
/* 30 */ {_SigNotify, "SIGUSR1: user-defined signal 1"},
diff --git a/src/runtime/signal_solaris.go b/src/runtime/signal_solaris.go
index 2986c5aabc..d8ac676846 100644
--- a/src/runtime/signal_solaris.go
+++ b/src/runtime/signal_solaris.go
@@ -14,21 +14,21 @@ var sigtable = [...]sigTabT{
/* 1 */ {_SigNotify + _SigKill, "SIGHUP: hangup"},
/* 2 */ {_SigNotify + _SigKill, "SIGINT: interrupt (rubout)"},
/* 3 */ {_SigNotify + _SigThrow, "SIGQUIT: quit (ASCII FS)"},
- /* 4 */ {_SigThrow, "SIGILL: illegal instruction (not reset when caught)"},
- /* 5 */ {_SigThrow, "SIGTRAP: trace trap (not reset when caught)"},
+ /* 4 */ {_SigThrow + _SigUnblock, "SIGILL: illegal instruction (not reset when caught)"},
+ /* 5 */ {_SigThrow + _SigUnblock, "SIGTRAP: trace trap (not reset when caught)"},
/* 6 */ {_SigNotify + _SigThrow, "SIGABRT: used by abort, replace SIGIOT in the future"},
/* 7 */ {_SigThrow, "SIGEMT: EMT instruction"},
- /* 8 */ {_SigPanic, "SIGFPE: floating point exception"},
+ /* 8 */ {_SigPanic + _SigUnblock, "SIGFPE: floating point exception"},
/* 9 */ {0, "SIGKILL: kill (cannot be caught or ignored)"},
- /* 10 */ {_SigPanic, "SIGBUS: bus error"},
- /* 11 */ {_SigPanic, "SIGSEGV: segmentation violation"},
+ /* 10 */ {_SigPanic + _SigUnblock, "SIGBUS: bus error"},
+ /* 11 */ {_SigPanic + _SigUnblock, "SIGSEGV: segmentation violation"},
/* 12 */ {_SigThrow, "SIGSYS: bad argument to system call"},
/* 13 */ {_SigNotify, "SIGPIPE: write on a pipe with no one to read it"},
/* 14 */ {_SigNotify, "SIGALRM: alarm clock"},
/* 15 */ {_SigNotify + _SigKill, "SIGTERM: software termination signal from kill"},
/* 16 */ {_SigNotify, "SIGUSR1: user defined signal 1"},
/* 17 */ {_SigNotify, "SIGUSR2: user defined signal 2"},
- /* 18 */ {_SigNotify, "SIGCHLD: child status change alias (POSIX)"},
+ /* 18 */ {_SigNotify + _SigUnblock, "SIGCHLD: child status change alias (POSIX)"},
/* 19 */ {_SigNotify, "SIGPWR: power-fail restart"},
/* 20 */ {_SigNotify, "SIGWINCH: window size change"},
/* 21 */ {_SigNotify, "SIGURG: urgent socket condition"},
@@ -39,7 +39,7 @@ var sigtable = [...]sigTabT{
/* 26 */ {_SigNotify + _SigDefault, "SIGTTIN: background tty read attempted"},
/* 27 */ {_SigNotify + _SigDefault, "SIGTTOU: background tty write attempted"},
/* 28 */ {_SigNotify, "SIGVTALRM: virtual timer expired"},
- /* 29 */ {_SigNotify, "SIGPROF: profiling timer expired"},
+ /* 29 */ {_SigNotify + _SigUnblock, "SIGPROF: profiling timer expired"},
/* 30 */ {_SigNotify, "SIGXCPU: exceeded cpu limit"},
/* 31 */ {_SigNotify, "SIGXFSZ: exceeded file size limit"},
/* 32 */ {_SigNotify, "SIGWAITING: reserved signal no longer used by"},
diff --git a/src/runtime/signal_windows.go b/src/runtime/signal_windows.go
index da8a1c5801..b2fce53534 100644
--- a/src/runtime/signal_windows.go
+++ b/src/runtime/signal_windows.go
@@ -131,7 +131,9 @@ func lastcontinuehandler(info *exceptionrecord, r *context, gp *g) int32 {
print("PC=", hex(r.ip()), "\n")
if _g_.m.lockedg != nil && _g_.m.ncgo > 0 && gp == _g_.m.g0 {
- print("signal arrived during cgo execution\n")
+ if iscgo {
+ print("signal arrived during external code execution\n")
+ }
gp = _g_.m.lockedg
}
print("\n")
diff --git a/src/runtime/sigqueue_plan9.go b/src/runtime/sigqueue_plan9.go
index 38f0a57b90..f000fabd1a 100644
--- a/src/runtime/sigqueue_plan9.go
+++ b/src/runtime/sigqueue_plan9.go
@@ -17,21 +17,29 @@ var sig struct {
sleeping bool
}
+type noteData struct {
+ s [_ERRMAX]byte
+ n int // n bytes of s are valid
+}
+
type noteQueue struct {
lock mutex
- data [qsize]*byte
+ data [qsize]noteData
ri int
wi int
full bool
}
+// It is not allowed to allocate memory in the signal handler.
func (q *noteQueue) push(item *byte) bool {
lock(&q.lock)
if q.full {
unlock(&q.lock)
return false
}
- q.data[q.wi] = item
+ s := gostringnocopy(item)
+ copy(q.data[q.wi].s[:], s)
+ q.data[q.wi].n = len(s)
q.wi++
if q.wi == qsize {
q.wi = 0
@@ -43,14 +51,15 @@ func (q *noteQueue) push(item *byte) bool {
return true
}
-func (q *noteQueue) pop() *byte {
+func (q *noteQueue) pop() string {
lock(&q.lock)
q.full = false
if q.ri == q.wi {
unlock(&q.lock)
- return nil
+ return ""
}
- item := q.data[q.ri]
+ note := &q.data[q.ri]
+ item := string(note.s[:note.n])
q.ri++
if q.ri == qsize {
q.ri = 0
@@ -86,8 +95,8 @@ func sendNote(s *byte) bool {
func signal_recv() string {
for {
note := sig.q.pop()
- if note != nil {
- return gostring(note)
+ if note != "" {
+ return note
}
lock(&sig.lock)
diff --git a/src/runtime/slice.go b/src/runtime/slice.go
index 5ccc6592bf..79b611839d 100644
--- a/src/runtime/slice.go
+++ b/src/runtime/slice.go
@@ -84,10 +84,13 @@ func growslice(t *slicetype, old slice, n int) slice {
memclr(add(p, lenmem), capmem-lenmem)
} else {
// Note: can't use rawmem (which avoids zeroing of memory), because then GC can scan unitialized memory.
- // TODO(rsc): Use memmove when !writeBarrierEnabled.
p = newarray(et, uintptr(newcap))
- for i := 0; i < old.len; i++ {
- typedmemmove(et, add(p, uintptr(i)*et.size), add(old.array, uintptr(i)*et.size))
+ if !writeBarrierEnabled {
+ memmove(p, old.array, lenmem)
+ } else {
+ for i := uintptr(0); i < lenmem; i += et.size {
+ typedmemmove(et, add(p, i), add(old.array, i))
+ }
}
}
diff --git a/src/runtime/stack1.go b/src/runtime/stack1.go
index f74694b7e9..27427af955 100644
--- a/src/runtime/stack1.go
+++ b/src/runtime/stack1.go
@@ -352,6 +352,12 @@ func adjustpointer(adjinfo *adjustinfo, vpp unsafe.Pointer) {
}
}
+// Information from the compiler about the layout of stack frames.
+type bitvector struct {
+ n int32 // # of bits
+ bytedata *uint8
+}
+
type gobitvector struct {
n uintptr
bytedata []uint8
@@ -381,20 +387,20 @@ func adjustpointers(scanp unsafe.Pointer, cbv *bitvector, adjinfo *adjustinfo, f
print(" ", add(scanp, i*ptrSize), ":", ptrnames[ptrbit(&bv, i)], ":", hex(*(*uintptr)(add(scanp, i*ptrSize))), " # ", i, " ", bv.bytedata[i/4], "\n")
}
if ptrbit(&bv, i) == 1 {
- p := *(*unsafe.Pointer)(add(scanp, i*ptrSize))
- up := uintptr(p)
- if f != nil && 0 < up && up < _PageSize && debug.invalidptr != 0 || up == poisonStack {
+ pp := (*uintptr)(add(scanp, i*ptrSize))
+ p := *pp
+ if f != nil && 0 < p && p < _PageSize && debug.invalidptr != 0 || p == poisonStack {
// Looks like a junk value in a pointer slot.
// Live analysis wrong?
getg().m.traceback = 2
- print("runtime: bad pointer in frame ", funcname(f), " at ", add(scanp, i*ptrSize), ": ", p, "\n")
+ print("runtime: bad pointer in frame ", funcname(f), " at ", pp, ": ", hex(p), "\n")
throw("invalid stack pointer")
}
- if minp <= up && up < maxp {
+ if minp <= p && p < maxp {
if stackDebug >= 3 {
print("adjust ptr ", p, " ", funcname(f), "\n")
}
- *(*unsafe.Pointer)(add(scanp, i*ptrSize)) = unsafe.Pointer(up + delta)
+ *pp = p + delta
}
}
}
diff --git a/src/runtime/symtab.go b/src/runtime/symtab.go
index 25f5bf46fb..687f067cb9 100644
--- a/src/runtime/symtab.go
+++ b/src/runtime/symtab.go
@@ -32,6 +32,8 @@ const (
// moduledata records information about the layout of the executable
// image. It is written by the linker. Any changes here must be
// matched changes to the code in cmd/internal/ld/symtab.go:symtab.
+// moduledata is stored in read-only memory; none of the pointers here
+// are visible to the garbage collector.
type moduledata struct {
pclntable []byte
ftab []functab
@@ -48,18 +50,24 @@ type moduledata struct {
typelinks []*_type
- gcdatamask, gcbssmask bitvector
+ modulename string
+ modulehashes []modulehash
- // write barrier shadow data
- // 64-bit systems only, enabled by GODEBUG=wbshadow=1.
- // See also the shadow_* fields on mheap in mheap.go.
- shadow_data uintptr // data-addr + shadow_data = shadow data addr
- data_start uintptr // start of shadowed data addresses
- data_end uintptr // end of shadowed data addresses
+ gcdatamask, gcbssmask bitvector
next *moduledata
}
+// For each shared library a module links against, the linker creates an entry in the
+// moduledata.modulehashes slice containing the name of the module, the abi hash seen
+// at link time and a pointer to the runtime abi hash. These are checked in
+// moduledataverify1 below.
+type modulehash struct {
+ modulename string
+ linktimehash string
+ runtimehash *string
+}
+
var firstmoduledata moduledata // linker symbol
var lastmoduledatap *moduledata // linker symbol
@@ -124,6 +132,13 @@ func moduledataverify1(datap *moduledata) {
datap.maxpc != datap.ftab[nftab].entry {
throw("minpc or maxpc invalid")
}
+
+ for _, modulehash := range datap.modulehashes {
+ if modulehash.linktimehash != *modulehash.runtimehash {
+ println("abi mismatch detected between", datap.modulename, "and", modulehash.modulename)
+ throw("abi mismatch")
+ }
+ }
}
// FuncForPC returns a *Func describing the function that contains the
diff --git a/src/runtime/trace.go b/src/runtime/trace.go
index 3b7501b9b4..6da7baddc5 100644
--- a/src/runtime/trace.go
+++ b/src/runtime/trace.go
@@ -132,10 +132,7 @@ type traceBuf struct {
func StartTrace() error {
// Stop the world, so that we can take a consistent snapshot
// of all goroutines at the beginning of the trace.
- semacquire(&worldsema, false)
- _g_ := getg()
- _g_.m.preemptoff = "start tracing"
- systemstack(stoptheworld)
+ stopTheWorld("start tracing")
// We are in stop-the-world, but syscalls can finish and write to trace concurrently.
// Exitsyscall could check trace.enabled long before and then suddenly wake up
@@ -146,9 +143,7 @@ func StartTrace() error {
if trace.enabled || trace.shutdown {
unlock(&trace.bufLock)
- _g_.m.preemptoff = ""
- semrelease(&worldsema)
- systemstack(starttheworld)
+ startTheWorld()
return errorString("tracing is already enabled")
}
@@ -175,9 +170,7 @@ func StartTrace() error {
unlock(&trace.bufLock)
- _g_.m.preemptoff = ""
- semrelease(&worldsema)
- systemstack(starttheworld)
+ startTheWorld()
return nil
}
@@ -186,19 +179,14 @@ func StartTrace() error {
func StopTrace() {
// Stop the world so that we can collect the trace buffers from all p's below,
// and also to avoid races with traceEvent.
- semacquire(&worldsema, false)
- _g_ := getg()
- _g_.m.preemptoff = "stop tracing"
- systemstack(stoptheworld)
+ stopTheWorld("stop tracing")
// See the comment in StartTrace.
lock(&trace.bufLock)
if !trace.enabled {
unlock(&trace.bufLock)
- _g_.m.preemptoff = ""
- semrelease(&worldsema)
- systemstack(starttheworld)
+ startTheWorld()
return
}
@@ -236,9 +224,7 @@ func StopTrace() {
unlock(&trace.bufLock)
- _g_.m.preemptoff = ""
- semrelease(&worldsema)
- systemstack(starttheworld)
+ startTheWorld()
// The world is started but we've set trace.shutdown, so new tracing can't start.
// Wait for the trace reader to flush pending buffers and stop.
@@ -428,9 +414,9 @@ func traceEvent(ev byte, skip int, args ...uint64) {
// The caller checked that trace.enabled == true, but trace.enabled might have been
// turned off between the check and now. Check again. traceLockBuffer did mp.locks++,
- // StopTrace does stoptheworld, and stoptheworld waits for mp.locks to go back to zero,
+ // StopTrace does stopTheWorld, and stopTheWorld waits for mp.locks to go back to zero,
// so if we see trace.enabled == true now, we know it's true for the rest of the function.
- // Exitsyscall can run even during stoptheworld. The race with StartTrace/StopTrace
+ // Exitsyscall can run even during stopTheWorld. The race with StartTrace/StopTrace
// during tracing in exitsyscall is resolved by locking trace.bufLock in traceLockBuffer.
if !trace.enabled {
traceReleaseBuffer(pid)
@@ -733,7 +719,7 @@ func traceProcStart() {
}
func traceProcStop(pp *p) {
- // Sysmon and stoptheworld can stop Ps blocked in syscalls,
+ // Sysmon and stopTheWorld can stop Ps blocked in syscalls,
// to handle this we temporary employ the P.
mp := acquirem()
oldp := mp.p
@@ -807,7 +793,7 @@ func traceGoSysExit(ts int64) {
}
func traceGoSysBlock(pp *p) {
- // Sysmon and stoptheworld can declare syscalls running on remote Ps as blocked,
+ // Sysmon and stopTheWorld can declare syscalls running on remote Ps as blocked,
// to handle this we temporary employ the P.
mp := acquirem()
oldp := mp.p
diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go
index 9f34e37ea4..5ed601e6f3 100644
--- a/src/runtime/traceback.go
+++ b/src/runtime/traceback.go
@@ -46,6 +46,9 @@ var (
timerprocPC uintptr
gcBgMarkWorkerPC uintptr
systemstack_switchPC uintptr
+ systemstackPC uintptr
+
+ gogoPC uintptr
externalthreadhandlerp uintptr // initialized elsewhere
)
@@ -69,6 +72,10 @@ func tracebackinit() {
timerprocPC = funcPC(timerproc)
gcBgMarkWorkerPC = funcPC(gcBgMarkWorker)
systemstack_switchPC = funcPC(systemstack_switch)
+ systemstackPC = funcPC(systemstack)
+
+ // used by sigprof handler
+ gogoPC = funcPC(gogo)
}
// Traceback over the deferred function calls.
@@ -194,7 +201,14 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in
// Found an actual function.
// Derive frame pointer and link register.
if frame.fp == 0 {
- frame.fp = frame.sp + uintptr(funcspdelta(f, frame.pc))
+ // We want to jump over the systemstack switch. If we're running on the
+ // g0, this systemstack is at the top of the stack.
+ // If we're not on g0 or there's no curg, then this is a regular call.
+ sp := frame.sp
+ if flags&_TraceJumpStack != 0 && f.entry == systemstackPC && gp == g.m.g0 && gp.m.curg != nil {
+ sp = gp.m.curg.sched.sp
+ }
+ frame.fp = sp + uintptr(funcspdelta(f, frame.pc))
if !usesLR {
// On x86, call instruction pushes return PC before entering new function.
frame.fp += regSize
@@ -455,7 +469,7 @@ func setArgInfo(frame *stkframe, f *_func, needArgMap bool) {
throw("reflect mismatch")
}
bv := (*bitvector)(unsafe.Pointer(fn[1]))
- frame.arglen = uintptr(bv.n / 2 * ptrSize)
+ frame.arglen = uintptr(bv.n * ptrSize)
frame.argmap = bv
}
}
@@ -517,9 +531,10 @@ func traceback1(pc, sp, lr uintptr, gp *g, flags uint) {
func callers(skip int, pcbuf []uintptr) int {
sp := getcallersp(unsafe.Pointer(&skip))
pc := uintptr(getcallerpc(unsafe.Pointer(&skip)))
+ gp := getg()
var n int
systemstack(func() {
- n = gentraceback(pc, sp, 0, getg(), skip, &pcbuf[0], len(pcbuf), nil, nil, 0)
+ n = gentraceback(pc, sp, 0, gp, skip, &pcbuf[0], len(pcbuf), nil, nil, 0)
})
return n
}
diff --git a/src/runtime/type.go b/src/runtime/type.go
index 48df2a4382..45bdac8b91 100644
--- a/src/runtime/type.go
+++ b/src/runtime/type.go
@@ -20,17 +20,10 @@ type _type struct {
fieldalign uint8
kind uint8
alg *typeAlg
- // gc stores type info required for garbage collector.
- // If (kind&KindGCProg)==0, then gc[0] points at sparse GC bitmap
- // (no indirection), 4 bits per word.
- // If (kind&KindGCProg)!=0, then gc[1] points to a compiler-generated
- // read-only GC program; and gc[0] points to BSS space for sparse GC bitmap.
- // For huge types (>maxGCMask), runtime unrolls the program directly into
- // GC bitmap and gc[0] is not used. For moderately-sized types, runtime
- // unrolls the program into gc[0] space on first use. The first byte of gc[0]
- // (gc[0][0]) contains 'unroll' flag saying whether the program is already
- // unrolled into gc[0] or not.
- gc [2]uintptr
+ // gcdata stores the GC type data for the garbage collector.
+ // If the KindGCProg bit is set in kind, gcdata is a GC program.
+ // Otherwise it is a ptrmask bitmap. See mbitmap.go for details.
+ gcdata *byte
_string *string
x *uncommontype
ptrto *_type