author     Filippo Valsorda <filippo@golang.org>  2021-05-12 19:23:21 +0200
committer  Filippo Valsorda <filippo@golang.org>  2021-05-13 12:59:22 -0400
commit     ed1f812cefc3ece4b21241ba4cba0272cd2484ed (patch)
tree       2e336c94eb9797488234c0cb86b123c1c5f2932b /src/runtime
parent     ad1b6f3ee00ce2592503efec7a9793c4786f6274 (diff)
parent     9d0819b27ca248f9949e7cf6bf7cb9fe7cf574e8 (diff)
download   go-ed1f812cefc3ece4b21241ba4cba0272cd2484ed.tar.xz
[dev.boringcrypto] all: merge commit 9d0819b27c (CL 314609) into dev.boringcrypto
There used to be two BoringCrypto-specific behaviors related to cipher suites in crypto/tls:

1. in FIPS-only mode, only a restricted set of AES ciphers is allowed
2. NOT in FIPS-only mode, AES would be prioritized over ChaCha20 even if AES hardware was not available

The motivation of (2) is unclear, and BoringSSL doesn't have equivalent logic. This merge drops (2) and keeps (1).

Note that the list of FIPS-only ciphers no longer has priority semantics, but the default logic still sorts them the same way they used to be sorted.

Change-Id: I50544011085cfa2b087f323aebf5338c0bd2dd33
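For illustration, here is roughly what restriction (1) looks like from an application's point of view: a crypto/tls configuration pinned to an AES-GCM-only cipher suite list. This is a hedged sketch; the fipsCipherSuites slice below is an assumption standing in for the real list, which is internal to crypto/tls in BoringCrypto builds and applied automatically in FIPS-only mode.

    package main

    import (
        "crypto/tls"
        "log"
    )

    // fipsCipherSuites is a hypothetical allowlist standing in for the
    // restricted FIPS-only AES set; the real list is internal to
    // crypto/tls in BoringCrypto builds.
    var fipsCipherSuites = []uint16{
        tls.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
        tls.TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,
        tls.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
        tls.TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,
    }

    func main() {
        cfg := &tls.Config{
            MinVersion: tls.VersionTLS12,
            // CipherSuites only governs TLS 1.0-1.2; TLS 1.3 suites
            // are not configurable from application code.
            CipherSuites: fipsCipherSuites,
        }
        conn, err := tls.Dial("tcp", "example.com:443", cfg)
        if err != nil {
            log.Fatal(err)
        }
        defer conn.Close()
        log.Printf("negotiated suite: %#04x", conn.ConnectionState().CipherSuite)
    }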
Diffstat (limited to 'src/runtime')
-rw-r--r--  src/runtime/abi_test.go  113
-rw-r--r--  src/runtime/asm_386.s  8
-rw-r--r--  src/runtime/asm_amd64.s  374
-rw-r--r--  src/runtime/asm_arm.s  7
-rw-r--r--  src/runtime/asm_arm64.s  51
-rw-r--r--  src/runtime/asm_mips64x.s  4
-rw-r--r--  src/runtime/asm_mipsx.s  4
-rw-r--r--  src/runtime/asm_ppc64x.s  4
-rw-r--r--  src/runtime/asm_riscv64.s  4
-rw-r--r--  src/runtime/asm_s390x.s  4
-rw-r--r--  src/runtime/auxv_none.go  8
-rw-r--r--  src/runtime/cgo/abi_amd64.h  99
-rw-r--r--  src/runtime/cgo/asm_amd64.s  64
-rw-r--r--  src/runtime/cgo/callbacks_traceback.go  1
-rw-r--r--  src/runtime/cgo/dragonfly.go  1
-rw-r--r--  src/runtime/cgo/freebsd.go  1
-rw-r--r--  src/runtime/cgo/gcc_libinit_windows.c  2
-rw-r--r--  src/runtime/cgo/gcc_openbsd_mips64.c  67
-rw-r--r--  src/runtime/cgo/gcc_sigaction.c  2
-rw-r--r--  src/runtime/cgo/gcc_windows_amd64.c  2
-rw-r--r--  src/runtime/cgo/gcc_windows_arm64.c  46
-rw-r--r--  src/runtime/cgo/handle.go  109
-rw-r--r--  src/runtime/cgo/handle_test.go  103
-rw-r--r--  src/runtime/cgo/linux.go  1
-rw-r--r--  src/runtime/cgo/mmap.go  1
-rw-r--r--  src/runtime/cgo/netbsd.go  1
-rw-r--r--  src/runtime/cgo/openbsd.go  1
-rw-r--r--  src/runtime/cgo/setenv.go  1
-rw-r--r--  src/runtime/cgo/sigaction.go  1
-rw-r--r--  src/runtime/cgo_mmap.go  1
-rw-r--r--  src/runtime/cgo_ppc64x.go  1
-rw-r--r--  src/runtime/cgo_sigaction.go  1
-rw-r--r--  src/runtime/cgocall.go  2
-rw-r--r--  src/runtime/chan.go  30
-rw-r--r--  src/runtime/chan_test.go  22
-rw-r--r--  src/runtime/cputicks.go  9
-rw-r--r--  src/runtime/crash_cgo_test.go  1
-rw-r--r--  src/runtime/crash_nonunix_test.go  1
-rw-r--r--  src/runtime/crash_unix_test.go  74
-rw-r--r--  src/runtime/debug/panic_test.go  1
-rw-r--r--  src/runtime/debug_test.go  64
-rw-r--r--  src/runtime/debugcall.go  33
-rw-r--r--  src/runtime/debuglog_off.go  1
-rw-r--r--  src/runtime/debuglog_on.go  1
-rw-r--r--  src/runtime/defer_test.go  2
-rw-r--r--  src/runtime/defs1_linux.go  1
-rw-r--r--  src/runtime/defs2_linux.go  1
-rw-r--r--  src/runtime/defs3_linux.go  1
-rw-r--r--  src/runtime/defs_aix.go  1
-rw-r--r--  src/runtime/defs_aix_ppc64.go  1
-rw-r--r--  src/runtime/defs_arm_linux.go  1
-rw-r--r--  src/runtime/defs_darwin.go  1
-rw-r--r--  src/runtime/defs_dragonfly.go  5
-rw-r--r--  src/runtime/defs_dragonfly_amd64.go  4
-rw-r--r--  src/runtime/defs_freebsd.go  1
-rw-r--r--  src/runtime/defs_linux.go  1
-rw-r--r--  src/runtime/defs_linux_mips64x.go  1
-rw-r--r--  src/runtime/defs_linux_mipsx.go  1
-rw-r--r--  src/runtime/defs_netbsd.go  1
-rw-r--r--  src/runtime/defs_netbsd_386.go  1
-rw-r--r--  src/runtime/defs_netbsd_amd64.go  1
-rw-r--r--  src/runtime/defs_netbsd_arm.go  1
-rw-r--r--  src/runtime/defs_openbsd.go  1
-rw-r--r--  src/runtime/defs_openbsd_386.go  14
-rw-r--r--  src/runtime/defs_solaris.go  1
-rw-r--r--  src/runtime/defs_solaris_amd64.go  1
-rw-r--r--  src/runtime/env_posix.go  1
-rw-r--r--  src/runtime/error.go  2
-rw-r--r--  src/runtime/export_debug_regabiargs_off_test.go  17
-rw-r--r--  src/runtime/export_debug_regabiargs_on_test.go  47
-rw-r--r--  src/runtime/export_debug_test.go  42
-rw-r--r--  src/runtime/export_futex_test.go  1
-rw-r--r--  src/runtime/export_mmap_test.go  1
-rw-r--r--  src/runtime/export_pipe2_test.go  3
-rw-r--r--  src/runtime/export_pipe_test.go  3
-rw-r--r--  src/runtime/export_test.go  73
-rw-r--r--  src/runtime/export_unix_test.go  1
-rw-r--r--  src/runtime/extern.go  14
-rw-r--r--  src/runtime/funcdata.h  1
-rw-r--r--  src/runtime/futex_test.go  1
-rw-r--r--  src/runtime/gc_test.go  102
-rw-r--r--  src/runtime/hash32.go  137
-rw-r--r--  src/runtime/hash64.go  137
-rw-r--r--  src/runtime/heapdump.go  9
-rw-r--r--  src/runtime/histogram.go  4
-rw-r--r--  src/runtime/iface.go  35
-rw-r--r--  src/runtime/internal/atomic/asm_amd64.s  187
-rw-r--r--  src/runtime/internal/atomic/asm_arm64.s  61
-rw-r--r--  src/runtime/internal/atomic/asm_mips64x.s  271
-rw-r--r--  src/runtime/internal/atomic/asm_mipsx.s  200
-rw-r--r--  src/runtime/internal/atomic/asm_ppc64x.s  253
-rw-r--r--  src/runtime/internal/atomic/atomic_386.go  1
-rw-r--r--  src/runtime/internal/atomic/atomic_386.s (renamed from src/runtime/internal/atomic/asm_386.s)  28
-rw-r--r--  src/runtime/internal/atomic/atomic_amd64.s  225
-rw-r--r--  src/runtime/internal/atomic/atomic_arm.go  1
-rw-r--r--  src/runtime/internal/atomic/atomic_arm.s (renamed from src/runtime/internal/atomic/asm_arm.s)  33
-rw-r--r--  src/runtime/internal/atomic/atomic_arm64.go  1
-rw-r--r--  src/runtime/internal/atomic/atomic_arm64.s  139
-rw-r--r--  src/runtime/internal/atomic/atomic_mips64x.go  1
-rw-r--r--  src/runtime/internal/atomic/atomic_mips64x.s  316
-rw-r--r--  src/runtime/internal/atomic/atomic_mipsx.go  1
-rw-r--r--  src/runtime/internal/atomic/atomic_mipsx.s  233
-rw-r--r--  src/runtime/internal/atomic/atomic_ppc64x.go  1
-rw-r--r--  src/runtime/internal/atomic/atomic_ppc64x.s  295
-rw-r--r--  src/runtime/internal/atomic/atomic_riscv64.s  29
-rw-r--r--  src/runtime/internal/atomic/atomic_s390x.s (renamed from src/runtime/internal/atomic/asm_s390x.s)  32
-rw-r--r--  src/runtime/internal/atomic/atomic_wasm.go  72
-rw-r--r--  src/runtime/internal/atomic/atomic_wasm.s (renamed from src/runtime/internal/atomic/asm_wasm.s)  2
-rw-r--r--  src/runtime/internal/atomic/stubs.go  25
-rw-r--r--  src/runtime/internal/atomic/sys_linux_arm.s  2
-rw-r--r--  src/runtime/internal/math/math.go  21
-rw-r--r--  src/runtime/internal/sys/gengoos.go  1
-rw-r--r--  src/runtime/internal/sys/intrinsics.go  1
-rw-r--r--  src/runtime/internal/sys/intrinsics_stubs.go  1
-rw-r--r--  src/runtime/internal/sys/zgoarch_386.go  1
-rw-r--r--  src/runtime/internal/sys/zgoarch_amd64.go  1
-rw-r--r--  src/runtime/internal/sys/zgoarch_arm.go  1
-rw-r--r--  src/runtime/internal/sys/zgoarch_arm64.go  1
-rw-r--r--  src/runtime/internal/sys/zgoarch_arm64be.go  1
-rw-r--r--  src/runtime/internal/sys/zgoarch_armbe.go  1
-rw-r--r--  src/runtime/internal/sys/zgoarch_mips.go  1
-rw-r--r--  src/runtime/internal/sys/zgoarch_mips64.go  1
-rw-r--r--  src/runtime/internal/sys/zgoarch_mips64le.go  1
-rw-r--r--  src/runtime/internal/sys/zgoarch_mips64p32.go  1
-rw-r--r--  src/runtime/internal/sys/zgoarch_mips64p32le.go  1
-rw-r--r--  src/runtime/internal/sys/zgoarch_mipsle.go  1
-rw-r--r--  src/runtime/internal/sys/zgoarch_ppc.go  1
-rw-r--r--  src/runtime/internal/sys/zgoarch_ppc64.go  1
-rw-r--r--  src/runtime/internal/sys/zgoarch_ppc64le.go  1
-rw-r--r--  src/runtime/internal/sys/zgoarch_riscv.go  1
-rw-r--r--  src/runtime/internal/sys/zgoarch_riscv64.go  1
-rw-r--r--  src/runtime/internal/sys/zgoarch_s390.go  1
-rw-r--r--  src/runtime/internal/sys/zgoarch_s390x.go  1
-rw-r--r--  src/runtime/internal/sys/zgoarch_sparc.go  1
-rw-r--r--  src/runtime/internal/sys/zgoarch_sparc64.go  1
-rw-r--r--  src/runtime/internal/sys/zgoarch_wasm.go  1
-rw-r--r--  src/runtime/internal/sys/zgoos_aix.go  1
-rw-r--r--  src/runtime/internal/sys/zgoos_android.go  1
-rw-r--r--  src/runtime/internal/sys/zgoos_darwin.go  4
-rw-r--r--  src/runtime/internal/sys/zgoos_dragonfly.go  1
-rw-r--r--  src/runtime/internal/sys/zgoos_freebsd.go  1
-rw-r--r--  src/runtime/internal/sys/zgoos_hurd.go  1
-rw-r--r--  src/runtime/internal/sys/zgoos_illumos.go  1
-rw-r--r--  src/runtime/internal/sys/zgoos_ios.go  1
-rw-r--r--  src/runtime/internal/sys/zgoos_js.go  1
-rw-r--r--  src/runtime/internal/sys/zgoos_linux.go  4
-rw-r--r--  src/runtime/internal/sys/zgoos_netbsd.go  1
-rw-r--r--  src/runtime/internal/sys/zgoos_openbsd.go  1
-rw-r--r--  src/runtime/internal/sys/zgoos_plan9.go  1
-rw-r--r--  src/runtime/internal/sys/zgoos_solaris.go  4
-rw-r--r--  src/runtime/internal/sys/zgoos_windows.go  1
-rw-r--r--  src/runtime/internal/sys/zgoos_zos.go  1
-rw-r--r--  src/runtime/lfstack_32bit.go  1
-rw-r--r--  src/runtime/lfstack_64bit.go  1
-rw-r--r--  src/runtime/libfuzzer.go  1
-rw-r--r--  src/runtime/lock_futex.go  3
-rw-r--r--  src/runtime/lock_js.go  8
-rw-r--r--  src/runtime/lock_sema.go  3
-rw-r--r--  src/runtime/lockrank.go  32
-rw-r--r--  src/runtime/lockrank_off.go  1
-rw-r--r--  src/runtime/lockrank_on.go  27
-rw-r--r--  src/runtime/lockrank_test.go  41
-rw-r--r--  src/runtime/malloc.go  96
-rw-r--r--  src/runtime/malloc_test.go  6
-rw-r--r--  src/runtime/map.go  18
-rw-r--r--  src/runtime/map_faststr.go  3
-rw-r--r--  src/runtime/mbitmap.go  73
-rw-r--r--  src/runtime/mcache.go  49
-rw-r--r--  src/runtime/mcentral.go  25
-rw-r--r--  src/runtime/mem_bsd.go  1
-rw-r--r--  src/runtime/mem_js.go  1
-rw-r--r--  src/runtime/memclr_386.s  2
-rw-r--r--  src/runtime/memclr_amd64.s  107
-rw-r--r--  src/runtime/memmove_amd64.s  21
-rw-r--r--  src/runtime/memmove_ppc64x.s  2
-rw-r--r--  src/runtime/metrics.go  81
-rw-r--r--  src/runtime/metrics/description.go  60
-rw-r--r--  src/runtime/metrics/doc.go  35
-rw-r--r--  src/runtime/metrics_test.go  75
-rw-r--r--  src/runtime/mfinal.go  48
-rw-r--r--  src/runtime/mgc.go  942
-rw-r--r--  src/runtime/mgcmark.go  108
-rw-r--r--  src/runtime/mgcpacer.go  848
-rw-r--r--  src/runtime/mgcscavenge.go  16
-rw-r--r--  src/runtime/mgcscavenge_test.go  6
-rw-r--r--  src/runtime/mgcstack.go  22
-rw-r--r--  src/runtime/mgcsweep.go  207
-rw-r--r--  src/runtime/mgcwork.go  2
-rw-r--r--  src/runtime/mheap.go  155
-rw-r--r--  src/runtime/mkduff.go  1
-rw-r--r--  src/runtime/mkfastlog2table.go  1
-rw-r--r--  src/runtime/mkpreempt.go  20
-rw-r--r--  src/runtime/mksizeclasses.go  149
-rw-r--r--  src/runtime/mmap.go  1
-rw-r--r--  src/runtime/mpagealloc.go  3
-rw-r--r--  src/runtime/mpagealloc_32bit.go  1
-rw-r--r--  src/runtime/mpagealloc_64bit.go  1
-rw-r--r--  src/runtime/mprof.go  87
-rw-r--r--  src/runtime/msan.go  1
-rw-r--r--  src/runtime/msan0.go  1
-rw-r--r--  src/runtime/mstats.go  82
-rw-r--r--  src/runtime/nbpipe_fcntl_libc_test.go  1
-rw-r--r--  src/runtime/nbpipe_fcntl_unix_test.go  1
-rw-r--r--  src/runtime/nbpipe_pipe.go  3
-rw-r--r--  src/runtime/nbpipe_pipe2.go  3
-rw-r--r--  src/runtime/nbpipe_test.go  1
-rw-r--r--  src/runtime/netpoll.go  13
-rw-r--r--  src/runtime/netpoll_epoll.go  1
-rw-r--r--  src/runtime/netpoll_fake.go  1
-rw-r--r--  src/runtime/netpoll_kqueue.go  1
-rw-r--r--  src/runtime/netpoll_stub.go  1
-rw-r--r--  src/runtime/norace_linux_test.go  1
-rw-r--r--  src/runtime/norace_test.go  1
-rw-r--r--  src/runtime/os3_plan9.go  2
-rw-r--r--  src/runtime/os3_solaris.go  2
-rw-r--r--  src/runtime/os_aix.go  3
-rw-r--r--  src/runtime/os_darwin.go  13
-rw-r--r--  src/runtime/os_dragonfly.go  7
-rw-r--r--  src/runtime/os_freebsd2.go  1
-rw-r--r--  src/runtime/os_freebsd_noauxv.go  4
-rw-r--r--  src/runtime/os_js.go  1
-rw-r--r--  src/runtime/os_linux.go  2
-rw-r--r--  src/runtime/os_linux_arm64.go  1
-rw-r--r--  src/runtime/os_linux_be64.go  3
-rw-r--r--  src/runtime/os_linux_generic.go  9
-rw-r--r--  src/runtime/os_linux_mips64x.go  1
-rw-r--r--  src/runtime/os_linux_mipsx.go  1
-rw-r--r--  src/runtime/os_linux_noauxv.go  4
-rw-r--r--  src/runtime/os_linux_novdso.go  4
-rw-r--r--  src/runtime/os_linux_ppc64x.go  1
-rw-r--r--  src/runtime/os_linux_x86.go  1
-rw-r--r--  src/runtime/os_netbsd.go  10
-rw-r--r--  src/runtime/os_nonopenbsd.go  1
-rw-r--r--  src/runtime/os_only_solaris.go  1
-rw-r--r--  src/runtime/os_openbsd_libc.go  3
-rw-r--r--  src/runtime/os_openbsd_syscall.go  4
-rw-r--r--  src/runtime/os_openbsd_syscall1.go  3
-rw-r--r--  src/runtime/os_openbsd_syscall2.go  5
-rw-r--r--  src/runtime/os_plan9.go  18
-rw-r--r--  src/runtime/os_windows.go  143
-rw-r--r--  src/runtime/panic.go  130
-rw-r--r--  src/runtime/panic32.go  1
-rw-r--r--  src/runtime/plugin.go  3
-rw-r--r--  src/runtime/pprof/mprof_test.go  19
-rw-r--r--  src/runtime/pprof/pprof.go  58
-rw-r--r--  src/runtime/pprof/pprof_norusage.go  1
-rw-r--r--  src/runtime/pprof/pprof_rusage.go  1
-rw-r--r--  src/runtime/pprof/pprof_test.go  98
-rw-r--r--  src/runtime/preempt_amd64.s  10
-rw-r--r--  src/runtime/preempt_nonwindows.go  1
-rw-r--r--  src/runtime/proc.go  776
-rw-r--r--  src/runtime/proc_test.go  49
-rw-r--r--  src/runtime/race.go  1
-rw-r--r--  src/runtime/race/README  3
-rw-r--r--  src/runtime/race/output_test.go  45
-rw-r--r--  src/runtime/race/race.go  3
-rw-r--r--  src/runtime/race/race_linux_test.go  1
-rw-r--r--  src/runtime/race/race_openbsd_amd64.syso  bin 0 -> 688784 bytes
-rw-r--r--  src/runtime/race/race_test.go  1
-rw-r--r--  src/runtime/race/race_unix_test.go  1
-rw-r--r--  src/runtime/race/race_windows_test.go  1
-rw-r--r--  src/runtime/race/sched_test.go  1
-rw-r--r--  src/runtime/race/syso_test.go  1
-rw-r--r--  src/runtime/race/testdata/io_test.go  2
-rw-r--r--  src/runtime/race/testdata/mutex_test.go  9
-rw-r--r--  src/runtime/race/timer_test.go  1
-rw-r--r--  src/runtime/race0.go  1
-rw-r--r--  src/runtime/race_amd64.s  70
-rw-r--r--  src/runtime/race_arm64.s  10
-rw-r--r--  src/runtime/race_ppc64le.s  40
-rw-r--r--  src/runtime/relax_stub.go  1
-rw-r--r--  src/runtime/rt0_windows_amd64.s  14
-rw-r--r--  src/runtime/rt0_windows_arm64.s  17
-rw-r--r--  src/runtime/runtime-gdb_test.go  48
-rw-r--r--  src/runtime/runtime-lldb_test.go  8
-rw-r--r--  src/runtime/runtime1.go  2
-rw-r--r--  src/runtime/runtime2.go  77
-rw-r--r--  src/runtime/runtime_mmap_test.go  1
-rw-r--r--  src/runtime/runtime_unix_test.go  1
-rw-r--r--  src/runtime/semasleep_test.go  1
-rw-r--r--  src/runtime/sigaction.go  1
-rw-r--r--  src/runtime/signal_386.go  1
-rw-r--r--  src/runtime/signal_aix_ppc64.go  1
-rw-r--r--  src/runtime/signal_amd64.go  1
-rw-r--r--  src/runtime/signal_arm.go  1
-rw-r--r--  src/runtime/signal_arm64.go  1
-rw-r--r--  src/runtime/signal_linux_mips64x.go  1
-rw-r--r--  src/runtime/signal_linux_mipsx.go  1
-rw-r--r--  src/runtime/signal_linux_ppc64x.go  1
-rw-r--r--  src/runtime/signal_mips64x.go  1
-rw-r--r--  src/runtime/signal_mipsx.go  1
-rw-r--r--  src/runtime/signal_ppc64x.go  1
-rw-r--r--  src/runtime/signal_riscv64.go  1
-rw-r--r--  src/runtime/signal_unix.go  1
-rw-r--r--  src/runtime/signal_windows.go  7
-rw-r--r--  src/runtime/signal_windows_test.go  18
-rw-r--r--  src/runtime/sigqueue.go  5
-rw-r--r--  src/runtime/sigqueue_note.go  4
-rw-r--r--  src/runtime/sigtab_linux_generic.go  7
-rw-r--r--  src/runtime/sigtab_linux_mipsx.go  1
-rw-r--r--  src/runtime/sizeclasses.go  155
-rw-r--r--  src/runtime/sizeof_test.go  2
-rw-r--r--  src/runtime/slice.go  19
-rw-r--r--  src/runtime/stack.go  106
-rw-r--r--  src/runtime/string.go  16
-rw-r--r--  src/runtime/stubs.go  93
-rw-r--r--  src/runtime/stubs2.go  9
-rw-r--r--  src/runtime/stubs3.go  8
-rw-r--r--  src/runtime/stubs_amd64.go  7
-rw-r--r--  src/runtime/stubs_linux.go  1
-rw-r--r--  src/runtime/stubs_mips64x.go  1
-rw-r--r--  src/runtime/stubs_mipsx.go  1
-rw-r--r--  src/runtime/stubs_nonlinux.go  1
-rw-r--r--  src/runtime/stubs_ppc64x.go  1
-rw-r--r--  src/runtime/symtab.go  37
-rw-r--r--  src/runtime/symtab_test.go  85
-rw-r--r--  src/runtime/sys_darwin.go  101
-rw-r--r--  src/runtime/sys_darwin_amd64.s  155
-rw-r--r--  src/runtime/sys_dragonfly_amd64.s  56
-rw-r--r--  src/runtime/sys_freebsd_amd64.s  34
-rw-r--r--  src/runtime/sys_libc.go  3
-rw-r--r--  src/runtime/sys_linux_386.s  11
-rw-r--r--  src/runtime/sys_linux_amd64.s  114
-rw-r--r--  src/runtime/sys_linux_arm.s  2
-rw-r--r--  src/runtime/sys_linux_arm64.s  4
-rw-r--r--  src/runtime/sys_linux_mips64x.s  6
-rw-r--r--  src/runtime/sys_linux_mipsx.s  4
-rw-r--r--  src/runtime/sys_linux_ppc64x.s  4
-rw-r--r--  src/runtime/sys_linux_riscv64.s  4
-rw-r--r--  src/runtime/sys_linux_s390x.s  4
-rw-r--r--  src/runtime/sys_mips64x.go  1
-rw-r--r--  src/runtime/sys_mipsx.go  1
-rw-r--r--  src/runtime/sys_netbsd_386.s  8
-rw-r--r--  src/runtime/sys_netbsd_amd64.s  44
-rw-r--r--  src/runtime/sys_netbsd_arm.s  8
-rw-r--r--  src/runtime/sys_netbsd_arm64.s  8
-rw-r--r--  src/runtime/sys_nonppc64x.go  1
-rw-r--r--  src/runtime/sys_openbsd.go  3
-rw-r--r--  src/runtime/sys_openbsd1.go  3
-rw-r--r--  src/runtime/sys_openbsd2.go  5
-rw-r--r--  src/runtime/sys_openbsd3.go  3
-rw-r--r--  src/runtime/sys_openbsd_386.s  1217
-rw-r--r--  src/runtime/sys_openbsd_amd64.s  57
-rw-r--r--  src/runtime/sys_openbsd_arm.s  4
-rw-r--r--  src/runtime/sys_openbsd_mips64.s  4
-rw-r--r--  src/runtime/sys_plan9_386.s  4
-rw-r--r--  src/runtime/sys_plan9_amd64.s  4
-rw-r--r--  src/runtime/sys_plan9_arm.s  4
-rw-r--r--  src/runtime/sys_ppc64x.go  1
-rw-r--r--  src/runtime/sys_wasm.s  2
-rw-r--r--  src/runtime/sys_windows_386.s  171
-rw-r--r--  src/runtime/sys_windows_amd64.s  224
-rw-r--r--  src/runtime/sys_windows_arm.s  188
-rw-r--r--  src/runtime/sys_windows_arm64.s  182
-rw-r--r--  src/runtime/sys_x86.go  1
-rw-r--r--  src/runtime/syscall_windows.go  294
-rw-r--r--  src/runtime/syscall_windows_test.go  205
-rw-r--r--  src/runtime/testdata/testprog/crashdump.go  47
-rw-r--r--  src/runtime/testdata/testprogcgo/windows/win.go  2
-rw-r--r--  src/runtime/testdata/testwinlibsignal/dummy.go  3
-rw-r--r--  src/runtime/testdata/testwinsignal/main.go  38
-rw-r--r--  src/runtime/time.go  5
-rw-r--r--  src/runtime/time_fake.go  24
-rw-r--r--  src/runtime/time_linux_amd64.s  97
-rw-r--r--  src/runtime/time_nofake.go  5
-rw-r--r--  src/runtime/time_windows.h  16
-rw-r--r--  src/runtime/time_windows_386.s  85
-rw-r--r--  src/runtime/time_windows_amd64.s  56
-rw-r--r--  src/runtime/time_windows_arm.s  87
-rw-r--r--  src/runtime/time_windows_arm64.s  67
-rw-r--r--  src/runtime/timeasm.go  4
-rw-r--r--  src/runtime/timestub.go  3
-rw-r--r--  src/runtime/timestub2.go  4
-rw-r--r--  src/runtime/tls_ppc64x.s  4
-rw-r--r--  src/runtime/tls_stub.go  11
-rw-r--r--  src/runtime/tls_windows_amd64.go  10
-rw-r--r--  src/runtime/trace.go  19
-rw-r--r--  src/runtime/traceback.go  112
-rw-r--r--  src/runtime/traceback_test.go  121
-rw-r--r--  src/runtime/type.go  2
-rw-r--r--  src/runtime/vdso_elf32.go  1
-rw-r--r--  src/runtime/vdso_elf64.go  1
-rw-r--r--  src/runtime/vdso_freebsd.go  3
-rw-r--r--  src/runtime/vdso_freebsd_x86.go  1
-rw-r--r--  src/runtime/vdso_in_none.go  1
-rw-r--r--  src/runtime/vdso_linux.go  1
-rw-r--r--  src/runtime/vdso_linux_mips64x.go  1
-rw-r--r--  src/runtime/vdso_linux_ppc64x.go  1
-rw-r--r--  src/runtime/vlrt.go  1
-rw-r--r--  src/runtime/wincallback.go  7
-rw-r--r--  src/runtime/write_err.go  1
-rw-r--r--  src/runtime/write_err_android.go  2
-rw-r--r--  src/runtime/zcallback_windows.s  2
-rw-r--r--  src/runtime/zcallback_windows_arm.s  2
-rw-r--r--  src/runtime/zcallback_windows_arm64.s  2
395 files changed, 8991 insertions, 5344 deletions
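Of note in the diffstat: src/runtime/cgo/handle.go introduces the runtime/cgo.Handle API, which gives cgo code a safe way to pass Go values through C. A minimal usage sketch of the API as merged (pure Go shown; in practice the uintptr form of the handle is what crosses the C boundary):

    package main

    import (
        "fmt"
        "runtime/cgo"
    )

    func main() {
        // NewHandle pins a Go value and returns a handle that is safe
        // to pass through C as a uintptr (Handle's underlying type).
        h := cgo.NewHandle("hello from Go")
        defer h.Delete() // a handle must be deleted exactly once

        // On the way back from C, the integer is converted back into
        // a Handle and resolved to the original value.
        back := cgo.Handle(uintptr(h))
        fmt.Println(back.Value().(string))
    }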
diff --git a/src/runtime/abi_test.go b/src/runtime/abi_test.go
new file mode 100644
index 0000000000..f69d3a9d50
--- /dev/null
+++ b/src/runtime/abi_test.go
@@ -0,0 +1,113 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.regabireflect
+// +build goexperiment.regabireflect
+
+// This file contains tests specific to making sure the register ABI
+// works in a bunch of contexts in the runtime.
+
+package runtime_test
+
+import (
+ "internal/abi"
+ "internal/testenv"
+ "os"
+ "os/exec"
+ "runtime"
+ "strings"
+ "testing"
+ "time"
+)
+
+var regConfirmRun chan int
+
+//go:registerparams
+func regFinalizerPointer(v *Tint) (int, float32, [10]byte) {
+ regConfirmRun <- *(*int)(v)
+ return 5151, 4.0, [10]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
+}
+
+//go:registerparams
+func regFinalizerIface(v Tinter) (int, float32, [10]byte) {
+ regConfirmRun <- *(*int)(v.(*Tint))
+ return 5151, 4.0, [10]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
+}
+
+func TestFinalizerRegisterABI(t *testing.T) {
+ testenv.MustHaveExec(t)
+
+ // Actually run the test in a subprocess because we don't want
+ // finalizers from other tests interfering.
+ if os.Getenv("TEST_FINALIZER_REGABI") != "1" {
+ cmd := testenv.CleanCmdEnv(exec.Command(os.Args[0], "-test.run=TestFinalizerRegisterABI", "-test.v"))
+ cmd.Env = append(cmd.Env, "TEST_FINALIZER_REGABI=1")
+ out, err := cmd.CombinedOutput()
+ if !strings.Contains(string(out), "PASS\n") || err != nil {
+ t.Fatalf("%s\n(exit status %v)", string(out), err)
+ }
+ return
+ }
+
+ // Optimistically clear any latent finalizers from e.g. the testing
+ // package before continuing.
+ //
+ // It's possible that a finalizer only becomes available to run
+ // after this point, which would interfere with the test and could
+ // cause a crash, but because we're running in a separate process
+ // it's extremely unlikely.
+ runtime.GC()
+ runtime.GC()
+
+ // fing will only pick the new IntArgRegs up if it's currently
+ // sleeping and wakes up, so wait for it to go to sleep.
+ success := false
+ for i := 0; i < 100; i++ {
+ if runtime.FinalizerGAsleep() {
+ success = true
+ break
+ }
+ time.Sleep(20 * time.Millisecond)
+ }
+ if !success {
+ t.Fatal("finalizer not asleep?")
+ }
+
+ argRegsBefore := runtime.SetIntArgRegs(abi.IntArgRegs)
+ defer runtime.SetIntArgRegs(argRegsBefore)
+
+ tests := []struct {
+ name string
+ fin interface{}
+ confirmValue int
+ }{
+ {"Pointer", regFinalizerPointer, -1},
+ {"Interface", regFinalizerIface, -2},
+ }
+ for i := range tests {
+ test := &tests[i]
+ t.Run(test.name, func(t *testing.T) {
+ regConfirmRun = make(chan int)
+
+ x := new(Tint)
+ *x = (Tint)(test.confirmValue)
+ runtime.SetFinalizer(x, test.fin)
+
+ runtime.KeepAlive(x)
+
+ // Queue the finalizer.
+ runtime.GC()
+ runtime.GC()
+
+ select {
+ case <-time.After(time.Second):
+ t.Fatal("finalizer failed to execute")
+ case gotVal := <-regConfirmRun:
+ if gotVal != test.confirmValue {
+ t.Fatalf("wrong finalizer executed? got %d, want %d", gotVal, test.confirmValue)
+ }
+ }
+ })
+ }
+}
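The TEST_FINALIZER_REGABI dance above is the standard re-exec pattern for isolating a test from process-global state (here, finalizers registered by sibling tests). A stripped-down sketch of the pattern, with hypothetical names:

    package isolated_test

    import (
        "os"
        "os/exec"
        "strings"
        "testing"
    )

    // TestIsolated is a hypothetical, minimal version of the pattern.
    func TestIsolated(t *testing.T) {
        if os.Getenv("TEST_ISOLATED") != "1" {
            // Parent process: re-exec the test binary, running only
            // this test, with the marker variable set.
            cmd := exec.Command(os.Args[0], "-test.run=TestIsolated", "-test.v")
            cmd.Env = append(os.Environ(), "TEST_ISOLATED=1")
            out, err := cmd.CombinedOutput()
            if err != nil || !strings.Contains(string(out), "PASS\n") {
                t.Fatalf("subprocess failed: %v\n%s", err, out)
            }
            return
        }
        // Child process: the real test body runs here, isolated from
        // any state created by sibling tests in the parent.
    }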
diff --git a/src/runtime/asm_386.s b/src/runtime/asm_386.s
index 5cf6827c21..ec5ea58028 100644
--- a/src/runtime/asm_386.s
+++ b/src/runtime/asm_386.s
@@ -195,6 +195,10 @@ nocpuinfo:
JMP ok
#endif
needtls:
+#ifdef GOOS_openbsd
+ // skip runtime·ldt0setup(SB) and tls test on OpenBSD in all cases
+ JMP ok
+#endif
#ifdef GOOS_plan9
// skip runtime·ldt0setup(SB) and tls test on Plan 9 in all cases
JMP ok
@@ -1473,6 +1477,10 @@ TEXT runtime·panicSlice3CU(SB),NOSPLIT,$0-8
MOVL AX, x+0(FP)
MOVL CX, y+4(FP)
JMP runtime·goPanicSlice3CU(SB)
+TEXT runtime·panicSliceConvert(SB),NOSPLIT,$0-8
+ MOVL DX, x+0(FP)
+ MOVL BX, y+4(FP)
+ JMP runtime·goPanicSliceConvert(SB)
// Extended versions for 64-bit indexes.
TEXT runtime·panicExtendIndex(SB),NOSPLIT,$0-12
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index 517c5a9d3e..14f29e1964 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -6,6 +6,7 @@
#include "go_tls.h"
#include "funcdata.h"
#include "textflag.h"
+#include "cgo/abi_amd64.h"
// _rt0_amd64 is common startup code for most amd64 systems when using
// internal linking. This is the entry point for the program from the
@@ -28,18 +29,9 @@ TEXT main(SB),NOSPLIT,$-8
// c-archive) or when the shared library is loaded (for c-shared).
// We expect argc and argv to be passed in the usual C ABI registers
// DI and SI.
-TEXT _rt0_amd64_lib(SB),NOSPLIT,$0x50
- // Align stack per ELF ABI requirements.
- MOVQ SP, AX
- ANDQ $~15, SP
- // Save C ABI callee-saved registers, as caller may need them.
- MOVQ BX, 0x10(SP)
- MOVQ BP, 0x18(SP)
- MOVQ R12, 0x20(SP)
- MOVQ R13, 0x28(SP)
- MOVQ R14, 0x30(SP)
- MOVQ R15, 0x38(SP)
- MOVQ AX, 0x40(SP)
+TEXT _rt0_amd64_lib(SB),NOSPLIT,$0
+ // Transition from C ABI to Go ABI.
+ PUSH_REGS_HOST_TO_ABI0()
MOVQ DI, _rt0_amd64_lib_argc<>(SB)
MOVQ SI, _rt0_amd64_lib_argv<>(SB)
@@ -51,25 +43,27 @@ TEXT _rt0_amd64_lib(SB),NOSPLIT,$0x50
MOVQ _cgo_sys_thread_create(SB), AX
TESTQ AX, AX
JZ nocgo
+
+ // We're calling back to C.
+ // Align stack per ELF ABI requirements.
+ MOVQ SP, BX // Callee-save in C ABI
+ ANDQ $~15, SP
MOVQ $_rt0_amd64_lib_go(SB), DI
MOVQ $0, SI
CALL AX
+ MOVQ BX, SP
JMP restore
nocgo:
+ ADJSP $16
MOVQ $0x800000, 0(SP) // stacksize
MOVQ $_rt0_amd64_lib_go(SB), AX
MOVQ AX, 8(SP) // fn
CALL runtime·newosproc0(SB)
+ ADJSP $-16
restore:
- MOVQ 0x10(SP), BX
- MOVQ 0x18(SP), BP
- MOVQ 0x20(SP), R12
- MOVQ 0x28(SP), R13
- MOVQ 0x30(SP), R14
- MOVQ 0x38(SP), R15
- MOVQ 0x40(SP), SP
+ POP_REGS_HOST_TO_ABI0()
RET
// _rt0_amd64_lib_go initializes the Go runtime.
@@ -231,9 +225,9 @@ ok:
CALL runtime·abort(SB) // mstart should never return
RET
- // Prevent dead-code elimination of debugCallV1, which is
+ // Prevent dead-code elimination of debugCallV2, which is
// intended to be called by debuggers.
- MOVQ $runtime·debugCallV1<ABIInternal>(SB), AX
+ MOVQ $runtime·debugCallV2<ABIInternal>(SB), AX
RET
// mainPC is a function value for runtime.main, to be passed to newproc.
@@ -285,6 +279,36 @@ TEXT gogo<>(SB), NOSPLIT, $0
// Switch to m->g0's stack, call fn(g).
// Fn must never return. It should gogo(&g->sched)
// to keep running g.
+#ifdef GOEXPERIMENT_regabiargs
+TEXT runtime·mcall<ABIInternal>(SB), NOSPLIT, $0-8
+ MOVQ AX, DX // DX = fn
+
+ // save state in g->sched
+ MOVQ 0(SP), BX // caller's PC
+ MOVQ BX, (g_sched+gobuf_pc)(R14)
+ LEAQ fn+0(FP), BX // caller's SP
+ MOVQ BX, (g_sched+gobuf_sp)(R14)
+ MOVQ BP, (g_sched+gobuf_bp)(R14)
+
+ // switch to m->g0 & its stack, call fn
+ MOVQ g_m(R14), BX
+ MOVQ m_g0(BX), SI // SI = g.m.g0
+ CMPQ SI, R14 // if g == m->g0 call badmcall
+ JNE goodm
+ JMP runtime·badmcall(SB)
+goodm:
+ MOVQ R14, AX // AX (and arg 0) = g
+ MOVQ SI, R14 // g = g.m.g0
+ get_tls(CX) // Set G in TLS
+ MOVQ R14, g(CX)
+ MOVQ (g_sched+gobuf_sp)(R14), SP // sp = g0.sched.sp
+ PUSHQ AX // open up space for fn's arg spill slot
+ MOVQ 0(DX), R12
+ CALL R12 // fn(g)
+ POPQ AX
+ JMP runtime·badmcall2(SB)
+ RET
+#else
TEXT runtime·mcall(SB), NOSPLIT, $0-8
MOVQ fn+0(FP), DI
@@ -315,6 +339,7 @@ TEXT runtime·mcall(SB), NOSPLIT, $0-8
MOVQ $runtime·badmcall2(SB), AX
JMP AX
RET
+#endif
// systemstack_switch is a dummy routine that systemstack leaves at the bottom
// of the G stack. We need to distinguish the routine that
@@ -442,12 +467,9 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
MOVL $0, DX
JMP runtime·morestack(SB)
-// REFLECTCALL_USE_REGABI is not defined. It must be defined in conjunction with the
-// register constants in the internal/abi package.
-
-#ifdef REFLECTCALL_USE_REGABI
+#ifdef GOEXPERIMENT_regabireflect
// spillArgs stores return values from registers to a *internal/abi.RegArgs in R12.
-TEXT spillArgs<>(SB),NOSPLIT,$0-0
+TEXT ·spillArgs<ABIInternal>(SB),NOSPLIT,$0-0
MOVQ AX, 0(R12)
MOVQ BX, 8(R12)
MOVQ CX, 16(R12)
@@ -475,7 +497,7 @@ TEXT spillArgs<>(SB),NOSPLIT,$0-0
RET
// unspillArgs loads args into registers from a *internal/abi.RegArgs in R12.
-TEXT unspillArgs<>(SB),NOSPLIT,$0-0
+TEXT ·unspillArgs<ABIInternal>(SB),NOSPLIT,$0-0
MOVQ 0(R12), AX
MOVQ 8(R12), BX
MOVQ 16(R12), CX
@@ -503,11 +525,11 @@ TEXT unspillArgs<>(SB),NOSPLIT,$0-0
RET
#else
// spillArgs stores return values from registers to a pointer in R12.
-TEXT spillArgs<>(SB),NOSPLIT,$0-0
+TEXT ·spillArgs<ABIInternal>(SB),NOSPLIT,$0-0
RET
// unspillArgs loads args into registers from a pointer in R12.
-TEXT unspillArgs<>(SB),NOSPLIT,$0-0
+TEXT ·unspillArgs<ABIInternal>(SB),NOSPLIT,$0-0
RET
#endif
@@ -524,7 +546,7 @@ TEXT unspillArgs<>(SB),NOSPLIT,$0-0
JMP AX
// Note: can't just "JMP NAME(SB)" - bad inlining results.
-TEXT ·reflectcall<ABIInternal>(SB), NOSPLIT, $0-48
+TEXT ·reflectcall(SB), NOSPLIT, $0-48
MOVLQZX frameSize+32(FP), CX
DISPATCH(runtime·call16, 16)
DISPATCH(runtime·call32, 32)
@@ -566,7 +588,7 @@ TEXT NAME(SB), WRAPPER, $MAXSIZE-48; \
REP;MOVSB; \
/* set up argument registers */ \
MOVQ regArgs+40(FP), R12; \
- CALL unspillArgs<>(SB); \
+ CALL ·unspillArgs<ABIInternal>(SB); \
/* call function */ \
MOVQ f+8(FP), DX; \
PCDATA $PCDATA_StackMapIndex, $0; \
@@ -574,7 +596,7 @@ TEXT NAME(SB), WRAPPER, $MAXSIZE-48; \
CALL R12; \
/* copy register return values back */ \
MOVQ regArgs+40(FP), R12; \
- CALL spillArgs<>(SB); \
+ CALL ·spillArgs<ABIInternal>(SB); \
MOVLQZX stackArgsSize+24(FP), CX; \
MOVLQZX stackRetOffset+28(FP), BX; \
MOVQ stackArgs+16(FP), DI; \
@@ -663,7 +685,7 @@ TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
// or else unwinding from systemstack_switch is incorrect.
// Smashes R9.
TEXT gosave_systemstack_switch<>(SB),NOSPLIT,$0
-#ifndef GOEXPERIMENT_REGABI
+#ifndef GOEXPERIMENT_regabig
get_tls(R14)
MOVQ g(R14), R14
#endif
@@ -776,6 +798,14 @@ nosave:
MOVL AX, ret+16(FP)
RET
+#ifdef GOOS_windows
+// Dummy TLS that's used on Windows so that we don't crash trying
+// to restore the G register in needm. needm and its callees are
+// very careful never to actually use the G, the TLS just can't be
+// unset since we're in Go code.
+GLOBL zeroTLS<>(SB),RODATA,$const_tlsSize
+#endif
+
// func cgocallback(fn, frame unsafe.Pointer, ctxt uintptr)
// See cgocall.go for more details.
TEXT ·cgocallback(SB),NOSPLIT,$24-24
@@ -799,7 +829,27 @@ TEXT ·cgocallback(SB),NOSPLIT,$24-24
MOVQ BX, savedm-8(SP) // saved copy of oldm
JMP havem
needm:
- MOVQ $runtime·needm(SB), AX
+#ifdef GOOS_windows
+ // Set up a dummy TLS value. needm is careful not to use it,
+ // but it needs to be there to prevent autogenerated code from
+ // crashing when it loads from it.
+ // We don't need to clear it or anything later because needm
+ // will set up TLS properly.
+ MOVQ $zeroTLS<>(SB), DI
+ CALL runtime·settls(SB)
+#endif
+ // On some platforms (Windows) we cannot call needm through
+ // an ABI wrapper because there's no TLS set up, and the ABI
+ // wrapper will try to restore the G register (R14) from TLS.
+ // Clear X15 because Go expects it and we're not calling
+ // through a wrapper, but otherwise avoid setting the G
+ // register in the wrapper and call needm directly. It
+ // takes no arguments and doesn't return any values so
+ // there's no need to handle that. Clear R14 so that there's
+ // a bad value in there, in case needm tries to use it.
+ XORPS X15, X15
+ XORQ R14, R14
+ MOVQ $runtime·needm<ABIInternal>(SB), AX
CALL AX
MOVQ $0, savedm-8(SP) // dropm on return
get_tls(CX)
@@ -861,7 +911,8 @@ havem:
MOVQ BX, 0(SP)
MOVQ CX, 8(SP)
MOVQ DX, 16(SP)
- CALL runtime·cgocallbackg(SB)
+ MOVQ $runtime·cgocallbackg(SB), AX
+ CALL AX // indirect call to bypass nosplit check. We're on a different stack now.
// Compute the size of the frame again. FP and SP have
// completely different values here than they did above,
@@ -893,9 +944,17 @@ havem:
// for the duration of the call. Since the call is over, return it with dropm.
MOVQ savedm-8(SP), BX
CMPQ BX, $0
- JNE 3(PC)
+ JNE done
MOVQ $runtime·dropm(SB), AX
CALL AX
+#ifdef GOOS_windows
+ // We need to clear the TLS pointer in case the next
+ // thread that comes into Go tries to reuse that space
+ // but uses the same M.
+ XORQ DI, DI
+ CALL runtime·settls(SB)
+#endif
+done:
// Done!
RET
@@ -904,16 +963,6 @@ havem:
// set g. for use by needm.
TEXT runtime·setg(SB), NOSPLIT, $0-8
MOVQ gg+0(FP), BX
-#ifdef GOOS_windows
- CMPQ BX, $0
- JNE settls
- MOVQ $0, 0x28(GS)
- RET
-settls:
- MOVQ g_m(BX), AX
- LEAQ m_tls(AX), AX
- MOVQ AX, 0x28(GS)
-#endif
get_tls(CX)
MOVQ BX, g(CX)
RET
@@ -959,34 +1008,62 @@ done:
// func memhash(p unsafe.Pointer, h, s uintptr) uintptr
// hash function using AES hardware instructions
-TEXT runtime·memhash(SB),NOSPLIT,$0-32
+TEXT runtime·memhash<ABIInternal>(SB),NOSPLIT,$0-32
+#ifdef GOEXPERIMENT_regabiargs
+ // AX = ptr to data
+ // BX = seed
+ // CX = size
+#endif
CMPB runtime·useAeshash(SB), $0
JEQ noaes
+#ifndef GOEXPERIMENT_regabiargs
MOVQ p+0(FP), AX // ptr to data
MOVQ s+16(FP), CX // size
LEAQ ret+24(FP), DX
+#endif
JMP aeshashbody<>(SB)
noaes:
- JMP runtime·memhashFallback(SB)
+ JMP runtime·memhashFallback<ABIInternal>(SB)
// func strhash(p unsafe.Pointer, h uintptr) uintptr
-TEXT runtime·strhash(SB),NOSPLIT,$0-24
+TEXT runtime·strhash<ABIInternal>(SB),NOSPLIT,$0-24
+#ifdef GOEXPERIMENT_regabiargs
+ // AX = ptr to string struct
+ // BX = seed
+#endif
CMPB runtime·useAeshash(SB), $0
JEQ noaes
+#ifndef GOEXPERIMENT_regabiargs
MOVQ p+0(FP), AX // ptr to string struct
+#endif
MOVQ 8(AX), CX // length of string
MOVQ (AX), AX // string data
+#ifndef GOEXPERIMENT_regabiargs
LEAQ ret+16(FP), DX
+#endif
JMP aeshashbody<>(SB)
noaes:
- JMP runtime·strhashFallback(SB)
+ JMP runtime·strhashFallback<ABIInternal>(SB)
// AX: data
+#ifdef GOEXPERIMENT_regabiargs
+// BX: hash seed
+#else
+// h+8(FP): hash seed
+#endif
// CX: length
+#ifdef GOEXPERIMENT_regabiargs
+// At return: AX = return value
+#else
// DX: address to put return value
+#endif
TEXT aeshashbody<>(SB),NOSPLIT,$0-0
// Fill an SSE register with our seeds.
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ BX, X0 // 64 bits of per-table hash seed
+#else
MOVQ h+8(FP), X0 // 64 bits of per-table hash seed
+#endif
PINSRW $4, CX, X0 // 16 bits of length
PSHUFHW $0, X0, X0 // repeat length 4 times total
MOVO X0, X1 // save unscrambled seed
@@ -1023,7 +1100,11 @@ final1:
AESENC X1, X1 // scramble combo 3 times
AESENC X1, X1
AESENC X1, X1
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ X1, AX // return X1
+#else
MOVQ X1, (DX)
+#endif
RET
endofpage:
@@ -1039,7 +1120,11 @@ endofpage:
aes0:
// Return scrambled input seed
AESENC X0, X0
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ X0, AX // return X0
+#else
MOVQ X0, (DX)
+#endif
RET
aes16:
@@ -1069,7 +1154,11 @@ aes17to32:
// combine results
PXOR X3, X2
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ X2, AX // return X2
+#else
MOVQ X2, (DX)
+#endif
RET
aes33to64:
@@ -1111,7 +1200,11 @@ aes33to64:
PXOR X6, X4
PXOR X7, X5
PXOR X5, X4
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ X4, AX // return X4
+#else
MOVQ X4, (DX)
+#endif
RET
aes65to128:
@@ -1193,7 +1286,15 @@ aes65to128:
PXOR X10, X8
PXOR X11, X9
PXOR X9, X8
+#ifdef GOEXPERIMENT_regabig
+ // X15 must be zero on return
+ PXOR X15, X15
+#endif
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ X8, AX // return X8
+#else
MOVQ X8, (DX)
+#endif
RET
aes129plus:
@@ -1309,38 +1410,73 @@ aesloop:
PXOR X10, X8
PXOR X11, X9
PXOR X9, X8
+#ifdef GOEXPERIMENT_regabig
+ // X15 must be zero on return
+ PXOR X15, X15
+#endif
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ X8, AX // return X8
+#else
MOVQ X8, (DX)
+#endif
RET
// func memhash32(p unsafe.Pointer, h uintptr) uintptr
-TEXT runtime·memhash32(SB),NOSPLIT,$0-24
+// ABIInternal for performance.
+TEXT runtime·memhash32<ABIInternal>(SB),NOSPLIT,$0-24
+#ifdef GOEXPERIMENT_regabiargs
+ // AX = ptr to data
+ // BX = seed
+#endif
CMPB runtime·useAeshash(SB), $0
JEQ noaes
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ BX, X0 // X0 = seed
+#else
MOVQ p+0(FP), AX // ptr to data
MOVQ h+8(FP), X0 // seed
+#endif
PINSRD $2, (AX), X0 // data
AESENC runtime·aeskeysched+0(SB), X0
AESENC runtime·aeskeysched+16(SB), X0
AESENC runtime·aeskeysched+32(SB), X0
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ X0, AX // return X0
+#else
MOVQ X0, ret+16(FP)
+#endif
RET
noaes:
- JMP runtime·memhash32Fallback(SB)
+ JMP runtime·memhash32Fallback<ABIInternal>(SB)
// func memhash64(p unsafe.Pointer, h uintptr) uintptr
-TEXT runtime·memhash64(SB),NOSPLIT,$0-24
+// ABIInternal for performance.
+TEXT runtime·memhash64<ABIInternal>(SB),NOSPLIT,$0-24
+#ifdef GOEXPERIMENT_regabiargs
+ // AX = ptr to data
+ // BX = seed
+#else
+#endif
CMPB runtime·useAeshash(SB), $0
JEQ noaes
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ BX, X0 // X0 = seed
+#else
MOVQ p+0(FP), AX // ptr to data
MOVQ h+8(FP), X0 // seed
+#endif
PINSRQ $1, (AX), X0 // data
AESENC runtime·aeskeysched+0(SB), X0
AESENC runtime·aeskeysched+16(SB), X0
AESENC runtime·aeskeysched+32(SB), X0
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ X0, AX // return X0
+#else
MOVQ X0, ret+16(FP)
+#endif
RET
noaes:
- JMP runtime·memhash64Fallback(SB)
+ JMP runtime·memhash64Fallback<ABIInternal>(SB)
// simple mask to get rid of data in the high part of the register.
DATA masks<>+0x00(SB)/8, $0x0000000000000000
@@ -1440,11 +1576,8 @@ TEXT _cgo_topofstack(SB),NOSPLIT,$0
RET
// The top-most function running on a goroutine
-// returns to goexit+PCQuantum. Defined as ABIInternal
-// so as to make it identifiable to traceback (this
-// function it used as a sentinel; traceback wants to
-// see the func PC, not a wrapper PC).
-TEXT runtime·goexit<ABIInternal>(SB),NOSPLIT|TOPFRAME,$0-0
+// returns to goexit+PCQuantum.
+TEXT runtime·goexit(SB),NOSPLIT|TOPFRAME,$0-0
BYTE $0x90 // NOP
CALL runtime·goexit1(SB) // does not return
// traceback from goexit1 must hit code range of goexit
@@ -1464,7 +1597,7 @@ TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
// signals. It is quite painful to set X15 in the signal context,
// so we do it here.
TEXT ·sigpanic0<ABIInternal>(SB),NOSPLIT,$0-0
-#ifdef GOEXPERIMENT_REGABI
+#ifdef GOEXPERIMENT_regabig
get_tls(R14)
MOVQ g(R14), R14
XORPS X15, X15
@@ -1486,7 +1619,7 @@ TEXT runtime·gcWriteBarrier<ABIInternal>(SB),NOSPLIT,$112
MOVQ R13, 104(SP)
// TODO: Consider passing g.m.p in as an argument so they can be shared
// across a sequence of write barriers.
-#ifdef GOEXPERIMENT_REGABI
+#ifdef GOEXPERIMENT_regabig
MOVQ g_m(R14), R13
#else
get_tls(R13)
@@ -1624,7 +1757,7 @@ TEXT runtime·gcWriteBarrierR9<ABIInternal>(SB),NOSPLIT,$0
DATA debugCallFrameTooLarge<>+0x00(SB)/20, $"call frame too large"
GLOBL debugCallFrameTooLarge<>(SB), RODATA, $20 // Size duplicated below
-// debugCallV1 is the entry point for debugger-injected function
+// debugCallV2 is the entry point for debugger-injected function
// calls on running goroutines. It informs the runtime that a
// debug call has been injected and creates a call frame for the
// debugger to fill in.
@@ -1637,7 +1770,7 @@ GLOBL debugCallFrameTooLarge<>(SB), RODATA, $20 // Size duplicated below
// after step 2).
// 4. Save all machine registers (including flags and XMM registers)
// so they can be restored later by the debugger.
-// 5. Set the PC to debugCallV1 and resume execution.
+// 5. Set the PC to debugCallV2 and resume execution.
//
// If the goroutine is in state _Grunnable, then it's not generally
// safe to inject a call because it may return out via other runtime
@@ -1647,19 +1780,19 @@ GLOBL debugCallFrameTooLarge<>(SB), RODATA, $20 // Size duplicated below
//
// If the goroutine is in any other state, it's not safe to inject a call.
//
-// This function communicates back to the debugger by setting RAX and
+// This function communicates back to the debugger by setting R12 and
// invoking INT3 to raise a breakpoint signal. See the comments in the
// implementation for the protocol the debugger is expected to
// follow. InjectDebugCall in the runtime tests demonstrates this protocol.
//
// The debugger must ensure that any pointers passed to the function
// obey escape analysis requirements. Specifically, it must not pass
-// a stack pointer to an escaping argument. debugCallV1 cannot check
+// a stack pointer to an escaping argument. debugCallV2 cannot check
// this invariant.
//
// This is ABIInternal because Go code injects its PC directly into new
// goroutine stacks.
-TEXT runtime·debugCallV1<ABIInternal>(SB),NOSPLIT,$152-0
+TEXT runtime·debugCallV2<ABIInternal>(SB),NOSPLIT,$152-0
// Save all registers that may contain pointers so they can be
// conservatively scanned.
//
@@ -1699,10 +1832,10 @@ TEXT runtime·debugCallV1<ABIInternal>(SB),NOSPLIT,$152-0
MOVQ AX, 0(SP)
MOVQ 16(SP), AX
MOVQ AX, 8(SP)
- // Set AX to 8 and invoke INT3. The debugger should get the
+ // Set R12 to 8 and invoke INT3. The debugger should get the
// reason a call can't be injected from the top of the stack
// and resume execution.
- MOVQ $8, AX
+ MOVQ $8, R12
BYTE $0xcc
JMP restore
@@ -1710,17 +1843,18 @@ good:
// Registers are saved and it's safe to make a call.
// Open up a call frame, moving the stack if necessary.
//
- // Once the frame is allocated, this will set AX to 0 and
+ // Once the frame is allocated, this will set R12 to 0 and
// invoke INT3. The debugger should write the argument
- // frame for the call at SP, push the trapping PC on the
- // stack, set the PC to the function to call, set RCX to point
- // to the closure (if a closure call), and resume execution.
+ // frame for the call at SP, set up argument registers, push
+ // the trapping PC on the stack, set the PC to the function to
+ // call, set RDX to point to the closure (if a closure call),
+ // and resume execution.
//
- // If the function returns, this will set AX to 1 and invoke
+ // If the function returns, this will set R12 to 1 and invoke
// INT3. The debugger can then inspect any return value saved
- // on the stack at SP and resume execution again.
+ // on the stack at SP and in registers and resume execution again.
//
- // If the function panics, this will set AX to 2 and invoke INT3.
+ // If the function panics, this will set R12 to 2 and invoke INT3.
// The interface{} value of the panic will be at SP. The debugger
// can inspect the panic value and resume execution again.
#define DEBUG_CALL_DISPATCH(NAME,MAXSIZE) \
@@ -1748,16 +1882,16 @@ good:
MOVQ $debugCallFrameTooLarge<>(SB), AX
MOVQ AX, 0(SP)
MOVQ $20, 8(SP) // length of debugCallFrameTooLarge string
- MOVQ $8, AX
+ MOVQ $8, R12
BYTE $0xcc
JMP restore
restore:
// Calls and failures resume here.
//
- // Set AX to 16 and invoke INT3. The debugger should restore
+ // Set R12 to 16 and invoke INT3. The debugger should restore
// all registers except RIP and RSP and resume execution.
- MOVQ $16, AX
+ MOVQ $16, R12
BYTE $0xcc
// We must not modify flags after this point.
@@ -1786,9 +1920,9 @@ restore:
#define DEBUG_CALL_FN(NAME,MAXSIZE) \
TEXT NAME(SB),WRAPPER,$MAXSIZE-0; \
NO_LOCAL_POINTERS; \
- MOVQ $0, AX; \
+ MOVQ $0, R12; \
BYTE $0xcc; \
- MOVQ $1, AX; \
+ MOVQ $1, R12; \
BYTE $0xcc; \
RET
DEBUG_CALL_FN(debugCall32<>, 32)
@@ -1811,7 +1945,7 @@ TEXT runtime·debugCallPanicked(SB),NOSPLIT,$16-16
MOVQ AX, 0(SP)
MOVQ val_data+8(FP), AX
MOVQ AX, 8(SP)
- MOVQ $2, AX
+ MOVQ $2, R12
BYTE $0xcc
RET
@@ -1822,69 +1956,147 @@ TEXT runtime·debugCallPanicked(SB),NOSPLIT,$16-16
// The tail call makes these stubs disappear in backtraces.
// Defined as ABIInternal since they do not use the stack-based Go ABI.
TEXT runtime·panicIndex<ABIInternal>(SB),NOSPLIT,$0-16
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ CX, BX
+#else
MOVQ AX, x+0(FP)
MOVQ CX, y+8(FP)
+#endif
JMP runtime·goPanicIndex<ABIInternal>(SB)
TEXT runtime·panicIndexU<ABIInternal>(SB),NOSPLIT,$0-16
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ CX, BX
+#else
MOVQ AX, x+0(FP)
MOVQ CX, y+8(FP)
+#endif
JMP runtime·goPanicIndexU<ABIInternal>(SB)
TEXT runtime·panicSliceAlen<ABIInternal>(SB),NOSPLIT,$0-16
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ CX, AX
+ MOVQ DX, BX
+#else
MOVQ CX, x+0(FP)
MOVQ DX, y+8(FP)
+#endif
JMP runtime·goPanicSliceAlen<ABIInternal>(SB)
TEXT runtime·panicSliceAlenU<ABIInternal>(SB),NOSPLIT,$0-16
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ CX, AX
+ MOVQ DX, BX
+#else
MOVQ CX, x+0(FP)
MOVQ DX, y+8(FP)
+#endif
JMP runtime·goPanicSliceAlenU<ABIInternal>(SB)
TEXT runtime·panicSliceAcap<ABIInternal>(SB),NOSPLIT,$0-16
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ CX, AX
+ MOVQ DX, BX
+#else
MOVQ CX, x+0(FP)
MOVQ DX, y+8(FP)
+#endif
JMP runtime·goPanicSliceAcap<ABIInternal>(SB)
TEXT runtime·panicSliceAcapU<ABIInternal>(SB),NOSPLIT,$0-16
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ CX, AX
+ MOVQ DX, BX
+#else
MOVQ CX, x+0(FP)
MOVQ DX, y+8(FP)
+#endif
JMP runtime·goPanicSliceAcapU<ABIInternal>(SB)
TEXT runtime·panicSliceB<ABIInternal>(SB),NOSPLIT,$0-16
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ CX, BX
+#else
MOVQ AX, x+0(FP)
MOVQ CX, y+8(FP)
+#endif
JMP runtime·goPanicSliceB<ABIInternal>(SB)
TEXT runtime·panicSliceBU<ABIInternal>(SB),NOSPLIT,$0-16
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ CX, BX
+#else
MOVQ AX, x+0(FP)
MOVQ CX, y+8(FP)
+#endif
JMP runtime·goPanicSliceBU<ABIInternal>(SB)
TEXT runtime·panicSlice3Alen<ABIInternal>(SB),NOSPLIT,$0-16
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ DX, AX
+#else
MOVQ DX, x+0(FP)
MOVQ BX, y+8(FP)
+#endif
JMP runtime·goPanicSlice3Alen<ABIInternal>(SB)
TEXT runtime·panicSlice3AlenU<ABIInternal>(SB),NOSPLIT,$0-16
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ DX, AX
+#else
MOVQ DX, x+0(FP)
MOVQ BX, y+8(FP)
+#endif
JMP runtime·goPanicSlice3AlenU<ABIInternal>(SB)
TEXT runtime·panicSlice3Acap<ABIInternal>(SB),NOSPLIT,$0-16
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ DX, AX
+#else
MOVQ DX, x+0(FP)
MOVQ BX, y+8(FP)
+#endif
JMP runtime·goPanicSlice3Acap<ABIInternal>(SB)
TEXT runtime·panicSlice3AcapU<ABIInternal>(SB),NOSPLIT,$0-16
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ DX, AX
+#else
MOVQ DX, x+0(FP)
MOVQ BX, y+8(FP)
+#endif
JMP runtime·goPanicSlice3AcapU<ABIInternal>(SB)
TEXT runtime·panicSlice3B<ABIInternal>(SB),NOSPLIT,$0-16
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ CX, AX
+ MOVQ DX, BX
+#else
MOVQ CX, x+0(FP)
MOVQ DX, y+8(FP)
+#endif
JMP runtime·goPanicSlice3B<ABIInternal>(SB)
TEXT runtime·panicSlice3BU<ABIInternal>(SB),NOSPLIT,$0-16
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ CX, AX
+ MOVQ DX, BX
+#else
MOVQ CX, x+0(FP)
MOVQ DX, y+8(FP)
+#endif
JMP runtime·goPanicSlice3BU<ABIInternal>(SB)
TEXT runtime·panicSlice3C<ABIInternal>(SB),NOSPLIT,$0-16
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ CX, BX
+#else
MOVQ AX, x+0(FP)
MOVQ CX, y+8(FP)
+#endif
JMP runtime·goPanicSlice3C<ABIInternal>(SB)
TEXT runtime·panicSlice3CU<ABIInternal>(SB),NOSPLIT,$0-16
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ CX, BX
+#else
MOVQ AX, x+0(FP)
MOVQ CX, y+8(FP)
+#endif
JMP runtime·goPanicSlice3CU<ABIInternal>(SB)
+TEXT runtime·panicSliceConvert<ABIInternal>(SB),NOSPLIT,$0-16
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ DX, AX
+#else
+ MOVQ DX, x+0(FP)
+ MOVQ BX, y+8(FP)
+#endif
+ JMP runtime·goPanicSliceConvert<ABIInternal>(SB)
#ifdef GOOS_android
// Use the free TLS_SLOT_APP slot #2 on Android Q.
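The memhash/strhash/memhash32/memhash64 entry points above are the runtime's internal hash functions, used by the map implementation and now reachable with register arguments under GOEXPERIMENT_regabiargs. Their public counterpart is hash/maphash, which is backed by the same runtime hashing (and therefore the AES path on capable hardware); a small usage sketch:

    package main

    import (
        "fmt"
        "hash/maphash"
    )

    func main() {
        // A random per-process seed, analogous to the per-table seed
        // passed to aeshashbody above.
        seed := maphash.MakeSeed()

        var h maphash.Hash
        h.SetSeed(seed)
        h.WriteString("hello")
        fmt.Printf("%#016x\n", h.Sum64())
    }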
diff --git a/src/runtime/asm_arm.s b/src/runtime/asm_arm.s
index 9896ab4383..e779fc8f84 100644
--- a/src/runtime/asm_arm.s
+++ b/src/runtime/asm_arm.s
@@ -252,9 +252,6 @@ TEXT runtime·mcall(SB),NOSPLIT|NOFRAME,$0-4
CMP g, R1
B.NE 2(PC)
B runtime·badmcall(SB)
- MOVB runtime·iscgo(SB), R11
- CMP $0, R11
- BL.NE runtime·save_g(SB)
MOVW fn+0(FP), R0
MOVW (g_sched+gobuf_sp)(g), R13
SUB $8, R13
@@ -992,6 +989,10 @@ TEXT runtime·panicSlice3CU(SB),NOSPLIT,$0-8
MOVW R0, x+0(FP)
MOVW R1, y+4(FP)
JMP runtime·goPanicSlice3CU(SB)
+TEXT runtime·panicSliceConvert(SB),NOSPLIT,$0-8
+ MOVW R2, x+0(FP)
+ MOVW R3, y+4(FP)
+ JMP runtime·goPanicSliceConvert(SB)
// Extended versions for 64-bit indexes.
TEXT runtime·panicExtendIndex(SB),NOSPLIT,$0-12
diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s
index 3709f1d95e..2d495397a8 100644
--- a/src/runtime/asm_arm64.s
+++ b/src/runtime/asm_arm64.s
@@ -1158,10 +1158,10 @@ TEXT ·checkASM(SB),NOSPLIT,$0-1
// It does not clobber any general-purpose registers,
// but may clobber others (e.g., floating point registers)
// The act of CALLing gcWriteBarrier will clobber R30 (LR).
-TEXT runtime·gcWriteBarrier(SB),NOSPLIT,$216
+TEXT runtime·gcWriteBarrier(SB),NOSPLIT,$200
// Save the registers clobbered by the fast path.
- MOVD R0, 200(RSP)
- MOVD R1, 208(RSP)
+ MOVD R0, 184(RSP)
+ MOVD R1, 192(RSP)
MOVD g_m(g), R0
MOVD m_p(R0), R0
MOVD (p_wbBuf+wbBuf_next)(R0), R1
@@ -1177,8 +1177,8 @@ TEXT runtime·gcWriteBarrier(SB),NOSPLIT,$216
// Is the buffer full? (flags set in CMP above)
BEQ flush
ret:
- MOVD 200(RSP), R0
- MOVD 208(RSP), R1
+ MOVD 184(RSP), R0
+ MOVD 192(RSP), R1
// Do the write.
MOVD R3, (R2)
RET
@@ -1202,17 +1202,16 @@ flush:
MOVD R13, 96(RSP)
MOVD R14, 104(RSP)
MOVD R15, 112(RSP)
- MOVD R16, 120(RSP)
- MOVD R17, 128(RSP)
+ // R16, R17 may be clobbered by linker trampoline
// R18 is unused.
- MOVD R19, 136(RSP)
- MOVD R20, 144(RSP)
- MOVD R21, 152(RSP)
- MOVD R22, 160(RSP)
- MOVD R23, 168(RSP)
- MOVD R24, 176(RSP)
- MOVD R25, 184(RSP)
- MOVD R26, 192(RSP)
+ MOVD R19, 120(RSP)
+ MOVD R20, 128(RSP)
+ MOVD R21, 136(RSP)
+ MOVD R22, 144(RSP)
+ MOVD R23, 152(RSP)
+ MOVD R24, 160(RSP)
+ MOVD R25, 168(RSP)
+ MOVD R26, 176(RSP)
// R27 is temp register.
// R28 is g.
// R29 is frame pointer (unused).
@@ -1236,16 +1235,14 @@ flush:
MOVD 96(RSP), R13
MOVD 104(RSP), R14
MOVD 112(RSP), R15
- MOVD 120(RSP), R16
- MOVD 128(RSP), R17
- MOVD 136(RSP), R19
- MOVD 144(RSP), R20
- MOVD 152(RSP), R21
- MOVD 160(RSP), R22
- MOVD 168(RSP), R23
- MOVD 176(RSP), R24
- MOVD 184(RSP), R25
- MOVD 192(RSP), R26
+ MOVD 120(RSP), R19
+ MOVD 128(RSP), R20
+ MOVD 136(RSP), R21
+ MOVD 144(RSP), R22
+ MOVD 152(RSP), R23
+ MOVD 160(RSP), R24
+ MOVD 168(RSP), R25
+ MOVD 176(RSP), R26
JMP ret
// Note: these functions use a special calling convention to save generated code space.
@@ -1317,3 +1314,7 @@ TEXT runtime·panicSlice3CU(SB),NOSPLIT,$0-16
MOVD R0, x+0(FP)
MOVD R1, y+8(FP)
JMP runtime·goPanicSlice3CU(SB)
+TEXT runtime·panicSliceConvert(SB),NOSPLIT,$0-16
+ MOVD R2, x+0(FP)
+ MOVD R3, y+8(FP)
+ JMP runtime·goPanicSliceConvert(SB)
diff --git a/src/runtime/asm_mips64x.s b/src/runtime/asm_mips64x.s
index cee4b528bb..c3b57e472a 100644
--- a/src/runtime/asm_mips64x.s
+++ b/src/runtime/asm_mips64x.s
@@ -805,3 +805,7 @@ TEXT runtime·panicSlice3CU(SB),NOSPLIT,$0-16
MOVV R1, x+0(FP)
MOVV R2, y+8(FP)
JMP runtime·goPanicSlice3CU(SB)
+TEXT runtime·panicSliceConvert(SB),NOSPLIT,$0-16
+ MOVV R3, x+0(FP)
+ MOVV R4, y+8(FP)
+ JMP runtime·goPanicSliceConvert(SB)
diff --git a/src/runtime/asm_mipsx.s b/src/runtime/asm_mipsx.s
index 17fbc902c2..1d828b03cf 100644
--- a/src/runtime/asm_mipsx.s
+++ b/src/runtime/asm_mipsx.s
@@ -801,6 +801,10 @@ TEXT runtime·panicSlice3CU(SB),NOSPLIT,$0-8
MOVW R1, x+0(FP)
MOVW R2, y+4(FP)
JMP runtime·goPanicSlice3CU(SB)
+TEXT runtime·panicSliceConvert(SB),NOSPLIT,$0-8
+ MOVW R3, x+0(FP)
+ MOVW R4, y+4(FP)
+ JMP runtime·goPanicSliceConvert(SB)
// Extended versions for 64-bit indexes.
TEXT runtime·panicExtendIndex(SB),NOSPLIT,$0-12
diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s
index 6544048497..2c39b38912 100644
--- a/src/runtime/asm_ppc64x.s
+++ b/src/runtime/asm_ppc64x.s
@@ -1022,3 +1022,7 @@ TEXT runtime·panicSlice3CU(SB),NOSPLIT,$0-16
MOVD R3, x+0(FP)
MOVD R4, y+8(FP)
JMP runtime·goPanicSlice3CU(SB)
+TEXT runtime·panicSliceConvert(SB),NOSPLIT,$0-16
+ MOVD R5, x+0(FP)
+ MOVD R6, y+8(FP)
+ JMP runtime·goPanicSliceConvert(SB)
diff --git a/src/runtime/asm_riscv64.s b/src/runtime/asm_riscv64.s
index d8d5252ed5..ef7af4e10d 100644
--- a/src/runtime/asm_riscv64.s
+++ b/src/runtime/asm_riscv64.s
@@ -806,6 +806,10 @@ TEXT runtime·panicSlice3CU(SB),NOSPLIT,$0-16
MOV T0, x+0(FP)
MOV T1, y+8(FP)
JMP runtime·goPanicSlice3CU(SB)
+TEXT runtime·panicSliceConvert(SB),NOSPLIT,$0-16
+ MOV T2, x+0(FP)
+ MOV T3, y+8(FP)
+ JMP runtime·goPanicSliceConvert(SB)
DATA runtime·mainPC+0(SB)/8,$runtime·main(SB)
GLOBL runtime·mainPC(SB),RODATA,$8
diff --git a/src/runtime/asm_s390x.s b/src/runtime/asm_s390x.s
index 4748e00aa8..fb38271630 100644
--- a/src/runtime/asm_s390x.s
+++ b/src/runtime/asm_s390x.s
@@ -906,3 +906,7 @@ TEXT runtime·panicSlice3CU(SB),NOSPLIT,$0-16
MOVD R0, x+0(FP)
MOVD R1, y+8(FP)
JMP runtime·goPanicSlice3CU(SB)
+TEXT runtime·panicSliceConvert(SB),NOSPLIT,$0-16
+ MOVD R2, x+0(FP)
+ MOVD R3, y+8(FP)
+ JMP runtime·goPanicSliceConvert(SB)
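The panicSliceConvert shims added for every architecture above back the then-new slice-to-array-pointer conversion: converting a []T to a *[N]T panics at runtime if the slice is shorter than N, and the compiler emits a call to the per-arch shim, which forwards to goPanicSliceConvert. A sketch of both outcomes:

    package main

    import "fmt"

    func main() {
        s := []byte{1, 2, 3, 4}
        p := (*[4]byte)(s) // ok: len(s) >= 4
        fmt.Println(*p)

        defer func() { fmt.Println("recovered:", recover()) }()
        _ = (*[4]byte)(s[:2]) // len 2 < 4: runtime takes the panicSliceConvert path
    }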
diff --git a/src/runtime/auxv_none.go b/src/runtime/auxv_none.go
index 3ca617b21e..3178f1a154 100644
--- a/src/runtime/auxv_none.go
+++ b/src/runtime/auxv_none.go
@@ -2,12 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !linux
-// +build !darwin
-// +build !dragonfly
-// +build !freebsd
-// +build !netbsd
-// +build !solaris
+//go:build !linux && !darwin && !dragonfly && !freebsd && !netbsd && !solaris
+// +build !linux,!darwin,!dragonfly,!freebsd,!netbsd,!solaris
package runtime
diff --git a/src/runtime/cgo/abi_amd64.h b/src/runtime/cgo/abi_amd64.h
new file mode 100644
index 0000000000..9949435fe9
--- /dev/null
+++ b/src/runtime/cgo/abi_amd64.h
@@ -0,0 +1,99 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Macros for transitioning from the host ABI to Go ABI0.
+//
+// These save the frame pointer, so in general, functions that use
+// these should have zero frame size to suppress the automatic frame
+// pointer, though it's harmless to not do this.
+
+#ifdef GOOS_windows
+
+// REGS_HOST_TO_ABI0_STACK is the stack bytes used by
+// PUSH_REGS_HOST_TO_ABI0.
+#define REGS_HOST_TO_ABI0_STACK (28*8 + 8)
+
+// PUSH_REGS_HOST_TO_ABI0 prepares for transitioning from
+// the host ABI to Go ABI0 code. It saves all registers that are
+// callee-save in the host ABI and caller-save in Go ABI0 and prepares
+// for entry to Go.
+//
+// Save DI SI BP BX R12 R13 R14 R15 X6-X15 registers and the DF flag.
+// Clear the DF flag for the Go ABI.
+// MXCSR matches the Go ABI, so we don't have to set that,
+// and Go doesn't modify it, so we don't have to save it.
+#define PUSH_REGS_HOST_TO_ABI0() \
+ PUSHFQ \
+ CLD \
+ ADJSP $(REGS_HOST_TO_ABI0_STACK - 8) \
+ MOVQ DI, (0*8)(SP) \
+ MOVQ SI, (1*8)(SP) \
+ MOVQ BP, (2*8)(SP) \
+ MOVQ BX, (3*8)(SP) \
+ MOVQ R12, (4*8)(SP) \
+ MOVQ R13, (5*8)(SP) \
+ MOVQ R14, (6*8)(SP) \
+ MOVQ R15, (7*8)(SP) \
+ MOVUPS X6, (8*8)(SP) \
+ MOVUPS X7, (10*8)(SP) \
+ MOVUPS X8, (12*8)(SP) \
+ MOVUPS X9, (14*8)(SP) \
+ MOVUPS X10, (16*8)(SP) \
+ MOVUPS X11, (18*8)(SP) \
+ MOVUPS X12, (20*8)(SP) \
+ MOVUPS X13, (22*8)(SP) \
+ MOVUPS X14, (24*8)(SP) \
+ MOVUPS X15, (26*8)(SP)
+
+#define POP_REGS_HOST_TO_ABI0() \
+ MOVQ (0*8)(SP), DI \
+ MOVQ (1*8)(SP), SI \
+ MOVQ (2*8)(SP), BP \
+ MOVQ (3*8)(SP), BX \
+ MOVQ (4*8)(SP), R12 \
+ MOVQ (5*8)(SP), R13 \
+ MOVQ (6*8)(SP), R14 \
+ MOVQ (7*8)(SP), R15 \
+ MOVUPS (8*8)(SP), X6 \
+ MOVUPS (10*8)(SP), X7 \
+ MOVUPS (12*8)(SP), X8 \
+ MOVUPS (14*8)(SP), X9 \
+ MOVUPS (16*8)(SP), X10 \
+ MOVUPS (18*8)(SP), X11 \
+ MOVUPS (20*8)(SP), X12 \
+ MOVUPS (22*8)(SP), X13 \
+ MOVUPS (24*8)(SP), X14 \
+ MOVUPS (26*8)(SP), X15 \
+ ADJSP $-(REGS_HOST_TO_ABI0_STACK - 8) \
+ POPFQ
+
+#else
+// SysV ABI
+
+#define REGS_HOST_TO_ABI0_STACK (6*8)
+
+// SysV MXCSR matches the Go ABI, so we don't have to set that,
+// and Go doesn't modify it, so we don't have to save it.
+// Both SysV and Go require DF to be cleared, so that's already clear.
+// The SysV and Go frame pointer conventions are compatible.
+#define PUSH_REGS_HOST_TO_ABI0() \
+ ADJSP $(REGS_HOST_TO_ABI0_STACK) \
+ MOVQ BP, (5*8)(SP) \
+ LEAQ (5*8)(SP), BP \
+ MOVQ BX, (0*8)(SP) \
+ MOVQ R12, (1*8)(SP) \
+ MOVQ R13, (2*8)(SP) \
+ MOVQ R14, (3*8)(SP) \
+ MOVQ R15, (4*8)(SP)
+
+#define POP_REGS_HOST_TO_ABI0() \
+ MOVQ (0*8)(SP), BX \
+ MOVQ (1*8)(SP), R12 \
+ MOVQ (2*8)(SP), R13 \
+ MOVQ (3*8)(SP), R14 \
+ MOVQ (4*8)(SP), R15 \
+ MOVQ (5*8)(SP), BP \
+ ADJSP $-(REGS_HOST_TO_ABI0_STACK)
+
+#endif
diff --git a/src/runtime/cgo/asm_amd64.s b/src/runtime/cgo/asm_amd64.s
index 5dc8e2d235..386299c548 100644
--- a/src/runtime/cgo/asm_amd64.s
+++ b/src/runtime/cgo/asm_amd64.s
@@ -3,70 +3,32 @@
// license that can be found in the LICENSE file.
#include "textflag.h"
+#include "abi_amd64.h"
// Called by C code generated by cmd/cgo.
// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr)
// Saves C callee-saved registers and calls cgocallback with three arguments.
// fn is the PC of a func(a unsafe.Pointer) function.
// This signature is known to SWIG, so we can't change it.
+TEXT crosscall2(SB),NOSPLIT,$0-0
+ PUSH_REGS_HOST_TO_ABI0()
+
+ // Make room for arguments to cgocallback.
+ ADJSP $0x18
#ifndef GOOS_windows
-TEXT crosscall2(SB),NOSPLIT,$0x50-0 /* keeps stack pointer 32-byte aligned */
+ MOVQ DI, 0x0(SP) /* fn */
+ MOVQ SI, 0x8(SP) /* arg */
+ // Skip n in DX.
+ MOVQ CX, 0x10(SP) /* ctxt */
#else
-TEXT crosscall2(SB),NOSPLIT,$0x110-0 /* also need to save xmm6 - xmm15 */
-#endif
- MOVQ BX, 0x18(SP)
- MOVQ R12, 0x28(SP)
- MOVQ R13, 0x30(SP)
- MOVQ R14, 0x38(SP)
- MOVQ R15, 0x40(SP)
-
-#ifdef GOOS_windows
- // Win64 save RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15 and XMM6 -- XMM15.
- MOVQ DI, 0x48(SP)
- MOVQ SI, 0x50(SP)
- MOVUPS X6, 0x60(SP)
- MOVUPS X7, 0x70(SP)
- MOVUPS X8, 0x80(SP)
- MOVUPS X9, 0x90(SP)
- MOVUPS X10, 0xa0(SP)
- MOVUPS X11, 0xb0(SP)
- MOVUPS X12, 0xc0(SP)
- MOVUPS X13, 0xd0(SP)
- MOVUPS X14, 0xe0(SP)
- MOVUPS X15, 0xf0(SP)
-
MOVQ CX, 0x0(SP) /* fn */
MOVQ DX, 0x8(SP) /* arg */
// Skip n in R8.
MOVQ R9, 0x10(SP) /* ctxt */
-
- CALL runtime·cgocallback(SB)
-
- MOVQ 0x48(SP), DI
- MOVQ 0x50(SP), SI
- MOVUPS 0x60(SP), X6
- MOVUPS 0x70(SP), X7
- MOVUPS 0x80(SP), X8
- MOVUPS 0x90(SP), X9
- MOVUPS 0xa0(SP), X10
- MOVUPS 0xb0(SP), X11
- MOVUPS 0xc0(SP), X12
- MOVUPS 0xd0(SP), X13
- MOVUPS 0xe0(SP), X14
- MOVUPS 0xf0(SP), X15
-#else
- MOVQ DI, 0x0(SP) /* fn */
- MOVQ SI, 0x8(SP) /* arg */
- // Skip n in DX.
- MOVQ CX, 0x10(SP) /* ctxt */
-
- CALL runtime·cgocallback(SB)
#endif
- MOVQ 0x18(SP), BX
- MOVQ 0x28(SP), R12
- MOVQ 0x30(SP), R13
- MOVQ 0x38(SP), R14
- MOVQ 0x40(SP), R15
+ CALL runtime·cgocallback(SB)
+ ADJSP $-0x18
+ POP_REGS_HOST_TO_ABI0()
RET
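crosscall2 is the entry point used when C code calls back into a cgo-exported Go function; the rewrite above replaces the open-coded, per-OS register save/restore with the shared PUSH_REGS_HOST_TO_ABI0/POP_REGS_HOST_TO_ABI0 macros from abi_amd64.h. A self-contained sketch of code that exercises this path (hypothetical names; note that a file using //export may carry only declarations, not definitions, in its C preamble):

    // callback.go
    package main

    /*
    extern void callIntoGo(void);
    */
    import "C"

    import "fmt"

    //export goCallback
    func goCallback(n C.int) {
        fmt.Println("reached Go via crosscall2, n =", n)
    }

    func main() {
        C.callIntoGo() // Go -> C -> cgo-generated thunk -> crosscall2 -> Go
    }

    // callback.c, a plain C file in the same package:
    //
    //    #include "_cgo_export.h"
    //    void callIntoGo(void) { goCallback(42); }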
diff --git a/src/runtime/cgo/callbacks_traceback.go b/src/runtime/cgo/callbacks_traceback.go
index cdadf9e66f..7302c1eedf 100644
--- a/src/runtime/cgo/callbacks_traceback.go
+++ b/src/runtime/cgo/callbacks_traceback.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build darwin || linux
// +build darwin linux
package cgo
diff --git a/src/runtime/cgo/dragonfly.go b/src/runtime/cgo/dragonfly.go
index d6d69187b8..cfa6fe86e2 100644
--- a/src/runtime/cgo/dragonfly.go
+++ b/src/runtime/cgo/dragonfly.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build dragonfly
// +build dragonfly
package cgo
diff --git a/src/runtime/cgo/freebsd.go b/src/runtime/cgo/freebsd.go
index 5c9ddbdc71..d56702ec70 100644
--- a/src/runtime/cgo/freebsd.go
+++ b/src/runtime/cgo/freebsd.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build freebsd
// +build freebsd
package cgo
diff --git a/src/runtime/cgo/gcc_libinit_windows.c b/src/runtime/cgo/gcc_libinit_windows.c
index 2732248bdc..ad5038667a 100644
--- a/src/runtime/cgo/gcc_libinit_windows.c
+++ b/src/runtime/cgo/gcc_libinit_windows.c
@@ -4,7 +4,7 @@
// +build cgo
-#define WIN64_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <process.h>
diff --git a/src/runtime/cgo/gcc_openbsd_mips64.c b/src/runtime/cgo/gcc_openbsd_mips64.c
new file mode 100644
index 0000000000..79f039a373
--- /dev/null
+++ b/src/runtime/cgo/gcc_openbsd_mips64.c
@@ -0,0 +1,67 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <sys/types.h>
+#include <pthread.h>
+#include <signal.h>
+#include <string.h>
+#include "libcgo.h"
+#include "libcgo_unix.h"
+
+static void* threadentry(void*);
+static void (*setg_gcc)(void*);
+
+void
+x_cgo_init(G *g, void (*setg)(void*))
+{
+ pthread_attr_t attr;
+ size_t size;
+
+ setg_gcc = setg;
+ pthread_attr_init(&attr);
+ pthread_attr_getstacksize(&attr, &size);
+ g->stacklo = (uintptr)&attr - size + 4096;
+ pthread_attr_destroy(&attr);
+}
+
+void
+_cgo_sys_thread_start(ThreadStart *ts)
+{
+ pthread_attr_t attr;
+ sigset_t ign, oset;
+ pthread_t p;
+ size_t size;
+ int err;
+
+ sigfillset(&ign);
+ pthread_sigmask(SIG_SETMASK, &ign, &oset);
+
+ pthread_attr_init(&attr);
+ pthread_attr_getstacksize(&attr, &size);
+
+ // Leave stacklo=0 and set stackhi=size; mstart will do the rest.
+ ts->g->stackhi = size;
+ err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
+
+ pthread_sigmask(SIG_SETMASK, &oset, nil);
+
+ if (err != 0) {
+ fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err));
+ abort();
+ }
+}
+
+extern void crosscall1(void (*fn)(void), void (*setg_gcc)(void*), void *g);
+
+static void*
+threadentry(void *v)
+{
+ ThreadStart ts;
+
+ ts = *(ThreadStart*)v;
+ free(v);
+
+ crosscall1(ts.fn, setg_gcc, (void*)ts.g);
+ return nil;
+}
diff --git a/src/runtime/cgo/gcc_sigaction.c b/src/runtime/cgo/gcc_sigaction.c
index e510e359fe..890008e327 100644
--- a/src/runtime/cgo/gcc_sigaction.c
+++ b/src/runtime/cgo/gcc_sigaction.c
@@ -23,7 +23,7 @@ typedef struct {
} go_sigaction_t;
// SA_RESTORER is part of the kernel interface.
-// This is GNU/Linux i386/amd64 specific.
+// This is Linux i386/amd64 specific.
#ifndef SA_RESTORER
#define SA_RESTORER 0x4000000
#endif
diff --git a/src/runtime/cgo/gcc_windows_amd64.c b/src/runtime/cgo/gcc_windows_amd64.c
index 25cfd086dd..9df9b9b1e4 100644
--- a/src/runtime/cgo/gcc_windows_amd64.c
+++ b/src/runtime/cgo/gcc_windows_amd64.c
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-#define WIN64_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <process.h>
#include <stdlib.h>
diff --git a/src/runtime/cgo/gcc_windows_arm64.c b/src/runtime/cgo/gcc_windows_arm64.c
new file mode 100644
index 0000000000..61ef094866
--- /dev/null
+++ b/src/runtime/cgo/gcc_windows_arm64.c
@@ -0,0 +1,46 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <process.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include "libcgo.h"
+#include "libcgo_windows.h"
+
+static void threadentry(void*);
+static void (*setg_gcc)(void*);
+
+void
+x_cgo_init(G *g, void (*setg)(void*))
+{
+ setg_gcc = setg;
+}
+
+void
+_cgo_sys_thread_start(ThreadStart *ts)
+{
+ uintptr_t thandle;
+
+ thandle = _beginthread(threadentry, 0, ts);
+ if(thandle == -1) {
+ fprintf(stderr, "runtime: failed to create new OS thread (%d)\n", errno);
+ abort();
+ }
+}
+
+extern void crosscall1(void (*fn)(void), void (*setg_gcc)(void*), void *g);
+
+static void
+threadentry(void *v)
+{
+ ThreadStart ts;
+
+ ts = *(ThreadStart*)v;
+ free(v);
+
+ crosscall1(ts.fn, setg_gcc, (void *)ts.g);
+}
diff --git a/src/runtime/cgo/handle.go b/src/runtime/cgo/handle.go
new file mode 100644
index 0000000000..720acca802
--- /dev/null
+++ b/src/runtime/cgo/handle.go
@@ -0,0 +1,109 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cgo
+
+import (
+ "sync"
+ "sync/atomic"
+)
+
+// Handle provides a way to pass values that contain Go pointers
+// (pointers to memory allocated by Go) between Go and C without
+// breaking the cgo pointer passing rules. A Handle is an integer
+// value that can represent any Go value. A Handle can be passed
+// through C and back to Go, and Go code can use the Handle to
+// retrieve the original Go value.
+//
+// The underlying type of Handle is guaranteed to fit in an integer type
+// that is large enough to hold the bit pattern of any pointer. The zero
+// value of a Handle is not valid, and thus is safe to use as a sentinel
+// in C APIs.
+//
+// For instance, on the Go side:
+//
+// package main
+//
+// /*
+// #include <stdint.h> // for uintptr_t
+//
+// extern void MyGoPrint(uintptr_t handle);
+// void myprint(uintptr_t handle);
+// */
+// import "C"
+// import "runtime/cgo"
+//
+// //export MyGoPrint
+// func MyGoPrint(handle C.uintptr_t) {
+// h := cgo.Handle(handle)
+// val := h.Value().(string)
+// println(val)
+// h.Delete()
+// }
+//
+// func main() {
+// val := "hello Go"
+// C.myprint(C.uintptr_t(cgo.NewHandle(val)))
+// // Output: hello Go
+// }
+//
+// and on the C side:
+//
+// #include <stdint.h> // for uintptr_t
+//
+// // A Go function
+// extern void MyGoPrint(uintptr_t handle);
+//
+// // A C function
+// void myprint(uintptr_t handle) {
+// MyGoPrint(handle);
+// }
+type Handle uintptr
+
+// NewHandle returns a handle for a given value.
+//
+// The handle is valid until the program calls Delete on it. The handle
+// uses resources, and this package assumes that C code may hold on to
+// the handle, so a program must explicitly call Delete when the handle
+// is no longer needed.
+//
+// The intended use is to pass the returned handle to C code, which
+// passes it back to Go, which calls Value.
+func NewHandle(v interface{}) Handle {
+ h := atomic.AddUintptr(&handleIdx, 1)
+ if h == 0 {
+ panic("runtime/cgo: ran out of handle space")
+ }
+
+ handles.Store(h, v)
+ return Handle(h)
+}
+
+// Value returns the associated Go value for a valid handle.
+//
+// The method panics if the handle is invalid.
+func (h Handle) Value() interface{} {
+ v, ok := handles.Load(uintptr(h))
+ if !ok {
+ panic("runtime/cgo: misuse of an invalid Handle")
+ }
+ return v
+}
+
+// Delete invalidates a handle. This method should only be called once
+// the program no longer needs to pass the handle to C and the C code
+// no longer has a copy of the handle value.
+//
+// The method panics if the handle is invalid.
+func (h Handle) Delete() {
+ _, ok := handles.LoadAndDelete(uintptr(h))
+ if !ok {
+ panic("runtime/cgo: misuse of an invalid Handle")
+ }
+}
+
+var (
+ handles = sync.Map{} // map[Handle]interface{}
+ handleIdx uintptr // atomic
+)
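
A pure-Go sketch of the Handle lifecycle; no C boundary here, but the uintptr below is exactly what would cross into C as a uintptr_t:

	package main

	import (
		"fmt"
		"runtime/cgo"
	)

	func main() {
		h := cgo.NewHandle(map[string]int{"answer": 42}) // any Go value
		u := uintptr(h)                                  // the integer handed to C

		// ...later, typically on the way back out of C...
		v := cgo.Handle(u).Value().(map[string]int)
		fmt.Println(v["answer"]) // 42

		cgo.Handle(u).Delete() // release; the handle is invalid afterwards
	}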
diff --git a/src/runtime/cgo/handle_test.go b/src/runtime/cgo/handle_test.go
new file mode 100644
index 0000000000..738051a0ea
--- /dev/null
+++ b/src/runtime/cgo/handle_test.go
@@ -0,0 +1,103 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cgo
+
+import (
+ "reflect"
+ "testing"
+)
+
+func TestHandle(t *testing.T) {
+ v := 42
+
+ tests := []struct {
+ v1 interface{}
+ v2 interface{}
+ }{
+ {v1: v, v2: v},
+ {v1: &v, v2: &v},
+ {v1: nil, v2: nil},
+ }
+
+ for _, tt := range tests {
+ h1 := NewHandle(tt.v1)
+ h2 := NewHandle(tt.v2)
+
+ if uintptr(h1) == 0 || uintptr(h2) == 0 {
+ t.Fatalf("NewHandle returns zero")
+ }
+
+ if uintptr(h1) == uintptr(h2) {
+ t.Fatalf("Duplicated Go values should have different handles, but got equal")
+ }
+
+ h1v := h1.Value()
+ h2v := h2.Value()
+ if !reflect.DeepEqual(h1v, h2v) || !reflect.DeepEqual(h1v, tt.v1) {
+ t.Fatalf("Value of a Handle got wrong, got %+v %+v, want %+v", h1v, h2v, tt.v1)
+ }
+
+ h1.Delete()
+ h2.Delete()
+ }
+
+ siz := 0
+ handles.Range(func(k, v interface{}) bool {
+ siz++
+ return true
+ })
+ if siz != 0 {
+ t.Fatalf("handles are not cleared, got %d, want %d", siz, 0)
+ }
+}
+
+func TestInvalidHandle(t *testing.T) {
+ t.Run("zero", func(t *testing.T) {
+ h := Handle(0)
+
+ defer func() {
+ if r := recover(); r != nil {
+ return
+ }
+ t.Fatalf("Delete of zero handle did not trigger a panic")
+ }()
+
+ h.Delete()
+ })
+
+ t.Run("invalid", func(t *testing.T) {
+ h := NewHandle(42)
+
+ defer func() {
+ if r := recover(); r != nil {
+ h.Delete()
+ return
+ }
+ t.Fatalf("Invalid handle did not trigger a panic")
+ }()
+
+ Handle(h + 1).Delete()
+ })
+}
+
+func BenchmarkHandle(b *testing.B) {
+ b.Run("non-concurrent", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ h := NewHandle(i)
+ _ = h.Value()
+ h.Delete()
+ }
+ })
+ b.Run("concurrent", func(b *testing.B) {
+ b.RunParallel(func(pb *testing.PB) {
+ var v int
+ for pb.Next() {
+ h := NewHandle(v)
+ _ = h.Value()
+ h.Delete()
+ }
+ })
+ })
+}
diff --git a/src/runtime/cgo/linux.go b/src/runtime/cgo/linux.go
index 76c0192c20..070d531bee 100644
--- a/src/runtime/cgo/linux.go
+++ b/src/runtime/cgo/linux.go
@@ -5,6 +5,7 @@
// Linux system call wrappers that provide POSIX semantics through the
// corresponding cgo->libc (nptl) wrappers for various system calls.
+//go:build linux
// +build linux
package cgo
diff --git a/src/runtime/cgo/mmap.go b/src/runtime/cgo/mmap.go
index 00fb7fced6..347ae6b363 100644
--- a/src/runtime/cgo/mmap.go
+++ b/src/runtime/cgo/mmap.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build (linux && amd64) || (linux && arm64)
// +build linux,amd64 linux,arm64
package cgo
diff --git a/src/runtime/cgo/netbsd.go b/src/runtime/cgo/netbsd.go
index 74d0aed014..7e17d1fcd2 100644
--- a/src/runtime/cgo/netbsd.go
+++ b/src/runtime/cgo/netbsd.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build netbsd
// +build netbsd
package cgo
diff --git a/src/runtime/cgo/openbsd.go b/src/runtime/cgo/openbsd.go
index 81c73bf399..f6215613c3 100644
--- a/src/runtime/cgo/openbsd.go
+++ b/src/runtime/cgo/openbsd.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build openbsd
// +build openbsd
package cgo
diff --git a/src/runtime/cgo/setenv.go b/src/runtime/cgo/setenv.go
index 6495fcb5f8..e6e8040ae0 100644
--- a/src/runtime/cgo/setenv.go
+++ b/src/runtime/cgo/setenv.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris
// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
package cgo
diff --git a/src/runtime/cgo/sigaction.go b/src/runtime/cgo/sigaction.go
index 076fbc1a0a..ee63ea4c09 100644
--- a/src/runtime/cgo/sigaction.go
+++ b/src/runtime/cgo/sigaction.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build (linux && amd64) || (freebsd && amd64) || (linux && arm64)
// +build linux,amd64 freebsd,amd64 linux,arm64
package cgo
diff --git a/src/runtime/cgo_mmap.go b/src/runtime/cgo_mmap.go
index d5e0cc1e3e..17d26b4cc8 100644
--- a/src/runtime/cgo_mmap.go
+++ b/src/runtime/cgo_mmap.go
@@ -4,6 +4,7 @@
// Support for memory sanitizer. See runtime/cgo/mmap.go.
+//go:build (linux && amd64) || (linux && arm64)
// +build linux,amd64 linux,arm64
package runtime
diff --git a/src/runtime/cgo_ppc64x.go b/src/runtime/cgo_ppc64x.go
index fb2da32c7e..4dc92ea321 100644
--- a/src/runtime/cgo_ppc64x.go
+++ b/src/runtime/cgo_ppc64x.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ppc64 || ppc64le
// +build ppc64 ppc64le
package runtime
diff --git a/src/runtime/cgo_sigaction.go b/src/runtime/cgo_sigaction.go
index de634dc957..15690ecb0b 100644
--- a/src/runtime/cgo_sigaction.go
+++ b/src/runtime/cgo_sigaction.go
@@ -4,6 +4,7 @@
// Support for memory sanitizer. See runtime/cgo/sigaction.go.
+//go:build (linux && amd64) || (freebsd && amd64) || (linux && arm64)
// +build linux,amd64 freebsd,amd64 linux,arm64
package runtime
diff --git a/src/runtime/cgocall.go b/src/runtime/cgocall.go
index 534a2c4295..0e287d0b8e 100644
--- a/src/runtime/cgocall.go
+++ b/src/runtime/cgocall.go
@@ -195,7 +195,7 @@ func cgocall(fn, arg unsafe.Pointer) int32 {
return errno
}
-// Call from C back to Go.
+// Call from C back to Go. fn must point to an ABIInternal Go entry-point.
//go:nosplit
func cgocallbackg(fn, frame unsafe.Pointer, ctxt uintptr) {
gp := getg()
diff --git a/src/runtime/chan.go b/src/runtime/chan.go
index ba56e2cc40..f2a75b30f4 100644
--- a/src/runtime/chan.go
+++ b/src/runtime/chan.go
@@ -690,28 +690,6 @@ func selectnbsend(c *hchan, elem unsafe.Pointer) (selected bool) {
// compiler implements
//
// select {
-// case v = <-c:
-// ... foo
-// default:
-// ... bar
-// }
-//
-// as
-//
-// if selectnbrecv(&v, c) {
-// ... foo
-// } else {
-// ... bar
-// }
-//
-func selectnbrecv(elem unsafe.Pointer, c *hchan) (selected bool) {
- selected, _ = chanrecv(c, elem, false)
- return
-}
-
-// compiler implements
-//
-// select {
// case v, ok = <-c:
// ... foo
// default:
@@ -720,16 +698,14 @@ func selectnbrecv(elem unsafe.Pointer, c *hchan) (selected bool) {
//
// as
//
-// if c != nil && selectnbrecv2(&v, &ok, c) {
+// if selected, ok = selectnbrecv(&v, c); selected {
// ... foo
// } else {
// ... bar
// }
//
-func selectnbrecv2(elem unsafe.Pointer, received *bool, c *hchan) (selected bool) {
- // TODO(khr): just return 2 values from this function, now that it is in Go.
- selected, *received = chanrecv(c, elem, false)
- return
+func selectnbrecv(elem unsafe.Pointer, c *hchan) (selected, received bool) {
+ return chanrecv(c, elem, false)
}
//go:linkname reflect_chansend reflect.chansend
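
At the language level, the form now lowered through the two-result selectnbrecv is the ordinary non-blocking receive; a self-contained sketch:

	package main

	import "fmt"

	func main() {
		c := make(chan int, 1)
		c <- 7

		// Compiled via selectnbrecv: the case fires iff the receive
		// would not block, and ok distinguishes a real send from a close.
		select {
		case v, ok := <-c:
			fmt.Println("received", v, ok) // received 7 true
		default:
			fmt.Println("would block")
		}
	}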
diff --git a/src/runtime/chan_test.go b/src/runtime/chan_test.go
index 756bbbeccf..c9ce3ac643 100644
--- a/src/runtime/chan_test.go
+++ b/src/runtime/chan_test.go
@@ -631,7 +631,7 @@ func TestNoShrinkStackWhileParking(t *testing.T) {
// channel. See issue 40641 for more details on the problem.
//
// The way we try to induce this failure is to set up two
- // goroutines: a sender and a reciever that communicate across
+ // goroutines: a sender and a receiver that communicate across
// a channel. We try to set up a situation where the sender
// grows its stack temporarily then *fully* blocks on a channel
// often. Meanwhile a GC is triggered so that we try to get a
@@ -671,7 +671,7 @@ func TestNoShrinkStackWhileParking(t *testing.T) {
go send(c, done)
// Wait a little bit before triggering
// the GC to make sure the sender and
- // reciever have gotten into their groove.
+ // receiver have gotten into their groove.
time.Sleep(50 * time.Microsecond)
runtime.GC()
<-done
@@ -708,8 +708,6 @@ func TestSelectDuplicateChannel(t *testing.T) {
c <- 8 // wake up B. This operation used to fail because c.recvq was corrupted (it tries to wake up an already running G instead of B)
}
-var selectSink interface{}
-
func TestSelectStackAdjust(t *testing.T) {
// Test that channel receive slots that contain local stack
// pointers are adjusted correctly by stack shrinking.
@@ -766,20 +764,8 @@ func TestSelectStackAdjust(t *testing.T) {
<-ready2
time.Sleep(10 * time.Millisecond)
- // Force concurrent GC a few times.
- var before, after runtime.MemStats
- runtime.ReadMemStats(&before)
- for i := 0; i < 100; i++ {
- selectSink = new([1 << 20]byte)
- runtime.ReadMemStats(&after)
- if after.NumGC-before.NumGC >= 2 {
- goto done
- }
- runtime.Gosched()
- }
- t.Fatal("failed to trigger concurrent GC")
-done:
- selectSink = nil
+ // Force concurrent GC to shrink the stacks.
+ runtime.GC()
// Wake selects.
close(d)
diff --git a/src/runtime/cputicks.go b/src/runtime/cputicks.go
index 7beb57ea12..7c926f4a2b 100644
--- a/src/runtime/cputicks.go
+++ b/src/runtime/cputicks.go
@@ -2,13 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !arm
-// +build !arm64
-// +build !mips64
-// +build !mips64le
-// +build !mips
-// +build !mipsle
-// +build !wasm
+//go:build !arm && !arm64 && !mips64 && !mips64le && !mips && !mipsle && !wasm
+// +build !arm,!arm64,!mips64,!mips64le,!mips,!mipsle,!wasm
package runtime
diff --git a/src/runtime/crash_cgo_test.go b/src/runtime/crash_cgo_test.go
index 140c170ddc..7d25c51aa2 100644
--- a/src/runtime/crash_cgo_test.go
+++ b/src/runtime/crash_cgo_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build cgo
// +build cgo
package runtime_test
diff --git a/src/runtime/crash_nonunix_test.go b/src/runtime/crash_nonunix_test.go
index 06c197ec2b..5f61476f21 100644
--- a/src/runtime/crash_nonunix_test.go
+++ b/src/runtime/crash_nonunix_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build windows || plan9 || (js && wasm)
// +build windows plan9 js,wasm
package runtime_test
diff --git a/src/runtime/crash_unix_test.go b/src/runtime/crash_unix_test.go
index 803b031873..9469d5e6a6 100644
--- a/src/runtime/crash_unix_test.go
+++ b/src/runtime/crash_unix_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris
// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
package runtime_test
@@ -12,7 +13,6 @@ import (
"io"
"os"
"os/exec"
- "path/filepath"
"runtime"
"strings"
"sync"
@@ -77,31 +77,14 @@ func TestCrashDumpsAllThreads(t *testing.T) {
t.Skip("skipping; SIGQUIT is blocked, see golang.org/issue/19196")
}
- // We don't use executeTest because we need to kill the
- // program while it is running.
-
testenv.MustHaveGoBuild(t)
- t.Parallel()
-
- dir, err := os.MkdirTemp("", "go-build")
- if err != nil {
- t.Fatalf("failed to create temp directory: %v", err)
- }
- defer os.RemoveAll(dir)
-
- if err := os.WriteFile(filepath.Join(dir, "main.go"), []byte(crashDumpsAllThreadsSource), 0666); err != nil {
- t.Fatalf("failed to create Go file: %v", err)
- }
-
- cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", "a.exe", "main.go")
- cmd.Dir = dir
- out, err := testenv.CleanCmdEnv(cmd).CombinedOutput()
+ exe, err := buildTestProg(t, "testprog")
if err != nil {
- t.Fatalf("building source: %v\n%s", err, out)
+ t.Fatal(err)
}
- cmd = exec.Command(filepath.Join(dir, "a.exe"))
+ cmd := exec.Command(exe, "CrashDumpsAllThreads")
cmd = testenv.CleanCmdEnv(cmd)
cmd.Env = append(cmd.Env,
"GOTRACEBACK=crash",
@@ -123,9 +106,12 @@ func TestCrashDumpsAllThreads(t *testing.T) {
if err != nil {
t.Fatal(err)
}
+ defer rp.Close()
+
cmd.ExtraFiles = []*os.File{wp}
if err := cmd.Start(); err != nil {
+ wp.Close()
t.Fatalf("starting program: %v", err)
}
@@ -147,56 +133,14 @@ func TestCrashDumpsAllThreads(t *testing.T) {
// We want to see a stack trace for each thread.
// Before https://golang.org/cl/2811 running threads would say
// "goroutine running on other thread; stack unavailable".
- out = outbuf.Bytes()
- n := bytes.Count(out, []byte("main.loop("))
+ out := outbuf.Bytes()
+ n := bytes.Count(out, []byte("main.crashDumpsAllThreadsLoop("))
if n != 4 {
t.Errorf("found %d instances of main.loop; expected 4", n)
t.Logf("%s", out)
}
}
-const crashDumpsAllThreadsSource = `
-package main
-
-import (
- "fmt"
- "os"
- "runtime"
-)
-
-func main() {
- const count = 4
- runtime.GOMAXPROCS(count + 1)
-
- chans := make([]chan bool, count)
- for i := range chans {
- chans[i] = make(chan bool)
- go loop(i, chans[i])
- }
-
- // Wait for all the goroutines to start executing.
- for _, c := range chans {
- <-c
- }
-
- // Tell our parent that all the goroutines are executing.
- if _, err := os.NewFile(3, "pipe").WriteString("x"); err != nil {
- fmt.Fprintf(os.Stderr, "write to pipe failed: %v\n", err)
- os.Exit(2)
- }
-
- select {}
-}
-
-func loop(i int, c chan bool) {
- close(c)
- for {
- for j := 0; j < 0x7fffffff; j++ {
- }
- }
-}
-`
-
func TestPanicSystemstack(t *testing.T) {
// Test that GOTRACEBACK=crash prints both the system and user
// stack of other threads.
diff --git a/src/runtime/debug/panic_test.go b/src/runtime/debug/panic_test.go
index b67a3de4f9..b93631e1d8 100644
--- a/src/runtime/debug/panic_test.go
+++ b/src/runtime/debug/panic_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd
// +build aix darwin dragonfly freebsd linux netbsd openbsd
// TODO: test on Windows?
diff --git a/src/runtime/debug_test.go b/src/runtime/debug_test.go
index a0b3f84382..f74383457f 100644
--- a/src/runtime/debug_test.go
+++ b/src/runtime/debug_test.go
@@ -9,14 +9,16 @@
// spends all of its time in the race runtime, which isn't a safe
// point.
-// +build amd64
-// +build linux
-// +build !race
+//go:build amd64 && linux && !race
+// +build amd64,linux,!race
package runtime_test
import (
"fmt"
+ "internal/abi"
+ "internal/goexperiment"
+ "math"
"os"
"regexp"
"runtime"
@@ -116,21 +118,49 @@ func TestDebugCall(t *testing.T) {
g, after := startDebugCallWorker(t)
defer after()
+ type stackArgs struct {
+ x0 int
+ x1 float64
+ y0Ret int
+ y1Ret float64
+ }
+
// Inject a call into the debugCallWorker goroutine and test
// basic argument and result passing.
- var args struct {
- x int
- yRet int
+ fn := func(x int, y float64) (y0Ret int, y1Ret float64) {
+ return x + 1, y + 1.0
}
- fn := func(x int) (yRet int) {
- return x + 1
+ var args *stackArgs
+ var regs abi.RegArgs
+ intRegs := regs.Ints[:]
+ floatRegs := regs.Floats[:]
+ fval := float64(42.0)
+ if goexperiment.RegabiArgs {
+ intRegs[0] = 42
+ floatRegs[0] = math.Float64bits(fval)
+ } else {
+ args = &stackArgs{
+ x0: 42,
+ x1: 42.0,
+ }
}
- args.x = 42
- if _, err := runtime.InjectDebugCall(g, fn, &args, debugCallTKill, false); err != nil {
+ if _, err := runtime.InjectDebugCall(g, fn, &regs, args, debugCallTKill, false); err != nil {
t.Fatal(err)
}
- if args.yRet != 43 {
- t.Fatalf("want 43, got %d", args.yRet)
+ var result0 int
+ var result1 float64
+ if goexperiment.RegabiArgs {
+ result0 = int(intRegs[0])
+ result1 = math.Float64frombits(floatRegs[0])
+ } else {
+ result0 = args.y0Ret
+ result1 = args.y1Ret
+ }
+ if result0 != 43 {
+ t.Errorf("want 43, got %d", result0)
+ }
+ if result1 != fval+1 {
+ t.Errorf("want 43, got %f", result1)
}
}
@@ -155,7 +185,7 @@ func TestDebugCallLarge(t *testing.T) {
args.in[i] = i
want[i] = i + 1
}
- if _, err := runtime.InjectDebugCall(g, fn, &args, debugCallTKill, false); err != nil {
+ if _, err := runtime.InjectDebugCall(g, fn, nil, &args, debugCallTKill, false); err != nil {
t.Fatal(err)
}
if want != args.out {
@@ -168,7 +198,7 @@ func TestDebugCallGC(t *testing.T) {
defer after()
// Inject a call that performs a GC.
- if _, err := runtime.InjectDebugCall(g, runtime.GC, nil, debugCallTKill, false); err != nil {
+ if _, err := runtime.InjectDebugCall(g, runtime.GC, nil, nil, debugCallTKill, false); err != nil {
t.Fatal(err)
}
}
@@ -179,7 +209,7 @@ func TestDebugCallGrowStack(t *testing.T) {
// Inject a call that grows the stack. debugCallWorker checks
// for stack pointer breakage.
- if _, err := runtime.InjectDebugCall(g, func() { growStack(nil) }, nil, debugCallTKill, false); err != nil {
+ if _, err := runtime.InjectDebugCall(g, func() { growStack(nil) }, nil, nil, debugCallTKill, false); err != nil {
t.Fatal(err)
}
}
@@ -215,7 +245,7 @@ func TestDebugCallUnsafePoint(t *testing.T) {
runtime.Gosched()
}
- _, err := runtime.InjectDebugCall(g, func() {}, nil, debugCallTKill, true)
+ _, err := runtime.InjectDebugCall(g, func() {}, nil, nil, debugCallTKill, true)
if msg := "call not at safe point"; err == nil || err.Error() != msg {
t.Fatalf("want %q, got %s", msg, err)
}
@@ -239,7 +269,7 @@ func TestDebugCallPanic(t *testing.T) {
}()
g := <-ready
- p, err := runtime.InjectDebugCall(g, func() { panic("test") }, nil, debugCallTKill, false)
+ p, err := runtime.InjectDebugCall(g, func() { panic("test") }, nil, nil, debugCallTKill, false)
if err != nil {
t.Fatal(err)
}
diff --git a/src/runtime/debugcall.go b/src/runtime/debugcall.go
index efc68a767d..faddf59eed 100644
--- a/src/runtime/debugcall.go
+++ b/src/runtime/debugcall.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build amd64
// +build amd64
package runtime
@@ -15,7 +16,7 @@ const (
debugCallUnsafePoint = "call not at safe point"
)
-func debugCallV1()
+func debugCallV2()
func debugCallPanicked(val interface{})
// debugCallCheck checks whether it is safe to inject a debugger
@@ -95,7 +96,7 @@ func debugCallCheck(pc uintptr) string {
// function at PC dispatch.
//
// This must be deeply nosplit because there are untyped values on the
-// stack from debugCallV1.
+// stack from debugCallV2.
//
//go:nosplit
func debugCallWrap(dispatch uintptr) {
@@ -107,14 +108,16 @@ func debugCallWrap(dispatch uintptr) {
// Create a new goroutine to execute the call on. Run this on
// the system stack to avoid growing our stack.
systemstack(func() {
- var args struct {
- dispatch uintptr
- callingG *g
- }
- args.dispatch = dispatch
- args.callingG = gp
+ // TODO(mknyszek): It would be nice to wrap these arguments in an allocated
+ // closure and start the goroutine with that closure, but the compiler disallows
+ // implicit closure allocation in the runtime.
fn := debugCallWrap1
- newg := newproc1(*(**funcval)(unsafe.Pointer(&fn)), unsafe.Pointer(&args), int32(unsafe.Sizeof(args)), gp, callerpc)
+ newg := newproc1(*(**funcval)(unsafe.Pointer(&fn)), nil, 0, gp, callerpc)
+ args := &debugCallWrapArgs{
+ dispatch: dispatch,
+ callingG: gp,
+ }
+ newg.param = unsafe.Pointer(args)
// If the current G is locked, then transfer that
// locked-ness to the new goroutine.
@@ -184,9 +187,19 @@ func debugCallWrap(dispatch uintptr) {
gp.asyncSafePoint = false
}
+type debugCallWrapArgs struct {
+ dispatch uintptr
+ callingG *g
+}
+
// debugCallWrap1 is the continuation of debugCallWrap on the callee
// goroutine.
-func debugCallWrap1(dispatch uintptr, callingG *g) {
+func debugCallWrap1() {
+ gp := getg()
+ args := (*debugCallWrapArgs)(gp.param)
+ dispatch, callingG := args.dispatch, args.callingG
+ gp.param = nil
+
// Dispatch call and trap panics.
debugCallWrap2(dispatch)
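
The pattern here — hand the new goroutine one heap-allocated args struct rather than copying an argument frame — looks like this in ordinary Go, where the explicit parameter plays the role of gp.param (purely illustrative):

	package main

	import "fmt"

	type wrapArgs struct {
		dispatch uintptr
		name     string
	}

	func main() {
		done := make(chan struct{})
		args := &wrapArgs{dispatch: 0x1234, name: "worker"}

		// A single pointer crosses to the new goroutine; nothing is
		// copied onto its stack besides that pointer.
		go func(a *wrapArgs) {
			fmt.Printf("%s: dispatch %#x\n", a.name, a.dispatch)
			close(done)
		}(args)

		<-done
	}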
diff --git a/src/runtime/debuglog_off.go b/src/runtime/debuglog_off.go
index bb3e172498..dd38156999 100644
--- a/src/runtime/debuglog_off.go
+++ b/src/runtime/debuglog_off.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !debuglog
// +build !debuglog
package runtime
diff --git a/src/runtime/debuglog_on.go b/src/runtime/debuglog_on.go
index 3d477e8ef5..2fcdbe70d1 100644
--- a/src/runtime/debuglog_on.go
+++ b/src/runtime/debuglog_on.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build debuglog
// +build debuglog
package runtime
diff --git a/src/runtime/defer_test.go b/src/runtime/defer_test.go
index 9a40ea1984..fc96144597 100644
--- a/src/runtime/defer_test.go
+++ b/src/runtime/defer_test.go
@@ -370,7 +370,7 @@ func g2() {
defer ap.method2()
defer ap.method1()
ff1(ap, 1, 2, 3, 4, 5, 6, 7, 8, 9)
- // Try to get the stack to be be moved by growing it too large, so
+ // Try to get the stack to be moved by growing it too large, so
// existing stack-allocated defer becomes invalid.
rec1(2000)
}
diff --git a/src/runtime/defs1_linux.go b/src/runtime/defs1_linux.go
index 4085d6f418..df9c05dd5e 100644
--- a/src/runtime/defs1_linux.go
+++ b/src/runtime/defs1_linux.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
/*
diff --git a/src/runtime/defs2_linux.go b/src/runtime/defs2_linux.go
index 87e19c1598..d016db7d02 100644
--- a/src/runtime/defs2_linux.go
+++ b/src/runtime/defs2_linux.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
/*
diff --git a/src/runtime/defs3_linux.go b/src/runtime/defs3_linux.go
index 31f2191c76..0a06aa2370 100644
--- a/src/runtime/defs3_linux.go
+++ b/src/runtime/defs3_linux.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
/*
diff --git a/src/runtime/defs_aix.go b/src/runtime/defs_aix.go
index 23a6cac2bb..1605002ea2 100644
--- a/src/runtime/defs_aix.go
+++ b/src/runtime/defs_aix.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
/*
diff --git a/src/runtime/defs_aix_ppc64.go b/src/runtime/defs_aix_ppc64.go
index a53fcc5933..f84ff1160d 100644
--- a/src/runtime/defs_aix_ppc64.go
+++ b/src/runtime/defs_aix_ppc64.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix
// +build aix
package runtime
diff --git a/src/runtime/defs_arm_linux.go b/src/runtime/defs_arm_linux.go
index e51dd32b5b..f6b6dd2c09 100644
--- a/src/runtime/defs_arm_linux.go
+++ b/src/runtime/defs_arm_linux.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
/*
diff --git a/src/runtime/defs_darwin.go b/src/runtime/defs_darwin.go
index cc8c475387..2d41a97b57 100644
--- a/src/runtime/defs_darwin.go
+++ b/src/runtime/defs_darwin.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
/*
diff --git a/src/runtime/defs_dragonfly.go b/src/runtime/defs_dragonfly.go
index 95014fe6e7..aca2bf9001 100644
--- a/src/runtime/defs_dragonfly.go
+++ b/src/runtime/defs_dragonfly.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
/*
@@ -31,6 +32,10 @@ const (
EFAULT = C.EFAULT
EBUSY = C.EBUSY
EAGAIN = C.EAGAIN
+ ENOSYS = C.ENOSYS
+
+ O_NONBLOCK = C.O_NONBLOCK
+ O_CLOEXEC = C.O_CLOEXEC
PROT_NONE = C.PROT_NONE
PROT_READ = C.PROT_READ
diff --git a/src/runtime/defs_dragonfly_amd64.go b/src/runtime/defs_dragonfly_amd64.go
index 30f1b33845..f3c6ecd04b 100644
--- a/src/runtime/defs_dragonfly_amd64.go
+++ b/src/runtime/defs_dragonfly_amd64.go
@@ -10,6 +10,10 @@ const (
_EFAULT = 0xe
_EBUSY = 0x10
_EAGAIN = 0x23
+ _ENOSYS = 0x4e
+
+ _O_NONBLOCK = 0x4
+ _O_CLOEXEC = 0x20000
_PROT_NONE = 0x0
_PROT_READ = 0x1
diff --git a/src/runtime/defs_freebsd.go b/src/runtime/defs_freebsd.go
index e196dff076..c258759549 100644
--- a/src/runtime/defs_freebsd.go
+++ b/src/runtime/defs_freebsd.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
/*
diff --git a/src/runtime/defs_linux.go b/src/runtime/defs_linux.go
index 7b14063386..022ef19427 100644
--- a/src/runtime/defs_linux.go
+++ b/src/runtime/defs_linux.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
/*
diff --git a/src/runtime/defs_linux_mips64x.go b/src/runtime/defs_linux_mips64x.go
index 1fb423b198..2cafad20cf 100644
--- a/src/runtime/defs_linux_mips64x.go
+++ b/src/runtime/defs_linux_mips64x.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build (mips64 || mips64le) && linux
// +build mips64 mips64le
// +build linux
diff --git a/src/runtime/defs_linux_mipsx.go b/src/runtime/defs_linux_mipsx.go
index 9315ba9346..3a8dfe2e99 100644
--- a/src/runtime/defs_linux_mipsx.go
+++ b/src/runtime/defs_linux_mipsx.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build (mips || mipsle) && linux
// +build mips mipsle
// +build linux
diff --git a/src/runtime/defs_netbsd.go b/src/runtime/defs_netbsd.go
index 3f5ce5adca..755992d18e 100644
--- a/src/runtime/defs_netbsd.go
+++ b/src/runtime/defs_netbsd.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
/*
diff --git a/src/runtime/defs_netbsd_386.go b/src/runtime/defs_netbsd_386.go
index c26f246077..03c9c2de59 100644
--- a/src/runtime/defs_netbsd_386.go
+++ b/src/runtime/defs_netbsd_386.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
/*
diff --git a/src/runtime/defs_netbsd_amd64.go b/src/runtime/defs_netbsd_amd64.go
index f18a7b1fe3..9fda1d7d22 100644
--- a/src/runtime/defs_netbsd_amd64.go
+++ b/src/runtime/defs_netbsd_amd64.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
/*
diff --git a/src/runtime/defs_netbsd_arm.go b/src/runtime/defs_netbsd_arm.go
index cb0dce66b4..e7f4f6cece 100644
--- a/src/runtime/defs_netbsd_arm.go
+++ b/src/runtime/defs_netbsd_arm.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
/*
diff --git a/src/runtime/defs_openbsd.go b/src/runtime/defs_openbsd.go
index ff7e21c71e..8d323449d1 100644
--- a/src/runtime/defs_openbsd.go
+++ b/src/runtime/defs_openbsd.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
/*
diff --git a/src/runtime/defs_openbsd_386.go b/src/runtime/defs_openbsd_386.go
index 35f2e53fcf..a866ec880a 100644
--- a/src/runtime/defs_openbsd_386.go
+++ b/src/runtime/defs_openbsd_386.go
@@ -30,6 +30,13 @@ const (
_SA_RESTART = 0x2
_SA_ONSTACK = 0x1
+ _PTHREAD_CREATE_DETACHED = 0x1
+
+ _F_SETFD = 0x2
+ _F_GETFL = 0x3
+ _F_SETFL = 0x4
+ _FD_CLOEXEC = 0x1
+
_SIGHUP = 0x1
_SIGINT = 0x2
_SIGQUIT = 0x3
@@ -166,3 +173,10 @@ type keventt struct {
data int64
udata *byte
}
+
+type pthread uintptr
+type pthreadattr uintptr
+type pthreadcond uintptr
+type pthreadcondattr uintptr
+type pthreadmutex uintptr
+type pthreadmutexattr uintptr
diff --git a/src/runtime/defs_solaris.go b/src/runtime/defs_solaris.go
index 22df59094d..e644f9c6dd 100644
--- a/src/runtime/defs_solaris.go
+++ b/src/runtime/defs_solaris.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
/*
diff --git a/src/runtime/defs_solaris_amd64.go b/src/runtime/defs_solaris_amd64.go
index 0493178880..a0b38319a5 100644
--- a/src/runtime/defs_solaris_amd64.go
+++ b/src/runtime/defs_solaris_amd64.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
/*
diff --git a/src/runtime/env_posix.go b/src/runtime/env_posix.go
index af353bbcd9..95517b2a95 100644
--- a/src/runtime/env_posix.go
+++ b/src/runtime/env_posix.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix || darwin || dragonfly || freebsd || (js && wasm) || linux || netbsd || openbsd || solaris || windows || plan9
// +build aix darwin dragonfly freebsd js,wasm linux netbsd openbsd solaris windows plan9
package runtime
diff --git a/src/runtime/error.go b/src/runtime/error.go
index 9e6cdf35dd..91f83ae126 100644
--- a/src/runtime/error.go
+++ b/src/runtime/error.go
@@ -134,6 +134,7 @@ const (
boundsSlice3B // s[?:x:y], 0 <= x <= y failed (but boundsSlice3A didn't happen)
boundsSlice3C // s[x:y:?], 0 <= x <= y failed (but boundsSlice3A/B didn't happen)
+ boundsConvert // (*[x]T)(s), 0 <= x <= len(s) failed
// Note: in the above, len(s) and cap(s) are stored in y
)
@@ -149,6 +150,7 @@ var boundsErrorFmts = [...]string{
boundsSlice3Acap: "slice bounds out of range [::%x] with capacity %y",
boundsSlice3B: "slice bounds out of range [:%x:%y]",
boundsSlice3C: "slice bounds out of range [%x:%y:]",
+ boundsConvert: "cannot convert slice with length %y to pointer to array with length %x",
}
// boundsNegErrorFmts are overriding formats if x is negative. In this case there's no need to report y.
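
boundsConvert backs the Go 1.17 slice-to-array-pointer conversion: converting a slice shorter than the target array length panics with exactly this format. A sketch:

	package main

	import "fmt"

	func main() {
		s := []int{1, 2, 3}

		defer func() {
			// runtime error: cannot convert slice with length 3
			// to pointer to array with length 4
			fmt.Println("recovered:", recover())
		}()

		p := (*[4]int)(s) // len(s) == 3 < 4: panics via boundsConvert
		_ = p
	}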
diff --git a/src/runtime/export_debug_regabiargs_off_test.go b/src/runtime/export_debug_regabiargs_off_test.go
new file mode 100644
index 0000000000..fce37ab4d1
--- /dev/null
+++ b/src/runtime/export_debug_regabiargs_off_test.go
@@ -0,0 +1,17 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build amd64 && linux && !goexperiment.regabiargs
+// +build amd64,linux
+// +build !goexperiment.regabiargs
+
+package runtime
+
+import "internal/abi"
+
+func storeRegArgs(dst *sigcontext, src *abi.RegArgs) {
+}
+
+func loadRegArgs(dst *abi.RegArgs, src *sigcontext) {
+}
diff --git a/src/runtime/export_debug_regabiargs_on_test.go b/src/runtime/export_debug_regabiargs_on_test.go
new file mode 100644
index 0000000000..3c65127e56
--- /dev/null
+++ b/src/runtime/export_debug_regabiargs_on_test.go
@@ -0,0 +1,47 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build amd64 && linux && goexperiment.regabiargs
+// +build amd64,linux
+// +build goexperiment.regabiargs
+
+package runtime
+
+import "internal/abi"
+
+// storeRegArgs sets up argument registers in the signal
+// context state from an abi.RegArgs.
+//
+// Both src and dst must be non-nil.
+func storeRegArgs(dst *sigcontext, src *abi.RegArgs) {
+ dst.rax = uint64(src.Ints[0])
+ dst.rbx = uint64(src.Ints[1])
+ dst.rcx = uint64(src.Ints[2])
+ dst.rdi = uint64(src.Ints[3])
+ dst.rsi = uint64(src.Ints[4])
+ dst.r8 = uint64(src.Ints[5])
+ dst.r9 = uint64(src.Ints[6])
+ dst.r10 = uint64(src.Ints[7])
+ dst.r11 = uint64(src.Ints[8])
+ for i := range src.Floats {
+ dst.fpstate._xmm[i].element[0] = uint32(src.Floats[i] >> 0)
+ dst.fpstate._xmm[i].element[1] = uint32(src.Floats[i] >> 32)
+ }
+}
+
+func loadRegArgs(dst *abi.RegArgs, src *sigcontext) {
+ dst.Ints[0] = uintptr(src.rax)
+ dst.Ints[1] = uintptr(src.rbx)
+ dst.Ints[2] = uintptr(src.rcx)
+ dst.Ints[3] = uintptr(src.rdi)
+ dst.Ints[4] = uintptr(src.rsi)
+ dst.Ints[5] = uintptr(src.r8)
+ dst.Ints[6] = uintptr(src.r9)
+ dst.Ints[7] = uintptr(src.r10)
+ dst.Ints[8] = uintptr(src.r11)
+ for i := range dst.Floats {
+ dst.Floats[i] = uint64(src.fpstate._xmm[i].element[0]) << 0
+ dst.Floats[i] |= uint64(src.fpstate._xmm[i].element[1]) << 32
+ }
+}
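
element[0] and element[1] are just the low and high 32 bits of the float's IEEE-754 representation; the round trip in standalone form (names here are illustrative only):

	package main

	import (
		"fmt"
		"math"
	)

	func main() {
		bits := math.Float64bits(42.5)

		// Split into the halves stored in element[0] and element[1].
		lo := uint32(bits >> 0)
		hi := uint32(bits >> 32)

		// Reassemble, as loadRegArgs does.
		back := uint64(lo) | uint64(hi)<<32
		fmt.Println(math.Float64frombits(back)) // 42.5
	}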
diff --git a/src/runtime/export_debug_test.go b/src/runtime/export_debug_test.go
index ed4242ef24..fe4c9045c1 100644
--- a/src/runtime/export_debug_test.go
+++ b/src/runtime/export_debug_test.go
@@ -2,25 +2,28 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build amd64
-// +build linux
+//go:build amd64 && linux
+// +build amd64,linux
package runtime
import (
+ "internal/abi"
"runtime/internal/sys"
"unsafe"
)
-// InjectDebugCall injects a debugger call to fn into g. args must be
-// a pointer to a valid call frame (including arguments and return
-// space) for fn, or nil. tkill must be a function that will send
-// SIGTRAP to thread ID tid. gp must be locked to its OS thread and
+// InjectDebugCall injects a debugger call to fn into g. regArgs must
+// contain any arguments to fn that are passed in registers, according
+// to the internal Go ABI. It may be nil if no arguments are passed in
+// registers to fn. args must be a pointer to a valid call frame (including
+// arguments and return space) for fn, or nil. tkill must be a function that
+// will send SIGTRAP to thread ID tid. gp must be locked to its OS thread and
// running.
//
// On success, InjectDebugCall returns the panic value of fn or nil.
// If fn did not panic, its results will be available in args.
-func InjectDebugCall(gp *g, fn, args interface{}, tkill func(tid int) error, returnOnUnsafePoint bool) (interface{}, error) {
+func InjectDebugCall(gp *g, fn interface{}, regArgs *abi.RegArgs, stackArgs interface{}, tkill func(tid int) error, returnOnUnsafePoint bool) (interface{}, error) {
if gp.lockedm == 0 {
return nil, plainError("goroutine not locked to thread")
}
@@ -36,7 +39,7 @@ func InjectDebugCall(gp *g, fn, args interface{}, tkill func(tid int) error, ret
}
fv := (*funcval)(f.data)
- a := efaceOf(&args)
+ a := efaceOf(&stackArgs)
if a._type != nil && a._type.kind&kindMask != kindPtr {
return nil, plainError("args must be a pointer or nil")
}
@@ -51,7 +54,7 @@ func InjectDebugCall(gp *g, fn, args interface{}, tkill func(tid int) error, ret
// gp may not be running right now, but we can still get the M
// it will run on since it's locked.
h.mp = gp.lockedm.ptr()
- h.fv, h.argp, h.argSize = fv, argp, argSize
+ h.fv, h.regArgs, h.argp, h.argSize = fv, regArgs, argp, argSize
h.handleF = h.handle // Avoid allocating closure during signal
defer func() { testSigtrap = nil }()
@@ -91,6 +94,7 @@ type debugCallHandler struct {
gp *g
mp *m
fv *funcval
+ regArgs *abi.RegArgs
argp unsafe.Pointer
argSize uintptr
panic interface{}
@@ -120,8 +124,8 @@ func (h *debugCallHandler) inject(info *siginfo, ctxt *sigctxt, gp2 *g) bool {
h.savedRegs = *ctxt.regs()
h.savedFP = *h.savedRegs.fpstate
h.savedRegs.fpstate = nil
- // Set PC to debugCallV1.
- ctxt.set_rip(uint64(funcPC(debugCallV1)))
+ // Set PC to debugCallV2.
+ ctxt.set_rip(uint64(funcPC(debugCallV2)))
// Call injected. Switch to the debugCall protocol.
testSigtrap = h.handleF
case _Grunnable:
@@ -153,22 +157,28 @@ func (h *debugCallHandler) handle(info *siginfo, ctxt *sigctxt, gp2 *g) bool {
return false
}
- switch status := ctxt.rax(); status {
+ switch status := ctxt.r12(); status {
case 0:
- // Frame is ready. Copy the arguments to the frame.
+ // Frame is ready. Copy the arguments to the frame and to registers.
sp := ctxt.rsp()
memmove(unsafe.Pointer(uintptr(sp)), h.argp, h.argSize)
+ if h.regArgs != nil {
+ storeRegArgs(ctxt.regs(), h.regArgs)
+ }
// Push return PC.
sp -= sys.PtrSize
ctxt.set_rsp(sp)
*(*uint64)(unsafe.Pointer(uintptr(sp))) = ctxt.rip()
// Set PC to call and context register.
ctxt.set_rip(uint64(h.fv.fn))
- ctxt.regs().rcx = uint64(uintptr(unsafe.Pointer(h.fv)))
+ ctxt.regs().rdx = uint64(uintptr(unsafe.Pointer(h.fv)))
case 1:
- // Function returned. Copy frame back out.
+ // Function returned. Copy frame and result registers back out.
sp := ctxt.rsp()
memmove(h.argp, unsafe.Pointer(uintptr(sp)), h.argSize)
+ if h.regArgs != nil {
+ loadRegArgs(h.regArgs, ctxt.regs())
+ }
case 2:
// Function panicked. Copy panic out.
sp := ctxt.rsp()
@@ -191,7 +201,7 @@ func (h *debugCallHandler) handle(info *siginfo, ctxt *sigctxt, gp2 *g) bool {
// Done
notewakeup(&h.done)
default:
- h.err = plainError("unexpected debugCallV1 status")
+ h.err = plainError("unexpected debugCallV2 status")
notewakeup(&h.done)
}
// Resume execution.
diff --git a/src/runtime/export_futex_test.go b/src/runtime/export_futex_test.go
index a727a93114..34c970d6d2 100644
--- a/src/runtime/export_futex_test.go
+++ b/src/runtime/export_futex_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build dragonfly || freebsd || linux
// +build dragonfly freebsd linux
package runtime
diff --git a/src/runtime/export_mmap_test.go b/src/runtime/export_mmap_test.go
index aeaf37f64b..bf4a815899 100644
--- a/src/runtime/export_mmap_test.go
+++ b/src/runtime/export_mmap_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris
// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
// Export guts for testing.
diff --git a/src/runtime/export_pipe2_test.go b/src/runtime/export_pipe2_test.go
index 9d580d3313..26d8b7d185 100644
--- a/src/runtime/export_pipe2_test.go
+++ b/src/runtime/export_pipe2_test.go
@@ -2,7 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build freebsd linux netbsd openbsd solaris
+//go:build dragonfly || freebsd || linux || netbsd || openbsd || solaris
+// +build dragonfly freebsd linux netbsd openbsd solaris
package runtime
diff --git a/src/runtime/export_pipe_test.go b/src/runtime/export_pipe_test.go
index 8f66770fb9..a0c6c0440d 100644
--- a/src/runtime/export_pipe_test.go
+++ b/src/runtime/export_pipe_test.go
@@ -2,7 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build aix darwin dragonfly
+//go:build aix || darwin
+// +build aix darwin
package runtime
diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index a48bb2636f..a6fc1e4785 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -46,6 +46,14 @@ var NetpollGenericInit = netpollGenericInit
var Memmove = memmove
var MemclrNoHeapPointers = memclrNoHeapPointers
+var LockPartialOrder = lockPartialOrder
+
+type LockRank lockRank
+
+func (l LockRank) String() string {
+ return lockRank(l).String()
+}
+
const PreemptMSupported = preemptMSupported
type LFNode struct {
@@ -139,28 +147,40 @@ func RunSchedLocalQueueStealTest() {
}
}
+// Temporary to enable register ABI bringup.
+// TODO(register args): convert back to local variables in RunSchedLocalQueueEmptyTest that
+// get passed to the "go" stmts there.
+var RunSchedLocalQueueEmptyState struct {
+ done chan bool
+ ready *uint32
+ p *p
+}
+
func RunSchedLocalQueueEmptyTest(iters int) {
// Test that runq is not spuriously reported as empty.
// Runq emptiness affects scheduling decisions and spurious emptiness
// can lead to underutilization (both runnable Gs and idle Ps coexist
// for arbitrary long time).
done := make(chan bool, 1)
+ RunSchedLocalQueueEmptyState.done = done
p := new(p)
+ RunSchedLocalQueueEmptyState.p = p
gs := make([]g, 2)
ready := new(uint32)
+ RunSchedLocalQueueEmptyState.ready = ready
for i := 0; i < iters; i++ {
*ready = 0
next0 := (i & 1) == 0
next1 := (i & 2) == 0
runqput(p, &gs[0], next0)
go func() {
- for atomic.Xadd(ready, 1); atomic.Load(ready) != 2; {
+ for atomic.Xadd(RunSchedLocalQueueEmptyState.ready, 1); atomic.Load(RunSchedLocalQueueEmptyState.ready) != 2; {
}
- if runqempty(p) {
- println("next:", next0, next1)
+ if runqempty(RunSchedLocalQueueEmptyState.p) {
+ //println("next:", next0, next1)
throw("queue is empty")
}
- done <- true
+ RunSchedLocalQueueEmptyState.done <- true
}()
for atomic.Xadd(ready, 1); atomic.Load(ready) != 2; {
}
@@ -373,7 +393,7 @@ func ReadMemStatsSlow() (base, slow MemStats) {
bySize[i].Mallocs += uint64(m.smallFreeCount[i])
smallFree += uint64(m.smallFreeCount[i]) * uint64(class_to_size[i])
}
- slow.Frees += memstats.tinyallocs + uint64(m.largeFreeCount)
+ slow.Frees += uint64(m.tinyAllocCount) + uint64(m.largeFreeCount)
slow.Mallocs += slow.Frees
slow.TotalAlloc = slow.Alloc + uint64(m.largeFree) + smallFree
@@ -396,9 +416,6 @@ func ReadMemStatsSlow() (base, slow MemStats) {
slow.HeapReleased += uint64(pg) * pageSize
}
- // Unused space in the current arena also counts as released space.
- slow.HeapReleased += uint64(mheap_.curArena.end - mheap_.curArena.base)
-
getg().m.mallocing--
})
@@ -1212,3 +1229,43 @@ func (th *TimeHistogram) Count(bucket, subBucket uint) (uint64, bool) {
func (th *TimeHistogram) Record(duration int64) {
(*timeHistogram)(th).record(duration)
}
+
+func SetIntArgRegs(a int) int {
+ lock(&finlock)
+ old := intArgRegs
+ if a >= 0 {
+ intArgRegs = a
+ }
+ unlock(&finlock)
+ return old
+}
+
+func FinalizerGAsleep() bool {
+ lock(&finlock)
+ result := fingwait
+ unlock(&finlock)
+ return result
+}
+
+// For GCTestMoveStackOnNextCall, it's important not to introduce an
+// extra layer of call, since then there's a return before the "real"
+// next call.
+var GCTestMoveStackOnNextCall = gcTestMoveStackOnNextCall
+
+// For GCTestIsReachable, it's important that we do this as a call so
+// escape analysis can see through it.
+func GCTestIsReachable(ptrs ...unsafe.Pointer) (mask uint64) {
+ return gcTestIsReachable(ptrs...)
+}
+
+// For GCTestPointerClass, it's important that we do this as a call so
+// escape analysis can see through it.
+//
+// This is nosplit because gcTestPointerClass is.
+//
+//go:nosplit
+func GCTestPointerClass(p unsafe.Pointer) string {
+ return gcTestPointerClass(p)
+}
+
+const Raceenabled = raceenabled
diff --git a/src/runtime/export_unix_test.go b/src/runtime/export_unix_test.go
index 307c63fd68..215e234286 100644
--- a/src/runtime/export_unix_test.go
+++ b/src/runtime/export_unix_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris
// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
package runtime
diff --git a/src/runtime/extern.go b/src/runtime/extern.go
index bbe41dd0d4..48e1e6603b 100644
--- a/src/runtime/extern.go
+++ b/src/runtime/extern.go
@@ -110,8 +110,6 @@ It is a comma-separated list of name=val pairs setting these named variables:
with a trivial allocator that obtains memory from the operating system and
never reclaims any memory.
- scavenge: scavenge=1 enables debugging mode of heap scavenger.
-
scavtrace: setting scavtrace=1 causes the runtime to emit a single line to standard
error, roughly once per GC cycle, summarizing the amount of work done by the
scavenger as well as the total amount of memory returned to the operating system
@@ -242,11 +240,21 @@ func GOROOT() string {
return defaultGOROOT
}
+// buildVersion is the Go tree's version string at build time.
+//
+// If any GOEXPERIMENTs are set to non-default values, it will include
+// "X:<GOEXPERIMENT>".
+//
+// This is set by the linker.
+//
+// This is accessed by "go version <binary>".
+var buildVersion string
+
// Version returns the Go tree's version string.
// It is either the commit hash and date at the time of the build or,
// when possible, a release tag like "go1.3".
func Version() string {
- return sys.TheVersion
+ return buildVersion
}
// GOOS is the running program's operating system target:
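
With this change "go version <binary>" and runtime.Version surface GOEXPERIMENT state; a trivial sketch (the exact string is build-dependent):

	package main

	import (
		"fmt"
		"runtime"
	)

	func main() {
		// Prints e.g. "go1.17", or with GOEXPERIMENT=regabiargs set
		// at build time, something like "go1.17 X:regabiargs".
		fmt.Println(runtime.Version())
	}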
diff --git a/src/runtime/funcdata.h b/src/runtime/funcdata.h
index 798dbaceab..1002b181e4 100644
--- a/src/runtime/funcdata.h
+++ b/src/runtime/funcdata.h
@@ -17,6 +17,7 @@
#define FUNCDATA_StackObjects 2
#define FUNCDATA_InlTree 3
#define FUNCDATA_OpenCodedDeferInfo 4 /* info for func with open-coded defers */
+#define FUNCDATA_ArgInfo 5
// Pseudo-assembly statements.
diff --git a/src/runtime/futex_test.go b/src/runtime/futex_test.go
index 3051bd5880..10a735327a 100644
--- a/src/runtime/futex_test.go
+++ b/src/runtime/futex_test.go
@@ -6,6 +6,7 @@
// The race detector emits calls to split stack functions so it breaks
// the test.
+//go:build (dragonfly || freebsd || linux) && !race
// +build dragonfly freebsd linux
// +build !race
diff --git a/src/runtime/gc_test.go b/src/runtime/gc_test.go
index 7870f31ae9..5e7c6c574f 100644
--- a/src/runtime/gc_test.go
+++ b/src/runtime/gc_test.go
@@ -202,6 +202,108 @@ func TestGcZombieReporting(t *testing.T) {
}
}
+func TestGCTestMoveStackOnNextCall(t *testing.T) {
+ t.Parallel()
+ var onStack int
+ // GCTestMoveStackOnNextCall can fail in rare cases if there's
+ // a preemption. This won't happen many times in quick
+ // succession, so just retry a few times.
+ for retry := 0; retry < 5; retry++ {
+ runtime.GCTestMoveStackOnNextCall()
+ if moveStackCheck(t, &onStack, uintptr(unsafe.Pointer(&onStack))) {
+ // Passed.
+ return
+ }
+ }
+ t.Fatal("stack did not move")
+}
+
+// This must not be inlined because the point is to force a stack
+// growth check and move the stack.
+//
+//go:noinline
+func moveStackCheck(t *testing.T, new *int, old uintptr) bool {
+ // new should have been updated by the stack move;
+ // old should not have.
+
+ // Capture new's value before doing anything that could
+ // further move the stack.
+ new2 := uintptr(unsafe.Pointer(new))
+
+ t.Logf("old stack pointer %x, new stack pointer %x", old, new2)
+ if new2 == old {
+ // Check that we didn't screw up the test's escape analysis.
+ if cls := runtime.GCTestPointerClass(unsafe.Pointer(new)); cls != "stack" {
+ t.Fatalf("test bug: new (%#x) should be a stack pointer, not %s", new2, cls)
+ }
+ // This was a real failure.
+ return false
+ }
+ return true
+}
+
+func TestGCTestMoveStackRepeatedly(t *testing.T) {
+ // Move the stack repeatedly to make sure we're not doubling
+ // it each time.
+ for i := 0; i < 100; i++ {
+ runtime.GCTestMoveStackOnNextCall()
+ moveStack1(false)
+ }
+}
+
+//go:noinline
+func moveStack1(x bool) {
+ // Make sure this function doesn't get auto-nosplit.
+ if x {
+ println("x")
+ }
+}
+
+func TestGCTestIsReachable(t *testing.T) {
+ var all, half []unsafe.Pointer
+ var want uint64
+ for i := 0; i < 16; i++ {
+ // The tiny allocator muddies things, so we use a
+ // scannable type.
+ p := unsafe.Pointer(new(*int))
+ all = append(all, p)
+ if i%2 == 0 {
+ half = append(half, p)
+ want |= 1 << i
+ }
+ }
+
+ got := runtime.GCTestIsReachable(all...)
+ if want != got {
+ t.Fatalf("did not get expected reachable set; want %b, got %b", want, got)
+ }
+ runtime.KeepAlive(half)
+}
+
+var pointerClassSink *int
+var pointerClassData = 42
+
+func TestGCTestPointerClass(t *testing.T) {
+ t.Parallel()
+ check := func(p unsafe.Pointer, want string) {
+ t.Helper()
+ got := runtime.GCTestPointerClass(p)
+ if got != want {
+ // Convert the pointer to a uintptr to avoid
+ // escaping it.
+ t.Errorf("for %#x, want class %s, got %s", uintptr(p), want, got)
+ }
+ }
+ var onStack int
+ var notOnStack int
+ pointerClassSink = &notOnStack
+ check(unsafe.Pointer(&onStack), "stack")
+ check(unsafe.Pointer(&notOnStack), "heap")
+ check(unsafe.Pointer(&pointerClassSink), "bss")
+ check(unsafe.Pointer(&pointerClassData), "data")
+ check(nil, "other")
+}
+
func BenchmarkSetTypePtr(b *testing.B) {
benchSetType(b, new(*byte))
}
diff --git a/src/runtime/hash32.go b/src/runtime/hash32.go
index 966f70e1aa..7c22c76b87 100644
--- a/src/runtime/hash32.go
+++ b/src/runtime/hash32.go
@@ -3,110 +3,61 @@
// license that can be found in the LICENSE file.
// Hashing algorithm inspired by
-// xxhash: https://code.google.com/p/xxhash/
-// cityhash: https://code.google.com/p/cityhash/
+// wyhash: https://github.com/wangyi-fudan/wyhash/blob/ceb019b530e2c1c14d70b79bfa2bc49de7d95bc1/Modern%20Non-Cryptographic%20Hash%20Function%20and%20Pseudorandom%20Number%20Generator.pdf
+//go:build 386 || arm || mips || mipsle
// +build 386 arm mips mipsle
package runtime
import "unsafe"
-const (
- // Constants for multiplication: four random odd 32-bit numbers.
- m1 = 3168982561
- m2 = 3339683297
- m3 = 832293441
- m4 = 2336365089
-)
-
-func memhashFallback(p unsafe.Pointer, seed, s uintptr) uintptr {
- h := uint32(seed + s*hashkey[0])
-tail:
- switch {
- case s == 0:
- case s < 4:
- h ^= uint32(*(*byte)(p))
- h ^= uint32(*(*byte)(add(p, s>>1))) << 8
- h ^= uint32(*(*byte)(add(p, s-1))) << 16
- h = rotl_15(h*m1) * m2
- case s == 4:
- h ^= readUnaligned32(p)
- h = rotl_15(h*m1) * m2
- case s <= 8:
- h ^= readUnaligned32(p)
- h = rotl_15(h*m1) * m2
- h ^= readUnaligned32(add(p, s-4))
- h = rotl_15(h*m1) * m2
- case s <= 16:
- h ^= readUnaligned32(p)
- h = rotl_15(h*m1) * m2
- h ^= readUnaligned32(add(p, 4))
- h = rotl_15(h*m1) * m2
- h ^= readUnaligned32(add(p, s-8))
- h = rotl_15(h*m1) * m2
- h ^= readUnaligned32(add(p, s-4))
- h = rotl_15(h*m1) * m2
- default:
- v1 := h
- v2 := uint32(seed * hashkey[1])
- v3 := uint32(seed * hashkey[2])
- v4 := uint32(seed * hashkey[3])
- for s >= 16 {
- v1 ^= readUnaligned32(p)
- v1 = rotl_15(v1*m1) * m2
- p = add(p, 4)
- v2 ^= readUnaligned32(p)
- v2 = rotl_15(v2*m2) * m3
- p = add(p, 4)
- v3 ^= readUnaligned32(p)
- v3 = rotl_15(v3*m3) * m4
- p = add(p, 4)
- v4 ^= readUnaligned32(p)
- v4 = rotl_15(v4*m4) * m1
- p = add(p, 4)
- s -= 16
- }
- h = v1 ^ v2 ^ v3 ^ v4
- goto tail
- }
- h ^= h >> 17
- h *= m3
- h ^= h >> 13
- h *= m4
- h ^= h >> 16
- return uintptr(h)
-}
-
func memhash32Fallback(p unsafe.Pointer, seed uintptr) uintptr {
- h := uint32(seed + 4*hashkey[0])
- h ^= readUnaligned32(p)
- h = rotl_15(h*m1) * m2
- h ^= h >> 17
- h *= m3
- h ^= h >> 13
- h *= m4
- h ^= h >> 16
- return uintptr(h)
+ a, b := mix32(uint32(seed), uint32(4^hashkey[0]))
+ t := readUnaligned32(p)
+ a ^= t
+ b ^= t
+ a, b = mix32(a, b)
+ a, b = mix32(a, b)
+ return uintptr(a ^ b)
}
func memhash64Fallback(p unsafe.Pointer, seed uintptr) uintptr {
- h := uint32(seed + 8*hashkey[0])
- h ^= readUnaligned32(p)
- h = rotl_15(h*m1) * m2
- h ^= readUnaligned32(add(p, 4))
- h = rotl_15(h*m1) * m2
- h ^= h >> 17
- h *= m3
- h ^= h >> 13
- h *= m4
- h ^= h >> 16
- return uintptr(h)
+ a, b := mix32(uint32(seed), uint32(8^hashkey[0]))
+ a ^= readUnaligned32(p)
+ b ^= readUnaligned32(add(p, 4))
+ a, b = mix32(a, b)
+ a, b = mix32(a, b)
+ return uintptr(a ^ b)
+}
+
+func memhashFallback(p unsafe.Pointer, seed, s uintptr) uintptr {
+
+ a, b := mix32(uint32(seed), uint32(s^hashkey[0]))
+ if s == 0 {
+ return uintptr(a ^ b)
+ }
+ for ; s > 8; s -= 8 {
+ a ^= readUnaligned32(p)
+ b ^= readUnaligned32(add(p, 4))
+ a, b = mix32(a, b)
+ p = add(p, 8)
+ }
+ if s >= 4 {
+ a ^= readUnaligned32(p)
+ b ^= readUnaligned32(add(p, s-4))
+ } else {
+ t := uint32(*(*byte)(p))
+ t |= uint32(*(*byte)(add(p, s>>1))) << 8
+ t |= uint32(*(*byte)(add(p, s-1))) << 16
+ b ^= t
+ }
+ a, b = mix32(a, b)
+ a, b = mix32(a, b)
+ return uintptr(a ^ b)
}
-// Note: in order to get the compiler to issue rotl instructions, we
-// need to constant fold the shift amount by hand.
-// TODO: convince the compiler to issue rotl instructions after inlining.
-func rotl_15(x uint32) uint32 {
- return (x << 15) | (x >> (32 - 15))
+func mix32(a, b uint32) (uint32, uint32) {
+ c := uint64(a^uint32(hashkey[1])) * uint64(b^uint32(hashkey[2]))
+ return uint32(c), uint32(c >> 32)
}
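
For orientation on the new 32-bit scheme: all diffusion now comes from mix32, which XORs two secret keys into its inputs, takes a single 32x32->64 multiply, and returns the two halves of the product, so after a couple of rounds every input bit can reach both output words. A stand-alone sketch with fixed demo keys standing in for the runtime's random hashkey values (names and keys hypothetical):

    package main

    import (
    	"encoding/binary"
    	"fmt"
    )

    // Fixed demo keys; the runtime uses random hashkey values instead.
    const (
    	key1 = 0x53c5ca59
    	key2 = 0x74743c1b
    )

    // mix32 is the whole mixing primitive: two keyed inputs, one wide
    // multiply, and the product split back into two words.
    func mix32(a, b uint32) (uint32, uint32) {
    	c := uint64(a^key1) * uint64(b^key2)
    	return uint32(c), uint32(c >> 32)
    }

    // hash4 mirrors the shape of memhash32Fallback above (reads are
    // little-endian here; the runtime reads native-endian, unaligned).
    func hash4(p []byte, seed uint32) uint32 {
    	a, b := mix32(seed, 4)
    	t := binary.LittleEndian.Uint32(p)
    	a ^= t
    	b ^= t
    	a, b = mix32(a, b)
    	a, b = mix32(a, b)
    	return a ^ b
    }

    func main() {
    	fmt.Printf("%#x\n", hash4([]byte("abcd"), 1))
    }
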
diff --git a/src/runtime/hash64.go b/src/runtime/hash64.go
index d1283824ad..5f7d00bf7f 100644
--- a/src/runtime/hash64.go
+++ b/src/runtime/hash64.go
@@ -3,106 +3,91 @@
// license that can be found in the LICENSE file.
// Hashing algorithm inspired by
-// xxhash: https://code.google.com/p/xxhash/
-// cityhash: https://code.google.com/p/cityhash/
+// wyhash: https://github.com/wangyi-fudan/wyhash
+//go:build amd64 || arm64 || mips64 || mips64le || ppc64 || ppc64le || riscv64 || s390x || wasm
// +build amd64 arm64 mips64 mips64le ppc64 ppc64le riscv64 s390x wasm
package runtime
-import "unsafe"
+import (
+ "runtime/internal/math"
+ "unsafe"
+)
const (
- // Constants for multiplication: four random odd 64-bit numbers.
- m1 = 16877499708836156737
- m2 = 2820277070424839065
- m3 = 9497967016996688599
- m4 = 15839092249703872147
+ m1 = 0xa0761d6478bd642f
+ m2 = 0xe7037ed1a0b428db
+ m3 = 0x8ebc6af09c88c6e3
+ m4 = 0x589965cc75374cc3
+ m5 = 0x1d8e4e27c47d124f
)
func memhashFallback(p unsafe.Pointer, seed, s uintptr) uintptr {
- h := uint64(seed + s*hashkey[0])
-tail:
+ var a, b uintptr
+ seed ^= hashkey[0] ^ m1
switch {
case s == 0:
+ return seed
case s < 4:
- h ^= uint64(*(*byte)(p))
- h ^= uint64(*(*byte)(add(p, s>>1))) << 8
- h ^= uint64(*(*byte)(add(p, s-1))) << 16
- h = rotl_31(h*m1) * m2
- case s <= 8:
- h ^= uint64(readUnaligned32(p))
- h ^= uint64(readUnaligned32(add(p, s-4))) << 32
- h = rotl_31(h*m1) * m2
+ a = uintptr(*(*byte)(p))
+ a |= uintptr(*(*byte)(add(p, s>>1))) << 8
+ a |= uintptr(*(*byte)(add(p, s-1))) << 16
+ case s == 4:
+ a = r4(p)
+ b = a
+ case s < 8:
+ a = r4(p)
+ b = r4(add(p, s-4))
+ case s == 8:
+ a = r8(p)
+ b = a
case s <= 16:
- h ^= readUnaligned64(p)
- h = rotl_31(h*m1) * m2
- h ^= readUnaligned64(add(p, s-8))
- h = rotl_31(h*m1) * m2
- case s <= 32:
- h ^= readUnaligned64(p)
- h = rotl_31(h*m1) * m2
- h ^= readUnaligned64(add(p, 8))
- h = rotl_31(h*m1) * m2
- h ^= readUnaligned64(add(p, s-16))
- h = rotl_31(h*m1) * m2
- h ^= readUnaligned64(add(p, s-8))
- h = rotl_31(h*m1) * m2
+ a = r8(p)
+ b = r8(add(p, s-8))
default:
- v1 := h
- v2 := uint64(seed * hashkey[1])
- v3 := uint64(seed * hashkey[2])
- v4 := uint64(seed * hashkey[3])
- for s >= 32 {
- v1 ^= readUnaligned64(p)
- v1 = rotl_31(v1*m1) * m2
- p = add(p, 8)
- v2 ^= readUnaligned64(p)
- v2 = rotl_31(v2*m2) * m3
- p = add(p, 8)
- v3 ^= readUnaligned64(p)
- v3 = rotl_31(v3*m3) * m4
- p = add(p, 8)
- v4 ^= readUnaligned64(p)
- v4 = rotl_31(v4*m4) * m1
- p = add(p, 8)
- s -= 32
+ l := s
+ if l > 48 {
+ seed1 := seed
+ seed2 := seed
+ for ; l > 48; l -= 48 {
+ seed = mix(r8(p)^m2, r8(add(p, 8))^seed)
+ seed1 = mix(r8(add(p, 16))^m3, r8(add(p, 24))^seed1)
+ seed2 = mix(r8(add(p, 32))^m4, r8(add(p, 40))^seed2)
+ p = add(p, 48)
+ }
+ seed ^= seed1 ^ seed2
+ }
+ for ; l > 16; l -= 16 {
+ seed = mix(r8(p)^m2, r8(add(p, 8))^seed)
+ p = add(p, 16)
}
- h = v1 ^ v2 ^ v3 ^ v4
- goto tail
+ a = r8(add(p, l-16))
+ b = r8(add(p, l-8))
}
- h ^= h >> 29
- h *= m3
- h ^= h >> 32
- return uintptr(h)
+ return mix(m5^s, mix(a^m2, b^seed))
}
func memhash32Fallback(p unsafe.Pointer, seed uintptr) uintptr {
- h := uint64(seed + 4*hashkey[0])
- v := uint64(readUnaligned32(p))
- h ^= v
- h ^= v << 32
- h = rotl_31(h*m1) * m2
- h ^= h >> 29
- h *= m3
- h ^= h >> 32
- return uintptr(h)
+ a := r4(p)
+ return mix(m5^4, mix(a^m2, a^seed^hashkey[0]^m1))
}
func memhash64Fallback(p unsafe.Pointer, seed uintptr) uintptr {
- h := uint64(seed + 8*hashkey[0])
- h ^= uint64(readUnaligned32(p)) | uint64(readUnaligned32(add(p, 4)))<<32
- h = rotl_31(h*m1) * m2
- h ^= h >> 29
- h *= m3
- h ^= h >> 32
- return uintptr(h)
+ a := r8(p)
+ return mix(m5^8, mix(a^m2, a^seed^hashkey[0]^m1))
+}
+
+func mix(a, b uintptr) uintptr {
+ hi, lo := math.Mul64(uint64(a), uint64(b))
+ return uintptr(hi ^ lo)
+}
+
+func r4(p unsafe.Pointer) uintptr {
+ return uintptr(readUnaligned32(p))
}
-// Note: in order to get the compiler to issue rotl instructions, we
-// need to constant fold the shift amount by hand.
-// TODO: convince the compiler to issue rotl instructions after inlining.
-func rotl_31(x uint64) uint64 {
- return (x << 31) | (x >> (64 - 31))
+func r8(p unsafe.Pointer) uintptr {
+ return uintptr(readUnaligned64(p))
}
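
The 64-bit replacement is built on the same idea one size up: mix compresses two words through their full 128-bit product. runtime/internal/math.Mul64 computes the same hi/lo pair as the standard library's math/bits.Mul64, so the primitive can be sketched outside the runtime as:

    package main

    import (
    	"fmt"
    	"math/bits"
    )

    // mix folds two words through their full 128-bit product; XORing
    // the high and low halves keeps entropy from both.
    func mix(a, b uint64) uint64 {
    	hi, lo := bits.Mul64(a, b)
    	return hi ^ lo
    }

    func main() {
    	fmt.Printf("%#016x\n", mix(0xa0761d6478bd642f, 0x123456789abcdef0))
    }
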
diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go
index 2d531571aa..934e55f495 100644
--- a/src/runtime/heapdump.go
+++ b/src/runtime/heapdump.go
@@ -403,9 +403,10 @@ func dumpgoroutine(gp *g) {
}
func dumpgs() {
+ assertWorldStopped()
+
// goroutines & stacks
- for i := 0; uintptr(i) < allglen; i++ {
- gp := allgs[i]
+ forEachG(func(gp *g) {
status := readgstatus(gp) // The world is stopped so gp will not be in a scan state.
switch status {
default:
@@ -418,7 +419,7 @@ func dumpgs() {
_Gwaiting:
dumpgoroutine(gp)
}
- }
+ })
}
func finq_callback(fn *funcval, obj unsafe.Pointer, nret uintptr, fint *_type, ot *ptrtype) {
@@ -531,7 +532,7 @@ func dumpparams() {
dumpint(uint64(arenaStart))
dumpint(uint64(arenaEnd))
dumpstr(sys.GOARCH)
- dumpstr(sys.Goexperiment)
+ dumpstr(buildVersion)
dumpint(uint64(ncpu))
}
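
dumpgs now asserts the world is stopped and walks goroutines through forEachG instead of indexing allgs up to a separately read allglen, so the length and the slice can no longer be observed at different moments. The callback shape is roughly the following (illustrative stand-in types, not the runtime's declarations):

    package main

    import "fmt"

    type g struct{ goid int64 } // stand-in for the runtime's g

    var allgs []*g

    // forEachG hands each g to fn; the runtime version additionally
    // synchronizes with growth of allgs, omitted here.
    func forEachG(fn func(gp *g)) {
    	for _, gp := range allgs {
    		fn(gp)
    	}
    }

    func main() {
    	allgs = []*g{{goid: 1}, {goid: 2}}
    	forEachG(func(gp *g) { fmt.Println(gp.goid) })
    }
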
diff --git a/src/runtime/histogram.go b/src/runtime/histogram.go
index da4910d341..0cccbcca16 100644
--- a/src/runtime/histogram.go
+++ b/src/runtime/histogram.go
@@ -81,6 +81,10 @@ type timeHistogram struct {
}
// record adds the given duration to the distribution.
+//
+// Disallow preemptions and stack growths because this function
+// may run in sensitive locations.
+//go:nosplit
func (h *timeHistogram) record(duration int64) {
if duration < 0 {
atomic.Xadd64(&h.underflow, 1)
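
The new //go:nosplit pragma suppresses the stack-growth check normally emitted in a function's prologue; that is what lets record run at points where growing the stack is unsafe, at the price that its body must keep a small frame and call only nosplit-safe code. A sketch of the shape (not the runtime's code):

    package histo

    import "sync/atomic"

    // record-like sketch: //go:nosplit means no prologue stack check is
    // emitted, so the body must stay small and call only nosplit-safe
    // code (sync/atomic's primitives are assembly and qualify).
    //
    //go:nosplit
    func record(underflow *uint64, duration int64) {
    	if duration < 0 {
    		atomic.AddUint64(underflow, 1)
    	}
    }
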
diff --git a/src/runtime/iface.go b/src/runtime/iface.go
index 0504b89363..cd5fead999 100644
--- a/src/runtime/iface.go
+++ b/src/runtime/iface.go
@@ -447,23 +447,18 @@ func convI2I(inter *interfacetype, i iface) (r iface) {
return
}
-func assertI2I(inter *interfacetype, i iface) (r iface) {
- tab := i.tab
+func assertI2I(inter *interfacetype, tab *itab) *itab {
if tab == nil {
// explicit conversions require non-nil interface value.
panic(&TypeAssertionError{nil, nil, &inter.typ, ""})
}
if tab.inter == inter {
- r.tab = tab
- r.data = i.data
- return
+ return tab
}
- r.tab = getitab(inter, tab._type, false)
- r.data = i.data
- return
+ return getitab(inter, tab._type, false)
}
-func assertI2I2(inter *interfacetype, i iface) (r iface, b bool) {
+func assertI2I2(inter *interfacetype, i iface) (r iface) {
tab := i.tab
if tab == nil {
return
@@ -476,22 +471,18 @@ func assertI2I2(inter *interfacetype, i iface) (r iface, b bool) {
}
r.tab = tab
r.data = i.data
- b = true
return
}
-func assertE2I(inter *interfacetype, e eface) (r iface) {
- t := e._type
+func assertE2I(inter *interfacetype, t *_type) *itab {
if t == nil {
// explicit conversions require non-nil interface value.
panic(&TypeAssertionError{nil, nil, &inter.typ, ""})
}
- r.tab = getitab(inter, t, false)
- r.data = e.data
- return
+ return getitab(inter, t, false)
}
-func assertE2I2(inter *interfacetype, e eface) (r iface, b bool) {
+func assertE2I2(inter *interfacetype, e eface) (r iface) {
t := e._type
if t == nil {
return
@@ -502,18 +493,17 @@ func assertE2I2(inter *interfacetype, e eface) (r iface, b bool) {
}
r.tab = tab
r.data = e.data
- b = true
return
}
//go:linkname reflect_ifaceE2I reflect.ifaceE2I
func reflect_ifaceE2I(inter *interfacetype, e eface, dst *iface) {
- *dst = assertE2I(inter, e)
+ *dst = iface{assertE2I(inter, e._type), e.data}
}
//go:linkname reflectlite_ifaceE2I internal/reflectlite.ifaceE2I
func reflectlite_ifaceE2I(inter *interfacetype, e eface, dst *iface) {
- *dst = assertE2I(inter, e)
+ *dst = iface{assertE2I(inter, e._type), e.data}
}
func iterate_itabs(fn func(*itab)) {
@@ -563,3 +553,10 @@ var staticuint64s = [...]uint64{
0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
}
+
+// The linker redirects a reference to a method that it determined
+// unreachable to a reference to this function, so it will throw if
+// ever called.
+func unreachableMethod() {
+ throw("unreachable method called. linker bug?")
+}
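
The assert helpers now traffic only in the type word: assertI2I maps *itab to *itab and assertE2I maps *_type to *itab, instead of passing whole two-word interface headers in and out. Callers re-pair the result with the untouched data word, as the reflect_ifaceE2I shims above do. A stand-alone sketch of that caller-side shape (stand-in types, hypothetical names):

    package main

    import (
    	"fmt"
    	"unsafe"
    )

    type rtype struct{ name string } // stand-in for the runtime's _type
    type itab struct{ typ *rtype }   // stand-in for the runtime's itab
    type iface struct {
    	tab  *itab
    	data unsafe.Pointer
    }
    type eface struct {
    	typ  *rtype
    	data unsafe.Pointer
    }

    // Only the type word crosses the call; the real runtime consults
    // its itab cache (getitab) here instead of allocating.
    func assertE2I(t *rtype) *itab {
    	if t == nil {
    		panic("interface conversion on nil value")
    	}
    	return &itab{typ: t}
    }

    func main() {
    	x := 42
    	e := eface{&rtype{"int"}, unsafe.Pointer(&x)}
    	r := iface{assertE2I(e.typ), e.data} // data word carried through unchanged
    	fmt.Println(r.tab.typ.name, *(*int)(r.data))
    }
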
diff --git a/src/runtime/internal/atomic/asm_amd64.s b/src/runtime/internal/atomic/asm_amd64.s
deleted file mode 100644
index 2cf7c55870..0000000000
--- a/src/runtime/internal/atomic/asm_amd64.s
+++ /dev/null
@@ -1,187 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Note: some of these functions are semantically inlined
-// by the compiler (in src/cmd/compile/internal/gc/ssa.go).
-
-#include "textflag.h"
-
-// bool Cas(int32 *val, int32 old, int32 new)
-// Atomically:
-// if(*val == old){
-// *val = new;
-// return 1;
-// } else
-// return 0;
-TEXT runtime∕internal∕atomic·Cas(SB),NOSPLIT,$0-17
- MOVQ ptr+0(FP), BX
- MOVL old+8(FP), AX
- MOVL new+12(FP), CX
- LOCK
- CMPXCHGL CX, 0(BX)
- SETEQ ret+16(FP)
- RET
-
-// bool runtime∕internal∕atomic·Cas64(uint64 *val, uint64 old, uint64 new)
-// Atomically:
-// if(*val == *old){
-// *val = new;
-// return 1;
-// } else {
-// return 0;
-// }
-TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-25
- MOVQ ptr+0(FP), BX
- MOVQ old+8(FP), AX
- MOVQ new+16(FP), CX
- LOCK
- CMPXCHGQ CX, 0(BX)
- SETEQ ret+24(FP)
- RET
-
-TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-25
- JMP runtime∕internal∕atomic·Cas64(SB)
-
-TEXT runtime∕internal∕atomic·CasRel(SB), NOSPLIT, $0-17
- JMP runtime∕internal∕atomic·Cas(SB)
-
-TEXT runtime∕internal∕atomic·Loaduintptr(SB), NOSPLIT, $0-16
- JMP runtime∕internal∕atomic·Load64(SB)
-
-TEXT runtime∕internal∕atomic·Loaduint(SB), NOSPLIT, $0-16
- JMP runtime∕internal∕atomic·Load64(SB)
-
-TEXT runtime∕internal∕atomic·Storeuintptr(SB), NOSPLIT, $0-16
- JMP runtime∕internal∕atomic·Store64(SB)
-
-TEXT runtime∕internal∕atomic·Loadint64(SB), NOSPLIT, $0-16
- JMP runtime∕internal∕atomic·Load64(SB)
-
-TEXT runtime∕internal∕atomic·Xaddint64(SB), NOSPLIT, $0-24
- JMP runtime∕internal∕atomic·Xadd64(SB)
-
-// bool Casp1(void **val, void *old, void *new)
-// Atomically:
-// if(*val == old){
-// *val = new;
-// return 1;
-// } else
-// return 0;
-TEXT runtime∕internal∕atomic·Casp1(SB), NOSPLIT, $0-25
- MOVQ ptr+0(FP), BX
- MOVQ old+8(FP), AX
- MOVQ new+16(FP), CX
- LOCK
- CMPXCHGQ CX, 0(BX)
- SETEQ ret+24(FP)
- RET
-
-// uint32 Xadd(uint32 volatile *val, int32 delta)
-// Atomically:
-// *val += delta;
-// return *val;
-TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-20
- MOVQ ptr+0(FP), BX
- MOVL delta+8(FP), AX
- MOVL AX, CX
- LOCK
- XADDL AX, 0(BX)
- ADDL CX, AX
- MOVL AX, ret+16(FP)
- RET
-
-TEXT runtime∕internal∕atomic·Xadd64(SB), NOSPLIT, $0-24
- MOVQ ptr+0(FP), BX
- MOVQ delta+8(FP), AX
- MOVQ AX, CX
- LOCK
- XADDQ AX, 0(BX)
- ADDQ CX, AX
- MOVQ AX, ret+16(FP)
- RET
-
-TEXT runtime∕internal∕atomic·Xadduintptr(SB), NOSPLIT, $0-24
- JMP runtime∕internal∕atomic·Xadd64(SB)
-
-TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-20
- MOVQ ptr+0(FP), BX
- MOVL new+8(FP), AX
- XCHGL AX, 0(BX)
- MOVL AX, ret+16(FP)
- RET
-
-TEXT runtime∕internal∕atomic·Xchg64(SB), NOSPLIT, $0-24
- MOVQ ptr+0(FP), BX
- MOVQ new+8(FP), AX
- XCHGQ AX, 0(BX)
- MOVQ AX, ret+16(FP)
- RET
-
-TEXT runtime∕internal∕atomic·Xchguintptr(SB), NOSPLIT, $0-24
- JMP runtime∕internal∕atomic·Xchg64(SB)
-
-TEXT runtime∕internal∕atomic·StorepNoWB(SB), NOSPLIT, $0-16
- MOVQ ptr+0(FP), BX
- MOVQ val+8(FP), AX
- XCHGQ AX, 0(BX)
- RET
-
-TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-12
- MOVQ ptr+0(FP), BX
- MOVL val+8(FP), AX
- XCHGL AX, 0(BX)
- RET
-
-TEXT runtime∕internal∕atomic·StoreRel(SB), NOSPLIT, $0-12
- JMP runtime∕internal∕atomic·Store(SB)
-
-TEXT runtime∕internal∕atomic·StoreRel64(SB), NOSPLIT, $0-16
- JMP runtime∕internal∕atomic·Store64(SB)
-
-TEXT runtime∕internal∕atomic·StoreReluintptr(SB), NOSPLIT, $0-16
- JMP runtime∕internal∕atomic·Store64(SB)
-
-TEXT runtime∕internal∕atomic·Store8(SB), NOSPLIT, $0-9
- MOVQ ptr+0(FP), BX
- MOVB val+8(FP), AX
- XCHGB AX, 0(BX)
- RET
-
-TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-16
- MOVQ ptr+0(FP), BX
- MOVQ val+8(FP), AX
- XCHGQ AX, 0(BX)
- RET
-
-// void runtime∕internal∕atomic·Or8(byte volatile*, byte);
-TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-9
- MOVQ ptr+0(FP), AX
- MOVB val+8(FP), BX
- LOCK
- ORB BX, (AX)
- RET
-
-// void runtime∕internal∕atomic·And8(byte volatile*, byte);
-TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-9
- MOVQ ptr+0(FP), AX
- MOVB val+8(FP), BX
- LOCK
- ANDB BX, (AX)
- RET
-
-// func Or(addr *uint32, v uint32)
-TEXT runtime∕internal∕atomic·Or(SB), NOSPLIT, $0-12
- MOVQ ptr+0(FP), AX
- MOVL val+8(FP), BX
- LOCK
- ORL BX, (AX)
- RET
-
-// func And(addr *uint32, v uint32)
-TEXT runtime∕internal∕atomic·And(SB), NOSPLIT, $0-12
- MOVQ ptr+0(FP), AX
- MOVL val+8(FP), BX
- LOCK
- ANDL BX, (AX)
- RET
diff --git a/src/runtime/internal/atomic/asm_arm64.s b/src/runtime/internal/atomic/asm_arm64.s
deleted file mode 100644
index 8336a859ad..0000000000
--- a/src/runtime/internal/atomic/asm_arm64.s
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "textflag.h"
-
-// bool Cas(uint32 *ptr, uint32 old, uint32 new)
-// Atomically:
-// if(*val == old){
-// *val = new;
-// return 1;
-// } else
-// return 0;
-TEXT runtime∕internal∕atomic·Cas(SB), NOSPLIT, $0-17
- MOVD ptr+0(FP), R0
- MOVW old+8(FP), R1
- MOVW new+12(FP), R2
-again:
- LDAXRW (R0), R3
- CMPW R1, R3
- BNE ok
- STLXRW R2, (R0), R3
- CBNZ R3, again
-ok:
- CSET EQ, R0
- MOVB R0, ret+16(FP)
- RET
-
-TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-25
- B runtime∕internal∕atomic·Cas64(SB)
-
-TEXT runtime∕internal∕atomic·CasRel(SB), NOSPLIT, $0-17
- B runtime∕internal∕atomic·Cas(SB)
-
-TEXT runtime∕internal∕atomic·Loaduintptr(SB), NOSPLIT, $0-16
- B runtime∕internal∕atomic·Load64(SB)
-
-TEXT runtime∕internal∕atomic·Loaduint(SB), NOSPLIT, $0-16
- B runtime∕internal∕atomic·Load64(SB)
-
-TEXT runtime∕internal∕atomic·Storeuintptr(SB), NOSPLIT, $0-16
- B runtime∕internal∕atomic·Store64(SB)
-
-TEXT runtime∕internal∕atomic·Xadduintptr(SB), NOSPLIT, $0-24
- B runtime∕internal∕atomic·Xadd64(SB)
-
-TEXT runtime∕internal∕atomic·Loadint64(SB), NOSPLIT, $0-16
- B runtime∕internal∕atomic·Load64(SB)
-
-TEXT runtime∕internal∕atomic·Xaddint64(SB), NOSPLIT, $0-24
- B runtime∕internal∕atomic·Xadd64(SB)
-
-// bool Casp1(void **val, void *old, void *new)
-// Atomically:
-// if(*val == old){
-// *val = new;
-// return 1;
-// } else
-// return 0;
-TEXT runtime∕internal∕atomic·Casp1(SB), NOSPLIT, $0-25
- B runtime∕internal∕atomic·Cas64(SB)
diff --git a/src/runtime/internal/atomic/asm_mips64x.s b/src/runtime/internal/atomic/asm_mips64x.s
deleted file mode 100644
index a515683ebb..0000000000
--- a/src/runtime/internal/atomic/asm_mips64x.s
+++ /dev/null
@@ -1,271 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build mips64 mips64le
-
-#include "textflag.h"
-
-// bool cas(uint32 *ptr, uint32 old, uint32 new)
-// Atomically:
-// if(*val == old){
-// *val = new;
-// return 1;
-// } else
-// return 0;
-TEXT ·Cas(SB), NOSPLIT, $0-17
- MOVV ptr+0(FP), R1
- MOVW old+8(FP), R2
- MOVW new+12(FP), R5
- SYNC
-cas_again:
- MOVV R5, R3
- LL (R1), R4
- BNE R2, R4, cas_fail
- SC R3, (R1)
- BEQ R3, cas_again
- MOVV $1, R1
- MOVB R1, ret+16(FP)
- SYNC
- RET
-cas_fail:
- MOVV $0, R1
- JMP -4(PC)
-
-// bool cas64(uint64 *ptr, uint64 old, uint64 new)
-// Atomically:
-// if(*val == *old){
-// *val = new;
-// return 1;
-// } else {
-// return 0;
-// }
-TEXT ·Cas64(SB), NOSPLIT, $0-25
- MOVV ptr+0(FP), R1
- MOVV old+8(FP), R2
- MOVV new+16(FP), R5
- SYNC
-cas64_again:
- MOVV R5, R3
- LLV (R1), R4
- BNE R2, R4, cas64_fail
- SCV R3, (R1)
- BEQ R3, cas64_again
- MOVV $1, R1
- MOVB R1, ret+24(FP)
- SYNC
- RET
-cas64_fail:
- MOVV $0, R1
- JMP -4(PC)
-
-TEXT ·Casuintptr(SB), NOSPLIT, $0-25
- JMP ·Cas64(SB)
-
-TEXT ·CasRel(SB), NOSPLIT, $0-17
- JMP ·Cas(SB)
-
-TEXT ·Loaduintptr(SB), NOSPLIT|NOFRAME, $0-16
- JMP ·Load64(SB)
-
-TEXT ·Loaduint(SB), NOSPLIT|NOFRAME, $0-16
- JMP ·Load64(SB)
-
-TEXT ·Storeuintptr(SB), NOSPLIT, $0-16
- JMP ·Store64(SB)
-
-TEXT ·Xadduintptr(SB), NOSPLIT, $0-24
- JMP ·Xadd64(SB)
-
-TEXT ·Loadint64(SB), NOSPLIT, $0-16
- JMP ·Load64(SB)
-
-TEXT ·Xaddint64(SB), NOSPLIT, $0-24
- JMP ·Xadd64(SB)
-
-// bool casp(void **val, void *old, void *new)
-// Atomically:
-// if(*val == old){
-// *val = new;
-// return 1;
-// } else
-// return 0;
-TEXT ·Casp1(SB), NOSPLIT, $0-25
- JMP runtime∕internal∕atomic·Cas64(SB)
-
-// uint32 xadd(uint32 volatile *ptr, int32 delta)
-// Atomically:
-// *val += delta;
-// return *val;
-TEXT ·Xadd(SB), NOSPLIT, $0-20
- MOVV ptr+0(FP), R2
- MOVW delta+8(FP), R3
- SYNC
- LL (R2), R1
- ADDU R1, R3, R4
- MOVV R4, R1
- SC R4, (R2)
- BEQ R4, -4(PC)
- MOVW R1, ret+16(FP)
- SYNC
- RET
-
-TEXT ·Xadd64(SB), NOSPLIT, $0-24
- MOVV ptr+0(FP), R2
- MOVV delta+8(FP), R3
- SYNC
- LLV (R2), R1
- ADDVU R1, R3, R4
- MOVV R4, R1
- SCV R4, (R2)
- BEQ R4, -4(PC)
- MOVV R1, ret+16(FP)
- SYNC
- RET
-
-TEXT ·Xchg(SB), NOSPLIT, $0-20
- MOVV ptr+0(FP), R2
- MOVW new+8(FP), R5
-
- SYNC
- MOVV R5, R3
- LL (R2), R1
- SC R3, (R2)
- BEQ R3, -3(PC)
- MOVW R1, ret+16(FP)
- SYNC
- RET
-
-TEXT ·Xchg64(SB), NOSPLIT, $0-24
- MOVV ptr+0(FP), R2
- MOVV new+8(FP), R5
-
- SYNC
- MOVV R5, R3
- LLV (R2), R1
- SCV R3, (R2)
- BEQ R3, -3(PC)
- MOVV R1, ret+16(FP)
- SYNC
- RET
-
-TEXT ·Xchguintptr(SB), NOSPLIT, $0-24
- JMP ·Xchg64(SB)
-
-TEXT ·StorepNoWB(SB), NOSPLIT, $0-16
- JMP ·Store64(SB)
-
-TEXT ·StoreRel(SB), NOSPLIT, $0-12
- JMP ·Store(SB)
-
-TEXT ·StoreRel64(SB), NOSPLIT, $0-16
- JMP ·Store64(SB)
-
-TEXT ·StoreReluintptr(SB), NOSPLIT, $0-16
- JMP ·Store64(SB)
-
-TEXT ·Store(SB), NOSPLIT, $0-12
- MOVV ptr+0(FP), R1
- MOVW val+8(FP), R2
- SYNC
- MOVW R2, 0(R1)
- SYNC
- RET
-
-TEXT ·Store8(SB), NOSPLIT, $0-9
- MOVV ptr+0(FP), R1
- MOVB val+8(FP), R2
- SYNC
- MOVB R2, 0(R1)
- SYNC
- RET
-
-TEXT ·Store64(SB), NOSPLIT, $0-16
- MOVV ptr+0(FP), R1
- MOVV val+8(FP), R2
- SYNC
- MOVV R2, 0(R1)
- SYNC
- RET
-
-// void Or8(byte volatile*, byte);
-TEXT ·Or8(SB), NOSPLIT, $0-9
- MOVV ptr+0(FP), R1
- MOVBU val+8(FP), R2
- // Align ptr down to 4 bytes so we can use 32-bit load/store.
- MOVV $~3, R3
- AND R1, R3
- // Compute val shift.
-#ifdef GOARCH_mips64
- // Big endian. ptr = ptr ^ 3
- XOR $3, R1
-#endif
- // R4 = ((ptr & 3) * 8)
- AND $3, R1, R4
- SLLV $3, R4
- // Shift val for aligned ptr. R2 = val << R4
- SLLV R4, R2
-
- SYNC
- LL (R3), R4
- OR R2, R4
- SC R4, (R3)
- BEQ R4, -4(PC)
- SYNC
- RET
-
-// void And8(byte volatile*, byte);
-TEXT ·And8(SB), NOSPLIT, $0-9
- MOVV ptr+0(FP), R1
- MOVBU val+8(FP), R2
- // Align ptr down to 4 bytes so we can use 32-bit load/store.
- MOVV $~3, R3
- AND R1, R3
- // Compute val shift.
-#ifdef GOARCH_mips64
- // Big endian. ptr = ptr ^ 3
- XOR $3, R1
-#endif
- // R4 = ((ptr & 3) * 8)
- AND $3, R1, R4
- SLLV $3, R4
- // Shift val for aligned ptr. R2 = val << R4 | ^(0xFF << R4)
- MOVV $0xFF, R5
- SLLV R4, R2
- SLLV R4, R5
- NOR R0, R5
- OR R5, R2
-
- SYNC
- LL (R3), R4
- AND R2, R4
- SC R4, (R3)
- BEQ R4, -4(PC)
- SYNC
- RET
-
-// func Or(addr *uint32, v uint32)
-TEXT ·Or(SB), NOSPLIT, $0-12
- MOVV ptr+0(FP), R1
- MOVW val+8(FP), R2
-
- SYNC
- LL (R1), R3
- OR R2, R3
- SC R3, (R1)
- BEQ R3, -4(PC)
- SYNC
- RET
-
-// func And(addr *uint32, v uint32)
-TEXT ·And(SB), NOSPLIT, $0-12
- MOVV ptr+0(FP), R1
- MOVW val+8(FP), R2
-
- SYNC
- LL (R1), R3
- AND R2, R3
- SC R3, (R1)
- BEQ R3, -4(PC)
- SYNC
- RET
diff --git a/src/runtime/internal/atomic/asm_mipsx.s b/src/runtime/internal/atomic/asm_mipsx.s
deleted file mode 100644
index 2b2cfabe08..0000000000
--- a/src/runtime/internal/atomic/asm_mipsx.s
+++ /dev/null
@@ -1,200 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build mips mipsle
-
-#include "textflag.h"
-
-TEXT ·Cas(SB),NOSPLIT,$0-13
- MOVW ptr+0(FP), R1
- MOVW old+4(FP), R2
- MOVW new+8(FP), R5
- SYNC
-try_cas:
- MOVW R5, R3
- LL (R1), R4 // R4 = *R1
- BNE R2, R4, cas_fail
- SC R3, (R1) // *R1 = R3
- BEQ R3, try_cas
- SYNC
- MOVB R3, ret+12(FP)
- RET
-cas_fail:
- MOVB R0, ret+12(FP)
- RET
-
-TEXT ·Store(SB),NOSPLIT,$0-8
- MOVW ptr+0(FP), R1
- MOVW val+4(FP), R2
- SYNC
- MOVW R2, 0(R1)
- SYNC
- RET
-
-TEXT ·Store8(SB),NOSPLIT,$0-5
- MOVW ptr+0(FP), R1
- MOVB val+4(FP), R2
- SYNC
- MOVB R2, 0(R1)
- SYNC
- RET
-
-TEXT ·Load(SB),NOSPLIT,$0-8
- MOVW ptr+0(FP), R1
- SYNC
- MOVW 0(R1), R1
- SYNC
- MOVW R1, ret+4(FP)
- RET
-
-TEXT ·Load8(SB),NOSPLIT,$0-5
- MOVW ptr+0(FP), R1
- SYNC
- MOVB 0(R1), R1
- SYNC
- MOVB R1, ret+4(FP)
- RET
-
-TEXT ·Xadd(SB),NOSPLIT,$0-12
- MOVW ptr+0(FP), R2
- MOVW delta+4(FP), R3
- SYNC
-try_xadd:
- LL (R2), R1 // R1 = *R2
- ADDU R1, R3, R4
- MOVW R4, R1
- SC R4, (R2) // *R2 = R4
- BEQ R4, try_xadd
- SYNC
- MOVW R1, ret+8(FP)
- RET
-
-TEXT ·Xchg(SB),NOSPLIT,$0-12
- MOVW ptr+0(FP), R2
- MOVW new+4(FP), R5
- SYNC
-try_xchg:
- MOVW R5, R3
- LL (R2), R1 // R1 = *R2
- SC R3, (R2) // *R2 = R3
- BEQ R3, try_xchg
- SYNC
- MOVW R1, ret+8(FP)
- RET
-
-TEXT ·Casuintptr(SB),NOSPLIT,$0-13
- JMP ·Cas(SB)
-
-TEXT ·CasRel(SB),NOSPLIT,$0-13
- JMP ·Cas(SB)
-
-TEXT ·Loaduintptr(SB),NOSPLIT,$0-8
- JMP ·Load(SB)
-
-TEXT ·Loaduint(SB),NOSPLIT,$0-8
- JMP ·Load(SB)
-
-TEXT ·Loadp(SB),NOSPLIT,$-0-8
- JMP ·Load(SB)
-
-TEXT ·Storeuintptr(SB),NOSPLIT,$0-8
- JMP ·Store(SB)
-
-TEXT ·Xadduintptr(SB),NOSPLIT,$0-12
- JMP ·Xadd(SB)
-
-TEXT ·Loadint64(SB),NOSPLIT,$0-12
- JMP ·Load64(SB)
-
-TEXT ·Xaddint64(SB),NOSPLIT,$0-20
- JMP ·Xadd64(SB)
-
-TEXT ·Casp1(SB),NOSPLIT,$0-13
- JMP ·Cas(SB)
-
-TEXT ·Xchguintptr(SB),NOSPLIT,$0-12
- JMP ·Xchg(SB)
-
-TEXT ·StorepNoWB(SB),NOSPLIT,$0-8
- JMP ·Store(SB)
-
-TEXT ·StoreRel(SB),NOSPLIT,$0-8
- JMP ·Store(SB)
-
-TEXT ·StoreReluintptr(SB),NOSPLIT,$0-8
- JMP ·Store(SB)
-
-// void Or8(byte volatile*, byte);
-TEXT ·Or8(SB),NOSPLIT,$0-5
- MOVW ptr+0(FP), R1
- MOVBU val+4(FP), R2
- MOVW $~3, R3 // Align ptr down to 4 bytes so we can use 32-bit load/store.
- AND R1, R3
-#ifdef GOARCH_mips
- // Big endian. ptr = ptr ^ 3
- XOR $3, R1
-#endif
- AND $3, R1, R4 // R4 = ((ptr & 3) * 8)
- SLL $3, R4
- SLL R4, R2, R2 // Shift val for aligned ptr. R2 = val << R4
- SYNC
-try_or8:
- LL (R3), R4 // R4 = *R3
- OR R2, R4
- SC R4, (R3) // *R3 = R4
- BEQ R4, try_or8
- SYNC
- RET
-
-// void And8(byte volatile*, byte);
-TEXT ·And8(SB),NOSPLIT,$0-5
- MOVW ptr+0(FP), R1
- MOVBU val+4(FP), R2
- MOVW $~3, R3
- AND R1, R3
-#ifdef GOARCH_mips
- // Big endian. ptr = ptr ^ 3
- XOR $3, R1
-#endif
- AND $3, R1, R4 // R4 = ((ptr & 3) * 8)
- SLL $3, R4
- MOVW $0xFF, R5
- SLL R4, R2
- SLL R4, R5
- NOR R0, R5
- OR R5, R2 // Shift val for aligned ptr. R2 = val << R4 | ^(0xFF << R4)
- SYNC
-try_and8:
- LL (R3), R4 // R4 = *R3
- AND R2, R4
- SC R4, (R3) // *R3 = R4
- BEQ R4, try_and8
- SYNC
- RET
-
-// func Or(addr *uint32, v uint32)
-TEXT ·Or(SB), NOSPLIT, $0-8
- MOVW ptr+0(FP), R1
- MOVW val+4(FP), R2
-
- SYNC
- LL (R1), R3
- OR R2, R3
- SC R3, (R1)
- BEQ R3, -4(PC)
- SYNC
- RET
-
-// func And(addr *uint32, v uint32)
-TEXT ·And(SB), NOSPLIT, $0-8
- MOVW ptr+0(FP), R1
- MOVW val+4(FP), R2
-
- SYNC
- LL (R1), R3
- AND R2, R3
- SC R3, (R1)
- BEQ R3, -4(PC)
- SYNC
- RET
diff --git a/src/runtime/internal/atomic/asm_ppc64x.s b/src/runtime/internal/atomic/asm_ppc64x.s
deleted file mode 100644
index bb009ab34d..0000000000
--- a/src/runtime/internal/atomic/asm_ppc64x.s
+++ /dev/null
@@ -1,253 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build ppc64 ppc64le
-
-#include "textflag.h"
-
-// bool cas(uint32 *ptr, uint32 old, uint32 new)
-// Atomically:
-// if(*val == old){
-// *val = new;
-// return 1;
-// } else
-// return 0;
-TEXT runtime∕internal∕atomic·Cas(SB), NOSPLIT, $0-17
- MOVD ptr+0(FP), R3
- MOVWZ old+8(FP), R4
- MOVWZ new+12(FP), R5
- LWSYNC
-cas_again:
- LWAR (R3), R6
- CMPW R6, R4
- BNE cas_fail
- STWCCC R5, (R3)
- BNE cas_again
- MOVD $1, R3
- LWSYNC
- MOVB R3, ret+16(FP)
- RET
-cas_fail:
- MOVB R0, ret+16(FP)
- RET
-
-// bool runtime∕internal∕atomic·Cas64(uint64 *ptr, uint64 old, uint64 new)
-// Atomically:
-// if(*val == *old){
-// *val = new;
-// return 1;
-// } else {
-// return 0;
-// }
-TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-25
- MOVD ptr+0(FP), R3
- MOVD old+8(FP), R4
- MOVD new+16(FP), R5
- LWSYNC
-cas64_again:
- LDAR (R3), R6
- CMP R6, R4
- BNE cas64_fail
- STDCCC R5, (R3)
- BNE cas64_again
- MOVD $1, R3
- LWSYNC
- MOVB R3, ret+24(FP)
- RET
-cas64_fail:
- MOVB R0, ret+24(FP)
- RET
-
-TEXT runtime∕internal∕atomic·CasRel(SB), NOSPLIT, $0-17
- MOVD ptr+0(FP), R3
- MOVWZ old+8(FP), R4
- MOVWZ new+12(FP), R5
- LWSYNC
-cas_again:
- LWAR (R3), $0, R6 // 0 = Mutex release hint
- CMPW R6, R4
- BNE cas_fail
- STWCCC R5, (R3)
- BNE cas_again
- MOVD $1, R3
- MOVB R3, ret+16(FP)
- RET
-cas_fail:
- MOVB R0, ret+16(FP)
- RET
-
-TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-25
- BR runtime∕internal∕atomic·Cas64(SB)
-
-TEXT runtime∕internal∕atomic·Loaduintptr(SB), NOSPLIT|NOFRAME, $0-16
- BR runtime∕internal∕atomic·Load64(SB)
-
-TEXT runtime∕internal∕atomic·LoadAcquintptr(SB), NOSPLIT|NOFRAME, $0-16
- BR runtime∕internal∕atomic·LoadAcq64(SB)
-
-TEXT runtime∕internal∕atomic·Loaduint(SB), NOSPLIT|NOFRAME, $0-16
- BR runtime∕internal∕atomic·Load64(SB)
-
-TEXT runtime∕internal∕atomic·Storeuintptr(SB), NOSPLIT, $0-16
- BR runtime∕internal∕atomic·Store64(SB)
-
-TEXT runtime∕internal∕atomic·StoreReluintptr(SB), NOSPLIT, $0-16
- BR runtime∕internal∕atomic·StoreRel64(SB)
-
-TEXT runtime∕internal∕atomic·Xadduintptr(SB), NOSPLIT, $0-24
- BR runtime∕internal∕atomic·Xadd64(SB)
-
-TEXT runtime∕internal∕atomic·Loadint64(SB), NOSPLIT, $0-16
- BR runtime∕internal∕atomic·Load64(SB)
-
-TEXT runtime∕internal∕atomic·Xaddint64(SB), NOSPLIT, $0-24
- BR runtime∕internal∕atomic·Xadd64(SB)
-
-// bool casp(void **val, void *old, void *new)
-// Atomically:
-// if(*val == old){
-// *val = new;
-// return 1;
-// } else
-// return 0;
-TEXT runtime∕internal∕atomic·Casp1(SB), NOSPLIT, $0-25
- BR runtime∕internal∕atomic·Cas64(SB)
-
-// uint32 xadd(uint32 volatile *ptr, int32 delta)
-// Atomically:
-// *val += delta;
-// return *val;
-TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-20
- MOVD ptr+0(FP), R4
- MOVW delta+8(FP), R5
- LWSYNC
- LWAR (R4), R3
- ADD R5, R3
- STWCCC R3, (R4)
- BNE -3(PC)
- MOVW R3, ret+16(FP)
- RET
-
-TEXT runtime∕internal∕atomic·Xadd64(SB), NOSPLIT, $0-24
- MOVD ptr+0(FP), R4
- MOVD delta+8(FP), R5
- LWSYNC
- LDAR (R4), R3
- ADD R5, R3
- STDCCC R3, (R4)
- BNE -3(PC)
- MOVD R3, ret+16(FP)
- RET
-
-TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-20
- MOVD ptr+0(FP), R4
- MOVW new+8(FP), R5
- LWSYNC
- LWAR (R4), R3
- STWCCC R5, (R4)
- BNE -2(PC)
- ISYNC
- MOVW R3, ret+16(FP)
- RET
-
-TEXT runtime∕internal∕atomic·Xchg64(SB), NOSPLIT, $0-24
- MOVD ptr+0(FP), R4
- MOVD new+8(FP), R5
- LWSYNC
- LDAR (R4), R3
- STDCCC R5, (R4)
- BNE -2(PC)
- ISYNC
- MOVD R3, ret+16(FP)
- RET
-
-TEXT runtime∕internal∕atomic·Xchguintptr(SB), NOSPLIT, $0-24
- BR runtime∕internal∕atomic·Xchg64(SB)
-
-
-TEXT runtime∕internal∕atomic·StorepNoWB(SB), NOSPLIT, $0-16
- BR runtime∕internal∕atomic·Store64(SB)
-
-TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-12
- MOVD ptr+0(FP), R3
- MOVW val+8(FP), R4
- SYNC
- MOVW R4, 0(R3)
- RET
-
-TEXT runtime∕internal∕atomic·Store8(SB), NOSPLIT, $0-9
- MOVD ptr+0(FP), R3
- MOVB val+8(FP), R4
- SYNC
- MOVB R4, 0(R3)
- RET
-
-TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-16
- MOVD ptr+0(FP), R3
- MOVD val+8(FP), R4
- SYNC
- MOVD R4, 0(R3)
- RET
-
-TEXT runtime∕internal∕atomic·StoreRel(SB), NOSPLIT, $0-12
- MOVD ptr+0(FP), R3
- MOVW val+8(FP), R4
- LWSYNC
- MOVW R4, 0(R3)
- RET
-
-TEXT runtime∕internal∕atomic·StoreRel64(SB), NOSPLIT, $0-16
- MOVD ptr+0(FP), R3
- MOVD val+8(FP), R4
- LWSYNC
- MOVD R4, 0(R3)
- RET
-
-// void runtime∕internal∕atomic·Or8(byte volatile*, byte);
-TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-9
- MOVD ptr+0(FP), R3
- MOVBZ val+8(FP), R4
- LWSYNC
-again:
- LBAR (R3), R6
- OR R4, R6
- STBCCC R6, (R3)
- BNE again
- RET
-
-// void runtime∕internal∕atomic·And8(byte volatile*, byte);
-TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-9
- MOVD ptr+0(FP), R3
- MOVBZ val+8(FP), R4
- LWSYNC
-again:
- LBAR (R3), R6
- AND R4, R6
- STBCCC R6, (R3)
- BNE again
- RET
-
-// func Or(addr *uint32, v uint32)
-TEXT runtime∕internal∕atomic·Or(SB), NOSPLIT, $0-12
- MOVD ptr+0(FP), R3
- MOVW val+8(FP), R4
- LWSYNC
-again:
- LWAR (R3), R6
- OR R4, R6
- STWCCC R6, (R3)
- BNE again
- RET
-
-// func And(addr *uint32, v uint32)
-TEXT runtime∕internal∕atomic·And(SB), NOSPLIT, $0-12
- MOVD ptr+0(FP), R3
- MOVW val+8(FP), R4
- LWSYNC
-again:
- LWAR (R3),R6
- AND R4, R6
- STWCCC R6, (R3)
- BNE again
- RET
diff --git a/src/runtime/internal/atomic/atomic_386.go b/src/runtime/internal/atomic/atomic_386.go
index 1bfcb1143d..d4aed6b671 100644
--- a/src/runtime/internal/atomic/atomic_386.go
+++ b/src/runtime/internal/atomic/atomic_386.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build 386
// +build 386
package atomic
diff --git a/src/runtime/internal/atomic/asm_386.s b/src/runtime/internal/atomic/atomic_386.s
index d82faef1f0..37318e0ad7 100644
--- a/src/runtime/internal/atomic/asm_386.s
+++ b/src/runtime/internal/atomic/atomic_386.s
@@ -21,6 +21,12 @@ TEXT ·Cas(SB), NOSPLIT, $0-13
SETEQ ret+12(FP)
RET
+TEXT ·Casint32(SB), NOSPLIT, $0-13
+ JMP ·Cas(SB)
+
+TEXT ·Casint64(SB), NOSPLIT, $0-21
+ JMP ·Cas64(SB)
+
TEXT ·Casuintptr(SB), NOSPLIT, $0-13
JMP ·Cas(SB)
@@ -33,15 +39,27 @@ TEXT ·Loaduintptr(SB), NOSPLIT, $0-8
TEXT ·Loaduint(SB), NOSPLIT, $0-8
JMP ·Load(SB)
+TEXT ·Storeint32(SB), NOSPLIT, $0-8
+ JMP ·Store(SB)
+
+TEXT ·Storeint64(SB), NOSPLIT, $0-12
+ JMP ·Store64(SB)
+
TEXT ·Storeuintptr(SB), NOSPLIT, $0-8
JMP ·Store(SB)
TEXT ·Xadduintptr(SB), NOSPLIT, $0-12
JMP ·Xadd(SB)
+TEXT ·Loadint32(SB), NOSPLIT, $0-8
+ JMP ·Load(SB)
+
TEXT ·Loadint64(SB), NOSPLIT, $0-12
JMP ·Load64(SB)
+TEXT ·Xaddint32(SB), NOSPLIT, $0-12
+ JMP ·Xadd(SB)
+
TEXT ·Xaddint64(SB), NOSPLIT, $0-20
JMP ·Xadd64(SB)
@@ -142,6 +160,12 @@ TEXT ·Xchg(SB), NOSPLIT, $0-12
MOVL AX, ret+8(FP)
RET
+TEXT ·Xchgint32(SB), NOSPLIT, $0-12
+ JMP ·Xchg(SB)
+
+TEXT ·Xchgint64(SB), NOSPLIT, $0-20
+ JMP ·Xchg64(SB)
+
TEXT ·Xchguintptr(SB), NOSPLIT, $0-12
JMP ·Xchg(SB)
@@ -189,8 +213,8 @@ TEXT ·Store(SB), NOSPLIT, $0-8
TEXT ·StoreRel(SB), NOSPLIT, $0-8
JMP ·Store(SB)
-TEXT runtime∕internal∕atomic·StoreReluintptr(SB), NOSPLIT, $0-8
- JMP runtime∕internal∕atomic·Store(SB)
+TEXT ·StoreReluintptr(SB), NOSPLIT, $0-8
+ JMP ·Store(SB)
// uint64 atomicload64(uint64 volatile* addr);
TEXT ·Load64(SB), NOSPLIT, $0-12
diff --git a/src/runtime/internal/atomic/atomic_amd64.s b/src/runtime/internal/atomic/atomic_amd64.s
new file mode 100644
index 0000000000..57cd59dd8c
--- /dev/null
+++ b/src/runtime/internal/atomic/atomic_amd64.s
@@ -0,0 +1,225 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Note: some of these functions are semantically inlined
+// by the compiler (in src/cmd/compile/internal/gc/ssa.go).
+
+#include "textflag.h"
+
+TEXT ·Loaduintptr(SB), NOSPLIT, $0-16
+ JMP ·Load64(SB)
+
+TEXT ·Loaduint(SB), NOSPLIT, $0-16
+ JMP ·Load64(SB)
+
+TEXT ·Loadint32(SB), NOSPLIT, $0-12
+ JMP ·Load(SB)
+
+TEXT ·Loadint64(SB), NOSPLIT, $0-16
+ JMP ·Load64(SB)
+
+// bool Cas(int32 *val, int32 old, int32 new)
+// Atomically:
+// if(*val == old){
+// *val = new;
+// return 1;
+// } else
+// return 0;
+TEXT ·Cas(SB),NOSPLIT,$0-17
+ MOVQ ptr+0(FP), BX
+ MOVL old+8(FP), AX
+ MOVL new+12(FP), CX
+ LOCK
+ CMPXCHGL CX, 0(BX)
+ SETEQ ret+16(FP)
+ RET
+
+// bool ·Cas64(uint64 *val, uint64 old, uint64 new)
+// Atomically:
+// if(*val == old){
+// *val = new;
+// return 1;
+// } else {
+// return 0;
+// }
+TEXT ·Cas64(SB), NOSPLIT, $0-25
+ MOVQ ptr+0(FP), BX
+ MOVQ old+8(FP), AX
+ MOVQ new+16(FP), CX
+ LOCK
+ CMPXCHGQ CX, 0(BX)
+ SETEQ ret+24(FP)
+ RET
+
+// bool Casp1(void **val, void *old, void *new)
+// Atomically:
+// if(*val == old){
+// *val = new;
+// return 1;
+// } else
+// return 0;
+TEXT ·Casp1(SB), NOSPLIT, $0-25
+ MOVQ ptr+0(FP), BX
+ MOVQ old+8(FP), AX
+ MOVQ new+16(FP), CX
+ LOCK
+ CMPXCHGQ CX, 0(BX)
+ SETEQ ret+24(FP)
+ RET
+
+TEXT ·Casint32(SB), NOSPLIT, $0-17
+ JMP ·Cas(SB)
+
+TEXT ·Casint64(SB), NOSPLIT, $0-25
+ JMP ·Cas64(SB)
+
+TEXT ·Casuintptr(SB), NOSPLIT, $0-25
+ JMP ·Cas64(SB)
+
+TEXT ·CasRel(SB), NOSPLIT, $0-17
+ JMP ·Cas(SB)
+
+// uint32 Xadd(uint32 volatile *val, int32 delta)
+// Atomically:
+// *val += delta;
+// return *val;
+TEXT ·Xadd(SB), NOSPLIT, $0-20
+ MOVQ ptr+0(FP), BX
+ MOVL delta+8(FP), AX
+ MOVL AX, CX
+ LOCK
+ XADDL AX, 0(BX)
+ ADDL CX, AX
+ MOVL AX, ret+16(FP)
+ RET
+
+// uint64 Xadd64(uint64 volatile *val, int64 delta)
+// Atomically:
+// *val += delta;
+// return *val;
+TEXT ·Xadd64(SB), NOSPLIT, $0-24
+ MOVQ ptr+0(FP), BX
+ MOVQ delta+8(FP), AX
+ MOVQ AX, CX
+ LOCK
+ XADDQ AX, 0(BX)
+ ADDQ CX, AX
+ MOVQ AX, ret+16(FP)
+ RET
+
+TEXT ·Xaddint32(SB), NOSPLIT, $0-20
+ JMP ·Xadd(SB)
+
+TEXT ·Xaddint64(SB), NOSPLIT, $0-24
+ JMP ·Xadd64(SB)
+
+TEXT ·Xadduintptr(SB), NOSPLIT, $0-24
+ JMP ·Xadd64(SB)
+
+// uint32 Xchg(ptr *uint32, new uint32)
+// Atomically:
+// old := *ptr;
+// *ptr = new;
+// return old;
+TEXT ·Xchg(SB), NOSPLIT, $0-20
+ MOVQ ptr+0(FP), BX
+ MOVL new+8(FP), AX
+ XCHGL AX, 0(BX)
+ MOVL AX, ret+16(FP)
+ RET
+
+// uint64 Xchg64(ptr *uint64, new uint64)
+// Atomically:
+// old := *ptr;
+// *ptr = new;
+// return old;
+TEXT ·Xchg64(SB), NOSPLIT, $0-24
+ MOVQ ptr+0(FP), BX
+ MOVQ new+8(FP), AX
+ XCHGQ AX, 0(BX)
+ MOVQ AX, ret+16(FP)
+ RET
+
+TEXT ·Xchgint32(SB), NOSPLIT, $0-20
+ JMP ·Xchg(SB)
+
+TEXT ·Xchgint64(SB), NOSPLIT, $0-24
+ JMP ·Xchg64(SB)
+
+TEXT ·Xchguintptr(SB), NOSPLIT, $0-24
+ JMP ·Xchg64(SB)
+
+TEXT ·StorepNoWB(SB), NOSPLIT, $0-16
+ MOVQ ptr+0(FP), BX
+ MOVQ val+8(FP), AX
+ XCHGQ AX, 0(BX)
+ RET
+
+TEXT ·Store(SB), NOSPLIT, $0-12
+ MOVQ ptr+0(FP), BX
+ MOVL val+8(FP), AX
+ XCHGL AX, 0(BX)
+ RET
+
+TEXT ·Store8(SB), NOSPLIT, $0-9
+ MOVQ ptr+0(FP), BX
+ MOVB val+8(FP), AX
+ XCHGB AX, 0(BX)
+ RET
+
+TEXT ·Store64(SB), NOSPLIT, $0-16
+ MOVQ ptr+0(FP), BX
+ MOVQ val+8(FP), AX
+ XCHGQ AX, 0(BX)
+ RET
+
+TEXT ·Storeint32(SB), NOSPLIT, $0-12
+ JMP ·Store(SB)
+
+TEXT ·Storeint64(SB), NOSPLIT, $0-16
+ JMP ·Store64(SB)
+
+TEXT ·Storeuintptr(SB), NOSPLIT, $0-16
+ JMP ·Store64(SB)
+
+TEXT ·StoreRel(SB), NOSPLIT, $0-12
+ JMP ·Store(SB)
+
+TEXT ·StoreRel64(SB), NOSPLIT, $0-16
+ JMP ·Store64(SB)
+
+TEXT ·StoreReluintptr(SB), NOSPLIT, $0-16
+ JMP ·Store64(SB)
+
+// void ·Or8(byte volatile*, byte);
+TEXT ·Or8(SB), NOSPLIT, $0-9
+ MOVQ ptr+0(FP), AX
+ MOVB val+8(FP), BX
+ LOCK
+ ORB BX, (AX)
+ RET
+
+// void ·And8(byte volatile*, byte);
+TEXT ·And8(SB), NOSPLIT, $0-9
+ MOVQ ptr+0(FP), AX
+ MOVB val+8(FP), BX
+ LOCK
+ ANDB BX, (AX)
+ RET
+
+// func Or(addr *uint32, v uint32)
+TEXT ·Or(SB), NOSPLIT, $0-12
+ MOVQ ptr+0(FP), AX
+ MOVL val+8(FP), BX
+ LOCK
+ ORL BX, (AX)
+ RET
+
+// func And(addr *uint32, v uint32)
+TEXT ·And(SB), NOSPLIT, $0-12
+ MOVQ ptr+0(FP), AX
+ MOVL val+8(FP), BX
+ LOCK
+ ANDL BX, (AX)
+ RET
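
Besides moving from asm_amd64.s, the new file adds typed entry points (Casint32, Storeint64, Xchgint64, and friends) that simply tail-jump to the width-appropriate primitive, so each width is implemented exactly once. At the Go level such an alias amounts to the following sketch (sync/atomic stands in for the runtime primitive):

    package main

    import (
    	"fmt"
    	"sync/atomic"
    	"unsafe"
    )

    // casInt64 mirrors what the Casint64 alias provides: a typed
    // wrapper that forwards, unchanged, to the 64-bit CAS primitive.
    func casInt64(ptr *int64, old, new int64) bool {
    	return atomic.CompareAndSwapUint64(
    		(*uint64)(unsafe.Pointer(ptr)), uint64(old), uint64(new))
    }

    func main() {
    	v := int64(1)
    	fmt.Println(casInt64(&v, 1, 2), v) // true 2
    }
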
diff --git a/src/runtime/internal/atomic/atomic_arm.go b/src/runtime/internal/atomic/atomic_arm.go
index 546b3d6120..2e9374ca26 100644
--- a/src/runtime/internal/atomic/atomic_arm.go
+++ b/src/runtime/internal/atomic/atomic_arm.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build arm
// +build arm
package atomic
diff --git a/src/runtime/internal/atomic/asm_arm.s b/src/runtime/internal/atomic/atomic_arm.s
index 274925ed60..be3fd3a395 100644
--- a/src/runtime/internal/atomic/asm_arm.s
+++ b/src/runtime/internal/atomic/atomic_arm.s
@@ -60,6 +60,12 @@ TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$0-8
TEXT ·LoadAcquintptr(SB),NOSPLIT|NOFRAME,$0-8
B ·Load(SB)
+TEXT ·Casint32(SB),NOSPLIT,$0-13
+ B ·Cas(SB)
+
+TEXT ·Casint64(SB),NOSPLIT,$-4-21
+ B ·Cas64(SB)
+
TEXT ·Casuintptr(SB),NOSPLIT,$0-13
B ·Cas(SB)
@@ -69,12 +75,24 @@ TEXT ·Casp1(SB),NOSPLIT,$0-13
TEXT ·CasRel(SB),NOSPLIT,$0-13
B ·Cas(SB)
+TEXT ·Loadint32(SB),NOSPLIT,$0-8
+ B ·Load(SB)
+
+TEXT ·Loadint64(SB),NOSPLIT,$-4-12
+ B ·Load64(SB)
+
TEXT ·Loaduintptr(SB),NOSPLIT,$0-8
B ·Load(SB)
TEXT ·Loaduint(SB),NOSPLIT,$0-8
B ·Load(SB)
+TEXT ·Storeint32(SB),NOSPLIT,$0-8
+ B ·Store(SB)
+
+TEXT ·Storeint64(SB),NOSPLIT,$0-12
+ B ·Store64(SB)
+
TEXT ·Storeuintptr(SB),NOSPLIT,$0-8
B ·Store(SB)
@@ -87,21 +105,26 @@ TEXT ·StoreRel(SB),NOSPLIT,$0-8
TEXT ·StoreReluintptr(SB),NOSPLIT,$0-8
B ·Store(SB)
+TEXT ·Xaddint32(SB),NOSPLIT,$0-12
+ B ·Xadd(SB)
+
+TEXT ·Xaddint64(SB),NOSPLIT,$-4-20
+ B ·Xadd64(SB)
+
TEXT ·Xadduintptr(SB),NOSPLIT,$0-12
B ·Xadd(SB)
-TEXT ·Loadint64(SB),NOSPLIT,$0-12
- B ·Load64(SB)
+TEXT ·Xchgint32(SB),NOSPLIT,$0-12
+ B ·Xchg(SB)
-TEXT ·Xaddint64(SB),NOSPLIT,$0-20
- B ·Xadd64(SB)
+TEXT ·Xchgint64(SB),NOSPLIT,$-4-20
+ B ·Xchg64(SB)
// 64-bit atomics
// The native ARM implementations use LDREXD/STREXD, which are
// available on ARMv6k or later. We use them only on ARMv7.
// On older ARM, we use Go implementations which simulate 64-bit
// atomics with locks.
-
TEXT armCas64<>(SB),NOSPLIT,$0-21
// addr is already in R1
MOVW old_lo+4(FP), R2
diff --git a/src/runtime/internal/atomic/atomic_arm64.go b/src/runtime/internal/atomic/atomic_arm64.go
index d49bee8936..131c687e1b 100644
--- a/src/runtime/internal/atomic/atomic_arm64.go
+++ b/src/runtime/internal/atomic/atomic_arm64.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build arm64
// +build arm64
package atomic
diff --git a/src/runtime/internal/atomic/atomic_arm64.s b/src/runtime/internal/atomic/atomic_arm64.s
index 0cf3c40223..587e7f05e2 100644
--- a/src/runtime/internal/atomic/atomic_arm64.s
+++ b/src/runtime/internal/atomic/atomic_arm64.s
@@ -4,77 +4,127 @@
#include "textflag.h"
-// uint32 runtime∕internal∕atomic·Load(uint32 volatile* addr)
+TEXT ·Casint32(SB), NOSPLIT, $0-17
+ B ·Cas(SB)
+
+TEXT ·Casint64(SB), NOSPLIT, $0-25
+ B ·Cas64(SB)
+
+TEXT ·Casuintptr(SB), NOSPLIT, $0-25
+ B ·Cas64(SB)
+
+TEXT ·CasRel(SB), NOSPLIT, $0-17
+ B ·Cas(SB)
+
+TEXT ·Loadint32(SB), NOSPLIT, $0-12
+ B ·Load(SB)
+
+TEXT ·Loadint64(SB), NOSPLIT, $0-16
+ B ·Load64(SB)
+
+TEXT ·Loaduintptr(SB), NOSPLIT, $0-16
+ B ·Load64(SB)
+
+TEXT ·Loaduint(SB), NOSPLIT, $0-16
+ B ·Load64(SB)
+
+TEXT ·Storeint32(SB), NOSPLIT, $0-12
+ B ·Store(SB)
+
+TEXT ·Storeint64(SB), NOSPLIT, $0-16
+ B ·Store64(SB)
+
+TEXT ·Storeuintptr(SB), NOSPLIT, $0-16
+ B ·Store64(SB)
+
+TEXT ·Xaddint32(SB), NOSPLIT, $0-20
+ B ·Xadd(SB)
+
+TEXT ·Xaddint64(SB), NOSPLIT, $0-24
+ B ·Xadd64(SB)
+
+TEXT ·Xadduintptr(SB), NOSPLIT, $0-24
+ B ·Xadd64(SB)
+
+TEXT ·Casp1(SB), NOSPLIT, $0-25
+ B ·Cas64(SB)
+
+// uint32 ·Load(uint32 volatile* addr)
TEXT ·Load(SB),NOSPLIT,$0-12
MOVD ptr+0(FP), R0
LDARW (R0), R0
MOVW R0, ret+8(FP)
RET
-// uint8 runtime∕internal∕atomic·Load8(uint8 volatile* addr)
+// uint8 ·Load8(uint8 volatile* addr)
TEXT ·Load8(SB),NOSPLIT,$0-9
MOVD ptr+0(FP), R0
LDARB (R0), R0
MOVB R0, ret+8(FP)
RET
-// uint64 runtime∕internal∕atomic·Load64(uint64 volatile* addr)
+// uint64 ·Load64(uint64 volatile* addr)
TEXT ·Load64(SB),NOSPLIT,$0-16
MOVD ptr+0(FP), R0
LDAR (R0), R0
MOVD R0, ret+8(FP)
RET
-// void *runtime∕internal∕atomic·Loadp(void *volatile *addr)
+// void *·Loadp(void *volatile *addr)
TEXT ·Loadp(SB),NOSPLIT,$0-16
MOVD ptr+0(FP), R0
LDAR (R0), R0
MOVD R0, ret+8(FP)
RET
-// uint32 runtime∕internal∕atomic·LoadAcq(uint32 volatile* addr)
+// uint32 ·LoadAcq(uint32 volatile* addr)
TEXT ·LoadAcq(SB),NOSPLIT,$0-12
B ·Load(SB)
-// uint64 runtime∕internal∕atomic·LoadAcquintptr(uint64 volatile* addr)
+// uint64 ·LoadAcq64(uint64 volatile* addr)
TEXT ·LoadAcq64(SB),NOSPLIT,$0-16
B ·Load64(SB)
-// uintptr runtime∕internal∕atomic·LoadAcq64(uintptr volatile* addr)
+// uintptr ·LoadAcquintptr(uintptr volatile* addr)
TEXT ·LoadAcquintptr(SB),NOSPLIT,$0-16
B ·Load64(SB)
-TEXT runtime∕internal∕atomic·StorepNoWB(SB), NOSPLIT, $0-16
- B runtime∕internal∕atomic·Store64(SB)
+TEXT ·StorepNoWB(SB), NOSPLIT, $0-16
+ B ·Store64(SB)
-TEXT runtime∕internal∕atomic·StoreRel(SB), NOSPLIT, $0-12
- B runtime∕internal∕atomic·Store(SB)
+TEXT ·StoreRel(SB), NOSPLIT, $0-12
+ B ·Store(SB)
-TEXT runtime∕internal∕atomic·StoreRel64(SB), NOSPLIT, $0-16
- B runtime∕internal∕atomic·Store64(SB)
+TEXT ·StoreRel64(SB), NOSPLIT, $0-16
+ B ·Store64(SB)
-TEXT runtime∕internal∕atomic·StoreReluintptr(SB), NOSPLIT, $0-16
- B runtime∕internal∕atomic·Store64(SB)
+TEXT ·StoreReluintptr(SB), NOSPLIT, $0-16
+ B ·Store64(SB)
-TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-12
+TEXT ·Store(SB), NOSPLIT, $0-12
MOVD ptr+0(FP), R0
MOVW val+8(FP), R1
STLRW R1, (R0)
RET
-TEXT runtime∕internal∕atomic·Store8(SB), NOSPLIT, $0-9
+TEXT ·Store8(SB), NOSPLIT, $0-9
MOVD ptr+0(FP), R0
MOVB val+8(FP), R1
STLRB R1, (R0)
RET
-TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-16
+TEXT ·Store64(SB), NOSPLIT, $0-16
MOVD ptr+0(FP), R0
MOVD val+8(FP), R1
STLR R1, (R0)
RET
-TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-20
+// uint32 Xchg(ptr *uint32, new uint32)
+// Atomically:
+// old := *ptr;
+// *ptr = new;
+// return old;
+TEXT ·Xchg(SB), NOSPLIT, $0-20
MOVD ptr+0(FP), R0
MOVW new+8(FP), R1
again:
@@ -84,7 +134,12 @@ again:
MOVW R2, ret+16(FP)
RET
-TEXT runtime∕internal∕atomic·Xchg64(SB), NOSPLIT, $0-24
+// uint64 Xchg64(ptr *uint64, new uint64)
+// Atomically:
+// old := *ptr;
+// *ptr = new;
+// return old;
+TEXT ·Xchg64(SB), NOSPLIT, $0-24
MOVD ptr+0(FP), R0
MOVD new+8(FP), R1
again:
@@ -94,7 +149,29 @@ again:
MOVD R2, ret+16(FP)
RET
-// bool runtime∕internal∕atomic·Cas64(uint64 *ptr, uint64 old, uint64 new)
+// bool Cas(uint32 *ptr, uint32 old, uint32 new)
+// Atomically:
+// if(*val == old){
+// *val = new;
+// return 1;
+// } else
+// return 0;
+TEXT ·Cas(SB), NOSPLIT, $0-17
+ MOVD ptr+0(FP), R0
+ MOVW old+8(FP), R1
+ MOVW new+12(FP), R2
+again:
+ LDAXRW (R0), R3
+ CMPW R1, R3
+ BNE ok
+ STLXRW R2, (R0), R3
+ CBNZ R3, again
+ok:
+ CSET EQ, R0
+ MOVB R0, ret+16(FP)
+ RET
+
+// bool ·Cas64(uint64 *ptr, uint64 old, uint64 new)
// Atomically:
// if(*val == *old){
// *val = new;
@@ -102,7 +179,7 @@ again:
// } else {
// return 0;
// }
-TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-25
+TEXT ·Cas64(SB), NOSPLIT, $0-25
MOVD ptr+0(FP), R0
MOVD old+8(FP), R1
MOVD new+16(FP), R2
@@ -121,7 +198,7 @@ ok:
// Atomically:
// *val += delta;
// return *val;
-TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-20
+TEXT ·Xadd(SB), NOSPLIT, $0-20
MOVD ptr+0(FP), R0
MOVW delta+8(FP), R1
again:
@@ -132,7 +209,11 @@ again:
MOVW R2, ret+16(FP)
RET
-TEXT runtime∕internal∕atomic·Xadd64(SB), NOSPLIT, $0-24
+// uint64 Xadd64(uint64 volatile *ptr, int64 delta)
+// Atomically:
+// *val += delta;
+// return *val;
+TEXT ·Xadd64(SB), NOSPLIT, $0-24
MOVD ptr+0(FP), R0
MOVD delta+8(FP), R1
again:
@@ -143,8 +224,14 @@ again:
MOVD R2, ret+16(FP)
RET
-TEXT runtime∕internal∕atomic·Xchguintptr(SB), NOSPLIT, $0-24
- B runtime∕internal∕atomic·Xchg64(SB)
+TEXT ·Xchgint32(SB), NOSPLIT, $0-20
+ B ·Xchg(SB)
+
+TEXT ·Xchgint64(SB), NOSPLIT, $0-24
+ B ·Xchg64(SB)
+
+TEXT ·Xchguintptr(SB), NOSPLIT, $0-24
+ B ·Xchg64(SB)
TEXT ·And8(SB), NOSPLIT, $0-9
MOVD ptr+0(FP), R0
diff --git a/src/runtime/internal/atomic/atomic_mips64x.go b/src/runtime/internal/atomic/atomic_mips64x.go
index b0109d72b0..5b407ebc7a 100644
--- a/src/runtime/internal/atomic/atomic_mips64x.go
+++ b/src/runtime/internal/atomic/atomic_mips64x.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build mips64 || mips64le
// +build mips64 mips64le
package atomic
diff --git a/src/runtime/internal/atomic/atomic_mips64x.s b/src/runtime/internal/atomic/atomic_mips64x.s
index 125c0c221c..2751c6f808 100644
--- a/src/runtime/internal/atomic/atomic_mips64x.s
+++ b/src/runtime/internal/atomic/atomic_mips64x.s
@@ -8,7 +8,309 @@
#define SYNC WORD $0xf
-// uint32 runtime∕internal∕atomic·Load(uint32 volatile* ptr)
+// bool cas(uint32 *ptr, uint32 old, uint32 new)
+// Atomically:
+// if(*val == old){
+// *val = new;
+// return 1;
+// } else
+// return 0;
+TEXT ·Cas(SB), NOSPLIT, $0-17
+ MOVV ptr+0(FP), R1
+ MOVW old+8(FP), R2
+ MOVW new+12(FP), R5
+ SYNC
+cas_again:
+ MOVV R5, R3
+ LL (R1), R4
+ BNE R2, R4, cas_fail
+ SC R3, (R1)
+ BEQ R3, cas_again
+ MOVV $1, R1
+ MOVB R1, ret+16(FP)
+ SYNC
+ RET
+cas_fail:
+ MOVV $0, R1
+ JMP -4(PC)
+
+// bool cas64(uint64 *ptr, uint64 old, uint64 new)
+// Atomically:
+// if(*val == old){
+// *val = new;
+// return 1;
+// } else {
+// return 0;
+// }
+TEXT ·Cas64(SB), NOSPLIT, $0-25
+ MOVV ptr+0(FP), R1
+ MOVV old+8(FP), R2
+ MOVV new+16(FP), R5
+ SYNC
+cas64_again:
+ MOVV R5, R3
+ LLV (R1), R4
+ BNE R2, R4, cas64_fail
+ SCV R3, (R1)
+ BEQ R3, cas64_again
+ MOVV $1, R1
+ MOVB R1, ret+24(FP)
+ SYNC
+ RET
+cas64_fail:
+ MOVV $0, R1
+ JMP -4(PC)
+
+TEXT ·Casint32(SB), NOSPLIT, $0-17
+ JMP ·Cas(SB)
+
+TEXT ·Casint64(SB), NOSPLIT, $0-25
+ JMP ·Cas64(SB)
+
+TEXT ·Casuintptr(SB), NOSPLIT, $0-25
+ JMP ·Cas64(SB)
+
+TEXT ·CasRel(SB), NOSPLIT, $0-17
+ JMP ·Cas(SB)
+
+TEXT ·Loaduintptr(SB), NOSPLIT|NOFRAME, $0-16
+ JMP ·Load64(SB)
+
+TEXT ·Loaduint(SB), NOSPLIT|NOFRAME, $0-16
+ JMP ·Load64(SB)
+
+TEXT ·Storeint32(SB), NOSPLIT, $0-12
+ JMP ·Store(SB)
+
+TEXT ·Storeint64(SB), NOSPLIT, $0-16
+ JMP ·Store64(SB)
+
+TEXT ·Storeuintptr(SB), NOSPLIT, $0-16
+ JMP ·Store64(SB)
+
+TEXT ·Xadduintptr(SB), NOSPLIT, $0-24
+ JMP ·Xadd64(SB)
+
+TEXT ·Loadint32(SB), NOSPLIT, $0-12
+ JMP ·Load(SB)
+
+TEXT ·Loadint64(SB), NOSPLIT, $0-16
+ JMP ·Load64(SB)
+
+TEXT ·Xaddint32(SB), NOSPLIT, $0-20
+ JMP ·Xadd(SB)
+
+TEXT ·Xaddint64(SB), NOSPLIT, $0-24
+ JMP ·Xadd64(SB)
+
+// bool casp(void **val, void *old, void *new)
+// Atomically:
+// if(*val == old){
+// *val = new;
+// return 1;
+// } else
+// return 0;
+TEXT ·Casp1(SB), NOSPLIT, $0-25
+ JMP ·Cas64(SB)
+
+// uint32 xadd(uint32 volatile *ptr, int32 delta)
+// Atomically:
+// *val += delta;
+// return *val;
+TEXT ·Xadd(SB), NOSPLIT, $0-20
+ MOVV ptr+0(FP), R2
+ MOVW delta+8(FP), R3
+ SYNC
+ LL (R2), R1
+ ADDU R1, R3, R4
+ MOVV R4, R1
+ SC R4, (R2)
+ BEQ R4, -4(PC)
+ MOVW R1, ret+16(FP)
+ SYNC
+ RET
+
+// uint64 Xadd64(uint64 volatile *ptr, int64 delta)
+// Atomically:
+// *val += delta;
+// return *val;
+TEXT ·Xadd64(SB), NOSPLIT, $0-24
+ MOVV ptr+0(FP), R2
+ MOVV delta+8(FP), R3
+ SYNC
+ LLV (R2), R1
+ ADDVU R1, R3, R4
+ MOVV R4, R1
+ SCV R4, (R2)
+ BEQ R4, -4(PC)
+ MOVV R1, ret+16(FP)
+ SYNC
+ RET
+
+// uint32 Xchg(ptr *uint32, new uint32)
+// Atomically:
+// old := *ptr;
+// *ptr = new;
+// return old;
+TEXT ·Xchg(SB), NOSPLIT, $0-20
+ MOVV ptr+0(FP), R2
+ MOVW new+8(FP), R5
+
+ SYNC
+ MOVV R5, R3
+ LL (R2), R1
+ SC R3, (R2)
+ BEQ R3, -3(PC)
+ MOVW R1, ret+16(FP)
+ SYNC
+ RET
+
+// uint64 Xchg64(ptr *uint64, new uint64)
+// Atomically:
+// old := *ptr;
+// *ptr = new;
+// return old;
+TEXT ·Xchg64(SB), NOSPLIT, $0-24
+ MOVV ptr+0(FP), R2
+ MOVV new+8(FP), R5
+
+ SYNC
+ MOVV R5, R3
+ LLV (R2), R1
+ SCV R3, (R2)
+ BEQ R3, -3(PC)
+ MOVV R1, ret+16(FP)
+ SYNC
+ RET
+
+TEXT ·Xchgint32(SB), NOSPLIT, $0-20
+ JMP ·Xchg(SB)
+
+TEXT ·Xchgint64(SB), NOSPLIT, $0-24
+ JMP ·Xchg64(SB)
+
+TEXT ·Xchguintptr(SB), NOSPLIT, $0-24
+ JMP ·Xchg64(SB)
+
+TEXT ·StorepNoWB(SB), NOSPLIT, $0-16
+ JMP ·Store64(SB)
+
+TEXT ·StoreRel(SB), NOSPLIT, $0-12
+ JMP ·Store(SB)
+
+TEXT ·StoreRel64(SB), NOSPLIT, $0-16
+ JMP ·Store64(SB)
+
+TEXT ·StoreReluintptr(SB), NOSPLIT, $0-16
+ JMP ·Store64(SB)
+
+TEXT ·Store(SB), NOSPLIT, $0-12
+ MOVV ptr+0(FP), R1
+ MOVW val+8(FP), R2
+ SYNC
+ MOVW R2, 0(R1)
+ SYNC
+ RET
+
+TEXT ·Store8(SB), NOSPLIT, $0-9
+ MOVV ptr+0(FP), R1
+ MOVB val+8(FP), R2
+ SYNC
+ MOVB R2, 0(R1)
+ SYNC
+ RET
+
+TEXT ·Store64(SB), NOSPLIT, $0-16
+ MOVV ptr+0(FP), R1
+ MOVV val+8(FP), R2
+ SYNC
+ MOVV R2, 0(R1)
+ SYNC
+ RET
+
+// void Or8(byte volatile*, byte);
+TEXT ·Or8(SB), NOSPLIT, $0-9
+ MOVV ptr+0(FP), R1
+ MOVBU val+8(FP), R2
+ // Align ptr down to 4 bytes so we can use 32-bit load/store.
+ MOVV $~3, R3
+ AND R1, R3
+ // Compute val shift.
+#ifdef GOARCH_mips64
+ // Big endian. ptr = ptr ^ 3
+ XOR $3, R1
+#endif
+ // R4 = ((ptr & 3) * 8)
+ AND $3, R1, R4
+ SLLV $3, R4
+ // Shift val for aligned ptr. R2 = val << R4
+ SLLV R4, R2
+
+ SYNC
+ LL (R3), R4
+ OR R2, R4
+ SC R4, (R3)
+ BEQ R4, -4(PC)
+ SYNC
+ RET
+
+// void And8(byte volatile*, byte);
+TEXT ·And8(SB), NOSPLIT, $0-9
+ MOVV ptr+0(FP), R1
+ MOVBU val+8(FP), R2
+ // Align ptr down to 4 bytes so we can use 32-bit load/store.
+ MOVV $~3, R3
+ AND R1, R3
+ // Compute val shift.
+#ifdef GOARCH_mips64
+ // Big endian. ptr = ptr ^ 3
+ XOR $3, R1
+#endif
+ // R4 = ((ptr & 3) * 8)
+ AND $3, R1, R4
+ SLLV $3, R4
+ // Shift val for aligned ptr. R2 = val << R4 | ^(0xFF << R4)
+ MOVV $0xFF, R5
+ SLLV R4, R2
+ SLLV R4, R5
+ NOR R0, R5
+ OR R5, R2
+
+ SYNC
+ LL (R3), R4
+ AND R2, R4
+ SC R4, (R3)
+ BEQ R4, -4(PC)
+ SYNC
+ RET
+
+// func Or(addr *uint32, v uint32)
+TEXT ·Or(SB), NOSPLIT, $0-12
+ MOVV ptr+0(FP), R1
+ MOVW val+8(FP), R2
+
+ SYNC
+ LL (R1), R3
+ OR R2, R3
+ SC R3, (R1)
+ BEQ R3, -4(PC)
+ SYNC
+ RET
+
+// func And(addr *uint32, v uint32)
+TEXT ·And(SB), NOSPLIT, $0-12
+ MOVV ptr+0(FP), R1
+ MOVW val+8(FP), R2
+
+ SYNC
+ LL (R1), R3
+ AND R2, R3
+ SC R3, (R1)
+ BEQ R3, -4(PC)
+ SYNC
+ RET
+
+// uint32 ·Load(uint32 volatile* ptr)
TEXT ·Load(SB),NOSPLIT|NOFRAME,$0-12
MOVV ptr+0(FP), R1
SYNC
@@ -17,7 +319,7 @@ TEXT ·Load(SB),NOSPLIT|NOFRAME,$0-12
MOVW R1, ret+8(FP)
RET
-// uint8 runtime∕internal∕atomic·Load8(uint8 volatile* ptr)
+// uint8 ·Load8(uint8 volatile* ptr)
TEXT ·Load8(SB),NOSPLIT|NOFRAME,$0-9
MOVV ptr+0(FP), R1
SYNC
@@ -26,7 +328,7 @@ TEXT ·Load8(SB),NOSPLIT|NOFRAME,$0-9
MOVB R1, ret+8(FP)
RET
-// uint64 runtime∕internal∕atomic·Load64(uint64 volatile* ptr)
+// uint64 ·Load64(uint64 volatile* ptr)
TEXT ·Load64(SB),NOSPLIT|NOFRAME,$0-16
MOVV ptr+0(FP), R1
SYNC
@@ -35,7 +337,7 @@ TEXT ·Load64(SB),NOSPLIT|NOFRAME,$0-16
MOVV R1, ret+8(FP)
RET
-// void *runtime∕internal∕atomic·Loadp(void *volatile *ptr)
+// void *·Loadp(void *volatile *ptr)
TEXT ·Loadp(SB),NOSPLIT|NOFRAME,$0-16
MOVV ptr+0(FP), R1
SYNC
@@ -44,14 +346,14 @@ TEXT ·Loadp(SB),NOSPLIT|NOFRAME,$0-16
MOVV R1, ret+8(FP)
RET
-// uint32 runtime∕internal∕atomic·LoadAcq(uint32 volatile* ptr)
+// uint32 ·LoadAcq(uint32 volatile* ptr)
TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$0-12
JMP atomic·Load(SB)
-// uint64 runtime∕internal∕atomic·LoadAcq64(uint64 volatile* ptr)
+// uint64 ·LoadAcq64(uint64 volatile* ptr)
TEXT ·LoadAcq64(SB),NOSPLIT|NOFRAME,$0-16
JMP atomic·Load64(SB)
-// uintptr runtime∕internal∕atomic·LoadAcquintptr(uintptr volatile* ptr)
+// uintptr ·LoadAcquintptr(uintptr volatile* ptr)
TEXT ·LoadAcquintptr(SB),NOSPLIT|NOFRAME,$0-16
JMP atomic·Load64(SB)
diff --git a/src/runtime/internal/atomic/atomic_mipsx.go b/src/runtime/internal/atomic/atomic_mipsx.go
index 1336b50121..80cd65b333 100644
--- a/src/runtime/internal/atomic/atomic_mipsx.go
+++ b/src/runtime/internal/atomic/atomic_mipsx.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build mips || mipsle
// +build mips mipsle
// Export some functions via linkname to assembly in sync/atomic.
diff --git a/src/runtime/internal/atomic/atomic_mipsx.s b/src/runtime/internal/atomic/atomic_mipsx.s
index aeebc8f2ff..3f61321450 100644
--- a/src/runtime/internal/atomic/atomic_mipsx.s
+++ b/src/runtime/internal/atomic/atomic_mipsx.s
@@ -6,6 +6,239 @@
#include "textflag.h"
+// bool Cas(int32 *val, int32 old, int32 new)
+// Atomically:
+// if(*val == old){
+// *val = new;
+// return 1;
+// } else
+// return 0;
+TEXT ·Cas(SB),NOSPLIT,$0-13
+ MOVW ptr+0(FP), R1
+ MOVW old+4(FP), R2
+ MOVW new+8(FP), R5
+ SYNC
+try_cas:
+ MOVW R5, R3
+ LL (R1), R4 // R4 = *R1
+ BNE R2, R4, cas_fail
+ SC R3, (R1) // *R1 = R3
+ BEQ R3, try_cas
+ SYNC
+ MOVB R3, ret+12(FP)
+ RET
+cas_fail:
+ MOVB R0, ret+12(FP)
+ RET
+
+TEXT ·Store(SB),NOSPLIT,$0-8
+ MOVW ptr+0(FP), R1
+ MOVW val+4(FP), R2
+ SYNC
+ MOVW R2, 0(R1)
+ SYNC
+ RET
+
+TEXT ·Store8(SB),NOSPLIT,$0-5
+ MOVW ptr+0(FP), R1
+ MOVB val+4(FP), R2
+ SYNC
+ MOVB R2, 0(R1)
+ SYNC
+ RET
+
+TEXT ·Load(SB),NOSPLIT,$0-8
+ MOVW ptr+0(FP), R1
+ SYNC
+ MOVW 0(R1), R1
+ SYNC
+ MOVW R1, ret+4(FP)
+ RET
+
+TEXT ·Load8(SB),NOSPLIT,$0-5
+ MOVW ptr+0(FP), R1
+ SYNC
+ MOVB 0(R1), R1
+ SYNC
+ MOVB R1, ret+4(FP)
+ RET
+
+// uint32 Xadd(uint32 volatile *val, int32 delta)
+// Atomically:
+// *val += delta;
+// return *val;
+TEXT ·Xadd(SB),NOSPLIT,$0-12
+ MOVW ptr+0(FP), R2
+ MOVW delta+4(FP), R3
+ SYNC
+try_xadd:
+ LL (R2), R1 // R1 = *R2
+ ADDU R1, R3, R4
+ MOVW R4, R1
+ SC R4, (R2) // *R2 = R4
+ BEQ R4, try_xadd
+ SYNC
+ MOVW R1, ret+8(FP)
+ RET
+
+// uint32 Xchg(ptr *uint32, new uint32)
+// Atomically:
+// old := *ptr;
+// *ptr = new;
+// return old;
+TEXT ·Xchg(SB),NOSPLIT,$0-12
+ MOVW ptr+0(FP), R2
+ MOVW new+4(FP), R5
+ SYNC
+try_xchg:
+ MOVW R5, R3
+ LL (R2), R1 // R1 = *R2
+ SC R3, (R2) // *R2 = R3
+ BEQ R3, try_xchg
+ SYNC
+ MOVW R1, ret+8(FP)
+ RET
+
+TEXT ·Casint32(SB),NOSPLIT,$0-13
+ JMP ·Cas(SB)
+
+TEXT ·Casint64(SB),NOSPLIT,$0-21
+ JMP ·Cas64(SB)
+
+TEXT ·Casuintptr(SB),NOSPLIT,$0-13
+ JMP ·Cas(SB)
+
+TEXT ·CasRel(SB),NOSPLIT,$0-13
+ JMP ·Cas(SB)
+
+TEXT ·Loaduintptr(SB),NOSPLIT,$0-8
+ JMP ·Load(SB)
+
+TEXT ·Loaduint(SB),NOSPLIT,$0-8
+ JMP ·Load(SB)
+
+TEXT ·Loadp(SB),NOSPLIT,$-0-8
+ JMP ·Load(SB)
+
+TEXT ·Storeint32(SB),NOSPLIT,$0-8
+ JMP ·Store(SB)
+
+TEXT ·Storeint64(SB),NOSPLIT,$0-12
+ JMP ·Store64(SB)
+
+TEXT ·Storeuintptr(SB),NOSPLIT,$0-8
+ JMP ·Store(SB)
+
+TEXT ·Xadduintptr(SB),NOSPLIT,$0-12
+ JMP ·Xadd(SB)
+
+TEXT ·Loadint32(SB),NOSPLIT,$0-8
+ JMP ·Load(SB)
+
+TEXT ·Loadint64(SB),NOSPLIT,$0-12
+ JMP ·Load64(SB)
+
+TEXT ·Xaddint32(SB),NOSPLIT,$0-12
+ JMP ·Xadd(SB)
+
+TEXT ·Xaddint64(SB),NOSPLIT,$0-20
+ JMP ·Xadd64(SB)
+
+TEXT ·Casp1(SB),NOSPLIT,$0-13
+ JMP ·Cas(SB)
+
+TEXT ·Xchgint32(SB),NOSPLIT,$0-12
+ JMP ·Xchg(SB)
+
+TEXT ·Xchgint64(SB),NOSPLIT,$0-20
+ JMP ·Xchg64(SB)
+
+TEXT ·Xchguintptr(SB),NOSPLIT,$0-12
+ JMP ·Xchg(SB)
+
+TEXT ·StorepNoWB(SB),NOSPLIT,$0-8
+ JMP ·Store(SB)
+
+TEXT ·StoreRel(SB),NOSPLIT,$0-8
+ JMP ·Store(SB)
+
+TEXT ·StoreReluintptr(SB),NOSPLIT,$0-8
+ JMP ·Store(SB)
+
+// void Or8(byte volatile*, byte);
+TEXT ·Or8(SB),NOSPLIT,$0-5
+ MOVW ptr+0(FP), R1
+ MOVBU val+4(FP), R2
+ MOVW $~3, R3 // Align ptr down to 4 bytes so we can use 32-bit load/store.
+ AND R1, R3
+#ifdef GOARCH_mips
+ // Big endian. ptr = ptr ^ 3
+ XOR $3, R1
+#endif
+ AND $3, R1, R4 // R4 = ptr & 3
+ SLL $3, R4 // R4 = (ptr & 3) * 8
+ SLL R4, R2, R2 // Shift val for aligned ptr. R2 = val << R4
+ SYNC
+try_or8:
+ LL (R3), R4 // R4 = *R3
+ OR R2, R4
+ SC R4, (R3) // *R3 = R4
+ BEQ R4, try_or8
+ SYNC
+ RET
+
+// void And8(byte volatile*, byte);
+TEXT ·And8(SB),NOSPLIT,$0-5
+ MOVW ptr+0(FP), R1
+ MOVBU val+4(FP), R2
+ MOVW $~3, R3
+ AND R1, R3
+#ifdef GOARCH_mips
+ // Big endian. ptr = ptr ^ 3
+ XOR $3, R1
+#endif
+ AND $3, R1, R4 // R4 = ptr & 3
+ SLL $3, R4 // R4 = (ptr & 3) * 8
+ MOVW $0xFF, R5
+ SLL R4, R2
+ SLL R4, R5
+ NOR R0, R5
+ OR R5, R2 // Shift val for aligned ptr. R2 = val << R4 | ^(0xFF << R4)
+ SYNC
+try_and8:
+ LL (R3), R4 // R4 = *R3
+ AND R2, R4
+ SC R4, (R3) // *R3 = R4
+ BEQ R4, try_and8
+ SYNC
+ RET
+
+// func Or(addr *uint32, v uint32)
+TEXT ·Or(SB), NOSPLIT, $0-8
+ MOVW ptr+0(FP), R1
+ MOVW val+4(FP), R2
+
+ SYNC
+ LL (R1), R3
+ OR R2, R3
+ SC R3, (R1)
+ BEQ R3, -4(PC)
+ SYNC
+ RET
+
+// func And(addr *uint32, v uint32)
+TEXT ·And(SB), NOSPLIT, $0-8
+ MOVW ptr+0(FP), R1
+ MOVW val+4(FP), R2
+
+ SYNC
+ LL (R1), R3
+ AND R2, R3
+ SC R3, (R1)
+ BEQ R3, -4(PC)
+ SYNC
+ RET
+
TEXT ·spinLock(SB),NOSPLIT,$0-4
MOVW state+0(FP), R1
MOVW $1, R2
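MIPS LL/SC reserves only 32-bit words, so the Or8/And8 routines added above align the pointer down to its containing word, shift the operand into the right byte lane (XORing the low pointer bits first on big-endian GOARCH=mips), and retry the LL/SC loop on the whole word. A rough Go-level sketch of the same trick, substituting a 32-bit CAS for LL/SC (illustrative only, assuming little-endian lane order; portable code should not do byte atomics this way):

	package main

	import (
		"fmt"
		"sync/atomic"
		"unsafe"
	)

	// or8 ORs v into the byte at p by retrying a 32-bit CAS on the
	// aligned word containing it -- the same idea as the LL/SC loop.
	func or8(p *uint8, v uint8) {
		addr := uintptr(unsafe.Pointer(p))
		word := (*uint32)(unsafe.Pointer(addr &^ 3)) // align down to 4 bytes
		shift := (addr & 3) * 8                      // little-endian byte lane
		for {
			old := atomic.LoadUint32(word)
			if atomic.CompareAndSwapUint32(word, old, old|uint32(v)<<shift) {
				return
			}
		}
	}

	func main() {
		var w uint32 // word-aligned backing store for four bytes
		b := (*[4]uint8)(unsafe.Pointer(&w))
		or8(&b[1], 0x81)
		fmt.Printf("%#x\n", w) // 0x8100 on a little-endian machine
	}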
diff --git a/src/runtime/internal/atomic/atomic_ppc64x.go b/src/runtime/internal/atomic/atomic_ppc64x.go
index e4b109f0ec..98101e3287 100644
--- a/src/runtime/internal/atomic/atomic_ppc64x.go
+++ b/src/runtime/internal/atomic/atomic_ppc64x.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ppc64 || ppc64le
// +build ppc64 ppc64le
package atomic
diff --git a/src/runtime/internal/atomic/atomic_ppc64x.s b/src/runtime/internal/atomic/atomic_ppc64x.s
index b79cdbca34..37c8515d37 100644
--- a/src/runtime/internal/atomic/atomic_ppc64x.s
+++ b/src/runtime/internal/atomic/atomic_ppc64x.s
@@ -15,7 +15,7 @@
//
// http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
-// uint32 runtime∕internal∕atomic·Load(uint32 volatile* ptr)
+// uint32 ·Load(uint32 volatile* ptr)
TEXT ·Load(SB),NOSPLIT|NOFRAME,$-8-12
MOVD ptr+0(FP), R3
SYNC
@@ -26,7 +26,7 @@ TEXT ·Load(SB),NOSPLIT|NOFRAME,$-8-12
MOVW R3, ret+8(FP)
RET
-// uint8 runtime∕internal∕atomic·Load8(uint8 volatile* ptr)
+// uint8 ·Load8(uint8 volatile* ptr)
TEXT ·Load8(SB),NOSPLIT|NOFRAME,$-8-9
MOVD ptr+0(FP), R3
SYNC
@@ -37,7 +37,7 @@ TEXT ·Load8(SB),NOSPLIT|NOFRAME,$-8-9
MOVB R3, ret+8(FP)
RET
-// uint64 runtime∕internal∕atomic·Load64(uint64 volatile* ptr)
+// uint64 ·Load64(uint64 volatile* ptr)
TEXT ·Load64(SB),NOSPLIT|NOFRAME,$-8-16
MOVD ptr+0(FP), R3
SYNC
@@ -48,7 +48,7 @@ TEXT ·Load64(SB),NOSPLIT|NOFRAME,$-8-16
MOVD R3, ret+8(FP)
RET
-// void *runtime∕internal∕atomic·Loadp(void *volatile *ptr)
+// void *·Loadp(void *volatile *ptr)
TEXT ·Loadp(SB),NOSPLIT|NOFRAME,$-8-16
MOVD ptr+0(FP), R3
SYNC
@@ -59,7 +59,7 @@ TEXT ·Loadp(SB),NOSPLIT|NOFRAME,$-8-16
MOVD R3, ret+8(FP)
RET
-// uint32 runtime∕internal∕atomic·LoadAcq(uint32 volatile* ptr)
+// uint32 ·LoadAcq(uint32 volatile* ptr)
TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$-8-12
MOVD ptr+0(FP), R3
MOVWZ 0(R3), R3
@@ -69,7 +69,7 @@ TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$-8-12
MOVW R3, ret+8(FP)
RET
-// uint64 runtime∕internal∕atomic·LoadAcq64(uint64 volatile* ptr)
+// uint64 ·LoadAcq64(uint64 volatile* ptr)
TEXT ·LoadAcq64(SB),NOSPLIT|NOFRAME,$-8-16
MOVD ptr+0(FP), R3
MOVD 0(R3), R3
@@ -78,3 +78,286 @@ TEXT ·LoadAcq64(SB),NOSPLIT|NOFRAME,$-8-16
ISYNC
MOVD R3, ret+8(FP)
RET
+
+// bool Cas(uint32 *ptr, uint32 old, uint32 new)
+// Atomically:
+// if(*ptr == old){
+// *ptr = new;
+// return 1;
+// } else
+// return 0;
+TEXT ·Cas(SB), NOSPLIT, $0-17
+ MOVD ptr+0(FP), R3
+ MOVWZ old+8(FP), R4
+ MOVWZ new+12(FP), R5
+ LWSYNC
+cas_again:
+ LWAR (R3), R6
+ CMPW R6, R4
+ BNE cas_fail
+ STWCCC R5, (R3)
+ BNE cas_again
+ MOVD $1, R3
+ LWSYNC
+ MOVB R3, ret+16(FP)
+ RET
+cas_fail:
+ MOVB R0, ret+16(FP)
+ RET
+
+// bool ·Cas64(uint64 *ptr, uint64 old, uint64 new)
+// Atomically:
+// if(*ptr == old){
+// *ptr = new;
+// return 1;
+// } else {
+// return 0;
+// }
+TEXT ·Cas64(SB), NOSPLIT, $0-25
+ MOVD ptr+0(FP), R3
+ MOVD old+8(FP), R4
+ MOVD new+16(FP), R5
+ LWSYNC
+cas64_again:
+ LDAR (R3), R6
+ CMP R6, R4
+ BNE cas64_fail
+ STDCCC R5, (R3)
+ BNE cas64_again
+ MOVD $1, R3
+ LWSYNC
+ MOVB R3, ret+24(FP)
+ RET
+cas64_fail:
+ MOVB R0, ret+24(FP)
+ RET
+
+TEXT ·CasRel(SB), NOSPLIT, $0-17
+ MOVD ptr+0(FP), R3
+ MOVWZ old+8(FP), R4
+ MOVWZ new+12(FP), R5
+ LWSYNC
+cas_again:
+ LWAR (R3), $0, R6 // 0 = Mutex release hint
+ CMPW R6, R4
+ BNE cas_fail
+ STWCCC R5, (R3)
+ BNE cas_again
+ MOVD $1, R3
+ MOVB R3, ret+16(FP)
+ RET
+cas_fail:
+ MOVB R0, ret+16(FP)
+ RET
+
+TEXT ·Casint32(SB), NOSPLIT, $0-17
+ BR ·Cas(SB)
+
+TEXT ·Casint64(SB), NOSPLIT, $0-25
+ BR ·Cas64(SB)
+
+TEXT ·Casuintptr(SB), NOSPLIT, $0-25
+ BR ·Cas64(SB)
+
+TEXT ·Loaduintptr(SB), NOSPLIT|NOFRAME, $0-16
+ BR ·Load64(SB)
+
+TEXT ·LoadAcquintptr(SB), NOSPLIT|NOFRAME, $0-16
+ BR ·LoadAcq64(SB)
+
+TEXT ·Loaduint(SB), NOSPLIT|NOFRAME, $0-16
+ BR ·Load64(SB)
+
+TEXT ·Storeint32(SB), NOSPLIT, $0-12
+ BR ·Store(SB)
+
+TEXT ·Storeint64(SB), NOSPLIT, $0-16
+ BR ·Store64(SB)
+
+TEXT ·Storeuintptr(SB), NOSPLIT, $0-16
+ BR ·Store64(SB)
+
+TEXT ·StoreReluintptr(SB), NOSPLIT, $0-16
+ BR ·StoreRel64(SB)
+
+TEXT ·Xadduintptr(SB), NOSPLIT, $0-24
+ BR ·Xadd64(SB)
+
+TEXT ·Loadint32(SB), NOSPLIT, $0-12
+ BR ·Load(SB)
+
+TEXT ·Loadint64(SB), NOSPLIT, $0-16
+ BR ·Load64(SB)
+
+TEXT ·Xaddint32(SB), NOSPLIT, $0-20
+ BR ·Xadd(SB)
+
+TEXT ·Xaddint64(SB), NOSPLIT, $0-24
+ BR ·Xadd64(SB)
+
+// bool casp(void **val, void *old, void *new)
+// Atomically:
+// if(*val == old){
+// *val = new;
+// return 1;
+// } else
+// return 0;
+TEXT ·Casp1(SB), NOSPLIT, $0-25
+ BR ·Cas64(SB)
+
+// uint32 xadd(uint32 volatile *ptr, int32 delta)
+// Atomically:
+// *val += delta;
+// return *val;
+TEXT ·Xadd(SB), NOSPLIT, $0-20
+ MOVD ptr+0(FP), R4
+ MOVW delta+8(FP), R5
+ LWSYNC
+ LWAR (R4), R3
+ ADD R5, R3
+ STWCCC R3, (R4)
+ BNE -3(PC)
+ MOVW R3, ret+16(FP)
+ RET
+
+// uint64 Xadd64(uint64 volatile *val, int64 delta)
+// Atomically:
+// *val += delta;
+// return *val;
+TEXT ·Xadd64(SB), NOSPLIT, $0-24
+ MOVD ptr+0(FP), R4
+ MOVD delta+8(FP), R5
+ LWSYNC
+ LDAR (R4), R3
+ ADD R5, R3
+ STDCCC R3, (R4)
+ BNE -3(PC)
+ MOVD R3, ret+16(FP)
+ RET
+
+// uint32 Xchg(ptr *uint32, new uint32)
+// Atomically:
+// old := *ptr;
+// *ptr = new;
+// return old;
+TEXT ·Xchg(SB), NOSPLIT, $0-20
+ MOVD ptr+0(FP), R4
+ MOVW new+8(FP), R5
+ LWSYNC
+ LWAR (R4), R3
+ STWCCC R5, (R4)
+ BNE -2(PC)
+ ISYNC
+ MOVW R3, ret+16(FP)
+ RET
+
+// uint64 Xchg64(ptr *uint64, new uint64)
+// Atomically:
+// old := *ptr;
+// *ptr = new;
+// return old;
+TEXT ·Xchg64(SB), NOSPLIT, $0-24
+ MOVD ptr+0(FP), R4
+ MOVD new+8(FP), R5
+ LWSYNC
+ LDAR (R4), R3
+ STDCCC R5, (R4)
+ BNE -2(PC)
+ ISYNC
+ MOVD R3, ret+16(FP)
+ RET
+
+TEXT ·Xchgint32(SB), NOSPLIT, $0-20
+ BR ·Xchg(SB)
+
+TEXT ·Xchgint64(SB), NOSPLIT, $0-24
+ BR ·Xchg64(SB)
+
+TEXT ·Xchguintptr(SB), NOSPLIT, $0-24
+ BR ·Xchg64(SB)
+
+TEXT ·StorepNoWB(SB), NOSPLIT, $0-16
+ BR ·Store64(SB)
+
+TEXT ·Store(SB), NOSPLIT, $0-12
+ MOVD ptr+0(FP), R3
+ MOVW val+8(FP), R4
+ SYNC
+ MOVW R4, 0(R3)
+ RET
+
+TEXT ·Store8(SB), NOSPLIT, $0-9
+ MOVD ptr+0(FP), R3
+ MOVB val+8(FP), R4
+ SYNC
+ MOVB R4, 0(R3)
+ RET
+
+TEXT ·Store64(SB), NOSPLIT, $0-16
+ MOVD ptr+0(FP), R3
+ MOVD val+8(FP), R4
+ SYNC
+ MOVD R4, 0(R3)
+ RET
+
+TEXT ·StoreRel(SB), NOSPLIT, $0-12
+ MOVD ptr+0(FP), R3
+ MOVW val+8(FP), R4
+ LWSYNC
+ MOVW R4, 0(R3)
+ RET
+
+TEXT ·StoreRel64(SB), NOSPLIT, $0-16
+ MOVD ptr+0(FP), R3
+ MOVD val+8(FP), R4
+ LWSYNC
+ MOVD R4, 0(R3)
+ RET
+
+// void ·Or8(byte volatile*, byte);
+TEXT ·Or8(SB), NOSPLIT, $0-9
+ MOVD ptr+0(FP), R3
+ MOVBZ val+8(FP), R4
+ LWSYNC
+again:
+ LBAR (R3), R6
+ OR R4, R6
+ STBCCC R6, (R3)
+ BNE again
+ RET
+
+// void ·And8(byte volatile*, byte);
+TEXT ·And8(SB), NOSPLIT, $0-9
+ MOVD ptr+0(FP), R3
+ MOVBZ val+8(FP), R4
+ LWSYNC
+again:
+ LBAR (R3), R6
+ AND R4, R6
+ STBCCC R6, (R3)
+ BNE again
+ RET
+
+// func Or(addr *uint32, v uint32)
+TEXT ·Or(SB), NOSPLIT, $0-12
+ MOVD ptr+0(FP), R3
+ MOVW val+8(FP), R4
+ LWSYNC
+again:
+ LWAR (R3), R6
+ OR R4, R6
+ STWCCC R6, (R3)
+ BNE again
+ RET
+
+// func And(addr *uint32, v uint32)
+TEXT ·And(SB), NOSPLIT, $0-12
+ MOVD ptr+0(FP), R3
+ MOVW val+8(FP), R4
+ LWSYNC
+again:
+ LWAR (R3),R6
+ AND R4, R6
+ STWCCC R6, (R3)
+ BNE again
+ RET
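The mipsx and ppc64x files above implement the same compare-and-swap contract with different primitives: LL/SC on MIPS, LWAR/STWCCC bracketed by LWSYNC barriers on POWER. The observable behavior is what sync/atomic exposes portably; a minimal demonstration of the pseudocode's semantics (a sketch, not the runtime-internal API):

	package main

	import (
		"fmt"
		"sync/atomic"
	)

	func main() {
		var v uint32 = 10
		// Swap happens only when the current value equals old, per the
		// pseudocode: if(*ptr == old){ *ptr = new; return 1 } else return 0.
		fmt.Println(atomic.CompareAndSwapUint32(&v, 10, 20), v) // true 20
		fmt.Println(atomic.CompareAndSwapUint32(&v, 10, 30), v) // false 20
	}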
diff --git a/src/runtime/internal/atomic/atomic_riscv64.s b/src/runtime/internal/atomic/atomic_riscv64.s
index 74c896cea6..ec05302a78 100644
--- a/src/runtime/internal/atomic/atomic_riscv64.s
+++ b/src/runtime/internal/atomic/atomic_riscv64.s
@@ -37,7 +37,6 @@
// } else {
// return 0;
// }
-
TEXT ·Cas(SB), NOSPLIT, $0-17
MOV ptr+0(FP), A0
MOVW old+8(FP), A1
@@ -121,6 +120,12 @@ TEXT ·Store64(SB), NOSPLIT, $0-16
TEXT ·Casp1(SB), NOSPLIT, $0-25
JMP ·Cas64(SB)
+TEXT ·Casint32(SB),NOSPLIT,$0-17
+ JMP ·Cas(SB)
+
+TEXT ·Casint64(SB),NOSPLIT,$0-25
+ JMP ·Cas64(SB)
+
TEXT ·Casuintptr(SB),NOSPLIT,$0-25
JMP ·Cas64(SB)
@@ -130,14 +135,26 @@ TEXT ·CasRel(SB), NOSPLIT, $0-17
TEXT ·Loaduintptr(SB),NOSPLIT,$0-16
JMP ·Load64(SB)
+TEXT ·Storeint32(SB),NOSPLIT,$0-12
+ JMP ·Store(SB)
+
+TEXT ·Storeint64(SB),NOSPLIT,$0-16
+ JMP ·Store64(SB)
+
TEXT ·Storeuintptr(SB),NOSPLIT,$0-16
JMP ·Store64(SB)
TEXT ·Loaduint(SB),NOSPLIT,$0-16
JMP ·Loaduintptr(SB)
+TEXT ·Loadint32(SB),NOSPLIT,$0-12
+ JMP ·Load(SB)
+
TEXT ·Loadint64(SB),NOSPLIT,$0-16
- JMP ·Loaduintptr(SB)
+ JMP ·Load64(SB)
+
+TEXT ·Xaddint32(SB),NOSPLIT,$0-20
+ JMP ·Xadd(SB)
TEXT ·Xaddint64(SB),NOSPLIT,$0-24
MOV ptr+0(FP), A0
@@ -215,6 +232,14 @@ TEXT ·Xadd64(SB), NOSPLIT, $0-24
TEXT ·Xadduintptr(SB), NOSPLIT, $0-24
JMP ·Xadd64(SB)
+// func Xchgint32(ptr *int32, new int32) int32
+TEXT ·Xchgint32(SB), NOSPLIT, $0-20
+ JMP ·Xchg(SB)
+
+// func Xchgint64(ptr *int64, new int64) int64
+TEXT ·Xchgint64(SB), NOSPLIT, $0-24
+ JMP ·Xchg64(SB)
+
// func Xchguintptr(ptr *uintptr, new uintptr) uintptr
TEXT ·Xchguintptr(SB), NOSPLIT, $0-24
JMP ·Xchg64(SB)
diff --git a/src/runtime/internal/atomic/asm_s390x.s b/src/runtime/internal/atomic/atomic_s390x.s
index daf1f3cc9f..a0c204b0e1 100644
--- a/src/runtime/internal/atomic/asm_s390x.s
+++ b/src/runtime/internal/atomic/atomic_s390x.s
@@ -76,6 +76,14 @@ cas64_fail:
MOVB $0, ret+24(FP)
RET
+// func Casint32(ptr *int32, old, new int32) bool
+TEXT ·Casint32(SB), NOSPLIT, $0-17
+ BR ·Cas(SB)
+
+// func Casint64(ptr *int64, old, new int64) bool
+TEXT ·Casint64(SB), NOSPLIT, $0-25
+ BR ·Cas64(SB)
+
// func Casuintptr(ptr *uintptr, old, new uintptr) bool
TEXT ·Casuintptr(SB), NOSPLIT, $0-25
BR ·Cas64(SB)
@@ -92,10 +100,22 @@ TEXT ·Loaduintptr(SB), NOSPLIT, $0-16
TEXT ·Loaduint(SB), NOSPLIT, $0-16
BR ·Load64(SB)
+// func Storeint32(ptr *int32, new int32)
+TEXT ·Storeint32(SB), NOSPLIT, $0-12
+ BR ·Store(SB)
+
+// func Storeint64(ptr *int64, new int64)
+TEXT ·Storeint64(SB), NOSPLIT, $0-16
+ BR ·Store64(SB)
+
// func Storeuintptr(ptr *uintptr, new uintptr)
TEXT ·Storeuintptr(SB), NOSPLIT, $0-16
BR ·Store64(SB)
+// func Loadint32(ptr *int32) int32
+TEXT ·Loadint32(SB), NOSPLIT, $0-12
+ BR ·Load(SB)
+
// func Loadint64(ptr *int64) int64
TEXT ·Loadint64(SB), NOSPLIT, $0-16
BR ·Load64(SB)
@@ -104,6 +124,10 @@ TEXT ·Loadint64(SB), NOSPLIT, $0-16
TEXT ·Xadduintptr(SB), NOSPLIT, $0-24
BR ·Xadd64(SB)
+// func Xaddint32(ptr *int32, delta int32) int32
+TEXT ·Xaddint32(SB), NOSPLIT, $0-20
+ BR ·Xadd(SB)
+
// func Xaddint64(ptr *int64, delta int64) int64
TEXT ·Xaddint64(SB), NOSPLIT, $0-24
BR ·Xadd64(SB)
@@ -168,6 +192,14 @@ repeat:
MOVD R6, ret+16(FP)
RET
+// func Xchgint32(ptr *int32, new int32) int32
+TEXT ·Xchgint32(SB), NOSPLIT, $0-20
+ BR ·Xchg(SB)
+
+// func Xchgint64(ptr *int64, new int64) int64
+TEXT ·Xchgint64(SB), NOSPLIT, $0-24
+ BR ·Xchg64(SB)
+
// func Xchguintptr(ptr *uintptr, new uintptr) uintptr
TEXT ·Xchguintptr(SB), NOSPLIT, $0-24
BR ·Xchg64(SB)
diff --git a/src/runtime/internal/atomic/atomic_wasm.go b/src/runtime/internal/atomic/atomic_wasm.go
index b05d98ed51..3f77f16b4e 100644
--- a/src/runtime/internal/atomic/atomic_wasm.go
+++ b/src/runtime/internal/atomic/atomic_wasm.go
@@ -9,18 +9,28 @@
//go:linkname Load
//go:linkname Loadp
//go:linkname Load64
+//go:linkname Loadint32
+//go:linkname Loadint64
//go:linkname Loaduintptr
//go:linkname Xadd
+//go:linkname Xaddint32
+//go:linkname Xaddint64
//go:linkname Xadd64
//go:linkname Xadduintptr
//go:linkname Xchg
//go:linkname Xchg64
+//go:linkname Xchgint32
+//go:linkname Xchgint64
//go:linkname Xchguintptr
//go:linkname Cas
//go:linkname Cas64
+//go:linkname Casint32
+//go:linkname Casint64
//go:linkname Casuintptr
//go:linkname Store
//go:linkname Store64
+//go:linkname Storeint32
+//go:linkname Storeint64
//go:linkname Storeuintptr
package atomic
@@ -111,6 +121,22 @@ func Xchg64(ptr *uint64, new uint64) uint64 {
//go:nosplit
//go:noinline
+func Xchgint32(ptr *int32, new int32) int32 {
+ old := *ptr
+ *ptr = new
+ return old
+}
+
+//go:nosplit
+//go:noinline
+func Xchgint64(ptr *int64, new int64) int64 {
+ old := *ptr
+ *ptr = new
+ return old
+}
+
+//go:nosplit
+//go:noinline
func Xchguintptr(ptr *uintptr, new uintptr) uintptr {
old := *ptr
*ptr = new
@@ -197,6 +223,26 @@ func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
//go:nosplit
//go:noinline
+func Casint32(ptr *int32, old, new int32) bool {
+ if *ptr == old {
+ *ptr = new
+ return true
+ }
+ return false
+}
+
+//go:nosplit
+//go:noinline
+func Casint64(ptr *int64, old, new int64) bool {
+ if *ptr == old {
+ *ptr = new
+ return true
+ }
+ return false
+}
+
+//go:nosplit
+//go:noinline
func Cas(ptr *uint32, old, new uint32) bool {
if *ptr == old {
*ptr = new
@@ -237,6 +283,18 @@ func CasRel(ptr *uint32, old, new uint32) bool {
//go:nosplit
//go:noinline
+func Storeint32(ptr *int32, new int32) {
+ *ptr = new
+}
+
+//go:nosplit
+//go:noinline
+func Storeint64(ptr *int64, new int64) {
+ *ptr = new
+}
+
+//go:nosplit
+//go:noinline
func Storeuintptr(ptr *uintptr, new uintptr) {
*ptr = new
}
@@ -255,12 +313,26 @@ func Loaduint(ptr *uint) uint {
//go:nosplit
//go:noinline
+func Loadint32(ptr *int32) int32 {
+ return *ptr
+}
+
+//go:nosplit
+//go:noinline
func Loadint64(ptr *int64) int64 {
return *ptr
}
//go:nosplit
//go:noinline
+func Xaddint32(ptr *int32, delta int32) int32 {
+ new := *ptr + delta
+ *ptr = new
+ return new
+}
+
+//go:nosplit
+//go:noinline
func Xaddint64(ptr *int64, delta int64) int64 {
new := *ptr + delta
*ptr = new
diff --git a/src/runtime/internal/atomic/asm_wasm.s b/src/runtime/internal/atomic/atomic_wasm.s
index 7c33cb1ee9..1c2d1ce5e1 100644
--- a/src/runtime/internal/atomic/asm_wasm.s
+++ b/src/runtime/internal/atomic/atomic_wasm.s
@@ -4,7 +4,7 @@
#include "textflag.h"
-TEXT runtime∕internal∕atomic·StorepNoWB(SB), NOSPLIT, $0-16
+TEXT ·StorepNoWB(SB), NOSPLIT, $0-16
MOVD ptr+0(FP), R0
MOVD val+8(FP), 0(R0)
RET
diff --git a/src/runtime/internal/atomic/stubs.go b/src/runtime/internal/atomic/stubs.go
index 62e30d1788..e7544ba448 100644
--- a/src/runtime/internal/atomic/stubs.go
+++ b/src/runtime/internal/atomic/stubs.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !wasm
// +build !wasm
package atomic
@@ -15,9 +16,21 @@ func Cas(ptr *uint32, old, new uint32) bool
func Casp1(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool
//go:noescape
+func Casint32(ptr *int32, old, new int32) bool
+
+//go:noescape
+func Casint64(ptr *int64, old, new int64) bool
+
+//go:noescape
func Casuintptr(ptr *uintptr, old, new uintptr) bool
//go:noescape
+func Storeint32(ptr *int32, new int32)
+
+//go:noescape
+func Storeint64(ptr *int64, new int64)
+
+//go:noescape
func Storeuintptr(ptr *uintptr, new uintptr)
//go:noescape
@@ -29,7 +42,19 @@ func Loaduint(ptr *uint) uint
// TODO(matloob): Should these functions have the go:noescape annotation?
//go:noescape
+func Loadint32(ptr *int32) int32
+
+//go:noescape
func Loadint64(ptr *int64) int64
//go:noescape
+func Xaddint32(ptr *int32, delta int32) int32
+
+//go:noescape
func Xaddint64(ptr *int64, delta int64) int64
+
+//go:noescape
+func Xchgint32(ptr *int32, new int32) int32
+
+//go:noescape
+func Xchgint64(ptr *int64, new int64) int64
diff --git a/src/runtime/internal/atomic/sys_linux_arm.s b/src/runtime/internal/atomic/sys_linux_arm.s
index 192be4b64f..0cc7fa73d1 100644
--- a/src/runtime/internal/atomic/sys_linux_arm.s
+++ b/src/runtime/internal/atomic/sys_linux_arm.s
@@ -24,7 +24,7 @@
TEXT cas<>(SB),NOSPLIT,$0
MOVW $0xffff0fc0, R15 // R15 is hardware PC.
-TEXT runtime∕internal∕atomic·Cas(SB),NOSPLIT|NOFRAME,$0
+TEXT ·Cas(SB),NOSPLIT|NOFRAME,$0
MOVB runtime·goarm(SB), R11
CMP $7, R11
BLT 2(PC)
diff --git a/src/runtime/internal/math/math.go b/src/runtime/internal/math/math.go
index 5385f5dd86..b6bd12d3e8 100644
--- a/src/runtime/internal/math/math.go
+++ b/src/runtime/internal/math/math.go
@@ -17,3 +17,24 @@ func MulUintptr(a, b uintptr) (uintptr, bool) {
overflow := b > MaxUintptr/a
return a * b, overflow
}
+
+// Mul64 returns the 128-bit product of x and y: (hi, lo) = x * y
+// with the product bits' upper half returned in hi and the lower
+// half returned in lo.
+// This is a copy of math/bits.Mul64.
+// On supported platforms this is an intrinsic lowered by the compiler.
+func Mul64(x, y uint64) (hi, lo uint64) {
+ const mask32 = 1<<32 - 1
+ x0 := x & mask32
+ x1 := x >> 32
+ y0 := y & mask32
+ y1 := y >> 32
+ w0 := x0 * y0
+ t := x1*y0 + w0>>32
+ w1 := t & mask32
+ w2 := t >> 32
+ w1 += x0 * y1
+ hi = x1*y1 + w2 + w1>>32
+ lo = x * y
+ return
+}
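Mul64 above is the textbook 32×32 decomposition: the four partial products are accumulated so their carries land in the high word, while the low word is just the wrapping product x*y. A quick cross-check against math/bits.Mul64, which the comment says this copies (a sketch):

	package main

	import (
		"fmt"
		"math/bits"
	)

	// mul64 mirrors runtime/internal/math.Mul64 as added above.
	func mul64(x, y uint64) (hi, lo uint64) {
		const mask32 = 1<<32 - 1
		x0, x1 := x&mask32, x>>32
		y0, y1 := y&mask32, y>>32
		w0 := x0 * y0
		t := x1*y0 + w0>>32
		w1, w2 := t&mask32, t>>32
		w1 += x0 * y1
		return x1*y1 + w2 + w1>>32, x * y
	}

	func main() {
		x, y := uint64(0xDEADBEEFCAFEF00D), uint64(0x0123456789ABCDEF)
		hi, lo := mul64(x, y)
		bhi, blo := bits.Mul64(x, y)
		fmt.Println(hi == bhi && lo == blo) // true
	}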
diff --git a/src/runtime/internal/sys/gengoos.go b/src/runtime/internal/sys/gengoos.go
index 9bbc48d94f..51f64a6e5c 100644
--- a/src/runtime/internal/sys/gengoos.go
+++ b/src/runtime/internal/sys/gengoos.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
package main
diff --git a/src/runtime/internal/sys/intrinsics.go b/src/runtime/internal/sys/intrinsics.go
index 3c8898236c..e76d8dd064 100644
--- a/src/runtime/internal/sys/intrinsics.go
+++ b/src/runtime/internal/sys/intrinsics.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !386
// +build !386
// TODO finish intrinsifying 386, deadcode the assembly, remove build tags, merge w/ intrinsics_common
diff --git a/src/runtime/internal/sys/intrinsics_stubs.go b/src/runtime/internal/sys/intrinsics_stubs.go
index 9cbf48216c..bf1494d48a 100644
--- a/src/runtime/internal/sys/intrinsics_stubs.go
+++ b/src/runtime/internal/sys/intrinsics_stubs.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build 386
// +build 386
package sys
diff --git a/src/runtime/internal/sys/zgoarch_386.go b/src/runtime/internal/sys/zgoarch_386.go
index c286d0df2b..98a2401bfe 100644
--- a/src/runtime/internal/sys/zgoarch_386.go
+++ b/src/runtime/internal/sys/zgoarch_386.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build 386
// +build 386
package sys
diff --git a/src/runtime/internal/sys/zgoarch_amd64.go b/src/runtime/internal/sys/zgoarch_amd64.go
index d21c1d7d2a..d8faa5c786 100644
--- a/src/runtime/internal/sys/zgoarch_amd64.go
+++ b/src/runtime/internal/sys/zgoarch_amd64.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build amd64
// +build amd64
package sys
diff --git a/src/runtime/internal/sys/zgoarch_arm.go b/src/runtime/internal/sys/zgoarch_arm.go
index 9085fb0ea8..b64a69c9b4 100644
--- a/src/runtime/internal/sys/zgoarch_arm.go
+++ b/src/runtime/internal/sys/zgoarch_arm.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build arm
// +build arm
package sys
diff --git a/src/runtime/internal/sys/zgoarch_arm64.go b/src/runtime/internal/sys/zgoarch_arm64.go
index ed7ef2ebcb..de6f85347b 100644
--- a/src/runtime/internal/sys/zgoarch_arm64.go
+++ b/src/runtime/internal/sys/zgoarch_arm64.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build arm64
// +build arm64
package sys
diff --git a/src/runtime/internal/sys/zgoarch_arm64be.go b/src/runtime/internal/sys/zgoarch_arm64be.go
index faf3111053..b762bb069f 100644
--- a/src/runtime/internal/sys/zgoarch_arm64be.go
+++ b/src/runtime/internal/sys/zgoarch_arm64be.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build arm64be
// +build arm64be
package sys
diff --git a/src/runtime/internal/sys/zgoarch_armbe.go b/src/runtime/internal/sys/zgoarch_armbe.go
index cb28301e0b..e5297e4b16 100644
--- a/src/runtime/internal/sys/zgoarch_armbe.go
+++ b/src/runtime/internal/sys/zgoarch_armbe.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build armbe
// +build armbe
package sys
diff --git a/src/runtime/internal/sys/zgoarch_mips.go b/src/runtime/internal/sys/zgoarch_mips.go
index 315dea1c84..b5f4ed390c 100644
--- a/src/runtime/internal/sys/zgoarch_mips.go
+++ b/src/runtime/internal/sys/zgoarch_mips.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build mips
// +build mips
package sys
diff --git a/src/runtime/internal/sys/zgoarch_mips64.go b/src/runtime/internal/sys/zgoarch_mips64.go
index 5258cbfbe7..73777cceb2 100644
--- a/src/runtime/internal/sys/zgoarch_mips64.go
+++ b/src/runtime/internal/sys/zgoarch_mips64.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build mips64
// +build mips64
package sys
diff --git a/src/runtime/internal/sys/zgoarch_mips64le.go b/src/runtime/internal/sys/zgoarch_mips64le.go
index 1721698518..0c81c36c09 100644
--- a/src/runtime/internal/sys/zgoarch_mips64le.go
+++ b/src/runtime/internal/sys/zgoarch_mips64le.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build mips64le
// +build mips64le
package sys
diff --git a/src/runtime/internal/sys/zgoarch_mips64p32.go b/src/runtime/internal/sys/zgoarch_mips64p32.go
index 44c4624da9..d63ce27d24 100644
--- a/src/runtime/internal/sys/zgoarch_mips64p32.go
+++ b/src/runtime/internal/sys/zgoarch_mips64p32.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build mips64p32
// +build mips64p32
package sys
diff --git a/src/runtime/internal/sys/zgoarch_mips64p32le.go b/src/runtime/internal/sys/zgoarch_mips64p32le.go
index eb63225430..2d577890b2 100644
--- a/src/runtime/internal/sys/zgoarch_mips64p32le.go
+++ b/src/runtime/internal/sys/zgoarch_mips64p32le.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build mips64p32le
// +build mips64p32le
package sys
diff --git a/src/runtime/internal/sys/zgoarch_mipsle.go b/src/runtime/internal/sys/zgoarch_mipsle.go
index e0ebfbf038..8af919d03a 100644
--- a/src/runtime/internal/sys/zgoarch_mipsle.go
+++ b/src/runtime/internal/sys/zgoarch_mipsle.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build mipsle
// +build mipsle
package sys
diff --git a/src/runtime/internal/sys/zgoarch_ppc.go b/src/runtime/internal/sys/zgoarch_ppc.go
index ef26aa3201..f6f12a5ddc 100644
--- a/src/runtime/internal/sys/zgoarch_ppc.go
+++ b/src/runtime/internal/sys/zgoarch_ppc.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build ppc
// +build ppc
package sys
diff --git a/src/runtime/internal/sys/zgoarch_ppc64.go b/src/runtime/internal/sys/zgoarch_ppc64.go
index 32c2d46d4c..a8379601f4 100644
--- a/src/runtime/internal/sys/zgoarch_ppc64.go
+++ b/src/runtime/internal/sys/zgoarch_ppc64.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build ppc64
// +build ppc64
package sys
diff --git a/src/runtime/internal/sys/zgoarch_ppc64le.go b/src/runtime/internal/sys/zgoarch_ppc64le.go
index 3a6e56763c..f2ec5dcba7 100644
--- a/src/runtime/internal/sys/zgoarch_ppc64le.go
+++ b/src/runtime/internal/sys/zgoarch_ppc64le.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build ppc64le
// +build ppc64le
package sys
diff --git a/src/runtime/internal/sys/zgoarch_riscv.go b/src/runtime/internal/sys/zgoarch_riscv.go
index d8f6b49093..83a3312f5f 100644
--- a/src/runtime/internal/sys/zgoarch_riscv.go
+++ b/src/runtime/internal/sys/zgoarch_riscv.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build riscv
// +build riscv
package sys
diff --git a/src/runtime/internal/sys/zgoarch_riscv64.go b/src/runtime/internal/sys/zgoarch_riscv64.go
index 0ba843b5ac..1dfcc84997 100644
--- a/src/runtime/internal/sys/zgoarch_riscv64.go
+++ b/src/runtime/internal/sys/zgoarch_riscv64.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build riscv64
// +build riscv64
package sys
diff --git a/src/runtime/internal/sys/zgoarch_s390.go b/src/runtime/internal/sys/zgoarch_s390.go
index 20a1b234a6..91aba5a0f6 100644
--- a/src/runtime/internal/sys/zgoarch_s390.go
+++ b/src/runtime/internal/sys/zgoarch_s390.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build s390
// +build s390
package sys
diff --git a/src/runtime/internal/sys/zgoarch_s390x.go b/src/runtime/internal/sys/zgoarch_s390x.go
index ffdda0c827..edce50234e 100644
--- a/src/runtime/internal/sys/zgoarch_s390x.go
+++ b/src/runtime/internal/sys/zgoarch_s390x.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build s390x
// +build s390x
package sys
diff --git a/src/runtime/internal/sys/zgoarch_sparc.go b/src/runtime/internal/sys/zgoarch_sparc.go
index b4949510d5..5ae9560ab0 100644
--- a/src/runtime/internal/sys/zgoarch_sparc.go
+++ b/src/runtime/internal/sys/zgoarch_sparc.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build sparc
// +build sparc
package sys
diff --git a/src/runtime/internal/sys/zgoarch_sparc64.go b/src/runtime/internal/sys/zgoarch_sparc64.go
index 0f6df411ce..e2a0134aff 100644
--- a/src/runtime/internal/sys/zgoarch_sparc64.go
+++ b/src/runtime/internal/sys/zgoarch_sparc64.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build sparc64
// +build sparc64
package sys
diff --git a/src/runtime/internal/sys/zgoarch_wasm.go b/src/runtime/internal/sys/zgoarch_wasm.go
index e69afb0cb3..52e85dea37 100644
--- a/src/runtime/internal/sys/zgoarch_wasm.go
+++ b/src/runtime/internal/sys/zgoarch_wasm.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build wasm
// +build wasm
package sys
diff --git a/src/runtime/internal/sys/zgoos_aix.go b/src/runtime/internal/sys/zgoos_aix.go
index 0631d02aa5..f3b907471f 100644
--- a/src/runtime/internal/sys/zgoos_aix.go
+++ b/src/runtime/internal/sys/zgoos_aix.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build aix
// +build aix
package sys
diff --git a/src/runtime/internal/sys/zgoos_android.go b/src/runtime/internal/sys/zgoos_android.go
index d356a40bec..e28baf7c48 100644
--- a/src/runtime/internal/sys/zgoos_android.go
+++ b/src/runtime/internal/sys/zgoos_android.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build android
// +build android
package sys
diff --git a/src/runtime/internal/sys/zgoos_darwin.go b/src/runtime/internal/sys/zgoos_darwin.go
index 6aa2db7e3a..3c7f7b543e 100644
--- a/src/runtime/internal/sys/zgoos_darwin.go
+++ b/src/runtime/internal/sys/zgoos_darwin.go
@@ -1,7 +1,7 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
-// +build !ios
-// +build darwin
+//go:build !ios && darwin
+// +build !ios,darwin
package sys
diff --git a/src/runtime/internal/sys/zgoos_dragonfly.go b/src/runtime/internal/sys/zgoos_dragonfly.go
index 88ee1174f1..f844d29e2a 100644
--- a/src/runtime/internal/sys/zgoos_dragonfly.go
+++ b/src/runtime/internal/sys/zgoos_dragonfly.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build dragonfly
// +build dragonfly
package sys
diff --git a/src/runtime/internal/sys/zgoos_freebsd.go b/src/runtime/internal/sys/zgoos_freebsd.go
index 8de2ec0559..8999a2797a 100644
--- a/src/runtime/internal/sys/zgoos_freebsd.go
+++ b/src/runtime/internal/sys/zgoos_freebsd.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build freebsd
// +build freebsd
package sys
diff --git a/src/runtime/internal/sys/zgoos_hurd.go b/src/runtime/internal/sys/zgoos_hurd.go
index 183ccb02a1..a546488bf8 100644
--- a/src/runtime/internal/sys/zgoos_hurd.go
+++ b/src/runtime/internal/sys/zgoos_hurd.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build hurd
// +build hurd
package sys
diff --git a/src/runtime/internal/sys/zgoos_illumos.go b/src/runtime/internal/sys/zgoos_illumos.go
index d04134e1df..02a4ca06e8 100644
--- a/src/runtime/internal/sys/zgoos_illumos.go
+++ b/src/runtime/internal/sys/zgoos_illumos.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build illumos
// +build illumos
package sys
diff --git a/src/runtime/internal/sys/zgoos_ios.go b/src/runtime/internal/sys/zgoos_ios.go
index cf6e9d61f0..033eec623d 100644
--- a/src/runtime/internal/sys/zgoos_ios.go
+++ b/src/runtime/internal/sys/zgoos_ios.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build ios
// +build ios
package sys
diff --git a/src/runtime/internal/sys/zgoos_js.go b/src/runtime/internal/sys/zgoos_js.go
index 1d9279ab38..28226ad60a 100644
--- a/src/runtime/internal/sys/zgoos_js.go
+++ b/src/runtime/internal/sys/zgoos_js.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build js
// +build js
package sys
diff --git a/src/runtime/internal/sys/zgoos_linux.go b/src/runtime/internal/sys/zgoos_linux.go
index 0f718d704f..01546e4b9f 100644
--- a/src/runtime/internal/sys/zgoos_linux.go
+++ b/src/runtime/internal/sys/zgoos_linux.go
@@ -1,7 +1,7 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
-// +build !android
-// +build linux
+//go:build !android && linux
+// +build !android,linux
package sys
diff --git a/src/runtime/internal/sys/zgoos_netbsd.go b/src/runtime/internal/sys/zgoos_netbsd.go
index 2ae149ff13..9d658b20ee 100644
--- a/src/runtime/internal/sys/zgoos_netbsd.go
+++ b/src/runtime/internal/sys/zgoos_netbsd.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build netbsd
// +build netbsd
package sys
diff --git a/src/runtime/internal/sys/zgoos_openbsd.go b/src/runtime/internal/sys/zgoos_openbsd.go
index 7d4d61e4ca..0f55454a95 100644
--- a/src/runtime/internal/sys/zgoos_openbsd.go
+++ b/src/runtime/internal/sys/zgoos_openbsd.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build openbsd
// +build openbsd
package sys
diff --git a/src/runtime/internal/sys/zgoos_plan9.go b/src/runtime/internal/sys/zgoos_plan9.go
index 30aec46df3..d0347464d6 100644
--- a/src/runtime/internal/sys/zgoos_plan9.go
+++ b/src/runtime/internal/sys/zgoos_plan9.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build plan9
// +build plan9
package sys
diff --git a/src/runtime/internal/sys/zgoos_solaris.go b/src/runtime/internal/sys/zgoos_solaris.go
index 4bb8c99bce..05c3007e2c 100644
--- a/src/runtime/internal/sys/zgoos_solaris.go
+++ b/src/runtime/internal/sys/zgoos_solaris.go
@@ -1,7 +1,7 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
-// +build !illumos
-// +build solaris
+//go:build !illumos && solaris
+// +build !illumos,solaris
package sys
diff --git a/src/runtime/internal/sys/zgoos_windows.go b/src/runtime/internal/sys/zgoos_windows.go
index d1f4290204..7d07fa3a45 100644
--- a/src/runtime/internal/sys/zgoos_windows.go
+++ b/src/runtime/internal/sys/zgoos_windows.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build windows
// +build windows
package sys
diff --git a/src/runtime/internal/sys/zgoos_zos.go b/src/runtime/internal/sys/zgoos_zos.go
index d22be46fc0..d6e5b9b0cb 100644
--- a/src/runtime/internal/sys/zgoos_zos.go
+++ b/src/runtime/internal/sys/zgoos_zos.go
@@ -1,5 +1,6 @@
// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
+//go:build zos
// +build zos
package sys
diff --git a/src/runtime/lfstack_32bit.go b/src/runtime/lfstack_32bit.go
index f07ff1c06b..c00f0965bd 100644
--- a/src/runtime/lfstack_32bit.go
+++ b/src/runtime/lfstack_32bit.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build 386 || arm || mips || mipsle
// +build 386 arm mips mipsle
package runtime
diff --git a/src/runtime/lfstack_64bit.go b/src/runtime/lfstack_64bit.go
index 9d821b989e..4812dd1156 100644
--- a/src/runtime/lfstack_64bit.go
+++ b/src/runtime/lfstack_64bit.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build amd64 || arm64 || mips64 || mips64le || ppc64 || ppc64le || riscv64 || s390x || wasm
// +build amd64 arm64 mips64 mips64le ppc64 ppc64le riscv64 s390x wasm
package runtime
diff --git a/src/runtime/libfuzzer.go b/src/runtime/libfuzzer.go
index 0161955f09..578bce0414 100644
--- a/src/runtime/libfuzzer.go
+++ b/src/runtime/libfuzzer.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build libfuzzer
// +build libfuzzer
package runtime
diff --git a/src/runtime/lock_futex.go b/src/runtime/lock_futex.go
index 91467fdfae..e4c8d01941 100644
--- a/src/runtime/lock_futex.go
+++ b/src/runtime/lock_futex.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build dragonfly || freebsd || linux
// +build dragonfly freebsd linux
package runtime
@@ -238,7 +239,7 @@ func notetsleepg(n *note, ns int64) bool {
return ok
}
-func beforeIdle(int64) (*g, bool) {
+func beforeIdle(int64, int64) (*g, bool) {
return nil, false
}
diff --git a/src/runtime/lock_js.go b/src/runtime/lock_js.go
index 14bdc76842..0ca3512baf 100644
--- a/src/runtime/lock_js.go
+++ b/src/runtime/lock_js.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build js && wasm
// +build js,wasm
package runtime
@@ -175,7 +176,12 @@ var idleID int32
// If an event handler returned, we resume it and it will pause the execution.
// beforeIdle either returns the specific goroutine to schedule next or
// indicates with otherReady that some goroutine became ready.
-func beforeIdle(delay int64) (gp *g, otherReady bool) {
+func beforeIdle(now, pollUntil int64) (gp *g, otherReady bool) {
+ delay := int64(-1)
+ if pollUntil != 0 {
+ delay = pollUntil - now
+ }
+
if delay > 0 {
clearIdleID()
if delay < 1e6 {
diff --git a/src/runtime/lock_sema.go b/src/runtime/lock_sema.go
index 671e524e45..7a6af28b56 100644
--- a/src/runtime/lock_sema.go
+++ b/src/runtime/lock_sema.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix || darwin || netbsd || openbsd || plan9 || solaris || windows
// +build aix darwin netbsd openbsd plan9 solaris windows
package runtime
@@ -297,7 +298,7 @@ func notetsleepg(n *note, ns int64) bool {
return ok
}
-func beforeIdle(int64) (*g, bool) {
+func beforeIdle(int64, int64) (*g, bool) {
return nil, false
}
diff --git a/src/runtime/lockrank.go b/src/runtime/lockrank.go
index b3c01ba104..dde9f7c21a 100644
--- a/src/runtime/lockrank.go
+++ b/src/runtime/lockrank.go
@@ -44,7 +44,6 @@ const (
lockRankPollDesc
lockRankSched
lockRankDeadlock
- lockRankPanic
lockRankAllg
lockRankAllp
@@ -92,6 +91,7 @@ const (
// rank, we don't allow any further locks to be acquired other than more
// hchan locks.
lockRankHchanLeaf
+ lockRankPanic
// Leaf locks with no dependencies, so these constants are not actually used anywhere.
// There are other architecture-dependent leaf locks as well.
@@ -123,7 +123,6 @@ var lockNames = []string{
lockRankPollDesc: "pollDesc",
lockRankSched: "sched",
lockRankDeadlock: "deadlock",
- lockRankPanic: "panic",
lockRankAllg: "allg",
lockRankAllp: "allp",
@@ -162,6 +161,7 @@ var lockNames = []string{
lockRankGFree: "gFree",
lockRankHchanLeaf: "hchanLeaf",
+ lockRankPanic: "panic",
lockRankNewmHandoff: "newmHandoff.lock",
lockRankDebugPtrmask: "debugPtrmask.lock",
@@ -202,10 +202,9 @@ var lockPartialOrder [][]lockRank = [][]lockRank{
lockRankPollDesc: {},
lockRankSched: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc},
lockRankDeadlock: {lockRankDeadlock},
- lockRankPanic: {lockRankDeadlock},
- lockRankAllg: {lockRankSysmon, lockRankSched, lockRankPanic},
+ lockRankAllg: {lockRankSysmon, lockRankSched},
lockRankAllp: {lockRankSysmon, lockRankSched},
- lockRankTimers: {lockRankSysmon, lockRankScavenge, lockRankSched, lockRankAllp, lockRankPollDesc, lockRankTimers},
+ lockRankTimers: {lockRankSysmon, lockRankScavenge, lockRankPollDesc, lockRankSched, lockRankAllp, lockRankTimers},
lockRankItab: {},
lockRankReflectOffs: {lockRankItab},
lockRankHchan: {lockRankScavenge, lockRankSweep, lockRankHchan},
@@ -214,29 +213,30 @@ var lockPartialOrder [][]lockRank = [][]lockRank{
lockRankTraceBuf: {lockRankSysmon, lockRankScavenge},
lockRankTraceStrings: {lockRankTraceBuf},
lockRankMspanSpecial: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings},
- lockRankProf: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
- lockRankGcBitsArenas: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
+ lockRankProf: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings},
+ lockRankGcBitsArenas: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings},
lockRankRoot: {},
- lockRankTrace: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankSched, lockRankHchan, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot, lockRankSweep},
- lockRankTraceStackTab: {lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankAllg, lockRankTimers, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot, lockRankTrace},
+ lockRankTrace: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankHchan, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot},
+ lockRankTraceStackTab: {lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankAllg, lockRankTimers, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot, lockRankTrace},
lockRankNetpollInit: {lockRankTimers},
lockRankRwmutexW: {},
lockRankRwmutexR: {lockRankSysmon, lockRankRwmutexW},
- lockRankSpanSetSpine: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
- lockRankGscan: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot, lockRankNotifyList, lockRankProf, lockRankGcBitsArenas, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankSpanSetSpine},
- lockRankStackpool: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankRwmutexR, lockRankSpanSetSpine, lockRankGscan},
+ lockRankSpanSetSpine: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings},
+ lockRankGscan: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankSpanSetSpine},
+ lockRankStackpool: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankRwmutexR, lockRankSpanSetSpine, lockRankGscan},
lockRankStackLarge: {lockRankSysmon, lockRankAssistQueue, lockRankSched, lockRankItab, lockRankHchan, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankSpanSetSpine, lockRankGscan},
lockRankDefer: {},
- lockRankSudog: {lockRankNotifyList, lockRankHchan},
- lockRankWbufSpans: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankAllg, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankRoot, lockRankGscan, lockRankDefer, lockRankSudog},
- lockRankMheap: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankFin, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan, lockRankMspanSpecial, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans, lockRankSpanSetSpine},
- lockRankMheapSpecial: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
+ lockRankSudog: {lockRankHchan, lockRankNotifyList},
+ lockRankWbufSpans: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankRoot, lockRankGscan, lockRankDefer, lockRankSudog},
+ lockRankMheap: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankSpanSetSpine, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans},
+ lockRankMheapSpecial: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings},
lockRankGlobalAlloc: {lockRankProf, lockRankSpanSetSpine, lockRankMheap, lockRankMheapSpecial},
lockRankGFree: {lockRankSched},
lockRankHchanLeaf: {lockRankGscan, lockRankHchanLeaf},
+ lockRankPanic: {lockRankDeadlock}, // plus any other lock held on throw.
lockRankNewmHandoff: {},
lockRankDebugPtrmask: {},
diff --git a/src/runtime/lockrank_off.go b/src/runtime/lockrank_off.go
index 7dcd8f5fe9..f3d2c00914 100644
--- a/src/runtime/lockrank_off.go
+++ b/src/runtime/lockrank_off.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !goexperiment.staticlockranking
// +build !goexperiment.staticlockranking
package runtime
diff --git a/src/runtime/lockrank_on.go b/src/runtime/lockrank_on.go
index 88ac95a004..fc8d2dc8d1 100644
--- a/src/runtime/lockrank_on.go
+++ b/src/runtime/lockrank_on.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build goexperiment.staticlockranking
// +build goexperiment.staticlockranking
package runtime
@@ -24,19 +25,6 @@ type lockRankStruct struct {
pad int
}
-// init checks that the partial order in lockPartialOrder fits within the total
-// order determined by the order of the lockRank constants.
-func init() {
- for rank, list := range lockPartialOrder {
- for _, entry := range list {
- if entry > lockRank(rank) {
- println("lockPartial order row", lockRank(rank).String(), "entry", entry.String())
- throw("lockPartialOrder table is inconsistent with total lock ranking order")
- }
- }
- }
-}
-
func lockInit(l *mutex, rank lockRank) {
l.rank = rank
}
@@ -64,12 +52,11 @@ func lockWithRank(l *mutex, rank lockRank) {
// rank recording for it, since print/println are used when
// printing out a lock ordering problem below.
//
- // paniclk has an ordering problem, since it can be acquired
- // during a panic with any other locks held (especially if the
- // panic is because of a directed segv), and yet also allg is
- // acquired after paniclk in tracebackothers()). This is a genuine
- // problem, so for now we don't do lock rank recording for paniclk
- // either.
+ // paniclk is only used for fatal throw/panic. Don't do lock
+ // ranking recording for it, since we throw after reporting a
+ // lock ordering problem. Additionally, paniclk may be taken
+ // after effectively any lock (anywhere we might panic), which
+ // the partial order doesn't cover.
lock2(l)
return
}
@@ -219,7 +206,7 @@ func releaseLockRank(rank lockRank) {
func lockWithRankMayAcquire(l *mutex, rank lockRank) {
gp := getg()
if gp.m.locksHeldLen == 0 {
- // No possibilty of lock ordering problem if no other locks held
+ // No possibility of lock ordering problem if no other locks held
return
}
diff --git a/src/runtime/lockrank_test.go b/src/runtime/lockrank_test.go
new file mode 100644
index 0000000000..4b2fc0eaee
--- /dev/null
+++ b/src/runtime/lockrank_test.go
@@ -0,0 +1,41 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+ . "runtime"
+ "testing"
+)
+
+// Check that the partial order in lockPartialOrder fits within the total order
+// determined by the order of the lockRank constants.
+func TestLockRankPartialOrder(t *testing.T) {
+ for r, list := range LockPartialOrder {
+ rank := LockRank(r)
+ for _, e := range list {
+ entry := LockRank(e)
+ if entry > rank {
+ t.Errorf("lockPartialOrder row %v entry %v is inconsistent with total lock ranking order", rank, entry)
+ }
+ }
+ }
+}
+
+// Verify that partial order lists are kept sorted. This is a purely cosmetic
+// check to make manual reviews simpler. It does not affect correctness, unlike
+// the above test.
+func TestLockRankPartialOrderSortedEntries(t *testing.T) {
+ for r, list := range LockPartialOrder {
+ rank := LockRank(r)
+ var prev LockRank
+ for _, e := range list {
+ entry := LockRank(e)
+ if entry <= prev {
+ t.Errorf("Partial order for rank %v out of order: %v <= %v in %v", rank, entry, prev, list)
+ }
+ prev = entry
+ }
+ }
+}
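TestLockRankPartialOrder enforces the same invariant the deleted init check in lockrank_on.go verified at runtime: every prerequisite in a rank's list must have a strictly lower rank number, so that the constants' declaration order is a total order extending the partial order. A toy illustration with made-up ranks (a sketch, not the runtime's actual table):

	package main

	import "fmt"

	func main() {
		// Ranks in declaration order: 0=sched, 1=allg, 2=panic.
		order := [][]int{
			0: {},     // sched: no prerequisites
			1: {0},    // allg may be taken with sched held: 0 < 1, OK
			2: {0, 1}, // panic after anything: all entries < 2, OK
		}
		for rank, edges := range order {
			for _, e := range edges {
				if e > rank {
					fmt.Printf("rank %d entry %d violates total order\n", rank, e)
				}
			}
		}
		fmt.Println("partial order consistent with total order")
	}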
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index f20ded5bf7..81e5225883 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -227,6 +227,7 @@ const (
// -------------- --------- ---------- ---------- -----------
// */64-bit 48 64MB 1 4M (32MB)
// windows/64-bit 48 4MB 64 1M (8MB)
+ // ios/arm64 33 4MB 1 2048 (8KB)
// */32-bit 32 4MB 1 1024 (4KB)
// */mips(le) 31 4MB 1 512 (2KB)
@@ -247,7 +248,7 @@ const (
// logHeapArenaBytes is log_2 of heapArenaBytes. For clarity,
// prefer using heapArenaBytes where possible (we need the
// constant to compute some other constants).
- logHeapArenaBytes = (6+20)*(_64bit*(1-sys.GoosWindows)*(1-sys.GoarchWasm)) + (2+20)*(_64bit*sys.GoosWindows) + (2+20)*(1-_64bit) + (2+20)*sys.GoarchWasm
+ logHeapArenaBytes = (6+20)*(_64bit*(1-sys.GoosWindows)*(1-sys.GoarchWasm)*(1-sys.GoosIos*sys.GoarchArm64)) + (2+20)*(_64bit*sys.GoosWindows) + (2+20)*(1-_64bit) + (2+20)*sys.GoarchWasm + (2+20)*sys.GoosIos*sys.GoarchArm64
// heapArenaBitmapBytes is the size of each heap arena's bitmap.
heapArenaBitmapBytes = heapArenaBytes / (sys.PtrSize * 8 / 2)
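The logHeapArenaBytes expression above is a sum of mutually exclusive indicator products, so exactly one term is nonzero for any platform. A quick sanity check of the new ios/arm64 term (a sketch; the lowercase constants are hypothetical stand-ins for the 0/1 indicators in runtime/internal/sys):

	package main

	import "fmt"

	// Indicator values as they would be on an ios/arm64 target.
	const (
		_64bit      = 1
		goosWindows = 0
		goarchWasm  = 0
		goosIos     = 1
		goarchArm64 = 1
	)

	func main() {
		log := (6+20)*(_64bit*(1-goosWindows)*(1-goarchWasm)*(1-goosIos*goarchArm64)) +
			(2+20)*(_64bit*goosWindows) + (2+20)*(1-_64bit) +
			(2+20)*goarchWasm + (2+20)*goosIos*goarchArm64
		fmt.Println(log, 1<<log) // 22 4194304: 4MB arenas, matching the table
	}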
@@ -568,7 +569,7 @@ func mallocinit() {
const arenaMetaSize = (1 << arenaBits) * unsafe.Sizeof(heapArena{})
meta := uintptr(sysReserve(nil, arenaMetaSize))
if meta != 0 {
- mheap_.heapArenaAlloc.init(meta, arenaMetaSize)
+ mheap_.heapArenaAlloc.init(meta, arenaMetaSize, true)
}
// We want to start the arena low, but if we're linked
@@ -605,7 +606,7 @@ func mallocinit() {
for _, arenaSize := range arenaSizes {
a, size := sysReserveAligned(unsafe.Pointer(p), arenaSize, heapArenaBytes)
if a != nil {
- mheap_.arena.init(uintptr(a), size)
+ mheap_.arena.init(uintptr(a), size, false)
p = mheap_.arena.end // For hint below
break
}
@@ -622,8 +623,8 @@ func mallocinit() {
// heapArenaBytes. sysAlloc returns nil on failure.
// There is no corresponding free function.
//
-// sysAlloc returns a memory region in the Prepared state. This region must
-// be transitioned to Ready before use.
+// sysAlloc returns a memory region in the Reserved state. This region must
+// be transitioned to Prepared and then Ready before use.
//
// h must be locked.
func (h *mheap) sysAlloc(n uintptr) (v unsafe.Pointer, size uintptr) {
@@ -725,9 +726,6 @@ func (h *mheap) sysAlloc(n uintptr) (v unsafe.Pointer, size uintptr) {
throw("misrounded allocation in sysAlloc")
}
- // Transition from Reserved to Prepared.
- sysMap(v, size, &memstats.heap_sys)
-
mapped:
// Create arena metadata.
for ri := arenaIndex(uintptr(v)); ri <= arenaIndex(uintptr(v)+size-1); ri++ {
@@ -936,7 +934,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
}
if inittrace.active && inittrace.id == getg().goid {
- // Init functions are executed sequentially in a single Go routine.
+ // Init functions are executed sequentially in a single goroutine.
inittrace.allocs += 1
}
}
@@ -981,6 +979,9 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
var span *mspan
var x unsafe.Pointer
noscan := typ == nil || typ.ptrdata == 0
+ // In some cases block zeroing can profitably be delayed until
+ // preemption is possible (to reduce latency); isZeroed tracks that state.
+ isZeroed := true
if size <= maxSmallSize {
if noscan && size < maxTinySize {
// Tiny allocator.
@@ -1049,7 +1050,8 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
(*[2]uint64)(x)[1] = 0
// See if we need to replace the existing tiny block with the new one
// based on amount of remaining free space.
- if size < c.tinyoffset || c.tiny == 0 {
+ if !raceenabled && (size < c.tinyoffset || c.tiny == 0) {
+ // Note: disabled when race detector is on, see comment near end of this function.
c.tiny = uintptr(x)
c.tinyoffset = size
}
@@ -1075,7 +1077,9 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
}
} else {
shouldhelpgc = true
- span = c.allocLarge(size, needzero, noscan)
+ // For large allocations, keep track of zeroed state so that
+ // bulk zeroing can happen later in a preemptible context.
+ span, isZeroed = c.allocLarge(size, needzero && !noscan, noscan)
span.freeindex = 1
span.allocCount = 1
x = unsafe.Pointer(span.base())
@@ -1134,13 +1138,19 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
mp.mallocing = 0
releasem(mp)
+ // Pointer-free data can be zeroed late, in a context where preemption can occur.
+ // x will keep the memory alive.
+ if !isZeroed && needzero {
+ memclrNoHeapPointersChunked(size, x)
+ }
+
if debug.malloc {
if debug.allocfreetrace != 0 {
tracealloc(x, size, typ)
}
if inittrace.active && inittrace.id == getg().goid {
- // Init functions are executed sequentially in a single Go routine.
+ // Init functions are executed sequentially in a single goroutine.
inittrace.bytes += uint64(size)
}
}
@@ -1167,9 +1177,52 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
}
}
+ if raceenabled && noscan && dataSize < maxTinySize {
+ // Pad tinysize allocations so they are aligned with the end
+ // of the tinyalloc region. This ensures that any arithmetic
+ // that goes off the top end of the object will be detectable
+ // by checkptr (issue 38872).
+ // Note that we disable tinyalloc when raceenabled for this to work.
+ // TODO: This padding is only performed when the race detector
+ // is enabled. It would be nice to enable it if any package
+ // was compiled with checkptr, but there's no easy way to
+ // detect that (especially at compile time).
+ // TODO: enable this padding for all allocations, not just
+ // tinyalloc ones. It's tricky because of pointer maps.
+ // Maybe just all noscan objects?
+ x = add(x, size-dataSize)
+ }
+
return x
}
+// memclrNoHeapPointersChunked repeatedly calls memclrNoHeapPointers
+// on chunks of the buffer to be zeroed, with opportunities for preemption
+// along the way. memclrNoHeapPointers contains no safepoints and also
+// cannot be preemptively scheduled, so this provides a still-efficient
+// block clear that can also be preempted on a reasonable granularity.
+//
+// Use this with care; if the data being cleared is tagged to contain
+// pointers, this allows the GC to run before it is all cleared.
+func memclrNoHeapPointersChunked(size uintptr, x unsafe.Pointer) {
+ v := uintptr(x)
+ // Chunk size determined by benchmarking: 128k is too small, 512k is too large.
+ const chunkBytes = 256 * 1024
+ vsize := v + size
+ for voff := v; voff < vsize; voff = voff + chunkBytes {
+ if getg().preempt {
+ // may hold locks, e.g., profiling
+ goschedguarded()
+ }
+ // clear min(remaining, chunkBytes) bytes
+ n := vsize - voff
+ if n > chunkBytes {
+ n = chunkBytes
+ }
+ memclrNoHeapPointers(unsafe.Pointer(voff), n)
+ }
+}
+
// implementation of new builtin
// compiler (both frontend and SSA backend) knows the signature
// of this function
@@ -1400,15 +1453,19 @@ func inPersistentAlloc(p uintptr) bool {
}
// linearAlloc is a simple linear allocator that pre-reserves a region
-// of memory and then maps that region into the Ready state as needed. The
-// caller is responsible for locking.
+// of memory and then optionally maps that region into the Ready state
+// as needed.
+//
+// The caller is responsible for locking.
type linearAlloc struct {
next uintptr // next free byte
mapped uintptr // one byte past end of mapped space
end uintptr // end of reserved space
+
+ mapMemory bool // transition memory from Reserved to Ready if true
}
-func (l *linearAlloc) init(base, size uintptr) {
+func (l *linearAlloc) init(base, size uintptr, mapMemory bool) {
if base+size < base {
// Chop off the last byte. The runtime isn't prepared
// to deal with situations where the bounds could overflow.
@@ -1418,6 +1475,7 @@ func (l *linearAlloc) init(base, size uintptr) {
}
l.next, l.mapped = base, base
l.end = base + size
+ l.mapMemory = mapMemory
}
func (l *linearAlloc) alloc(size, align uintptr, sysStat *sysMemStat) unsafe.Pointer {
@@ -1427,9 +1485,11 @@ func (l *linearAlloc) alloc(size, align uintptr, sysStat *sysMemStat) unsafe.Poi
}
l.next = p + size
if pEnd := alignUp(l.next-1, physPageSize); pEnd > l.mapped {
- // Transition from Reserved to Prepared to Ready.
- sysMap(unsafe.Pointer(l.mapped), pEnd-l.mapped, sysStat)
- sysUsed(unsafe.Pointer(l.mapped), pEnd-l.mapped)
+ if l.mapMemory {
+ // Transition from Reserved to Prepared to Ready.
+ sysMap(unsafe.Pointer(l.mapped), pEnd-l.mapped, sysStat)
+ sysUsed(unsafe.Pointer(l.mapped), pEnd-l.mapped)
+ }
l.mapped = pEnd
}
return unsafe.Pointer(p)
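
The alloc step above is a plain bump allocator plus the optional mapping. A minimal sketch with the mapping omitted (the names here are illustrative, not the runtime's):

package main

import "fmt"

func alignUp(n, a uintptr) uintptr { return (n + a - 1) &^ (a - 1) }

// bumpAlloc hands out aligned blocks from [next, end) and never frees.
type bumpAlloc struct{ next, end uintptr }

func (l *bumpAlloc) alloc(size, align uintptr) (uintptr, bool) {
	p := alignUp(l.next, align)
	if p+size > l.end {
		return 0, false // reserved region exhausted
	}
	l.next = p + size
	return p, true
}

func main() {
	l := bumpAlloc{next: 0x1000, end: 0x9000}
	p, ok := l.alloc(100, 8)
	fmt.Printf("%#x %v\n", p, ok) // 0x1000 true
}
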
diff --git a/src/runtime/malloc_test.go b/src/runtime/malloc_test.go
index 4ba94d0494..e028554b23 100644
--- a/src/runtime/malloc_test.go
+++ b/src/runtime/malloc_test.go
@@ -154,6 +154,9 @@ func TestStringConcatenationAllocs(t *testing.T) {
}
func TestTinyAlloc(t *testing.T) {
+ if runtime.Raceenabled {
+ t.Skip("tinyalloc suppressed when running in race mode")
+ }
const N = 16
var v [N]unsafe.Pointer
for i := range v {
@@ -182,6 +185,9 @@ type obj12 struct {
}
func TestTinyAllocIssue37262(t *testing.T) {
+ if runtime.Raceenabled {
+ t.Skip("tinyalloc suppressed when running in race mode")
+ }
// Try to cause an alignment access fault
// by atomically accessing the first 64-bit
// value of a tiny-allocated object.
diff --git a/src/runtime/map.go b/src/runtime/map.go
index 0beff57a1a..111db56b01 100644
--- a/src/runtime/map.go
+++ b/src/runtime/map.go
@@ -113,7 +113,7 @@ func isEmpty(x uint8) bool {
// A header for a Go map.
type hmap struct {
- // Note: the format of the hmap is also encoded in cmd/compile/internal/gc/reflect.go.
+ // Note: the format of the hmap is also encoded in cmd/compile/internal/reflectdata/reflect.go.
// Make sure this stays in sync with the compiler's definition.
count int // # live cells == size of map. Must be first (used by len() builtin)
flags uint8
@@ -159,11 +159,11 @@ type bmap struct {
}
// A hash iteration structure.
-// If you modify hiter, also change cmd/compile/internal/gc/reflect.go to indicate
+// If you modify hiter, also change cmd/compile/internal/reflectdata/reflect.go to indicate
// the layout of this structure.
type hiter struct {
- key unsafe.Pointer // Must be in first position. Write nil to indicate iteration end (see cmd/compile/internal/gc/range.go).
- elem unsafe.Pointer // Must be in second position (see cmd/compile/internal/gc/range.go).
+ key unsafe.Pointer // Must be in first position. Write nil to indicate iteration end (see cmd/compile/internal/walk/range.go).
+ elem unsafe.Pointer // Must be in second position (see cmd/compile/internal/walk/range.go).
t *maptype
h *hmap
buckets unsafe.Pointer // bucket ptr at hash_iter initialization time
@@ -470,13 +470,13 @@ func mapaccess2(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, bool)
}
hash := t.hasher(key, uintptr(h.hash0))
m := bucketMask(h.B)
- b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + (hash&m)*uintptr(t.bucketsize)))
+ b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize)))
if c := h.oldbuckets; c != nil {
if !h.sameSizeGrow() {
// There used to be half as many buckets; mask down one more power of two.
m >>= 1
}
- oldb := (*bmap)(unsafe.Pointer(uintptr(c) + (hash&m)*uintptr(t.bucketsize)))
+ oldb := (*bmap)(add(c, (hash&m)*uintptr(t.bucketsize)))
if !evacuated(oldb) {
b = oldb
}
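
Both the old and new forms are legal single-expression unsafe.Pointer conversions, so the rewrite reads as a cleanup around the runtime's shared add helper (defined in runtime/stubs.go). For reference, the helper with a hypothetical standalone usage:

package main

import (
	"fmt"
	"unsafe"
)

// add mirrors the runtime helper: pointer arithmetic kept to a single
// expression, the form the unsafe.Pointer rules sanction.
func add(p unsafe.Pointer, x uintptr) unsafe.Pointer {
	return unsafe.Pointer(uintptr(p) + x)
}

func main() {
	buf := [4]byte{10, 20, 30, 40}
	p := add(unsafe.Pointer(&buf[0]), 2)
	fmt.Println(*(*byte)(p)) // 30
}
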
@@ -514,13 +514,13 @@ func mapaccessK(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, unsafe
}
hash := t.hasher(key, uintptr(h.hash0))
m := bucketMask(h.B)
- b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + (hash&m)*uintptr(t.bucketsize)))
+ b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize)))
if c := h.oldbuckets; c != nil {
if !h.sameSizeGrow() {
// There used to be half as many buckets; mask down one more power of two.
m >>= 1
}
- oldb := (*bmap)(unsafe.Pointer(uintptr(c) + (hash&m)*uintptr(t.bucketsize)))
+ oldb := (*bmap)(add(c, (hash&m)*uintptr(t.bucketsize)))
if !evacuated(oldb) {
b = oldb
}
@@ -810,7 +810,7 @@ func mapiterinit(t *maptype, h *hmap, it *hiter) {
}
if unsafe.Sizeof(hiter{})/sys.PtrSize != 12 {
- throw("hash_iter size incorrect") // see cmd/compile/internal/gc/reflect.go
+ throw("hash_iter size incorrect") // see cmd/compile/internal/reflectdata/reflect.go
}
it.t = t
it.h = h
diff --git a/src/runtime/map_faststr.go b/src/runtime/map_faststr.go
index 2d1ac762a8..0673dd39c8 100644
--- a/src/runtime/map_faststr.go
+++ b/src/runtime/map_faststr.go
@@ -255,6 +255,9 @@ bucketloop:
// already have a mapping for key. Update it.
inserti = i
insertb = b
+ // Overwrite existing key, so it can be garbage collected.
+ // The size is already guaranteed to be set correctly.
+ k.str = key.str
goto done
}
ovf := b.overflow(t)
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index fbfaae0f93..32b8db7a50 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -226,16 +226,25 @@ func (s *mspan) isFree(index uintptr) bool {
return *bytep&mask == 0
}
-func (s *mspan) objIndex(p uintptr) uintptr {
- byteOffset := p - s.base()
- if byteOffset == 0 {
- return 0
- }
- if s.baseMask != 0 {
- // s.baseMask is non-0, elemsize is a power of two, so shift by s.divShift
- return byteOffset >> s.divShift
+// divideByElemSize returns n/s.elemsize.
+// n must be within [0, s.npages*_PageSize),
+// or may be exactly s.npages*_PageSize
+// if s.elemsize is from sizeclasses.go.
+func (s *mspan) divideByElemSize(n uintptr) uintptr {
+ const doubleCheck = false
+
+ // See explanation in mksizeclasses.go's computeDivMagic.
+ q := uintptr((uint64(n) * uint64(s.divMul)) >> 32)
+
+ if doubleCheck && q != n/s.elemsize {
+ println(n, "/", s.elemsize, "should be", n/s.elemsize, "but got", q)
+ throw("bad magic division")
}
- return uintptr(((uint64(byteOffset) >> s.divShift) * uint64(s.divMul)) >> s.divShift2)
+ return q
+}
+
+func (s *mspan) objIndex(p uintptr) uintptr {
+ return s.divideByElemSize(p - s.base())
}
func markBitsForAddr(p uintptr) markBits {
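
divMul is generated by mksizeclasses.go's computeDivMagic. A self-contained sketch of the technique, taking the multiplier as ceil(2^32/d) and checking it by brute force over an assumed input range (the generator proves a per-class bound rather than looping):

package main

import "fmt"

func main() {
	const d = 48        // example element size
	const max = 8 << 10 // assumed largest offset to divide
	m := uint64((1<<32 + d - 1) / d) // magic multiplier: ceil(2^32 / d)
	for n := uint64(0); n <= max; n++ {
		if q := (n * m) >> 32; q != n/d {
			fmt.Println("magic division fails at", n)
			return
		}
	}
	fmt.Println("n/d == (n*m)>>32 for all n <= max; m =", m)
}
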
@@ -324,6 +333,10 @@ func heapBitsForAddr(addr uintptr) (h heapBits) {
return
}
+// clobberdeadPtr is a special value that is used by the compiler to
+// clobber dead stack slots, when -clobberdead flag is set.
+const clobberdeadPtr = uintptr(0xdeaddead | 0xdeaddead<<((^uintptr(0)>>63)*32))
+
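
The shift amount encodes the word size: ^uintptr(0)>>63 is 1 on 64-bit targets and 0 on 32-bit ones, so the high half is ORed in only where it fits. A quick demonstration:

package main

import "fmt"

func main() {
	p := uintptr(0xdeaddead | 0xdeaddead<<((^uintptr(0)>>63)*32))
	// Prints 0xdeaddeaddeaddead on 64-bit targets, 0xdeaddead on 32-bit.
	fmt.Printf("%#x\n", p)
}
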
// badPointer throws bad pointer in heap panic.
func badPointer(s *mspan, p, refBase, refOff uintptr) {
// Typically this indicates an incorrect use
@@ -336,13 +349,16 @@ func badPointer(s *mspan, p, refBase, refOff uintptr) {
// in allocated spans.
printlock()
print("runtime: pointer ", hex(p))
- state := s.state.get()
- if state != mSpanInUse {
- print(" to unallocated span")
- } else {
- print(" to unused region of span")
+ if s != nil {
+ state := s.state.get()
+ if state != mSpanInUse {
+ print(" to unallocated span")
+ } else {
+ print(" to unused region of span")
+ }
+ print(" span.base()=", hex(s.base()), " span.limit=", hex(s.limit), " span.state=", state)
}
- print(" span.base()=", hex(s.base()), " span.limit=", hex(s.limit), " span.state=", state, "\n")
+ print("\n")
if refBase != 0 {
print("runtime: found in object at *(", hex(refBase), "+", hex(refOff), ")\n")
gcDumpObject("object", refBase, refOff)
@@ -370,6 +386,12 @@ func findObject(p, refBase, refOff uintptr) (base uintptr, s *mspan, objIndex ui
// If s is nil, the virtual address has never been part of the heap.
// This pointer may be to some mmap'd region, so we allow it.
if s == nil {
+ if GOARCH == "amd64" && p == clobberdeadPtr && debug.invalidptr != 0 {
+ // Crash if clobberdeadPtr is seen. Only on AMD64 for now, as
+ // it is the only platform where compiler's clobberdead mode is
+ // implemented. On AMD64 clobberdeadPtr cannot be a valid address.
+ badPointer(s, p, refBase, refOff)
+ }
return
}
// If p is a bad pointer, it may not be in s's bounds.
@@ -388,24 +410,9 @@ func findObject(p, refBase, refOff uintptr) (base uintptr, s *mspan, objIndex ui
}
return
}
- // If this span holds object of a power of 2 size, just mask off the bits to
- // the interior of the object. Otherwise use the size to get the base.
- if s.baseMask != 0 {
- // optimize for power of 2 sized objects.
- base = s.base()
- base = base + (p-base)&uintptr(s.baseMask)
- objIndex = (base - s.base()) >> s.divShift
- // base = p & s.baseMask is faster for small spans,
- // but doesn't work for large spans.
- // Overall, it's faster to use the more general computation above.
- } else {
- base = s.base()
- if p-base >= s.elemsize {
- // n := (p - base) / s.elemsize, using division by multiplication
- objIndex = uintptr(p-base) >> s.divShift * uintptr(s.divMul) >> s.divShift2
- base += objIndex * s.elemsize
- }
- }
+
+ objIndex = s.objIndex(p)
+ base = s.base() + objIndex*s.elemsize
return
}
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index bb7475b6f3..a9e959109a 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -176,29 +176,29 @@ func (c *mcache) refill(spc spanClass) {
// mcache. If it gets uncached, we'll adjust this.
stats := memstats.heapStats.acquire()
atomic.Xadduintptr(&stats.smallAllocCount[spc.sizeclass()], uintptr(s.nelems)-uintptr(s.allocCount))
- memstats.heapStats.release()
-
- // Update heap_live with the same assumption.
- usedBytes := uintptr(s.allocCount) * s.elemsize
- atomic.Xadd64(&memstats.heap_live, int64(s.npages*pageSize)-int64(usedBytes))
// Flush tinyAllocs.
if spc == tinySpanClass {
- atomic.Xadd64(&memstats.tinyallocs, int64(c.tinyAllocs))
+ atomic.Xadduintptr(&stats.tinyAllocCount, c.tinyAllocs)
c.tinyAllocs = 0
}
+ memstats.heapStats.release()
+
+ // Update gcController.heapLive with the same assumption.
+ usedBytes := uintptr(s.allocCount) * s.elemsize
+ atomic.Xadd64(&gcController.heapLive, int64(s.npages*pageSize)-int64(usedBytes))
// While we're here, flush scanAlloc, since we have to call
// revise anyway.
- atomic.Xadd64(&memstats.heap_scan, int64(c.scanAlloc))
+ atomic.Xadd64(&gcController.heapScan, int64(c.scanAlloc))
c.scanAlloc = 0
if trace.enabled {
- // heap_live changed.
+ // gcController.heapLive changed.
traceHeapAlloc()
}
if gcBlackenEnabled != 0 {
- // heap_live and heap_scan changed.
+ // gcController.heapLive and heapScan changed.
gcController.revise()
}
@@ -206,7 +206,10 @@ func (c *mcache) refill(spc spanClass) {
}
// allocLarge allocates a span for a large object.
-func (c *mcache) allocLarge(size uintptr, needzero bool, noscan bool) *mspan {
+// The boolean result indicates whether the span is known-zeroed.
+// If it did not need to be zeroed, it may not have been zeroed;
+// but if it came directly from the OS, it is already zeroed.
+func (c *mcache) allocLarge(size uintptr, needzero bool, noscan bool) (*mspan, bool) {
if size+_PageSize < size {
throw("out of memory")
}
@@ -221,7 +224,7 @@ func (c *mcache) allocLarge(size uintptr, needzero bool, noscan bool) *mspan {
deductSweepCredit(npages*_PageSize, npages)
spc := makeSpanClass(0, noscan)
- s := mheap_.alloc(npages, spc, needzero)
+ s, isZeroed := mheap_.alloc(npages, spc, needzero)
if s == nil {
throw("out of memory")
}
@@ -230,10 +233,10 @@ func (c *mcache) allocLarge(size uintptr, needzero bool, noscan bool) *mspan {
atomic.Xadduintptr(&stats.largeAllocCount, 1)
memstats.heapStats.release()
- // Update heap_live and revise pacing if needed.
- atomic.Xadd64(&memstats.heap_live, int64(npages*pageSize))
+ // Update gcController.heapLive and revise pacing if needed.
+ atomic.Xadd64(&gcController.heapLive, int64(npages*pageSize))
if trace.enabled {
- // Trace that a heap alloc occurred because heap_live changed.
+ // Trace that a heap alloc occurred because gcController.heapLive changed.
traceHeapAlloc()
}
if gcBlackenEnabled != 0 {
@@ -245,12 +248,12 @@ func (c *mcache) allocLarge(size uintptr, needzero bool, noscan bool) *mspan {
mheap_.central[spc].mcentral.fullSwept(mheap_.sweepgen).push(s)
s.limit = s.base() + size
heapBitsForAddr(s.base()).initSpan(s)
- return s
+ return s, isZeroed
}
func (c *mcache) releaseAll() {
// Take this opportunity to flush scanAlloc.
- atomic.Xadd64(&memstats.heap_scan, int64(c.scanAlloc))
+ atomic.Xadd64(&gcController.heapScan, int64(c.scanAlloc))
c.scanAlloc = 0
sg := mheap_.sweepgen
@@ -263,14 +266,14 @@ func (c *mcache) releaseAll() {
atomic.Xadduintptr(&stats.smallAllocCount[spanClass(i).sizeclass()], -n)
memstats.heapStats.release()
if s.sweepgen != sg+1 {
- // refill conservatively counted unallocated slots in heap_live.
+ // refill conservatively counted unallocated slots in gcController.heapLive.
// Undo this.
//
// If this span was cached before sweep, then
- // heap_live was totally recomputed since
+ // gcController.heapLive was totally recomputed since
// caching this span, so we don't do this for
// stale spans.
- atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize))
+ atomic.Xadd64(&gcController.heapLive, -int64(n)*int64(s.elemsize))
}
// Release the span to the mcentral.
mheap_.central[i].mcentral.uncacheSpan(s)
@@ -280,10 +283,14 @@ func (c *mcache) releaseAll() {
// Clear tinyalloc pool.
c.tiny = 0
c.tinyoffset = 0
- atomic.Xadd64(&memstats.tinyallocs, int64(c.tinyAllocs))
+
+ // Flush tinyAllocs.
+ stats := memstats.heapStats.acquire()
+ atomic.Xadduintptr(&stats.tinyAllocCount, c.tinyAllocs)
c.tinyAllocs = 0
+ memstats.heapStats.release()
- // Updated heap_scan and possible heap_live.
+ // Updated heapScan and possibly gcController.heapLive.
if gcBlackenEnabled != 0 {
gcController.revise()
}
diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go
index cd20dec539..6013c94c69 100644
--- a/src/runtime/mcentral.go
+++ b/src/runtime/mcentral.go
@@ -81,8 +81,6 @@ func (c *mcentral) cacheSpan() *mspan {
spanBytes := uintptr(class_to_allocnpages[c.spanclass.sizeclass()]) * _PageSize
deductSweepCredit(spanBytes, 0)
- sg := mheap_.sweepgen
-
traceDone := false
if trace.enabled {
traceGCSweepStart()
@@ -104,6 +102,8 @@ func (c *mcentral) cacheSpan() *mspan {
spanBudget := 100
var s *mspan
+ sl := newSweepLocker()
+ sg := sl.sweepGen
// Try partial swept spans first.
if s = c.partialSwept(sg).pop(); s != nil {
@@ -116,9 +116,10 @@ func (c *mcentral) cacheSpan() *mspan {
if s == nil {
break
}
- if atomic.Load(&s.sweepgen) == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+ if s, ok := sl.tryAcquire(s); ok {
// We got ownership of the span, so let's sweep it and use it.
s.sweep(true)
+ sl.dispose()
goto havespan
}
// We failed to get ownership of the span, which means it's being or
@@ -135,20 +136,22 @@ func (c *mcentral) cacheSpan() *mspan {
if s == nil {
break
}
- if atomic.Load(&s.sweepgen) == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+ if s, ok := sl.tryAcquire(s); ok {
// We got ownership of the span, so let's sweep it.
s.sweep(true)
// Check if there's any free space.
freeIndex := s.nextFreeIndex()
if freeIndex != s.nelems {
s.freeindex = freeIndex
+ sl.dispose()
goto havespan
}
// Add it to the swept list, because sweeping didn't give us any free space.
- c.fullSwept(sg).push(s)
+ c.fullSwept(sg).push(s.mspan)
}
// See comment for partial unswept spans.
}
+ sl.dispose()
if trace.enabled {
traceGCSweepDone()
traceDone = true
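
tryAcquire wraps the claim that the removed lines made explicit: sweepgen == sg-2 marks a span unswept, and a CAS to sg-1 takes ownership. A sketch of just that core using sync/atomic (the sweepLocker bookkeeping that delays sweep completion is deliberately omitted):

package main

import (
	"fmt"
	"sync/atomic"
)

// claimForSweep reports whether the caller won ownership of the span:
// sweepgen == sg-2 means "needs sweeping"; CASing it to sg-1 means
// "being swept by me".
func claimForSweep(sweepgen *uint32, sg uint32) bool {
	return atomic.LoadUint32(sweepgen) == sg-2 &&
		atomic.CompareAndSwapUint32(sweepgen, sg-2, sg-1)
}

func main() {
	var sweepgen uint32 = 4 // an unswept span when sg == 6
	fmt.Println(claimForSweep(&sweepgen, 6)) // true: we own it
	fmt.Println(claimForSweep(&sweepgen, 6)) // false: already claimed
}
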
@@ -211,7 +214,13 @@ func (c *mcentral) uncacheSpan(s *mspan) {
if stale {
// It's stale, so just sweep it. Sweeping will put it on
// the right list.
- s.sweep(false)
+ //
+ // We don't use a sweepLocker here. Stale cached spans
+ // aren't in the global sweep lists, so mark termination
+ // itself holds up sweep completion until all mcaches
+ // have been swept.
+ ss := sweepLocked{s}
+ ss.sweep(false)
} else {
if int(s.nelems)-int(s.allocCount) > 0 {
// Put it back on the partial swept list.
@@ -229,14 +238,14 @@ func (c *mcentral) grow() *mspan {
npages := uintptr(class_to_allocnpages[c.spanclass.sizeclass()])
size := uintptr(class_to_size[c.spanclass.sizeclass()])
- s := mheap_.alloc(npages, c.spanclass, true)
+ s, _ := mheap_.alloc(npages, c.spanclass, true)
if s == nil {
return nil
}
// Use division by multiplication and shifts to quickly compute:
// n := (npages << _PageShift) / size
- n := (npages << _PageShift) >> s.divShift * uintptr(s.divMul) >> s.divShift2
+ n := s.divideByElemSize(npages << _PageShift)
s.limit = s.base() + size*n
heapBitsForAddr(s.base()).initSpan(s)
return s
diff --git a/src/runtime/mem_bsd.go b/src/runtime/mem_bsd.go
index bc672019fb..dcbb9a1d51 100644
--- a/src/runtime/mem_bsd.go
+++ b/src/runtime/mem_bsd.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build dragonfly || freebsd || netbsd || openbsd || solaris
// +build dragonfly freebsd netbsd openbsd solaris
package runtime
diff --git a/src/runtime/mem_js.go b/src/runtime/mem_js.go
index 957ed36ffa..fe940360c0 100644
--- a/src/runtime/mem_js.go
+++ b/src/runtime/mem_js.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build js && wasm
// +build js,wasm
package runtime
diff --git a/src/runtime/memclr_386.s b/src/runtime/memclr_386.s
index 5e090ef09e..d2ef17f7ce 100644
--- a/src/runtime/memclr_386.s
+++ b/src/runtime/memclr_386.s
@@ -7,8 +7,6 @@
#include "go_asm.h"
#include "textflag.h"
-// NOTE: Windows externalthreadhandler expects memclr to preserve DX.
-
// See memclrNoHeapPointers Go doc for important implementation constraints.
// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
diff --git a/src/runtime/memclr_amd64.s b/src/runtime/memclr_amd64.s
index 37fe9745b1..5d2bebb901 100644
--- a/src/runtime/memclr_amd64.s
+++ b/src/runtime/memclr_amd64.s
@@ -7,14 +7,19 @@
#include "go_asm.h"
#include "textflag.h"
-// NOTE: Windows externalthreadhandler expects memclr to preserve DX.
-
// See memclrNoHeapPointers Go doc for important implementation constraints.
// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
-TEXT runtime·memclrNoHeapPointers(SB), NOSPLIT, $0-16
+// ABIInternal for performance.
+TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16
+#ifdef GOEXPERIMENT_regabiargs
+ // AX = ptr
+ // BX = n
+ MOVQ AX, DI // DI = ptr
+#else
MOVQ ptr+0(FP), DI
MOVQ n+8(FP), BX
+#endif
XORQ AX, AX
// MOVOU seems always faster than REP STOSQ.
@@ -31,7 +36,9 @@ tail:
JE _8
CMPQ BX, $16
JBE _9through16
- PXOR X0, X0
+#ifndef GOEXPERIMENT_regabig
+ PXOR X15, X15
+#endif
CMPQ BX, $32
JBE _17through32
CMPQ BX, $64
@@ -45,22 +52,22 @@ tail:
// TODO: for really big clears, use MOVNTDQ, even without AVX2.
loop:
- MOVOU X0, 0(DI)
- MOVOU X0, 16(DI)
- MOVOU X0, 32(DI)
- MOVOU X0, 48(DI)
- MOVOU X0, 64(DI)
- MOVOU X0, 80(DI)
- MOVOU X0, 96(DI)
- MOVOU X0, 112(DI)
- MOVOU X0, 128(DI)
- MOVOU X0, 144(DI)
- MOVOU X0, 160(DI)
- MOVOU X0, 176(DI)
- MOVOU X0, 192(DI)
- MOVOU X0, 208(DI)
- MOVOU X0, 224(DI)
- MOVOU X0, 240(DI)
+ MOVOU X15, 0(DI)
+ MOVOU X15, 16(DI)
+ MOVOU X15, 32(DI)
+ MOVOU X15, 48(DI)
+ MOVOU X15, 64(DI)
+ MOVOU X15, 80(DI)
+ MOVOU X15, 96(DI)
+ MOVOU X15, 112(DI)
+ MOVOU X15, 128(DI)
+ MOVOU X15, 144(DI)
+ MOVOU X15, 160(DI)
+ MOVOU X15, 176(DI)
+ MOVOU X15, 192(DI)
+ MOVOU X15, 208(DI)
+ MOVOU X15, 224(DI)
+ MOVOU X15, 240(DI)
SUBQ $256, BX
ADDQ $256, DI
CMPQ BX, $256
@@ -141,40 +148,40 @@ _9through16:
MOVQ AX, -8(DI)(BX*1)
RET
_17through32:
- MOVOU X0, (DI)
- MOVOU X0, -16(DI)(BX*1)
+ MOVOU X15, (DI)
+ MOVOU X15, -16(DI)(BX*1)
RET
_33through64:
- MOVOU X0, (DI)
- MOVOU X0, 16(DI)
- MOVOU X0, -32(DI)(BX*1)
- MOVOU X0, -16(DI)(BX*1)
+ MOVOU X15, (DI)
+ MOVOU X15, 16(DI)
+ MOVOU X15, -32(DI)(BX*1)
+ MOVOU X15, -16(DI)(BX*1)
RET
_65through128:
- MOVOU X0, (DI)
- MOVOU X0, 16(DI)
- MOVOU X0, 32(DI)
- MOVOU X0, 48(DI)
- MOVOU X0, -64(DI)(BX*1)
- MOVOU X0, -48(DI)(BX*1)
- MOVOU X0, -32(DI)(BX*1)
- MOVOU X0, -16(DI)(BX*1)
+ MOVOU X15, (DI)
+ MOVOU X15, 16(DI)
+ MOVOU X15, 32(DI)
+ MOVOU X15, 48(DI)
+ MOVOU X15, -64(DI)(BX*1)
+ MOVOU X15, -48(DI)(BX*1)
+ MOVOU X15, -32(DI)(BX*1)
+ MOVOU X15, -16(DI)(BX*1)
RET
_129through256:
- MOVOU X0, (DI)
- MOVOU X0, 16(DI)
- MOVOU X0, 32(DI)
- MOVOU X0, 48(DI)
- MOVOU X0, 64(DI)
- MOVOU X0, 80(DI)
- MOVOU X0, 96(DI)
- MOVOU X0, 112(DI)
- MOVOU X0, -128(DI)(BX*1)
- MOVOU X0, -112(DI)(BX*1)
- MOVOU X0, -96(DI)(BX*1)
- MOVOU X0, -80(DI)(BX*1)
- MOVOU X0, -64(DI)(BX*1)
- MOVOU X0, -48(DI)(BX*1)
- MOVOU X0, -32(DI)(BX*1)
- MOVOU X0, -16(DI)(BX*1)
+ MOVOU X15, (DI)
+ MOVOU X15, 16(DI)
+ MOVOU X15, 32(DI)
+ MOVOU X15, 48(DI)
+ MOVOU X15, 64(DI)
+ MOVOU X15, 80(DI)
+ MOVOU X15, 96(DI)
+ MOVOU X15, 112(DI)
+ MOVOU X15, -128(DI)(BX*1)
+ MOVOU X15, -112(DI)(BX*1)
+ MOVOU X15, -96(DI)(BX*1)
+ MOVOU X15, -80(DI)(BX*1)
+ MOVOU X15, -64(DI)(BX*1)
+ MOVOU X15, -48(DI)(BX*1)
+ MOVOU X15, -32(DI)(BX*1)
+ MOVOU X15, -16(DI)(BX*1)
RET
diff --git a/src/runtime/memmove_amd64.s b/src/runtime/memmove_amd64.s
index d91641a8e8..f1e3403596 100644
--- a/src/runtime/memmove_amd64.s
+++ b/src/runtime/memmove_amd64.s
@@ -31,11 +31,20 @@
// See memmove Go doc for important implementation constraints.
// func memmove(to, from unsafe.Pointer, n uintptr)
-TEXT runtime·memmove(SB), NOSPLIT, $0-24
-
+// ABIInternal for performance.
+TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT, $0-24
+#ifdef GOEXPERIMENT_regabiargs
+ // AX = to
+ // BX = from
+ // CX = n
+ MOVQ AX, DI
+ MOVQ BX, SI
+ MOVQ CX, BX
+#else
MOVQ to+0(FP), DI
MOVQ from+8(FP), SI
MOVQ n+16(FP), BX
+#endif
// REP instructions have a high startup cost, so we handle small sizes
// with some straightline code. The REP MOVSQ instruction is really fast
@@ -244,6 +253,10 @@ move_129through256:
MOVOU X13, -48(DI)(BX*1)
MOVOU X14, -32(DI)(BX*1)
MOVOU X15, -16(DI)(BX*1)
+#ifdef GOEXPERIMENT_regabig
+ // X15 must be zero on return
+ PXOR X15, X15
+#endif
RET
move_256through2048:
SUBQ $256, BX
@@ -283,6 +296,10 @@ move_256through2048:
LEAQ 256(SI), SI
LEAQ 256(DI), DI
JGE move_256through2048
+#ifdef GOEXPERIMENT_regabig
+ // X15 must be zero on return
+ PXOR X15, X15
+#endif
JMP tail
avxUnaligned:
diff --git a/src/runtime/memmove_ppc64x.s b/src/runtime/memmove_ppc64x.s
index edc6452bba..dbd835506f 100644
--- a/src/runtime/memmove_ppc64x.s
+++ b/src/runtime/memmove_ppc64x.s
@@ -157,7 +157,7 @@ backwardlargeloop:
backward32setup:
MOVD QWORDS, CTR // set up loop ctr
- MOVD $16, IDX16 // 32 bytes at at time
+ MOVD $16, IDX16 // 32 bytes at a time
backward32loop:
SUB $32, TGT
diff --git a/src/runtime/metrics.go b/src/runtime/metrics.go
index 3e8dbda0ca..ba0a920a5d 100644
--- a/src/runtime/metrics.go
+++ b/src/runtime/metrics.go
@@ -98,6 +98,20 @@ func initMetrics() {
}
},
},
+ "/gc/heap/allocs:bytes": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.heapStats.totalAllocated
+ },
+ },
+ "/gc/heap/allocs:objects": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.heapStats.totalAllocs
+ },
+ },
"/gc/heap/frees-by-size:bytes": {
deps: makeStatDepSet(heapStatsDep),
compute: func(in *statAggregate, out *metricValue) {
@@ -110,6 +124,20 @@ func initMetrics() {
}
},
},
+ "/gc/heap/frees:bytes": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.heapStats.totalFreed
+ },
+ },
+ "/gc/heap/frees:objects": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.heapStats.totalFrees
+ },
+ },
"/gc/heap/goal:bytes": {
deps: makeStatDepSet(sysStatsDep),
compute: func(in *statAggregate, out *metricValue) {
@@ -124,6 +152,13 @@ func initMetrics() {
out.scalar = in.heapStats.numObjects
},
},
+ "/gc/heap/tiny/allocs:objects": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = uint64(in.heapStats.tinyAllocCount)
+ },
+ },
"/gc/pauses:seconds": {
compute: func(_ *statAggregate, out *metricValue) {
hist := out.float64HistOrInit(timeHistBuckets)
@@ -245,6 +280,15 @@ func initMetrics() {
out.scalar = uint64(gcount())
},
},
+ "/sched/latencies:seconds": {
+ compute: func(_ *statAggregate, out *metricValue) {
+ hist := out.float64HistOrInit(timeHistBuckets)
+ hist.counts[0] = atomic.Load64(&sched.timeToRun.underflow)
+ for i := range sched.timeToRun.counts {
+ hist.counts[i+1] = atomic.Load64(&sched.timeToRun.counts[i])
+ }
+ },
+ },
}
metricsInit = true
}
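
These metrics surface through the public runtime/metrics API. A minimal reader for the new scheduling-latency histogram, assuming a runtime that includes this change (the sample's Kind is KindBad on older runtimes):

package main

import (
	"fmt"
	"runtime/metrics"
)

func main() {
	const name = "/sched/latencies:seconds"
	sample := []metrics.Sample{{Name: name}}
	metrics.Read(sample)
	if sample[0].Value.Kind() != metrics.KindFloat64Histogram {
		fmt.Println(name, "is not supported by this runtime")
		return
	}
	h := sample[0].Value.Float64Histogram()
	// Counts[i] covers [Buckets[i], Buckets[i+1]); len(Buckets) == len(Counts)+1.
	for i, c := range h.Counts {
		if c != 0 {
			fmt.Printf("[%g, %g): %d\n", h.Buckets[i], h.Buckets[i+1], c)
		}
	}
}
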
@@ -321,6 +365,22 @@ type heapStatsAggregate struct {
// numObjects is the number of live objects in the heap.
numObjects uint64
+
+ // totalAllocated is the total bytes of heap objects allocated
+ // over the lifetime of the program.
+ totalAllocated uint64
+
+ // totalFreed is the total bytes of heap objects freed
+ // over the lifetime of the program.
+ totalFreed uint64
+
+ // totalAllocs is the number of heap objects allocated over
+ // the lifetime of the program.
+ totalAllocs uint64
+
+ // totalFrees is the number of heap objects freed over
+ // the lifetime of the program.
+ totalFrees uint64
}
// compute populates the heapStatsAggregate with values from the runtime.
@@ -328,13 +388,20 @@ func (a *heapStatsAggregate) compute() {
memstats.heapStats.read(&a.heapStatsDelta)
// Calculate derived stats.
- a.inObjects = uint64(a.largeAlloc - a.largeFree)
- a.numObjects = uint64(a.largeAllocCount - a.largeFreeCount)
+ a.totalAllocs = uint64(a.largeAllocCount)
+ a.totalFrees = uint64(a.largeFreeCount)
+ a.totalAllocated = uint64(a.largeAlloc)
+ a.totalFreed = uint64(a.largeFree)
for i := range a.smallAllocCount {
- n := uint64(a.smallAllocCount[i] - a.smallFreeCount[i])
- a.inObjects += n * uint64(class_to_size[i])
- a.numObjects += n
+ na := uint64(a.smallAllocCount[i])
+ nf := uint64(a.smallFreeCount[i])
+ a.totalAllocs += na
+ a.totalFrees += nf
+ a.totalAllocated += na * uint64(class_to_size[i])
+ a.totalFreed += nf * uint64(class_to_size[i])
}
+ a.inObjects = a.totalAllocated - a.totalFreed
+ a.numObjects = a.totalAllocs - a.totalFrees
}
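
In equation form, the derivation above is (with Σ_i ranging over the small size classes):

	totalAllocated = largeAlloc      + Σ_i smallAllocCount[i] * class_to_size[i]
	totalFreed     = largeFree       + Σ_i smallFreeCount[i]  * class_to_size[i]
	totalAllocs    = largeAllocCount + Σ_i smallAllocCount[i]
	totalFrees     = largeFreeCount  + Σ_i smallFreeCount[i]
	inObjects      = totalAllocated - totalFreed
	numObjects     = totalAllocs    - totalFrees

so the live-heap figures previously computed directly are now byproducts of the new cumulative totals.
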
// sysStatsAggregate represents system memory stats obtained
@@ -364,7 +431,7 @@ func (a *sysStatsAggregate) compute() {
a.buckHashSys = memstats.buckhash_sys.load()
a.gcMiscSys = memstats.gcMiscSys.load()
a.otherSys = memstats.other_sys.load()
- a.heapGoal = atomic.Load64(&memstats.next_gc)
+ a.heapGoal = atomic.Load64(&gcController.heapGoal)
a.gcCyclesDone = uint64(memstats.numgc)
a.gcCyclesForced = uint64(memstats.numforcedgc)
@@ -481,7 +548,7 @@ func readMetrics(samplesp unsafe.Pointer, len int, cap int) {
// Acquire the metricsSema but with handoff. This operation
// is expensive enough that queueing up goroutines and handing
- // off between them will be noticably better-behaved.
+ // off between them will be noticeably better-behaved.
semacquire1(&metricsSema, true, 0, 0)
// Ensure the map is initialized.
diff --git a/src/runtime/metrics/description.go b/src/runtime/metrics/description.go
index 1175156104..c147cada89 100644
--- a/src/runtime/metrics/description.go
+++ b/src/runtime/metrics/description.go
@@ -70,18 +70,51 @@ var allDesc = []Description{
Cumulative: true,
},
{
- Name: "/gc/heap/allocs-by-size:bytes",
- Description: "Distribution of all objects allocated by approximate size.",
- Kind: KindFloat64Histogram,
+ Name: "/gc/heap/allocs-by-size:bytes",
+ Description: "Distribution of heap allocations by approximate size. " +
+ "Note that this does not include tiny objects as defined by " +
+ "/gc/heap/tiny/allocs:objects, only tiny blocks.",
+ Kind: KindFloat64Histogram,
+ Cumulative: true,
+ },
+ {
+ Name: "/gc/heap/allocs:bytes",
+ Description: "Cumulative sum of memory allocated to the heap by the application.",
+ Kind: KindUint64,
Cumulative: true,
},
{
- Name: "/gc/heap/frees-by-size:bytes",
- Description: "Distribution of all objects freed by approximate size.",
- Kind: KindFloat64Histogram,
+ Name: "/gc/heap/allocs:objects",
+ Description: "Cumulative count of heap allocations triggered by the application. " +
+ "Note that this does not include tiny objects as defined by " +
+ "/gc/heap/tiny/allocs:objects, only tiny blocks.",
+ Kind: KindUint64,
+ Cumulative: true,
+ },
+ {
+ Name: "/gc/heap/frees-by-size:bytes",
+ Description: "Distribution of freed heap allocations by approximate size. " +
+ "Note that this does not include tiny objects as defined by " +
+ "/gc/heap/tiny/allocs:objects, only tiny blocks.",
+ Kind: KindFloat64Histogram,
+ Cumulative: true,
+ },
+ {
+ Name: "/gc/heap/frees:bytes",
+ Description: "Cumulative sum of heap memory freed by the garbage collector.",
+ Kind: KindUint64,
Cumulative: true,
},
{
+ Name: "/gc/heap/frees:objects",
+ Description: "Cumulative count of heap allocations whose storage was freed " +
+ "by the garbage collector. " +
+ "Note that this does not include tiny objects as defined by " +
+ "/gc/heap/tiny/allocs:objects, only tiny blocks.",
+ Kind: KindUint64,
+ Cumulative: true,
+ },
+ {
Name: "/gc/heap/goal:bytes",
Description: "Heap size target for the end of the GC cycle.",
Kind: KindUint64,
@@ -92,6 +125,16 @@ var allDesc = []Description{
Kind: KindUint64,
},
{
+ Name: "/gc/heap/tiny/allocs:objects",
+ Description: "Count of small allocations that are packed together into blocks. " +
+ "These allocations are counted separately from other allocations " +
+ "because each individual allocation is not tracked by the runtime, " +
+ "only their block. Each block is already accounted for in " +
+ "allocs-by-size and frees-by-size.",
+ Kind: KindUint64,
+ Cumulative: true,
+ },
+ {
Name: "/gc/pauses:seconds",
Description: "Distribution individual GC-related stop-the-world pause latencies.",
Kind: KindFloat64Histogram,
@@ -176,6 +219,11 @@ var allDesc = []Description{
Description: "Count of live goroutines.",
Kind: KindUint64,
},
+ {
+ Name: "/sched/latencies:seconds",
+ Description: "Distribution of the time goroutines have spent in the scheduler in a runnable state before actually running.",
+ Kind: KindFloat64Histogram,
+ },
}
// All returns a slice containing metric descriptions for all supported metrics.
diff --git a/src/runtime/metrics/doc.go b/src/runtime/metrics/doc.go
index 7f790afc12..91ef03072d 100644
--- a/src/runtime/metrics/doc.go
+++ b/src/runtime/metrics/doc.go
@@ -61,10 +61,30 @@ Below is the full list of supported metrics, ordered lexicographically.
Count of all completed GC cycles.
/gc/heap/allocs-by-size:bytes
- Distribution of all objects allocated by approximate size.
+ Distribution of heap allocations by approximate size.
+ Note that this does not include tiny objects as defined by /gc/heap/tiny/allocs:objects,
+ only tiny blocks.
+
+ /gc/heap/allocs:bytes
+ Cumulative sum of memory allocated to the heap by the application.
+
+ /gc/heap/allocs:objects
+ Cumulative count of heap allocations triggered by the application.
+ Note that this does not include tiny objects as defined by /gc/heap/tiny/allocs:objects,
+ only tiny blocks.
/gc/heap/frees-by-size:bytes
- Distribution of all objects freed by approximate size.
+ Distribution of freed heap allocations by approximate size.
+ Note that this does not include tiny objects as defined by /gc/heap/tiny/allocs:objects,
+ only tiny blocks.
+
+ /gc/heap/frees:bytes
+ Cumulative sum of heap memory freed by the garbage collector.
+
+ /gc/heap/frees:objects
+ Cumulative count of heap allocations whose storage was freed by the garbage collector.
+ Note that this does not include tiny objects as defined by /gc/heap/tiny/allocs:objects,
+ only tiny blocks.
/gc/heap/goal:bytes
Heap size target for the end of the GC cycle.
@@ -72,6 +92,13 @@ Below is the full list of supported metrics, ordered lexicographically.
/gc/heap/objects:objects
Number of objects, live or unswept, occupying heap memory.
+ /gc/heap/tiny/allocs:objects
+ Count of small allocations that are packed together into blocks.
+ These allocations are counted separately from other allocations
+ because each individual allocation is not tracked by the runtime,
+ only their block. Each block is already accounted for in
+ allocs-by-size and frees-by-size.
+
/gc/pauses:seconds
Distribution of individual GC-related stop-the-world pause latencies.
@@ -139,5 +166,9 @@ Below is the full list of supported metrics, ordered lexicographically.
/sched/goroutines:goroutines
Count of live goroutines.
+
+ /sched/latencies:seconds
+ Distribution of the time goroutines have spent in the scheduler
+ in a runnable state before actually running.
*/
package metrics
diff --git a/src/runtime/metrics_test.go b/src/runtime/metrics_test.go
index 8a3cf019bd..5d32ef469c 100644
--- a/src/runtime/metrics_test.go
+++ b/src/runtime/metrics_test.go
@@ -40,6 +40,9 @@ func TestReadMetrics(t *testing.T) {
}
// Check to make sure the values we read line up with other values we read.
+ var allocsBySize *metrics.Float64Histogram
+ var tinyAllocs uint64
+ var mallocs, frees uint64
for i := range samples {
switch name := samples[i].Name; name {
case "/memory/classes/heap/free:bytes":
@@ -84,6 +87,9 @@ func TestReadMetrics(t *testing.T) {
t.Errorf("histogram counts do not much BySize for class %d: got %d, want %d", i, c, m)
}
}
+ allocsBySize = hist
+ case "/gc/heap/allocs:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.TotalAlloc)
case "/gc/heap/frees-by-size:bytes":
hist := samples[i].Value.Float64Histogram()
// Skip size class 0 in BySize, because it's always empty and not represented
@@ -95,9 +101,29 @@ func TestReadMetrics(t *testing.T) {
continue
}
if c, f := hist.Counts[i], sc.Frees; c != f {
- t.Errorf("histogram counts do not much BySize for class %d: got %d, want %d", i, c, f)
+ t.Errorf("histogram counts do not match BySize for class %d: got %d, want %d", i, c, f)
}
}
+ case "/gc/heap/frees:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.TotalAlloc-mstats.HeapAlloc)
+ case "/gc/heap/tiny/allocs:objects":
+ // Currently, MemStats adds tiny alloc count to both Mallocs AND Frees.
+ // The reason is that MemStats couldn't be extended at the time,
+ // but there was a desire to have Mallocs at least be a little more representative,
+ // while having Mallocs - Frees still represent a live object count.
+ // Unfortunately, MemStats doesn't actually export a large allocation count,
+ // so it's impossible to pull this number out directly.
+ //
+ // Check tiny allocation count outside of this loop, by using the allocs-by-size
+ // histogram in order to figure out how many non-tiny allocations there are.
+ tinyAllocs = samples[i].Value.Uint64()
+ // Because the next two metrics tests are checking against Mallocs and Frees,
+ // we can't check them directly for the same reason: we need to account for tiny
+ // allocations included in Mallocs and Frees.
+ case "/gc/heap/allocs:objects":
+ mallocs = samples[i].Value.Uint64()
+ case "/gc/heap/frees:objects":
+ frees = samples[i].Value.Uint64()
case "/gc/heap/objects:objects":
checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapObjects)
case "/gc/heap/goal:bytes":
@@ -110,6 +136,17 @@ func TestReadMetrics(t *testing.T) {
checkUint64(t, name, samples[i].Value.Uint64(), uint64(mstats.NumGC))
}
}
+
+ // Check tinyAllocs.
+ nonTinyAllocs := uint64(0)
+ for _, c := range allocsBySize.Counts {
+ nonTinyAllocs += c
+ }
+ checkUint64(t, "/gc/heap/tiny/allocs:objects", tinyAllocs, mstats.Mallocs-nonTinyAllocs)
+
+ // Check allocation and free counts.
+ checkUint64(t, "/gc/heap/allocs:objects", mallocs, mstats.Mallocs-tinyAllocs)
+ checkUint64(t, "/gc/heap/frees:objects", frees, mstats.Frees-tinyAllocs)
}
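
Spelled out, the accounting the comments above describe is that MemStats folds tiny allocations into both Mallocs and Frees, while the new metrics count tiny allocations separately from blocks:

	tiny/allocs:objects = mstats.Mallocs - Σ allocs-by-size counts
	allocs:objects      = mstats.Mallocs - tiny/allocs:objects
	frees:objects       = mstats.Frees   - tiny/allocs:objects

which is exactly what the three checkUint64 calls verify.
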
func TestReadMetricsConsistency(t *testing.T) {
@@ -132,8 +169,10 @@ func TestReadMetricsConsistency(t *testing.T) {
got, want uint64
}
var objects struct {
- alloc, free *metrics.Float64Histogram
- total uint64
+ alloc, free *metrics.Float64Histogram
+ allocs, frees uint64
+ allocdBytes, freedBytes uint64
+ total, totalBytes uint64
}
var gc struct {
numGC uint64
@@ -159,10 +198,20 @@ func TestReadMetricsConsistency(t *testing.T) {
switch samples[i].Name {
case "/memory/classes/total:bytes":
totalVirtual.got = samples[i].Value.Uint64()
+ case "/memory/classes/heap/objects:bytes":
+ objects.totalBytes = samples[i].Value.Uint64()
case "/gc/heap/objects:objects":
objects.total = samples[i].Value.Uint64()
+ case "/gc/heap/allocs:bytes":
+ objects.allocdBytes = samples[i].Value.Uint64()
+ case "/gc/heap/allocs:objects":
+ objects.allocs = samples[i].Value.Uint64()
case "/gc/heap/allocs-by-size:bytes":
objects.alloc = samples[i].Value.Float64Histogram()
+ case "/gc/heap/frees:bytes":
+ objects.freedBytes = samples[i].Value.Uint64()
+ case "/gc/heap/frees:objects":
+ objects.frees = samples[i].Value.Uint64()
case "/gc/heap/frees-by-size:bytes":
objects.free = samples[i].Value.Float64Histogram()
case "/gc/cycles:gc-cycles":
@@ -182,6 +231,12 @@ func TestReadMetricsConsistency(t *testing.T) {
if totalVirtual.got != totalVirtual.want {
t.Errorf(`"/memory/classes/total:bytes" does not match sum of /memory/classes/**: got %d, want %d`, totalVirtual.got, totalVirtual.want)
}
+ if got, want := objects.allocs-objects.frees, objects.total; got != want {
+ t.Errorf("mismatch between object alloc/free tallies and total: got %d, want %d", got, want)
+ }
+ if got, want := objects.allocdBytes-objects.freedBytes, objects.totalBytes; got != want {
+ t.Errorf("mismatch between object alloc/free tallies and total: got %d, want %d", got, want)
+ }
if b, c := len(objects.alloc.Buckets), len(objects.alloc.Counts); b != c+1 {
t.Errorf("allocs-by-size has wrong bucket or counts length: %d buckets, %d counts", b, c)
}
@@ -201,17 +256,25 @@ func TestReadMetricsConsistency(t *testing.T) {
}
}
if !t.Failed() {
- got, want := uint64(0), objects.total
+ var gotAlloc, gotFree uint64
+ want := objects.total
for i := range objects.alloc.Counts {
if objects.alloc.Counts[i] < objects.free.Counts[i] {
t.Errorf("found more allocs than frees in object dist bucket %d", i)
continue
}
- got += objects.alloc.Counts[i] - objects.free.Counts[i]
+ gotAlloc += objects.alloc.Counts[i]
+ gotFree += objects.free.Counts[i]
}
- if got != want {
+ if got := gotAlloc - gotFree; got != want {
t.Errorf("object distribution counts don't match count of live objects: got %d, want %d", got, want)
}
+ if gotAlloc != objects.allocs {
+ t.Errorf("object distribution counts don't match total allocs: got %d, want %d", gotAlloc, objects.allocs)
+ }
+ if gotFree != objects.frees {
+ t.Errorf("object distribution counts don't match total allocs: got %d, want %d", gotFree, objects.frees)
+ }
}
}
// The current GC has at least 2 pauses per GC.
diff --git a/src/runtime/mfinal.go b/src/runtime/mfinal.go
index 7d0313be12..fd318d49a8 100644
--- a/src/runtime/mfinal.go
+++ b/src/runtime/mfinal.go
@@ -163,6 +163,7 @@ func runfinq() {
var (
frame unsafe.Pointer
framecap uintptr
+ argRegs int
)
for {
@@ -176,6 +177,7 @@ func runfinq() {
goparkunlock(&finlock, waitReasonFinalizerWait, traceEvGoBlock, 1)
continue
}
+ argRegs = intArgRegs
unlock(&finlock)
if raceenabled {
racefingo()
@@ -184,7 +186,22 @@ func runfinq() {
for i := fb.cnt; i > 0; i-- {
f := &fb.fin[i-1]
- framesz := unsafe.Sizeof((interface{})(nil)) + f.nret
+ var regs abi.RegArgs
+ var framesz uintptr
+ if argRegs > 0 {
+ // The args can always be passed in registers if they're
+ // available, because every platform we support either has no
+ // argument registers at all or has more than two.
+ //
+ // But unfortunately, a finalizer can have an arbitrary
+ // number of results, and it would be complex to figure out
+ // how many of those could be passed in registers, so
+ // conservatively assume none of them are.
+ framesz = f.nret
+ } else {
+ // Need to pass arguments on the stack too.
+ framesz = unsafe.Sizeof((interface{})(nil)) + f.nret
+ }
if framecap < framesz {
// The frame does not contain pointers interesting for GC,
// all not yet finalized objects are stored in finq.
@@ -197,33 +214,34 @@ func runfinq() {
if f.fint == nil {
throw("missing type in runfinq")
}
- // frame is effectively uninitialized
- // memory. That means we have to clear
- // it before writing to it to avoid
- // confusing the write barrier.
- *(*[2]uintptr)(frame) = [2]uintptr{}
+ r := frame
+ if argRegs > 0 {
+ r = unsafe.Pointer(&regs.Ints)
+ } else {
+ // frame is effectively uninitialized
+ // memory. That means we have to clear
+ // it before writing to it to avoid
+ // confusing the write barrier.
+ *(*[2]uintptr)(frame) = [2]uintptr{}
+ }
switch f.fint.kind & kindMask {
case kindPtr:
// direct use of pointer
- *(*unsafe.Pointer)(frame) = f.arg
+ *(*unsafe.Pointer)(r) = f.arg
case kindInterface:
ityp := (*interfacetype)(unsafe.Pointer(f.fint))
// set up with empty interface
- (*eface)(frame)._type = &f.ot.typ
- (*eface)(frame).data = f.arg
+ (*eface)(r)._type = &f.ot.typ
+ (*eface)(r).data = f.arg
if len(ityp.mhdr) != 0 {
// convert to interface with methods
// this conversion is guaranteed to succeed - we checked in SetFinalizer
- *(*iface)(frame) = assertE2I(ityp, *(*eface)(frame))
+ (*iface)(r).tab = assertE2I(ityp, (*eface)(r)._type)
}
default:
throw("bad kind in runfinq")
}
fingRunning = true
- // Pass a dummy RegArgs for now.
- //
- // TODO(mknyszek): Pass arguments in registers.
- var regs abi.RegArgs
reflectcall(nil, unsafe.Pointer(f.fn), frame, uint32(framesz), uint32(framesz), uint32(framesz), &regs)
fingRunning = false
@@ -403,7 +421,7 @@ func SetFinalizer(obj interface{}, finalizer interface{}) {
// ok - satisfies empty interface
goto okarg
}
- if _, ok := assertE2I2(ityp, *efaceOf(&obj)); ok {
+ if iface := assertE2I2(ityp, *efaceOf(&obj)); iface.tab != nil {
goto okarg
}
}
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index 185d3201ca..4585663535 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -113,8 +113,8 @@
// Next GC is after we've allocated an extra amount of memory proportional to
// the amount already in use. The proportion is controlled by GOGC environment variable
// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
-// (this mark is tracked in next_gc variable). This keeps the GC cost in linear
-// proportion to the allocation cost. Adjusting GOGC just changes the linear constant
+// (this mark is tracked in the gcController.heapGoal variable). This keeps the GC cost in
+// linear proportion to the allocation cost. Adjusting GOGC just changes the linear constant
// (and also the amount of extra memory used).
// Oblets
@@ -149,45 +149,16 @@ const (
sweepMinHeapDistance = 1024 * 1024
)
-// heapminimum is the minimum heap size at which to trigger GC.
-// For small heaps, this overrides the usual GOGC*live set rule.
-//
-// When there is a very small live set but a lot of allocation, simply
-// collecting when the heap reaches GOGC*live results in many GC
-// cycles and high total per-GC overhead. This minimum amortizes this
-// per-GC overhead while keeping the heap reasonably small.
-//
-// During initialization this is set to 4MB*GOGC/100. In the case of
-// GOGC==0, this will set heapminimum to 0, resulting in constant
-// collection even when the heap size is small, which is useful for
-// debugging.
-var heapminimum uint64 = defaultHeapMinimum
-
-// defaultHeapMinimum is the value of heapminimum for GOGC==100.
-const defaultHeapMinimum = 4 << 20
-
-// Initialized from $GOGC. GOGC=off means no GC.
-var gcpercent int32
-
func gcinit() {
if unsafe.Sizeof(workbuf{}) != _WorkbufSize {
throw("size of Workbuf is suboptimal")
}
-
// No sweep on the first cycle.
- mheap_.sweepdone = 1
-
- // Set a reasonable initial GC trigger.
- memstats.triggerRatio = 7 / 8.0
+ mheap_.sweepDrained = 1
- // Fake a heap_marked value so it looks like a trigger at
- // heapminimum is the appropriate growth from heap_marked.
- // This will go into computing the initial GC goal.
- memstats.heap_marked = uint64(float64(heapminimum) / (1 + memstats.triggerRatio))
-
- // Set gcpercent from the environment. This will also compute
- // and set the GC trigger and goal.
- _ = setGCPercent(readgogc())
+ // Initialize GC pacer state.
+ // Use the environment variable GOGC for the initial gcPercent value.
+ gcController.init(readGOGC())
work.startSema = 1
work.markDoneSema = 1
@@ -196,16 +167,9 @@ func gcinit() {
lockInit(&work.wbufSpans.lock, lockRankWbufSpans)
}
-func readgogc() int32 {
- p := gogetenv("GOGC")
- if p == "off" {
- return -1
- }
- if n, ok := atoi32(p); ok {
- return n
- }
- return 100
-}
+// Temporary in order to enable register ABI work.
+// TODO(register args): convert back to local chan in gcenable, passed to "go" stmts.
+var gcenable_setup chan int
// gcenable is called after the bulk of the runtime initialization,
// just before we're about to start letting user code run.
@@ -213,39 +177,15 @@ func readgogc() int32 {
// scavenger goroutine, and enables GC.
func gcenable() {
// Kick off sweeping and scavenging.
- c := make(chan int, 2)
- go bgsweep(c)
- go bgscavenge(c)
- <-c
- <-c
+ gcenable_setup = make(chan int, 2)
+ go bgsweep()
+ go bgscavenge()
+ <-gcenable_setup
+ <-gcenable_setup
+ gcenable_setup = nil
memstats.enablegc = true // now that runtime is initialized, GC is okay
}
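
The channel here is a plain startup handshake: each background goroutine sends once when it is up, and gcenable blocks until both sends arrive. The same pattern in isolation (the worker bodies are placeholders):

package main

import "fmt"

func main() {
	ready := make(chan int, 2)
	go func() { fmt.Println("sweeper up"); ready <- 1 }()
	go func() { fmt.Println("scavenger up"); ready <- 1 }()
	<-ready
	<-ready
	fmt.Println("both background workers are running")
}
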
-//go:linkname setGCPercent runtime/debug.setGCPercent
-func setGCPercent(in int32) (out int32) {
- // Run on the system stack since we grab the heap lock.
- systemstack(func() {
- lock(&mheap_.lock)
- out = gcpercent
- if in < 0 {
- in = -1
- }
- gcpercent = in
- heapminimum = defaultHeapMinimum * uint64(gcpercent) / 100
- // Update pacing in response to gcpercent change.
- gcSetTriggerRatio(memstats.triggerRatio)
- unlock(&mheap_.lock)
- })
-
- // If we just disabled GC, wait for any concurrent GC mark to
- // finish so we always return with no GC running.
- if in < 0 {
- gcWaitOnMark(atomic.Load(&work.cycles))
- }
-
- return out
-}
-
// Garbage collector phase.
// Indicates to write barrier and synchronization task to perform.
var gcphase uint32
@@ -302,9 +242,11 @@ const (
// gcMarkWorkerFractionalMode indicates that a P is currently
// running the "fractional" mark worker. The fractional worker
// is necessary when GOMAXPROCS*gcBackgroundUtilization is not
- // an integer. The fractional worker should run until it is
- // preempted and will be scheduled to pick up the fractional
- // part of GOMAXPROCS*gcBackgroundUtilization.
+ // an integer and using only dedicated workers would result in
+ // utilization too far from the target of gcBackgroundUtilization.
+ // The fractional worker should run until it is preempted and
+ // will be scheduled to pick up the fractional part of
+ // GOMAXPROCS*gcBackgroundUtilization.
gcMarkWorkerFractionalMode
// gcMarkWorkerIdleMode indicates that a P is running the mark
@@ -323,474 +265,6 @@ var gcMarkWorkerModeStrings = [...]string{
"GC (idle)",
}
-// gcController implements the GC pacing controller that determines
-// when to trigger concurrent garbage collection and how much marking
-// work to do in mutator assists and background marking.
-//
-// It uses a feedback control algorithm to adjust the memstats.gc_trigger
-// trigger based on the heap growth and GC CPU utilization each cycle.
-// This algorithm optimizes for heap growth to match GOGC and for CPU
-// utilization between assist and background marking to be 25% of
-// GOMAXPROCS. The high-level design of this algorithm is documented
-// at https://golang.org/s/go15gcpacing.
-//
-// All fields of gcController are used only during a single mark
-// cycle.
-var gcController gcControllerState
-
-type gcControllerState struct {
- // scanWork is the total scan work performed this cycle. This
- // is updated atomically during the cycle. Updates occur in
- // bounded batches, since it is both written and read
- // throughout the cycle. At the end of the cycle, this is how
- // much of the retained heap is scannable.
- //
- // Currently this is the bytes of heap scanned. For most uses,
- // this is an opaque unit of work, but for estimation the
- // definition is important.
- scanWork int64
-
- // bgScanCredit is the scan work credit accumulated by the
- // concurrent background scan. This credit is accumulated by
- // the background scan and stolen by mutator assists. This is
- // updated atomically. Updates occur in bounded batches, since
- // it is both written and read throughout the cycle.
- bgScanCredit int64
-
- // assistTime is the nanoseconds spent in mutator assists
- // during this cycle. This is updated atomically. Updates
- // occur in bounded batches, since it is both written and read
- // throughout the cycle.
- assistTime int64
-
- // dedicatedMarkTime is the nanoseconds spent in dedicated
- // mark workers during this cycle. This is updated atomically
- // at the end of the concurrent mark phase.
- dedicatedMarkTime int64
-
- // fractionalMarkTime is the nanoseconds spent in the
- // fractional mark worker during this cycle. This is updated
- // atomically throughout the cycle and will be up-to-date if
- // the fractional mark worker is not currently running.
- fractionalMarkTime int64
-
- // idleMarkTime is the nanoseconds spent in idle marking
- // during this cycle. This is updated atomically throughout
- // the cycle.
- idleMarkTime int64
-
- // markStartTime is the absolute start time in nanoseconds
- // that assists and background mark workers started.
- markStartTime int64
-
- // dedicatedMarkWorkersNeeded is the number of dedicated mark
- // workers that need to be started. This is computed at the
- // beginning of each cycle and decremented atomically as
- // dedicated mark workers get started.
- dedicatedMarkWorkersNeeded int64
-
- // assistWorkPerByte is the ratio of scan work to allocated
- // bytes that should be performed by mutator assists. This is
- // computed at the beginning of each cycle and updated every
- // time heap_scan is updated.
- //
- // Stored as a uint64, but it's actually a float64. Use
- // float64frombits to get the value.
- //
- // Read and written atomically.
- assistWorkPerByte uint64
-
- // assistBytesPerWork is 1/assistWorkPerByte.
- //
- // Stored as a uint64, but it's actually a float64. Use
- // float64frombits to get the value.
- //
- // Read and written atomically.
- //
- // Note that because this is read and written independently
- // from assistWorkPerByte users may notice a skew between
- // the two values, and such a state should be safe.
- assistBytesPerWork uint64
-
- // fractionalUtilizationGoal is the fraction of wall clock
- // time that should be spent in the fractional mark worker on
- // each P that isn't running a dedicated worker.
- //
- // For example, if the utilization goal is 25% and there are
- // no dedicated workers, this will be 0.25. If the goal is
- // 25%, there is one dedicated worker, and GOMAXPROCS is 5,
- // this will be 0.05 to make up the missing 5%.
- //
- // If this is zero, no fractional workers are needed.
- fractionalUtilizationGoal float64
-
- _ cpu.CacheLinePad
-}
-
-// startCycle resets the GC controller's state and computes estimates
-// for a new GC cycle. The caller must hold worldsema and the world
-// must be stopped.
-func (c *gcControllerState) startCycle() {
- c.scanWork = 0
- c.bgScanCredit = 0
- c.assistTime = 0
- c.dedicatedMarkTime = 0
- c.fractionalMarkTime = 0
- c.idleMarkTime = 0
-
- // Ensure that the heap goal is at least a little larger than
- // the current live heap size. This may not be the case if GC
- // start is delayed or if the allocation that pushed heap_live
- // over gc_trigger is large or if the trigger is really close to
- // GOGC. Assist is proportional to this distance, so enforce a
- // minimum distance, even if it means going over the GOGC goal
- // by a tiny bit.
- if memstats.next_gc < memstats.heap_live+1024*1024 {
- memstats.next_gc = memstats.heap_live + 1024*1024
- }
-
- // Compute the background mark utilization goal. In general,
- // this may not come out exactly. We round the number of
- // dedicated workers so that the utilization is closest to
- // 25%. For small GOMAXPROCS, this would introduce too much
- // error, so we add fractional workers in that case.
- totalUtilizationGoal := float64(gomaxprocs) * gcBackgroundUtilization
- c.dedicatedMarkWorkersNeeded = int64(totalUtilizationGoal + 0.5)
- utilError := float64(c.dedicatedMarkWorkersNeeded)/totalUtilizationGoal - 1
- const maxUtilError = 0.3
- if utilError < -maxUtilError || utilError > maxUtilError {
- // Rounding put us more than 30% off our goal. With
- // gcBackgroundUtilization of 25%, this happens for
- // GOMAXPROCS<=3 or GOMAXPROCS=6. Enable fractional
- // workers to compensate.
- if float64(c.dedicatedMarkWorkersNeeded) > totalUtilizationGoal {
- // Too many dedicated workers.
- c.dedicatedMarkWorkersNeeded--
- }
- c.fractionalUtilizationGoal = (totalUtilizationGoal - float64(c.dedicatedMarkWorkersNeeded)) / float64(gomaxprocs)
- } else {
- c.fractionalUtilizationGoal = 0
- }
-
- // In STW mode, we just want dedicated workers.
- if debug.gcstoptheworld > 0 {
- c.dedicatedMarkWorkersNeeded = int64(gomaxprocs)
- c.fractionalUtilizationGoal = 0
- }
-
- // Clear per-P state
- for _, p := range allp {
- p.gcAssistTime = 0
- p.gcFractionalMarkTime = 0
- }
-
- // Compute initial values for controls that are updated
- // throughout the cycle.
- c.revise()
-
- if debug.gcpacertrace > 0 {
- assistRatio := float64frombits(atomic.Load64(&c.assistWorkPerByte))
- print("pacer: assist ratio=", assistRatio,
- " (scan ", memstats.heap_scan>>20, " MB in ",
- work.initialHeapLive>>20, "->",
- memstats.next_gc>>20, " MB)",
- " workers=", c.dedicatedMarkWorkersNeeded,
- "+", c.fractionalUtilizationGoal, "\n")
- }
-}
-
-// revise updates the assist ratio during the GC cycle to account for
-// improved estimates. This should be called whenever memstats.heap_scan,
-// memstats.heap_live, or memstats.next_gc is updated. It is safe to
-// call concurrently, but it may race with other calls to revise.
-//
-// The result of this race is that the two assist ratio values may not line
-// up or may be stale. In practice this is OK because the assist ratio
-// moves slowly throughout a GC cycle, and the assist ratio is a best-effort
-// heuristic anyway. Furthermore, no part of the heuristic depends on
-// the two assist ratio values being exact reciprocals of one another, since
-// the two values are used to convert values from different sources.
-//
-// The worst case result of this raciness is that we may miss a larger shift
-// in the ratio (say, if we decide to pace more aggressively against the
-// hard heap goal) but even this "hard goal" is best-effort (see #40460).
-// The dedicated GC should ensure we don't exceed the hard goal by too much
-// in the rare case we do exceed it.
-//
-// It should only be called when gcBlackenEnabled != 0 (because this
-// is when assists are enabled and the necessary statistics are
-// available).
-func (c *gcControllerState) revise() {
- gcpercent := gcpercent
- if gcpercent < 0 {
- // If GC is disabled but we're running a forced GC,
- // act like GOGC is huge for the below calculations.
- gcpercent = 100000
- }
- live := atomic.Load64(&memstats.heap_live)
- scan := atomic.Load64(&memstats.heap_scan)
- work := atomic.Loadint64(&c.scanWork)
-
- // Assume we're under the soft goal. Pace GC to complete at
- // next_gc assuming the heap is in steady-state.
- heapGoal := int64(atomic.Load64(&memstats.next_gc))
-
- // Compute the expected scan work remaining.
- //
- // This is estimated based on the expected
- // steady-state scannable heap. For example, with
- // GOGC=100, only half of the scannable heap is
- // expected to be live, so that's what we target.
- //
- // (This is a float calculation to avoid overflowing on
- // 100*heap_scan.)
- scanWorkExpected := int64(float64(scan) * 100 / float64(100+gcpercent))
-
- if int64(live) > heapGoal || work > scanWorkExpected {
- // We're past the soft goal, or we've already done more scan
- // work than we expected. Pace GC so that in the worst case it
- // will complete by the hard goal.
- const maxOvershoot = 1.1
- heapGoal = int64(float64(heapGoal) * maxOvershoot)
-
- // Compute the upper bound on the scan work remaining.
- scanWorkExpected = int64(scan)
- }
-
- // Compute the remaining scan work estimate.
- //
- // Note that we currently count allocations during GC as both
- // scannable heap (heap_scan) and scan work completed
- // (scanWork), so allocation will change this difference
- // slowly in the soft regime and not at all in the hard
- // regime.
- scanWorkRemaining := scanWorkExpected - work
- if scanWorkRemaining < 1000 {
- // We set a somewhat arbitrary lower bound on
- // remaining scan work since if we aim a little high,
- // we can miss by a little.
- //
- // We *do* need to enforce that this is at least 1,
- // since marking is racy and double-scanning objects
- // may legitimately make the remaining scan work
- // negative, even in the hard goal regime.
- scanWorkRemaining = 1000
- }
-
- // Compute the heap distance remaining.
- heapRemaining := heapGoal - int64(live)
- if heapRemaining <= 0 {
- // This shouldn't happen, but if it does, avoid
- // dividing by zero or setting the assist negative.
- heapRemaining = 1
- }
-
-	// Compute the mutator assist ratio so that by the time the mutator
- // allocates the remaining heap bytes up to next_gc, it will
- // have done (or stolen) the remaining amount of scan work.
- // Note that the assist ratio values are updated atomically
- // but not together. This means there may be some degree of
- // skew between the two values. This is generally OK as the
- // values shift relatively slowly over the course of a GC
- // cycle.
- assistWorkPerByte := float64(scanWorkRemaining) / float64(heapRemaining)
- assistBytesPerWork := float64(heapRemaining) / float64(scanWorkRemaining)
- atomic.Store64(&c.assistWorkPerByte, float64bits(assistWorkPerByte))
- atomic.Store64(&c.assistBytesPerWork, float64bits(assistBytesPerWork))
-}
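
The pacing math in revise can be isolated into a small sketch (assistRatios is an illustrative name; hypothetical byte counts stand in for the live statistics that the real code loads atomically):

    package main

    import "fmt"

    // assistRatios mirrors the arithmetic in revise: estimate the scan
    // work left and divide it over the heap headroom to the goal.
    func assistRatios(live, scan, goal, done, gcPercent int64) (workPerByte, bytesPerWork float64) {
        // Steady-state expectation: with GOGC=100 only half the
        // scannable heap should be live, so target scan*100/(100+GOGC).
        expected := int64(float64(scan) * 100 / float64(100+gcPercent))
        if live > goal || done > expected {
            goal = int64(float64(goal) * 1.1) // pace against the hard goal
            expected = scan
        }
        remaining := expected - done
        if remaining < 1000 {
            remaining = 1000 // arbitrary floor, as in revise
        }
        heapRemaining := goal - live
        if heapRemaining <= 0 {
            heapRemaining = 1 // avoid dividing by zero
        }
        return float64(remaining) / float64(heapRemaining), float64(heapRemaining) / float64(remaining)
    }

    func main() {
        // 64 MB live of an 80 MB goal, 40 MB scannable, 10 MB scanned:
        // 10 MB of expected work left against 16 MB of headroom.
        fmt.Println(assistRatios(64<<20, 40<<20, 80<<20, 10<<20, 100)) // 0.625 1.6
    }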
-
-// endCycle computes the trigger ratio for the next cycle.
-func (c *gcControllerState) endCycle() float64 {
- if work.userForced {
- // Forced GC means this cycle didn't start at the
- // trigger, so where it finished isn't good
- // information about how to adjust the trigger.
- // Just leave it where it is.
- return memstats.triggerRatio
- }
-
- // Proportional response gain for the trigger controller. Must
- // be in [0, 1]. Lower values smooth out transient effects but
- // take longer to respond to phase changes. Higher values
- // react to phase changes quickly, but are more affected by
- // transient changes. Values near 1 may be unstable.
- const triggerGain = 0.5
-
- // Compute next cycle trigger ratio. First, this computes the
- // "error" for this cycle; that is, how far off the trigger
- // was from what it should have been, accounting for both heap
- // growth and GC CPU utilization. We compute the actual heap
- // growth during this cycle and scale that by how far off from
- // the goal CPU utilization we were (to estimate the heap
- // growth if we had the desired CPU utilization). The
- // difference between this estimate and the GOGC-based goal
- // heap growth is the error.
- goalGrowthRatio := gcEffectiveGrowthRatio()
- actualGrowthRatio := float64(memstats.heap_live)/float64(memstats.heap_marked) - 1
- assistDuration := nanotime() - c.markStartTime
-
- // Assume background mark hit its utilization goal.
- utilization := gcBackgroundUtilization
- // Add assist utilization; avoid divide by zero.
- if assistDuration > 0 {
- utilization += float64(c.assistTime) / float64(assistDuration*int64(gomaxprocs))
- }
-
- triggerError := goalGrowthRatio - memstats.triggerRatio - utilization/gcGoalUtilization*(actualGrowthRatio-memstats.triggerRatio)
-
- // Finally, we adjust the trigger for next time by this error,
- // damped by the proportional gain.
- triggerRatio := memstats.triggerRatio + triggerGain*triggerError
-
- if debug.gcpacertrace > 0 {
- // Print controller state in terms of the design
- // document.
- H_m_prev := memstats.heap_marked
- h_t := memstats.triggerRatio
- H_T := memstats.gc_trigger
- h_a := actualGrowthRatio
- H_a := memstats.heap_live
- h_g := goalGrowthRatio
- H_g := int64(float64(H_m_prev) * (1 + h_g))
- u_a := utilization
- u_g := gcGoalUtilization
- W_a := c.scanWork
- print("pacer: H_m_prev=", H_m_prev,
- " h_t=", h_t, " H_T=", H_T,
- " h_a=", h_a, " H_a=", H_a,
- " h_g=", h_g, " H_g=", H_g,
- " u_a=", u_a, " u_g=", u_g,
- " W_a=", W_a,
- " goalΔ=", goalGrowthRatio-h_t,
- " actualΔ=", h_a-h_t,
- " u_a/u_g=", u_a/u_g,
- "\n")
- }
-
- return triggerRatio
-}
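
One step of this proportional controller, with made-up cycle statistics (a sketch, not the runtime's bookkeeping; nextTriggerRatio is an illustrative name):

    package main

    import "fmt"

    // nextTriggerRatio applies endCycle's update rule: scale the actual
    // heap growth by how far off the CPU goal we were, compare against
    // the GOGC-based goal growth, and damp the correction by the gain.
    func nextTriggerRatio(trigger, goalGrowth, actualGrowth, utilization float64) float64 {
        const gcGoalUtilization = 0.30
        const triggerGain = 0.5
        err := goalGrowth - trigger - utilization/gcGoalUtilization*(actualGrowth-trigger)
        return trigger + triggerGain*err
    }

    func main() {
        // Trigger at 0.7 of marked heap, GOGC goal growth 1.0, heap
        // actually grew 0.9 past the mark at 35% utilization: the error
        // is 1.0 - 0.7 - (0.35/0.30)*(0.9-0.7) ≈ 0.067, so the trigger
        // nudges up to ≈ 0.733.
        fmt.Println(nextTriggerRatio(0.7, 1.0, 0.9, 0.35))
    }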
-
-// enlistWorker encourages another dedicated mark worker to start on
-// another P if there are spare worker slots. It is used by putfull
-// when more work is made available.
-//
-//go:nowritebarrier
-func (c *gcControllerState) enlistWorker() {
- // If there are idle Ps, wake one so it will run an idle worker.
- // NOTE: This is suspected of causing deadlocks. See golang.org/issue/19112.
- //
- // if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 {
- // wakep()
- // return
- // }
-
- // There are no idle Ps. If we need more dedicated workers,
- // try to preempt a running P so it will switch to a worker.
- if c.dedicatedMarkWorkersNeeded <= 0 {
- return
- }
- // Pick a random other P to preempt.
- if gomaxprocs <= 1 {
- return
- }
- gp := getg()
- if gp == nil || gp.m == nil || gp.m.p == 0 {
- return
- }
- myID := gp.m.p.ptr().id
- for tries := 0; tries < 5; tries++ {
- id := int32(fastrandn(uint32(gomaxprocs - 1)))
- if id >= myID {
- id++
- }
- p := allp[id]
- if p.status != _Prunning {
- continue
- }
- if preemptone(p) {
- return
- }
- }
-}
-
-// findRunnableGCWorker returns a background mark worker for _p_ if it
-// should be run. This must only be called when gcBlackenEnabled != 0.
-func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
- if gcBlackenEnabled == 0 {
- throw("gcControllerState.findRunnable: blackening not enabled")
- }
-
- if !gcMarkWorkAvailable(_p_) {
- // No work to be done right now. This can happen at
- // the end of the mark phase when there are still
- // assists tapering off. Don't bother running a worker
- // now because it'll just return immediately.
- return nil
- }
-
- // Grab a worker before we commit to running below.
- node := (*gcBgMarkWorkerNode)(gcBgMarkWorkerPool.pop())
- if node == nil {
- // There is at least one worker per P, so normally there are
- // enough workers to run on all Ps, if necessary. However, once
- // a worker enters gcMarkDone it may park without rejoining the
- // pool, thus freeing a P with no corresponding worker.
- // gcMarkDone never depends on another worker doing work, so it
- // is safe to simply do nothing here.
- //
- // If gcMarkDone bails out without completing the mark phase,
- // it will always do so with queued global work. Thus, that P
- // will be immediately eligible to re-run the worker G it was
- // just using, ensuring work can complete.
- return nil
- }
-
- decIfPositive := func(ptr *int64) bool {
- for {
- v := atomic.Loadint64(ptr)
- if v <= 0 {
- return false
- }
-
- // TODO: having atomic.Casint64 would be more pleasant.
- if atomic.Cas64((*uint64)(unsafe.Pointer(ptr)), uint64(v), uint64(v-1)) {
- return true
- }
- }
- }
-
- if decIfPositive(&c.dedicatedMarkWorkersNeeded) {
- // This P is now dedicated to marking until the end of
- // the concurrent mark phase.
- _p_.gcMarkWorkerMode = gcMarkWorkerDedicatedMode
- } else if c.fractionalUtilizationGoal == 0 {
- // No need for fractional workers.
- gcBgMarkWorkerPool.push(&node.node)
- return nil
- } else {
- // Is this P behind on the fractional utilization
- // goal?
- //
- // This should be kept in sync with pollFractionalWorkerExit.
- delta := nanotime() - gcController.markStartTime
- if delta > 0 && float64(_p_.gcFractionalMarkTime)/float64(delta) > c.fractionalUtilizationGoal {
- // Nope. No need to run a fractional worker.
- gcBgMarkWorkerPool.push(&node.node)
- return nil
- }
- // Run a fractional worker.
- _p_.gcMarkWorkerMode = gcMarkWorkerFractionalMode
- }
-
- // Run the background mark worker.
- gp := node.gp.ptr()
- casgstatus(gp, _Gwaiting, _Grunnable)
- if trace.enabled {
- traceGoUnpark(gp, 0)
- }
- return gp
-}
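
decIfPositive is a standard decrement-if-positive CAS loop; the runtime must use runtime/internal/atomic, but outside the runtime the same pattern can be written with sync/atomic (a sketch):

    package main

    import (
        "fmt"
        "sync/atomic"
    )

    // decIfPositive retries the CAS until either a positive slot is
    // claimed or the count drops to zero.
    func decIfPositive(p *int64) bool {
        for {
            v := atomic.LoadInt64(p)
            if v <= 0 {
                return false
            }
            if atomic.CompareAndSwapInt64(p, v, v-1) {
                return true
            }
        }
    }

    func main() {
        slots := int64(2)
        fmt.Println(decIfPositive(&slots), decIfPositive(&slots), decIfPositive(&slots)) // true true false
    }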
-
// pollFractionalWorkerExit reports whether a fractional mark worker
// should self-preempt. It assumes it is called from the fractional
// worker.
@@ -809,203 +283,6 @@ func pollFractionalWorkerExit() bool {
return float64(selfTime)/float64(delta) > 1.2*gcController.fractionalUtilizationGoal
}
-// gcSetTriggerRatio sets the trigger ratio and updates everything
-// derived from it: the absolute trigger, the heap goal, mark pacing,
-// and sweep pacing.
-//
-// This can be called any time. If GC is in the middle of a
-// concurrent phase, it will adjust the pacing of that phase.
-//
-// This depends on gcpercent, memstats.heap_marked, and
-// memstats.heap_live. These must be up to date.
-//
-// mheap_.lock must be held or the world must be stopped.
-func gcSetTriggerRatio(triggerRatio float64) {
- assertWorldStoppedOrLockHeld(&mheap_.lock)
-
- // Compute the next GC goal, which is when the allocated heap
- // has grown by GOGC/100 over the heap marked by the last
- // cycle.
- goal := ^uint64(0)
- if gcpercent >= 0 {
- goal = memstats.heap_marked + memstats.heap_marked*uint64(gcpercent)/100
- }
-
- // Set the trigger ratio, capped to reasonable bounds.
- if gcpercent >= 0 {
- scalingFactor := float64(gcpercent) / 100
- // Ensure there's always a little margin so that the
- // mutator assist ratio isn't infinity.
- maxTriggerRatio := 0.95 * scalingFactor
- if triggerRatio > maxTriggerRatio {
- triggerRatio = maxTriggerRatio
- }
-
- // If we let triggerRatio go too low, then if the application
- // is allocating very rapidly we might end up in a situation
- // where we're allocating black during a nearly always-on GC.
- // The result of this is a growing heap and ultimately an
- // increase in RSS. By capping us at a point >0, we're essentially
- // saying that we're OK using more CPU during the GC to prevent
- // this growth in RSS.
- //
- // The current constant was chosen empirically: given a sufficiently
- // fast/scalable allocator with 48 Ps that could drive the trigger ratio
- // to <0.05, this constant causes applications to retain the same peak
- // RSS compared to not having this allocator.
- minTriggerRatio := 0.6 * scalingFactor
- if triggerRatio < minTriggerRatio {
- triggerRatio = minTriggerRatio
- }
- } else if triggerRatio < 0 {
- // gcpercent < 0, so just make sure we're not getting a negative
- // triggerRatio. This case isn't expected to happen in practice,
- // and doesn't really matter because if gcpercent < 0 then we won't
- // ever consume triggerRatio further on in this function, but let's
- // just be defensive here; the triggerRatio being negative is almost
- // certainly undesirable.
- triggerRatio = 0
- }
- memstats.triggerRatio = triggerRatio
-
- // Compute the absolute GC trigger from the trigger ratio.
- //
- // We trigger the next GC cycle when the allocated heap has
- // grown by the trigger ratio over the marked heap size.
- trigger := ^uint64(0)
- if gcpercent >= 0 {
- trigger = uint64(float64(memstats.heap_marked) * (1 + triggerRatio))
- // Don't trigger below the minimum heap size.
- minTrigger := heapminimum
- if !isSweepDone() {
- // Concurrent sweep happens in the heap growth
- // from heap_live to gc_trigger, so ensure
- // that concurrent sweep has some heap growth
- // in which to perform sweeping before we
- // start the next GC cycle.
- sweepMin := atomic.Load64(&memstats.heap_live) + sweepMinHeapDistance
- if sweepMin > minTrigger {
- minTrigger = sweepMin
- }
- }
- if trigger < minTrigger {
- trigger = minTrigger
- }
- if int64(trigger) < 0 {
-		print("runtime: next_gc=", memstats.next_gc, " heap_marked=", memstats.heap_marked, " heap_live=", memstats.heap_live, " initialHeapLive=", work.initialHeapLive, " triggerRatio=", triggerRatio, " minTrigger=", minTrigger, "\n")
- throw("gc_trigger underflow")
- }
- if trigger > goal {
- // The trigger ratio is always less than GOGC/100, but
- // other bounds on the trigger may have raised it.
- // Push up the goal, too.
- goal = trigger
- }
- }
-
- // Commit to the trigger and goal.
- memstats.gc_trigger = trigger
- atomic.Store64(&memstats.next_gc, goal)
- if trace.enabled {
- traceNextGC()
- }
-
- // Update mark pacing.
- if gcphase != _GCoff {
- gcController.revise()
- }
-
- // Update sweep pacing.
- if isSweepDone() {
- mheap_.sweepPagesPerByte = 0
- } else {
- // Concurrent sweep needs to sweep all of the in-use
- // pages by the time the allocated heap reaches the GC
- // trigger. Compute the ratio of in-use pages to sweep
- // per byte allocated, accounting for the fact that
- // some might already be swept.
- heapLiveBasis := atomic.Load64(&memstats.heap_live)
- heapDistance := int64(trigger) - int64(heapLiveBasis)
- // Add a little margin so rounding errors and
- // concurrent sweep are less likely to leave pages
- // unswept when GC starts.
- heapDistance -= 1024 * 1024
- if heapDistance < _PageSize {
- // Avoid setting the sweep ratio extremely high
- heapDistance = _PageSize
- }
- pagesSwept := atomic.Load64(&mheap_.pagesSwept)
- pagesInUse := atomic.Load64(&mheap_.pagesInUse)
- sweepDistancePages := int64(pagesInUse) - int64(pagesSwept)
- if sweepDistancePages <= 0 {
- mheap_.sweepPagesPerByte = 0
- } else {
- mheap_.sweepPagesPerByte = float64(sweepDistancePages) / float64(heapDistance)
- mheap_.sweepHeapLiveBasis = heapLiveBasis
- // Write pagesSweptBasis last, since this
- // signals concurrent sweeps to recompute
- // their debt.
- atomic.Store64(&mheap_.pagesSweptBasis, pagesSwept)
- }
- }
-
- gcPaceScavenger()
-}
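
Ignoring heapminimum, the sweep-distance floor, and the trace hooks, the goal/trigger arithmetic above reduces to the following sketch (triggerAndGoal is an illustrative name; inputs are hypothetical):

    package main

    import "fmt"

    // triggerAndGoal computes the absolute trigger and goal in bytes
    // from the marked heap, GOGC, and a proposed trigger ratio, clamping
    // the ratio into [0.6, 0.95]*GOGC/100 as above.
    func triggerAndGoal(heapMarked uint64, gcPercent int, triggerRatio float64) (trigger, goal uint64) {
        goal = heapMarked + heapMarked*uint64(gcPercent)/100
        scaling := float64(gcPercent) / 100
        if max := 0.95 * scaling; triggerRatio > max {
            triggerRatio = max // keep a margin so the assist ratio is finite
        }
        if min := 0.6 * scaling; triggerRatio < min {
            triggerRatio = min // bound RSS growth under always-on GC
        }
        trigger = uint64(float64(heapMarked) * (1 + triggerRatio))
        if trigger > goal {
            goal = trigger // other bounds raised the trigger; push the goal up too
        }
        return
    }

    func main() {
        // 100 MB marked, GOGC=100, ratio 0.7: trigger at 170 MB, goal 200 MB.
        fmt.Println(triggerAndGoal(100<<20, 100, 0.7))
    }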
-
-// gcEffectiveGrowthRatio returns the current effective heap growth
-// ratio (GOGC/100) based on heap_marked from the previous GC and
-// next_gc for the current GC.
-//
-// This may differ from gcpercent/100 because of various upper and
-// lower bounds on gcpercent. For example, if the heap is smaller than
-// heapminimum, this can be higher than gcpercent/100.
-//
-// mheap_.lock must be held or the world must be stopped.
-func gcEffectiveGrowthRatio() float64 {
- assertWorldStoppedOrLockHeld(&mheap_.lock)
-
- egogc := float64(atomic.Load64(&memstats.next_gc)-memstats.heap_marked) / float64(memstats.heap_marked)
- if egogc < 0 {
- // Shouldn't happen, but just in case.
- egogc = 0
- }
- return egogc
-}
-
-// gcGoalUtilization is the goal CPU utilization for
-// marking as a fraction of GOMAXPROCS.
-const gcGoalUtilization = 0.30
-
-// gcBackgroundUtilization is the fixed CPU utilization for background
-// marking. It must be <= gcGoalUtilization. The difference between
-// gcGoalUtilization and gcBackgroundUtilization will be made up by
-// mark assists. The scheduler will aim to use within 50% of this
-// goal.
-//
-// Setting this to < gcGoalUtilization avoids saturating the trigger
-// feedback controller when there are no assists, which allows it to
-// better control CPU and heap growth. However, the larger the gap,
-// the more mutator assists are expected to happen, which impact
-// mutator latency.
-const gcBackgroundUtilization = 0.25
-
-// gcCreditSlack is the amount of scan work credit that can
-// accumulate locally before updating gcController.scanWork and,
-// optionally, gcController.bgScanCredit. Lower values give a more
-// accurate assist ratio and make it more likely that assists will
-// successfully steal background credit. Higher values reduce memory
-// contention.
-const gcCreditSlack = 2000
-
-// gcAssistTimeSlack is the nanoseconds of mutator assist time that
-// can accumulate on a P before updating gcController.assistTime.
-const gcAssistTimeSlack = 5000
-
-// gcOverAssistWork determines how many extra units of scan work a GC
-// assist does when an assist happens. This amortizes the cost of an
-// assist by pre-paying for this many bytes of future allocations.
-const gcOverAssistWork = 64 << 10
-
var work struct {
full lfstack // lock-free list of full blocks workbuf
empty lfstack // lock-free list of empty blocks workbuf
@@ -1048,9 +325,11 @@ var work struct {
nwait uint32
// Number of roots of various root types. Set by gcMarkRootPrepare.
- nFlushCacheRoots int
nDataRoots, nBSSRoots, nSpanRoots, nStackRoots int
+ // Base indexes of each root type. Set by gcMarkRootPrepare.
+ baseData, baseBSS, baseSpans, baseStacks, baseEnd uint32
+
// Each type of GC state transition is protected by a lock.
// Since multiple threads can simultaneously detect the state
// transition condition, any thread that detects a transition
@@ -1084,7 +363,7 @@ var work struct {
// program started if debug.gctrace > 0.
totaltime int64
- // initialHeapLive is the value of memstats.heap_live at the
+ // initialHeapLive is the value of gcController.heapLive at the
// beginning of this GC cycle.
initialHeapLive uint64
@@ -1181,7 +460,7 @@ func GC() {
// First, wait for sweeping to finish. (We know there are no
// more spans on the sweep queue, but we may be concurrently
// sweeping spans, so we have to wait.)
- for atomic.Load(&work.cycles) == n+1 && atomic.Load(&mheap_.sweepers) != 0 {
+ for atomic.Load(&work.cycles) == n+1 && !isSweepDone() {
Gosched()
}
@@ -1265,13 +544,13 @@ func (t gcTrigger) test() bool {
}
switch t.kind {
case gcTriggerHeap:
- // Non-atomic access to heap_live for performance. If
+ // Non-atomic access to gcController.heapLive for performance. If
// we are going to trigger on this, this thread just
- // atomically wrote heap_live anyway and we'll see our
+ // atomically wrote gcController.heapLive anyway and we'll see our
// own write.
- return memstats.heap_live >= memstats.gc_trigger
+ return gcController.heapLive >= gcController.trigger
case gcTriggerTime:
- if gcpercent < 0 {
+ if gcController.gcPercent < 0 {
return false
}
lastgc := int64(atomic.Load64(&memstats.last_gc_nanotime))
@@ -1365,7 +644,7 @@ func gcStart(trigger gcTrigger) {
// so it can't be more than ncpu, even if GOMAXPROCS is.
work.stwprocs = ncpu
}
- work.heap0 = atomic.Load64(&memstats.heap_live)
+ work.heap0 = atomic.Load64(&gcController.heapLive)
work.pauseNS = 0
work.mode = mode
@@ -1388,7 +667,7 @@ func gcStart(trigger gcTrigger) {
work.cycles++
gcController.startCycle()
- work.heapGoal = memstats.next_gc
+ work.heapGoal = gcController.heapGoal
// In STW mode, disable scheduling of user Gs. This may also
// disable scheduling of this goroutine, so it may block as
@@ -1617,7 +896,7 @@ top:
// endCycle depends on all gcWork cache stats being flushed.
// The termination algorithm above ensured this, up to
// allocations made since the ragged barrier.
- nextTriggerRatio := gcController.endCycle()
+ nextTriggerRatio := gcController.endCycle(work.userForced)
// Perform mark termination. This will restart the world.
gcMarkTermination(nextTriggerRatio)
@@ -1629,7 +908,7 @@ func gcMarkTermination(nextTriggerRatio float64) {
// Start marktermination (write barrier remains enabled for now).
setGCPhase(_GCmarktermination)
- work.heap1 = memstats.heap_live
+ work.heap1 = gcController.heapLive
startTime := nanotime()
mp := acquirem()
@@ -1691,12 +970,12 @@ func gcMarkTermination(nextTriggerRatio float64) {
throw("gc done but gcphase != _GCoff")
}
- // Record next_gc and heap_inuse for scavenger.
- memstats.last_next_gc = memstats.next_gc
+ // Record heapGoal and heap_inuse for scavenger.
+ gcController.lastHeapGoal = gcController.heapGoal
memstats.last_heap_inuse = memstats.heap_inuse
// Update GC trigger and pacing for the next cycle.
- gcSetTriggerRatio(nextTriggerRatio)
+ gcController.commit(nextTriggerRatio)
// Update timing memstats
now := nanotime()
@@ -1743,6 +1022,13 @@ func gcMarkTermination(nextTriggerRatio float64) {
// so events don't leak into the wrong cycle.
mProf_NextCycle()
+ // There may be stale spans in mcaches that need to be swept.
+ // Those aren't tracked in any sweep lists, so we need to
+ // count them against sweep completion until we ensure all
+ // those spans have been forced out.
+ sl := newSweepLocker()
+ sl.blockCompletion()
+
systemstack(func() { startTheWorldWithSema(true) })
// Flush the heap profile so we can start a new cycle next GC.
@@ -1766,6 +1052,9 @@ func gcMarkTermination(nextTriggerRatio float64) {
_p_.mcache.prepareForSweep()
})
})
+ // Now that we've swept stale spans in mcaches, they don't
+ // count against unswept spans.
+ sl.dispose()
// Print gctrace before dropping worldsema. As soon as we drop
// worldsema another cycle could start and smash the stats
@@ -1984,15 +1273,11 @@ func gcBgMarkWorker() {
// everything out of the run
// queue so it can run
// somewhere else.
- lock(&sched.lock)
- for {
- gp, _ := runqget(pp)
- if gp == nil {
- break
- }
- globrunqput(gp)
+ if drainQ, n := runqdrain(pp); n > 0 {
+ lock(&sched.lock)
+ globrunqputbatch(&drainQ, int32(n))
+ unlock(&sched.lock)
}
- unlock(&sched.lock)
}
// Go back to draining, this time
// without preemption.
@@ -2066,7 +1351,7 @@ func gcMarkWorkAvailable(p *p) bool {
// gcMark runs the mark (or, for concurrent GC, mark termination)
// All gcWork caches must be empty.
// STW is in effect at this point.
-func gcMark(start_time int64) {
+func gcMark(startTime int64) {
if debug.allocfreetrace > 0 {
tracegc()
}
@@ -2074,7 +1359,7 @@ func gcMark(start_time int64) {
if gcphase != _GCmarktermination {
throw("in gcMark expecting to see gcphase as _GCmarktermination")
}
- work.tstart = start_time
+ work.tstart = startTime
// Check that there's no marking work remaining.
if work.full != 0 || work.markrootNext < work.markrootJobs {
@@ -2136,25 +1421,25 @@ func gcMark(start_time int64) {
}
// Update the marked heap stat.
- memstats.heap_marked = work.bytesMarked
+ gcController.heapMarked = work.bytesMarked
// Flush scanAlloc from each mcache since we're about to modify
- // heap_scan directly. If we were to flush this later, then scanAlloc
+ // heapScan directly. If we were to flush this later, then scanAlloc
// might have incorrect information.
for _, p := range allp {
c := p.mcache
if c == nil {
continue
}
- memstats.heap_scan += uint64(c.scanAlloc)
+ gcController.heapScan += uint64(c.scanAlloc)
c.scanAlloc = 0
}
// Update other GC heap size stats. This must happen after
// cachestats (which flushes local statistics to these) and
- // flushallmcaches (which modifies heap_live).
- memstats.heap_live = work.bytesMarked
- memstats.heap_scan = uint64(gcController.scanWork)
+ // flushallmcaches (which modifies gcController.heapLive).
+ gcController.heapLive = work.bytesMarked
+ gcController.heapScan = uint64(gcController.scanWork)
if trace.enabled {
traceHeapAlloc()
@@ -2176,7 +1461,7 @@ func gcSweep(mode gcMode) {
lock(&mheap_.lock)
mheap_.sweepgen += 2
- mheap_.sweepdone = 0
+ mheap_.sweepDrained = 0
mheap_.pagesSwept = 0
mheap_.sweepArenas = mheap_.allArenas
mheap_.reclaimIndex = 0
@@ -2227,14 +1512,12 @@ func gcSweep(mode gcMode) {
//
//go:systemstack
func gcResetMarkState() {
- // This may be called during a concurrent phase, so make sure
+ // This may be called during a concurrent phase, so lock to make sure
// allgs doesn't change.
- lock(&allglock)
- for _, gp := range allgs {
+ forEachG(func(gp *g) {
gp.gcscandone = false // set to true in gcphasework
gp.gcAssistBytes = 0
- }
- unlock(&allglock)
+ })
// Clear page marks. This is just 1MB per 64GB of heap, so the
// time here is pretty trivial.
@@ -2249,7 +1532,7 @@ func gcResetMarkState() {
}
work.bytesMarked = 0
- work.initialHeapLive = atomic.Load64(&memstats.heap_live)
+ work.initialHeapLive = atomic.Load64(&gcController.heapLive)
}
// Hooks for other packages
@@ -2334,3 +1617,102 @@ func fmtNSAsMS(buf []byte, ns uint64) []byte {
}
return itoaDiv(buf, x, dec)
}
+
+// Helpers for testing GC.
+
+// gcTestMoveStackOnNextCall causes the stack to be moved on a call
+// immediately following the call to this. It may not work correctly
+// if any other work appears after this call (such as returning).
+// Typically the following call should be marked go:noinline so it
+// performs a stack check.
+//
+// In rare cases this may not cause the stack to move, specifically if
+// there's a preemption between this call and the next.
+func gcTestMoveStackOnNextCall() {
+ gp := getg()
+ gp.stackguard0 = stackForceMove
+}
+
+// gcTestIsReachable performs a GC and returns a bit set where bit i
+// is set if ptrs[i] is reachable.
+func gcTestIsReachable(ptrs ...unsafe.Pointer) (mask uint64) {
+ // This takes the pointers as unsafe.Pointers in order to keep
+ // them live long enough for us to attach specials. After
+ // that, we drop our references to them.
+
+ if len(ptrs) > 64 {
+ panic("too many pointers for uint64 mask")
+ }
+
+ // Block GC while we attach specials and drop our references
+ // to ptrs. Otherwise, if a GC is in progress, it could mark
+ // them reachable via this function before we have a chance to
+ // drop them.
+ semacquire(&gcsema)
+
+ // Create reachability specials for ptrs.
+ specials := make([]*specialReachable, len(ptrs))
+ for i, p := range ptrs {
+ lock(&mheap_.speciallock)
+ s := (*specialReachable)(mheap_.specialReachableAlloc.alloc())
+ unlock(&mheap_.speciallock)
+ s.special.kind = _KindSpecialReachable
+ if !addspecial(p, &s.special) {
+ throw("already have a reachable special (duplicate pointer?)")
+ }
+ specials[i] = s
+ // Make sure we don't retain ptrs.
+ ptrs[i] = nil
+ }
+
+ semrelease(&gcsema)
+
+ // Force a full GC and sweep.
+ GC()
+
+ // Process specials.
+ for i, s := range specials {
+ if !s.done {
+ printlock()
+ println("runtime: object", i, "was not swept")
+ throw("IsReachable failed")
+ }
+ if s.reachable {
+ mask |= 1 << i
+ }
+ lock(&mheap_.speciallock)
+ mheap_.specialReachableAlloc.free(unsafe.Pointer(s))
+ unlock(&mheap_.speciallock)
+ }
+
+ return mask
+}
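+
The mask packs one bit per input pointer (bit i for ptrs[i]); decoding it on the caller's side is plain bit arithmetic, e.g. (reachableIndexes is an illustrative helper, not part of this CL):

    package main

    import "fmt"

    // reachableIndexes lists which of the n inputs the mask marks as
    // reachable, mirroring gcTestIsReachable's bit-per-pointer contract.
    func reachableIndexes(mask uint64, n int) []int {
        var idx []int
        for i := 0; i < n; i++ {
            if mask&(1<<i) != 0 {
                idx = append(idx, i)
            }
        }
        return idx
    }

    func main() {
        fmt.Println(reachableIndexes(0b101, 3)) // [0 2]: ptrs[0] and ptrs[2] reachable
    }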
+
+// gcTestPointerClass returns the category of what p points to, one of:
+// "heap", "stack", "data", "bss", "other". This is useful for checking
+// that a test is doing what it's intended to do.
+//
+// This is nosplit simply to avoid extra pointer shuffling that may
+// complicate a test.
+//
+//go:nosplit
+func gcTestPointerClass(p unsafe.Pointer) string {
+ p2 := uintptr(noescape(p))
+ gp := getg()
+ if gp.stack.lo <= p2 && p2 < gp.stack.hi {
+ return "stack"
+ }
+ if base, _, _ := findObject(p2, 0, 0); base != 0 {
+ return "heap"
+ }
+ for _, datap := range activeModules() {
+ if datap.data <= p2 && p2 < datap.edata || datap.noptrdata <= p2 && p2 < datap.enoptrdata {
+ return "data"
+ }
+		if datap.bss <= p2 && p2 < datap.ebss || datap.noptrbss <= p2 && p2 < datap.enoptrbss {
+ return "bss"
+ }
+ }
+ KeepAlive(p)
+ return "other"
+}
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index 46fae5de72..1fd0732d62 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -56,8 +56,6 @@ const (
func gcMarkRootPrepare() {
assertWorldStopped()
- work.nFlushCacheRoots = 0
-
// Compute how many data and BSS root blocks there are.
nBlocks := func(bytes uintptr) int {
return int(divRoundUp(bytes, rootBlockBytes))
@@ -105,7 +103,14 @@ func gcMarkRootPrepare() {
work.nStackRoots = int(atomic.Loaduintptr(&allglen))
work.markrootNext = 0
- work.markrootJobs = uint32(fixedRootCount + work.nFlushCacheRoots + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots)
+ work.markrootJobs = uint32(fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots)
+
+ // Calculate base indexes of each root type
+ work.baseData = uint32(fixedRootCount)
+ work.baseBSS = work.baseData + uint32(work.nDataRoots)
+ work.baseSpans = work.baseBSS + uint32(work.nBSSRoots)
+ work.baseStacks = work.baseSpans + uint32(work.nSpanRoots)
+ work.baseEnd = work.baseStacks + uint32(work.nStackRoots)
}
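
With hypothetical root counts, the resulting index layout is one contiguous run of markroot jobs per root type, e.g.:

    package main

    import "fmt"

    // Base indexes as computed in gcMarkRootPrepare above, using made-up
    // root counts. fixedRootCount covers the finalizer and free-G-stack
    // roots handled by the fixed cases in markroot.
    func main() {
        const fixedRootCount = 2
        nData, nBSS, nSpans, nStacks := uint32(3), uint32(1), uint32(4), uint32(10)

        baseData := uint32(fixedRootCount)
        baseBSS := baseData + nData
        baseSpans := baseBSS + nBSS
        baseStacks := baseSpans + nSpans
        baseEnd := baseStacks + nStacks

        // Jobs [0,2) are fixed roots, [2,5) data, [5,6) BSS, [6,10)
        // spans, [10,20) stacks.
        fmt.Println(baseData, baseBSS, baseSpans, baseStacks, baseEnd) // 2 5 6 10 20
    }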
// gcMarkRootCheck checks that all roots have been scanned. It is
@@ -116,23 +121,26 @@ func gcMarkRootCheck() {
throw("left over markroot jobs")
}
- lock(&allglock)
// Check that stacks have been scanned.
- var gp *g
- for i := 0; i < work.nStackRoots; i++ {
- gp = allgs[i]
+ //
+ // We only check the first nStackRoots Gs that we should have scanned.
+ // Since we don't care about newer Gs (see comment in
+ // gcMarkRootPrepare), no locking is required.
+ i := 0
+ forEachGRace(func(gp *g) {
+ if i >= work.nStackRoots {
+ return
+ }
+
if !gp.gcscandone {
- goto fail
+ println("gp", gp, "goid", gp.goid,
+ "status", readgstatus(gp),
+ "gcscandone", gp.gcscandone)
+ throw("scan missed a g")
}
- }
- unlock(&allglock)
- return
-fail:
- println("gp", gp, "goid", gp.goid,
- "status", readgstatus(gp),
- "gcscandone", gp.gcscandone)
- throw("scan missed a g")
+ i++
+ })
}
// ptrmask for an allocation containing a single pointer.
@@ -146,28 +154,16 @@ var oneptrmask = [...]uint8{1}
//
//go:nowritebarrier
func markroot(gcw *gcWork, i uint32) {
- // TODO(austin): This is a bit ridiculous. Compute and store
- // the bases in gcMarkRootPrepare instead of the counts.
- baseFlushCache := uint32(fixedRootCount)
- baseData := baseFlushCache + uint32(work.nFlushCacheRoots)
- baseBSS := baseData + uint32(work.nDataRoots)
- baseSpans := baseBSS + uint32(work.nBSSRoots)
- baseStacks := baseSpans + uint32(work.nSpanRoots)
- end := baseStacks + uint32(work.nStackRoots)
-
// Note: if you add a case here, please also update heapdump.go:dumproots.
switch {
- case baseFlushCache <= i && i < baseData:
- flushmcache(int(i - baseFlushCache))
-
- case baseData <= i && i < baseBSS:
+ case work.baseData <= i && i < work.baseBSS:
for _, datap := range activeModules() {
- markrootBlock(datap.data, datap.edata-datap.data, datap.gcdatamask.bytedata, gcw, int(i-baseData))
+ markrootBlock(datap.data, datap.edata-datap.data, datap.gcdatamask.bytedata, gcw, int(i-work.baseData))
}
- case baseBSS <= i && i < baseSpans:
+ case work.baseBSS <= i && i < work.baseSpans:
for _, datap := range activeModules() {
- markrootBlock(datap.bss, datap.ebss-datap.bss, datap.gcbssmask.bytedata, gcw, int(i-baseBSS))
+ markrootBlock(datap.bss, datap.ebss-datap.bss, datap.gcbssmask.bytedata, gcw, int(i-work.baseBSS))
}
case i == fixedRootFinalizers:
@@ -181,15 +177,18 @@ func markroot(gcw *gcWork, i uint32) {
// stackfree.
systemstack(markrootFreeGStacks)
- case baseSpans <= i && i < baseStacks:
+ case work.baseSpans <= i && i < work.baseStacks:
// mark mspan.specials
- markrootSpans(gcw, int(i-baseSpans))
+ markrootSpans(gcw, int(i-work.baseSpans))
default:
// the rest is scanning goroutine stacks
var gp *g
- if baseStacks <= i && i < end {
- gp = allgs[i-baseStacks]
+ if work.baseStacks <= i && i < work.baseEnd {
+ // N.B. Atomic read of allglen in gcMarkRootPrepare
+		// acts as a barrier to ensure that allgs is large
+		// enough to contain all relevant Gs.
+ gp = allgs[i-work.baseStacks]
} else {
throw("markroot: bad index")
}
@@ -793,24 +792,24 @@ func scanstack(gp *g, gcw *gcWork) {
if obj == nil {
continue
}
- t := obj.typ
- if t == nil {
+ r := obj.r
+ if r == nil {
// We've already scanned this object.
continue
}
- obj.setType(nil) // Don't scan it again.
+ obj.setRecord(nil) // Don't scan it again.
if stackTraceDebug {
printlock()
- print(" live stkobj at", hex(state.stack.lo+uintptr(obj.off)), "of type", t.string())
+ print(" live stkobj at", hex(state.stack.lo+uintptr(obj.off)), "of size", obj.size)
if conservative {
print(" (conservative)")
}
println()
printunlock()
}
- gcdata := t.gcdata
+ gcdata := r.gcdata
var s *mspan
- if t.kind&kindGCProg != 0 {
+ if r.useGCProg() {
// This path is pretty unlikely, an object large enough
// to have a GC program allocated on the stack.
// We need some space to unpack the program into a straight
@@ -820,15 +819,15 @@ func scanstack(gp *g, gcw *gcWork) {
// to change from a Lempel-Ziv style program to something else.
// Or we can forbid putting objects on stacks if they require
// a gc program (see issue 27447).
- s = materializeGCProg(t.ptrdata, gcdata)
+ s = materializeGCProg(r.ptrdata(), gcdata)
gcdata = (*byte)(unsafe.Pointer(s.startAddr))
}
b := state.stack.lo + uintptr(obj.off)
if conservative {
- scanConservative(b, t.ptrdata, gcdata, gcw, &state)
+ scanConservative(b, r.ptrdata(), gcdata, gcw, &state)
} else {
- scanblock(b, t.ptrdata, gcdata, gcw, &state)
+ scanblock(b, r.ptrdata(), gcdata, gcw, &state)
}
if s != nil {
@@ -844,10 +843,10 @@ func scanstack(gp *g, gcw *gcWork) {
if stackTraceDebug {
for i := 0; i < x.nobj; i++ {
obj := &x.obj[i]
- if obj.typ == nil { // reachable
+ if obj.r == nil { // reachable
continue
}
- println(" dead stkobj at", hex(gp.stack.lo+uintptr(obj.off)), "of type", obj.typ.string())
+ println(" dead stkobj at", hex(gp.stack.lo+uintptr(obj.off)), "of size", obj.r.size)
// Note: not necessarily really dead - only reachable-from-ptr dead.
}
}
@@ -867,7 +866,7 @@ func scanframeworker(frame *stkframe, state *stackScanState, gcw *gcWork) {
}
isAsyncPreempt := frame.fn.valid() && frame.fn.funcID == funcID_asyncPreempt
- isDebugCall := frame.fn.valid() && frame.fn.funcID == funcID_debugCallV1
+ isDebugCall := frame.fn.valid() && frame.fn.funcID == funcID_debugCallV2
if state.conservative || isAsyncPreempt || isDebugCall {
if debugScanConservative {
println("conservatively scanning function", funcname(frame.fn), "at PC", hex(frame.continpc))
@@ -928,7 +927,7 @@ func scanframeworker(frame *stkframe, state *stackScanState, gcw *gcWork) {
// varp is 0 for defers, where there are no locals.
// In that case, there can't be a pointer to its args, either.
// (And all args would be scanned above anyway.)
- for _, obj := range objs {
+ for i, obj := range objs {
off := obj.off
base := frame.varp // locals base pointer
if off >= 0 {
@@ -940,9 +939,9 @@ func scanframeworker(frame *stkframe, state *stackScanState, gcw *gcWork) {
continue
}
if stackTraceDebug {
- println("stkobj at", hex(ptr), "of type", obj.typ.string())
+ println("stkobj at", hex(ptr), "of size", obj.size)
}
- state.addObject(ptr, obj.typ)
+ state.addObject(ptr, &objs[i])
}
}
}
@@ -1252,12 +1251,7 @@ func scanobject(b uintptr, gcw *gcWork) {
}
var i uintptr
- for i = 0; i < n; i += sys.PtrSize {
- // Find bits for this word.
- if i != 0 {
- // Avoid needless hbits.next() on last iteration.
- hbits = hbits.next()
- }
+ for i = 0; i < n; i, hbits = i+sys.PtrSize, hbits.next() {
// Load bits once. See CL 22712 and issue 16973 for discussion.
bits := hbits.bits()
if bits&bitScan == 0 {
diff --git a/src/runtime/mgcpacer.go b/src/runtime/mgcpacer.go
new file mode 100644
index 0000000000..9338359de7
--- /dev/null
+++ b/src/runtime/mgcpacer.go
@@ -0,0 +1,848 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+ "internal/cpu"
+ "runtime/internal/atomic"
+ "unsafe"
+)
+
+const (
+ // gcGoalUtilization is the goal CPU utilization for
+ // marking as a fraction of GOMAXPROCS.
+ gcGoalUtilization = 0.30
+
+ // gcBackgroundUtilization is the fixed CPU utilization for background
+ // marking. It must be <= gcGoalUtilization. The difference between
+ // gcGoalUtilization and gcBackgroundUtilization will be made up by
+ // mark assists. The scheduler will aim to use within 50% of this
+ // goal.
+ //
+ // Setting this to < gcGoalUtilization avoids saturating the trigger
+ // feedback controller when there are no assists, which allows it to
+ // better control CPU and heap growth. However, the larger the gap,
+ // the more mutator assists are expected to happen, which impact
+ // mutator latency.
+ gcBackgroundUtilization = 0.25
+
+ // gcCreditSlack is the amount of scan work credit that can
+ // accumulate locally before updating gcController.scanWork and,
+ // optionally, gcController.bgScanCredit. Lower values give a more
+ // accurate assist ratio and make it more likely that assists will
+ // successfully steal background credit. Higher values reduce memory
+ // contention.
+ gcCreditSlack = 2000
+
+ // gcAssistTimeSlack is the nanoseconds of mutator assist time that
+ // can accumulate on a P before updating gcController.assistTime.
+ gcAssistTimeSlack = 5000
+
+ // gcOverAssistWork determines how many extra units of scan work a GC
+ // assist does when an assist happens. This amortizes the cost of an
+ // assist by pre-paying for this many bytes of future allocations.
+ gcOverAssistWork = 64 << 10
+
+ // defaultHeapMinimum is the value of heapMinimum for GOGC==100.
+ defaultHeapMinimum = 4 << 20
+)
+
+func init() {
+ if offset := unsafe.Offsetof(gcController.heapLive); offset%8 != 0 {
+ println(offset)
+ throw("gcController.heapLive not aligned to 8 bytes")
+ }
+}
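+
This check guards atomic 64-bit access: on 32-bit platforms the compiler only guarantees 4-byte alignment for struct fields, so a misplaced field would break 64-bit atomics. The same padding pattern in miniature, on a toy struct (nothing here is from the runtime):

    package main

    import (
        "fmt"
        "unsafe"
    )

    // Without the padding field, count would sit at offset 4 on 32-bit
    // platforms, breaking the 8-byte alignment that 64-bit atomics need.
    type state struct {
        flags int32
        _     uint32 // padding so count is 8-byte aligned everywhere
        count uint64
    }

    func main() {
        fmt.Println(unsafe.Offsetof(state{}.count)) // 8
    }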
+
+// gcController implements the GC pacing controller that determines
+// when to trigger concurrent garbage collection and how much marking
+// work to do in mutator assists and background marking.
+//
+// It uses a feedback control algorithm to adjust gcController.trigger
+// based on the heap growth and GC CPU utilization each cycle.
+// This algorithm optimizes for heap growth to match GOGC and for CPU
+// utilization between assist and background marking to be 25% of
+// GOMAXPROCS. The high-level design of this algorithm is documented
+// at https://golang.org/s/go15gcpacing.
+//
+// All fields of gcController are used only during a single mark
+// cycle.
+var gcController gcControllerState
+
+type gcControllerState struct {
+ // Initialized from $GOGC. GOGC=off means no GC.
+ gcPercent int32
+
+ _ uint32 // padding so following 64-bit values are 8-byte aligned
+
+ // heapMinimum is the minimum heap size at which to trigger GC.
+ // For small heaps, this overrides the usual GOGC*live set rule.
+ //
+ // When there is a very small live set but a lot of allocation, simply
+ // collecting when the heap reaches GOGC*live results in many GC
+ // cycles and high total per-GC overhead. This minimum amortizes this
+ // per-GC overhead while keeping the heap reasonably small.
+ //
+ // During initialization this is set to 4MB*GOGC/100. In the case of
+ // GOGC==0, this will set heapMinimum to 0, resulting in constant
+ // collection even when the heap size is small, which is useful for
+ // debugging.
+ heapMinimum uint64
+
+ // triggerRatio is the heap growth ratio that triggers marking.
+ //
+ // E.g., if this is 0.6, then GC should start when the live
+ // heap has reached 1.6 times the heap size marked by the
+ // previous cycle. This should be ≤ GOGC/100 so the trigger
+ // heap size is less than the goal heap size. This is set
+ // during mark termination for the next cycle's trigger.
+ //
+ // Protected by mheap_.lock or a STW.
+ triggerRatio float64
+
+ // trigger is the heap size that triggers marking.
+ //
+ // When heapLive ≥ trigger, the mark phase will start.
+ // This is also the heap size by which proportional sweeping
+ // must be complete.
+ //
+ // This is computed from triggerRatio during mark termination
+ // for the next cycle's trigger.
+ //
+ // Protected by mheap_.lock or a STW.
+ trigger uint64
+
+ // heapGoal is the goal heapLive for when next GC ends.
+ // Set to ^uint64(0) if disabled.
+ //
+ // Read and written atomically, unless the world is stopped.
+ heapGoal uint64
+
+ // lastHeapGoal is the value of heapGoal for the previous GC.
+ // Note that this is distinct from the last value heapGoal had,
+ // because it could change if e.g. gcPercent changes.
+ //
+ // Read and written with the world stopped or with mheap_.lock held.
+ lastHeapGoal uint64
+
+ // heapLive is the number of bytes considered live by the GC.
+ // That is: retained by the most recent GC plus allocated
+ // since then. heapLive ≤ memstats.heapAlloc, since heapAlloc includes
+ // unmarked objects that have not yet been swept (and hence goes up as we
+ // allocate and down as we sweep) while heapLive excludes these
+ // objects (and hence only goes up between GCs).
+ //
+ // This is updated atomically without locking. To reduce
+ // contention, this is updated only when obtaining a span from
+ // an mcentral and at this point it counts all of the
+ // unallocated slots in that span (which will be allocated
+ // before that mcache obtains another span from that
+ // mcentral). Hence, it slightly overestimates the "true" live
+ // heap size. It's better to overestimate than to
+ // underestimate because 1) this triggers the GC earlier than
+ // necessary rather than potentially too late and 2) this
+ // leads to a conservative GC rate rather than a GC rate that
+ // is potentially too low.
+ //
+ // Reads should likewise be atomic (or during STW).
+ //
+ // Whenever this is updated, call traceHeapAlloc() and
+ // this gcControllerState's revise() method.
+ heapLive uint64
+
+ // heapScan is the number of bytes of "scannable" heap. This
+ // is the live heap (as counted by heapLive), but omitting
+ // no-scan objects and no-scan tails of objects.
+ //
+ // Whenever this is updated, call this gcControllerState's
+ // revise() method.
+ //
+ // Read and written atomically or with the world stopped.
+ heapScan uint64
+
+ // heapMarked is the number of bytes marked by the previous
+ // GC. After mark termination, heapLive == heapMarked, but
+ // unlike heapLive, heapMarked does not change until the
+ // next mark termination.
+ heapMarked uint64
+
+ // scanWork is the total scan work performed this cycle. This
+ // is updated atomically during the cycle. Updates occur in
+ // bounded batches, since it is both written and read
+ // throughout the cycle. At the end of the cycle, this is how
+ // much of the retained heap is scannable.
+ //
+ // Currently this is the bytes of heap scanned. For most uses,
+ // this is an opaque unit of work, but for estimation the
+ // definition is important.
+ scanWork int64
+
+ // bgScanCredit is the scan work credit accumulated by the
+ // concurrent background scan. This credit is accumulated by
+ // the background scan and stolen by mutator assists. This is
+ // updated atomically. Updates occur in bounded batches, since
+ // it is both written and read throughout the cycle.
+ bgScanCredit int64
+
+ // assistTime is the nanoseconds spent in mutator assists
+ // during this cycle. This is updated atomically. Updates
+ // occur in bounded batches, since it is both written and read
+ // throughout the cycle.
+ assistTime int64
+
+ // dedicatedMarkTime is the nanoseconds spent in dedicated
+ // mark workers during this cycle. This is updated atomically
+ // at the end of the concurrent mark phase.
+ dedicatedMarkTime int64
+
+ // fractionalMarkTime is the nanoseconds spent in the
+ // fractional mark worker during this cycle. This is updated
+ // atomically throughout the cycle and will be up-to-date if
+ // the fractional mark worker is not currently running.
+ fractionalMarkTime int64
+
+ // idleMarkTime is the nanoseconds spent in idle marking
+ // during this cycle. This is updated atomically throughout
+ // the cycle.
+ idleMarkTime int64
+
+ // markStartTime is the absolute start time in nanoseconds
+ // that assists and background mark workers started.
+ markStartTime int64
+
+ // dedicatedMarkWorkersNeeded is the number of dedicated mark
+ // workers that need to be started. This is computed at the
+ // beginning of each cycle and decremented atomically as
+ // dedicated mark workers get started.
+ dedicatedMarkWorkersNeeded int64
+
+ // assistWorkPerByte is the ratio of scan work to allocated
+ // bytes that should be performed by mutator assists. This is
+ // computed at the beginning of each cycle and updated every
+ // time heapScan is updated.
+ //
+ // Stored as a uint64, but it's actually a float64. Use
+ // float64frombits to get the value.
+ //
+ // Read and written atomically.
+ assistWorkPerByte uint64
+
+ // assistBytesPerWork is 1/assistWorkPerByte.
+ //
+ // Stored as a uint64, but it's actually a float64. Use
+ // float64frombits to get the value.
+ //
+ // Read and written atomically.
+ //
+ // Note that because this is read and written independently
+ // from assistWorkPerByte users may notice a skew between
+ // the two values, and such a state should be safe.
+ assistBytesPerWork uint64
+
+ // fractionalUtilizationGoal is the fraction of wall clock
+ // time that should be spent in the fractional mark worker on
+ // each P that isn't running a dedicated worker.
+ //
+ // For example, if the utilization goal is 25% and there are
+ // no dedicated workers, this will be 0.25. If the goal is
+ // 25%, there is one dedicated worker, and GOMAXPROCS is 5,
+ // this will be 0.05 to make up the missing 5%.
+ //
+ // If this is zero, no fractional workers are needed.
+ fractionalUtilizationGoal float64
+
+ _ cpu.CacheLinePad
+}
+
+func (c *gcControllerState) init(gcPercent int32) {
+ c.heapMinimum = defaultHeapMinimum
+
+ // Set a reasonable initial GC trigger.
+ c.triggerRatio = 7 / 8.0
+
+ // Fake a heapMarked value so it looks like a trigger at
+ // heapMinimum is the appropriate growth from heapMarked.
+ // This will go into computing the initial GC goal.
+ c.heapMarked = uint64(float64(c.heapMinimum) / (1 + c.triggerRatio))
+
+ // This will also compute and set the GC trigger and goal.
+ c.setGCPercent(gcPercent)
+}
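+
Plugging in the defaults (GOGC=100, so heapMinimum stays at 4 MB), the faked heapMarked is chosen so that growing it by the initial 7/8 trigger ratio lands back on the minimum, within integer rounding:

    package main

    import "fmt"

    func main() {
        const heapMinimum = 4 << 20 // defaultHeapMinimum at GOGC=100
        triggerRatio := 7 / 8.0

        // heapMarked is faked so heapMarked*(1+triggerRatio) ≈ heapMinimum,
        // making the very first trigger sit at the 4 MB floor.
        heapMarked := uint64(float64(heapMinimum) / (1 + triggerRatio))
        fmt.Println(heapMarked)                                       // 2236962
        fmt.Println(uint64(float64(heapMarked) * (1 + triggerRatio))) // 4194303 (≈ 4 MB)
    }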
+
+// startCycle resets the GC controller's state and computes estimates
+// for a new GC cycle. The caller must hold worldsema and the world
+// must be stopped.
+func (c *gcControllerState) startCycle() {
+ c.scanWork = 0
+ c.bgScanCredit = 0
+ c.assistTime = 0
+ c.dedicatedMarkTime = 0
+ c.fractionalMarkTime = 0
+ c.idleMarkTime = 0
+
+ // Ensure that the heap goal is at least a little larger than
+ // the current live heap size. This may not be the case if GC
+ // start is delayed or if the allocation that pushed gcController.heapLive
+ // over trigger is large or if the trigger is really close to
+ // GOGC. Assist is proportional to this distance, so enforce a
+ // minimum distance, even if it means going over the GOGC goal
+ // by a tiny bit.
+ if c.heapGoal < c.heapLive+1024*1024 {
+ c.heapGoal = c.heapLive + 1024*1024
+ }
+
+ // Compute the background mark utilization goal. In general,
+ // this may not come out exactly. We round the number of
+ // dedicated workers so that the utilization is closest to
+ // 25%. For small GOMAXPROCS, this would introduce too much
+ // error, so we add fractional workers in that case.
+ totalUtilizationGoal := float64(gomaxprocs) * gcBackgroundUtilization
+ c.dedicatedMarkWorkersNeeded = int64(totalUtilizationGoal + 0.5)
+ utilError := float64(c.dedicatedMarkWorkersNeeded)/totalUtilizationGoal - 1
+ const maxUtilError = 0.3
+ if utilError < -maxUtilError || utilError > maxUtilError {
+ // Rounding put us more than 30% off our goal. With
+ // gcBackgroundUtilization of 25%, this happens for
+ // GOMAXPROCS<=3 or GOMAXPROCS=6. Enable fractional
+ // workers to compensate.
+ if float64(c.dedicatedMarkWorkersNeeded) > totalUtilizationGoal {
+ // Too many dedicated workers.
+ c.dedicatedMarkWorkersNeeded--
+ }
+ c.fractionalUtilizationGoal = (totalUtilizationGoal - float64(c.dedicatedMarkWorkersNeeded)) / float64(gomaxprocs)
+ } else {
+ c.fractionalUtilizationGoal = 0
+ }
+
+ // In STW mode, we just want dedicated workers.
+ if debug.gcstoptheworld > 0 {
+ c.dedicatedMarkWorkersNeeded = int64(gomaxprocs)
+ c.fractionalUtilizationGoal = 0
+ }
+
+ // Clear per-P state
+ for _, p := range allp {
+ p.gcAssistTime = 0
+ p.gcFractionalMarkTime = 0
+ }
+
+ // Compute initial values for controls that are updated
+ // throughout the cycle.
+ c.revise()
+
+ if debug.gcpacertrace > 0 {
+ assistRatio := float64frombits(atomic.Load64(&c.assistWorkPerByte))
+ print("pacer: assist ratio=", assistRatio,
+ " (scan ", gcController.heapScan>>20, " MB in ",
+ work.initialHeapLive>>20, "->",
+ c.heapGoal>>20, " MB)",
+ " workers=", c.dedicatedMarkWorkersNeeded,
+ "+", c.fractionalUtilizationGoal, "\n")
+ }
+}
+
+// revise updates the assist ratio during the GC cycle to account for
+// improved estimates. This should be called whenever gcController.heapScan,
+// gcController.heapLive, or gcController.heapGoal is updated. It is safe to
+// call concurrently, but it may race with other calls to revise.
+//
+// The result of this race is that the two assist ratio values may not line
+// up or may be stale. In practice this is OK because the assist ratio
+// moves slowly throughout a GC cycle, and the assist ratio is a best-effort
+// heuristic anyway. Furthermore, no part of the heuristic depends on
+// the two assist ratio values being exact reciprocals of one another, since
+// the two values are used to convert values from different sources.
+//
+// The worst case result of this raciness is that we may miss a larger shift
+// in the ratio (say, if we decide to pace more aggressively against the
+// hard heap goal) but even this "hard goal" is best-effort (see #40460).
+// The dedicated GC should ensure we don't exceed the hard goal by too much
+// in the rare case we do exceed it.
+//
+// It should only be called when gcBlackenEnabled != 0 (because this
+// is when assists are enabled and the necessary statistics are
+// available).
+func (c *gcControllerState) revise() {
+ gcPercent := c.gcPercent
+ if gcPercent < 0 {
+ // If GC is disabled but we're running a forced GC,
+ // act like GOGC is huge for the below calculations.
+ gcPercent = 100000
+ }
+ live := atomic.Load64(&c.heapLive)
+ scan := atomic.Load64(&c.heapScan)
+ work := atomic.Loadint64(&c.scanWork)
+
+ // Assume we're under the soft goal. Pace GC to complete at
+ // heapGoal assuming the heap is in steady-state.
+ heapGoal := int64(atomic.Load64(&c.heapGoal))
+
+ // Compute the expected scan work remaining.
+ //
+ // This is estimated based on the expected
+ // steady-state scannable heap. For example, with
+ // GOGC=100, only half of the scannable heap is
+ // expected to be live, so that's what we target.
+ //
+ // (This is a float calculation to avoid overflowing on
+ // 100*heapScan.)
+ scanWorkExpected := int64(float64(scan) * 100 / float64(100+gcPercent))
+
+ if int64(live) > heapGoal || work > scanWorkExpected {
+ // We're past the soft goal, or we've already done more scan
+ // work than we expected. Pace GC so that in the worst case it
+ // will complete by the hard goal.
+ const maxOvershoot = 1.1
+ heapGoal = int64(float64(heapGoal) * maxOvershoot)
+
+ // Compute the upper bound on the scan work remaining.
+ scanWorkExpected = int64(scan)
+ }
+
+ // Compute the remaining scan work estimate.
+ //
+ // Note that we currently count allocations during GC as both
+ // scannable heap (heapScan) and scan work completed
+ // (scanWork), so allocation will change this difference
+ // slowly in the soft regime and not at all in the hard
+ // regime.
+ scanWorkRemaining := scanWorkExpected - work
+ if scanWorkRemaining < 1000 {
+ // We set a somewhat arbitrary lower bound on
+ // remaining scan work since if we aim a little high,
+ // we can miss by a little.
+ //
+ // We *do* need to enforce that this is at least 1,
+ // since marking is racy and double-scanning objects
+ // may legitimately make the remaining scan work
+ // negative, even in the hard goal regime.
+ scanWorkRemaining = 1000
+ }
+
+ // Compute the heap distance remaining.
+ heapRemaining := heapGoal - int64(live)
+ if heapRemaining <= 0 {
+ // This shouldn't happen, but if it does, avoid
+ // dividing by zero or setting the assist negative.
+ heapRemaining = 1
+ }
+
+	// Compute the mutator assist ratio so that by the time the mutator
+ // allocates the remaining heap bytes up to heapGoal, it will
+ // have done (or stolen) the remaining amount of scan work.
+ // Note that the assist ratio values are updated atomically
+ // but not together. This means there may be some degree of
+ // skew between the two values. This is generally OK as the
+ // values shift relatively slowly over the course of a GC
+ // cycle.
+ assistWorkPerByte := float64(scanWorkRemaining) / float64(heapRemaining)
+ assistBytesPerWork := float64(heapRemaining) / float64(scanWorkRemaining)
+ atomic.Store64(&c.assistWorkPerByte, float64bits(assistWorkPerByte))
+ atomic.Store64(&c.assistBytesPerWork, float64bits(assistBytesPerWork))
+}
+
+// endCycle computes the trigger ratio for the next cycle.
+// userForced indicates whether the current GC cycle was forced
+// by the application.
+func (c *gcControllerState) endCycle(userForced bool) float64 {
+ if userForced {
+ // Forced GC means this cycle didn't start at the
+ // trigger, so where it finished isn't good
+ // information about how to adjust the trigger.
+ // Just leave it where it is.
+ return c.triggerRatio
+ }
+
+ // Proportional response gain for the trigger controller. Must
+ // be in [0, 1]. Lower values smooth out transient effects but
+ // take longer to respond to phase changes. Higher values
+ // react to phase changes quickly, but are more affected by
+ // transient changes. Values near 1 may be unstable.
+ const triggerGain = 0.5
+
+ // Compute next cycle trigger ratio. First, this computes the
+ // "error" for this cycle; that is, how far off the trigger
+ // was from what it should have been, accounting for both heap
+ // growth and GC CPU utilization. We compute the actual heap
+ // growth during this cycle and scale that by how far off from
+ // the goal CPU utilization we were (to estimate the heap
+ // growth if we had the desired CPU utilization). The
+ // difference between this estimate and the GOGC-based goal
+ // heap growth is the error.
+ goalGrowthRatio := c.effectiveGrowthRatio()
+ actualGrowthRatio := float64(c.heapLive)/float64(c.heapMarked) - 1
+ assistDuration := nanotime() - c.markStartTime
+
+ // Assume background mark hit its utilization goal.
+ utilization := gcBackgroundUtilization
+ // Add assist utilization; avoid divide by zero.
+ if assistDuration > 0 {
+ utilization += float64(c.assistTime) / float64(assistDuration*int64(gomaxprocs))
+ }
+
+ triggerError := goalGrowthRatio - c.triggerRatio - utilization/gcGoalUtilization*(actualGrowthRatio-c.triggerRatio)
+
+ // Finally, we adjust the trigger for next time by this error,
+ // damped by the proportional gain.
+ triggerRatio := c.triggerRatio + triggerGain*triggerError
+
+ if debug.gcpacertrace > 0 {
+ // Print controller state in terms of the design
+ // document.
+ H_m_prev := c.heapMarked
+ h_t := c.triggerRatio
+ H_T := c.trigger
+ h_a := actualGrowthRatio
+ H_a := c.heapLive
+ h_g := goalGrowthRatio
+ H_g := int64(float64(H_m_prev) * (1 + h_g))
+ u_a := utilization
+ u_g := gcGoalUtilization
+ W_a := c.scanWork
+ print("pacer: H_m_prev=", H_m_prev,
+ " h_t=", h_t, " H_T=", H_T,
+ " h_a=", h_a, " H_a=", H_a,
+ " h_g=", h_g, " H_g=", H_g,
+ " u_a=", u_a, " u_g=", u_g,
+ " W_a=", W_a,
+ " goalΔ=", goalGrowthRatio-h_t,
+ " actualΔ=", h_a-h_t,
+ " u_a/u_g=", u_a/u_g,
+ "\n")
+ }
+
+ return triggerRatio
+}
+
+// enlistWorker encourages another dedicated mark worker to start on
+// another P if there are spare worker slots. It is used by putfull
+// when more work is made available.
+//
+//go:nowritebarrier
+func (c *gcControllerState) enlistWorker() {
+ // If there are idle Ps, wake one so it will run an idle worker.
+ // NOTE: This is suspected of causing deadlocks. See golang.org/issue/19112.
+ //
+ // if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 {
+ // wakep()
+ // return
+ // }
+
+ // There are no idle Ps. If we need more dedicated workers,
+ // try to preempt a running P so it will switch to a worker.
+ if c.dedicatedMarkWorkersNeeded <= 0 {
+ return
+ }
+ // Pick a random other P to preempt.
+ if gomaxprocs <= 1 {
+ return
+ }
+ gp := getg()
+ if gp == nil || gp.m == nil || gp.m.p == 0 {
+ return
+ }
+ myID := gp.m.p.ptr().id
+ for tries := 0; tries < 5; tries++ {
+ id := int32(fastrandn(uint32(gomaxprocs - 1)))
+ if id >= myID {
+ id++
+ }
+ p := allp[id]
+ if p.status != _Prunning {
+ continue
+ }
+ if preemptone(p) {
+ return
+ }
+ }
+}
+
+// findRunnableGCWorker returns a background mark worker for _p_ if it
+// should be run. This must only be called when gcBlackenEnabled != 0.
+func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
+ if gcBlackenEnabled == 0 {
+ throw("gcControllerState.findRunnable: blackening not enabled")
+ }
+
+ if !gcMarkWorkAvailable(_p_) {
+ // No work to be done right now. This can happen at
+ // the end of the mark phase when there are still
+ // assists tapering off. Don't bother running a worker
+ // now because it'll just return immediately.
+ return nil
+ }
+
+ // Grab a worker before we commit to running below.
+ node := (*gcBgMarkWorkerNode)(gcBgMarkWorkerPool.pop())
+ if node == nil {
+ // There is at least one worker per P, so normally there are
+ // enough workers to run on all Ps, if necessary. However, once
+ // a worker enters gcMarkDone it may park without rejoining the
+ // pool, thus freeing a P with no corresponding worker.
+ // gcMarkDone never depends on another worker doing work, so it
+ // is safe to simply do nothing here.
+ //
+ // If gcMarkDone bails out without completing the mark phase,
+ // it will always do so with queued global work. Thus, that P
+ // will be immediately eligible to re-run the worker G it was
+ // just using, ensuring work can complete.
+ return nil
+ }
+
+ decIfPositive := func(ptr *int64) bool {
+ for {
+ v := atomic.Loadint64(ptr)
+ if v <= 0 {
+ return false
+ }
+
+ if atomic.Casint64(ptr, v, v-1) {
+ return true
+ }
+ }
+ }
+
+ if decIfPositive(&c.dedicatedMarkWorkersNeeded) {
+ // This P is now dedicated to marking until the end of
+ // the concurrent mark phase.
+ _p_.gcMarkWorkerMode = gcMarkWorkerDedicatedMode
+ } else if c.fractionalUtilizationGoal == 0 {
+ // No need for fractional workers.
+ gcBgMarkWorkerPool.push(&node.node)
+ return nil
+ } else {
+ // Is this P behind on the fractional utilization
+ // goal?
+ //
+ // This should be kept in sync with pollFractionalWorkerExit.
+ delta := nanotime() - c.markStartTime
+ if delta > 0 && float64(_p_.gcFractionalMarkTime)/float64(delta) > c.fractionalUtilizationGoal {
+ // Nope. No need to run a fractional worker.
+ gcBgMarkWorkerPool.push(&node.node)
+ return nil
+ }
+ // Run a fractional worker.
+ _p_.gcMarkWorkerMode = gcMarkWorkerFractionalMode
+ }
+
+ // Run the background mark worker.
+ gp := node.gp.ptr()
+ casgstatus(gp, _Gwaiting, _Grunnable)
+ if trace.enabled {
+ traceGoUnpark(gp, 0)
+ }
+ return gp
+}
+
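The decIfPositive closure above is a standard load-then-CAS retry loop; the same pattern carries over to ordinary Go with sync/atomic. A sketch, not runtime code:

    package main

    import (
    	"fmt"
    	"sync/atomic"
    )

    // decIfPositive decrements *p only if it is positive, reporting
    // whether the decrement happened. On CAS failure it reloads and
    // retries, so concurrent decrementers never push *p below zero.
    func decIfPositive(p *int64) bool {
    	for {
    		v := atomic.LoadInt64(p)
    		if v <= 0 {
    			return false
    		}
    		if atomic.CompareAndSwapInt64(p, v, v-1) {
    			return true
    		}
    	}
    }

    func main() {
    	var slots int64 = 1
    	fmt.Println(decIfPositive(&slots)) // true: 1 -> 0
    	fmt.Println(decIfPositive(&slots)) // false: already 0
    }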
+// commit sets the trigger ratio and updates everything
+// derived from it: the absolute trigger, the heap goal, mark pacing,
+// and sweep pacing.
+//
+// This can be called any time. If GC is in the middle of a
+// concurrent phase, it will adjust the pacing of that phase.
+//
+// This depends on gcPercent, gcController.heapMarked, and
+// gcController.heapLive. These must be up to date.
+//
+// mheap_.lock must be held or the world must be stopped.
+func (c *gcControllerState) commit(triggerRatio float64) {
+ assertWorldStoppedOrLockHeld(&mheap_.lock)
+
+ // Compute the next GC goal, which is when the allocated heap
+ // has grown by GOGC/100 over the heap marked by the last
+ // cycle.
+ goal := ^uint64(0)
+ if c.gcPercent >= 0 {
+ goal = c.heapMarked + c.heapMarked*uint64(c.gcPercent)/100
+ }
+
+ // Set the trigger ratio, capped to reasonable bounds.
+ if c.gcPercent >= 0 {
+ scalingFactor := float64(c.gcPercent) / 100
+ // Ensure there's always a little margin so that the
+ // mutator assist ratio isn't infinity.
+ maxTriggerRatio := 0.95 * scalingFactor
+ if triggerRatio > maxTriggerRatio {
+ triggerRatio = maxTriggerRatio
+ }
+
+ // If we let triggerRatio go too low, then if the application
+ // is allocating very rapidly we might end up in a situation
+ // where we're allocating black during a nearly always-on GC.
+ // The result of this is a growing heap and ultimately an
+ // increase in RSS. By capping us at a point >0, we're essentially
+ // saying that we're OK using more CPU during the GC to prevent
+ // this growth in RSS.
+ //
+ // The current constant was chosen empirically: given a sufficiently
+ // fast/scalable allocator with 48 Ps that could drive the trigger ratio
+ // to <0.05, this constant causes applications to retain the same peak
+ // RSS compared to not having this allocator.
+ minTriggerRatio := 0.6 * scalingFactor
+ if triggerRatio < minTriggerRatio {
+ triggerRatio = minTriggerRatio
+ }
+ } else if triggerRatio < 0 {
+ // gcPercent < 0, so just make sure we're not getting a negative
+ // triggerRatio. This case isn't expected to happen in practice,
+ // and doesn't really matter because if gcPercent < 0 then we won't
+ // ever consume triggerRatio further on in this function, but let's
+ // just be defensive here; the triggerRatio being negative is almost
+ // certainly undesirable.
+ triggerRatio = 0
+ }
+ c.triggerRatio = triggerRatio
+
+ // Compute the absolute GC trigger from the trigger ratio.
+ //
+ // We trigger the next GC cycle when the allocated heap has
+ // grown by the trigger ratio over the marked heap size.
+ trigger := ^uint64(0)
+ if c.gcPercent >= 0 {
+ trigger = uint64(float64(c.heapMarked) * (1 + triggerRatio))
+ // Don't trigger below the minimum heap size.
+ minTrigger := c.heapMinimum
+ if !isSweepDone() {
+ // Concurrent sweep happens in the heap growth
+ // from gcController.heapLive to trigger, so ensure
+ // that concurrent sweep has some heap growth
+ // in which to perform sweeping before we
+ // start the next GC cycle.
+ sweepMin := atomic.Load64(&c.heapLive) + sweepMinHeapDistance
+ if sweepMin > minTrigger {
+ minTrigger = sweepMin
+ }
+ }
+ if trigger < minTrigger {
+ trigger = minTrigger
+ }
+ if int64(trigger) < 0 {
+ print("runtime: heapGoal=", c.heapGoal, " heapMarked=", c.heapMarked, " gcController.heapLive=", c.heapLive, " initialHeapLive=", work.initialHeapLive, " triggerRatio=", triggerRatio, " minTrigger=", minTrigger, "\n")
+ throw("trigger underflow")
+ }
+ if trigger > goal {
+ // The trigger ratio is always less than GOGC/100, but
+ // other bounds on the trigger may have raised it.
+ // Push up the goal, too.
+ goal = trigger
+ }
+ }
+
+ // Commit to the trigger and goal.
+ c.trigger = trigger
+ atomic.Store64(&c.heapGoal, goal)
+ if trace.enabled {
+ traceHeapGoal()
+ }
+
+ // Update mark pacing.
+ if gcphase != _GCoff {
+ c.revise()
+ }
+
+ // Update sweep pacing.
+ if isSweepDone() {
+ mheap_.sweepPagesPerByte = 0
+ } else {
+ // Concurrent sweep needs to sweep all of the in-use
+ // pages by the time the allocated heap reaches the GC
+ // trigger. Compute the ratio of in-use pages to sweep
+ // per byte allocated, accounting for the fact that
+ // some might already be swept.
+ heapLiveBasis := atomic.Load64(&c.heapLive)
+ heapDistance := int64(trigger) - int64(heapLiveBasis)
+ // Add a little margin so rounding errors and
+ // concurrent sweep are less likely to leave pages
+ // unswept when GC starts.
+ heapDistance -= 1024 * 1024
+ if heapDistance < _PageSize {
+ // Avoid setting the sweep ratio extremely high
+ heapDistance = _PageSize
+ }
+ pagesSwept := atomic.Load64(&mheap_.pagesSwept)
+ pagesInUse := atomic.Load64(&mheap_.pagesInUse)
+ sweepDistancePages := int64(pagesInUse) - int64(pagesSwept)
+ if sweepDistancePages <= 0 {
+ mheap_.sweepPagesPerByte = 0
+ } else {
+ mheap_.sweepPagesPerByte = float64(sweepDistancePages) / float64(heapDistance)
+ mheap_.sweepHeapLiveBasis = heapLiveBasis
+ // Write pagesSweptBasis last, since this
+ // signals concurrent sweeps to recompute
+ // their debt.
+ atomic.Store64(&mheap_.pagesSweptBasis, pagesSwept)
+ }
+ }
+
+ gcPaceScavenger()
+}
+
+// effectiveGrowthRatio returns the current effective heap growth
+// ratio (GOGC/100) based on heapMarked from the previous GC and
+// heapGoal for the current GC.
+//
+// This may differ from gcPercent/100 because of various upper and
+// lower bounds on gcPercent. For example, if the heap is smaller than
+// heapMinimum, this can be higher than gcPercent/100.
+//
+// mheap_.lock must be held or the world must be stopped.
+func (c *gcControllerState) effectiveGrowthRatio() float64 {
+ assertWorldStoppedOrLockHeld(&mheap_.lock)
+
+ egogc := float64(atomic.Load64(&c.heapGoal)-c.heapMarked) / float64(c.heapMarked)
+ if egogc < 0 {
+ // Shouldn't happen, but just in case.
+ egogc = 0
+ }
+ return egogc
+}
+
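For example, with heapMarked = 64 MiB and gcPercent = 100, a heapGoal of 128 MiB yields (128-64)/64 = 1.0, exactly gcPercent/100; if heapMinimum had instead forced the goal up to 160 MiB, the effective ratio would be (160-64)/64 = 1.5.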
+// setGCPercent updates gcPercent and all related pacer state.
+// Returns the old value of gcPercent.
+//
+// The world must be stopped, or mheap_.lock must be held.
+func (c *gcControllerState) setGCPercent(in int32) int32 {
+ assertWorldStoppedOrLockHeld(&mheap_.lock)
+
+ out := c.gcPercent
+ if in < 0 {
+ in = -1
+ }
+ c.gcPercent = in
+ c.heapMinimum = defaultHeapMinimum * uint64(c.gcPercent) / 100
+ // Update pacing in response to gcPercent change.
+ c.commit(c.triggerRatio)
+
+ return out
+}
+
+//go:linkname setGCPercent runtime/debug.setGCPercent
+func setGCPercent(in int32) (out int32) {
+ // Run on the system stack since we grab the heap lock.
+ systemstack(func() {
+ lock(&mheap_.lock)
+ out = gcController.setGCPercent(in)
+ unlock(&mheap_.lock)
+ })
+
+ // If we just disabled GC, wait for any concurrent GC mark to
+ // finish so we always return with no GC running.
+ if in < 0 {
+ gcWaitOnMark(atomic.Load(&work.cycles))
+ }
+
+ return out
+}
+
+func readGOGC() int32 {
+ p := gogetenv("GOGC")
+ if p == "off" {
+ return -1
+ }
+ if n, ok := atoi32(p); ok {
+ return n
+ }
+ return 100
+}
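For reference, the same parsing rules expressed with the standard library, where os.Getenv and strconv.Atoi stand in for the runtime-internal gogetenv and atoi32:

    package main

    import (
    	"fmt"
    	"os"
    	"strconv"
    )

    // readGOGC mirrors the helper above: "off" disables GC (-1), a
    // parseable integer is used as-is, and anything else (including
    // an unset variable) falls back to the default of 100.
    func readGOGC() int {
    	p := os.Getenv("GOGC")
    	if p == "off" {
    		return -1
    	}
    	if n, err := strconv.Atoi(p); err == nil {
    		return n
    	}
    	return 100
    }

    func main() {
    	os.Setenv("GOGC", "200")
    	fmt.Println(readGOGC()) // 200
    }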
diff --git a/src/runtime/mgcscavenge.go b/src/runtime/mgcscavenge.go
index a7c5bc49b8..7578129f9d 100644
--- a/src/runtime/mgcscavenge.go
+++ b/src/runtime/mgcscavenge.go
@@ -18,12 +18,12 @@
// application down to a goal.
//
// That goal is defined as:
-// (retainExtraPercent+100) / 100 * (next_gc / last_next_gc) * last_heap_inuse
+// (retainExtraPercent+100) / 100 * (heapGoal / lastHeapGoal) * last_heap_inuse
//
// Essentially, we wish to have the application's RSS track the heap goal, but
// the heap goal is defined in terms of bytes of objects, rather than pages like
// RSS. As a result, we need to account for fragmentation internal to
-// spans. next_gc / last_next_gc defines the ratio between the current heap goal
+// spans. heapGoal / lastHeapGoal defines the ratio between the current heap goal
// and the last heap goal, which tells us by how much the heap is growing and
// shrinking. We estimate what the heap will grow to in terms of pages by taking
// this ratio and multiplying it by heap_inuse at the end of the last GC, which
@@ -118,12 +118,12 @@ func gcPaceScavenger() {
// We never scavenge before the 2nd GC cycle anyway (we don't have enough
// information about the heap yet) so this is fine, and avoids a fault
// or garbage data later.
- if memstats.last_next_gc == 0 {
+ if gcController.lastHeapGoal == 0 {
mheap_.scavengeGoal = ^uint64(0)
return
}
// Compute our scavenging goal.
- goalRatio := float64(atomic.Load64(&memstats.next_gc)) / float64(memstats.last_next_gc)
+ goalRatio := float64(atomic.Load64(&gcController.heapGoal)) / float64(gcController.lastHeapGoal)
retainedGoal := uint64(float64(memstats.last_heap_inuse) * goalRatio)
// Add retainExtraPercent overhead to retainedGoal. This calculation
// looks strange but the purpose is to arrive at an integer division
@@ -207,7 +207,7 @@ func wakeScavenger() {
// Ready the goroutine by injecting it. We use injectglist instead
// of ready or goready in order to allow us to run this function
// without a P. injectglist also avoids placing the goroutine in
- // the current P's runnext slot, which is desireable to prevent
+ // the current P's runnext slot, which is desirable to prevent
// the scavenger from interfering with user goroutine scheduling
// too much.
var list gList
@@ -249,7 +249,7 @@ func scavengeSleep(ns int64) int64 {
// The background scavenger maintains the RSS of the application below
// the line described by the proportional scavenging statistics in
// the mheap struct.
-func bgscavenge(c chan int) {
+func bgscavenge() {
scavenge.g = getg()
lockInit(&scavenge.lock, lockRankScavenge)
@@ -261,7 +261,7 @@ func bgscavenge(c chan int) {
wakeScavenger()
}
- c <- 1
+ gcenable_setup <- 1
goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1)
// Exponentially-weighted moving average of the fraction of time this
@@ -372,7 +372,7 @@ func bgscavenge(c chan int) {
// Due to OS-related anomalies we may "sleep" for an inordinate amount
// of time. Let's avoid letting the ratio get out of hand by bounding
// the sleep time we use in our EWMA.
- const minFraction = 1 / 1000
+ const minFraction = 1.0 / 1000.0
if fraction < minFraction {
fraction = minFraction
}
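The minFraction change above is a genuine bug fix rather than a style tweak: 1 / 1000 is division of untyped integer constants, which truncates to exactly 0, so the bound was never applied. A two-line demonstration:

    package main

    import "fmt"

    func main() {
    	const bad = 1 / 1000      // integer constant division: 0
    	const good = 1.0 / 1000.0 // float constant division: 0.001
    	fmt.Println(float64(bad), good) // prints: 0 0.001
    }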
diff --git a/src/runtime/mgcscavenge_test.go b/src/runtime/mgcscavenge_test.go
index 250343077f..3b12a2e1e6 100644
--- a/src/runtime/mgcscavenge_test.go
+++ b/src/runtime/mgcscavenge_test.go
@@ -152,12 +152,6 @@ func TestPallocDataFindScavengeCandidate(t *testing.T) {
max: PallocChunkPages,
want: BitRange{0, uint(m)},
}
- tests["StartFree"+suffix] = test{
- alloc: []BitRange{{uint(m), PallocChunkPages - uint(m)}},
- min: m,
- max: PallocChunkPages,
- want: BitRange{0, uint(m)},
- }
tests["EndFree"+suffix] = test{
alloc: []BitRange{{0, PallocChunkPages - uint(m)}},
min: m,
diff --git a/src/runtime/mgcstack.go b/src/runtime/mgcstack.go
index 8eb941a328..92d58485f6 100644
--- a/src/runtime/mgcstack.go
+++ b/src/runtime/mgcstack.go
@@ -150,19 +150,19 @@ func init() {
//
//go:notinheap
type stackObject struct {
- off uint32 // offset above stack.lo
- size uint32 // size of object
- typ *_type // type info (for ptr/nonptr bits). nil if object has been scanned.
- left *stackObject // objects with lower addresses
- right *stackObject // objects with higher addresses
+ off uint32 // offset above stack.lo
+ size uint32 // size of object
+ r *stackObjectRecord // info of the object (for ptr/nonptr bits). nil if object has been scanned.
+ left *stackObject // objects with lower addresses
+ right *stackObject // objects with higher addresses
}
-// obj.typ = typ, but with no write barrier.
+// obj.r = r, but with no write barrier.
//go:nowritebarrier
-func (obj *stackObject) setType(typ *_type) {
+func (obj *stackObject) setRecord(r *stackObjectRecord) {
// Records of stack objects are always in read-only memory, not the heap.
// So not using a write barrier is ok.
- *(*uintptr)(unsafe.Pointer(&obj.typ)) = uintptr(unsafe.Pointer(typ))
+ *(*uintptr)(unsafe.Pointer(&obj.r)) = uintptr(unsafe.Pointer(r))
}
// A stackScanState keeps track of the state used during the GC walk
@@ -271,7 +271,7 @@ func (s *stackScanState) getPtr() (p uintptr, conservative bool) {
}
// addObject adds a stack object at addr described by record r to the set of stack objects.
-func (s *stackScanState) addObject(addr uintptr, typ *_type) {
+func (s *stackScanState) addObject(addr uintptr, r *stackObjectRecord) {
x := s.tail
if x == nil {
// initial setup
@@ -294,8 +294,8 @@ func (s *stackScanState) addObject(addr uintptr, typ *_type) {
obj := &x.obj[x.nobj]
x.nobj++
obj.off = uint32(addr - s.stack.lo)
- obj.size = uint32(typ.size)
- obj.setType(typ)
+ obj.size = uint32(r.size)
+ obj.setRecord(r)
// obj.left and obj.right will be initialized by buildIndex before use.
s.nobjs++
}
diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go
index 76bc4246e5..8fe3a65340 100644
--- a/src/runtime/mgcsweep.go
+++ b/src/runtime/mgcsweep.go
@@ -153,13 +153,13 @@ func finishsweep_m() {
nextMarkBitArenaEpoch()
}
-func bgsweep(c chan int) {
+func bgsweep() {
sweep.g = getg()
lockInit(&sweep.lock, lockRankSweep)
lock(&sweep.lock)
sweep.parked = true
- c <- 1
+ gcenable_setup <- 1
goparkunlock(&sweep.lock, waitReasonGCSweepWait, traceEvGoBlock, 1)
for {
@@ -183,66 +183,133 @@ func bgsweep(c chan int) {
}
}
+// sweepLocker acquires sweep ownership of spans and blocks sweep
+// completion.
+type sweepLocker struct {
+ // sweepGen is the sweep generation of the heap.
+ sweepGen uint32
+ // blocking indicates that this locker is blocking sweep
+ // completion, usually as a result of acquiring sweep
+ // ownership of at least one span.
+ blocking bool
+}
+
+// sweepLocked represents sweep ownership of a span.
+type sweepLocked struct {
+ *mspan
+}
+
+func newSweepLocker() sweepLocker {
+ return sweepLocker{
+ sweepGen: mheap_.sweepgen,
+ }
+}
+
+// tryAcquire attempts to acquire sweep ownership of span s. If it
+// successfully acquires ownership, it blocks sweep completion.
+func (l *sweepLocker) tryAcquire(s *mspan) (sweepLocked, bool) {
+ // Check before attempting to CAS.
+ if atomic.Load(&s.sweepgen) != l.sweepGen-2 {
+ return sweepLocked{}, false
+ }
+ // Add ourselves to sweepers before potentially taking
+ // ownership.
+ l.blockCompletion()
+ // Attempt to acquire sweep ownership of s.
+ if !atomic.Cas(&s.sweepgen, l.sweepGen-2, l.sweepGen-1) {
+ return sweepLocked{}, false
+ }
+ return sweepLocked{s}, true
+}
+
+// blockCompletion blocks sweep completion without acquiring any
+// specific spans.
+func (l *sweepLocker) blockCompletion() {
+ if !l.blocking {
+ atomic.Xadd(&mheap_.sweepers, +1)
+ l.blocking = true
+ }
+}
+
+func (l *sweepLocker) dispose() {
+ if !l.blocking {
+ return
+ }
+ // Decrement the number of active sweepers and if this is the
+ // last one, mark sweep as complete.
+ l.blocking = false
+ if atomic.Xadd(&mheap_.sweepers, -1) == 0 && atomic.Load(&mheap_.sweepDrained) != 0 {
+ l.sweepIsDone()
+ }
+}
+
+func (l *sweepLocker) sweepIsDone() {
+ if debug.gcpacertrace > 0 {
+ print("pacer: sweep done at heap size ", gcController.heapLive>>20, "MB; allocated ", (gcController.heapLive-mheap_.sweepHeapLiveBasis)>>20, "MB during sweep; swept ", mheap_.pagesSwept, " pages at ", mheap_.sweepPagesPerByte, " pages/byte\n")
+ }
+}
+
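The acquire path turns on mspan.sweepgen relative to the heap's generation: sweepGen-2 means unswept, sweepGen-1 means being swept, and sweepGen means swept, so taking ownership is a single CAS from the first state to the second. A toy standalone model of that handshake, not the runtime types:

    package main

    import (
    	"fmt"
    	"sync/atomic"
    )

    type span struct{ sweepgen uint32 }

    // tryAcquire claims sweep ownership by moving the span from
    // "unswept" (g-2) to "being swept" (g-1) in one CAS.
    func tryAcquire(s *span, g uint32) bool {
    	return atomic.CompareAndSwapUint32(&s.sweepgen, g-2, g-1)
    }

    func main() {
    	const g = 10
    	s := &span{sweepgen: g - 2}
    	fmt.Println(tryAcquire(s, g))      // true: we own the sweep
    	fmt.Println(tryAcquire(s, g))      // false: lost the race
    	atomic.StoreUint32(&s.sweepgen, g) // done: mark swept
    }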
// sweepone sweeps some unswept heap span and returns the number of pages returned
// to the heap, or ^uintptr(0) if there was nothing to sweep.
func sweepone() uintptr {
_g_ := getg()
- sweepRatio := mheap_.sweepPagesPerByte // For debugging
// increment locks to ensure that the goroutine is not preempted
// in the middle of sweep thus leaving the span in an inconsistent state for next GC
_g_.m.locks++
- if atomic.Load(&mheap_.sweepdone) != 0 {
+ if atomic.Load(&mheap_.sweepDrained) != 0 {
_g_.m.locks--
return ^uintptr(0)
}
- atomic.Xadd(&mheap_.sweepers, +1)
+ // TODO(austin): sweepone is almost always called in a loop;
+ // lift the sweepLocker into its callers.
+ sl := newSweepLocker()
// Find a span to sweep.
- var s *mspan
- sg := mheap_.sweepgen
+ npages := ^uintptr(0)
+ var noMoreWork bool
for {
- s = mheap_.nextSpanForSweep()
+ s := mheap_.nextSpanForSweep()
if s == nil {
- atomic.Store(&mheap_.sweepdone, 1)
+ noMoreWork = atomic.Cas(&mheap_.sweepDrained, 0, 1)
break
}
if state := s.state.get(); state != mSpanInUse {
// This can happen if direct sweeping already
// swept this span, but in that case the sweep
// generation should always be up-to-date.
- if !(s.sweepgen == sg || s.sweepgen == sg+3) {
- print("runtime: bad span s.state=", state, " s.sweepgen=", s.sweepgen, " sweepgen=", sg, "\n")
+ if !(s.sweepgen == sl.sweepGen || s.sweepgen == sl.sweepGen+3) {
+ print("runtime: bad span s.state=", state, " s.sweepgen=", s.sweepgen, " sweepgen=", sl.sweepGen, "\n")
throw("non in-use span in unswept list")
}
continue
}
- if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+ if s, ok := sl.tryAcquire(s); ok {
+ // Sweep the span we found.
+ npages = s.npages
+ if s.sweep(false) {
+ // Whole span was freed. Count it toward the
+ // page reclaimer credit since these pages can
+ // now be used for span allocation.
+ atomic.Xadduintptr(&mheap_.reclaimCredit, npages)
+ } else {
+ // Span is still in-use, so this returned no
+ // pages to the heap and the span needs to
+ // move to the swept in-use list.
+ npages = 0
+ }
break
}
}
- // Sweep the span we found.
- npages := ^uintptr(0)
- if s != nil {
- npages = s.npages
- if s.sweep(false) {
- // Whole span was freed. Count it toward the
- // page reclaimer credit since these pages can
- // now be used for span allocation.
- atomic.Xadduintptr(&mheap_.reclaimCredit, npages)
- } else {
- // Span is still in-use, so this returned no
- // pages to the heap and the span needs to
- // move to the swept in-use list.
- npages = 0
- }
- }
+ sl.dispose()
- // Decrement the number of active sweepers and if this is the
- // last one print trace information.
- if atomic.Xadd(&mheap_.sweepers, -1) == 0 && atomic.Load(&mheap_.sweepdone) != 0 {
- // Since the sweeper is done, move the scavenge gen forward (signalling
+ if noMoreWork {
+ // The sweep list is empty. There may still be
+ // concurrent sweeps running, but we're at least very
+ // close to done sweeping.
+
+ // Move the scavenge gen forward (signalling
// that there's new work to do) and wake the scavenger.
//
// The scavenger is signaled by the last sweeper because once
@@ -262,23 +329,23 @@ func sweepone() uintptr {
// for us to wake the scavenger directly via wakeScavenger, since
// it could allocate. Ask sysmon to do it for us instead.
readyForScavenger()
-
- if debug.gcpacertrace > 0 {
- print("pacer: sweep done at heap size ", memstats.heap_live>>20, "MB; allocated ", (memstats.heap_live-mheap_.sweepHeapLiveBasis)>>20, "MB during sweep; swept ", mheap_.pagesSwept, " pages at ", sweepRatio, " pages/byte\n")
- }
}
+
_g_.m.locks--
return npages
}
-// isSweepDone reports whether all spans are swept or currently being swept.
+// isSweepDone reports whether all spans are swept.
//
// Note that this condition may transition from false to true at any
// time as the sweeper runs. It may transition from true to false if a
// GC runs; to prevent that the caller must be non-preemptible or must
// somehow block GC progress.
func isSweepDone() bool {
- return mheap_.sweepdone != 0
+ // Check that all spans have at least begun sweeping and there
+ // are no active sweepers. If both are true, then all spans
+ // have finished sweeping.
+ return atomic.Load(&mheap_.sweepDrained) != 0 && atomic.Load(&mheap_.sweepers) == 0
}
// Returns only when span s has been swept.
@@ -292,20 +359,19 @@ func (s *mspan) ensureSwept() {
throw("mspan.ensureSwept: m is not locked")
}
- sg := mheap_.sweepgen
- spangen := atomic.Load(&s.sweepgen)
- if spangen == sg || spangen == sg+3 {
- return
- }
+ sl := newSweepLocker()
// The caller must be sure that the span is a mSpanInUse span.
- if atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+ if s, ok := sl.tryAcquire(s); ok {
s.sweep(false)
+ sl.dispose()
return
}
+ sl.dispose()
+
// unfortunate condition, and we don't have efficient means to wait
for {
spangen := atomic.Load(&s.sweepgen)
- if spangen == sg || spangen == sg+3 {
+ if spangen == sl.sweepGen || spangen == sl.sweepGen+3 {
break
}
osyield()
@@ -317,13 +383,21 @@ func (s *mspan) ensureSwept() {
// Returns true if the span was returned to heap.
// If preserve=true, don't return it to heap nor relink in mcentral lists;
// caller takes care of it.
-func (s *mspan) sweep(preserve bool) bool {
+func (sl *sweepLocked) sweep(preserve bool) bool {
// It's critical that we enter this function with preemption disabled,
// GC must not start while we are in the middle of this function.
_g_ := getg()
if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 {
throw("mspan.sweep: m is not locked")
}
+
+ s := sl.mspan
+ if !preserve {
+ // We'll release ownership of this span. Nil it out to
+ // prevent the caller from accidentally using it.
+ sl.mspan = nil
+ }
+
sweepgen := mheap_.sweepgen
if state := s.state.get(); state != mSpanInUse || s.sweepgen != sweepgen-1 {
print("mspan.sweep: state=", state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
@@ -356,11 +430,10 @@ func (s *mspan) sweep(preserve bool) bool {
// If such object is not marked, we need to queue all finalizers at once.
// Both 1 and 2 are possible at the same time.
hadSpecials := s.specials != nil
- specialp := &s.specials
- special := *specialp
- for special != nil {
+ siter := newSpecialsIter(s)
+ for siter.valid() {
// A finalizer can be set for an inner byte of an object, find object beginning.
- objIndex := uintptr(special.offset) / size
+ objIndex := uintptr(siter.s.offset) / size
p := s.base() + objIndex*size
mbits := s.markBitsForIndex(objIndex)
if !mbits.isMarked() {
@@ -368,7 +441,7 @@ func (s *mspan) sweep(preserve bool) bool {
// Pass 1: see if it has at least one finalizer.
hasFin := false
endOffset := p - s.base() + size
- for tmp := special; tmp != nil && uintptr(tmp.offset) < endOffset; tmp = tmp.next {
+ for tmp := siter.s; tmp != nil && uintptr(tmp.offset) < endOffset; tmp = tmp.next {
if tmp.kind == _KindSpecialFinalizer {
// Stop freeing of object if it has a finalizer.
mbits.setMarkedNonAtomic()
@@ -377,27 +450,31 @@ func (s *mspan) sweep(preserve bool) bool {
}
}
// Pass 2: queue all finalizers _or_ handle profile record.
- for special != nil && uintptr(special.offset) < endOffset {
+ for siter.valid() && uintptr(siter.s.offset) < endOffset {
// Find the exact byte for which the special was setup
// (as opposed to object beginning).
+ special := siter.s
p := s.base() + uintptr(special.offset)
if special.kind == _KindSpecialFinalizer || !hasFin {
- // Splice out special record.
- y := special
- special = special.next
- *specialp = special
- freespecial(y, unsafe.Pointer(p), size)
+ siter.unlinkAndNext()
+ freeSpecial(special, unsafe.Pointer(p), size)
} else {
- // This is profile record, but the object has finalizers (so kept alive).
- // Keep special record.
- specialp = &special.next
- special = *specialp
+ // The object has finalizers, so we're keeping it alive.
+ // All other specials only apply when an object is freed,
+ // so just keep the special record.
+ siter.next()
}
}
} else {
- // object is still live: keep special record
- specialp = &special.next
- special = *specialp
+ // object is still live
+ if siter.s.kind == _KindSpecialReachable {
+ special := siter.unlinkAndNext()
+ (*specialReachable)(unsafe.Pointer(special)).reachable = true
+ freeSpecial(special, unsafe.Pointer(p), size)
+ } else {
+ // keep special record
+ siter.next()
+ }
}
}
if hadSpecials && s.specials == nil {
@@ -645,7 +722,7 @@ retry:
sweptBasis := atomic.Load64(&mheap_.pagesSweptBasis)
// Fix debt if necessary.
- newHeapLive := uintptr(atomic.Load64(&memstats.heap_live)-mheap_.sweepHeapLiveBasis) + spanBytes
+ newHeapLive := uintptr(atomic.Load64(&gcController.heapLive)-mheap_.sweepHeapLiveBasis) + spanBytes
pagesTarget := int64(mheap_.sweepPagesPerByte*float64(newHeapLive)) - int64(callerSweepPages)
for pagesTarget > int64(atomic.Load64(&mheap_.pagesSwept)-sweptBasis) {
if sweepone() == ^uintptr(0) {
diff --git a/src/runtime/mgcwork.go b/src/runtime/mgcwork.go
index b3a068661e..667c7afa97 100644
--- a/src/runtime/mgcwork.go
+++ b/src/runtime/mgcwork.go
@@ -398,7 +398,7 @@ func getempty() *workbuf {
}
// putempty puts a workbuf onto the work.empty list.
-// Upon entry this go routine owns b. The lfstack.push relinquishes ownership.
+// Upon entry this goroutine owns b. The lfstack.push relinquishes ownership.
//go:nowritebarrier
func putempty(b *workbuf) {
b.checkempty()
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 1855330da5..84c00ce8f8 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -62,11 +62,12 @@ const (
type mheap struct {
// lock must only be acquired on the system stack, otherwise a g
// could self-deadlock if its stack grows with the lock held.
- lock mutex
- pages pageAlloc // page allocation data structure
- sweepgen uint32 // sweep generation, see comment in mspan; written during STW
- sweepdone uint32 // all spans are swept
- sweepers uint32 // number of active sweepone calls
+ lock mutex
+ pages pageAlloc // page allocation data structure
+
+ sweepgen uint32 // sweep generation, see comment in mspan; written during STW
+ sweepDrained uint32 // all spans are swept or are being swept
+ sweepers uint32 // number of active sweepone calls
// allspans is a slice of all mspans ever created. Each mspan
// appears exactly once.
@@ -85,14 +86,14 @@ type mheap struct {
// Proportional sweep
//
- // These parameters represent a linear function from heap_live
+ // These parameters represent a linear function from gcController.heapLive
// to page sweep count. The proportional sweep system works to
// stay in the black by keeping the current page sweep count
- // above this line at the current heap_live.
+ // above this line at the current gcController.heapLive.
//
// The line has slope sweepPagesPerByte and passes through a
// basis point at (sweepHeapLiveBasis, pagesSweptBasis). At
- // any given time, the system is at (memstats.heap_live,
+ // any given time, the system is at (gcController.heapLive,
// pagesSwept) in this space.
//
// It's important that the line pass through a point we
@@ -104,7 +105,7 @@ type mheap struct {
pagesInUse uint64 // pages of spans in stats mSpanInUse; updated atomically
pagesSwept uint64 // pages swept this cycle; updated atomically
pagesSweptBasis uint64 // pagesSwept to use as the origin of the sweep ratio; updated atomically
- sweepHeapLiveBasis uint64 // value of heap_live to use as the origin of sweep ratio; written with lock, read without
+ sweepHeapLiveBasis uint64 // value of gcController.heapLive to use as the origin of sweep ratio; written with lock, read without
sweepPagesPerByte float64 // proportional sweep ratio; written with lock, read without
// TODO(austin): pagesInUse should be a uintptr, but the 386
// compiler can't 8-byte align fields.
@@ -212,6 +213,7 @@ type mheap struct {
cachealloc fixalloc // allocator for mcache*
specialfinalizeralloc fixalloc // allocator for specialfinalizer*
specialprofilealloc fixalloc // allocator for specialprofile*
+ specialReachableAlloc fixalloc // allocator for specialReachable
speciallock mutex // lock for special record allocators.
arenaHintAlloc fixalloc // allocator for arenaHints
@@ -451,14 +453,11 @@ type mspan struct {
// h->sweepgen is incremented by 2 after every GC
sweepgen uint32
- divMul uint16 // for divide by elemsize - divMagic.mul
- baseMask uint16 // if non-0, elemsize is a power of 2, & this will get object allocation base
+ divMul uint32 // for divide by elemsize
allocCount uint16 // number of allocated objects
spanclass spanClass // size class and noscan (uint8)
state mSpanStateBox // mSpanInUse etc; accessed atomically (get/set methods)
needzero uint8 // needs to be zeroed before allocation
- divShift uint8 // for divide by elemsize - divMagic.shift
- divShift2 uint8 // for divide by elemsize - divMagic.shift2
elemsize uintptr // computed from sizeclass or from npages
limit uintptr // end of data in span
speciallock mutex // guards specials list
@@ -706,6 +705,7 @@ func (h *mheap) init() {
h.cachealloc.init(unsafe.Sizeof(mcache{}), nil, nil, &memstats.mcache_sys)
h.specialfinalizeralloc.init(unsafe.Sizeof(specialfinalizer{}), nil, nil, &memstats.other_sys)
h.specialprofilealloc.init(unsafe.Sizeof(specialprofile{}), nil, nil, &memstats.other_sys)
+ h.specialReachableAlloc.init(unsafe.Sizeof(specialReachable{}), nil, nil, &memstats.other_sys)
h.arenaHintAlloc.init(unsafe.Sizeof(arenaHint{}), nil, nil, &memstats.other_sys)
// Don't zero mspan allocations. Background sweeping can
@@ -818,7 +818,7 @@ func (h *mheap) reclaimChunk(arenas []arenaIdx, pageIdx, n uintptr) uintptr {
n0 := n
var nFreed uintptr
- sg := h.sweepgen
+ sl := newSweepLocker()
for n > 0 {
ai := arenas[pageIdx/pagesPerArena]
ha := h.arenas[ai.l1()][ai.l2()]
@@ -843,7 +843,7 @@ func (h *mheap) reclaimChunk(arenas []arenaIdx, pageIdx, n uintptr) uintptr {
for j := uint(0); j < 8; j++ {
if inUseUnmarked&(1<<j) != 0 {
s := ha.spans[arenaPage+uint(i)*8+j]
- if atomic.Load(&s.sweepgen) == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+ if s, ok := sl.tryAcquire(s); ok {
npages := s.npages
unlock(&h.lock)
if s.sweep(false) {
@@ -864,6 +864,7 @@ func (h *mheap) reclaimChunk(arenas []arenaIdx, pageIdx, n uintptr) uintptr {
pageIdx += uintptr(len(inUse) * 8)
n -= uintptr(len(inUse) * 8)
}
+ sl.dispose()
if trace.enabled {
unlock(&h.lock)
// Account for pages scanned but not reclaimed.
@@ -896,7 +897,9 @@ func (s spanAllocType) manual() bool {
// spanclass indicates the span's size class and scannability.
//
// If needzero is true, the memory for the returned span will be zeroed.
-func (h *mheap) alloc(npages uintptr, spanclass spanClass, needzero bool) *mspan {
+// The boolean returned indicates whether the returned span contains zeroes,
+// either because this was requested, or because it was already zeroed.
+func (h *mheap) alloc(npages uintptr, spanclass spanClass, needzero bool) (*mspan, bool) {
// Don't do any operations that lock the heap on the G stack.
// It might trigger stack growth, and the stack growth code needs
// to be able to allocate heap.
@@ -904,19 +907,22 @@ func (h *mheap) alloc(npages uintptr, spanclass spanClass, needzero bool) *mspan
systemstack(func() {
// To prevent excessive heap growth, before allocating n pages
// we need to sweep and reclaim at least n pages.
- if h.sweepdone == 0 {
+ if !isSweepDone() {
h.reclaim(npages)
}
s = h.allocSpan(npages, spanAllocHeap, spanclass)
})
- if s != nil {
- if needzero && s.needzero != 0 {
- memclrNoHeapPointers(unsafe.Pointer(s.base()), s.npages<<_PageShift)
- }
- s.needzero = 0
+ if s == nil {
+ return nil, false
}
- return s
+ isZeroed := s.needzero == 0
+ if needzero && !isZeroed {
+ memclrNoHeapPointers(unsafe.Pointer(s.base()), s.npages<<_PageShift)
+ isZeroed = true
+ }
+ s.needzero = 0
+ return s, isZeroed
}
// allocManual allocates a manually-managed span of npage pages.
@@ -1224,20 +1230,11 @@ HaveSpan:
if sizeclass := spanclass.sizeclass(); sizeclass == 0 {
s.elemsize = nbytes
s.nelems = 1
-
- s.divShift = 0
s.divMul = 0
- s.divShift2 = 0
- s.baseMask = 0
} else {
s.elemsize = uintptr(class_to_size[sizeclass])
s.nelems = nbytes / s.elemsize
-
- m := &class_to_divmagic[sizeclass]
- s.divShift = m.shift
- s.divMul = m.mul
- s.divShift2 = m.shift2
- s.baseMask = m.baseMask
+ s.divMul = class_to_divmagic[sizeclass]
}
// Initialize mark and allocation structures.
@@ -1332,6 +1329,10 @@ func (h *mheap) grow(npage uintptr) bool {
assertLockHeld(&h.lock)
// We must grow the heap in whole palloc chunks.
+ // We call sysMap below, but because we round up to
+ // pallocChunkPages, which is on the order of MiB
+ // (generally at least the huge page size), we won't
+ // be calling it too often.
ask := alignUp(npage, pallocChunkPages) * pageSize
totalGrowth := uintptr(0)
@@ -1358,6 +1359,17 @@ func (h *mheap) grow(npage uintptr) bool {
// remains of the current space and switch to
// the new space. This should be rare.
if size := h.curArena.end - h.curArena.base; size != 0 {
+ // Transition this space from Reserved to Prepared and mark it
+ // as released since we'll be able to start using it after updating
+ // the page allocator and releasing the lock at any time.
+ sysMap(unsafe.Pointer(h.curArena.base), size, &memstats.heap_sys)
+ // Update stats.
+ atomic.Xadd64(&memstats.heap_released, int64(size))
+ stats := memstats.heapStats.acquire()
+ atomic.Xaddint64(&stats.released, int64(size))
+ memstats.heapStats.release()
+ // Update the page allocator's structures to make this
+ // space ready for allocation.
h.pages.grow(h.curArena.base, size)
totalGrowth += size
}
@@ -1366,17 +1378,6 @@ func (h *mheap) grow(npage uintptr) bool {
h.curArena.end = uintptr(av) + asize
}
- // The memory just allocated counts as both released
- // and idle, even though it's not yet backed by spans.
- //
- // The allocation is always aligned to the heap arena
- // size which is always > physPageSize, so its safe to
- // just add directly to heap_released.
- atomic.Xadd64(&memstats.heap_released, int64(asize))
- stats := memstats.heapStats.acquire()
- atomic.Xaddint64(&stats.released, int64(asize))
- memstats.heapStats.release()
-
// Recalculate nBase.
// We know this won't overflow, because sysAlloc returned
// a valid region starting at h.curArena.base which is at
@@ -1387,6 +1388,23 @@ func (h *mheap) grow(npage uintptr) bool {
// Grow into the current arena.
v := h.curArena.base
h.curArena.base = nBase
+
+ // Transition the space we're going to use from Reserved to Prepared.
+ sysMap(unsafe.Pointer(v), nBase-v, &memstats.heap_sys)
+
+ // The memory just allocated counts as both released
+ // and idle, even though it's not yet backed by spans.
+ //
+ // The allocation is always aligned to the heap arena
+ // size, which is always > physPageSize, so it's safe to
+ // just add directly to heap_released.
+ atomic.Xadd64(&memstats.heap_released, int64(nBase-v))
+ stats := memstats.heapStats.acquire()
+ atomic.Xaddint64(&stats.released, int64(nBase-v))
+ memstats.heapStats.release()
+
+ // Update the page allocator's structures to make this
+ // space ready for allocation.
h.pages.grow(v, nBase-v)
totalGrowth += nBase - v
@@ -1640,6 +1658,9 @@ func (list *mSpanList) takeAll(other *mSpanList) {
const (
_KindSpecialFinalizer = 1
_KindSpecialProfile = 2
+ // _KindSpecialReachable is a special used for tracking
+ // reachability during testing.
+ _KindSpecialReachable = 3
// Note: The finalizer special must be first because if we're freeing
// an object, a finalizer special will cause the freeing operation
// to abort, and we want to keep the other special records around
@@ -1845,9 +1866,45 @@ func setprofilebucket(p unsafe.Pointer, b *bucket) {
}
}
-// Do whatever cleanup needs to be done to deallocate s. It has
-// already been unlinked from the mspan specials list.
-func freespecial(s *special, p unsafe.Pointer, size uintptr) {
+// specialReachable tracks whether an object is reachable on the next
+// GC cycle. This is used by testing.
+type specialReachable struct {
+ special special
+ done bool
+ reachable bool
+}
+
+// specialsIter helps iterate over specials lists.
+type specialsIter struct {
+ pprev **special
+ s *special
+}
+
+func newSpecialsIter(span *mspan) specialsIter {
+ return specialsIter{&span.specials, span.specials}
+}
+
+func (i *specialsIter) valid() bool {
+ return i.s != nil
+}
+
+func (i *specialsIter) next() {
+ i.pprev = &i.s.next
+ i.s = *i.pprev
+}
+
+// unlinkAndNext removes the current special from the list and moves
+// the iterator to the next special. It returns the unlinked special.
+func (i *specialsIter) unlinkAndNext() *special {
+ cur := i.s
+ i.s = cur.next
+ *i.pprev = i.s
+ return cur
+}
+
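The iterator keeps pprev aimed at the link that leads to the current special, so unlinkAndNext can splice a node out with one pointer store and no second traversal. The same shape on an ordinary singly linked list:

    package main

    import "fmt"

    type node struct {
    	val  int
    	next *node
    }

    type iter struct {
    	pprev **node // the link that points at n
    	n     *node
    }

    func newIter(head **node) iter { return iter{head, *head} }

    func (i *iter) valid() bool { return i.n != nil }

    func (i *iter) next() {
    	i.pprev = &i.n.next
    	i.n = *i.pprev
    }

    // unlinkAndNext removes the current node in O(1) and advances.
    func (i *iter) unlinkAndNext() *node {
    	cur := i.n
    	i.n = cur.next
    	*i.pprev = i.n
    	return cur
    }

    func main() {
    	head := &node{1, &node{2, &node{3, nil}}}
    	for it := newIter(&head); it.valid(); {
    		if it.n.val == 2 {
    			it.unlinkAndNext() // drop the 2
    		} else {
    			it.next()
    		}
    	}
    	for n := head; n != nil; n = n.next {
    		fmt.Print(n.val, " ") // 1 3
    	}
    	fmt.Println()
    }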
+// freeSpecial performs any cleanup on special s and deallocates it.
+// s must already be unlinked from the specials list.
+func freeSpecial(s *special, p unsafe.Pointer, size uintptr) {
switch s.kind {
case _KindSpecialFinalizer:
sf := (*specialfinalizer)(unsafe.Pointer(s))
@@ -1861,6 +1918,10 @@ func freespecial(s *special, p unsafe.Pointer, size uintptr) {
lock(&mheap_.speciallock)
mheap_.specialprofilealloc.free(unsafe.Pointer(sp))
unlock(&mheap_.speciallock)
+ case _KindSpecialReachable:
+ sp := (*specialReachable)(unsafe.Pointer(s))
+ sp.done = true
+ // The creator frees these.
default:
throw("bad special kind")
panic("not reached")
diff --git a/src/runtime/mkduff.go b/src/runtime/mkduff.go
index ef297f073e..8632fe08a3 100644
--- a/src/runtime/mkduff.go
+++ b/src/runtime/mkduff.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
// runtime·duffzero is a Duff's device for zeroing memory.
diff --git a/src/runtime/mkfastlog2table.go b/src/runtime/mkfastlog2table.go
index d650292394..8d78a3923a 100644
--- a/src/runtime/mkfastlog2table.go
+++ b/src/runtime/mkfastlog2table.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
// fastlog2Table contains log2 approximations for 5 binary digits.
diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go
index 1d614dd003..3a9e6cc478 100644
--- a/src/runtime/mkpreempt.go
+++ b/src/runtime/mkpreempt.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
// mkpreempt generates the asyncPreempt functions for each
@@ -229,12 +230,16 @@ func genAMD64() {
if reg == "SP" || reg == "BP" {
continue
}
- if strings.HasPrefix(reg, "X") {
- l.add("MOVUPS", reg, 16)
- } else {
+ if !strings.HasPrefix(reg, "X") {
l.add("MOVQ", reg, 8)
}
}
+ lSSE := layout{stack: l.stack, sp: "SP"}
+ for _, reg := range regNamesAMD64 {
+ if strings.HasPrefix(reg, "X") {
+ lSSE.add("MOVUPS", reg, 16)
+ }
+ }
// TODO: MXCSR register?
@@ -243,10 +248,12 @@ func genAMD64() {
p("// Save flags before clobbering them")
p("PUSHFQ")
p("// obj doesn't understand ADD/SUB on SP, but does understand ADJSP")
- p("ADJSP $%d", l.stack)
+ p("ADJSP $%d", lSSE.stack)
p("// But vet doesn't know ADJSP, so suppress vet stack checking")
p("NOP SP")
+ l.save()
+
// Apparently, the signal handling code path in darwin kernel leaves
// the upper bits of Y registers in a dirty state, which causes
// many SSE operations (128-bit and narrower) become much slower.
@@ -258,10 +265,11 @@ func genAMD64() {
p("VZEROUPPER")
p("#endif")
- l.save()
+ lSSE.save()
p("CALL ·asyncPreempt2(SB)")
+ lSSE.restore()
l.restore()
- p("ADJSP $%d", -l.stack)
+ p("ADJSP $%d", -lSSE.stack)
p("POPFQ")
p("POPQ BP")
p("RET")
diff --git a/src/runtime/mksizeclasses.go b/src/runtime/mksizeclasses.go
index b92d1fed5f..b1b10e9e02 100644
--- a/src/runtime/mksizeclasses.go
+++ b/src/runtime/mksizeclasses.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
// Generate tables for small malloc size classes.
@@ -36,6 +37,8 @@ import (
"go/format"
"io"
"log"
+ "math"
+ "math/bits"
"os"
)
@@ -86,11 +89,6 @@ const (
type class struct {
size int // max size
npages int // number of pages
-
- mul int
- shift uint
- shift2 uint
- mask int
}
func powerOfTwo(x int) bool {
@@ -167,9 +165,9 @@ func makeClasses() []class {
return classes
}
-// computeDivMagic computes some magic constants to implement
-// the division required to compute object number from span offset.
-// n / c.size is implemented as n >> c.shift * c.mul >> c.shift2
+// computeDivMagic checks that the division required to compute object
+// index from span offset can be computed using 32-bit multiplication.
+// n / c.size is implemented as (n * (^uint32(0)/uint32(c.size) + 1)) >> 32
// for all 0 <= n <= c.npages * pageSize
func computeDivMagic(c *class) {
// divisor
@@ -181,68 +179,67 @@ func computeDivMagic(c *class) {
// maximum input value for which the formula needs to work.
max := c.npages * pageSize
+ // As reported in [1], if n and d are unsigned N-bit integers, we
+ // can compute n / d as ⌊n * c / 2^F⌋, where c is ⌈2^F / d⌉ and F is
+ // computed with:
+ //
+ // Algorithm 2: Algorithm to select the number of fractional bits
+ // and the scaled approximate reciprocal in the case of unsigned
+ // integers.
+ //
+ // if d is a power of two then
+ // Let F ← log₂(d) and c = 1.
+ // else
+ // Let F ← N + L where L is the smallest integer
+ // such that d ≤ (2^(N+L) mod d) + 2^L.
+ // end if
+ //
+ // [1] "Faster Remainder by Direct Computation: Applications to
+ // Compilers and Software Libraries" Daniel Lemire, Owen Kaser,
+ // Nathan Kurz arXiv:1902.01961
+ //
+ // To minimize the risk of introducing errors, we implement the
+ // algorithm exactly as stated, rather than trying to adapt it to
+ // fit typical Go idioms.
+ N := bits.Len(uint(max))
+ var F int
if powerOfTwo(d) {
- // If the size is a power of two, heapBitsForObject can divide even faster by masking.
- // Compute this mask.
- if max >= 1<<16 {
- panic("max too big for power of two size")
+ F = int(math.Log2(float64(d)))
+ if d != 1<<F {
+ panic("imprecise log2")
}
- c.mask = 1<<16 - d
- }
-
- // Compute pre-shift by factoring power of 2 out of d.
- for d%2 == 0 {
- c.shift++
- d >>= 1
- max >>= 1
- }
-
- // Find the smallest k that works.
- // A small k allows us to fit the math required into 32 bits
- // so we can use 32-bit multiplies and shifts on 32-bit platforms.
-nextk:
- for k := uint(0); ; k++ {
- mul := (int(1)<<k + d - 1) / d // ⌈2^k / d⌉
-
- // Test to see if mul works.
- for n := 0; n <= max; n++ {
- if n*mul>>k != n/d {
- continue nextk
+ } else {
+ for L := 0; ; L++ {
+ if d <= ((1<<(N+L))%d)+(1<<L) {
+ F = N + L
+ break
}
}
- if mul >= 1<<16 {
- panic("mul too big")
- }
- if uint64(mul)*uint64(max) >= 1<<32 {
- panic("mul*max too big")
- }
- c.mul = mul
- c.shift2 = k
- break
}
- // double-check.
+ // Also, as noted in the paper, F is the smallest number of fractional
+ // bits required. We use 32 bits, because it works for all size
+ // classes and is fast on all CPU architectures that we support.
+ if F > 32 {
+ fmt.Printf("d=%d max=%d N=%d F=%d\n", c.size, max, N, F)
+ panic("size class requires more than 32 bits of precision")
+ }
+
+ // Brute force double-check with the exact computation that will be
+ // done by the runtime.
+ m := ^uint32(0)/uint32(c.size) + 1
for n := 0; n <= max; n++ {
- if n*c.mul>>c.shift2 != n/d {
- fmt.Printf("d=%d max=%d mul=%d shift2=%d n=%d\n", d, max, c.mul, c.shift2, n)
- panic("bad multiply magic")
- }
- // Also check the exact computations that will be done by the runtime,
- // for both 32 and 64 bit operations.
- if uint32(n)*uint32(c.mul)>>uint8(c.shift2) != uint32(n/d) {
- fmt.Printf("d=%d max=%d mul=%d shift2=%d n=%d\n", d, max, c.mul, c.shift2, n)
+ if uint32((uint64(n)*uint64(m))>>32) != uint32(n/c.size) {
+ fmt.Printf("d=%d max=%d m=%d n=%d\n", d, max, m, n)
panic("bad 32-bit multiply magic")
}
- if uint64(n)*uint64(c.mul)>>uint8(c.shift2) != uint64(n/d) {
- fmt.Printf("d=%d max=%d mul=%d shift2=%d n=%d\n", d, max, c.mul, c.shift2, n)
- panic("bad 64-bit multiply magic")
- }
}
}
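As a concrete check of the reciprocal trick, take size class 48 (one of the real small classes): the generated constant is ^uint32(0)/48 + 1, and the 32-bit multiply-and-shift reproduces n/48 for the sampled offsets below (the generator brute-forces the full range):

    package main

    import "fmt"

    func main() {
    	const size = 48
    	m := ^uint32(0)/size + 1 // same form as the generated table entry
    	for _, n := range []uint64{0, 47, 48, 96, 8192} {
    		q := uint32((n * uint64(m)) >> 32)
    		fmt.Printf("%d/%d: want %d, got %d\n", n, size, n/size, q)
    	}
    }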
func printComment(w io.Writer, classes []class) {
- fmt.Fprintf(w, "// %-5s %-9s %-10s %-7s %-10s %-9s\n", "class", "bytes/obj", "bytes/span", "objects", "tail waste", "max waste")
+ fmt.Fprintf(w, "// %-5s %-9s %-10s %-7s %-10s %-9s %-9s\n", "class", "bytes/obj", "bytes/span", "objects", "tail waste", "max waste", "min align")
prevSize := 0
+ var minAligns [pageShift + 1]int
for i, c := range classes {
if i == 0 {
continue
@@ -251,8 +248,32 @@ func printComment(w io.Writer, classes []class) {
objects := spanSize / c.size
tailWaste := spanSize - c.size*(spanSize/c.size)
maxWaste := float64((c.size-prevSize-1)*objects+tailWaste) / float64(spanSize)
+ alignBits := bits.TrailingZeros(uint(c.size))
+ if alignBits > pageShift {
+ // object alignment is capped at page alignment
+ alignBits = pageShift
+ }
+ for i := range minAligns {
+ if i > alignBits {
+ minAligns[i] = 0
+ } else if minAligns[i] == 0 {
+ minAligns[i] = c.size
+ }
+ }
prevSize = c.size
- fmt.Fprintf(w, "// %5d %9d %10d %7d %10d %8.2f%%\n", i, c.size, spanSize, objects, tailWaste, 100*maxWaste)
+ fmt.Fprintf(w, "// %5d %9d %10d %7d %10d %8.2f%% %9d\n", i, c.size, spanSize, objects, tailWaste, 100*maxWaste, 1<<alignBits)
+ }
+ fmt.Fprintf(w, "\n")
+
+ fmt.Fprintf(w, "// %-9s %-4s %-12s\n", "alignment", "bits", "min obj size")
+ for bits, size := range minAligns {
+ if size == 0 {
+ break
+ }
+ if bits+1 < len(minAligns) && size == minAligns[bits+1] {
+ continue
+ }
+ fmt.Fprintf(w, "// %9d %4d %12d\n", 1<<bits, bits, size)
}
fmt.Fprintf(w, "\n")
}
@@ -279,15 +300,13 @@ func printClasses(w io.Writer, classes []class) {
}
fmt.Fprintln(w, "}")
- fmt.Fprintln(w, "type divMagic struct {")
- fmt.Fprintln(w, " shift uint8")
- fmt.Fprintln(w, " shift2 uint8")
- fmt.Fprintln(w, " mul uint16")
- fmt.Fprintln(w, " baseMask uint16")
- fmt.Fprintln(w, "}")
- fmt.Fprint(w, "var class_to_divmagic = [_NumSizeClasses]divMagic {")
+ fmt.Fprint(w, "var class_to_divmagic = [_NumSizeClasses]uint32 {")
for _, c := range classes {
- fmt.Fprintf(w, "{%d,%d,%d,%d},", c.shift, c.shift2, c.mul, c.mask)
+ if c.size == 0 {
+ fmt.Fprintf(w, "0,")
+ continue
+ }
+ fmt.Fprintf(w, "^uint32(0)/%d+1,", c.size)
}
fmt.Fprintln(w, "}")
diff --git a/src/runtime/mmap.go b/src/runtime/mmap.go
index 1b1848b79e..7460eb3104 100644
--- a/src/runtime/mmap.go
+++ b/src/runtime/mmap.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !aix && !darwin && !js && (!linux || !amd64) && (!linux || !arm64) && !openbsd && !plan9 && !solaris && !windows
// +build !aix
// +build !darwin
// +build !js
diff --git a/src/runtime/mpagealloc.go b/src/runtime/mpagealloc.go
index dac1f39969..071f1fc274 100644
--- a/src/runtime/mpagealloc.go
+++ b/src/runtime/mpagealloc.go
@@ -395,6 +395,9 @@ func (p *pageAlloc) grow(base, size uintptr) {
// Store it atomically to avoid races with readers which
// don't acquire the heap lock.
r := sysAlloc(unsafe.Sizeof(*p.chunks[0]), p.sysStat)
+ if r == nil {
+ throw("pageAlloc: out of memory")
+ }
atomic.StorepNoWB(unsafe.Pointer(&p.chunks[c.l1()]), r)
}
p.chunkOf(c).scavenged.setRange(0, pallocChunkPages)
diff --git a/src/runtime/mpagealloc_32bit.go b/src/runtime/mpagealloc_32bit.go
index 331dadade9..fceb4e7a18 100644
--- a/src/runtime/mpagealloc_32bit.go
+++ b/src/runtime/mpagealloc_32bit.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build 386 || arm || mips || mipsle || wasm || (ios && arm64)
// +build 386 arm mips mipsle wasm ios,arm64
// wasm is treated as a 32-bit architecture for the purposes of the page
diff --git a/src/runtime/mpagealloc_64bit.go b/src/runtime/mpagealloc_64bit.go
index ffacb46c18..16577346a7 100644
--- a/src/runtime/mpagealloc_64bit.go
+++ b/src/runtime/mpagealloc_64bit.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build amd64 || (!ios && arm64) || mips64 || mips64le || ppc64 || ppc64le || riscv64 || s390x
// +build amd64 !ios,arm64 mips64 mips64le ppc64 ppc64le riscv64 s390x
// See mpagealloc_32bit.go for why ios/arm64 is excluded here.
diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go
index 128498d69b..5235b898e4 100644
--- a/src/runtime/mprof.go
+++ b/src/runtime/mprof.go
@@ -133,7 +133,7 @@ func (a *memRecordCycle) add(b *memRecordCycle) {
// A blockRecord is the bucket data for a bucket of type blockProfile,
// which is used in blocking and mutex profiles.
type blockRecord struct {
- count int64
+ count float64
cycles int64
}
@@ -398,20 +398,23 @@ func blockevent(cycles int64, skip int) {
if cycles <= 0 {
cycles = 1
}
- if blocksampled(cycles) {
- saveblockevent(cycles, skip+1, blockProfile)
+
+ rate := int64(atomic.Load64(&blockprofilerate))
+ if blocksampled(cycles, rate) {
+ saveblockevent(cycles, rate, skip+1, blockProfile)
}
}
-func blocksampled(cycles int64) bool {
- rate := int64(atomic.Load64(&blockprofilerate))
+// blocksampled returns true for all events where cycles >= rate. Shorter
+// events have a cycles/rate random chance of returning true.
+func blocksampled(cycles, rate int64) bool {
if rate <= 0 || (rate > cycles && int64(fastrand())%rate > cycles) {
return false
}
return true
}
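For example, with blockprofilerate set to 1,000,000 cycles, an event that blocked for 250,000 cycles passes this check with probability roughly 250,000/1,000,000 = 0.25, while any event of at least 1,000,000 cycles is always recorded.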
-func saveblockevent(cycles int64, skip int, which bucketType) {
+func saveblockevent(cycles, rate int64, skip int, which bucketType) {
gp := getg()
var nstk int
var stk [maxStack]uintptr
@@ -422,8 +425,15 @@ func saveblockevent(cycles int64, skip int, which bucketType) {
}
lock(&proflock)
b := stkbucket(which, 0, stk[:nstk], true)
- b.bp().count++
- b.bp().cycles += cycles
+
+ if which == blockProfile && cycles < rate {
+ // Remove sampling bias, see discussion on http://golang.org/cl/299991.
+ b.bp().count += float64(rate) / float64(cycles)
+ b.bp().cycles += rate
+ } else {
+ b.bp().count++
+ b.bp().cycles += cycles
+ }
unlock(&proflock)
}
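Continuing that example, the 250,000-cycle event is recorded only once per four occurrences on average, so crediting it with count += rate/cycles = 4 and cycles += rate makes the bucket's expected totals match what unsampled recording would have produced.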
@@ -454,7 +464,7 @@ func mutexevent(cycles int64, skip int) {
// TODO(pjw): measure impact of always calling fastrand vs using something
// like malloc.go:nextSample()
if rate > 0 && int64(fastrand())%rate == 0 {
- saveblockevent(cycles, skip+1, mutexProfile)
+ saveblockevent(cycles, rate, skip+1, mutexProfile)
}
}
@@ -490,7 +500,22 @@ func (r *StackRecord) Stack() []uintptr {
// memory profiling rate should do so just once, as early as
// possible in the execution of the program (for example,
// at the beginning of main).
-var MemProfileRate int = 512 * 1024
+var MemProfileRate int = defaultMemProfileRate(512 * 1024)
+
+// defaultMemProfileRate returns 0 if disableMemoryProfiling is set.
+// It exists primarily for the godoc rendering of MemProfileRate
+// above.
+func defaultMemProfileRate(v int) int {
+ if disableMemoryProfiling {
+ return 0
+ }
+ return v
+}
+
+// disableMemoryProfiling is set by the linker if runtime.MemProfile
+// is not used and the link type guarantees nobody else could use it
+// elsewhere.
+var disableMemoryProfiling bool
// A MemProfileRecord describes the live objects allocated
// by a particular call sequence (stack trace).
@@ -641,7 +666,12 @@ func BlockProfile(p []BlockProfileRecord) (n int, ok bool) {
for b := bbuckets; b != nil; b = b.allnext {
bp := b.bp()
r := &p[0]
- r.Count = bp.count
+ r.Count = int64(bp.count)
+ // Prevent callers from having to worry about division by zero errors.
+ // See discussion on http://golang.org/cl/299991.
+ if r.Count == 0 {
+ r.Count = 1
+ }
r.Cycles = bp.cycles
if raceenabled {
racewriterangepc(unsafe.Pointer(&r.Stack0[0]), unsafe.Sizeof(r.Stack0), getcallerpc(), funcPC(BlockProfile))
@@ -731,12 +761,13 @@ func goroutineProfileWithLabels(p []StackRecord, labels []unsafe.Pointer) (n int
stopTheWorld("profile")
+ // World is stopped, no locking required.
n = 1
- for _, gp1 := range allgs {
+ forEachGRace(func(gp1 *g) {
if isOK(gp1) {
n++
}
- }
+ })
if n <= len(p) {
ok = true
@@ -757,21 +788,23 @@ func goroutineProfileWithLabels(p []StackRecord, labels []unsafe.Pointer) (n int
}
// Save other goroutines.
- for _, gp1 := range allgs {
- if isOK(gp1) {
- if len(r) == 0 {
- // Should be impossible, but better to return a
- // truncated profile than to crash the entire process.
- break
- }
- saveg(^uintptr(0), ^uintptr(0), gp1, &r[0])
- if labels != nil {
- lbl[0] = gp1.labels
- lbl = lbl[1:]
- }
- r = r[1:]
+ forEachGRace(func(gp1 *g) {
+ if !isOK(gp1) {
+ return
}
- }
+
+ if len(r) == 0 {
+ // Should be impossible, but better to return a
+ // truncated profile than to crash the entire process.
+ return
+ }
+ saveg(^uintptr(0), ^uintptr(0), gp1, &r[0])
+ if labels != nil {
+ lbl[0] = gp1.labels
+ lbl = lbl[1:]
+ }
+ r = r[1:]
+ })
}
startTheWorld()
diff --git a/src/runtime/msan.go b/src/runtime/msan.go
index 6a5960b0a8..25aaf94e26 100644
--- a/src/runtime/msan.go
+++ b/src/runtime/msan.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build msan
// +build msan
package runtime
diff --git a/src/runtime/msan0.go b/src/runtime/msan0.go
index 374d13f30b..b1096a6750 100644
--- a/src/runtime/msan0.go
+++ b/src/runtime/msan0.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !msan
// +build !msan
// Dummy MSan support API, used when not built with -msan.
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index 6defaedabe..eeb2a7b4bc 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -8,6 +8,7 @@ package runtime
import (
"runtime/internal/atomic"
+ "runtime/internal/sys"
"unsafe"
)
@@ -62,12 +63,6 @@ type mstats struct {
// Statistics about the garbage collector.
- // next_gc is the goal heap_live for when next GC ends.
- // Set to ^uint64(0) if disabled.
- //
- // Read and written atomically, unless the world is stopped.
- next_gc uint64
-
// Protected by mheap or stopping the world during GC.
last_gc_unix uint64 // last gc (in unix time)
pause_total_ns uint64
@@ -92,69 +87,8 @@ type mstats struct {
_ [1 - _NumSizeClasses%2]uint32
last_gc_nanotime uint64 // last gc (monotonic time)
- tinyallocs uint64 // number of tiny allocations that didn't cause actual allocation; not exported to go directly
- last_next_gc uint64 // next_gc for the previous GC
last_heap_inuse uint64 // heap_inuse at mark termination of the previous GC
- // triggerRatio is the heap growth ratio that triggers marking.
- //
- // E.g., if this is 0.6, then GC should start when the live
- // heap has reached 1.6 times the heap size marked by the
- // previous cycle. This should be ≤ GOGC/100 so the trigger
- // heap size is less than the goal heap size. This is set
- // during mark termination for the next cycle's trigger.
- triggerRatio float64
-
- // gc_trigger is the heap size that triggers marking.
- //
- // When heap_live ≥ gc_trigger, the mark phase will start.
- // This is also the heap size by which proportional sweeping
- // must be complete.
- //
- // This is computed from triggerRatio during mark termination
- // for the next cycle's trigger.
- gc_trigger uint64
-
- // heap_live is the number of bytes considered live by the GC.
- // That is: retained by the most recent GC plus allocated
- // since then. heap_live <= alloc, since alloc includes unmarked
- // objects that have not yet been swept (and hence goes up as we
- // allocate and down as we sweep) while heap_live excludes these
- // objects (and hence only goes up between GCs).
- //
- // This is updated atomically without locking. To reduce
- // contention, this is updated only when obtaining a span from
- // an mcentral and at this point it counts all of the
- // unallocated slots in that span (which will be allocated
- // before that mcache obtains another span from that
- // mcentral). Hence, it slightly overestimates the "true" live
- // heap size. It's better to overestimate than to
- // underestimate because 1) this triggers the GC earlier than
- // necessary rather than potentially too late and 2) this
- // leads to a conservative GC rate rather than a GC rate that
- // is potentially too low.
- //
- // Reads should likewise be atomic (or during STW).
- //
- // Whenever this is updated, call traceHeapAlloc() and
- // gcController.revise().
- heap_live uint64
-
- // heap_scan is the number of bytes of "scannable" heap. This
- // is the live heap (as counted by heap_live), but omitting
- // no-scan objects and no-scan tails of objects.
- //
- // Whenever this is updated, call gcController.revise().
- //
- // Read and written atomically or with the world stopped.
- heap_scan uint64
-
- // heap_marked is the number of bytes marked by the previous
- // GC. After mark termination, heap_live == heap_marked, but
- // unlike heap_live, heap_marked does not change until the
- // next mark termination.
- heap_marked uint64
-
// heapStats is a set of statistics
heapStats consistentHeapStats
@@ -443,10 +377,6 @@ type MemStats struct {
}
func init() {
- if offset := unsafe.Offsetof(memstats.heap_live); offset%8 != 0 {
- println(offset)
- throw("memstats.heap_live not aligned to 8 bytes")
- }
if offset := unsafe.Offsetof(memstats.heapStats); offset%8 != 0 {
println(offset)
throw("memstats.heapStats not aligned to 8 bytes")
@@ -523,7 +453,7 @@ func readmemstats_m(stats *MemStats) {
// at a more granular level in the runtime.
stats.GCSys = memstats.gcMiscSys.load() + memstats.gcWorkBufInUse + memstats.gcProgPtrScalarBitsInUse
stats.OtherSys = memstats.other_sys.load()
- stats.NextGC = memstats.next_gc
+ stats.NextGC = gcController.heapGoal
stats.LastGC = memstats.last_gc_unix
stats.PauseTotalNs = memstats.pause_total_ns
stats.PauseNs = memstats.pause_ns
@@ -656,8 +586,8 @@ func updatememstats() {
}
// Account for tiny allocations.
- memstats.nfree += memstats.tinyallocs
- memstats.nmalloc += memstats.tinyallocs
+ memstats.nfree += uint64(consStats.tinyAllocCount)
+ memstats.nmalloc += uint64(consStats.tinyAllocCount)
// Calculate derived stats.
memstats.total_alloc = totalAlloc
@@ -773,6 +703,7 @@ type heapStatsDelta struct {
inPtrScalarBits int64 // byte delta of memory reserved for unrolled GC prog bits
// Allocator stats.
+ tinyAllocCount uintptr // number of tiny allocations
largeAlloc uintptr // bytes allocated for large objects
largeAllocCount uintptr // number of large object allocations
smallAllocCount [_NumSizeClasses]uintptr // number of allocs for small objects
@@ -782,7 +713,7 @@ type heapStatsDelta struct {
// Add a uint32 to ensure this struct is a multiple of 8 bytes in size.
// Only necessary on 32-bit platforms.
- // _ [(sys.PtrSize / 4) % 2]uint32
+ _ [(sys.PtrSize / 4) % 2]uint32
}
// merge adds in the deltas from b into a.
@@ -794,6 +725,7 @@ func (a *heapStatsDelta) merge(b *heapStatsDelta) {
a.inWorkBufs += b.inWorkBufs
a.inPtrScalarBits += b.inPtrScalarBits
+ a.tinyAllocCount += b.tinyAllocCount
a.largeAlloc += b.largeAlloc
a.largeAllocCount += b.largeAllocCount
for i := range b.smallAllocCount {
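
Aside (illustrative, not part of the patch): the _ [(sys.PtrSize / 4) % 2]uint32 field re-enabled above is a zero- or one-element array that pads the struct to a multiple of 8 bytes only on 32-bit platforms. A minimal standalone sketch of the same trick, with a ptrSize constant standing in for sys.PtrSize; the names here are illustrative, not the runtime's:

package main

import (
	"fmt"
	"unsafe"
)

// ptrSize mirrors runtime/internal/sys.PtrSize: 4 on 32-bit, 8 on 64-bit.
const ptrSize = 4 << (^uintptr(0) >> 63)

// delta mimics heapStatsDelta's layout trick: on 32-bit platforms
// (ptrSize == 4) the padding array has one uint32 element, rounding the
// struct size up to a multiple of 8; on 64-bit it has zero elements.
type delta struct {
	largeAlloc      uintptr
	largeAllocCount uintptr
	count           uint32
	_               [(ptrSize / 4) % 2]uint32 // 8-byte size padding on 32-bit
}

func main() {
	fmt.Println("sizeof(delta) =", unsafe.Sizeof(delta{}), "(a multiple of 8 on both 32- and 64-bit)")
}
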
diff --git a/src/runtime/nbpipe_fcntl_libc_test.go b/src/runtime/nbpipe_fcntl_libc_test.go
index b38c58399b..076a722eb7 100644
--- a/src/runtime/nbpipe_fcntl_libc_test.go
+++ b/src/runtime/nbpipe_fcntl_libc_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix || darwin || solaris
// +build aix darwin solaris
package runtime_test
diff --git a/src/runtime/nbpipe_fcntl_unix_test.go b/src/runtime/nbpipe_fcntl_unix_test.go
index 75acdb62dd..6d5e4ff021 100644
--- a/src/runtime/nbpipe_fcntl_unix_test.go
+++ b/src/runtime/nbpipe_fcntl_unix_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build dragonfly || freebsd || linux || netbsd || openbsd
// +build dragonfly freebsd linux netbsd openbsd
package runtime_test
diff --git a/src/runtime/nbpipe_pipe.go b/src/runtime/nbpipe_pipe.go
index 822b2944db..b17257e9ec 100644
--- a/src/runtime/nbpipe_pipe.go
+++ b/src/runtime/nbpipe_pipe.go
@@ -2,7 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build aix darwin dragonfly
+//go:build aix || darwin
+// +build aix darwin
package runtime
diff --git a/src/runtime/nbpipe_pipe2.go b/src/runtime/nbpipe_pipe2.go
index e3639d99b1..f22b2b591f 100644
--- a/src/runtime/nbpipe_pipe2.go
+++ b/src/runtime/nbpipe_pipe2.go
@@ -2,7 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build freebsd linux netbsd openbsd solaris
+//go:build dragonfly || freebsd || linux || netbsd || openbsd || solaris
+// +build dragonfly freebsd linux netbsd openbsd solaris
package runtime
diff --git a/src/runtime/nbpipe_test.go b/src/runtime/nbpipe_test.go
index d739f57864..1d6a9b525c 100644
--- a/src/runtime/nbpipe_test.go
+++ b/src/runtime/nbpipe_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris
// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
package runtime_test
diff --git a/src/runtime/netpoll.go b/src/runtime/netpoll.go
index 77eb3aa4c6..6c26fdbbeb 100644
--- a/src/runtime/netpoll.go
+++ b/src/runtime/netpoll.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix || darwin || dragonfly || freebsd || (js && wasm) || linux || netbsd || openbsd || solaris || windows
// +build aix darwin dragonfly freebsd js,wasm linux netbsd openbsd solaris windows
package runtime
@@ -22,6 +23,9 @@ import (
// Arm edge-triggered notifications for fd. The pd argument is to pass
// back to netpollready when fd is ready. Return an errno value.
//
+// func netpollclose(fd uintptr) int32
+// Disable notifications for fd. Return an errno value.
+//
// func netpoll(delta int64) gList
// Poll the network. If delta < 0, block indefinitely. If delta == 0,
// poll without blocking. If delta > 0, block for up to delta nanoseconds.
@@ -161,9 +165,12 @@ func poll_runtime_pollOpen(fd uintptr) (*pollDesc, int) {
pd.self = pd
unlock(&pd.lock)
- var errno int32
- errno = netpollopen(fd, pd)
- return pd, int(errno)
+ errno := netpollopen(fd, pd)
+ if errno != 0 {
+ pollcache.free(pd)
+ return nil, int(errno)
+ }
+ return pd, 0
}
//go:linkname poll_runtime_pollClose internal/poll.runtime_pollClose
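
Aside (illustrative, not part of the patch): the per-platform contract documented in the comment above is small. A toy stub in the spirit of netpoll_fake.go, using stand-in types since the runtime's pollDesc and gList are not importable from user code:

package main

// Sketch of the netpoll platform interface shape described above.
type pollDesc struct{ fd uintptr }
type gList struct{} // stands in for the runtime's list of ready goroutines

func netpollinit()                               {}                  // initialize the poller once
func netpollopen(fd uintptr, pd *pollDesc) int32 { return 0 }        // arm notifications; 0 means no error
func netpollclose(fd uintptr) int32              { return 0 }        // disable notifications; 0 means no error
func netpoll(delta int64) gList                  { return gList{} } // poll; this stub never blocks

func main() {
	netpollinit()
	pd := &pollDesc{fd: 3}
	if errno := netpollopen(pd.fd, pd); errno != 0 {
		// Mirrors poll_runtime_pollOpen above: on failure, release pd
		// and report the errno to the caller.
		return
	}
	_ = netpoll(0)
	_ = netpollclose(pd.fd)
}
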
diff --git a/src/runtime/netpoll_epoll.go b/src/runtime/netpoll_epoll.go
index 58f4fa8754..371ac59f8e 100644
--- a/src/runtime/netpoll_epoll.go
+++ b/src/runtime/netpoll_epoll.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build linux
// +build linux
package runtime
diff --git a/src/runtime/netpoll_fake.go b/src/runtime/netpoll_fake.go
index b2af3b89b2..8366f28914 100644
--- a/src/runtime/netpoll_fake.go
+++ b/src/runtime/netpoll_fake.go
@@ -5,6 +5,7 @@
// Fake network poller for wasm/js.
// Should never be used, because wasm/js network connections do not honor "SetNonblock".
+//go:build js && wasm
// +build js,wasm
package runtime
diff --git a/src/runtime/netpoll_kqueue.go b/src/runtime/netpoll_kqueue.go
index 3bd93c1f20..80d1b0cf18 100644
--- a/src/runtime/netpoll_kqueue.go
+++ b/src/runtime/netpoll_kqueue.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build darwin || dragonfly || freebsd || netbsd || openbsd
// +build darwin dragonfly freebsd netbsd openbsd
package runtime
diff --git a/src/runtime/netpoll_stub.go b/src/runtime/netpoll_stub.go
index 3599f2d01b..33ab8eba58 100644
--- a/src/runtime/netpoll_stub.go
+++ b/src/runtime/netpoll_stub.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build plan9
// +build plan9
package runtime
diff --git a/src/runtime/norace_linux_test.go b/src/runtime/norace_linux_test.go
index 3517a5ddbe..94b7c7a467 100644
--- a/src/runtime/norace_linux_test.go
+++ b/src/runtime/norace_linux_test.go
@@ -3,6 +3,7 @@
// license that can be found in the LICENSE file.
// The file contains tests that cannot run under race detector for some reason.
+//go:build !race
// +build !race
package runtime_test
diff --git a/src/runtime/norace_test.go b/src/runtime/norace_test.go
index e90128bb6d..9ad5dde382 100644
--- a/src/runtime/norace_test.go
+++ b/src/runtime/norace_test.go
@@ -3,6 +3,7 @@
// license that can be found in the LICENSE file.
// The file contains tests that cannot run under race detector for some reason.
+//go:build !race
// +build !race
package runtime_test
diff --git a/src/runtime/os3_plan9.go b/src/runtime/os3_plan9.go
index b6ee98cab6..c5dc23de8b 100644
--- a/src/runtime/os3_plan9.go
+++ b/src/runtime/os3_plan9.go
@@ -100,7 +100,7 @@ func sighandler(_ureg *ureg, note *byte, gp *g) int {
if usesLR {
c.setpc(funcPC(sigpanictramp))
} else {
- c.setpc(funcPC(sigpanic))
+ c.setpc(funcPC(sigpanic0))
}
return _NCONT
}
diff --git a/src/runtime/os3_solaris.go b/src/runtime/os3_solaris.go
index 4b65139eb8..39ef831acf 100644
--- a/src/runtime/os3_solaris.go
+++ b/src/runtime/os3_solaris.go
@@ -531,7 +531,7 @@ func usleep(µs uint32) {
usleep1(µs)
}
-func walltime1() (sec int64, nsec int32) {
+func walltime() (sec int64, nsec int32) {
var ts mts
sysvicall2(&libc_clock_gettime, _CLOCK_REALTIME, uintptr(unsafe.Pointer(&ts)))
return ts.tv_sec, int32(ts.tv_nsec)
diff --git a/src/runtime/os_aix.go b/src/runtime/os_aix.go
index 303f0876de..4fb1c8e845 100644
--- a/src/runtime/os_aix.go
+++ b/src/runtime/os_aix.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix
// +build aix
package runtime
@@ -335,7 +336,7 @@ func nanotime1() int64 {
return tp.tv_sec*1000000000 + tp.tv_nsec
}
-func walltime1() (sec int64, nsec int32) {
+func walltime() (sec int64, nsec int32) {
ts := &timespec{}
if clock_gettime(_CLOCK_REALTIME, ts) != 0 {
throw("syscall clock_gettime failed")
diff --git a/src/runtime/os_darwin.go b/src/runtime/os_darwin.go
index 470698d0a3..00139351ab 100644
--- a/src/runtime/os_darwin.go
+++ b/src/runtime/os_darwin.go
@@ -4,7 +4,10 @@
package runtime
-import "unsafe"
+import (
+ "internal/abi"
+ "unsafe"
+)
type mOS struct {
initialized bool
@@ -221,7 +224,7 @@ func newosproc(mp *m) {
// setup and then calls mstart.
var oset sigset
sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
- err = pthread_create(&attr, funcPC(mstart_stub), unsafe.Pointer(mp))
+ err = pthread_create(&attr, abi.FuncPCABI0(mstart_stub), unsafe.Pointer(mp))
sigprocmask(_SIG_SETMASK, &oset, nil)
if err != 0 {
write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
@@ -361,11 +364,11 @@ func setsig(i uint32, fn uintptr) {
var sa usigactiont
sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK | _SA_RESTART
sa.sa_mask = ^uint32(0)
- if fn == funcPC(sighandler) {
+ if fn == funcPC(sighandler) { // funcPC(sighandler) matches the callers in signal_unix.go
if iscgo {
- fn = funcPC(cgoSigtramp)
+ fn = abi.FuncPCABI0(cgoSigtramp)
} else {
- fn = funcPC(sigtramp)
+ fn = abi.FuncPCABI0(sigtramp)
}
}
*(*uintptr)(unsafe.Pointer(&sa.__sigaction_u)) = fn
diff --git a/src/runtime/os_dragonfly.go b/src/runtime/os_dragonfly.go
index b786c8ab5f..5c688a3109 100644
--- a/src/runtime/os_dragonfly.go
+++ b/src/runtime/os_dragonfly.go
@@ -60,12 +60,11 @@ func kqueue() int32
//go:noescape
func kevent(kq int32, ch *keventt, nch int32, ev *keventt, nev int32, ts *timespec) int32
-func closeonexec(fd int32)
-func setNonblock(fd int32)
func pipe() (r, w int32, errno int32)
-
-const stackSystem = 0
+func pipe2(flags int32) (r, w int32, errno int32)
+func closeonexec(fd int32)
+func setNonblock(fd int32)
// From DragonFly's <sys/sysctl.h>
const (
diff --git a/src/runtime/os_freebsd2.go b/src/runtime/os_freebsd2.go
index 6947a05d04..fde6fbf1b1 100644
--- a/src/runtime/os_freebsd2.go
+++ b/src/runtime/os_freebsd2.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build freebsd && !amd64
// +build freebsd,!amd64
package runtime
diff --git a/src/runtime/os_freebsd_noauxv.go b/src/runtime/os_freebsd_noauxv.go
index 01efb9b7c9..8fe0cb6718 100644
--- a/src/runtime/os_freebsd_noauxv.go
+++ b/src/runtime/os_freebsd_noauxv.go
@@ -2,8 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build freebsd
-// +build !arm
+//go:build freebsd && !arm
+// +build freebsd,!arm
package runtime
diff --git a/src/runtime/os_js.go b/src/runtime/os_js.go
index 5b2c53795a..52b64e7602 100644
--- a/src/runtime/os_js.go
+++ b/src/runtime/os_js.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build js && wasm
// +build js,wasm
package runtime
diff --git a/src/runtime/os_linux.go b/src/runtime/os_linux.go
index 21d3ae653e..c8b29e396c 100644
--- a/src/runtime/os_linux.go
+++ b/src/runtime/os_linux.go
@@ -385,7 +385,7 @@ func mdestroy(mp *m) {
//#endif
func sigreturn()
-func sigtramp(sig uint32, info *siginfo, ctx unsafe.Pointer)
+func sigtramp() // Called via C ABI
func cgoSigtramp()
//go:noescape
diff --git a/src/runtime/os_linux_arm64.go b/src/runtime/os_linux_arm64.go
index c5fd742048..5260f22f57 100644
--- a/src/runtime/os_linux_arm64.go
+++ b/src/runtime/os_linux_arm64.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build arm64
// +build arm64
package runtime
diff --git a/src/runtime/os_linux_be64.go b/src/runtime/os_linux_be64.go
index 9860002ee4..498d7cec6d 100644
--- a/src/runtime/os_linux_be64.go
+++ b/src/runtime/os_linux_be64.go
@@ -2,8 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// The standard GNU/Linux sigset type on big-endian 64-bit machines.
+// The standard Linux sigset type on big-endian 64-bit machines.
+//go:build linux && (ppc64 || s390x)
// +build linux
// +build ppc64 s390x
diff --git a/src/runtime/os_linux_generic.go b/src/runtime/os_linux_generic.go
index e1d0952ddf..fe1973dbde 100644
--- a/src/runtime/os_linux_generic.go
+++ b/src/runtime/os_linux_generic.go
@@ -2,13 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !mips
-// +build !mipsle
-// +build !mips64
-// +build !mips64le
-// +build !s390x
-// +build !ppc64
-// +build linux
+//go:build !mips && !mipsle && !mips64 && !mips64le && !s390x && !ppc64 && linux
+// +build !mips,!mipsle,!mips64,!mips64le,!s390x,!ppc64,linux
package runtime
diff --git a/src/runtime/os_linux_mips64x.go b/src/runtime/os_linux_mips64x.go
index 815a83a04b..bd76442dbd 100644
--- a/src/runtime/os_linux_mips64x.go
+++ b/src/runtime/os_linux_mips64x.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build linux && (mips64 || mips64le)
// +build linux
// +build mips64 mips64le
diff --git a/src/runtime/os_linux_mipsx.go b/src/runtime/os_linux_mipsx.go
index 00fb02e4bf..ef8b3f7d43 100644
--- a/src/runtime/os_linux_mipsx.go
+++ b/src/runtime/os_linux_mipsx.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build linux && (mips || mipsle)
// +build linux
// +build mips mipsle
diff --git a/src/runtime/os_linux_noauxv.go b/src/runtime/os_linux_noauxv.go
index 895b4cd5f4..59b5aacaeb 100644
--- a/src/runtime/os_linux_noauxv.go
+++ b/src/runtime/os_linux_noauxv.go
@@ -2,8 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build linux
-// +build !arm,!arm64,!mips,!mipsle,!mips64,!mips64le,!s390x,!ppc64,!ppc64le
+//go:build linux && !arm && !arm64 && !mips && !mipsle && !mips64 && !mips64le && !s390x && !ppc64 && !ppc64le
+// +build linux,!arm,!arm64,!mips,!mipsle,!mips64,!mips64le,!s390x,!ppc64,!ppc64le
package runtime
diff --git a/src/runtime/os_linux_novdso.go b/src/runtime/os_linux_novdso.go
index 155f415e71..8104f63627 100644
--- a/src/runtime/os_linux_novdso.go
+++ b/src/runtime/os_linux_novdso.go
@@ -2,8 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build linux
-// +build !386,!amd64,!arm,!arm64,!mips64,!mips64le,!ppc64,!ppc64le
+//go:build linux && !386 && !amd64 && !arm && !arm64 && !mips64 && !mips64le && !ppc64 && !ppc64le
+// +build linux,!386,!amd64,!arm,!arm64,!mips64,!mips64le,!ppc64,!ppc64le
package runtime
diff --git a/src/runtime/os_linux_ppc64x.go b/src/runtime/os_linux_ppc64x.go
index 3aedc23ef9..c093d2ec0f 100644
--- a/src/runtime/os_linux_ppc64x.go
+++ b/src/runtime/os_linux_ppc64x.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build linux && (ppc64 || ppc64le)
// +build linux
// +build ppc64 ppc64le
diff --git a/src/runtime/os_linux_x86.go b/src/runtime/os_linux_x86.go
index d91fa1a0d1..5667774d82 100644
--- a/src/runtime/os_linux_x86.go
+++ b/src/runtime/os_linux_x86.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build linux && (386 || amd64)
// +build linux
// +build 386 amd64
diff --git a/src/runtime/os_netbsd.go b/src/runtime/os_netbsd.go
index 0328fa57ae..6fbb3aa694 100644
--- a/src/runtime/os_netbsd.go
+++ b/src/runtime/os_netbsd.go
@@ -228,7 +228,11 @@ func newosproc(mp *m) {
}
}
-// netbsdMStart is the function call that starts executing a newly
+// mstart is the entry-point for new Ms.
+// It is written in assembly, uses ABI0, is marked TOPFRAME, and calls netbsdMstart0.
+func netbsdMstart()
+
+// netbsdMstart0 is the function call that starts executing a newly
// created thread. On NetBSD, a new thread inherits the signal stack
// of the creating thread. That confuses minit, so we remove that
// signal stack here before calling the regular mstart. It's a bit
@@ -236,10 +240,10 @@ func newosproc(mp *m) {
// it's a simple change that keeps NetBSD working like other OS's.
// At this point all signals are blocked, so there is no race.
//go:nosplit
-func netbsdMstart() {
+func netbsdMstart0() {
st := stackt{ss_flags: _SS_DISABLE}
sigaltstack(&st, nil)
- mstart()
+ mstart0()
}
func osinit() {
diff --git a/src/runtime/os_nonopenbsd.go b/src/runtime/os_nonopenbsd.go
index e65697bdb3..6134b6c02f 100644
--- a/src/runtime/os_nonopenbsd.go
+++ b/src/runtime/os_nonopenbsd.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !openbsd
// +build !openbsd
package runtime
diff --git a/src/runtime/os_only_solaris.go b/src/runtime/os_only_solaris.go
index e2f5409354..3829683c80 100644
--- a/src/runtime/os_only_solaris.go
+++ b/src/runtime/os_only_solaris.go
@@ -4,6 +4,7 @@
// Solaris code that doesn't also apply to illumos.
+//go:build !illumos
// +build !illumos
package runtime
diff --git a/src/runtime/os_openbsd_libc.go b/src/runtime/os_openbsd_libc.go
index 2edb0358b0..cff5a092d7 100644
--- a/src/runtime/os_openbsd_libc.go
+++ b/src/runtime/os_openbsd_libc.go
@@ -2,7 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build openbsd,amd64 openbsd,arm64
+//go:build (openbsd && 386) || (openbsd && amd64) || (openbsd && arm64)
+// +build openbsd,386 openbsd,amd64 openbsd,arm64
package runtime
diff --git a/src/runtime/os_openbsd_syscall.go b/src/runtime/os_openbsd_syscall.go
index 16ff2b8e25..5315487961 100644
--- a/src/runtime/os_openbsd_syscall.go
+++ b/src/runtime/os_openbsd_syscall.go
@@ -2,8 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build openbsd,!amd64
-// +build openbsd,!arm64
+//go:build openbsd && !386 && openbsd && !amd64 && openbsd && !arm64
+// +build openbsd,!386,openbsd,!amd64,openbsd,!arm64
package runtime
diff --git a/src/runtime/os_openbsd_syscall1.go b/src/runtime/os_openbsd_syscall1.go
index f37da04194..ecae67aa2f 100644
--- a/src/runtime/os_openbsd_syscall1.go
+++ b/src/runtime/os_openbsd_syscall1.go
@@ -2,7 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build openbsd,!amd64,!arm64
+//go:build openbsd && !386 && !amd64 && !arm64
+// +build openbsd,!386,!amd64,!arm64
package runtime
diff --git a/src/runtime/os_openbsd_syscall2.go b/src/runtime/os_openbsd_syscall2.go
index 81cfb085aa..6aa57a99da 100644
--- a/src/runtime/os_openbsd_syscall2.go
+++ b/src/runtime/os_openbsd_syscall2.go
@@ -2,7 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build openbsd,!amd64,!arm64
+//go:build openbsd && !386 && !amd64 && !arm64
+// +build openbsd,!386,!amd64,!arm64
package runtime
@@ -97,4 +98,4 @@ func sigaltstack(new, old *stackt)
func closeonexec(fd int32)
func setNonblock(fd int32)
-func walltime1() (sec int64, nsec int32)
+func walltime() (sec int64, nsec int32)
diff --git a/src/runtime/os_plan9.go b/src/runtime/os_plan9.go
index 77665f461a..4d428346f0 100644
--- a/src/runtime/os_plan9.go
+++ b/src/runtime/os_plan9.go
@@ -325,7 +325,23 @@ func crash() {
//go:nosplit
func getRandomData(r []byte) {
- extendRandom(r, 0)
+ // Inspired by wyrand; see hash32.go for details.
+ t := nanotime()
+ v := getg().m.procid ^ uint64(t)
+
+ for len(r) > 0 {
+ v ^= 0xa0761d6478bd642f
+ v *= 0xe7037ed1a0b428db
+ size := 8
+ if len(r) < 8 {
+ size = len(r)
+ }
+ for i := 0; i < size; i++ {
+ r[i] = byte(v >> (8 * i))
+ }
+ r = r[size:]
+ v = v>>32 | v<<32
+ }
}
func initsig(preinit bool) {
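
Aside (illustrative, not part of the patch): the multiply-xor mixer above is easy to exercise outside the runtime. A standalone sketch of the same loop, seeded from wall-clock time instead of m.procid, which is not reachable from user code:

package main

import (
	"fmt"
	"time"
)

// fillRandom mirrors the plan9 getRandomData loop above: wyrand-style
// multiply-xor mixing, emitting up to 8 bytes of state per round and
// rotating the state by 32 bits between rounds.
func fillRandom(r []byte, seed uint64) {
	v := seed
	for len(r) > 0 {
		v ^= 0xa0761d6478bd642f
		v *= 0xe7037ed1a0b428db
		size := 8
		if len(r) < 8 {
			size = len(r)
		}
		for i := 0; i < size; i++ {
			r[i] = byte(v >> (8 * i))
		}
		r = r[size:]
		v = v>>32 | v<<32
	}
}

func main() {
	buf := make([]byte, 12)
	fillRandom(buf, uint64(time.Now().UnixNano()))
	fmt.Printf("%x\n", buf)
}
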
diff --git a/src/runtime/os_windows.go b/src/runtime/os_windows.go
index f4e21a93ed..f0935264ac 100644
--- a/src/runtime/os_windows.go
+++ b/src/runtime/os_windows.go
@@ -18,6 +18,7 @@ const (
//go:cgo_import_dynamic runtime._AddVectoredExceptionHandler AddVectoredExceptionHandler%2 "kernel32.dll"
//go:cgo_import_dynamic runtime._CloseHandle CloseHandle%1 "kernel32.dll"
//go:cgo_import_dynamic runtime._CreateEventA CreateEventA%4 "kernel32.dll"
+//go:cgo_import_dynamic runtime._CreateFileA CreateFileA%7 "kernel32.dll"
//go:cgo_import_dynamic runtime._CreateIoCompletionPort CreateIoCompletionPort%4 "kernel32.dll"
//go:cgo_import_dynamic runtime._CreateThread CreateThread%6 "kernel32.dll"
//go:cgo_import_dynamic runtime._CreateWaitableTimerA CreateWaitableTimerA%3 "kernel32.dll"
@@ -67,6 +68,7 @@ var (
_AddVectoredExceptionHandler,
_CloseHandle,
_CreateEventA,
+ _CreateFileA,
_CreateIoCompletionPort,
_CreateThread,
_CreateWaitableTimerA,
@@ -132,7 +134,9 @@ var (
// Load ntdll.dll manually during startup, otherwise Mingw
// links wrong printf function to cgo executable (see issue
// 12030 for details).
- _NtWaitForSingleObject stdFunction
+ _NtWaitForSingleObject stdFunction
+ _RtlGetCurrentPeb stdFunction
+ _RtlGetNtVersionNumbers stdFunction
// These are from non-kernel32.dll, so we prefer to LoadLibraryEx them.
_timeBeginPeriod,
@@ -145,9 +149,6 @@ var (
// to start new os thread.
func tstart_stdcall(newm *m)
-// Called by OS using stdcall ABI.
-func ctrlhandler()
-
// Init-time helper
func wintls()
@@ -219,21 +220,22 @@ func windowsFindfunc(lib uintptr, name []byte) stdFunction {
return stdFunction(unsafe.Pointer(f))
}
-var sysDirectory [521]byte
+const _MAX_PATH = 260 // https://docs.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation
+var sysDirectory [_MAX_PATH + 1]byte
var sysDirectoryLen uintptr
func windowsLoadSystemLib(name []byte) uintptr {
+ if sysDirectoryLen == 0 {
+ l := stdcall2(_GetSystemDirectoryA, uintptr(unsafe.Pointer(&sysDirectory[0])), uintptr(len(sysDirectory)-1))
+ if l == 0 || l > uintptr(len(sysDirectory)-1) {
+ throw("Unable to determine system directory")
+ }
+ sysDirectory[l] = '\\'
+ sysDirectoryLen = l + 1
+ }
if useLoadLibraryEx {
return stdcall3(_LoadLibraryExA, uintptr(unsafe.Pointer(&name[0])), 0, _LOAD_LIBRARY_SEARCH_SYSTEM32)
} else {
- if sysDirectoryLen == 0 {
- l := stdcall2(_GetSystemDirectoryA, uintptr(unsafe.Pointer(&sysDirectory[0])), uintptr(len(sysDirectory)-1))
- if l == 0 || l > uintptr(len(sysDirectory)-1) {
- throw("Unable to determine system directory")
- }
- sysDirectory[l] = '\\'
- sysDirectoryLen = l + 1
- }
absName := append(sysDirectory[:sysDirectoryLen], name...)
return stdcall1(_LoadLibraryA, uintptr(unsafe.Pointer(&absName[0])))
}
@@ -266,6 +268,8 @@ func loadOptionalSyscalls() {
throw("ntdll.dll not found")
}
_NtWaitForSingleObject = windowsFindfunc(n32, []byte("NtWaitForSingleObject\000"))
+ _RtlGetCurrentPeb = windowsFindfunc(n32, []byte("RtlGetCurrentPeb\000"))
+ _RtlGetNtVersionNumbers = windowsFindfunc(n32, []byte("RtlGetNtVersionNumbers\000"))
if !haveCputicksAsm {
_QueryPerformanceCounter = windowsFindfunc(k32, []byte("QueryPerformanceCounter\000"))
@@ -382,7 +386,6 @@ const (
)
// in sys_windows_386.s and sys_windows_amd64.s:
-func externalthreadhandler()
func getlasterror() uint32
// When loading DLLs, we prefer to use LoadLibraryEx with
@@ -471,6 +474,74 @@ func initHighResTimer() {
}
}
+//go:linkname canUseLongPaths os.canUseLongPaths
+var canUseLongPaths bool
+
+// We want this to be large enough to hold the contents of sysDirectory, *plus*
+// a slash and another component that itself is greater than MAX_PATH.
+var longFileName [(_MAX_PATH+1)*2 + 1]byte
+
+// initLongPathSupport initializes the canUseLongPaths variable, which is
+// linked into os.canUseLongPaths for determining whether or not long paths
+// need to be fixed up. In the best case, this function is running on newer
+// Windows 10 builds, which have a bit field member of the PEB called
+// "IsLongPathAwareProcess." When this is set, we don't need to go through the
+// error-prone fixup function in order to access long paths. So this init
+// function first checks the Windows build number, sets the flag, and then
+// tests to see if it's actually working. If everything checks out, then
+// canUseLongPaths is set to true, and later when called, os.fixLongPath
+// returns early without doing work.
+func initLongPathSupport() {
+ const (
+ IsLongPathAwareProcess = 0x80
+ PebBitFieldOffset = 3
+ OPEN_EXISTING = 3
+ ERROR_PATH_NOT_FOUND = 3
+ )
+
+ // Check that we're ≥ 10.0.15063.
+ var maj, min, build uint32
+ stdcall3(_RtlGetNtVersionNumbers, uintptr(unsafe.Pointer(&maj)), uintptr(unsafe.Pointer(&min)), uintptr(unsafe.Pointer(&build)))
+ if maj < 10 || (maj == 10 && min == 0 && build&0xffff < 15063) {
+ return
+ }
+
+ // Set the IsLongPathAwareProcess flag of the PEB's bit field.
+ bitField := (*byte)(unsafe.Pointer(stdcall0(_RtlGetCurrentPeb) + PebBitFieldOffset))
+ originalBitField := *bitField
+ *bitField |= IsLongPathAwareProcess
+
+ // Check that this actually has an effect, by constructing a large file
+ // path and seeing whether we get ERROR_PATH_NOT_FOUND, rather than
+ // some other error, which would indicate the path is too long, and
+ // hence long path support is not successful. This whole section is NOT
+ // strictly necessary, but is a nice validity check for the near to
+ // medium term, when this functionality is still relatively new in
+ // Windows.
+ getRandomData(longFileName[len(longFileName)-33 : len(longFileName)-1])
+ start := copy(longFileName[:], sysDirectory[:sysDirectoryLen])
+ const dig = "0123456789abcdef"
+ for i := 0; i < 32; i++ {
+ longFileName[start+i*2] = dig[longFileName[len(longFileName)-33+i]>>4]
+ longFileName[start+i*2+1] = dig[longFileName[len(longFileName)-33+i]&0xf]
+ }
+ start += 64
+ for i := start; i < len(longFileName)-1; i++ {
+ longFileName[i] = 'A'
+ }
+ stdcall7(_CreateFileA, uintptr(unsafe.Pointer(&longFileName[0])), 0, 0, 0, OPEN_EXISTING, 0, 0)
+ // The ERROR_PATH_NOT_FOUND error value is distinct from
+ // ERROR_FILE_NOT_FOUND or ERROR_INVALID_NAME, the latter of which we
+ // expect here due to the final component being too long.
+ if getlasterror() == ERROR_PATH_NOT_FOUND {
+ *bitField = originalBitField
+ println("runtime: warning: IsLongPathAwareProcess failed to enable long paths; proceeding in fixup mode")
+ return
+ }
+
+ canUseLongPaths = true
+}
+
func osinit() {
asmstdcallAddr = unsafe.Pointer(funcPC(asmstdcall))
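
Aside (illustrative, not part of the patch): from user code, the visible effect of a long-path-aware process is that deep paths work without the \\?\ prefix. A sketch of probing that, assuming a Windows build new enough to pass the version check above; path lengths and segment sizes here are arbitrary:

package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strings"
)

func main() {
	// Build a path well past the classic 260-char MAX_PATH limit out of
	// several 50-char components (single components are capped at 255).
	seg := strings.Repeat("d", 50)
	root := filepath.Join(os.TempDir(), seg)
	long := root
	for i := 0; i < 6; i++ {
		long = filepath.Join(long, seg)
	}
	if err := os.MkdirAll(long, 0o755); err != nil {
		fmt.Println("long path rejected:", err)
		return
	}
	defer os.RemoveAll(root) // clean up the whole tree
	fmt.Println("created a", len(long), "char path without a \\\\?\\ prefix")
}
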
@@ -482,11 +553,11 @@ func osinit() {
initExceptionHandler()
- stdcall2(_SetConsoleCtrlHandler, funcPC(ctrlhandler), 1)
-
initHighResTimer()
timeBeginPeriodRetValue = osRelax(false)
+ initLongPathSupport()
+
ncpu = getproccount()
physPageSize = getPageSize()
@@ -608,8 +679,12 @@ func goenvs() {
stdcall1(_FreeEnvironmentStringsW, uintptr(strings))
- // We call this all the way here, late in init, so that malloc works
- // for the callback function this generates.
+ // We call these all the way here, late in init, so that malloc works
+ // for the callback functions these generate.
+ var fn interface{} = ctrlHandler
+ ctrlHandlerPC := compileCallback(*efaceOf(&fn), true)
+ stdcall2(_SetConsoleCtrlHandler, ctrlHandlerPC, 1)
+
monitorSuspendResume()
}
@@ -729,9 +804,6 @@ func writeConsoleUTF16(handle uintptr, b []uint16) {
return
}
-// walltime1 isn't implemented on Windows, but will never be called.
-func walltime1() (sec int64, nsec int32)
-
//go:nosplit
func semasleep(ns int64) int32 {
const (
@@ -1009,6 +1081,7 @@ func stdcall0(fn stdFunction) uintptr {
}
//go:nosplit
+//go:cgo_unsafe_args
func stdcall1(fn stdFunction, a0 uintptr) uintptr {
mp := getg().m
mp.libcall.n = 1
@@ -1017,6 +1090,7 @@ func stdcall1(fn stdFunction, a0 uintptr) uintptr {
}
//go:nosplit
+//go:cgo_unsafe_args
func stdcall2(fn stdFunction, a0, a1 uintptr) uintptr {
mp := getg().m
mp.libcall.n = 2
@@ -1025,6 +1099,7 @@ func stdcall2(fn stdFunction, a0, a1 uintptr) uintptr {
}
//go:nosplit
+//go:cgo_unsafe_args
func stdcall3(fn stdFunction, a0, a1, a2 uintptr) uintptr {
mp := getg().m
mp.libcall.n = 3
@@ -1033,6 +1108,7 @@ func stdcall3(fn stdFunction, a0, a1, a2 uintptr) uintptr {
}
//go:nosplit
+//go:cgo_unsafe_args
func stdcall4(fn stdFunction, a0, a1, a2, a3 uintptr) uintptr {
mp := getg().m
mp.libcall.n = 4
@@ -1041,6 +1117,7 @@ func stdcall4(fn stdFunction, a0, a1, a2, a3 uintptr) uintptr {
}
//go:nosplit
+//go:cgo_unsafe_args
func stdcall5(fn stdFunction, a0, a1, a2, a3, a4 uintptr) uintptr {
mp := getg().m
mp.libcall.n = 5
@@ -1049,6 +1126,7 @@ func stdcall5(fn stdFunction, a0, a1, a2, a3, a4 uintptr) uintptr {
}
//go:nosplit
+//go:cgo_unsafe_args
func stdcall6(fn stdFunction, a0, a1, a2, a3, a4, a5 uintptr) uintptr {
mp := getg().m
mp.libcall.n = 6
@@ -1057,6 +1135,7 @@ func stdcall6(fn stdFunction, a0, a1, a2, a3, a4, a5 uintptr) uintptr {
}
//go:nosplit
+//go:cgo_unsafe_args
func stdcall7(fn stdFunction, a0, a1, a2, a3, a4, a5, a6 uintptr) uintptr {
mp := getg().m
mp.libcall.n = 7
@@ -1099,7 +1178,7 @@ func usleep(us uint32) {
})
}
-func ctrlhandler1(_type uint32) uint32 {
+func ctrlHandler(_type uint32) uintptr {
var s uint32
switch _type {
@@ -1122,9 +1201,6 @@ func ctrlhandler1(_type uint32) uint32 {
return 0
}
-// in sys_windows_386.s and sys_windows_amd64.s
-func profileloop()
-
// called from zcallback_windows_*.s to sys_windows_*.s
func callbackasm1()
@@ -1157,13 +1233,18 @@ func gFromSP(mp *m, sp uintptr) *g {
return nil
}
-func profileloop1(param uintptr) uint32 {
+func profileLoop() {
stdcall2(_SetThreadPriority, currentThread, _THREAD_PRIORITY_HIGHEST)
for {
stdcall2(_WaitForSingleObject, profiletimer, _INFINITE)
first := (*m)(atomic.Loadp(unsafe.Pointer(&allm)))
for mp := first; mp != nil; mp = mp.alllink {
+ if mp == getg().m {
+ // Don't profile ourselves.
+ continue
+ }
+
lock(&mp.threadLock)
// Do not profile threads blocked on Notes,
// this includes idle worker threads,
@@ -1175,8 +1256,8 @@ func profileloop1(param uintptr) uint32 {
// Acquire our own handle to the thread.
var thread uintptr
if stdcall7(_DuplicateHandle, currentProcess, mp.thread, currentProcess, uintptr(unsafe.Pointer(&thread)), 0, 0, _DUPLICATE_SAME_ACCESS) == 0 {
- print("runtime.profileloop1: duplicatehandle failed; errno=", getlasterror(), "\n")
- throw("runtime.profileloop1: duplicatehandle failed")
+ print("runtime: duplicatehandle failed; errno=", getlasterror(), "\n")
+ throw("duplicatehandle failed")
}
unlock(&mp.threadLock)
@@ -1204,9 +1285,7 @@ func setProcessCPUProfiler(hz int32) {
if profiletimer == 0 {
timer := stdcall3(_CreateWaitableTimerA, 0, 0, 0)
atomic.Storeuintptr(&profiletimer, timer)
- thread := stdcall6(_CreateThread, 0, 0, funcPC(profileloop), 0, 0, 0)
- stdcall2(_SetThreadPriority, thread, _THREAD_PRIORITY_HIGHEST)
- stdcall1(_CloseHandle, thread)
+ newm(profileLoop, nil, -1)
}
}
@@ -1303,7 +1382,7 @@ func preemptM(mp *m) {
// Does it want a preemption and is it safe to preempt?
gp := gFromSP(mp, c.sp())
- if wantAsyncPreempt(gp) {
+ if gp != nil && wantAsyncPreempt(gp) {
if ok, newpc := isAsyncSafePoint(gp, c.ip(), c.sp(), c.lr()); ok {
// Inject call to asyncPreempt
targetPC := funcPC(asyncPreempt)
diff --git a/src/runtime/panic.go b/src/runtime/panic.go
index e320eaa596..f6c38aafcc 100644
--- a/src/runtime/panic.go
+++ b/src/runtime/panic.go
@@ -6,6 +6,7 @@ package runtime
import (
"internal/abi"
+ "internal/goexperiment"
"runtime/internal/atomic"
"runtime/internal/sys"
"unsafe"
@@ -159,6 +160,12 @@ func goPanicSlice3CU(x uint, y int) {
panic(boundsError{x: int64(x), signed: false, y: y, code: boundsSlice3C})
}
+// failures in the conversion (*[x]T)(s), 0 <= x <= y, x == cap(s)
+func goPanicSliceConvert(x int, y int) {
+ panicCheck1(getcallerpc(), "slice length too short to convert to pointer to array")
+ panic(boundsError{x: int64(x), signed: true, y: y, code: boundsConvert})
+}
+
// Implemented in assembly, as they take arguments in registers.
// Declared here to mark them as ABIInternal.
func panicIndex(x int, y int)
@@ -177,6 +184,7 @@ func panicSlice3B(x int, y int)
func panicSlice3BU(x uint, y int)
func panicSlice3C(x int, y int)
func panicSlice3CU(x uint, y int)
+func panicSliceConvert(x int, y int)
var shiftError = error(errorString("negative shift amount"))
@@ -228,6 +236,11 @@ func deferproc(siz int32, fn *funcval) { // arguments of fn follow fn
throw("defer on system stack")
}
+ if goexperiment.RegabiDefer && siz != 0 {
+ // TODO: Make deferproc just take a func().
+ throw("defer with non-empty frame")
+ }
+
// the arguments of fn are in a perilous state. The stack map
// for deferproc does not describe them. So we can't let garbage
// collection or stack copying trigger until we've copied them out
@@ -280,6 +293,9 @@ func deferprocStack(d *_defer) {
// go code on the system stack can't defer
throw("defer on system stack")
}
+ if goexperiment.RegabiDefer && d.siz != 0 {
+ throw("defer with non-empty frame")
+ }
// siz and fn are already set.
// The other fields are junk on entry to deferprocStack and
// are initialized here.
@@ -374,6 +390,19 @@ func deferArgs(d *_defer) unsafe.Pointer {
return add(unsafe.Pointer(d), unsafe.Sizeof(*d))
}
+// deferFunc returns d's deferred function. This is temporary while we
+// support both modes of GOEXPERIMENT=regabidefer. Once we commit to
+// that experiment, we should change the type of d.fn.
+//go:nosplit
+func deferFunc(d *_defer) func() {
+ if !goexperiment.RegabiDefer {
+ throw("requires GOEXPERIMENT=regabidefer")
+ }
+ var fn func()
+ *(**funcval)(unsafe.Pointer(&fn)) = d.fn
+ return fn
+}
+
var deferType *_type // type of _defer struct
func init() {
@@ -519,10 +548,8 @@ func freedeferfn() {
// modifying the caller's frame in order to reuse the frame to call the deferred
// function.
//
-// The single argument isn't actually used - it just has its address
-// taken so it can be matched against pending defers.
//go:nosplit
-func deferreturn(arg0 uintptr) {
+func deferreturn() {
gp := getg()
d := gp._defer
if d == nil {
@@ -548,13 +575,14 @@ func deferreturn(arg0 uintptr) {
// nosplit because the garbage collector won't know the form
// of the arguments until the jmpdefer can flip the PC over to
// fn.
+ argp := getcallersp() + sys.MinFrameSize
switch d.siz {
case 0:
// Do nothing.
case sys.PtrSize:
- *(*uintptr)(unsafe.Pointer(&arg0)) = *(*uintptr)(deferArgs(d))
+ *(*uintptr)(unsafe.Pointer(argp)) = *(*uintptr)(deferArgs(d))
default:
- memmove(unsafe.Pointer(&arg0), deferArgs(d), uintptr(d.siz))
+ memmove(unsafe.Pointer(argp), deferArgs(d), uintptr(d.siz))
}
fn := d.fn
d.fn = nil
@@ -566,7 +594,7 @@ func deferreturn(arg0 uintptr) {
// stack, because the stack trace can be incorrect in that case - see
// issue #8153).
_ = fn.fn
- jmpdefer(fn, uintptr(unsafe.Pointer(&arg0)))
+ jmpdefer(fn, argp)
}
// Goexit terminates the goroutine that calls it. No other goroutine is affected.
@@ -627,10 +655,15 @@ func Goexit() {
addOneOpenDeferFrame(gp, 0, nil)
}
} else {
-
- // Save the pc/sp in reflectcallSave(), so we can "recover" back to this
- // loop if necessary.
- reflectcallSave(&p, unsafe.Pointer(d.fn), deferArgs(d), uint32(d.siz))
+ if goexperiment.RegabiDefer {
+ // Save the pc/sp in deferCallSave(), so we can "recover" back to this
+ // loop if necessary.
+ deferCallSave(&p, deferFunc(d))
+ } else {
+ // Save the pc/sp in reflectcallSave(), so we can "recover" back to this
+ // loop if necessary.
+ reflectcallSave(&p, unsafe.Pointer(d.fn), deferArgs(d), uint32(d.siz))
+ }
}
if p.aborted {
// We had a recursive panic in the defer d we started, and
@@ -699,7 +732,7 @@ func printpanics(p *_panic) {
// specified by sp. If sp is nil, it uses the sp from the current defer record
// (which has just been finished). Hence, it continues the stack scan from the
// frame of the defer that just finished. It skips any frame that already has an
-// open-coded _defer record, which would have been been created from a previous
+// open-coded _defer record, which would have been created from a previous
// (unrecovered) panic.
//
// Note: All entries of the defer chain (including this new open-coded entry) have
@@ -824,6 +857,9 @@ func runOpenDeferFrame(gp *g, d *_defer) bool {
argWidth, fd = readvarintUnsafe(fd)
closureOffset, fd = readvarintUnsafe(fd)
nArgs, fd = readvarintUnsafe(fd)
+ if goexperiment.RegabiDefer && argWidth != 0 {
+ throw("defer with non-empty frame")
+ }
if deferBits&(1<<i) == 0 {
for j := uint32(0); j < nArgs; j++ {
_, fd = readvarintUnsafe(fd)
@@ -849,7 +885,11 @@ func runOpenDeferFrame(gp *g, d *_defer) bool {
deferBits = deferBits &^ (1 << i)
*(*uint8)(unsafe.Pointer(d.varp - uintptr(deferBitsOffset))) = deferBits
p := d._panic
- reflectcallSave(p, unsafe.Pointer(closure), deferArgs, argWidth)
+ if goexperiment.RegabiDefer {
+ deferCallSave(p, deferFunc(d))
+ } else {
+ reflectcallSave(p, unsafe.Pointer(closure), deferArgs, argWidth)
+ }
if p != nil && p.aborted {
break
}
@@ -869,17 +909,20 @@ func runOpenDeferFrame(gp *g, d *_defer) bool {
// panic record. This allows the runtime to return to the Goexit defer processing
// loop, in the unusual case where the Goexit may be bypassed by a successful
// recover.
+//
+// This is marked as a wrapper by the compiler so it doesn't appear in
+// tracebacks.
func reflectcallSave(p *_panic, fn, arg unsafe.Pointer, argsize uint32) {
+ if goexperiment.RegabiDefer {
+ throw("not allowed with GOEXPERIMENT=regabidefer")
+ }
if p != nil {
- p.argp = unsafe.Pointer(getargp(0))
+ p.argp = unsafe.Pointer(getargp())
p.pc = getcallerpc()
p.sp = unsafe.Pointer(getcallersp())
}
- // Pass a dummy RegArgs for now since no function actually implements
- // the register-based ABI.
- //
- // TODO(mknyszek): Implement this properly, setting up arguments in
- // registers as necessary in the caller.
+ // Pass a dummy RegArgs since we'll only take this path if
+ // we're not using the register ABI.
var regs abi.RegArgs
reflectcall(nil, fn, arg, argsize, argsize, argsize, &regs)
if p != nil {
@@ -888,6 +931,29 @@ func reflectcallSave(p *_panic, fn, arg unsafe.Pointer, argsize uint32) {
}
}
+// deferCallSave calls fn() after saving the caller's pc and sp in the
+// panic record. This allows the runtime to return to the Goexit defer
+// processing loop, in the unusual case where the Goexit may be
+// bypassed by a successful recover.
+//
+// This is marked as a wrapper by the compiler so it doesn't appear in
+// tracebacks.
+func deferCallSave(p *_panic, fn func()) {
+ if !goexperiment.RegabiDefer {
+ throw("only allowed with GOEXPERIMENT=regabidefer")
+ }
+ if p != nil {
+ p.argp = unsafe.Pointer(getargp())
+ p.pc = getcallerpc()
+ p.sp = unsafe.Pointer(getcallersp())
+ }
+ fn()
+ if p != nil {
+ p.pc = 0
+ p.sp = unsafe.Pointer(nil)
+ }
+}
+
// The implementation of the predeclared function panic.
func gopanic(e interface{}) {
gp := getg()
@@ -959,7 +1025,7 @@ func gopanic(e interface{}) {
// Mark defer as started, but keep on list, so that traceback
// can find and update the defer's argument frame if stack growth
- // or a garbage collection happens before reflectcall starts executing d.fn.
+ // or a garbage collection happens before executing d.fn.
d.started = true
// Record the panic that is running the defer.
@@ -974,14 +1040,21 @@ func gopanic(e interface{}) {
addOneOpenDeferFrame(gp, 0, nil)
}
} else {
- p.argp = unsafe.Pointer(getargp(0))
+ p.argp = unsafe.Pointer(getargp())
- var regs abi.RegArgs
- reflectcall(nil, unsafe.Pointer(d.fn), deferArgs(d), uint32(d.siz), uint32(d.siz), uint32(d.siz), &regs)
+ if goexperiment.RegabiDefer {
+ fn := deferFunc(d)
+ fn()
+ } else {
+ // Pass a dummy RegArgs since we'll only take this path if
+ // we're not using the register ABI.
+ var regs abi.RegArgs
+ reflectcall(nil, unsafe.Pointer(d.fn), deferArgs(d), uint32(d.siz), uint32(d.siz), uint32(d.siz), &regs)
+ }
}
p.argp = nil
- // reflectcall did not panic. Remove d.
+ // Deferred function did not panic. Remove d.
if gp._defer != d {
throw("bad defer entry in panic")
}
@@ -1079,9 +1152,8 @@ func gopanic(e interface{}) {
// writes outgoing function call arguments.
//go:nosplit
//go:noinline
-func getargp(x int) uintptr {
- // x is an argument mainly so that we can return its address.
- return uintptr(noescape(unsafe.Pointer(&x)))
+func getargp() uintptr {
+ return getcallersp() + sys.MinFrameSize
}
// The implementation of the predeclared function recover.
@@ -1418,5 +1490,9 @@ func shouldPushSigpanic(gp *g, pc, lr uintptr) bool {
//
//go:nosplit
func isAbortPC(pc uintptr) bool {
- return pc == funcPC(abort) || ((GOARCH == "arm" || GOARCH == "arm64") && pc == funcPC(abort)+sys.PCQuantum)
+ f := findfunc(pc)
+ if !f.valid() {
+ return false
+ }
+ return f.funcID == funcID_abort
}
diff --git a/src/runtime/panic32.go b/src/runtime/panic32.go
index aea8401a37..acbdd1ff45 100644
--- a/src/runtime/panic32.go
+++ b/src/runtime/panic32.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build 386 || arm || mips || mipsle
// +build 386 arm mips mipsle
package runtime
diff --git a/src/runtime/plugin.go b/src/runtime/plugin.go
index 5e05be71ec..cd7fc5f848 100644
--- a/src/runtime/plugin.go
+++ b/src/runtime/plugin.go
@@ -115,7 +115,8 @@ func pluginftabverify(md *moduledata) {
entry2 = f2.entry
}
badtable = true
- println("ftab entry outside pc range: ", hex(entry), "/", hex(entry2), ": ", name, "/", name2)
+ println("ftab entry", hex(entry), "/", hex(entry2), ": ",
+ name, "/", name2, "outside pc range:[", hex(md.minpc), ",", hex(md.maxpc), "], modulename=", md.modulename, ", pluginpath=", md.pluginpath)
}
if badtable {
throw("runtime: plugin has bad symbol table")
diff --git a/src/runtime/pprof/mprof_test.go b/src/runtime/pprof/mprof_test.go
index c11a45fd69..3ef40d3de7 100644
--- a/src/runtime/pprof/mprof_test.go
+++ b/src/runtime/pprof/mprof_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !js
// +build !js
package pprof
@@ -93,31 +94,31 @@ func TestMemoryProfiler(t *testing.T) {
}{{
stk: []string{"runtime/pprof.allocatePersistent1K", "runtime/pprof.TestMemoryProfiler"},
legacy: fmt.Sprintf(`%v: %v \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-# 0x[0-9,a-f]+ runtime/pprof\.allocatePersistent1K\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test\.go:47
-# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test\.go:82
+# 0x[0-9,a-f]+ runtime/pprof\.allocatePersistent1K\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test\.go:48
+# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test\.go:83
`, 32*memoryProfilerRun, 1024*memoryProfilerRun, 32*memoryProfilerRun, 1024*memoryProfilerRun),
}, {
stk: []string{"runtime/pprof.allocateTransient1M", "runtime/pprof.TestMemoryProfiler"},
legacy: fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-# 0x[0-9,a-f]+ runtime/pprof\.allocateTransient1M\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:24
-# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:79
+# 0x[0-9,a-f]+ runtime/pprof\.allocateTransient1M\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:25
+# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:80
`, (1<<10)*memoryProfilerRun, (1<<20)*memoryProfilerRun),
}, {
stk: []string{"runtime/pprof.allocateTransient2M", "runtime/pprof.TestMemoryProfiler"},
legacy: fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-# 0x[0-9,a-f]+ runtime/pprof\.allocateTransient2M\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:30
-# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:80
+# 0x[0-9,a-f]+ runtime/pprof\.allocateTransient2M\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:31
+# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:81
`, memoryProfilerRun, (2<<20)*memoryProfilerRun),
}, {
stk: []string{"runtime/pprof.allocateTransient2MInline", "runtime/pprof.TestMemoryProfiler"},
legacy: fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-# 0x[0-9,a-f]+ runtime/pprof\.allocateTransient2MInline\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:34
-# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:81
+# 0x[0-9,a-f]+ runtime/pprof\.allocateTransient2MInline\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:35
+# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:82
`, memoryProfilerRun, (2<<20)*memoryProfilerRun),
}, {
stk: []string{"runtime/pprof.allocateReflectTransient"},
legacy: fmt.Sprintf(`0: 0 \[%v: %v\] @( 0x[0-9,a-f]+)+
-# 0x[0-9,a-f]+ runtime/pprof\.allocateReflectTransient\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:55
+# 0x[0-9,a-f]+ runtime/pprof\.allocateReflectTransient\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:56
`, memoryProfilerRun, (2<<20)*memoryProfilerRun),
}}
diff --git a/src/runtime/pprof/pprof.go b/src/runtime/pprof/pprof.go
index d3b7df3c1b..99eda10f1c 100644
--- a/src/runtime/pprof/pprof.go
+++ b/src/runtime/pprof/pprof.go
@@ -843,45 +843,7 @@ func countMutex() int {
// writeBlock writes the current blocking profile to w.
func writeBlock(w io.Writer, debug int) error {
- var p []runtime.BlockProfileRecord
- n, ok := runtime.BlockProfile(nil)
- for {
- p = make([]runtime.BlockProfileRecord, n+50)
- n, ok = runtime.BlockProfile(p)
- if ok {
- p = p[:n]
- break
- }
- }
-
- sort.Slice(p, func(i, j int) bool { return p[i].Cycles > p[j].Cycles })
-
- if debug <= 0 {
- return printCountCycleProfile(w, "contentions", "delay", scaleBlockProfile, p)
- }
-
- b := bufio.NewWriter(w)
- tw := tabwriter.NewWriter(w, 1, 8, 1, '\t', 0)
- w = tw
-
- fmt.Fprintf(w, "--- contention:\n")
- fmt.Fprintf(w, "cycles/second=%v\n", runtime_cyclesPerSecond())
- for i := range p {
- r := &p[i]
- fmt.Fprintf(w, "%v %v @", r.Cycles, r.Count)
- for _, pc := range r.Stack() {
- fmt.Fprintf(w, " %#x", pc)
- }
- fmt.Fprint(w, "\n")
- if debug > 0 {
- printStackRecord(w, r.Stack(), true)
- }
- }
-
- if tw != nil {
- tw.Flush()
- }
- return b.Flush()
+ return writeProfileInternal(w, debug, "contention", runtime.BlockProfile, scaleBlockProfile)
}
func scaleBlockProfile(cnt int64, ns float64) (int64, float64) {
@@ -894,12 +856,16 @@ func scaleBlockProfile(cnt int64, ns float64) (int64, float64) {
// writeMutex writes the current mutex profile to w.
func writeMutex(w io.Writer, debug int) error {
- // TODO(pjw): too much common code with writeBlock. FIX!
+ return writeProfileInternal(w, debug, "mutex", runtime.MutexProfile, scaleMutexProfile)
+}
+
+// writeProfileInternal writes the current blocking or mutex profile to w, depending on the passed parameters.
+func writeProfileInternal(w io.Writer, debug int, name string, runtimeProfile func([]runtime.BlockProfileRecord) (int, bool), scaleProfile func(int64, float64) (int64, float64)) error {
var p []runtime.BlockProfileRecord
- n, ok := runtime.MutexProfile(nil)
+ n, ok := runtimeProfile(nil)
for {
p = make([]runtime.BlockProfileRecord, n+50)
- n, ok = runtime.MutexProfile(p)
+ n, ok = runtimeProfile(p)
if ok {
p = p[:n]
break
@@ -909,16 +875,18 @@ func writeMutex(w io.Writer, debug int) error {
sort.Slice(p, func(i, j int) bool { return p[i].Cycles > p[j].Cycles })
if debug <= 0 {
- return printCountCycleProfile(w, "contentions", "delay", scaleMutexProfile, p)
+ return printCountCycleProfile(w, "contentions", "delay", scaleProfile, p)
}
b := bufio.NewWriter(w)
tw := tabwriter.NewWriter(w, 1, 8, 1, '\t', 0)
w = tw
- fmt.Fprintf(w, "--- mutex:\n")
+ fmt.Fprintf(w, "--- %v:\n", name)
fmt.Fprintf(w, "cycles/second=%v\n", runtime_cyclesPerSecond())
- fmt.Fprintf(w, "sampling period=%d\n", runtime.SetMutexProfileFraction(-1))
+ if name == "mutex" {
+ fmt.Fprintf(w, "sampling period=%d\n", runtime.SetMutexProfileFraction(-1))
+ }
for i := range p {
r := &p[i]
fmt.Fprintf(w, "%v %v @", r.Cycles, r.Count)
diff --git a/src/runtime/pprof/pprof_norusage.go b/src/runtime/pprof/pprof_norusage.go
index 6fdcc6cc38..e175dd380c 100644
--- a/src/runtime/pprof/pprof_norusage.go
+++ b/src/runtime/pprof/pprof_norusage.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !darwin && !linux
// +build !darwin,!linux
package pprof
diff --git a/src/runtime/pprof/pprof_rusage.go b/src/runtime/pprof/pprof_rusage.go
index 7954673811..269f21bc2f 100644
--- a/src/runtime/pprof/pprof_rusage.go
+++ b/src/runtime/pprof/pprof_rusage.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build darwin || linux
// +build darwin linux
package pprof
diff --git a/src/runtime/pprof/pprof_test.go b/src/runtime/pprof/pprof_test.go
index 14321b0934..7cbb4fc7ae 100644
--- a/src/runtime/pprof/pprof_test.go
+++ b/src/runtime/pprof/pprof_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !js
// +build !js
package pprof
@@ -11,9 +12,9 @@ import (
"context"
"fmt"
"internal/profile"
- "internal/race"
"internal/testenv"
"io"
+ "math"
"math/big"
"os"
"os/exec"
@@ -24,6 +25,7 @@ import (
"sync/atomic"
"testing"
"time"
+ _ "unsafe"
)
func cpuHogger(f func(x int) int, y *int, dur time.Duration) {
@@ -275,7 +277,8 @@ func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []stri
broken := false
switch runtime.GOOS {
- case "ios", "dragonfly", "netbsd", "illumos", "solaris":
+ // See https://golang.org/issue/45170 for AIX.
+ case "ios", "dragonfly", "netbsd", "illumos", "solaris", "aix":
broken = true
case "openbsd":
if runtime.GOARCH == "arm" || runtime.GOARCH == "arm64" {
@@ -584,18 +587,6 @@ func stackContainsAll(spec string, count uintptr, stk []*profile.Location, label
}
func TestMorestack(t *testing.T) {
- if runtime.GOOS == "darwin" && race.Enabled {
- // For whatever reason, using the race detector on macOS keeps us
- // from finding the newstack/growstack calls in the profile.
- // Not worth worrying about.
- // https://build.golang.org/log/280d387327806e17c8aabeb38b9503dbbd942ed1
- t.Skip("skipping on darwin race detector")
- }
- if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
- // For whatever reason, darwin/arm64 also doesn't work.
- // https://build.golang.org/log/c45e82cc25f152642e6fb90d882ef5a8cd130ce5
- t.Skip("skipping on darwin/arm64")
- }
testCPUProfile(t, stackContainsAll, []string{"runtime.newstack,runtime/pprof.growstack"}, avoidFunctions(), func(duration time.Duration) {
t := time.After(duration)
c := make(chan bool)
@@ -615,17 +606,20 @@ func TestMorestack(t *testing.T) {
//go:noinline
func growstack1() {
- growstack()
+ growstack(10)
}
//go:noinline
-func growstack() {
- var buf [8 << 10]byte
+func growstack(n int) {
+ var buf [8 << 16]byte
use(buf)
+ if n > 0 {
+ growstack(n - 1)
+ }
}
//go:noinline
-func use(x [8 << 10]byte) {}
+func use(x [8 << 16]byte) {}
func TestBlockProfile(t *testing.T) {
type TestCase struct {
@@ -903,6 +897,74 @@ func blockCond() {
mu.Unlock()
}
+// See http://golang.org/cl/299991.
+func TestBlockProfileBias(t *testing.T) {
+ rate := int(1000) // arbitrary value
+ runtime.SetBlockProfileRate(rate)
+ defer runtime.SetBlockProfileRate(0)
+
+ // simulate blocking events
+ blockFrequentShort(rate)
+ blockInfrequentLong(rate)
+
+ var w bytes.Buffer
+ Lookup("block").WriteTo(&w, 0)
+ p, err := profile.Parse(&w)
+ if err != nil {
+ t.Fatalf("failed to parse profile: %v", err)
+ }
+ t.Logf("parsed proto: %s", p)
+
+ il := float64(-1) // blockInfrequentLong duration
+ fs := float64(-1) // blockFrequentShort duration
+ for _, s := range p.Sample {
+ for _, l := range s.Location {
+ for _, line := range l.Line {
+ if len(s.Value) < 2 {
+ t.Fatal("block profile has less than 2 sample types")
+ }
+
+ if line.Function.Name == "runtime/pprof.blockInfrequentLong" {
+ il = float64(s.Value[1])
+ } else if line.Function.Name == "runtime/pprof.blockFrequentShort" {
+ fs = float64(s.Value[1])
+ }
+ }
+ }
+ }
+ if il == -1 || fs == -1 {
+ t.Fatal("block profile is missing expected functions")
+ }
+
+ // stddev of bias from 100 runs on local machine multiplied by 10x
+ const threshold = 0.2
+ if bias := (il - fs) / il; math.Abs(bias) > threshold {
+ t.Fatalf("bias: abs(%f) > %f", bias, threshold)
+ } else {
+ t.Logf("bias: abs(%f) < %f", bias, threshold)
+ }
+}
+
+// blockFrequentShort produces 100000 block events with an average duration of
+// rate / 10.
+func blockFrequentShort(rate int) {
+ for i := 0; i < 100000; i++ {
+ blockevent(int64(rate/10), 1)
+ }
+}
+
+// blockInfrequentLong produces 10000 block events with an average duration of
+// rate.
+func blockInfrequentLong(rate int) {
+ for i := 0; i < 10000; i++ {
+ blockevent(int64(rate), 1)
+ }
+}
+
+// Used by TestBlockProfileBias.
+//go:linkname blockevent runtime.blockevent
+func blockevent(cycles int64, skip int)
+
func TestMutexProfile(t *testing.T) {
// Generate mutex profile
diff --git a/src/runtime/preempt_amd64.s b/src/runtime/preempt_amd64.s
index 92c664d79a..dc7af806d3 100644
--- a/src/runtime/preempt_amd64.s
+++ b/src/runtime/preempt_amd64.s
@@ -13,11 +13,6 @@ TEXT ·asyncPreempt<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-0
ADJSP $368
// But vet doesn't know ADJSP, so suppress vet stack checking
NOP SP
- #ifdef GOOS_darwin
- CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $0
- JE 2(PC)
- VZEROUPPER
- #endif
MOVQ AX, 0(SP)
MOVQ CX, 8(SP)
MOVQ DX, 16(SP)
@@ -32,6 +27,11 @@ TEXT ·asyncPreempt<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-0
MOVQ R13, 88(SP)
MOVQ R14, 96(SP)
MOVQ R15, 104(SP)
+ #ifdef GOOS_darwin
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $0
+ JE 2(PC)
+ VZEROUPPER
+ #endif
MOVUPS X0, 112(SP)
MOVUPS X1, 128(SP)
MOVUPS X2, 144(SP)
diff --git a/src/runtime/preempt_nonwindows.go b/src/runtime/preempt_nonwindows.go
index 3066a1521e..365e86a611 100644
--- a/src/runtime/preempt_nonwindows.go
+++ b/src/runtime/preempt_nonwindows.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !windows
// +build !windows
package runtime
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index dbb430fd25..d9f8c65530 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -5,15 +5,14 @@
package runtime
import (
- "internal/bytealg"
+ "internal/abi"
"internal/cpu"
+ "internal/goexperiment"
"runtime/internal/atomic"
"runtime/internal/sys"
"unsafe"
)
-var buildVersion = sys.TheVersion
-
// set using cmd/go/internal/modload.ModInfoProg
var modinfo string
@@ -52,33 +51,64 @@ var modinfo string
// any work to do.
//
// The current approach:
-// We unpark an additional thread when we ready a goroutine if (1) there is an
-// idle P and there are no "spinning" worker threads. A worker thread is considered
-// spinning if it is out of local work and did not find work in global run queue/
-// netpoller; the spinning state is denoted in m.spinning and in sched.nmspinning.
-// Threads unparked this way are also considered spinning; we don't do goroutine
-// handoff so such threads are out of work initially. Spinning threads do some
-// spinning looking for work in per-P run queues before parking. If a spinning
+//
+// This approach applies to three primary sources of potential work: readying a
+// goroutine, new/modified-earlier timers, and idle-priority GC. See below for
+// additional details.
+//
+// We unpark an additional thread when we submit work if (this is wakep()):
+// 1. There is an idle P, and
+// 2. There are no "spinning" worker threads.
+//
+// A worker thread is considered spinning if it is out of local work and did
+// not find work in the global run queue or netpoller; the spinning state is
+// denoted in m.spinning and in sched.nmspinning. Threads unparked this way are
+// also considered spinning; we don't do goroutine handoff so such threads are
+// out of work initially. Spinning threads spin on looking for work in per-P
+// run queues and timer heaps or from the GC before parking. If a spinning
// thread finds work it takes itself out of the spinning state and proceeds to
-// execution. If it does not find work it takes itself out of the spinning state
-// and then parks.
-// If there is at least one spinning thread (sched.nmspinning>1), we don't unpark
-// new threads when readying goroutines. To compensate for that, if the last spinning
-// thread finds work and stops spinning, it must unpark a new spinning thread.
-// This approach smooths out unjustified spikes of thread unparking,
-// but at the same time guarantees eventual maximal CPU parallelism utilization.
+// execution. If it does not find work it takes itself out of the spinning
+// state and then parks.
+//
+// If there is at least one spinning thread (sched.nmspinning>1), we don't
+// unpark new threads when submitting work. To compensate for that, if the last
+// spinning thread finds work and stops spinning, it must unpark a new spinning
+// thread. This approach smooths out unjustified spikes of thread unparking,
+// but at the same time guarantees eventual maximal CPU parallelism
+// utilization.
//
-// The main implementation complication is that we need to be very careful during
-// spinning->non-spinning thread transition. This transition can race with submission
-// of a new goroutine, and either one part or another needs to unpark another worker
-// thread. If they both fail to do that, we can end up with semi-persistent CPU
-// underutilization. The general pattern for goroutine readying is: submit a goroutine
-// to local work queue, #StoreLoad-style memory barrier, check sched.nmspinning.
-// The general pattern for spinning->non-spinning transition is: decrement nmspinning,
-// #StoreLoad-style memory barrier, check all per-P work queues for new work.
-// Note that all this complexity does not apply to global run queue as we are not
-// sloppy about thread unparking when submitting to global queue. Also see comments
-// for nmspinning manipulation.
+// The main implementation complication is that we need to be very careful
+// during spinning->non-spinning thread transition. This transition can race
+// with submission of new work, and either one part or another needs to unpark
+// another worker thread. If they both fail to do that, we can end up with
+// semi-persistent CPU underutilization.
+//
+// The general pattern for submission is:
+// 1. Submit work to the local run queue, timer heap, or GC state.
+// 2. #StoreLoad-style memory barrier.
+// 3. Check sched.nmspinning.
+//
+// The general pattern for spinning->non-spinning transition is:
+// 1. Decrement nmspinning.
+// 2. #StoreLoad-style memory barrier.
+// 3. Check all per-P work queues and GC for new work.
+//
+// Note that all this complexity does not apply to the global run queue, as we
+// are not sloppy about thread unparking when submitting to the global queue.
+// Also see comments for nmspinning manipulation.
+//
+// How these different sources of work behave varies, though it doesn't affect
+// the synchronization approach:
+// * Ready goroutine: this is an obvious source of work; the goroutine is
+// immediately ready and must run on some thread eventually.
+// * New/modified-earlier timer: The current timer implementation (see time.go)
+// uses netpoll in a thread with no work available to wait for the soonest
+// timer. If there is no thread waiting, we want a new spinning thread to go
+// wait.
+// * Idle-priority GC: The GC wakes a stopped idle thread to contribute to
+// background GC work (note: currently disabled per golang.org/issue/19112).
+// Also see golang.org/issue/44313, as this should be extended to all GC
+// workers.
var (
m0 m
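As an editorial aside, the submit/unspin race described in the comment above can be modeled in a few lines. This is a hedged sketch with illustrative names (pending and nmspinning as plain counters), not the runtime's actual code; the point is that the barrier-then-recheck on each side guarantees at least one party observes the other's write:

package main

import (
	"fmt"
	"sync/atomic"
)

var (
	pending    int64 // stands in for per-P run queues / timer heaps / GC work
	nmspinning int64 // count of spinning worker threads
)

// Submission side: publish work, barrier, then check for spinners.
func submit() {
	atomic.AddInt64(&pending, 1) // 1. submit work
	// 2. the atomic add doubles as the #StoreLoad-style barrier here
	if atomic.LoadInt64(&nmspinning) == 0 { // 3. check nmspinning
		fmt.Println("wakep: no spinners, unpark a worker")
	}
}

// Worker side: drop out of spinning, barrier, then recheck all sources.
func unspin() {
	atomic.AddInt64(&nmspinning, -1) // 1. decrement nmspinning (with barrier)
	if atomic.LoadInt64(&pending) > 0 { // 2-3. recheck for work submitted meanwhile
		fmt.Println("missed work found: restore spinning and retry")
		atomic.AddInt64(&nmspinning, 1)
	}
}

func main() {
	atomic.StoreInt64(&nmspinning, 1) // one spinning worker exists
	submit()                          // sees a spinner, so no wakep needed
	unspin()                          // sees the pending work and keeps spinning
}
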
@@ -541,6 +571,30 @@ func atomicAllGIndex(ptr **g, i uintptr) *g {
return *(**g)(add(unsafe.Pointer(ptr), i*sys.PtrSize))
}
+// forEachG calls fn on every G from allgs.
+//
+// forEachG takes a lock to exclude concurrent addition of new Gs.
+func forEachG(fn func(gp *g)) {
+ lock(&allglock)
+ for _, gp := range allgs {
+ fn(gp)
+ }
+ unlock(&allglock)
+}
+
+// forEachGRace calls fn on every G from allgs.
+//
+// forEachGRace avoids locking, but does not exclude addition of new Gs during
+// execution, which may be missed.
+func forEachGRace(fn func(gp *g)) {
+ ptr, length := atomicAllG()
+ for i := uintptr(0); i < length; i++ {
+ gp := atomicAllGIndex(ptr, i)
+ fn(gp)
+ }
+}
+
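The lock-free variant works because writers publish a grow-only snapshot that readers load atomically. A minimal sketch of that pattern (illustrative types; the runtime publishes the pointer and length as separate atomics via atomicAllG):

package main

import (
	"fmt"
	"sync/atomic"
	"unsafe"
)

type snapshot struct {
	data []int // grow-only: existing elements are never mutated
}

var cur unsafe.Pointer // *snapshot, published atomically by writers

func publish(s []int) {
	atomic.StorePointer(&cur, unsafe.Pointer(&snapshot{data: s}))
}

// forEachRace visits a consistent snapshot without taking the writers'
// lock; elements appended after the load are simply missed.
func forEachRace(fn func(int)) {
	s := (*snapshot)(atomic.LoadPointer(&cur))
	for _, v := range s.data {
		fn(v)
	}
}

func main() {
	publish([]int{1, 2, 3})
	forEachRace(func(v int) { fmt.Println(v) })
}
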
const (
// Number of goroutine ids to grab from sched.goidgen to local per-P cache at once.
// 16 seems to provide enough amortization, but other than that it's mostly arbitrary number.
@@ -644,6 +698,11 @@ func schedinit() {
sigsave(&_g_.m.sigmask)
initSigmask = _g_.m.sigmask
+ if offset := unsafe.Offsetof(sched.timeToRun); offset%8 != 0 {
+ println(offset)
+ throw("sched.timeToRun not aligned to 8 bytes")
+ }
+
goargs()
goenvs()
parsedebugvars()
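The new schedinit check above guards the alignment of sched.timeToRun: 64-bit atomic accesses require 8-byte alignment, notably on 32-bit platforms, so an accidental struct reshuffle should fail loudly at startup. A standalone sketch of the same guard, with an illustrative struct in place of the real sched type:

package main

import (
	"fmt"
	"unsafe"
)

type schedLike struct {
	flags     uint32
	_         uint32 // explicit padding keeps the next field 8-byte aligned
	timeToRun uint64 // accessed with 64-bit atomics; must be 8-byte aligned
}

func main() {
	if off := unsafe.Offsetof(schedLike{}.timeToRun); off%8 != 0 {
		panic(fmt.Sprintf("timeToRun misaligned at offset %d", off))
	}
	fmt.Println("timeToRun is 8-byte aligned")
}
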
@@ -920,6 +979,37 @@ func casgstatus(gp *g, oldval, newval uint32) {
nextYield = nanotime() + yieldDelay/2
}
}
+
+ // Handle tracking for scheduling latencies.
+ if oldval == _Grunning {
+ // Track every 8th time a goroutine transitions out of running.
+ if gp.trackingSeq%gTrackingPeriod == 0 {
+ gp.tracking = true
+ }
+ gp.trackingSeq++
+ }
+ if gp.tracking {
+ now := nanotime()
+ if oldval == _Grunnable {
+ // We transitioned out of runnable, so measure how much
+ // time we spent in this state and add it to
+ // runnableTime.
+ gp.runnableTime += now - gp.runnableStamp
+ gp.runnableStamp = 0
+ }
+ if newval == _Grunnable {
+ // We just transitioned into runnable, so record what
+ // time that happened.
+ gp.runnableStamp = now
+ } else if newval == _Grunning {
+ // We're transitioning into running, so turn off
+ // tracking and record how much time we spent in
+ // runnable.
+ gp.tracking = false
+ sched.timeToRun.record(gp.runnableTime)
+ gp.runnableTime = 0
+ }
+ }
}
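The tracking added above samples roughly one in gTrackingPeriod goroutines so the common transition path stays cheap. A self-contained sketch of the same sampling shape (illustrative gLike type, with wall-clock time standing in for nanotime and the runtime's histogram):

package main

import (
	"fmt"
	"time"
)

const trackingPeriod = 8 // matches gTrackingPeriod in the diff

type gLike struct {
	trackingSeq   uint8
	tracking      bool
	runnableStamp time.Time
}

// leaveRunning mirrors the oldval == _Grunning branch: every
// trackingPeriod-th departure from running arms tracking.
func (g *gLike) leaveRunning() {
	if g.trackingSeq%trackingPeriod == 0 {
		g.tracking = true
	}
	g.trackingSeq++
}

// enterRunnable and enterRunning mirror the newval branches: stamp on
// becoming runnable, record the latency once actually running.
func (g *gLike) enterRunnable(now time.Time) {
	if g.tracking {
		g.runnableStamp = now
	}
}

func (g *gLike) enterRunning(now time.Time, record func(time.Duration)) {
	if g.tracking {
		record(now.Sub(g.runnableStamp))
		g.tracking = false
	}
}

func main() {
	g := &gLike{} // seq 0, so the first transition is tracked
	g.leaveRunning()
	base := time.Now()
	g.enterRunnable(base)
	g.enterRunning(base.Add(2*time.Millisecond), func(d time.Duration) {
		fmt.Println("time to run:", d) // 2ms
	})
}
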
// casgstatus(gp, oldstatus, Gcopystack), assuming oldstatus is Gwaiting or Grunnable.
@@ -1213,7 +1303,7 @@ func usesLibcall() bool {
case "aix", "darwin", "illumos", "ios", "solaris", "windows":
return true
case "openbsd":
- return GOARCH == "amd64" || GOARCH == "arm64"
+ return GOARCH == "386" || GOARCH == "amd64" || GOARCH == "arm64"
}
return false
}
@@ -1226,7 +1316,7 @@ func mStackIsSystemAllocated() bool {
return true
case "openbsd":
switch GOARCH {
- case "amd64", "arm64":
+ case "386", "amd64", "arm64":
return true
}
}
@@ -1234,7 +1324,7 @@ func mStackIsSystemAllocated() bool {
}
// mstart is the entry-point for new Ms.
-// It is written in assembly, marked TOPFRAME, and calls mstart0.
+// It is written in assembly, uses ABI0, is marked TOPFRAME, and calls mstart0.
func mstart()
// mstart0 is the Go entry-point for new Ms.
@@ -1294,7 +1384,7 @@ func mstart1() {
throw("bad runtime·mstart")
}
- // Set up m.g0.sched as a label returning returning to just
+ // Set up m.g0.sched as a label returning to just
// after the mstart1 call in mstart0 above, for use by goexit0 and mcall.
// We're never coming back to mstart1 after we call schedule,
// so other calls can reuse the current frame.
@@ -1349,6 +1439,9 @@ func mPark() {
g := getg()
for {
notesleep(&g.m.park)
+ // Note, because of signal handling by this parked m,
+ // a preemptive mDoFixup() may actually occur via
+ // mDoFixupAndOSYield(). (See golang.org/issue/44193)
noteclear(&g.m.park)
if !mDoFixup() {
return
@@ -1582,6 +1675,22 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
for atomic.Load(&sched.sysmonStarting) != 0 {
osyield()
}
+
+	// We don't want this thread to handle signals for the
+	// duration of this critical section. The underlying issue is
+	// that this locked, coordinating m is the one monitoring for
+	// fn() execution by all the other m's of the runtime, while
+	// no regular Go code execution is permitted (the world is
+	// stopped). If this m were to get distracted running signal
+	// handling code, and found itself waiting for a second thread
+	// to execute Go code before it could return from that signal
+	// handling, a deadlock would result. (See golang.org/issue/44193.)
+ lockOSThread()
+ var sigmask sigset
+ sigsave(&sigmask)
+ sigblock(false)
+
stopTheWorldGC("doAllThreadsSyscall")
if atomic.Load(&newmHandoff.haveTemplateThread) != 0 {
// Ensure that there are no in-flight thread
@@ -1633,6 +1742,7 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
// the possibility of racing with mp.
lock(&mp.mFixup.lock)
mp.mFixup.fn = fn
+ atomic.Store(&mp.mFixup.used, 1)
if mp.doesPark {
// For non-service threads this will
// cause the wakeup to be short lived
@@ -1649,9 +1759,7 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
if mp.procid == tid {
continue
}
- lock(&mp.mFixup.lock)
- done = done && (mp.mFixup.fn == nil)
- unlock(&mp.mFixup.lock)
+ done = atomic.Load(&mp.mFixup.used) == 0
}
if done {
break
@@ -1678,6 +1786,8 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
unlock(&mFixupRace.lock)
}
startTheWorldGC()
+ msigrestore(sigmask)
+ unlockOSThread()
}
// runSafePointFn runs the safe point function, if any, for this P.
@@ -1859,6 +1969,10 @@ func needm() {
// Store the original signal mask for use by minit.
mp.sigmask = sigmask
+ // Install TLS on some platforms (previously setg
+ // would do this if necessary).
+ osSetupTLS(mp)
+
// Install g (= m->g0) and set the stack bounds
// to match the current stack. We don't actually know
// how big the stack is, like we don't know how big any
@@ -1909,7 +2023,7 @@ func oneNewExtraM() {
// the goroutine stack ends.
mp := allocm(nil, nil, -1)
gp := malg(4096)
- gp.sched.pc = funcPC(goexit) + sys.PCQuantum
+ gp.sched.pc = abi.FuncPCABI0(goexit) + sys.PCQuantum
gp.sched.sp = gp.stack.hi
gp.sched.sp -= 4 * sys.PtrSize // extra space in case of reads slightly beyond frame
gp.sched.lr = 0
@@ -2168,9 +2282,21 @@ var mFixupRace struct {
// mDoFixup runs any outstanding fixup function for the running m.
// Returns true if a fixup was outstanding and actually executed.
//
+// Note: to avoid deadlocks, and the need for the fixup function
+// itself to be async safe, signals are blocked for the working m
+// while it holds the mFixup lock. (See golang.org/issue/44193)
+//
//go:nosplit
func mDoFixup() bool {
_g_ := getg()
+ if used := atomic.Load(&_g_.m.mFixup.used); used == 0 {
+ return false
+ }
+
+ // slow path - if fixup fn is used, block signals and lock.
+ var sigmask sigset
+ sigsave(&sigmask)
+ sigblock(false)
lock(&_g_.m.mFixup.lock)
fn := _g_.m.mFixup.fn
if fn != nil {
@@ -2187,7 +2313,6 @@ func mDoFixup() bool {
// is more obviously safe.
throw("GC must be disabled to protect validity of fn value")
}
- *(*uintptr)(unsafe.Pointer(&_g_.m.mFixup.fn)) = 0
if _g_.racectx != 0 || !raceenabled {
fn(false)
} else {
@@ -2202,11 +2327,24 @@ func mDoFixup() bool {
_g_.racectx = 0
unlock(&mFixupRace.lock)
}
+ *(*uintptr)(unsafe.Pointer(&_g_.m.mFixup.fn)) = 0
+ atomic.Store(&_g_.m.mFixup.used, 0)
}
unlock(&_g_.m.mFixup.lock)
+ msigrestore(sigmask)
return fn != nil
}
+// mDoFixupAndOSYield is called when an m is unable to send a signal
+// because the allThreadsSyscall mechanism is in progress. That is, an
+// mPark() has been interrupted with this signal handler so we need to
+// ensure the fixup is executed from this context.
+//go:nosplit
+func mDoFixupAndOSYield() {
+ mDoFixup()
+ osyield()
+}
+
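The mDoFixup changes above add an atomic used flag so the common no-fixup case avoids both the lock and the signal-mask manipulation. A minimal model of that fast-path/slow-path split (illustrative names; a sync.Mutex stands in for the runtime lock, and the signal blocking is only noted in a comment):

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

type fixup struct {
	used uint32 // set atomically when fn is armed
	mu   sync.Mutex
	fn   func()
}

func (f *fixup) arm(fn func()) {
	f.mu.Lock()
	f.fn = fn
	atomic.StoreUint32(&f.used, 1)
	f.mu.Unlock()
}

func (f *fixup) run() bool {
	if atomic.LoadUint32(&f.used) == 0 {
		return false // fast path: nothing armed, no lock taken
	}
	// Slow path: the runtime additionally blocks signals here (issue 44193).
	f.mu.Lock()
	fn := f.fn
	if fn != nil {
		fn()
		f.fn = nil
		atomic.StoreUint32(&f.used, 0)
	}
	f.mu.Unlock()
	return fn != nil
}

func main() {
	var f fixup
	fmt.Println(f.run()) // false: fast path, nothing to do
	f.arm(func() { fmt.Println("fixup ran") })
	fmt.Println(f.run()) // fixup ran; true
}
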
// templateThread is a thread in a known-good state that exists solely
// to start new threads in known-good states when the calling thread
// may not be in a good state.
@@ -2624,85 +2762,40 @@ top:
}
}
- // Steal work from other P's.
+ // Spinning Ms: steal work from other Ps.
+ //
+ // Limit the number of spinning Ms to half the number of busy Ps.
+ // This is necessary to prevent excessive CPU consumption when
+ // GOMAXPROCS>>1 but the program parallelism is low.
procs := uint32(gomaxprocs)
- ranTimer := false
- // If number of spinning M's >= number of busy P's, block.
- // This is necessary to prevent excessive CPU consumption
- // when GOMAXPROCS>>1 but the program parallelism is low.
- if !_g_.m.spinning && 2*atomic.Load(&sched.nmspinning) >= procs-atomic.Load(&sched.npidle) {
- goto stop
- }
- if !_g_.m.spinning {
- _g_.m.spinning = true
- atomic.Xadd(&sched.nmspinning, 1)
- }
- const stealTries = 4
- for i := 0; i < stealTries; i++ {
- stealTimersOrRunNextG := i == stealTries-1
-
- for enum := stealOrder.start(fastrand()); !enum.done(); enum.next() {
- if sched.gcwaiting != 0 {
- goto top
- }
- p2 := allp[enum.position()]
- if _p_ == p2 {
- continue
- }
-
- // Steal timers from p2. This call to checkTimers is the only place
- // where we might hold a lock on a different P's timers. We do this
- // once on the last pass before checking runnext because stealing
- // from the other P's runnext should be the last resort, so if there
- // are timers to steal do that first.
- //
- // We only check timers on one of the stealing iterations because
- // the time stored in now doesn't change in this loop and checking
- // the timers for each P more than once with the same value of now
- // is probably a waste of time.
- //
- // timerpMask tells us whether the P may have timers at all. If it
- // can't, no need to check at all.
- if stealTimersOrRunNextG && timerpMask.read(enum.position()) {
- tnow, w, ran := checkTimers(p2, now)
- now = tnow
- if w != 0 && (pollUntil == 0 || w < pollUntil) {
- pollUntil = w
- }
- if ran {
- // Running the timers may have
- // made an arbitrary number of G's
- // ready and added them to this P's
- // local run queue. That invalidates
- // the assumption of runqsteal
- // that is always has room to add
- // stolen G's. So check now if there
- // is a local G to run.
- if gp, inheritTime := runqget(_p_); gp != nil {
- return gp, inheritTime
- }
- ranTimer = true
- }
- }
+ if _g_.m.spinning || 2*atomic.Load(&sched.nmspinning) < procs-atomic.Load(&sched.npidle) {
+ if !_g_.m.spinning {
+ _g_.m.spinning = true
+ atomic.Xadd(&sched.nmspinning, 1)
+ }
- // Don't bother to attempt to steal if p2 is idle.
- if !idlepMask.read(enum.position()) {
- if gp := runqsteal(_p_, p2, stealTimersOrRunNextG); gp != nil {
- return gp, false
- }
- }
+ gp, inheritTime, tnow, w, newWork := stealWork(now)
+ now = tnow
+ if gp != nil {
+ // Successfully stole.
+ return gp, inheritTime
+ }
+ if newWork {
+ // There may be new timer or GC work; restart to
+ // discover.
+ goto top
+ }
+ if w != 0 && (pollUntil == 0 || w < pollUntil) {
+ // Earlier timer to wait for.
+ pollUntil = w
}
}
- if ranTimer {
- // Running a timer may have made some goroutine ready.
- goto top
- }
-
-stop:
- // We have nothing to do. If we're in the GC mark phase, can
- // safely scan and blacken objects, and have work to do, run
- // idle-time marking rather than give up the P.
+ // We have nothing to do.
+ //
+ // If we're in the GC mark phase, can safely scan and blacken objects,
+ // and have work to do, run idle-time marking rather than give up the
+ // P.
if gcBlackenEnabled != 0 && gcMarkWorkAvailable(_p_) {
node := (*gcBgMarkWorkerNode)(gcBgMarkWorkerPool.pop())
if node != nil {
@@ -2716,17 +2809,11 @@ stop:
}
}
- delta := int64(-1)
- if pollUntil != 0 {
- // checkTimers ensures that polluntil > now.
- delta = pollUntil - now
- }
-
// wasm only:
// If a callback returned and no other goroutine is awake,
// then wake event handler goroutine which pauses execution
// until a callback was triggered.
- gp, otherReady := beforeIdle(delta)
+ gp, otherReady := beforeIdle(now, pollUntil)
if gp != nil {
casgstatus(gp, _Gwaiting, _Grunnable)
if trace.enabled {
@@ -2765,18 +2852,25 @@ stop:
pidleput(_p_)
unlock(&sched.lock)
- // Delicate dance: thread transitions from spinning to non-spinning state,
- // potentially concurrently with submission of new goroutines. We must
- // drop nmspinning first and then check all per-P queues again (with
- // #StoreLoad memory barrier in between). If we do it the other way around,
- // another thread can submit a goroutine after we've checked all run queues
- // but before we drop nmspinning; as a result nobody will unpark a thread
- // to run the goroutine.
+ // Delicate dance: thread transitions from spinning to non-spinning
+ // state, potentially concurrently with submission of new work. We must
+ // drop nmspinning first and then check all sources again (with
+ // #StoreLoad memory barrier in between). If we do it the other way
+ // around, another thread can submit work after we've checked all
+ // sources but before we drop nmspinning; as a result nobody will
+ // unpark a thread to run the work.
+ //
+ // This applies to the following sources of work:
+ //
+ // * Goroutines added to a per-P run queue.
+ // * New/modified-earlier timers on a per-P timer heap.
+ // * Idle-priority GC work (barring golang.org/issue/19112).
+ //
// If we discover new work below, we need to restore m.spinning as a signal
// for resetspinning to unpark a new worker thread (because there can be more
// than one starving goroutine). However, if after discovering new work
- // we also observe no idle Ps, it is OK to just park the current thread:
- // the system is fully loaded so no spinning threads are required.
+ // we also observe no idle Ps it is OK to skip unparking a new worker
+ // thread: the system is fully loaded so no spinning threads are required.
// Also see "Worker thread parking/unparking" comment at the top of the file.
wasSpinning := _g_.m.spinning
if _g_.m.spinning {
@@ -2784,97 +2878,48 @@ stop:
if int32(atomic.Xadd(&sched.nmspinning, -1)) < 0 {
throw("findrunnable: negative nmspinning")
}
- }
- // check all runqueues once again
- for id, _p_ := range allpSnapshot {
- if !idlepMaskSnapshot.read(uint32(id)) && !runqempty(_p_) {
- lock(&sched.lock)
- _p_ = pidleget()
- unlock(&sched.lock)
- if _p_ != nil {
- acquirep(_p_)
- if wasSpinning {
- _g_.m.spinning = true
- atomic.Xadd(&sched.nmspinning, 1)
- }
- goto top
- }
- break
- }
- }
-
- // Similar to above, check for timer creation or expiry concurrently with
- // transitioning from spinning to non-spinning. Note that we cannot use
- // checkTimers here because it calls adjusttimers which may need to allocate
- // memory, and that isn't allowed when we don't have an active P.
- for id, _p_ := range allpSnapshot {
- if timerpMaskSnapshot.read(uint32(id)) {
- w := nobarrierWakeTime(_p_)
- if w != 0 && (pollUntil == 0 || w < pollUntil) {
- pollUntil = w
- }
- }
- }
- if pollUntil != 0 {
- if now == 0 {
- now = nanotime()
- }
- delta = pollUntil - now
- if delta < 0 {
- delta = 0
- }
- }
+		// Note that for correctness, only the last M transitioning from
+ // spinning to non-spinning must perform these rechecks to
+ // ensure no missed work. We are performing it on every M that
+ // transitions as a conservative change to monitor effects on
+ // latency. See golang.org/issue/43997.
- // Check for idle-priority GC work again.
- //
- // N.B. Since we have no P, gcBlackenEnabled may change at any time; we
- // must check again after acquiring a P.
- if atomic.Load(&gcBlackenEnabled) != 0 && gcMarkWorkAvailable(nil) {
- // Work is available; we can start an idle GC worker only if
- // there is an available P and available worker G.
- //
- // We can attempt to acquire these in either order. Workers are
- // almost always available (see comment in findRunnableGCWorker
- // for the one case there may be none). Since we're slightly
- // less likely to find a P, check for that first.
- lock(&sched.lock)
- var node *gcBgMarkWorkerNode
- _p_ = pidleget()
+ // Check all runqueues once again.
+ _p_ = checkRunqsNoP(allpSnapshot, idlepMaskSnapshot)
if _p_ != nil {
- // Now that we own a P, gcBlackenEnabled can't change
- // (as it requires STW).
- if gcBlackenEnabled != 0 {
- node = (*gcBgMarkWorkerNode)(gcBgMarkWorkerPool.pop())
- if node == nil {
- pidleput(_p_)
- _p_ = nil
- }
- } else {
- pidleput(_p_)
- _p_ = nil
- }
+ acquirep(_p_)
+ _g_.m.spinning = true
+ atomic.Xadd(&sched.nmspinning, 1)
+ goto top
}
- unlock(&sched.lock)
+
+ // Check for idle-priority GC work again.
+ _p_, gp = checkIdleGCNoP()
if _p_ != nil {
acquirep(_p_)
- if wasSpinning {
- _g_.m.spinning = true
- atomic.Xadd(&sched.nmspinning, 1)
- }
+ _g_.m.spinning = true
+ atomic.Xadd(&sched.nmspinning, 1)
// Run the idle worker.
_p_.gcMarkWorkerMode = gcMarkWorkerIdleMode
- gp := node.gp.ptr()
casgstatus(gp, _Gwaiting, _Grunnable)
if trace.enabled {
traceGoUnpark(gp, 0)
}
return gp, false
}
+
+ // Finally, check for timer creation or expiry concurrently with
+ // transitioning from spinning to non-spinning.
+ //
+ // Note that we cannot use checkTimers here because it calls
+ // adjusttimers which may need to allocate memory, and that isn't
+ // allowed when we don't have an active P.
+ pollUntil = checkTimersNoP(allpSnapshot, timerpMaskSnapshot, pollUntil)
}
- // poll network
+ // Poll network until next timer.
if netpollinited() && (atomic.Load(&netpollWaiters) > 0 || pollUntil != 0) && atomic.Xchg64(&sched.lastpoll, 0) != 0 {
atomic.Store64(&sched.pollUntil, uint64(pollUntil))
if _g_.m.p != 0 {
@@ -2883,11 +2928,21 @@ stop:
if _g_.m.spinning {
throw("findrunnable: netpoll with spinning")
}
+ delay := int64(-1)
+ if pollUntil != 0 {
+ if now == 0 {
+ now = nanotime()
+ }
+ delay = pollUntil - now
+ if delay < 0 {
+ delay = 0
+ }
+ }
if faketime != 0 {
// When using fake time, just poll.
- delta = 0
+ delay = 0
}
- list := netpoll(delta) // block until new work is available
+ list := netpoll(delay) // block until new work is available
atomic.Store64(&sched.pollUntil, 0)
atomic.Store64(&sched.lastpoll, uint64(nanotime()))
if faketime != 0 && list.empty() {
@@ -2949,6 +3004,168 @@ func pollWork() bool {
return false
}
+// stealWork attempts to steal a runnable goroutine or timer from any P.
+//
+// If newWork is true, new work may have been readied.
+//
+// If now is not 0 it is the current time. stealWork returns the passed time or
+// the current time if now was passed as 0.
+func stealWork(now int64) (gp *g, inheritTime bool, rnow, pollUntil int64, newWork bool) {
+ pp := getg().m.p.ptr()
+
+ ranTimer := false
+
+ const stealTries = 4
+ for i := 0; i < stealTries; i++ {
+ stealTimersOrRunNextG := i == stealTries-1
+
+ for enum := stealOrder.start(fastrand()); !enum.done(); enum.next() {
+ if sched.gcwaiting != 0 {
+ // GC work may be available.
+ return nil, false, now, pollUntil, true
+ }
+ p2 := allp[enum.position()]
+ if pp == p2 {
+ continue
+ }
+
+ // Steal timers from p2. This call to checkTimers is the only place
+ // where we might hold a lock on a different P's timers. We do this
+ // once on the last pass before checking runnext because stealing
+ // from the other P's runnext should be the last resort, so if there
+ // are timers to steal do that first.
+ //
+ // We only check timers on one of the stealing iterations because
+ // the time stored in now doesn't change in this loop and checking
+ // the timers for each P more than once with the same value of now
+ // is probably a waste of time.
+ //
+ // timerpMask tells us whether the P may have timers at all. If it
+ // can't, no need to check at all.
+ if stealTimersOrRunNextG && timerpMask.read(enum.position()) {
+ tnow, w, ran := checkTimers(p2, now)
+ now = tnow
+ if w != 0 && (pollUntil == 0 || w < pollUntil) {
+ pollUntil = w
+ }
+ if ran {
+ // Running the timers may have
+ // made an arbitrary number of G's
+ // ready and added them to this P's
+ // local run queue. That invalidates
+ // the assumption of runqsteal
+ // that it always has room to add
+ // stolen G's. So check now if there
+ // is a local G to run.
+ if gp, inheritTime := runqget(pp); gp != nil {
+ return gp, inheritTime, now, pollUntil, ranTimer
+ }
+ ranTimer = true
+ }
+ }
+
+ // Don't bother to attempt to steal if p2 is idle.
+ if !idlepMask.read(enum.position()) {
+ if gp := runqsteal(pp, p2, stealTimersOrRunNextG); gp != nil {
+ return gp, false, now, pollUntil, ranTimer
+ }
+ }
+ }
+ }
+
+ // No goroutines found to steal. Regardless, running a timer may have
+ // made some goroutine ready that we missed. Indicate the next timer to
+ // wait for.
+ return nil, false, now, pollUntil, ranTimer
+}
+
+// Check all Ps for a runnable G to steal.
+//
+// On entry we have no P. If a G is available to steal and a P is available,
+// the P is returned; the caller should acquire it and then attempt to steal
+// the work to it.
+func checkRunqsNoP(allpSnapshot []*p, idlepMaskSnapshot pMask) *p {
+ for id, p2 := range allpSnapshot {
+ if !idlepMaskSnapshot.read(uint32(id)) && !runqempty(p2) {
+ lock(&sched.lock)
+ pp := pidleget()
+ unlock(&sched.lock)
+ if pp != nil {
+ return pp
+ }
+
+ // Can't get a P, don't bother checking remaining Ps.
+ break
+ }
+ }
+
+ return nil
+}
+
+// Check all Ps for a timer expiring sooner than pollUntil.
+//
+// Returns updated pollUntil value.
+func checkTimersNoP(allpSnapshot []*p, timerpMaskSnapshot pMask, pollUntil int64) int64 {
+ for id, p2 := range allpSnapshot {
+ if timerpMaskSnapshot.read(uint32(id)) {
+ w := nobarrierWakeTime(p2)
+ if w != 0 && (pollUntil == 0 || w < pollUntil) {
+ pollUntil = w
+ }
+ }
+ }
+
+ return pollUntil
+}
+
+// Check for idle-priority GC, without a P on entry.
+//
+// If some GC work, a P, and a worker G are all available, the P and G will be
+// returned. The returned P has not been wired yet.
+func checkIdleGCNoP() (*p, *g) {
+ // N.B. Since we have no P, gcBlackenEnabled may change at any time; we
+ // must check again after acquiring a P.
+ if atomic.Load(&gcBlackenEnabled) == 0 {
+ return nil, nil
+ }
+ if !gcMarkWorkAvailable(nil) {
+ return nil, nil
+ }
+
+ // Work is available; we can start an idle GC worker only if
+ // there is an available P and available worker G.
+ //
+ // We can attempt to acquire these in either order. Workers are
+ // almost always available (see comment in findRunnableGCWorker
+ // for the one case there may be none). Since we're slightly
+ // less likely to find a P, check for that first.
+ lock(&sched.lock)
+ pp := pidleget()
+ unlock(&sched.lock)
+ if pp == nil {
+ return nil, nil
+ }
+
+ // Now that we own a P, gcBlackenEnabled can't change
+ // (as it requires STW).
+ if gcBlackenEnabled == 0 {
+ lock(&sched.lock)
+ pidleput(pp)
+ unlock(&sched.lock)
+ return nil, nil
+ }
+
+ node := (*gcBgMarkWorkerNode)(gcBgMarkWorkerPool.pop())
+ if node == nil {
+ lock(&sched.lock)
+ pidleput(pp)
+ unlock(&sched.lock)
+ return nil, nil
+ }
+
+ return pp, node.gp.ptr()
+}
+
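checkIdleGCNoP above is an instance of a check/acquire/re-check pattern: the flag may change at any time until we own a resource that pins it, so it must be re-read after acquisition (in the runtime, owning a P pins gcBlackenEnabled because changing it requires STW). A toy version with illustrative names; unlike the runtime, nothing here truly pins the flag, so this is shape only:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

var (
	enabled int32 // may be cleared concurrently (like gcBlackenEnabled)
	mu      sync.Mutex
	idle    []int // pool of idle resources (like idle Ps)
)

func tryAcquire() (int, bool) {
	if atomic.LoadInt32(&enabled) == 0 {
		return 0, false // cheap pre-check without the lock
	}
	mu.Lock()
	if len(idle) == 0 {
		mu.Unlock()
		return 0, false
	}
	r := idle[len(idle)-1]
	idle = idle[:len(idle)-1]
	mu.Unlock()
	// Re-check now that we hold a resource; if the flag was cleared in
	// the window, return the resource and bail out.
	if atomic.LoadInt32(&enabled) == 0 {
		mu.Lock()
		idle = append(idle, r)
		mu.Unlock()
		return 0, false
	}
	return r, true
}

func main() {
	atomic.StoreInt32(&enabled, 1)
	idle = []int{1, 2}
	fmt.Println(tryAcquire()) // 2 true
}
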
// wakeNetPoller wakes up the thread sleeping in the network poller if it isn't
// going to wake up before the when argument; or it wakes an idle P to service
// timers and the network poller if there isn't one already.
@@ -3115,7 +3332,9 @@ top:
}
if gp == nil && gcBlackenEnabled != 0 {
gp = gcController.findRunnableGCWorker(_g_.m.p.ptr())
- tryWakeP = tryWakeP || gp != nil
+ if gp != nil {
+ tryWakeP = true
+ }
}
if gp == nil {
// Check the global runnable queue once in a while to ensure fairness.
@@ -3191,7 +3410,7 @@ func dropg() {
// checkTimers runs any timers for the P that are ready.
// If now is not 0 it is the current time.
-// It returns the current time or 0 if it is not known,
+// It returns the passed time or the current time if now was passed as 0.
// and the time when the next timer should run or 0 if there is no next timer,
// and reports whether it ran any timers.
// If the time when the next timer should run is not 0,
@@ -3828,15 +4047,15 @@ func exitsyscallfast_pidle() bool {
// exitsyscall slow path on g0.
// Failed to acquire P, enqueue gp as runnable.
//
+// Called via mcall, so gp is the calling g from this M.
+//
//go:nowritebarrierrec
func exitsyscall0(gp *g) {
- _g_ := getg()
-
casgstatus(gp, _Gsyscall, _Grunnable)
dropg()
lock(&sched.lock)
var _p_ *p
- if schedEnabled(_g_) {
+ if schedEnabled(gp) {
_p_ = pidleget()
}
if _p_ == nil {
@@ -3850,8 +4069,11 @@ func exitsyscall0(gp *g) {
acquirep(_p_)
execute(gp, false) // Never returns.
}
- if _g_.m.lockedg != 0 {
+ if gp.lockedm != 0 {
// Wait until another thread schedules gp and so m again.
+ //
+ // N.B. lockedm must be this M, as this g was running on this M
+ // before entersyscall.
stoplockedm()
execute(gp, false) // Never returns.
}
@@ -4016,6 +4238,14 @@ func newproc(siz int32, fn *funcval) {
//
//go:systemstack
func newproc1(fn *funcval, argp unsafe.Pointer, narg int32, callergp *g, callerpc uintptr) *g {
+ if goexperiment.RegabiDefer && narg != 0 {
+ // TODO: When we commit to GOEXPERIMENT=regabidefer,
+ // rewrite the comments for newproc and newproc1.
+ // newproc will no longer have a funny stack layout or
+ // need to be nosplit.
+ throw("go with non-empty frame")
+ }
+
_g_ := getg()
if fn == nil {
@@ -4081,7 +4311,7 @@ func newproc1(fn *funcval, argp unsafe.Pointer, narg int32, callergp *g, callerp
memclrNoHeapPointers(unsafe.Pointer(&newg.sched), unsafe.Sizeof(newg.sched))
newg.sched.sp = sp
newg.stktopsp = sp
- newg.sched.pc = funcPC(goexit) + sys.PCQuantum // +PCQuantum so that previous instruction is in same function
+ newg.sched.pc = abi.FuncPCABI0(goexit) + sys.PCQuantum // +PCQuantum so that previous instruction is in same function
newg.sched.g = guintptr(unsafe.Pointer(newg))
gostartcallfn(&newg.sched, fn)
newg.gopc = callerpc
@@ -4093,6 +4323,11 @@ func newproc1(fn *funcval, argp unsafe.Pointer, narg int32, callergp *g, callerp
if isSystemGoroutine(newg, false) {
atomic.Xadd(&sched.ngsys, +1)
}
+	// Randomize the tracking sequence so that latency sampling is spread
+	// uniformly across goroutines, including each one's initial transition.
+ newg.trackingSeq = uint8(fastrand())
+ if newg.trackingSeq%gTrackingPeriod == 0 {
+ newg.tracking = true
+ }
casgstatus(newg, _Gdead, _Grunnable)
if _p_.goidcache == _p_.goidcacheend {
@@ -4170,17 +4405,25 @@ func gfput(_p_ *p, gp *g) {
_p_.gFree.push(gp)
_p_.gFree.n++
if _p_.gFree.n >= 64 {
- lock(&sched.gFree.lock)
+ var (
+ inc int32
+ stackQ gQueue
+ noStackQ gQueue
+ )
for _p_.gFree.n >= 32 {
- _p_.gFree.n--
gp = _p_.gFree.pop()
+ _p_.gFree.n--
if gp.stack.lo == 0 {
- sched.gFree.noStack.push(gp)
+ noStackQ.push(gp)
} else {
- sched.gFree.stack.push(gp)
+ stackQ.push(gp)
}
- sched.gFree.n++
+ inc++
}
+ lock(&sched.gFree.lock)
+ sched.gFree.noStack.pushAll(noStackQ)
+ sched.gFree.stack.pushAll(stackQ)
+ sched.gFree.n += inc
unlock(&sched.gFree.lock)
}
}
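The gfput and gfpurge rewrites above follow the same pattern: classify elements into local queues without the lock, then splice them into the global lists in one short critical section. A small illustration of the pattern with stand-in types (the even/odd split is arbitrary, mirroring the stack/no-stack split):

package main

import (
	"fmt"
	"sync"
)

type queue struct{ items []int }

func (q *queue) push(v int)      { q.items = append(q.items, v) }
func (q *queue) pushAll(o queue) { q.items = append(q.items, o.items...) }

var (
	mu             sync.Mutex
	stack, noStack queue
)

func transfer(local []int) {
	var stackQ, noStackQ queue
	for _, v := range local { // classify without holding the lock
		if v%2 == 0 {
			noStackQ.push(v)
		} else {
			stackQ.push(v)
		}
	}
	mu.Lock() // single short critical section to splice both queues
	stack.pushAll(stackQ)
	noStack.pushAll(noStackQ)
	mu.Unlock()
}

func main() {
	transfer([]int{1, 2, 3, 4})
	fmt.Println(stack.items, noStack.items) // [1 3] [2 4]
}
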
@@ -4232,17 +4475,25 @@ retry:
// Purge all cached G's from gfree list to the global list.
func gfpurge(_p_ *p) {
- lock(&sched.gFree.lock)
+ var (
+ inc int32
+ stackQ gQueue
+ noStackQ gQueue
+ )
for !_p_.gFree.empty() {
gp := _p_.gFree.pop()
_p_.gFree.n--
if gp.stack.lo == 0 {
- sched.gFree.noStack.push(gp)
+ noStackQ.push(gp)
} else {
- sched.gFree.stack.push(gp)
+ stackQ.push(gp)
}
- sched.gFree.n++
+ inc++
}
+ lock(&sched.gFree.lock)
+ sched.gFree.noStack.pushAll(noStackQ)
+ sched.gFree.stack.pushAll(stackQ)
+ sched.gFree.n += inc
unlock(&sched.gFree.lock)
}
@@ -4953,11 +5204,9 @@ func checkdead() {
}
grunning := 0
- lock(&allglock)
- for i := 0; i < len(allgs); i++ {
- gp := allgs[i]
+ forEachG(func(gp *g) {
if isSystemGoroutine(gp, false) {
- continue
+ return
}
s := readgstatus(gp)
switch s &^ _Gscan {
@@ -4970,8 +5219,7 @@ func checkdead() {
print("runtime: checkdead: find g ", gp.goid, " in status ", s, "\n")
throw("checkdead: runnable g")
}
- }
- unlock(&allglock)
+ })
if grunning == 0 { // possible if main goroutine calls runtime·Goexit()
unlock(&sched.lock) // unlock so that GODEBUG=scheddetail=1 doesn't hang
throw("no goroutines (main called runtime.Goexit) - deadlock!")
@@ -5275,7 +5523,7 @@ func preemptall() bool {
// Tell the goroutine running on processor P to stop.
// This function is purely best-effort. It can incorrectly fail to inform the
-// goroutine. It can send inform the wrong goroutine. Even if it informs the
+// goroutine. It can inform the wrong goroutine. Even if it informs the
// correct goroutine, that goroutine might ignore the request if it is
// simultaneously executing newstack.
// No lock needs to be held.
@@ -5295,7 +5543,7 @@ func preemptone(_p_ *p) bool {
gp.preempt = true
- // Every call in a go routine checks for stack overflow by
+ // Every call in a goroutine checks for stack overflow by
// comparing the current stack pointer to gp->stackguard0.
// Setting gp->stackguard0 to StackPreempt folds
// preemption into the normal stack overflow check.
@@ -5374,9 +5622,7 @@ func schedtrace(detailed bool) {
print(" M", mp.id, ": p=", id1, " curg=", id2, " mallocing=", mp.mallocing, " throwing=", mp.throwing, " preemptoff=", mp.preemptoff, ""+" locks=", mp.locks, " dying=", mp.dying, " spinning=", mp.spinning, " blocked=", mp.blocked, " lockedg=", id3, "\n")
}
- lock(&allglock)
- for gi := 0; gi < len(allgs); gi++ {
- gp := allgs[gi]
+ forEachG(func(gp *g) {
mp := gp.m
lockedm := gp.lockedm.ptr()
id1 := int64(-1)
@@ -5388,8 +5634,7 @@ func schedtrace(detailed bool) {
id2 = lockedm.id
}
print(" G", gp.goid, ": status=", readgstatus(gp), "(", gp.waitreason.String(), ") m=", id1, " lockedm=", id2, "\n")
- }
- unlock(&allglock)
+ })
unlock(&sched.lock)
}
@@ -5484,6 +5729,8 @@ func globrunqputhead(gp *g) {
// Put a batch of runnable goroutines on the global runnable queue.
// This clears *batch.
// sched.lock must be held.
+// May run during STW, so write barriers are not allowed.
+//go:nowritebarrierrec
func globrunqputbatch(batch *gQueue, n int32) {
assertLockHeld(&sched.lock)
@@ -5799,6 +6046,45 @@ func runqget(_p_ *p) (gp *g, inheritTime bool) {
}
}
+// runqdrain drains the local runnable queue of _p_ and returns all goroutines in it.
+// Executed only by the owner P.
+func runqdrain(_p_ *p) (drainQ gQueue, n uint32) {
+ oldNext := _p_.runnext
+ if oldNext != 0 && _p_.runnext.cas(oldNext, 0) {
+ drainQ.pushBack(oldNext.ptr())
+ n++
+ }
+
+retry:
+ h := atomic.LoadAcq(&_p_.runqhead) // load-acquire, synchronize with other consumers
+ t := _p_.runqtail
+ qn := t - h
+ if qn == 0 {
+ return
+ }
+ if qn > uint32(len(_p_.runq)) { // read inconsistent h and t
+ goto retry
+ }
+
+ if !atomic.CasRel(&_p_.runqhead, h, h+qn) { // cas-release, commits consume
+ goto retry
+ }
+
+	// We advance the head pointer before draining the queue into drainQ,
+	// rather than after, so that we only update each gp.schedlink once we
+	// have taken full ownership of the G. If we drained first and advanced
+	// the head afterwards, a concurrent runqsteal could grab G's whose
+	// statuses and schedlinks are still being updated. Once the head is
+	// advanced, other P's can no longer access these G's to steal them.
+	// See https://groups.google.com/g/golang-dev/c/0pTKxEKhHSc/m/6Q85QjdVBQAJ for more details.
+ for i := uint32(0); i < qn; i++ {
+ gp := _p_.runq[(h+i)%uint32(len(_p_.runq))].ptr()
+ drainQ.pushBack(gp)
+ n++
+ }
+ return
+}
+
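runqdrain claims the whole batch by advancing the head with a cas-release before reading any element, per the comment above. A toy single-producer version of that consume protocol (illustrative sizes and types; reading slots after claiming them is safe here only because, as with the owner P, no producer runs concurrently with the drain):

package main

import (
	"fmt"
	"sync/atomic"
)

const qlen = 8

var (
	head, tail uint32
	ring       [qlen]int
)

func drain() []int {
	for {
		h := atomic.LoadUint32(&head) // load-acquire: sync with other consumers
		t := atomic.LoadUint32(&tail)
		n := t - h
		if n == 0 {
			return nil
		}
		if n > qlen { // inconsistent h/t snapshot; retry
			continue
		}
		if atomic.CompareAndSwapUint32(&head, h, h+n) { // cas-release: claim all n slots
			out := make([]int, 0, n)
			for i := uint32(0); i < n; i++ {
				out = append(out, ring[(h+i)%qlen])
			}
			return out
		}
	}
}

func main() {
	for i := 1; i <= 3; i++ { // single producer fills the ring
		ring[tail%qlen] = i
		atomic.AddUint32(&tail, 1)
	}
	fmt.Println(drain()) // [1 2 3]
}
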
// Grabs a batch of goroutines from _p_'s runnable queue into batch.
// Batch is a ring buffer starting at batchHead.
// Returns number of grabbed goroutines.
@@ -5992,26 +6278,6 @@ func setMaxThreads(in int) (out int) {
return
}
-func haveexperiment(name string) bool {
- x := sys.Goexperiment
- for x != "" {
- xname := ""
- i := bytealg.IndexByteString(x, ',')
- if i < 0 {
- xname, x = x, ""
- } else {
- xname, x = x[:i], x[i+1:]
- }
- if xname == name {
- return true
- }
- if len(xname) > 2 && xname[:2] == "no" && xname[2:] == name {
- return false
- }
- }
- return false
-}
-
//go:nosplit
func procPin() int {
_g_ := getg()
@@ -6148,7 +6414,7 @@ var inittrace tracestat
type tracestat struct {
active bool // init tracing activation status
- id int64 // init go routine id
+ id int64 // init goroutine id
allocs uint64 // heap allocations
bytes uint64 // heap allocated bytes
}
@@ -6180,7 +6446,7 @@ func doInit(t *initTask) {
if inittrace.active {
start = nanotime()
- // Load stats non-atomically since tracinit is updated only by this init go routine.
+		// Load stats non-atomically since inittrace is updated only by this init goroutine.
before = inittrace
}
@@ -6193,7 +6459,7 @@ func doInit(t *initTask) {
if inittrace.active {
end := nanotime()
- // Load stats non-atomically since tracinit is updated only by this init go routine.
+		// Load stats non-atomically since inittrace is updated only by this init goroutine.
after := inittrace
pkg := funcpkgpath(findfunc(funcPC(firstFunc)))
diff --git a/src/runtime/proc_test.go b/src/runtime/proc_test.go
index 767bde15b4..53cafe8907 100644
--- a/src/runtime/proc_test.go
+++ b/src/runtime/proc_test.go
@@ -692,6 +692,55 @@ func BenchmarkCreateGoroutinesCapture(b *testing.B) {
}
}
+// warmupScheduler ensures the scheduler has at least targetThreadCount threads
+// in its thread pool.
+func warmupScheduler(targetThreadCount int) {
+ var wg sync.WaitGroup
+ var count int32
+ for i := 0; i < targetThreadCount; i++ {
+ wg.Add(1)
+ go func() {
+ atomic.AddInt32(&count, 1)
+ for atomic.LoadInt32(&count) < int32(targetThreadCount) {
+ // spin until all threads started
+ }
+
+ // spin a bit more to ensure they are all running on separate CPUs.
+ doWork(time.Millisecond)
+ wg.Done()
+ }()
+ }
+ wg.Wait()
+}
+
+func doWork(dur time.Duration) {
+ start := time.Now()
+ for time.Since(start) < dur {
+ }
+}
+
+// BenchmarkCreateGoroutinesSingle creates many goroutines, all from a single
+// producer (the main benchmark goroutine).
+//
+// Compared to BenchmarkCreateGoroutines, this causes different behavior in the
+// scheduler because Ms are much more likely to need to steal work from the
+// main P rather than having work in the local run queue.
+func BenchmarkCreateGoroutinesSingle(b *testing.B) {
+ // Since we are interested in stealing behavior, warm the scheduler to
+ // get all the Ps running first.
+ warmupScheduler(runtime.GOMAXPROCS(0))
+ b.ResetTimer()
+
+ var wg sync.WaitGroup
+ wg.Add(b.N)
+ for i := 0; i < b.N; i++ {
+ go func() {
+ wg.Done()
+ }()
+ }
+ wg.Wait()
+}
+
func BenchmarkClosureCall(b *testing.B) {
sum := 0
off1 := 1
diff --git a/src/runtime/race.go b/src/runtime/race.go
index 79fd21765d..cc8c5db1bd 100644
--- a/src/runtime/race.go
+++ b/src/runtime/race.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build race
// +build race
package runtime
diff --git a/src/runtime/race/README b/src/runtime/race/README
index 178ab94ab5..3b188a0361 100644
--- a/src/runtime/race/README
+++ b/src/runtime/race/README
@@ -1,6 +1,6 @@
runtime/race package contains the data race detector runtime library.
It is based on ThreadSanitizer race detector, that is currently a part of
-the LLVM project (https://github.com/llvm/llvm-project/tree/master/compiler-rt).
+the LLVM project (https://github.com/llvm/llvm-project/tree/main/compiler-rt).
To update the .syso files use golang.org/x/build/cmd/racebuild.
@@ -12,3 +12,4 @@ race_netbsd_amd64.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153
race_windows_amd64.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153 and Go f62d3202bf9dbb3a00ad2a2c63ff4fa4188c5d3b.
race_linux_arm64.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153 and Go f62d3202bf9dbb3a00ad2a2c63ff4fa4188c5d3b.
race_darwin_arm64.syso built with LLVM 00da38ce2d36c07f12c287dc515d37bb7bc410e9 and Go fe70a3a0fd31441bcbb9932ecab11a6083cf2119.
+race_openbsd_amd64.syso built with LLVM fcf6ae2f070eba73074b6ec8d8281e54d29dbeeb and Go 8f2db14cd35bbd674cb2988a508306de6655e425.
diff --git a/src/runtime/race/output_test.go b/src/runtime/race/output_test.go
index 17dc32013f..99052071d0 100644
--- a/src/runtime/race/output_test.go
+++ b/src/runtime/race/output_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build race
// +build race
package race_test
@@ -19,11 +20,7 @@ import (
)
func TestOutput(t *testing.T) {
- pkgdir, err := os.MkdirTemp("", "go-build-race-output")
- if err != nil {
- t.Fatal(err)
- }
- defer os.RemoveAll(pkgdir)
+ pkgdir := t.TempDir()
out, err := exec.Command(testenv.GoToolPath(t), "install", "-race", "-pkgdir="+pkgdir, "testing").CombinedOutput()
if err != nil {
t.Fatalf("go install -race: %v\n%s", err, out)
@@ -34,11 +31,7 @@ func TestOutput(t *testing.T) {
t.Logf("test %v runs only on %v, skipping: ", test.name, test.goos)
continue
}
- dir, err := os.MkdirTemp("", "go-build")
- if err != nil {
- t.Fatalf("failed to create temp directory: %v", err)
- }
- defer os.RemoveAll(dir)
+ dir := t.TempDir()
source := "main.go"
if test.run == "test" {
source = "main_test.go"
@@ -105,6 +98,8 @@ var tests = []struct {
{"simple", "run", "", "atexit_sleep_ms=0", `
package main
import "time"
+var xptr *int
+var donechan chan bool
func main() {
done := make(chan bool)
x := 0
@@ -116,32 +111,34 @@ func store(x *int, v int) {
*x = v
}
func startRacer(x *int, done chan bool) {
- go racer(x, done)
+ xptr = x
+ donechan = done
+ go racer()
}
-func racer(x *int, done chan bool) {
+func racer() {
time.Sleep(10*time.Millisecond)
- store(x, 42)
- done <- true
+ store(xptr, 42)
+ donechan <- true
}
`, []string{`==================
WARNING: DATA RACE
Write at 0x[0-9,a-f]+ by goroutine [0-9]:
main\.store\(\)
- .+/main\.go:12 \+0x[0-9,a-f]+
+ .+/main\.go:14 \+0x[0-9,a-f]+
main\.racer\(\)
- .+/main\.go:19 \+0x[0-9,a-f]+
+ .+/main\.go:23 \+0x[0-9,a-f]+
Previous write at 0x[0-9,a-f]+ by main goroutine:
main\.store\(\)
- .+/main\.go:12 \+0x[0-9,a-f]+
+ .+/main\.go:14 \+0x[0-9,a-f]+
main\.main\(\)
- .+/main\.go:8 \+0x[0-9,a-f]+
+ .+/main\.go:10 \+0x[0-9,a-f]+
Goroutine [0-9] \(running\) created at:
main\.startRacer\(\)
- .+/main\.go:15 \+0x[0-9,a-f]+
+ .+/main\.go:19 \+0x[0-9,a-f]+
main\.main\(\)
- .+/main\.go:7 \+0x[0-9,a-f]+
+ .+/main\.go:9 \+0x[0-9,a-f]+
==================
Found 1 data race\(s\)
exit status 66
@@ -238,15 +235,15 @@ func main() {
package main
var x int
-
+var c chan int
func main() {
- c := make(chan int)
- go f(c)
+ c = make(chan int)
+ go f()
x = 1
<-c
}
-func f(c chan int) {
+func f() {
g(c)
}
diff --git a/src/runtime/race/race.go b/src/runtime/race/race.go
index d6a14b79e7..84050e8771 100644
--- a/src/runtime/race/race.go
+++ b/src/runtime/race/race.go
@@ -2,7 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build race,linux,amd64 race,freebsd,amd64 race,netbsd,amd64 race,darwin,amd64 race,windows,amd64 race,linux,ppc64le race,linux,arm64 race,darwin,arm64
+//go:build (race && linux && amd64) || (race && freebsd && amd64) || (race && netbsd && amd64) || (race && darwin && amd64) || (race && windows && amd64) || (race && linux && ppc64le) || (race && linux && arm64) || (race && darwin && arm64) || (race && openbsd && amd64)
+// +build race,linux,amd64 race,freebsd,amd64 race,netbsd,amd64 race,darwin,amd64 race,windows,amd64 race,linux,ppc64le race,linux,arm64 race,darwin,arm64 race,openbsd,amd64
package race
diff --git a/src/runtime/race/race_linux_test.go b/src/runtime/race/race_linux_test.go
index c00ce4d3df..9c0d48d6bd 100644
--- a/src/runtime/race/race_linux_test.go
+++ b/src/runtime/race/race_linux_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build linux && race
// +build linux,race
package race_test
diff --git a/src/runtime/race/race_openbsd_amd64.syso b/src/runtime/race/race_openbsd_amd64.syso
new file mode 100644
index 0000000000..9fefd87ec6
--- /dev/null
+++ b/src/runtime/race/race_openbsd_amd64.syso
Binary files differ
diff --git a/src/runtime/race/race_test.go b/src/runtime/race/race_test.go
index d433af6bd0..8c880b8570 100644
--- a/src/runtime/race/race_test.go
+++ b/src/runtime/race/race_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build race
// +build race
// This program is used to verify the race detector
diff --git a/src/runtime/race/race_unix_test.go b/src/runtime/race/race_unix_test.go
index 84f0acece6..acd6e47f98 100644
--- a/src/runtime/race/race_unix_test.go
+++ b/src/runtime/race/race_unix_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build race && (darwin || freebsd || linux)
// +build race
// +build darwin freebsd linux
diff --git a/src/runtime/race/race_windows_test.go b/src/runtime/race/race_windows_test.go
index 307a1ea6c0..e490d766dd 100644
--- a/src/runtime/race/race_windows_test.go
+++ b/src/runtime/race/race_windows_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build windows && race
// +build windows,race
package race_test
diff --git a/src/runtime/race/sched_test.go b/src/runtime/race/sched_test.go
index d6bb323cde..e904ebd20d 100644
--- a/src/runtime/race/sched_test.go
+++ b/src/runtime/race/sched_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build race
// +build race
package race_test
diff --git a/src/runtime/race/syso_test.go b/src/runtime/race/syso_test.go
index db846c5d2a..cbce5a8f18 100644
--- a/src/runtime/race/syso_test.go
+++ b/src/runtime/race/syso_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !android && !js && !ppc64le
// +build !android,!js,!ppc64le
// Note: we don't run on Android or ppc64 because if there is any non-race test
diff --git a/src/runtime/race/testdata/io_test.go b/src/runtime/race/testdata/io_test.go
index c5055f7837..3303cb0717 100644
--- a/src/runtime/race/testdata/io_test.go
+++ b/src/runtime/race/testdata/io_test.go
@@ -17,7 +17,7 @@ import (
func TestNoRaceIOFile(t *testing.T) {
x := 0
- path, _ := os.MkdirTemp("", "race_test")
+ path := t.TempDir()
fname := filepath.Join(path, "data")
go func() {
x = 42
diff --git a/src/runtime/race/testdata/mutex_test.go b/src/runtime/race/testdata/mutex_test.go
index cbed2d370c..9dbed9a2c9 100644
--- a/src/runtime/race/testdata/mutex_test.go
+++ b/src/runtime/race/testdata/mutex_test.go
@@ -78,16 +78,23 @@ func TestNoRaceMutexPureHappensBefore(t *testing.T) {
var mu sync.Mutex
var x int16 = 0
_ = x
+ written := false
ch := make(chan bool, 2)
go func() {
x = 1
mu.Lock()
+ written = true
mu.Unlock()
ch <- true
}()
go func() {
- <-time.After(1e5)
+ time.Sleep(100 * time.Microsecond)
mu.Lock()
+ for !written {
+ mu.Unlock()
+ time.Sleep(100 * time.Microsecond)
+ mu.Lock()
+ }
mu.Unlock()
x = 1
ch <- true
diff --git a/src/runtime/race/timer_test.go b/src/runtime/race/timer_test.go
index a6c34a8352..f11f8456a0 100644
--- a/src/runtime/race/timer_test.go
+++ b/src/runtime/race/timer_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build race
// +build race
package race_test
diff --git a/src/runtime/race0.go b/src/runtime/race0.go
index 180f707b1a..0e431b8103 100644
--- a/src/runtime/race0.go
+++ b/src/runtime/race0.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !race
// +build !race
// Dummy race detection API, used when not built with -race.
diff --git a/src/runtime/race_amd64.s b/src/runtime/race_amd64.s
index c3b7bbfbfe..58a919efe8 100644
--- a/src/runtime/race_amd64.s
+++ b/src/runtime/race_amd64.s
@@ -8,6 +8,7 @@
#include "go_tls.h"
#include "funcdata.h"
#include "textflag.h"
+#include "cgo/abi_amd64.h"
// The following thunks allow calling the gcc-compiled race runtime directly
// from Go code without going all the way through cgo.
@@ -44,7 +45,11 @@
// Defined as ABIInternal so as to avoid introducing a wrapper,
// which would render runtime.getcallerpc ineffective.
TEXT runtime·raceread<ABIInternal>(SB), NOSPLIT, $0-8
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ AX, RARG1
+#else
MOVQ addr+0(FP), RARG1
+#endif
MOVQ (SP), RARG2
// void __tsan_read(ThreadState *thr, void *addr, void *pc);
MOVQ $__tsan_read(SB), AX
@@ -70,7 +75,11 @@ TEXT runtime·racereadpc(SB), NOSPLIT, $0-24
// Defined as ABIInternal so as to avoid introducing a wrapper,
// which would render runtime.getcallerpc ineffective.
TEXT runtime·racewrite<ABIInternal>(SB), NOSPLIT, $0-8
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ AX, RARG1
+#else
MOVQ addr+0(FP), RARG1
+#endif
MOVQ (SP), RARG2
// void __tsan_write(ThreadState *thr, void *addr, void *pc);
MOVQ $__tsan_write(SB), AX
@@ -121,8 +130,13 @@ TEXT runtime·racereadrangepc1(SB), NOSPLIT, $0-24
// Defined as ABIInternal so as to avoid introducing a wrapper,
// which would render runtime.getcallerpc ineffective.
TEXT runtime·racewriterange<ABIInternal>(SB), NOSPLIT, $0-16
+#ifdef GOEXPERIMENT_regabiargs
+ MOVQ AX, RARG1
+ MOVQ BX, RARG2
+#else
MOVQ addr+0(FP), RARG1
MOVQ size+8(FP), RARG2
+#endif
MOVQ (SP), RARG3
// void __tsan_write_range(ThreadState *thr, void *addr, uintptr size, void *pc);
MOVQ $__tsan_write_range(SB), AX
@@ -146,7 +160,7 @@ TEXT runtime·racewriterangepc1(SB), NOSPLIT, $0-24
// If addr (RARG1) is out of range, do nothing.
// Otherwise, setup goroutine context and invoke racecall. Other arguments already set.
TEXT racecalladdr<>(SB), NOSPLIT, $0-0
-#ifndef GOEXPERIMENT_REGABI
+#ifndef GOEXPERIMENT_regabig
get_tls(R12)
MOVQ g(R12), R14
#endif
@@ -167,25 +181,17 @@ call:
ret:
RET
-// func runtime·racefuncenterfp(fp uintptr)
-// Called from instrumented code.
-// Like racefuncenter but passes FP, not PC
-TEXT runtime·racefuncenterfp(SB), NOSPLIT, $0-8
- MOVQ fp+0(FP), R11
- MOVQ -8(R11), R11
- JMP racefuncenter<>(SB)
-
// func runtime·racefuncenter(pc uintptr)
// Called from instrumented code.
TEXT runtime·racefuncenter(SB), NOSPLIT, $0-8
MOVQ callpc+0(FP), R11
JMP racefuncenter<>(SB)
-// Common code for racefuncenter/racefuncenterfp
+// Common code for racefuncenter
// R11 = caller's return address
TEXT racefuncenter<>(SB), NOSPLIT, $0-0
- MOVQ DX, R15 // save function entry context (for closures)
-#ifndef GOEXPERIMENT_REGABI
+ MOVQ DX, BX // save function entry context (for closures)
+#ifndef GOEXPERIMENT_regabig
get_tls(R12)
MOVQ g(R12), R14
#endif
@@ -193,15 +199,15 @@ TEXT racefuncenter<>(SB), NOSPLIT, $0-0
MOVQ R11, RARG1
// void __tsan_func_enter(ThreadState *thr, void *pc);
MOVQ $__tsan_func_enter(SB), AX
- // racecall<> preserves R15
+ // racecall<> preserves BX
CALL racecall<>(SB)
- MOVQ R15, DX // restore function entry context
+ MOVQ BX, DX // restore function entry context
RET
// func runtime·racefuncexit()
// Called from instrumented code.
TEXT runtime·racefuncexit(SB), NOSPLIT, $0-0
-#ifndef GOEXPERIMENT_REGABI
+#ifndef GOEXPERIMENT_regabig
get_tls(R12)
MOVQ g(R12), R14
#endif
@@ -363,7 +369,7 @@ racecallatomic_data:
JAE racecallatomic_ignore
racecallatomic_ok:
// Addr is within the good range, call the atomic function.
-#ifndef GOEXPERIMENT_REGABI
+#ifndef GOEXPERIMENT_regabig
get_tls(R12)
MOVQ g(R12), R14
#endif
@@ -376,15 +382,15 @@ racecallatomic_ignore:
// Addr is outside the good range.
// Call __tsan_go_ignore_sync_begin to ignore synchronization during the atomic op.
// An attempt to synchronize on the address would cause crash.
- MOVQ AX, R15 // remember the original function
+ MOVQ AX, BX // remember the original function
MOVQ $__tsan_go_ignore_sync_begin(SB), AX
-#ifndef GOEXPERIMENT_REGABI
+#ifndef GOEXPERIMENT_regabig
get_tls(R12)
MOVQ g(R12), R14
#endif
MOVQ g_racectx(R14), RARG0 // goroutine context
CALL racecall<>(SB)
- MOVQ R15, AX // restore the original function
+ MOVQ BX, AX // restore the original function
// Call the atomic function.
MOVQ g_racectx(R14), RARG0 // goroutine context
MOVQ 8(SP), RARG1 // caller pc
@@ -409,7 +415,7 @@ TEXT runtime·racecall(SB), NOSPLIT, $0-0
// Switches SP to g0 stack and calls (AX). Arguments already set.
TEXT racecall<>(SB), NOSPLIT, $0-0
-#ifndef GOEXPERIMENT_REGABI
+#ifndef GOEXPERIMENT_regabig
get_tls(R12)
MOVQ g(R12), R14
#endif
@@ -436,7 +442,7 @@ call:
// See racecallback for command codes.
// Defined as ABIInternal so as to avoid introducing a wrapper,
// because its address is passed to C via funcPC.
-TEXT runtime·racecallbackthunk<ABIInternal>(SB), NOSPLIT, $56-8
+TEXT runtime·racecallbackthunk<ABIInternal>(SB), NOSPLIT, $0-0
// Handle command raceGetProcCmd (0) here.
// First, code below assumes that we are on curg, while raceGetProcCmd
// can be executed on g0. Second, it is called frequently, so will
@@ -452,16 +458,8 @@ TEXT runtime·racecallbackthunk<ABIInternal>(SB), NOSPLIT, $56-8
RET
rest:
- // Save callee-saved registers (Go code won't respect that).
- // This is superset of darwin/linux/windows registers.
- PUSHQ BX
- PUSHQ BP
- PUSHQ DI
- PUSHQ SI
- PUSHQ R12
- PUSHQ R13
- PUSHQ R14
- PUSHQ R15
+ // Transition from C ABI to Go ABI.
+ PUSH_REGS_HOST_TO_ABI0()
// Set g = g0.
get_tls(R12)
MOVQ g(R12), R14
@@ -483,15 +481,7 @@ rest:
MOVQ m_curg(R13), R14
MOVQ R14, g(R12) // g = m->curg
ret:
- // Restore callee-saved registers.
- POPQ R15
- POPQ R14
- POPQ R13
- POPQ R12
- POPQ SI
- POPQ DI
- POPQ BP
- POPQ BX
+ POP_REGS_HOST_TO_ABI0()
RET
noswitch:
diff --git a/src/runtime/race_arm64.s b/src/runtime/race_arm64.s
index 8aa17742b8..82e3caadc8 100644
--- a/src/runtime/race_arm64.s
+++ b/src/runtime/race_arm64.s
@@ -160,21 +160,13 @@ call:
ret:
RET
-// func runtime·racefuncenterfp(fp uintptr)
-// Called from instrumented code.
-// Like racefuncenter but doesn't passes an arg, uses the caller pc
-// from the first slot on the stack
-TEXT runtime·racefuncenterfp(SB), NOSPLIT, $0-0
- MOVD 0(RSP), R9
- JMP racefuncenter<>(SB)
-
// func runtime·racefuncenter(pc uintptr)
// Called from instrumented code.
TEXT runtime·racefuncenter(SB), NOSPLIT, $0-8
MOVD callpc+0(FP), R9
JMP racefuncenter<>(SB)
-// Common code for racefuncenter/racefuncenterfp
+// Common code for racefuncenter
// R9 = caller's return address
TEXT racefuncenter<>(SB), NOSPLIT, $0-0
load_g
diff --git a/src/runtime/race_ppc64le.s b/src/runtime/race_ppc64le.s
index 8961254ea6..069e4d86dd 100644
--- a/src/runtime/race_ppc64le.s
+++ b/src/runtime/race_ppc64le.s
@@ -36,9 +36,9 @@
// racecalladdr.
//
// The sequence used to get the race ctx:
-// MOVD runtime·tls_g(SB), R10 // offset to TLS
-// MOVD 0(R13)(R10*1), g // R13=TLS for this thread, g = R30
-// MOVD g_racectx(g), R3 // racectx == ThreadState
+// MOVD runtime·tls_g(SB), R10 // Address of TLS variable
+// MOVD 0(R10), g // g = R30
+// MOVD g_racectx(g), R3 // racectx == ThreadState
// func runtime·RaceRead(addr uintptr)
// Called from instrumented Go code
@@ -137,7 +137,7 @@ TEXT runtime·racewriterangepc1(SB), NOSPLIT, $0-24
// Otherwise, setup goroutine context and invoke racecall. Other arguments already set.
TEXT racecalladdr<>(SB), NOSPLIT, $0-0
MOVD runtime·tls_g(SB), R10
- MOVD 0(R13)(R10*1), g
+ MOVD 0(R10), g
MOVD g_racectx(g), R3 // goroutine context
// Check that addr is within [arenastart, arenaend) or within [racedatastart, racedataend).
MOVD runtime·racearenastart(SB), R9
@@ -163,27 +163,17 @@ call:
ret:
RET
-// func runtime·racefuncenterfp()
-// Called from instrumented Go code.
-// Like racefuncenter but doesn't pass an arg, uses the caller pc
-// from the first slot on the stack.
-TEXT runtime·racefuncenterfp(SB), NOSPLIT, $0-0
- MOVD 0(R1), R8
- BR racefuncenter<>(SB)
-
// func runtime·racefuncenter(pc uintptr)
// Called from instrumented Go code.
-// Not used now since gc/racewalk.go doesn't pass the
-// correct caller pc and racefuncenterfp can do it.
TEXT runtime·racefuncenter(SB), NOSPLIT, $0-8
MOVD callpc+0(FP), R8
BR racefuncenter<>(SB)
-// Common code for racefuncenter/racefuncenterfp
+// Common code for racefuncenter
// R11 = caller's return address
TEXT racefuncenter<>(SB), NOSPLIT, $0-0
MOVD runtime·tls_g(SB), R10
- MOVD 0(R13)(R10*1), g
+ MOVD 0(R10), g
MOVD g_racectx(g), R3 // goroutine racectx aka *ThreadState
MOVD R8, R4 // caller pc set by caller in R8
// void __tsan_func_enter(ThreadState *thr, void *pc);
@@ -195,7 +185,7 @@ TEXT racefuncenter<>(SB), NOSPLIT, $0-0
// Called from Go instrumented code.
TEXT runtime·racefuncexit(SB), NOSPLIT, $0-0
MOVD runtime·tls_g(SB), R10
- MOVD 0(R13)(R10*1), g
+ MOVD 0(R10), g
MOVD g_racectx(g), R3 // goroutine racectx aka *ThreadState
// void __tsan_func_exit(ThreadState *thr);
MOVD $__tsan_func_exit(SB), R8
@@ -390,7 +380,7 @@ racecallatomic_data:
racecallatomic_ok:
// Addr is within the good range, call the atomic function.
MOVD runtime·tls_g(SB), R10
- MOVD 0(R13)(R10*1), g
+ MOVD 0(R10), g
MOVD g_racectx(g), R3 // goroutine racectx aka *ThreadState
MOVD R8, R5 // pc is the function called
MOVD (R1), R4 // caller pc from stack
@@ -404,7 +394,7 @@ racecallatomic_ignore:
MOVD R6, R17 // save the original arg list addr
MOVD $__tsan_go_ignore_sync_begin(SB), R8 // func addr to call
MOVD runtime·tls_g(SB), R10
- MOVD 0(R13)(R10*1), g
+ MOVD 0(R10), g
MOVD g_racectx(g), R3 // goroutine context
BL racecall<>(SB)
MOVD R15, R8 // restore the original function
@@ -412,7 +402,7 @@ racecallatomic_ignore:
// Call the atomic function.
// racecall will call LLVM race code which might clobber r30 (g)
MOVD runtime·tls_g(SB), R10
- MOVD 0(R13)(R10*1), g
+ MOVD 0(R10), g
MOVD g_racectx(g), R3
MOVD R8, R4 // pc being called same TODO as above
@@ -444,7 +434,7 @@ TEXT racecall<>(SB), NOSPLIT, $0-0
MOVD R10, 16(R1) // C ABI
// Get info from the current goroutine
MOVD runtime·tls_g(SB), R10 // g offset in TLS
- MOVD 0(R13)(R10*1), g // R13 = current TLS
+ MOVD 0(R10), g
MOVD g_m(g), R7 // m for g
MOVD R1, R16 // callee-saved, preserved across C call
MOVD m_g0(R7), R10 // g0 for m
@@ -458,7 +448,7 @@ call:
XOR R0, R0 // clear R0 on return from Clang
MOVD R16, R1 // restore R1; R16 nonvol in Clang
MOVD runtime·tls_g(SB), R10 // find correct g
- MOVD 0(R13)(R10*1), g
+ MOVD 0(R10), g
MOVD 16(R1), R10 // LR was saved away, restore for return
MOVD R10, LR
RET
@@ -479,7 +469,7 @@ TEXT runtime·racecallbackthunk(SB), NOSPLIT, $-8
// g0 TODO: Don't modify g here since R30 is nonvolatile
MOVD g, R9
MOVD runtime·tls_g(SB), R10
- MOVD 0(R13)(R10*1), g
+ MOVD 0(R10), g
MOVD g_m(g), R3
MOVD m_p(R3), R3
MOVD p_raceprocctx(R3), R3
@@ -537,7 +527,7 @@ rest:
MOVD R4, FIXED_FRAME+8(R1)
MOVD runtime·tls_g(SB), R10
- MOVD 0(R13)(R10*1), g
+ MOVD 0(R10), g
MOVD g_m(g), R7
MOVD m_g0(R7), R8
@@ -550,7 +540,7 @@ rest:
// All registers are clobbered after Go code, reload.
MOVD runtime·tls_g(SB), R10
- MOVD 0(R13)(R10*1), g
+ MOVD 0(R10), g
MOVD g_m(g), R7
MOVD m_curg(R7), g // restore g = m->curg
diff --git a/src/runtime/relax_stub.go b/src/runtime/relax_stub.go
index 81ed1291b8..5b92879c20 100644
--- a/src/runtime/relax_stub.go
+++ b/src/runtime/relax_stub.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !windows
// +build !windows
package runtime
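
The constraint rewrite above is the pattern repeated across most files in this CL: Go 1.17's //go:build lines use ordinary boolean operators, while the legacy // +build lines they sit next to use commas for AND, spaces for OR, and AND across stacked lines. A minimal sketch of the correspondence (file and package names are placeholders, not part of the CL):

	// The two forms below are equivalent and must agree; gofmt keeps
	// them in sync once the //go:build line exists.

	//go:build linux && (mips64 || mips64le)
	// +build linux
	// +build mips64 mips64le

	package placeholder
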
diff --git a/src/runtime/rt0_windows_amd64.s b/src/runtime/rt0_windows_amd64.s
index 345e141802..e60bf4c86d 100644
--- a/src/runtime/rt0_windows_amd64.s
+++ b/src/runtime/rt0_windows_amd64.s
@@ -16,24 +16,12 @@ TEXT _rt0_amd64_windows(SB),NOSPLIT,$-8
// phase.
// Leave space for four pointers on the stack as required
// by the Windows amd64 calling convention.
-TEXT _rt0_amd64_windows_lib(SB),NOSPLIT,$0x48
- MOVQ BP, 0x20(SP)
- MOVQ BX, 0x28(SP)
- MOVQ AX, 0x30(SP)
- MOVQ CX, 0x38(SP)
- MOVQ DX, 0x40(SP)
-
+TEXT _rt0_amd64_windows_lib(SB),NOSPLIT,$0x20
// Create a new thread to do the runtime initialization and return.
MOVQ _cgo_sys_thread_create(SB), AX
MOVQ $_rt0_amd64_windows_lib_go(SB), CX
MOVQ $0, DX
CALL AX
-
- MOVQ 0x20(SP), BP
- MOVQ 0x28(SP), BX
- MOVQ 0x30(SP), AX
- MOVQ 0x38(SP), CX
- MOVQ 0x40(SP), DX
RET
TEXT _rt0_amd64_windows_lib_go(SB),NOSPLIT,$0
diff --git a/src/runtime/rt0_windows_arm64.s b/src/runtime/rt0_windows_arm64.s
index 1e71a068d3..bad85c28ac 100644
--- a/src/runtime/rt0_windows_arm64.s
+++ b/src/runtime/rt0_windows_arm64.s
@@ -10,3 +10,20 @@
// kernel for an ordinary -buildmode=exe program.
TEXT _rt0_arm64_windows(SB),NOSPLIT|NOFRAME,$0
B ·rt0_go(SB)
+
+TEXT _rt0_arm64_windows_lib(SB),NOSPLIT|NOFRAME,$0
+ MOVD $_rt0_arm64_windows_lib_go(SB), R0
+ MOVD $0, R1
+ MOVD _cgo_sys_thread_create(SB), R2
+ B (R2)
+
+TEXT _rt0_arm64_windows_lib_go(SB),NOSPLIT|NOFRAME,$0
+ MOVD $0, R0
+ MOVD $0, R1
+ MOVD $runtime·rt0_go(SB), R2
+ B (R2)
+
+TEXT main(SB),NOSPLIT,$0
+ MOVD $runtime·rt0_go(SB), R2
+ B (R2)
+
diff --git a/src/runtime/runtime-gdb_test.go b/src/runtime/runtime-gdb_test.go
index 5df8c3c745..8c76a9123c 100644
--- a/src/runtime/runtime-gdb_test.go
+++ b/src/runtime/runtime-gdb_test.go
@@ -169,11 +169,7 @@ func testGdbPython(t *testing.T, cgo bool) {
checkGdbVersion(t)
checkGdbPython(t)
- dir, err := os.MkdirTemp("", "go-build")
- if err != nil {
- t.Fatalf("failed to create temp directory: %v", err)
- }
- defer os.RemoveAll(dir)
+ dir := t.TempDir()
var buf bytes.Buffer
buf.WriteString("package main\n")
@@ -194,7 +190,7 @@ func testGdbPython(t *testing.T, cgo bool) {
}
}
- err = os.WriteFile(filepath.Join(dir, "main.go"), src, 0644)
+ err := os.WriteFile(filepath.Join(dir, "main.go"), src, 0644)
if err != nil {
t.Fatalf("failed to create file: %v", err)
}
@@ -403,15 +399,11 @@ func TestGdbBacktrace(t *testing.T) {
t.Parallel()
checkGdbVersion(t)
- dir, err := os.MkdirTemp("", "go-build")
- if err != nil {
- t.Fatalf("failed to create temp directory: %v", err)
- }
- defer os.RemoveAll(dir)
+ dir := t.TempDir()
// Build the source code.
src := filepath.Join(dir, "main.go")
- err = os.WriteFile(src, []byte(backtraceSource), 0644)
+ err := os.WriteFile(src, []byte(backtraceSource), 0644)
if err != nil {
t.Fatalf("failed to create file: %v", err)
}
@@ -481,15 +473,11 @@ func TestGdbAutotmpTypes(t *testing.T) {
t.Skip("TestGdbAutotmpTypes is too slow on aix/ppc64")
}
- dir, err := os.MkdirTemp("", "go-build")
- if err != nil {
- t.Fatalf("failed to create temp directory: %v", err)
- }
- defer os.RemoveAll(dir)
+ dir := t.TempDir()
// Build the source code.
src := filepath.Join(dir, "main.go")
- err = os.WriteFile(src, []byte(autotmpTypeSource), 0644)
+ err := os.WriteFile(src, []byte(autotmpTypeSource), 0644)
if err != nil {
t.Fatalf("failed to create file: %v", err)
}
@@ -550,15 +538,11 @@ func TestGdbConst(t *testing.T) {
t.Parallel()
checkGdbVersion(t)
- dir, err := os.MkdirTemp("", "go-build")
- if err != nil {
- t.Fatalf("failed to create temp directory: %v", err)
- }
- defer os.RemoveAll(dir)
+ dir := t.TempDir()
// Build the source code.
src := filepath.Join(dir, "main.go")
- err = os.WriteFile(src, []byte(constsSource), 0644)
+ err := os.WriteFile(src, []byte(constsSource), 0644)
if err != nil {
t.Fatalf("failed to create file: %v", err)
}
@@ -617,15 +601,11 @@ func TestGdbPanic(t *testing.T) {
t.Parallel()
checkGdbVersion(t)
- dir, err := os.MkdirTemp("", "go-build")
- if err != nil {
- t.Fatalf("failed to create temp directory: %v", err)
- }
- defer os.RemoveAll(dir)
+ dir := t.TempDir()
// Build the source code.
src := filepath.Join(dir, "main.go")
- err = os.WriteFile(src, []byte(panicSource), 0644)
+ err := os.WriteFile(src, []byte(panicSource), 0644)
if err != nil {
t.Fatalf("failed to create file: %v", err)
}
@@ -695,15 +675,11 @@ func TestGdbInfCallstack(t *testing.T) {
t.Parallel()
checkGdbVersion(t)
- dir, err := os.MkdirTemp("", "go-build")
- if err != nil {
- t.Fatalf("failed to create temp directory: %v", err)
- }
- defer os.RemoveAll(dir)
+ dir := t.TempDir()
// Build the source code.
src := filepath.Join(dir, "main.go")
- err = os.WriteFile(src, []byte(InfCallstackSource), 0644)
+ err := os.WriteFile(src, []byte(InfCallstackSource), 0644)
if err != nil {
t.Fatalf("failed to create file: %v", err)
}
diff --git a/src/runtime/runtime-lldb_test.go b/src/runtime/runtime-lldb_test.go
index c923b872aa..19a6cc6f8d 100644
--- a/src/runtime/runtime-lldb_test.go
+++ b/src/runtime/runtime-lldb_test.go
@@ -142,14 +142,10 @@ func TestLldbPython(t *testing.T) {
checkLldbPython(t)
- dir, err := os.MkdirTemp("", "go-build")
- if err != nil {
- t.Fatalf("failed to create temp directory: %v", err)
- }
- defer os.RemoveAll(dir)
+ dir := t.TempDir()
src := filepath.Join(dir, "main.go")
- err = os.WriteFile(src, []byte(lldbHelloSource), 0644)
+ err := os.WriteFile(src, []byte(lldbHelloSource), 0644)
if err != nil {
t.Fatalf("failed to create src file: %v", err)
}
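
The recurring test cleanup in the hunks above swaps manual temp-directory management for testing.T.TempDir, which creates a fresh directory and removes it automatically when the test and its subtests complete. A minimal sketch of the idiom (test and file names are placeholders):

	package example_test

	import (
		"os"
		"path/filepath"
		"testing"
	)

	func TestTempDirIdiom(t *testing.T) {
		dir := t.TempDir() // cleaned up by the framework; no os.RemoveAll needed
		src := filepath.Join(dir, "main.go")
		if err := os.WriteFile(src, []byte("package main\n"), 0644); err != nil {
			t.Fatalf("failed to create file: %v", err)
		}
	}
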
diff --git a/src/runtime/runtime1.go b/src/runtime/runtime1.go
index 30b7044bff..b238da8f51 100644
--- a/src/runtime/runtime1.go
+++ b/src/runtime/runtime1.go
@@ -310,7 +310,6 @@ var debug struct {
gctrace int32
invalidptr int32
madvdontneed int32 // for Linux; issue 28466
- scavenge int32
scavtrace int32
scheddetail int32
schedtrace int32
@@ -339,7 +338,6 @@ var dbgvars = []dbgVar{
{"invalidptr", &debug.invalidptr},
{"madvdontneed", &debug.madvdontneed},
{"sbrk", &debug.sbrk},
- {"scavenge", &debug.scavenge},
{"scavtrace", &debug.scavtrace},
{"scheddetail", &debug.scheddetail},
{"schedtrace", &debug.schedtrace},
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index 5bd283d12f..0e0eb0b728 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -5,7 +5,6 @@
package runtime
import (
- "internal/cpu"
"runtime/internal/atomic"
"runtime/internal/sys"
"unsafe"
@@ -413,14 +412,25 @@ type g struct {
stackguard0 uintptr // offset known to liblink
stackguard1 uintptr // offset known to liblink
- _panic *_panic // innermost panic - offset known to liblink
- _defer *_defer // innermost defer
- m *m // current m; offset known to arm liblink
- sched gobuf
- syscallsp uintptr // if status==Gsyscall, syscallsp = sched.sp to use during gc
- syscallpc uintptr // if status==Gsyscall, syscallpc = sched.pc to use during gc
- stktopsp uintptr // expected sp at top of stack, to check in traceback
- param unsafe.Pointer // passed parameter on wakeup
+ _panic *_panic // innermost panic - offset known to liblink
+ _defer *_defer // innermost defer
+ m *m // current m; offset known to arm liblink
+ sched gobuf
+ syscallsp uintptr // if status==Gsyscall, syscallsp = sched.sp to use during gc
+ syscallpc uintptr // if status==Gsyscall, syscallpc = sched.pc to use during gc
+ stktopsp uintptr // expected sp at top of stack, to check in traceback
+ // param is a generic pointer parameter field used to pass
+ // values in particular contexts where other storage for the
+ // parameter would be difficult to find. It is currently used
+ // in three ways:
+ // 1. When a channel operation wakes up a blocked goroutine, it sets param to
+ // point to the sudog of the completed blocking operation.
+ // 2. By gcAssistAlloc1 to signal back to its caller that the goroutine completed
+ // the GC cycle. It is unsafe to do so in any other way, because the goroutine's
+ // stack may have moved in the meantime.
+ // 3. By debugCallWrap to pass parameters to a new goroutine because allocating a
+ // closure in the runtime is forbidden.
+ param unsafe.Pointer
atomicstatus uint32
stackLock uint32 // sigprof/scang lock; TODO: fold in to atomicstatus
goid int64
@@ -452,6 +462,10 @@ type g struct {
raceignore int8 // ignore race detection events
sysblocktraced bool // StartTrace has emitted EvGoInSyscall about this goroutine
+ tracking bool // whether we're tracking this G for sched latency statistics
+ trackingSeq uint8 // used to decide whether to track this G
+ runnableStamp int64 // timestamp of when the G last became runnable, only used when tracking
+ runnableTime int64 // the amount of time spent runnable, cleared when running, only used when tracking
sysexitticks int64 // cputicks when syscall has returned (for tracing)
traceseq uint64 // trace event sequencer
tracelastp puintptr // last P emitted an event for this goroutine
@@ -483,17 +497,28 @@ type g struct {
gcAssistBytes int64
}
+// gTrackingPeriod is the number of transitions out of _Grunning between
+// latency tracking runs.
+const gTrackingPeriod = 8
+
+const (
+ // tlsSlots is the number of pointer-sized slots reserved for TLS on some platforms,
+ // like Windows.
+ tlsSlots = 6
+ tlsSize = tlsSlots * sys.PtrSize
+)
+
type m struct {
g0 *g // goroutine with scheduling stack
morebuf gobuf // gobuf arg to morestack
divmod uint32 // div/mod denominator for arm - known to liblink
// Fields not known to debuggers.
- procid uint64 // for debuggers, but offset not hard-coded
- gsignal *g // signal-handling g
- goSigStack gsignalStack // Go-allocated signal handling stack
- sigmask sigset // storage for saved signal mask
- tls [6]uintptr // thread-local storage (for x86 extern register)
+ procid uint64 // for debuggers, but offset not hard-coded
+ gsignal *g // signal-handling g
+ goSigStack gsignalStack // Go-allocated signal handling stack
+ sigmask sigset // storage for saved signal mask
+ tls [tlsSlots]uintptr // thread-local storage (for x86 extern register)
mstartfn func()
curg *g // current running goroutine
caughtsig guintptr // goroutine running during fatal signal
@@ -537,10 +562,13 @@ type m struct {
syscalltick uint32
freelink *m // on sched.freem
- // mFixup is used to synchronize OS related m state (credentials etc)
- // use mutex to access.
+	// mFixup is used to synchronize OS-related m state
+	// (credentials etc); use the mutex to access it. To avoid
+	// deadlocks, an atomic.Load() of used observing zero in
+	// mDoFixupFn() guarantees that fn is nil.
mFixup struct {
lock mutex
+ used uint32
fn func(bool) bool
}
@@ -605,6 +633,9 @@ type p struct {
// unit and eliminates the (potentially large) scheduling
// latency that otherwise arises from adding the ready'd
// goroutines to the end of the run queue.
+ //
+ // Note that while other P's may atomically CAS this to zero,
+ // only the owner P can CAS it to a valid G.
runnext guintptr
// Available G's (status == Gdead)
@@ -713,7 +744,8 @@ type p struct {
// scheduler ASAP (regardless of what G is running on it).
preempt bool
- pad cpu.CacheLinePad
+	// Padding is no longer needed. False sharing is not a concern because p is large enough
+	// that its size class is an integer multiple of the cache line size (on all of our architectures).
}
type schedt struct {
@@ -803,6 +835,15 @@ type schedt struct {
// Acquire and hold this mutex to block sysmon from interacting
// with the rest of the runtime.
sysmonlock mutex
+
+ _ uint32 // ensure timeToRun has 8-byte alignment
+
+ // timeToRun is a distribution of scheduling latencies, defined
+ // as the sum of time a G spends in the _Grunnable state before
+ // it transitions to _Grunning.
+ //
+ // timeToRun is protected by sched.lock.
+ timeToRun timeHistogram
}
// Values for the flags field of a sigTabT.
@@ -1105,5 +1146,5 @@ var (
isarchive bool // -buildmode=c-archive
)
-// Must agree with cmd/internal/objabi.Framepointer_enabled.
+// Must agree with internal/buildcfg.Experiment.FramePointer.
const framepointer_enabled = GOARCH == "amd64" || GOARCH == "arm64"
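
The timeToRun histogram added to schedt above records how long Gs spend in _Grunnable before running. It is exposed through the runtime/metrics package; the metric key below is an assumption based on the released Go 1.17 API, not something this hunk shows:

	package main

	import (
		"fmt"
		"runtime/metrics"
	)

	func main() {
		// Assumed key for the scheduling-latency histogram.
		s := []metrics.Sample{{Name: "/sched/latencies:seconds"}}
		metrics.Read(s)
		if s[0].Value.Kind() == metrics.KindFloat64Histogram {
			h := s[0].Value.Float64Histogram()
			fmt.Printf("scheduling latency: %d buckets\n", len(h.Counts))
		}
	}
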
diff --git a/src/runtime/runtime_mmap_test.go b/src/runtime/runtime_mmap_test.go
index bb0b747606..f71f8afa57 100644
--- a/src/runtime/runtime_mmap_test.go
+++ b/src/runtime/runtime_mmap_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris
// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
package runtime_test
diff --git a/src/runtime/runtime_unix_test.go b/src/runtime/runtime_unix_test.go
index b0cbbbe3e6..0251c676e6 100644
--- a/src/runtime/runtime_unix_test.go
+++ b/src/runtime/runtime_unix_test.go
@@ -6,6 +6,7 @@
// We need a fast system call to provoke the race,
// and Close(-1) is nearly universally fast.
+//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || plan9
// +build aix darwin dragonfly freebsd linux netbsd openbsd plan9
package runtime_test
diff --git a/src/runtime/semasleep_test.go b/src/runtime/semasleep_test.go
index 9b371b0732..905e932b7d 100644
--- a/src/runtime/semasleep_test.go
+++ b/src/runtime/semasleep_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !plan9 && !windows && !js
// +build !plan9,!windows,!js
package runtime_test
diff --git a/src/runtime/sigaction.go b/src/runtime/sigaction.go
index 3c888579d0..76f37b1b53 100644
--- a/src/runtime/sigaction.go
+++ b/src/runtime/sigaction.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build (linux && !amd64 && !arm64) || (freebsd && !amd64)
// +build linux,!amd64,!arm64 freebsd,!amd64
package runtime
diff --git a/src/runtime/signal_386.go b/src/runtime/signal_386.go
index 065aff48d3..5824eaddb5 100644
--- a/src/runtime/signal_386.go
+++ b/src/runtime/signal_386.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build dragonfly || freebsd || linux || netbsd || openbsd
// +build dragonfly freebsd linux netbsd openbsd
package runtime
diff --git a/src/runtime/signal_aix_ppc64.go b/src/runtime/signal_aix_ppc64.go
index c17563e2a5..a0becd431e 100644
--- a/src/runtime/signal_aix_ppc64.go
+++ b/src/runtime/signal_aix_ppc64.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix
// +build aix
package runtime
diff --git a/src/runtime/signal_amd64.go b/src/runtime/signal_amd64.go
index 3eeb5e044f..e45fbb4a87 100644
--- a/src/runtime/signal_amd64.go
+++ b/src/runtime/signal_amd64.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build amd64 && (darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris)
// +build amd64
// +build darwin dragonfly freebsd linux netbsd openbsd solaris
diff --git a/src/runtime/signal_arm.go b/src/runtime/signal_arm.go
index 156d9d384c..4d9c6224a2 100644
--- a/src/runtime/signal_arm.go
+++ b/src/runtime/signal_arm.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build dragonfly || freebsd || linux || netbsd || openbsd
// +build dragonfly freebsd linux netbsd openbsd
package runtime
diff --git a/src/runtime/signal_arm64.go b/src/runtime/signal_arm64.go
index b559b93938..f04750084f 100644
--- a/src/runtime/signal_arm64.go
+++ b/src/runtime/signal_arm64.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build darwin || freebsd || linux || netbsd || openbsd
// +build darwin freebsd linux netbsd openbsd
package runtime
diff --git a/src/runtime/signal_linux_mips64x.go b/src/runtime/signal_linux_mips64x.go
index b608197d60..f0a75ac3ea 100644
--- a/src/runtime/signal_linux_mips64x.go
+++ b/src/runtime/signal_linux_mips64x.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build linux && (mips64 || mips64le)
// +build linux
// +build mips64 mips64le
diff --git a/src/runtime/signal_linux_mipsx.go b/src/runtime/signal_linux_mipsx.go
index c88ac4d521..f3969c5aac 100644
--- a/src/runtime/signal_linux_mipsx.go
+++ b/src/runtime/signal_linux_mipsx.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build linux && (mips || mipsle)
// +build linux
// +build mips mipsle
diff --git a/src/runtime/signal_linux_ppc64x.go b/src/runtime/signal_linux_ppc64x.go
index 97cb26d587..d9d3e55ec2 100644
--- a/src/runtime/signal_linux_ppc64x.go
+++ b/src/runtime/signal_linux_ppc64x.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build linux && (ppc64 || ppc64le)
// +build linux
// +build ppc64 ppc64le
diff --git a/src/runtime/signal_mips64x.go b/src/runtime/signal_mips64x.go
index 2a347ff4cb..1616b57027 100644
--- a/src/runtime/signal_mips64x.go
+++ b/src/runtime/signal_mips64x.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build (linux || openbsd) && (mips64 || mips64le)
// +build linux openbsd
// +build mips64 mips64le
diff --git a/src/runtime/signal_mipsx.go b/src/runtime/signal_mipsx.go
index 8c29f59bd1..dcc7f1e9dd 100644
--- a/src/runtime/signal_mipsx.go
+++ b/src/runtime/signal_mipsx.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build linux && (mips || mipsle)
// +build linux
// +build mips mipsle
diff --git a/src/runtime/signal_ppc64x.go b/src/runtime/signal_ppc64x.go
index 5de93a330a..f2225da9a1 100644
--- a/src/runtime/signal_ppc64x.go
+++ b/src/runtime/signal_ppc64x.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build (aix || linux) && (ppc64 || ppc64le)
// +build aix linux
// +build ppc64 ppc64le
diff --git a/src/runtime/signal_riscv64.go b/src/runtime/signal_riscv64.go
index 93363a4746..e6b1b14130 100644
--- a/src/runtime/signal_riscv64.go
+++ b/src/runtime/signal_riscv64.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build linux && riscv64
// +build linux,riscv64
package runtime
diff --git a/src/runtime/signal_unix.go b/src/runtime/signal_unix.go
index 3f70707ab4..f2e526973d 100644
--- a/src/runtime/signal_unix.go
+++ b/src/runtime/signal_unix.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris
// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
package runtime
diff --git a/src/runtime/signal_windows.go b/src/runtime/signal_windows.go
index 6215d0ba2d..f2ce24d735 100644
--- a/src/runtime/signal_windows.go
+++ b/src/runtime/signal_windows.go
@@ -45,9 +45,10 @@ func initExceptionHandler() {
//go:nosplit
func isAbort(r *context) bool {
pc := r.ip()
- if GOARCH == "386" || GOARCH == "amd64" {
+ if GOARCH == "386" || GOARCH == "amd64" || GOARCH == "arm" {
// In the case of an abort, the exception IP is one byte after
- // the INT3 (this differs from UNIX OSes).
+ // the INT3 (this differs from UNIX OSes). Note that on ARM,
+ // this means that the exception IP is no longer aligned.
pc--
}
return isAbortPC(pc)
@@ -144,7 +145,7 @@ func exceptionhandler(info *exceptionrecord, r *context, gp *g) int32 {
*((*uintptr)(sp)) = r.ip()
}
}
- r.set_ip(funcPC(sigpanic))
+ r.set_ip(funcPC(sigpanic0))
return _EXCEPTION_CONTINUE_EXECUTION
}
diff --git a/src/runtime/signal_windows_test.go b/src/runtime/signal_windows_test.go
index 33a9b92ee7..1b7cb9d4c4 100644
--- a/src/runtime/signal_windows_test.go
+++ b/src/runtime/signal_windows_test.go
@@ -1,3 +1,4 @@
+//go:build windows
// +build windows
package runtime_test
@@ -7,7 +8,6 @@ import (
"bytes"
"fmt"
"internal/testenv"
- "os"
"os/exec"
"path/filepath"
"runtime"
@@ -28,15 +28,11 @@ func TestVectoredHandlerDontCrashOnLibrary(t *testing.T) {
testenv.MustHaveExecPath(t, "gcc")
testprog.Lock()
defer testprog.Unlock()
- dir, err := os.MkdirTemp("", "go-build")
- if err != nil {
- t.Fatalf("failed to create temp directory: %v", err)
- }
- defer os.RemoveAll(dir)
+ dir := t.TempDir()
// build go dll
dll := filepath.Join(dir, "testwinlib.dll")
- cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", dll, "--buildmode", "c-shared", "testdata/testwinlib/main.go")
+ cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", dll, "-buildmode", "c-shared", "testdata/testwinlib/main.go")
out, err := testenv.CleanCmdEnv(cmd).CombinedOutput()
if err != nil {
t.Fatalf("failed to build go library: %s\n%s", err, out)
@@ -156,15 +152,11 @@ func TestLibraryCtrlHandler(t *testing.T) {
testenv.MustHaveExecPath(t, "gcc")
testprog.Lock()
defer testprog.Unlock()
- dir, err := os.MkdirTemp("", "go-build")
- if err != nil {
- t.Fatalf("failed to create temp directory: %v", err)
- }
- defer os.RemoveAll(dir)
+ dir := t.TempDir()
// build go dll
dll := filepath.Join(dir, "dummy.dll")
- cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", dll, "--buildmode", "c-shared", "testdata/testwinlibsignal/dummy.go")
+ cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", dll, "-buildmode", "c-shared", "testdata/testwinlibsignal/dummy.go")
out, err := testenv.CleanCmdEnv(cmd).CombinedOutput()
if err != nil {
t.Fatalf("failed to build go library: %s\n%s", err, out)
diff --git a/src/runtime/sigqueue.go b/src/runtime/sigqueue.go
index 28b9e26d0f..6c91fb3fb3 100644
--- a/src/runtime/sigqueue.go
+++ b/src/runtime/sigqueue.go
@@ -28,6 +28,7 @@
// unnecessary rechecks of sig.mask, but it cannot lead to missed signals
// nor deadlocks.
+//go:build !plan9
// +build !plan9
package runtime
@@ -71,7 +72,7 @@ const (
// It runs from the signal handler, so it's limited in what it can do.
func sigsend(s uint32) bool {
bit := uint32(1) << uint(s&31)
- if !sig.inuse || s >= uint32(32*len(sig.wanted)) {
+ if s >= uint32(32*len(sig.wanted)) {
return false
}
@@ -119,7 +120,7 @@ Send:
}
case sigFixup:
// nothing to do - we need to wait for sigIdle.
- osyield()
+ mDoFixupAndOSYield()
}
}
diff --git a/src/runtime/sigqueue_note.go b/src/runtime/sigqueue_note.go
index 16aeeb2ef0..e23446bea4 100644
--- a/src/runtime/sigqueue_note.go
+++ b/src/runtime/sigqueue_note.go
@@ -7,8 +7,8 @@
// signal_recv thread. This file holds the non-Darwin implementations of
// those functions. These functions will never be called.
-// +build !darwin
-// +build !plan9
+//go:build !darwin && !plan9
+// +build !darwin,!plan9
package runtime
diff --git a/src/runtime/sigtab_linux_generic.go b/src/runtime/sigtab_linux_generic.go
index 38d686544f..dc1debddab 100644
--- a/src/runtime/sigtab_linux_generic.go
+++ b/src/runtime/sigtab_linux_generic.go
@@ -2,11 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !mips
-// +build !mipsle
-// +build !mips64
-// +build !mips64le
-// +build linux
+//go:build !mips && !mipsle && !mips64 && !mips64le && linux
+// +build !mips,!mipsle,!mips64,!mips64le,linux
package runtime
diff --git a/src/runtime/sigtab_linux_mipsx.go b/src/runtime/sigtab_linux_mipsx.go
index 51ef470ce7..af9c7e56eb 100644
--- a/src/runtime/sigtab_linux_mipsx.go
+++ b/src/runtime/sigtab_linux_mipsx.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build (mips || mipsle || mips64 || mips64le) && linux
// +build mips mipsle mips64 mips64le
// +build linux
diff --git a/src/runtime/sizeclasses.go b/src/runtime/sizeclasses.go
index c5521ce1bd..067871eaf3 100644
--- a/src/runtime/sizeclasses.go
+++ b/src/runtime/sizeclasses.go
@@ -3,74 +3,83 @@
package runtime
-// class bytes/obj bytes/span objects tail waste max waste
-// 1 8 8192 1024 0 87.50%
-// 2 16 8192 512 0 43.75%
-// 3 24 8192 341 8 29.24%
-// 4 32 8192 256 0 21.88%
-// 5 48 8192 170 32 31.52%
-// 6 64 8192 128 0 23.44%
-// 7 80 8192 102 32 19.07%
-// 8 96 8192 85 32 15.95%
-// 9 112 8192 73 16 13.56%
-// 10 128 8192 64 0 11.72%
-// 11 144 8192 56 128 11.82%
-// 12 160 8192 51 32 9.73%
-// 13 176 8192 46 96 9.59%
-// 14 192 8192 42 128 9.25%
-// 15 208 8192 39 80 8.12%
-// 16 224 8192 36 128 8.15%
-// 17 240 8192 34 32 6.62%
-// 18 256 8192 32 0 5.86%
-// 19 288 8192 28 128 12.16%
-// 20 320 8192 25 192 11.80%
-// 21 352 8192 23 96 9.88%
-// 22 384 8192 21 128 9.51%
-// 23 416 8192 19 288 10.71%
-// 24 448 8192 18 128 8.37%
-// 25 480 8192 17 32 6.82%
-// 26 512 8192 16 0 6.05%
-// 27 576 8192 14 128 12.33%
-// 28 640 8192 12 512 15.48%
-// 29 704 8192 11 448 13.93%
-// 30 768 8192 10 512 13.94%
-// 31 896 8192 9 128 15.52%
-// 32 1024 8192 8 0 12.40%
-// 33 1152 8192 7 128 12.41%
-// 34 1280 8192 6 512 15.55%
-// 35 1408 16384 11 896 14.00%
-// 36 1536 8192 5 512 14.00%
-// 37 1792 16384 9 256 15.57%
-// 38 2048 8192 4 0 12.45%
-// 39 2304 16384 7 256 12.46%
-// 40 2688 8192 3 128 15.59%
-// 41 3072 24576 8 0 12.47%
-// 42 3200 16384 5 384 6.22%
-// 43 3456 24576 7 384 8.83%
-// 44 4096 8192 2 0 15.60%
-// 45 4864 24576 5 256 16.65%
-// 46 5376 16384 3 256 10.92%
-// 47 6144 24576 4 0 12.48%
-// 48 6528 32768 5 128 6.23%
-// 49 6784 40960 6 256 4.36%
-// 50 6912 49152 7 768 3.37%
-// 51 8192 8192 1 0 15.61%
-// 52 9472 57344 6 512 14.28%
-// 53 9728 49152 5 512 3.64%
-// 54 10240 40960 4 0 4.99%
-// 55 10880 32768 3 128 6.24%
-// 56 12288 24576 2 0 11.45%
-// 57 13568 40960 3 256 9.99%
-// 58 14336 57344 4 0 5.35%
-// 59 16384 16384 1 0 12.49%
-// 60 18432 73728 4 0 11.11%
-// 61 19072 57344 3 128 3.57%
-// 62 20480 40960 2 0 6.87%
-// 63 21760 65536 3 256 6.25%
-// 64 24576 24576 1 0 11.45%
-// 65 27264 81920 3 128 10.00%
-// 66 28672 57344 2 0 4.91%
-// 67 32768 32768 1 0 12.50%
+// class bytes/obj bytes/span objects tail waste max waste min align
+// 1 8 8192 1024 0 87.50% 8
+// 2 16 8192 512 0 43.75% 16
+// 3 24 8192 341 8 29.24% 8
+// 4 32 8192 256 0 21.88% 32
+// 5 48 8192 170 32 31.52% 16
+// 6 64 8192 128 0 23.44% 64
+// 7 80 8192 102 32 19.07% 16
+// 8 96 8192 85 32 15.95% 32
+// 9 112 8192 73 16 13.56% 16
+// 10 128 8192 64 0 11.72% 128
+// 11 144 8192 56 128 11.82% 16
+// 12 160 8192 51 32 9.73% 32
+// 13 176 8192 46 96 9.59% 16
+// 14 192 8192 42 128 9.25% 64
+// 15 208 8192 39 80 8.12% 16
+// 16 224 8192 36 128 8.15% 32
+// 17 240 8192 34 32 6.62% 16
+// 18 256 8192 32 0 5.86% 256
+// 19 288 8192 28 128 12.16% 32
+// 20 320 8192 25 192 11.80% 64
+// 21 352 8192 23 96 9.88% 32
+// 22 384 8192 21 128 9.51% 128
+// 23 416 8192 19 288 10.71% 32
+// 24 448 8192 18 128 8.37% 64
+// 25 480 8192 17 32 6.82% 32
+// 26 512 8192 16 0 6.05% 512
+// 27 576 8192 14 128 12.33% 64
+// 28 640 8192 12 512 15.48% 128
+// 29 704 8192 11 448 13.93% 64
+// 30 768 8192 10 512 13.94% 256
+// 31 896 8192 9 128 15.52% 128
+// 32 1024 8192 8 0 12.40% 1024
+// 33 1152 8192 7 128 12.41% 128
+// 34 1280 8192 6 512 15.55% 256
+// 35 1408 16384 11 896 14.00% 128
+// 36 1536 8192 5 512 14.00% 512
+// 37 1792 16384 9 256 15.57% 256
+// 38 2048 8192 4 0 12.45% 2048
+// 39 2304 16384 7 256 12.46% 256
+// 40 2688 8192 3 128 15.59% 128
+// 41 3072 24576 8 0 12.47% 1024
+// 42 3200 16384 5 384 6.22% 128
+// 43 3456 24576 7 384 8.83% 128
+// 44 4096 8192 2 0 15.60% 4096
+// 45 4864 24576 5 256 16.65% 256
+// 46 5376 16384 3 256 10.92% 256
+// 47 6144 24576 4 0 12.48% 2048
+// 48 6528 32768 5 128 6.23% 128
+// 49 6784 40960 6 256 4.36% 128
+// 50 6912 49152 7 768 3.37% 256
+// 51 8192 8192 1 0 15.61% 8192
+// 52 9472 57344 6 512 14.28% 256
+// 53 9728 49152 5 512 3.64% 512
+// 54 10240 40960 4 0 4.99% 2048
+// 55 10880 32768 3 128 6.24% 128
+// 56 12288 24576 2 0 11.45% 4096
+// 57 13568 40960 3 256 9.99% 256
+// 58 14336 57344 4 0 5.35% 2048
+// 59 16384 16384 1 0 12.49% 8192
+// 60 18432 73728 4 0 11.11% 2048
+// 61 19072 57344 3 128 3.57% 128
+// 62 20480 40960 2 0 6.87% 4096
+// 63 21760 65536 3 256 6.25% 256
+// 64 24576 24576 1 0 11.45% 8192
+// 65 27264 81920 3 128 10.00% 128
+// 66 28672 57344 2 0 4.91% 4096
+// 67 32768 32768 1 0 12.50% 8192
+
+// alignment bits min obj size
+// 8 3 8
+// 16 4 32
+// 32 5 256
+// 64 6 512
+// 128 7 768
+// 4096 12 28672
+// 8192 13 32768
const (
_MaxSmallSize = 32768
@@ -83,14 +92,6 @@ const (
var class_to_size = [_NumSizeClasses]uint16{0, 8, 16, 24, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 576, 640, 704, 768, 896, 1024, 1152, 1280, 1408, 1536, 1792, 2048, 2304, 2688, 3072, 3200, 3456, 4096, 4864, 5376, 6144, 6528, 6784, 6912, 8192, 9472, 9728, 10240, 10880, 12288, 13568, 14336, 16384, 18432, 19072, 20480, 21760, 24576, 27264, 28672, 32768}
var class_to_allocnpages = [_NumSizeClasses]uint8{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 3, 2, 3, 1, 3, 2, 3, 4, 5, 6, 1, 7, 6, 5, 4, 3, 5, 7, 2, 9, 7, 5, 8, 3, 10, 7, 4}
-
-type divMagic struct {
- shift uint8
- shift2 uint8
- mul uint16
- baseMask uint16
-}
-
-var class_to_divmagic = [_NumSizeClasses]divMagic{{0, 0, 0, 0}, {3, 0, 1, 65528}, {4, 0, 1, 65520}, {3, 11, 683, 0}, {5, 0, 1, 65504}, {4, 11, 683, 0}, {6, 0, 1, 65472}, {4, 10, 205, 0}, {5, 9, 171, 0}, {4, 11, 293, 0}, {7, 0, 1, 65408}, {4, 13, 911, 0}, {5, 10, 205, 0}, {4, 12, 373, 0}, {6, 9, 171, 0}, {4, 13, 631, 0}, {5, 11, 293, 0}, {4, 13, 547, 0}, {8, 0, 1, 65280}, {5, 9, 57, 0}, {6, 9, 103, 0}, {5, 12, 373, 0}, {7, 7, 43, 0}, {5, 10, 79, 0}, {6, 10, 147, 0}, {5, 11, 137, 0}, {9, 0, 1, 65024}, {6, 9, 57, 0}, {7, 9, 103, 0}, {6, 11, 187, 0}, {8, 7, 43, 0}, {7, 8, 37, 0}, {10, 0, 1, 64512}, {7, 9, 57, 0}, {8, 6, 13, 0}, {7, 11, 187, 0}, {9, 5, 11, 0}, {8, 8, 37, 0}, {11, 0, 1, 63488}, {8, 9, 57, 0}, {7, 10, 49, 0}, {10, 5, 11, 0}, {7, 10, 41, 0}, {7, 9, 19, 0}, {12, 0, 1, 61440}, {8, 9, 27, 0}, {8, 10, 49, 0}, {11, 5, 11, 0}, {7, 13, 161, 0}, {7, 13, 155, 0}, {8, 9, 19, 0}, {13, 0, 1, 57344}, {8, 12, 111, 0}, {9, 9, 27, 0}, {11, 6, 13, 0}, {7, 14, 193, 0}, {12, 3, 3, 0}, {8, 13, 155, 0}, {11, 8, 37, 0}, {14, 0, 1, 49152}, {11, 8, 29, 0}, {7, 13, 55, 0}, {12, 5, 7, 0}, {8, 14, 193, 0}, {13, 3, 3, 0}, {7, 14, 77, 0}, {12, 7, 19, 0}, {15, 0, 1, 32768}}
+var class_to_divmagic = [_NumSizeClasses]uint32{0, ^uint32(0)/8 + 1, ^uint32(0)/16 + 1, ^uint32(0)/24 + 1, ^uint32(0)/32 + 1, ^uint32(0)/48 + 1, ^uint32(0)/64 + 1, ^uint32(0)/80 + 1, ^uint32(0)/96 + 1, ^uint32(0)/112 + 1, ^uint32(0)/128 + 1, ^uint32(0)/144 + 1, ^uint32(0)/160 + 1, ^uint32(0)/176 + 1, ^uint32(0)/192 + 1, ^uint32(0)/208 + 1, ^uint32(0)/224 + 1, ^uint32(0)/240 + 1, ^uint32(0)/256 + 1, ^uint32(0)/288 + 1, ^uint32(0)/320 + 1, ^uint32(0)/352 + 1, ^uint32(0)/384 + 1, ^uint32(0)/416 + 1, ^uint32(0)/448 + 1, ^uint32(0)/480 + 1, ^uint32(0)/512 + 1, ^uint32(0)/576 + 1, ^uint32(0)/640 + 1, ^uint32(0)/704 + 1, ^uint32(0)/768 + 1, ^uint32(0)/896 + 1, ^uint32(0)/1024 + 1, ^uint32(0)/1152 + 1, ^uint32(0)/1280 + 1, ^uint32(0)/1408 + 1, ^uint32(0)/1536 + 1, ^uint32(0)/1792 + 1, ^uint32(0)/2048 + 1, ^uint32(0)/2304 + 1, ^uint32(0)/2688 + 1, ^uint32(0)/3072 + 1, ^uint32(0)/3200 + 1, ^uint32(0)/3456 + 1, ^uint32(0)/4096 + 1, ^uint32(0)/4864 + 1, ^uint32(0)/5376 + 1, ^uint32(0)/6144 + 1, ^uint32(0)/6528 + 1, ^uint32(0)/6784 + 1, ^uint32(0)/6912 + 1, ^uint32(0)/8192 + 1, ^uint32(0)/9472 + 1, ^uint32(0)/9728 + 1, ^uint32(0)/10240 + 1, ^uint32(0)/10880 + 1, ^uint32(0)/12288 + 1, ^uint32(0)/13568 + 1, ^uint32(0)/14336 + 1, ^uint32(0)/16384 + 1, ^uint32(0)/18432 + 1, ^uint32(0)/19072 + 1, ^uint32(0)/20480 + 1, ^uint32(0)/21760 + 1, ^uint32(0)/24576 + 1, ^uint32(0)/27264 + 1, ^uint32(0)/28672 + 1, ^uint32(0)/32768 + 1}
var size_to_class8 = [smallSizeMax/smallSizeDiv + 1]uint8{0, 1, 2, 3, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32}
var size_to_class128 = [(_MaxSmallSize-smallSizeMax)/largeSizeDiv + 1]uint8{32, 33, 34, 35, 36, 37, 37, 38, 38, 39, 39, 40, 40, 40, 41, 41, 41, 42, 43, 43, 44, 44, 44, 44, 44, 45, 45, 45, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 54, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 58, 58, 58, 58, 58, 58, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 61, 61, 61, 61, 61, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67}
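
The class_to_divmagic rewrite above replaces the four-field divMagic struct with a single 32-bit reciprocal per class: for a class size d the table stores m = ^uint32(0)/d + 1 (that is, ceil(2^32/d)), and n/d is computed as (n*m)>>32 with no hardware divide. The result is exact whenever n*(d*m - 2^32) < 2^32, which covers every offset that can occur in a span. A minimal sketch checking one non-power-of-two class over a sub-range:

	package main

	import "fmt"

	func main() {
		const d = 352 // size class 21; any class size behaves the same
		m := ^uint32(0)/d + 1
		for n := uint32(0); n < 1<<16; n++ {
			if q := uint32(uint64(n) * uint64(m) >> 32); q != n/d {
				fmt.Println("mismatch at n =", n)
				return
			}
		}
		fmt.Println("(n*m)>>32 == n/d for all n < 65536")
	}
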
diff --git a/src/runtime/sizeof_test.go b/src/runtime/sizeof_test.go
index 736e848f8c..bbbd1becf7 100644
--- a/src/runtime/sizeof_test.go
+++ b/src/runtime/sizeof_test.go
@@ -21,7 +21,7 @@ func TestSizeof(t *testing.T) {
_32bit uintptr // size on 32bit platforms
_64bit uintptr // size on 64bit platforms
}{
- {runtime.G{}, 216, 376}, // g, but exported for testing
+ {runtime.G{}, 236, 392}, // g, but exported for testing
{runtime.Sudog{}, 56, 88}, // sudog, but exported for testing
}
diff --git a/src/runtime/slice.go b/src/runtime/slice.go
index c0647d95a0..f9d4154acf 100644
--- a/src/runtime/slice.go
+++ b/src/runtime/slice.go
@@ -112,6 +112,25 @@ func makeslice64(et *_type, len64, cap64 int64) unsafe.Pointer {
return makeslice(et, len, cap)
}
+func unsafeslice(et *_type, len int) {
+ mem, overflow := math.MulUintptr(et.size, uintptr(len))
+ if overflow || mem > maxAlloc || len < 0 {
+ panicunsafeslicelen()
+ }
+}
+
+func unsafeslice64(et *_type, len64 int64) {
+ len := int(len64)
+ if int64(len) != len64 {
+ panicunsafeslicelen()
+ }
+ unsafeslice(et, len)
+}
+
+func panicunsafeslicelen() {
+ panic(errorString("unsafe.Slice: len out of range"))
+}
+
// growslice handles slice growth during append.
// It is passed the slice element type, the old slice, and the desired new minimum capacity,
// and it returns a new slice with at least that capacity, with the old data
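
The unsafeslice and unsafeslice64 helpers added above are the runtime's length checks behind the unsafe.Slice function introduced in this cycle (the panic string names it). A hedged usage sketch showing the normal path and the panic these checks produce:

	package main

	import (
		"fmt"
		"unsafe"
	)

	func main() {
		arr := [4]int{1, 2, 3, 4}
		s := unsafe.Slice(&arr[0], len(arr)) // valid pointer and length
		fmt.Println(s)                       // [1 2 3 4]

		defer func() { fmt.Println("recovered:", recover()) }()
		n := -1
		_ = unsafe.Slice(&arr[0], n) // len < 0 panics via panicunsafeslicelen
	}
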
diff --git a/src/runtime/stack.go b/src/runtime/stack.go
index d971e5e26f..b21c9c9518 100644
--- a/src/runtime/stack.go
+++ b/src/runtime/stack.go
@@ -5,6 +5,7 @@
package runtime
import (
+ "internal/abi"
"internal/cpu"
"runtime/internal/atomic"
"runtime/internal/sys"
@@ -91,6 +92,10 @@ const (
// The stack guard is a pointer this many bytes above the
// bottom of the stack.
+ //
+ // The guard leaves enough room for one _StackSmall frame plus
+ // a _StackLimit chain of NOSPLIT calls plus _StackSystem
+ // bytes for the OS.
_StackGuard = 928*sys.StackGuardMultiplier + _StackSystem
// After a stack split check the SP is allowed to be this
@@ -122,16 +127,21 @@ const (
const (
uintptrMask = 1<<(8*sys.PtrSize) - 1
+ // The values below can be stored to g.stackguard0 to force
+ // the next stack check to fail.
+ // These are all larger than any real SP.
+
// Goroutine preemption request.
- // Stored into g->stackguard0 to cause split stack check failure.
- // Must be greater than any real sp.
// 0xfffffade in hex.
stackPreempt = uintptrMask & -1314
- // Thread is forking.
- // Stored into g->stackguard0 to cause split stack check failure.
- // Must be greater than any real sp.
+ // Thread is forking. Causes a split stack check failure.
+ // 0xfffffb2e in hex.
stackFork = uintptrMask & -1234
+
+ // Force a stack movement. Used for debugging.
+ // 0xfffffeed in hex.
+ stackForceMove = uintptrMask & -275
)
// Global pool of spans that have free stacks.
@@ -692,15 +702,15 @@ func adjustframe(frame *stkframe, arg unsafe.Pointer) bool {
// we call into morestack.)
continue
}
- t := obj.typ
- gcdata := t.gcdata
+ ptrdata := obj.ptrdata()
+ gcdata := obj.gcdata
var s *mspan
- if t.kind&kindGCProg != 0 {
+ if obj.useGCProg() {
// See comments in mgcmark.go:scanstack
- s = materializeGCProg(t.ptrdata, gcdata)
+ s = materializeGCProg(ptrdata, gcdata)
gcdata = (*byte)(unsafe.Pointer(s.startAddr))
}
- for i := uintptr(0); i < t.ptrdata; i += sys.PtrSize {
+ for i := uintptr(0); i < ptrdata; i += sys.PtrSize {
if *addb(gcdata, i/(8*sys.PtrSize))>>(i/sys.PtrSize&7)&1 != 0 {
adjustpointer(adjinfo, unsafe.Pointer(p+i))
}
@@ -1054,11 +1064,18 @@ func newstack() {
// recheck the bounds on return.)
if f := findfunc(gp.sched.pc); f.valid() {
max := uintptr(funcMaxSPDelta(f))
- for newsize-oldsize < max+_StackGuard {
+ for newsize-gp.sched.sp < max+_StackGuard {
newsize *= 2
}
}
+ if gp.stackguard0 == stackForceMove {
+ // Forced stack movement used for debugging.
+ // Don't double the stack (or we may quickly run out
+ // if this is done repeatedly).
+ newsize = oldsize
+ }
+
if newsize > maxstacksize || newsize > maxstackceiling {
if maxstacksize < maxstackceiling {
print("runtime: goroutine stack exceeds ", maxstacksize, "-byte limit\n")
@@ -1301,29 +1318,70 @@ func getStackMap(frame *stkframe, cache *pcvalueCache, debug bool) (locals, args
}
// stack objects.
- p := funcdata(f, _FUNCDATA_StackObjects)
- if p != nil {
- n := *(*uintptr)(p)
- p = add(p, sys.PtrSize)
- *(*slice)(unsafe.Pointer(&objs)) = slice{array: noescape(p), len: int(n), cap: int(n)}
- // Note: the noescape above is needed to keep
- // getStackMap from "leaking param content:
- // frame". That leak propagates up to getgcmask, then
- // GCMask, then verifyGCInfo, which converts the stack
- // gcinfo tests into heap gcinfo tests :(
+ if GOARCH == "amd64" && unsafe.Sizeof(abi.RegArgs{}) > 0 && frame.argmap != nil {
+ // argmap is set when the function is reflect.makeFuncStub or reflect.methodValueCall.
+ // We don't actually use argmap in this case, but we need to fake the stack object
+ // record for these frames which contain an internal/abi.RegArgs at a hard-coded offset
+ // on amd64.
+ objs = methodValueCallFrameObjs
+ } else {
+ p := funcdata(f, _FUNCDATA_StackObjects)
+ if p != nil {
+ n := *(*uintptr)(p)
+ p = add(p, sys.PtrSize)
+ *(*slice)(unsafe.Pointer(&objs)) = slice{array: noescape(p), len: int(n), cap: int(n)}
+ // Note: the noescape above is needed to keep
+ // getStackMap from "leaking param content:
+ // frame". That leak propagates up to getgcmask, then
+ // GCMask, then verifyGCInfo, which converts the stack
+ // gcinfo tests into heap gcinfo tests :(
+ }
}
return
}
+var (
+ abiRegArgsEface interface{} = abi.RegArgs{}
+ abiRegArgsType *_type = efaceOf(&abiRegArgsEface)._type
+ methodValueCallFrameObjs = []stackObjectRecord{
+ {
+ off: -int32(alignUp(abiRegArgsType.size, 8)), // It's always the highest address local.
+ size: int32(abiRegArgsType.size),
+ _ptrdata: int32(abiRegArgsType.ptrdata),
+ gcdata: abiRegArgsType.gcdata,
+ },
+ }
+)
+
+func init() {
+ if abiRegArgsType.kind&kindGCProg != 0 {
+ throw("abiRegArgsType needs GC Prog, update methodValueCallFrameObjs")
+ }
+}
+
// A stackObjectRecord is generated by the compiler for each stack object in a stack frame.
-// This record must match the generator code in cmd/compile/internal/gc/ssa.go:emitStackObjects.
+// This record must match the generator code in cmd/compile/internal/liveness/plive.go:emitStackObjects.
type stackObjectRecord struct {
// offset in frame
// if negative, offset from varp
// if non-negative, offset from argp
- off int
- typ *_type
+ off int32
+ size int32
+	_ptrdata int32 // ptrdata, or -ptrdata if GC prog is used
+ gcdata *byte // pointer map or GC prog of the type
+}
+
+func (r *stackObjectRecord) useGCProg() bool {
+ return r._ptrdata < 0
+}
+
+func (r *stackObjectRecord) ptrdata() uintptr {
+ x := r._ptrdata
+ if x < 0 {
+ return uintptr(-x)
+ }
+ return uintptr(x)
}
// This is exported as ABI0 via linkname so obj can call it.
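
The stackguard0 sentinels documented above (stackPreempt, stackFork, and the new stackForceMove) are small negative offsets masked to pointer width, guaranteed to compare above any real stack pointer. A quick check of the hex values quoted in the comments, shown here for the 32-bit mask:

	package main

	import "fmt"

	func main() {
		const uintptrMask = 1<<32 - 1 // 32-bit case; 64-bit widens the same values
		fmt.Printf("%#x %#x %#x\n",
			uint64(uintptrMask&-1314), // 0xfffffade, stackPreempt
			uint64(uintptrMask&-1234), // 0xfffffb2e, stackFork
			uint64(uintptrMask&-275))  // 0xfffffeed, stackForceMove
	}
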
diff --git a/src/runtime/string.go b/src/runtime/string.go
index 9a601f0094..d6030a1dca 100644
--- a/src/runtime/string.go
+++ b/src/runtime/string.go
@@ -55,20 +55,20 @@ func concatstrings(buf *tmpBuf, a []string) string {
return s
}
-func concatstring2(buf *tmpBuf, a [2]string) string {
- return concatstrings(buf, a[:])
+func concatstring2(buf *tmpBuf, a0, a1 string) string {
+ return concatstrings(buf, []string{a0, a1})
}
-func concatstring3(buf *tmpBuf, a [3]string) string {
- return concatstrings(buf, a[:])
+func concatstring3(buf *tmpBuf, a0, a1, a2 string) string {
+ return concatstrings(buf, []string{a0, a1, a2})
}
-func concatstring4(buf *tmpBuf, a [4]string) string {
- return concatstrings(buf, a[:])
+func concatstring4(buf *tmpBuf, a0, a1, a2, a3 string) string {
+ return concatstrings(buf, []string{a0, a1, a2, a3})
}
-func concatstring5(buf *tmpBuf, a [5]string) string {
- return concatstrings(buf, a[:])
+func concatstring5(buf *tmpBuf, a0, a1, a2, a3, a4 string) string {
+ return concatstrings(buf, []string{a0, a1, a2, a3, a4})
}
// slicebytetostring converts a byte slice to a string.
diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go
index b9b313a711..16d7583202 100644
--- a/src/runtime/stubs.go
+++ b/src/runtime/stubs.go
@@ -6,6 +6,7 @@ package runtime
import (
"internal/abi"
+ "internal/goexperiment"
"unsafe"
)
@@ -108,6 +109,9 @@ func reflect_memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr) {
//go:noescape
func memmove(to, from unsafe.Pointer, n uintptr)
+// Outside assembly calls memmove. Make sure it has ABI wrappers.
+//go:linkname memmove
+
//go:linkname reflect_memmove reflect.memmove
func reflect_memmove(to, from unsafe.Pointer, n uintptr) {
memmove(to, from, n)
@@ -318,33 +322,34 @@ func return0()
// in asm_*.s
// not called directly; definitions here supply type information for traceback.
-func call16(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call32(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call64(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call128(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call256(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call512(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call1024(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call2048(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call4096(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call8192(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call16384(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call32768(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call65536(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call131072(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call262144(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call524288(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call1048576(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call2097152(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call4194304(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call8388608(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call16777216(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call33554432(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call67108864(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call134217728(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call268435456(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call536870912(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
-func call1073741824(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
+// These must have the same signature (arg pointer map) as reflectcall.
+func call16(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call32(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call64(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call128(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call256(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call512(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call1024(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call2048(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call4096(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call8192(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call16384(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call32768(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call65536(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call131072(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call262144(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call524288(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call1048576(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call2097152(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call4194304(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call8388608(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call16777216(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call33554432(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call67108864(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call134217728(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call268435456(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call536870912(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
+func call1073741824(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs)
func systemstack_switch()
@@ -392,6 +397,36 @@ func duffcopy()
// Called from linker-generated .initarray; declared for go vet; do NOT call from Go.
func addmoduledata()
-// Injected by the signal handler for panicking signals. On many platforms it just
-// jumps to sigpanic.
+// Injected by the signal handler for panicking signals.
+// It initializes any registers that have a fixed meaning at calls but
+// are scratch in function bodies, and then calls sigpanic.
+// On many platforms it just jumps to sigpanic.
func sigpanic0()
+
+// intArgRegs is used by the various register assignment
+// algorithm implementations in the runtime. These include:.
+// - Finalizers (mfinal.go)
+// - Windows callbacks (syscall_windows.go)
+//
+// Both are stripped-down versions of the algorithm since they
+// only have to deal with a subset of cases (finalizers only
+// take a pointer or interface argument, Go Windows callbacks
+// don't support floating point).
+//
+// It should be modified with care and is generally only
+// modified when testing this package.
+//
+// It should never be set higher than its internal/abi
+// constant counterpart, because the system relies on a
+// structure that is at least large enough to hold the
+// registers the system supports.
+//
+// Currently it's set to zero because using the actual
+// constant will break every part of the toolchain that
+// uses finalizers or Windows callbacks to call functions.
+// The value that is currently commented out there should be
+// the actual value once we're ready to use the register ABI
+// everywhere.
+//
+// Protected by finlock.
+var intArgRegs = abi.IntArgRegs * goexperiment.RegabiArgsInt
diff --git a/src/runtime/stubs2.go b/src/runtime/stubs2.go
index 96096d236b..525b324c81 100644
--- a/src/runtime/stubs2.go
+++ b/src/runtime/stubs2.go
@@ -2,13 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !aix
-// +build !darwin
-// +build !js
-// +build !openbsd
-// +build !plan9
-// +build !solaris
-// +build !windows
+//go:build !aix && !darwin && !js && !openbsd && !plan9 && !solaris && !windows
+// +build !aix,!darwin,!js,!openbsd,!plan9,!solaris,!windows
package runtime
diff --git a/src/runtime/stubs3.go b/src/runtime/stubs3.go
index 1885d32051..b895be4c70 100644
--- a/src/runtime/stubs3.go
+++ b/src/runtime/stubs3.go
@@ -2,12 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !aix
-// +build !darwin
-// +build !freebsd
-// +build !openbsd
-// +build !plan9
-// +build !solaris
+//go:build !aix && !darwin && !freebsd && !openbsd && !plan9 && !solaris
+// +build !aix,!darwin,!freebsd,!openbsd,!plan9,!solaris
package runtime
diff --git a/src/runtime/stubs_amd64.go b/src/runtime/stubs_amd64.go
index bf98493e9d..687a506cdd 100644
--- a/src/runtime/stubs_amd64.go
+++ b/src/runtime/stubs_amd64.go
@@ -40,3 +40,10 @@ func retpolineR15()
//go:noescape
func asmcgocall_no_g(fn, arg unsafe.Pointer)
+
+// Used by reflectcall and the reflect package.
+//
+// Spills/loads arguments in registers to/from an internal/abi.RegArgs
+// respectively. Does not follow the Go ABI.
+func spillArgs()
+func unspillArgs()
diff --git a/src/runtime/stubs_linux.go b/src/runtime/stubs_linux.go
index e75fcf6c95..ba267009ca 100644
--- a/src/runtime/stubs_linux.go
+++ b/src/runtime/stubs_linux.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build linux
// +build linux
package runtime
diff --git a/src/runtime/stubs_mips64x.go b/src/runtime/stubs_mips64x.go
index 652e7a9e34..05a4d0d38d 100644
--- a/src/runtime/stubs_mips64x.go
+++ b/src/runtime/stubs_mips64x.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build mips64 || mips64le
// +build mips64 mips64le
package runtime
diff --git a/src/runtime/stubs_mipsx.go b/src/runtime/stubs_mipsx.go
index 707b295f7a..9bffb35b67 100644
--- a/src/runtime/stubs_mipsx.go
+++ b/src/runtime/stubs_mipsx.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build mips || mipsle
// +build mips mipsle
package runtime
diff --git a/src/runtime/stubs_nonlinux.go b/src/runtime/stubs_nonlinux.go
index e1ea05cf0b..f9b98595fc 100644
--- a/src/runtime/stubs_nonlinux.go
+++ b/src/runtime/stubs_nonlinux.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !linux
// +build !linux
package runtime
diff --git a/src/runtime/stubs_ppc64x.go b/src/runtime/stubs_ppc64x.go
index 26f5bb20ca..0841b413fd 100644
--- a/src/runtime/stubs_ppc64x.go
+++ b/src/runtime/stubs_ppc64x.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ppc64 || ppc64le
// +build ppc64 ppc64le
package runtime
diff --git a/src/runtime/symtab.go b/src/runtime/symtab.go
index 00f802aaa7..6b535dfcbf 100644
--- a/src/runtime/symtab.go
+++ b/src/runtime/symtab.go
@@ -102,7 +102,9 @@ func (ci *Frames) Next() (frame Frame, more bool) {
name := funcname(funcInfo)
if inldata := funcdata(funcInfo, _FUNCDATA_InlTree); inldata != nil {
inltree := (*[1 << 20]inlinedCall)(inldata)
- ix := pcdatavalue(funcInfo, _PCDATA_InlTreeIndex, pc, nil)
+ // Non-strict as cgoTraceback may have added bogus PCs
+ // with a valid funcInfo but invalid PCDATA.
+ ix := pcdatavalue1(funcInfo, _PCDATA_InlTreeIndex, pc, nil, false)
if ix >= 0 {
// Note: entry is not modified. It always refers to a real frame, not an inlined one.
f = nil
@@ -183,7 +185,9 @@ func runtime_expandFinalInlineFrame(stk []uintptr) []uintptr {
var cache pcvalueCache
inltree := (*[1 << 20]inlinedCall)(inldata)
for {
- ix := pcdatavalue(f, _PCDATA_InlTreeIndex, tracepc, &cache)
+ // Non-strict as cgoTraceback may have added bogus PCs
+ // with a valid funcInfo but invalid PCDATA.
+ ix := pcdatavalue1(f, _PCDATA_InlTreeIndex, tracepc, &cache, false)
if ix < 0 {
break
}
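
The strict/non-strict distinction in the two hunks above: the plain
pcdatavalue throws on a PC with no PCDATA entry, while pcdatavalue1 takes an
explicit strict flag, so bogus cgo PCs simply yield -1. A hedged sketch of
the assumed relationship between the two helpers:

    // Assumed wrapper relationship, for illustration only:
    func pcdatavalue(f funcInfo, table uint32, targetpc uintptr, cache *pcvalueCache) int32 {
            return pcdatavalue1(f, table, targetpc, cache, true) // strict: bad PCs throw
    }
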
@@ -277,6 +281,7 @@ const (
_FUNCDATA_StackObjects = 2
_FUNCDATA_InlTree = 3
_FUNCDATA_OpenCodedDeferInfo = 4
+ _FUNCDATA_ArgInfo = 5
_ArgsSizeUnknown = -0x80000000
)
@@ -308,11 +313,11 @@ type funcID uint8
const (
funcID_normal funcID = iota // not a special function
+ funcID_abort
funcID_asmcgocall
funcID_asyncPreempt
funcID_cgocallback
- funcID_debugCallV1
- funcID_externalthreadhandler
+ funcID_debugCallV2
funcID_gcBgMarkWorker
funcID_goexit
funcID_gogo
@@ -557,7 +562,11 @@ func moduledataverify1(datap *moduledata) {
// Check that the pclntab's format is valid.
hdr := datap.pcHeader
if hdr.magic != 0xfffffffa || hdr.pad1 != 0 || hdr.pad2 != 0 || hdr.minLC != sys.PCQuantum || hdr.ptrSize != sys.PtrSize {
- println("runtime: function symbol table header:", hex(hdr.magic), hex(hdr.pad1), hex(hdr.pad2), hex(hdr.minLC), hex(hdr.ptrSize))
+ print("runtime: function symbol table header:", hex(hdr.magic), hex(hdr.pad1), hex(hdr.pad2), hex(hdr.minLC), hex(hdr.ptrSize))
+ if datap.pluginpath != "" {
+ print(", plugin:", datap.pluginpath)
+ }
+ println()
throw("invalid function symbol table\n")
}
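
For reference, a sketch of the header validated above; the field names are
assumptions modeled on the runtime's pcHeader, with the later offset fields
omitted:

    // Assumed shape of the pclntab header:
    type pcHeader struct {
            magic   uint32 // 0xFFFFFFFA
            pad1    uint8  // 0, reserved
            pad2    uint8  // 0, reserved
            minLC   uint8  // min instruction size (sys.PCQuantum)
            ptrSize uint8  // size of a uintptr in bytes
            // function/file table offsets omitted
    }
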
@@ -572,7 +581,11 @@ func moduledataverify1(datap *moduledata) {
if i+1 < nftab {
f2name = funcname(f2)
}
- println("function symbol table not sorted by program counter:", hex(datap.ftab[i].entry), funcname(f1), ">", hex(datap.ftab[i+1].entry), f2name)
+ print("function symbol table not sorted by program counter:", hex(datap.ftab[i].entry), funcname(f1), ">", hex(datap.ftab[i+1].entry), f2name)
+ if datap.pluginpath != "" {
+ print(", plugin:", datap.pluginpath)
+ }
+ println()
for j := 0; j <= i; j++ {
print("\t", hex(datap.ftab[j].entry), " ", funcname(funcInfo{(*_func)(unsafe.Pointer(&datap.pclntable[datap.ftab[j].funcoff])), datap}), "\n")
}
@@ -667,6 +680,12 @@ func (f *Func) FileLine(pc uintptr) (file string, line int) {
return file, int(line32)
}
+// findmoduledatap looks up the moduledata for a PC.
+//
+// It is nosplit because it's part of the isgoexception
+// implementation.
+//
+//go:nosplit
func findmoduledatap(pc uintptr) *moduledata {
for datap := &firstmoduledata; datap != nil; datap = datap.next {
if datap.minpc <= pc && pc < datap.maxpc {
@@ -689,6 +708,12 @@ func (f funcInfo) _Func() *Func {
return (*Func)(unsafe.Pointer(f._func))
}
+// findfunc looks up function metadata for a PC.
+//
+// It is nosplit because it's part of the isgoexception
+// implementation.
+//
+//go:nosplit
func findfunc(pc uintptr) funcInfo {
datap := findmoduledatap(pc)
if datap == nil {
diff --git a/src/runtime/symtab_test.go b/src/runtime/symtab_test.go
index 01e5002659..ffa07c7f3a 100644
--- a/src/runtime/symtab_test.go
+++ b/src/runtime/symtab_test.go
@@ -8,6 +8,7 @@ import (
"runtime"
"strings"
"testing"
+ "unsafe"
)
func TestCaller(t *testing.T) {
@@ -165,3 +166,87 @@ func TestNilName(t *testing.T) {
t.Errorf("Name() = %q, want %q", got, "")
}
}
+
+var dummy int
+
+func inlined() {
+ // Side effect to prevent elimination of this entire function.
+ dummy = 42
+}
+
+// A function with an InlTree. Returns a PC within the function body.
+//
+// Marked noinline to ensure this complete function appears in the output.
+//
+//go:noinline
+func tracebackFunc(t *testing.T) uintptr {
+ // This body must be more complex than a single call to inlined to get
+ // an inline tree.
+ inlined()
+ inlined()
+
+ // Acquire a PC in this function.
+ pc, _, _, ok := runtime.Caller(0)
+ if !ok {
+ t.Fatalf("Caller(0) got ok false, want true")
+ }
+
+ return pc
+}
+
+// Test that CallersFrames handles PCs in the alignment region between
+// functions (int 3 on amd64) without crashing.
+//
+// Go will never generate a stack trace containing such an address, as it is
+// not a valid call site. However, the cgo traceback function passed to
+// runtime.SetCgoTraceback may not be completely accurate and may incorrectly
+// provide PCs in Go code or the alignment region between functions.
+//
+// Go obviously doesn't easily expose the problematic PCs to running programs,
+// so this test is a bit fragile. Some details:
+//
+// * tracebackFunc is our target function. We want to get a PC in the
+// alignment region following this function. This function also has other
+// functions inlined into it to ensure it has an InlTree (this was the source
+// of the bug in issue 44971).
+//
+// * We acquire a PC in tracebackFunc, walking forwards until FuncForPC says
+// we're in a new function. The last PC of the function according to FuncForPC
+// should be in the alignment region (assuming the function isn't already
+// perfectly aligned).
+//
+// This is a regression test for issue 44971.
+func TestFunctionAlignmentTraceback(t *testing.T) {
+ pc := tracebackFunc(t)
+
+ // Double-check we got the right PC.
+ f := runtime.FuncForPC(pc)
+ if !strings.HasSuffix(f.Name(), "tracebackFunc") {
+ t.Fatalf("Caller(0) = %+v, want tracebackFunc", f)
+ }
+
+	// Iterate forward until we find a different function, then back up
+	// one instruction, which is (hopefully) an alignment instruction.
+ for runtime.FuncForPC(pc) == f {
+ pc++
+ }
+ pc--
+
+ // Is this an alignment region filler instruction? We only check this
+ // on amd64 for simplicity. If this function has no filler, then we may
+ // get a false negative, but will never get a false positive.
+ if runtime.GOARCH == "amd64" {
+ code := *(*uint8)(unsafe.Pointer(pc))
+ if code != 0xcc { // INT $3
+ t.Errorf("PC %v code got %#x want 0xcc", pc, code)
+ }
+ }
+
+ // Finally ensure that Frames.Next doesn't crash when processing this
+ // PC.
+ frames := runtime.CallersFrames([]uintptr{pc})
+ frame, _ := frames.Next()
+ if frame.Func != f {
+ t.Errorf("frames.Next() got %+v want %+v", frame.Func, f)
+ }
+}
diff --git a/src/runtime/sys_darwin.go b/src/runtime/sys_darwin.go
index dacce2ee1a..0f91685d6c 100644
--- a/src/runtime/sys_darwin.go
+++ b/src/runtime/sys_darwin.go
@@ -4,7 +4,10 @@
package runtime
-import "unsafe"
+import (
+ "internal/abi"
+ "unsafe"
+)
// The X versions of syscall expect the libc call to return a 64-bit result.
// Otherwise (the non-X version) expects a 32-bit result.
@@ -17,7 +20,7 @@ import "unsafe"
//go:cgo_unsafe_args
func syscall_syscall(fn, a1, a2, a3 uintptr) (r1, r2, err uintptr) {
entersyscall()
- libcCall(unsafe.Pointer(funcPC(syscall)), unsafe.Pointer(&fn))
+ libcCall(unsafe.Pointer(abi.FuncPCABI0(syscall)), unsafe.Pointer(&fn))
exitsyscall()
return
}
@@ -27,8 +30,8 @@ func syscall()
//go:nosplit
//go:cgo_unsafe_args
func syscall_syscallX(fn, a1, a2, a3 uintptr) (r1, r2, err uintptr) {
- entersyscallblock()
- libcCall(unsafe.Pointer(funcPC(syscallX)), unsafe.Pointer(&fn))
+ entersyscall()
+ libcCall(unsafe.Pointer(abi.FuncPCABI0(syscallX)), unsafe.Pointer(&fn))
exitsyscall()
return
}
@@ -39,7 +42,7 @@ func syscallX()
//go:cgo_unsafe_args
func syscall_syscall6(fn, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr) {
entersyscall()
- libcCall(unsafe.Pointer(funcPC(syscall6)), unsafe.Pointer(&fn))
+ libcCall(unsafe.Pointer(abi.FuncPCABI0(syscall6)), unsafe.Pointer(&fn))
exitsyscall()
return
}
@@ -50,7 +53,7 @@ func syscall6()
//go:cgo_unsafe_args
func syscall_syscall6X(fn, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr) {
entersyscall()
- libcCall(unsafe.Pointer(funcPC(syscall6X)), unsafe.Pointer(&fn))
+ libcCall(unsafe.Pointer(abi.FuncPCABI0(syscall6X)), unsafe.Pointer(&fn))
exitsyscall()
return
}
@@ -61,7 +64,7 @@ func syscall6X()
//go:cgo_unsafe_args
func syscall_syscallPtr(fn, a1, a2, a3 uintptr) (r1, r2, err uintptr) {
entersyscall()
- libcCall(unsafe.Pointer(funcPC(syscallPtr)), unsafe.Pointer(&fn))
+ libcCall(unsafe.Pointer(abi.FuncPCABI0(syscallPtr)), unsafe.Pointer(&fn))
exitsyscall()
return
}
@@ -71,7 +74,7 @@ func syscallPtr()
//go:nosplit
//go:cgo_unsafe_args
func syscall_rawSyscall(fn, a1, a2, a3 uintptr) (r1, r2, err uintptr) {
- libcCall(unsafe.Pointer(funcPC(syscall)), unsafe.Pointer(&fn))
+ libcCall(unsafe.Pointer(abi.FuncPCABI0(syscall)), unsafe.Pointer(&fn))
return
}
@@ -79,7 +82,7 @@ func syscall_rawSyscall(fn, a1, a2, a3 uintptr) (r1, r2, err uintptr) {
//go:nosplit
//go:cgo_unsafe_args
func syscall_rawSyscall6(fn, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr) {
- libcCall(unsafe.Pointer(funcPC(syscall6)), unsafe.Pointer(&fn))
+ libcCall(unsafe.Pointer(abi.FuncPCABI0(syscall6)), unsafe.Pointer(&fn))
return
}
@@ -90,7 +93,7 @@ func syscall_rawSyscall6(fn, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintpt
//go:cgo_unsafe_args
func crypto_x509_syscall(fn, a1, a2, a3, a4, a5, a6 uintptr) (r1 uintptr) {
entersyscall()
- libcCall(unsafe.Pointer(funcPC(syscallNoErr)), unsafe.Pointer(&fn))
+ libcCall(unsafe.Pointer(abi.FuncPCABI0(syscallNoErr)), unsafe.Pointer(&fn))
exitsyscall()
return
}
@@ -102,42 +105,42 @@ func syscallNoErr()
//go:nosplit
//go:cgo_unsafe_args
func pthread_attr_init(attr *pthreadattr) int32 {
- return libcCall(unsafe.Pointer(funcPC(pthread_attr_init_trampoline)), unsafe.Pointer(&attr))
+ return libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_attr_init_trampoline)), unsafe.Pointer(&attr))
}
func pthread_attr_init_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func pthread_attr_getstacksize(attr *pthreadattr, size *uintptr) int32 {
- return libcCall(unsafe.Pointer(funcPC(pthread_attr_getstacksize_trampoline)), unsafe.Pointer(&attr))
+ return libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_attr_getstacksize_trampoline)), unsafe.Pointer(&attr))
}
func pthread_attr_getstacksize_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func pthread_attr_setdetachstate(attr *pthreadattr, state int) int32 {
- return libcCall(unsafe.Pointer(funcPC(pthread_attr_setdetachstate_trampoline)), unsafe.Pointer(&attr))
+ return libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_attr_setdetachstate_trampoline)), unsafe.Pointer(&attr))
}
func pthread_attr_setdetachstate_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func pthread_create(attr *pthreadattr, start uintptr, arg unsafe.Pointer) int32 {
- return libcCall(unsafe.Pointer(funcPC(pthread_create_trampoline)), unsafe.Pointer(&attr))
+ return libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_create_trampoline)), unsafe.Pointer(&attr))
}
func pthread_create_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func raise(sig uint32) {
- libcCall(unsafe.Pointer(funcPC(raise_trampoline)), unsafe.Pointer(&sig))
+ libcCall(unsafe.Pointer(abi.FuncPCABI0(raise_trampoline)), unsafe.Pointer(&sig))
}
func raise_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func pthread_self() (t pthread) {
- libcCall(unsafe.Pointer(funcPC(pthread_self_trampoline)), unsafe.Pointer(&t))
+ libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_self_trampoline)), unsafe.Pointer(&t))
return
}
func pthread_self_trampoline()
@@ -145,7 +148,7 @@ func pthread_self_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func pthread_kill(t pthread, sig uint32) {
- libcCall(unsafe.Pointer(funcPC(pthread_kill_trampoline)), unsafe.Pointer(&t))
+ libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_kill_trampoline)), unsafe.Pointer(&t))
return
}
func pthread_kill_trampoline()
@@ -163,7 +166,7 @@ func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) (un
ret1 unsafe.Pointer
ret2 int
}{addr, n, prot, flags, fd, off, nil, 0}
- libcCall(unsafe.Pointer(funcPC(mmap_trampoline)), unsafe.Pointer(&args))
+ libcCall(unsafe.Pointer(abi.FuncPCABI0(mmap_trampoline)), unsafe.Pointer(&args))
return args.ret1, args.ret2
}
func mmap_trampoline()
@@ -171,34 +174,34 @@ func mmap_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func munmap(addr unsafe.Pointer, n uintptr) {
- libcCall(unsafe.Pointer(funcPC(munmap_trampoline)), unsafe.Pointer(&addr))
+ libcCall(unsafe.Pointer(abi.FuncPCABI0(munmap_trampoline)), unsafe.Pointer(&addr))
}
func munmap_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func madvise(addr unsafe.Pointer, n uintptr, flags int32) {
- libcCall(unsafe.Pointer(funcPC(madvise_trampoline)), unsafe.Pointer(&addr))
+ libcCall(unsafe.Pointer(abi.FuncPCABI0(madvise_trampoline)), unsafe.Pointer(&addr))
}
func madvise_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func mlock(addr unsafe.Pointer, n uintptr) {
- libcCall(unsafe.Pointer(funcPC(mlock_trampoline)), unsafe.Pointer(&addr))
+ libcCall(unsafe.Pointer(abi.FuncPCABI0(mlock_trampoline)), unsafe.Pointer(&addr))
}
func mlock_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func read(fd int32, p unsafe.Pointer, n int32) int32 {
- return libcCall(unsafe.Pointer(funcPC(read_trampoline)), unsafe.Pointer(&fd))
+ return libcCall(unsafe.Pointer(abi.FuncPCABI0(read_trampoline)), unsafe.Pointer(&fd))
}
func read_trampoline()
func pipe() (r, w int32, errno int32) {
var p [2]int32
- errno = libcCall(unsafe.Pointer(funcPC(pipe_trampoline)), noescape(unsafe.Pointer(&p)))
+ errno = libcCall(unsafe.Pointer(abi.FuncPCABI0(pipe_trampoline)), noescape(unsafe.Pointer(&p)))
return p[0], p[1], errno
}
func pipe_trampoline()
@@ -206,7 +209,7 @@ func pipe_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func closefd(fd int32) int32 {
- return libcCall(unsafe.Pointer(funcPC(close_trampoline)), unsafe.Pointer(&fd))
+ return libcCall(unsafe.Pointer(abi.FuncPCABI0(close_trampoline)), unsafe.Pointer(&fd))
}
func close_trampoline()
@@ -216,34 +219,34 @@ func close_trampoline()
// This is exported via linkname to assembly in runtime/cgo.
//go:linkname exit
func exit(code int32) {
- libcCall(unsafe.Pointer(funcPC(exit_trampoline)), unsafe.Pointer(&code))
+ libcCall(unsafe.Pointer(abi.FuncPCABI0(exit_trampoline)), unsafe.Pointer(&code))
}
func exit_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func usleep(usec uint32) {
- libcCall(unsafe.Pointer(funcPC(usleep_trampoline)), unsafe.Pointer(&usec))
+ libcCall(unsafe.Pointer(abi.FuncPCABI0(usleep_trampoline)), unsafe.Pointer(&usec))
}
func usleep_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func usleep_no_g(usec uint32) {
- asmcgocall_no_g(unsafe.Pointer(funcPC(usleep_trampoline)), unsafe.Pointer(&usec))
+ asmcgocall_no_g(unsafe.Pointer(abi.FuncPCABI0(usleep_trampoline)), unsafe.Pointer(&usec))
}
//go:nosplit
//go:cgo_unsafe_args
func write1(fd uintptr, p unsafe.Pointer, n int32) int32 {
- return libcCall(unsafe.Pointer(funcPC(write_trampoline)), unsafe.Pointer(&fd))
+ return libcCall(unsafe.Pointer(abi.FuncPCABI0(write_trampoline)), unsafe.Pointer(&fd))
}
func write_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func open(name *byte, mode, perm int32) (ret int32) {
- return libcCall(unsafe.Pointer(funcPC(open_trampoline)), unsafe.Pointer(&name))
+ return libcCall(unsafe.Pointer(abi.FuncPCABI0(open_trampoline)), unsafe.Pointer(&name))
}
func open_trampoline()
@@ -254,7 +257,7 @@ func nanotime1() int64 {
t int64 // raw timer
numer, denom uint32 // conversion factors. nanoseconds = t * numer / denom.
}
- libcCall(unsafe.Pointer(funcPC(nanotime_trampoline)), unsafe.Pointer(&r))
+ libcCall(unsafe.Pointer(abi.FuncPCABI0(nanotime_trampoline)), unsafe.Pointer(&r))
// Note: Apple seems unconcerned about overflow here. See
// https://developer.apple.com/library/content/qa/qa1398/_index.html
// Note also, numer == denom == 1 is common.
@@ -271,9 +274,9 @@ func nanotime_trampoline()
//go:nosplit
//go:cgo_unsafe_args
-func walltime1() (int64, int32) {
+func walltime() (int64, int32) {
var t timespec
- libcCall(unsafe.Pointer(funcPC(walltime_trampoline)), unsafe.Pointer(&t))
+ libcCall(unsafe.Pointer(abi.FuncPCABI0(walltime_trampoline)), unsafe.Pointer(&t))
return t.tv_sec, int32(t.tv_nsec)
}
func walltime_trampoline()
@@ -281,14 +284,14 @@ func walltime_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func sigaction(sig uint32, new *usigactiont, old *usigactiont) {
- libcCall(unsafe.Pointer(funcPC(sigaction_trampoline)), unsafe.Pointer(&sig))
+ libcCall(unsafe.Pointer(abi.FuncPCABI0(sigaction_trampoline)), unsafe.Pointer(&sig))
}
func sigaction_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func sigprocmask(how uint32, new *sigset, old *sigset) {
- libcCall(unsafe.Pointer(funcPC(sigprocmask_trampoline)), unsafe.Pointer(&how))
+ libcCall(unsafe.Pointer(abi.FuncPCABI0(sigprocmask_trampoline)), unsafe.Pointer(&how))
}
func sigprocmask_trampoline()
@@ -302,49 +305,49 @@ func sigaltstack(new *stackt, old *stackt) {
// ref: http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20140421/214296.html
new.ss_size = 32768
}
- libcCall(unsafe.Pointer(funcPC(sigaltstack_trampoline)), unsafe.Pointer(&new))
+ libcCall(unsafe.Pointer(abi.FuncPCABI0(sigaltstack_trampoline)), unsafe.Pointer(&new))
}
func sigaltstack_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func raiseproc(sig uint32) {
- libcCall(unsafe.Pointer(funcPC(raiseproc_trampoline)), unsafe.Pointer(&sig))
+ libcCall(unsafe.Pointer(abi.FuncPCABI0(raiseproc_trampoline)), unsafe.Pointer(&sig))
}
func raiseproc_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func setitimer(mode int32, new, old *itimerval) {
- libcCall(unsafe.Pointer(funcPC(setitimer_trampoline)), unsafe.Pointer(&mode))
+ libcCall(unsafe.Pointer(abi.FuncPCABI0(setitimer_trampoline)), unsafe.Pointer(&mode))
}
func setitimer_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func sysctl(mib *uint32, miblen uint32, oldp *byte, oldlenp *uintptr, newp *byte, newlen uintptr) int32 {
- return libcCall(unsafe.Pointer(funcPC(sysctl_trampoline)), unsafe.Pointer(&mib))
+ return libcCall(unsafe.Pointer(abi.FuncPCABI0(sysctl_trampoline)), unsafe.Pointer(&mib))
}
func sysctl_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func sysctlbyname(name *byte, oldp *byte, oldlenp *uintptr, newp *byte, newlen uintptr) int32 {
- return libcCall(unsafe.Pointer(funcPC(sysctlbyname_trampoline)), unsafe.Pointer(&name))
+ return libcCall(unsafe.Pointer(abi.FuncPCABI0(sysctlbyname_trampoline)), unsafe.Pointer(&name))
}
func sysctlbyname_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func fcntl(fd, cmd, arg int32) int32 {
- return libcCall(unsafe.Pointer(funcPC(fcntl_trampoline)), unsafe.Pointer(&fd))
+ return libcCall(unsafe.Pointer(abi.FuncPCABI0(fcntl_trampoline)), unsafe.Pointer(&fd))
}
func fcntl_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func kqueue() int32 {
- v := libcCall(unsafe.Pointer(funcPC(kqueue_trampoline)), nil)
+ v := libcCall(unsafe.Pointer(abi.FuncPCABI0(kqueue_trampoline)), nil)
return v
}
func kqueue_trampoline()
@@ -352,56 +355,56 @@ func kqueue_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func kevent(kq int32, ch *keventt, nch int32, ev *keventt, nev int32, ts *timespec) int32 {
- return libcCall(unsafe.Pointer(funcPC(kevent_trampoline)), unsafe.Pointer(&kq))
+ return libcCall(unsafe.Pointer(abi.FuncPCABI0(kevent_trampoline)), unsafe.Pointer(&kq))
}
func kevent_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func pthread_mutex_init(m *pthreadmutex, attr *pthreadmutexattr) int32 {
- return libcCall(unsafe.Pointer(funcPC(pthread_mutex_init_trampoline)), unsafe.Pointer(&m))
+ return libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_mutex_init_trampoline)), unsafe.Pointer(&m))
}
func pthread_mutex_init_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func pthread_mutex_lock(m *pthreadmutex) int32 {
- return libcCall(unsafe.Pointer(funcPC(pthread_mutex_lock_trampoline)), unsafe.Pointer(&m))
+ return libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_mutex_lock_trampoline)), unsafe.Pointer(&m))
}
func pthread_mutex_lock_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func pthread_mutex_unlock(m *pthreadmutex) int32 {
- return libcCall(unsafe.Pointer(funcPC(pthread_mutex_unlock_trampoline)), unsafe.Pointer(&m))
+ return libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_mutex_unlock_trampoline)), unsafe.Pointer(&m))
}
func pthread_mutex_unlock_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func pthread_cond_init(c *pthreadcond, attr *pthreadcondattr) int32 {
- return libcCall(unsafe.Pointer(funcPC(pthread_cond_init_trampoline)), unsafe.Pointer(&c))
+ return libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_cond_init_trampoline)), unsafe.Pointer(&c))
}
func pthread_cond_init_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func pthread_cond_wait(c *pthreadcond, m *pthreadmutex) int32 {
- return libcCall(unsafe.Pointer(funcPC(pthread_cond_wait_trampoline)), unsafe.Pointer(&c))
+ return libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_cond_wait_trampoline)), unsafe.Pointer(&c))
}
func pthread_cond_wait_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func pthread_cond_timedwait_relative_np(c *pthreadcond, m *pthreadmutex, t *timespec) int32 {
- return libcCall(unsafe.Pointer(funcPC(pthread_cond_timedwait_relative_np_trampoline)), unsafe.Pointer(&c))
+ return libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_cond_timedwait_relative_np_trampoline)), unsafe.Pointer(&c))
}
func pthread_cond_timedwait_relative_np_trampoline()
//go:nosplit
//go:cgo_unsafe_args
func pthread_cond_signal(c *pthreadcond) int32 {
- return libcCall(unsafe.Pointer(funcPC(pthread_cond_signal_trampoline)), unsafe.Pointer(&c))
+ return libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_cond_signal_trampoline)), unsafe.Pointer(&c))
}
func pthread_cond_signal_trampoline()
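
The funcPC-to-abi.FuncPCABI0 conversion throughout this file matters under
the register ABI: funcPC on a Go-visible symbol could resolve to an
ABIInternal wrapper, whereas abi.FuncPCABI0 always yields the ABI0 (assembly)
entry point that libcCall must jump to. A hedged sketch of the pattern, with
a hypothetical wrapper name; raise_trampoline is the assembly trampoline
declared above:

    // raiseSketch is a hypothetical illustration, not code from this CL.
    func raiseSketch(sig uint32) {
            fn := unsafe.Pointer(abi.FuncPCABI0(raise_trampoline))
            libcCall(fn, unsafe.Pointer(&sig)) // C convention: argument block via &sig
    }
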
diff --git a/src/runtime/sys_darwin_amd64.s b/src/runtime/sys_darwin_amd64.s
index 0fe8c7e172..3bd027f982 100644
--- a/src/runtime/sys_darwin_amd64.s
+++ b/src/runtime/sys_darwin_amd64.s
@@ -5,17 +5,16 @@
// System calls and other sys.stuff for AMD64, Darwin
// System calls are implemented in libSystem; this file contains
// trampolines that convert from the Go to the C calling convention.
-// The trampolines are ABIInternal as they are referenced from
-// Go code with funcPC.
#include "go_asm.h"
#include "go_tls.h"
#include "textflag.h"
+#include "cgo/abi_amd64.h"
#define CLOCK_REALTIME 0
// Exit the entire program (like C exit)
-TEXT runtime·exit_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·exit_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVL 0(DI), DI // arg 1 exit status
@@ -24,7 +23,7 @@ TEXT runtime·exit_trampoline<ABIInternal>(SB),NOSPLIT,$0
POPQ BP
RET
-TEXT runtime·open_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·open_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVL 8(DI), SI // arg 2 flags
@@ -35,7 +34,7 @@ TEXT runtime·open_trampoline<ABIInternal>(SB),NOSPLIT,$0
POPQ BP
RET
-TEXT runtime·close_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·close_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVL 0(DI), DI // arg 1 fd
@@ -43,7 +42,7 @@ TEXT runtime·close_trampoline<ABIInternal>(SB),NOSPLIT,$0
POPQ BP
RET
-TEXT runtime·read_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·read_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVQ 8(DI), SI // arg 2 buf
@@ -59,7 +58,7 @@ noerr:
POPQ BP
RET
-TEXT runtime·write_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·write_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVQ 8(DI), SI // arg 2 buf
@@ -75,7 +74,7 @@ noerr:
POPQ BP
RET
-TEXT runtime·pipe_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·pipe_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
CALL libc_pipe(SB) // pointer already in DI
@@ -86,7 +85,7 @@ TEXT runtime·pipe_trampoline<ABIInternal>(SB),NOSPLIT,$0
POPQ BP
RET
-TEXT runtime·setitimer_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·setitimer_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVQ 8(DI), SI // arg 2 new
@@ -96,7 +95,7 @@ TEXT runtime·setitimer_trampoline<ABIInternal>(SB),NOSPLIT,$0
POPQ BP
RET
-TEXT runtime·madvise_trampoline<ABIInternal>(SB), NOSPLIT, $0
+TEXT runtime·madvise_trampoline(SB), NOSPLIT, $0
PUSHQ BP
MOVQ SP, BP
MOVQ 8(DI), SI // arg 2 len
@@ -107,12 +106,12 @@ TEXT runtime·madvise_trampoline<ABIInternal>(SB), NOSPLIT, $0
POPQ BP
RET
-TEXT runtime·mlock_trampoline<ABIInternal>(SB), NOSPLIT, $0
+TEXT runtime·mlock_trampoline(SB), NOSPLIT, $0
UNDEF // unimplemented
GLOBL timebase<>(SB),NOPTR,$(machTimebaseInfo__size)
-TEXT runtime·nanotime_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·nanotime_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVQ DI, BX
@@ -141,7 +140,7 @@ initialized:
POPQ BP
RET
-TEXT runtime·walltime_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·walltime_trampoline(SB),NOSPLIT,$0
PUSHQ BP // make a frame; keep stack aligned
MOVQ SP, BP
MOVQ DI, SI // arg 2 timespec
@@ -150,7 +149,7 @@ TEXT runtime·walltime_trampoline<ABIInternal>(SB),NOSPLIT,$0
POPQ BP
RET
-TEXT runtime·sigaction_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·sigaction_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVQ 8(DI), SI // arg 2 new
@@ -163,7 +162,7 @@ TEXT runtime·sigaction_trampoline<ABIInternal>(SB),NOSPLIT,$0
POPQ BP
RET
-TEXT runtime·sigprocmask_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·sigprocmask_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVQ 8(DI), SI // arg 2 new
@@ -176,7 +175,7 @@ TEXT runtime·sigprocmask_trampoline<ABIInternal>(SB),NOSPLIT,$0
POPQ BP
RET
-TEXT runtime·sigaltstack_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·sigaltstack_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVQ 8(DI), SI // arg 2 old
@@ -188,7 +187,7 @@ TEXT runtime·sigaltstack_trampoline<ABIInternal>(SB),NOSPLIT,$0
POPQ BP
RET
-TEXT runtime·raiseproc_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·raiseproc_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVL 0(DI), BX // signal
@@ -214,41 +213,26 @@ TEXT runtime·sigfwd(SB),NOSPLIT,$0-32
// This is the function registered during sigaction and is invoked when
// a signal is received. It just redirects to the Go function sigtrampgo.
-TEXT runtime·sigtramp<ABIInternal>(SB),NOSPLIT,$0
- // This runs on the signal stack, so we have lots of stack available.
- // We allocate our own stack space, because if we tell the linker
- // how much we're using, the NOSPLIT check fails.
- PUSHQ BP
- MOVQ SP, BP
- SUBQ $64, SP
-
- // Save callee-save registers.
- MOVQ BX, 24(SP)
- MOVQ R12, 32(SP)
- MOVQ R13, 40(SP)
- MOVQ R14, 48(SP)
- MOVQ R15, 56(SP)
+// Called using C ABI.
+TEXT runtime·sigtramp(SB),NOSPLIT,$0
+ // Transition from C ABI to Go ABI.
+ PUSH_REGS_HOST_TO_ABI0()
// Call into the Go signal handler
- MOVL DI, 0(SP) // sig
- MOVQ SI, 8(SP) // info
- MOVQ DX, 16(SP) // ctx
- CALL runtime·sigtrampgo(SB)
-
- // Restore callee-save registers.
- MOVQ 24(SP), BX
- MOVQ 32(SP), R12
- MOVQ 40(SP), R13
- MOVQ 48(SP), R14
- MOVQ 56(SP), R15
+ NOP SP // disable vet stack checking
+ ADJSP $24
+ MOVL DI, 0(SP) // sig
+ MOVQ SI, 8(SP) // info
+ MOVQ DX, 16(SP) // ctx
+ CALL ·sigtrampgo(SB)
+ ADJSP $-24
- MOVQ BP, SP
- POPQ BP
+ POP_REGS_HOST_TO_ABI0()
RET
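
The rewritten trampoline now calls the Go handler directly: ADJSP manually
reserves the outgoing argument space (the frame size is declared $0), and
NOP SP tells vet to stop checking stack offsets against that declared frame.
The assumed signature of the CALL target, matching the sig/info/ctx stores at
0/8/16(SP) above:

    // Assumed Go-side declaration (lives in signal_unix.go):
    //go:nosplit
    func sigtrampgo(sig uint32, info *siginfo, ctx unsafe.Pointer)
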
// Used instead of sigtramp in programs that use cgo.
// Arguments from kernel are in DI, SI, DX.
-TEXT runtime·cgoSigtramp<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0
// If no traceback function, do usual sigtramp.
MOVQ runtime·cgoTraceback(SB), AX
TESTQ AX, AX
@@ -291,12 +275,12 @@ TEXT runtime·cgoSigtramp<ABIInternal>(SB),NOSPLIT,$0
// The first three arguments, and the fifth, are already in registers.
// Set the two remaining arguments now.
MOVQ runtime·cgoTraceback(SB), CX
- MOVQ $runtime·sigtramp<ABIInternal>(SB), R9
+ MOVQ $runtime·sigtramp(SB), R9
MOVQ _cgo_callers(SB), AX
JMP AX
sigtramp:
- JMP runtime·sigtramp<ABIInternal>(SB)
+ JMP runtime·sigtramp(SB)
sigtrampnog:
// Signal arrived on a non-Go thread. If this is SIGPROF, get a
@@ -322,7 +306,7 @@ sigtrampnog:
MOVQ _cgo_callers(SB), AX
JMP AX
-TEXT runtime·mmap_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·mmap_trampoline(SB),NOSPLIT,$0
PUSHQ BP // make a frame; keep stack aligned
MOVQ SP, BP
MOVQ DI, BX
@@ -345,7 +329,7 @@ ok:
POPQ BP
RET
-TEXT runtime·munmap_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·munmap_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVQ 8(DI), SI // arg 2 len
@@ -357,7 +341,7 @@ TEXT runtime·munmap_trampoline<ABIInternal>(SB),NOSPLIT,$0
POPQ BP
RET
-TEXT runtime·usleep_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·usleep_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVL 0(DI), DI // arg 1 usec
@@ -369,7 +353,7 @@ TEXT runtime·settls(SB),NOSPLIT,$32
// Nothing to do on Darwin, pthread already set thread-local storage up.
RET
-TEXT runtime·sysctl_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·sysctl_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVL 8(DI), SI // arg 2 miblen
@@ -382,7 +366,7 @@ TEXT runtime·sysctl_trampoline<ABIInternal>(SB),NOSPLIT,$0
POPQ BP
RET
-TEXT runtime·sysctlbyname_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·sysctlbyname_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVQ 8(DI), SI // arg 2 oldp
@@ -394,14 +378,14 @@ TEXT runtime·sysctlbyname_trampoline<ABIInternal>(SB),NOSPLIT,$0
POPQ BP
RET
-TEXT runtime·kqueue_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·kqueue_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
CALL libc_kqueue(SB)
POPQ BP
RET
-TEXT runtime·kevent_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·kevent_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVQ 8(DI), SI // arg 2 keventt
@@ -420,7 +404,7 @@ ok:
POPQ BP
RET
-TEXT runtime·fcntl_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·fcntl_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVL 4(DI), SI // arg 2 cmd
@@ -438,13 +422,8 @@ TEXT runtime·mstart_stub(SB),NOSPLIT,$0
// DI points to the m.
// We are already on m's g0 stack.
- // Save callee-save registers.
- SUBQ $40, SP
- MOVQ BX, 0(SP)
- MOVQ R12, 8(SP)
- MOVQ R13, 16(SP)
- MOVQ R14, 24(SP)
- MOVQ R15, 32(SP)
+ // Transition from C ABI to Go ABI.
+ PUSH_REGS_HOST_TO_ABI0()
MOVQ m_g0(DI), DX // g
@@ -452,24 +431,14 @@ TEXT runtime·mstart_stub(SB),NOSPLIT,$0
// See cmd/link/internal/ld/sym.go:computeTLSOffset.
MOVQ DX, 0x30(GS)
- // Someday the convention will be D is always cleared.
- CLD
-
CALL runtime·mstart(SB)
- // Restore callee-save registers.
- MOVQ 0(SP), BX
- MOVQ 8(SP), R12
- MOVQ 16(SP), R13
- MOVQ 24(SP), R14
- MOVQ 32(SP), R15
+ POP_REGS_HOST_TO_ABI0()
// Go is all done with this OS thread.
// Tell pthread everything is ok (we never join with this thread, so
// the value here doesn't really matter).
XORL AX, AX
-
- ADDQ $40, SP
RET
// These trampolines help convert from Go calling convention to C calling convention.
@@ -477,7 +446,7 @@ TEXT runtime·mstart_stub(SB),NOSPLIT,$0
// A pointer to the arguments is passed in DI.
// A single int32 result is returned in AX.
// (For more results, make an args/results structure.)
-TEXT runtime·pthread_attr_init_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·pthread_attr_init_trampoline(SB),NOSPLIT,$0
PUSHQ BP // make frame, keep stack 16-byte aligned.
MOVQ SP, BP
MOVQ 0(DI), DI // arg 1 attr
@@ -485,7 +454,7 @@ TEXT runtime·pthread_attr_init_trampoline<ABIInternal>(SB),NOSPLIT,$0
POPQ BP
RET
-TEXT runtime·pthread_attr_getstacksize_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·pthread_attr_getstacksize_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVQ 8(DI), SI // arg 2 size
@@ -494,7 +463,7 @@ TEXT runtime·pthread_attr_getstacksize_trampoline<ABIInternal>(SB),NOSPLIT,$0
POPQ BP
RET
-TEXT runtime·pthread_attr_setdetachstate_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·pthread_attr_setdetachstate_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVQ 8(DI), SI // arg 2 state
@@ -503,7 +472,7 @@ TEXT runtime·pthread_attr_setdetachstate_trampoline<ABIInternal>(SB),NOSPLIT,$0
POPQ BP
RET
-TEXT runtime·pthread_create_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·pthread_create_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
SUBQ $16, SP
@@ -516,7 +485,7 @@ TEXT runtime·pthread_create_trampoline<ABIInternal>(SB),NOSPLIT,$0
POPQ BP
RET
-TEXT runtime·raise_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·raise_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVL 0(DI), DI // arg 1 signal
@@ -524,7 +493,7 @@ TEXT runtime·raise_trampoline<ABIInternal>(SB),NOSPLIT,$0
POPQ BP
RET
-TEXT runtime·pthread_mutex_init_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·pthread_mutex_init_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVQ 8(DI), SI // arg 2 attr
@@ -533,7 +502,7 @@ TEXT runtime·pthread_mutex_init_trampoline<ABIInternal>(SB),NOSPLIT,$0
POPQ BP
RET
-TEXT runtime·pthread_mutex_lock_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·pthread_mutex_lock_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVQ 0(DI), DI // arg 1 mutex
@@ -541,7 +510,7 @@ TEXT runtime·pthread_mutex_lock_trampoline<ABIInternal>(SB),NOSPLIT,$0
POPQ BP
RET
-TEXT runtime·pthread_mutex_unlock_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·pthread_mutex_unlock_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVQ 0(DI), DI // arg 1 mutex
@@ -549,7 +518,7 @@ TEXT runtime·pthread_mutex_unlock_trampoline<ABIInternal>(SB),NOSPLIT,$0
POPQ BP
RET
-TEXT runtime·pthread_cond_init_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·pthread_cond_init_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVQ 8(DI), SI // arg 2 attr
@@ -558,7 +527,7 @@ TEXT runtime·pthread_cond_init_trampoline<ABIInternal>(SB),NOSPLIT,$0
POPQ BP
RET
-TEXT runtime·pthread_cond_wait_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·pthread_cond_wait_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVQ 8(DI), SI // arg 2 mutex
@@ -567,7 +536,7 @@ TEXT runtime·pthread_cond_wait_trampoline<ABIInternal>(SB),NOSPLIT,$0
POPQ BP
RET
-TEXT runtime·pthread_cond_timedwait_relative_np_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·pthread_cond_timedwait_relative_np_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVQ 8(DI), SI // arg 2 mutex
@@ -577,7 +546,7 @@ TEXT runtime·pthread_cond_timedwait_relative_np_trampoline<ABIInternal>(SB),NOS
POPQ BP
RET
-TEXT runtime·pthread_cond_signal_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·pthread_cond_signal_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVQ 0(DI), DI // arg 1 cond
@@ -585,7 +554,7 @@ TEXT runtime·pthread_cond_signal_trampoline<ABIInternal>(SB),NOSPLIT,$0
POPQ BP
RET
-TEXT runtime·pthread_self_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·pthread_self_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVQ DI, BX // BX is callee-save
@@ -594,7 +563,7 @@ TEXT runtime·pthread_self_trampoline<ABIInternal>(SB),NOSPLIT,$0
POPQ BP
RET
-TEXT runtime·pthread_kill_trampoline<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·pthread_kill_trampoline(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
MOVQ 8(DI), SI // arg 2 sig
@@ -619,7 +588,7 @@ TEXT runtime·pthread_kill_trampoline<ABIInternal>(SB),NOSPLIT,$0
//
// syscall expects a 32-bit result and tests for 32-bit -1
// to decide there was an error.
-TEXT runtime·syscall<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·syscall(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
SUBQ $16, SP
@@ -669,7 +638,7 @@ ok:
//
// syscallX is like syscall but expects a 64-bit result
// and tests for 64-bit -1 to decide there was an error.
-TEXT runtime·syscallX<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·syscallX(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
SUBQ $16, SP
@@ -705,7 +674,7 @@ ok:
// syscallPtr is like syscallX except that the libc function reports an
// error by returning NULL and setting errno.
-TEXT runtime·syscallPtr<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·syscallPtr(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
SUBQ $16, SP
@@ -758,7 +727,7 @@ ok:
//
// syscall6 expects a 32-bit result and tests for 32-bit -1
// to decide there was an error.
-TEXT runtime·syscall6<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·syscall6(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
SUBQ $16, SP
@@ -811,7 +780,7 @@ ok:
//
// syscall6X is like syscall6 but expects a 64-bit result
// and tests for 64-bit -1 to decide there was an error.
-TEXT runtime·syscall6X<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·syscall6X(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
SUBQ $16, SP
@@ -847,7 +816,7 @@ ok:
// syscallNoErr is like syscall6 but does not check for errors, and
// only returns one value, for use with standard C ABI library functions.
-TEXT runtime·syscallNoErr<ABIInternal>(SB),NOSPLIT,$0
+TEXT runtime·syscallNoErr(SB),NOSPLIT,$0
PUSHQ BP
MOVQ SP, BP
SUBQ $16, SP
diff --git a/src/runtime/sys_dragonfly_amd64.s b/src/runtime/sys_dragonfly_amd64.s
index 580633af55..d57bc2a7a4 100644
--- a/src/runtime/sys_dragonfly_amd64.s
+++ b/src/runtime/sys_dragonfly_amd64.s
@@ -9,6 +9,7 @@
#include "go_asm.h"
#include "go_tls.h"
#include "textflag.h"
+#include "cgo/abi_amd64.h"
TEXT runtime·sys_umtx_sleep(SB),NOSPLIT,$0
MOVQ addr+0(FP), DI // arg 1 - ptr
@@ -123,6 +124,24 @@ pipeok:
MOVL $0, errno+8(FP)
RET
+// func pipe2(flags int32) (r, w int32, errno int32)
+TEXT runtime·pipe2(SB),NOSPLIT,$0-20
+ MOVL $0, DI
+	// DragonFly expects flags as the 2nd argument
+ MOVL flags+0(FP), SI
+ MOVL $538, AX
+ SYSCALL
+ JCC pipe2ok
+ MOVL $-1,r+8(FP)
+ MOVL $-1,w+12(FP)
+ MOVL AX, errno+16(FP)
+ RET
+pipe2ok:
+ MOVL AX, r+8(FP)
+ MOVL DX, w+12(FP)
+ MOVL $0, errno+16(FP)
+ RET
+
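
A hypothetical Go-side caller of the new wrapper; the flag constants and the
helper name are assumptions borrowed from other BSD ports, not code in this
CL:

    // nonblockingPipeSketch is illustrative only.
    func nonblockingPipeSketch() (r, w int32) {
            r, w, errno := pipe2(_O_NONBLOCK | _O_CLOEXEC) // flag names assumed
            if errno != 0 {
                    throw("pipe2 failed")
            }
            return r, w
    }
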
TEXT runtime·write1(SB),NOSPLIT,$-8
MOVQ fd+0(FP), DI // arg 1 fd
MOVQ p+8(FP), SI // arg 2 buf
@@ -165,8 +184,8 @@ TEXT runtime·setitimer(SB), NOSPLIT, $-8
SYSCALL
RET
-// func walltime1() (sec int64, nsec int32)
-TEXT runtime·walltime1(SB), NOSPLIT, $32
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB), NOSPLIT, $32
MOVL $232, AX // clock_gettime
MOVQ $0, DI // CLOCK_REALTIME
LEAQ 8(SP), SI
@@ -217,28 +236,21 @@ TEXT runtime·sigfwd(SB),NOSPLIT,$0-32
POPQ BP
RET
-TEXT runtime·sigtramp(SB),NOSPLIT,$72
- // Save callee-saved C registers, since the caller may be a C signal handler.
- MOVQ BX, bx-8(SP)
- MOVQ BP, bp-16(SP) // save in case GOEXPERIMENT=noframepointer is set
- MOVQ R12, r12-24(SP)
- MOVQ R13, r13-32(SP)
- MOVQ R14, r14-40(SP)
- MOVQ R15, r15-48(SP)
- // We don't save mxcsr or the x87 control word because sigtrampgo doesn't
- // modify them.
+// Called using C ABI.
+TEXT runtime·sigtramp(SB),NOSPLIT,$0
+ // Transition from C ABI to Go ABI.
+ PUSH_REGS_HOST_TO_ABI0()
- MOVQ DX, ctx-56(SP)
- MOVQ SI, info-64(SP)
- MOVQ DI, signum-72(SP)
- CALL runtime·sigtrampgo(SB)
+ // Call into the Go signal handler
+ NOP SP // disable vet stack checking
+ ADJSP $24
+ MOVQ DI, 0(SP) // sig
+ MOVQ SI, 8(SP) // info
+ MOVQ DX, 16(SP) // ctx
+ CALL ·sigtrampgo(SB)
+ ADJSP $-24
- MOVQ r15-48(SP), R15
- MOVQ r14-40(SP), R14
- MOVQ r13-32(SP), R13
- MOVQ r12-24(SP), R12
- MOVQ bp-16(SP), BP
- MOVQ bx-8(SP), BX
+ POP_REGS_HOST_TO_ABI0()
RET
TEXT runtime·mmap(SB),NOSPLIT,$0
diff --git a/src/runtime/sys_freebsd_amd64.s b/src/runtime/sys_freebsd_amd64.s
index 07734b0d7d..71a60cae65 100644
--- a/src/runtime/sys_freebsd_amd64.s
+++ b/src/runtime/sys_freebsd_amd64.s
@@ -9,6 +9,7 @@
#include "go_asm.h"
#include "go_tls.h"
#include "textflag.h"
+#include "cgo/abi_amd64.h"
TEXT runtime·sys_umtx_op(SB),NOSPLIT,$0
MOVQ addr+0(FP), DI
@@ -237,28 +238,21 @@ TEXT runtime·sigfwd(SB),NOSPLIT,$0-32
POPQ BP
RET
-TEXT runtime·sigtramp(SB),NOSPLIT,$72
- // Save callee-saved C registers, since the caller may be a C signal handler.
- MOVQ BX, bx-8(SP)
- MOVQ BP, bp-16(SP) // save in case GOEXPERIMENT=noframepointer is set
- MOVQ R12, r12-24(SP)
- MOVQ R13, r13-32(SP)
- MOVQ R14, r14-40(SP)
- MOVQ R15, r15-48(SP)
- // We don't save mxcsr or the x87 control word because sigtrampgo doesn't
- // modify them.
+// Called using C ABI.
+TEXT runtime·sigtramp(SB),NOSPLIT,$0
+ // Transition from C ABI to Go ABI.
+ PUSH_REGS_HOST_TO_ABI0()
- MOVQ DX, ctx-56(SP)
- MOVQ SI, info-64(SP)
- MOVQ DI, signum-72(SP)
- CALL runtime·sigtrampgo(SB)
+ // Call into the Go signal handler
+ NOP SP // disable vet stack checking
+ ADJSP $24
+ MOVQ DI, 0(SP) // sig
+ MOVQ SI, 8(SP) // info
+ MOVQ DX, 16(SP) // ctx
+ CALL ·sigtrampgo(SB)
+ ADJSP $-24
- MOVQ r15-48(SP), R15
- MOVQ r14-40(SP), R14
- MOVQ r13-32(SP), R13
- MOVQ r12-24(SP), R12
- MOVQ bp-16(SP), BP
- MOVQ bx-8(SP), BX
+ POP_REGS_HOST_TO_ABI0()
RET
// Used instead of sigtramp in programs that use cgo.
diff --git a/src/runtime/sys_libc.go b/src/runtime/sys_libc.go
index 996c032105..91195eb3c0 100644
--- a/src/runtime/sys_libc.go
+++ b/src/runtime/sys_libc.go
@@ -2,7 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build darwin openbsd,amd64 openbsd,arm64
+//go:build darwin || (openbsd && 386) || (openbsd && amd64) || (openbsd && arm64)
+// +build darwin openbsd,386 openbsd,amd64 openbsd,arm64
package runtime
diff --git a/src/runtime/sys_linux_386.s b/src/runtime/sys_linux_386.s
index 1e3a834812..3ae5a9099f 100644
--- a/src/runtime/sys_linux_386.s
+++ b/src/runtime/sys_linux_386.s
@@ -219,8 +219,8 @@ TEXT runtime·mincore(SB),NOSPLIT,$0-16
MOVL AX, ret+12(FP)
RET
-// func walltime1() (sec int64, nsec int32)
-TEXT runtime·walltime1(SB), NOSPLIT, $8-12
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB), NOSPLIT, $8-12
// We don't know how much stack space the VDSO code will need,
// so switch to g0.
@@ -412,6 +412,7 @@ TEXT runtime·sigfwd(SB),NOSPLIT,$12-16
MOVL AX, SP
RET
+// Called using C ABI.
TEXT runtime·sigtramp(SB),NOSPLIT,$28
// Save callee-saved C registers, since the caller may be a C signal handler.
MOVL BX, bx-4(SP)
@@ -421,11 +422,11 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$28
// We don't save mxcsr or the x87 control word because sigtrampgo doesn't
// modify them.
- MOVL sig+0(FP), BX
+ MOVL (28+4)(SP), BX
MOVL BX, 0(SP)
- MOVL info+4(FP), BX
+ MOVL (28+8)(SP), BX
MOVL BX, 4(SP)
- MOVL ctx+8(FP), BX
+ MOVL (28+12)(SP), BX
MOVL BX, 8(SP)
CALL runtime·sigtrampgo(SB)
diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s
index d48573c2c5..33cc670b64 100644
--- a/src/runtime/sys_linux_amd64.s
+++ b/src/runtime/sys_linux_amd64.s
@@ -9,6 +9,7 @@
#include "go_asm.h"
#include "go_tls.h"
#include "textflag.h"
+#include "cgo/abi_amd64.h"
#define AT_FDCWD -100
@@ -203,9 +204,8 @@ TEXT runtime·mincore(SB),NOSPLIT,$0-28
MOVL AX, ret+24(FP)
RET
-// func walltime1() (sec int64, nsec int32)
-// non-zero frame-size means bp is saved and restored
-TEXT runtime·walltime1(SB),NOSPLIT,$16-12
+// func nanotime1() int64
+TEXT runtime·nanotime1(SB),NOSPLIT,$16-8
// We don't know how much stack space the VDSO code will need,
// so switch to g0.
// In particular, a kernel configured with CONFIG_OPTIMIZE_INLINING=n
@@ -215,75 +215,7 @@ TEXT runtime·walltime1(SB),NOSPLIT,$16-12
MOVQ SP, R12 // Save old SP; R12 unchanged by C code.
-#ifdef GOEXPERIMENT_REGABI
- MOVQ g_m(R14), BX // BX unchanged by C code.
-#else
- get_tls(CX)
- MOVQ g(CX), AX
- MOVQ g_m(AX), BX // BX unchanged by C code.
-#endif
-
- // Set vdsoPC and vdsoSP for SIGPROF traceback.
- // Save the old values on stack and restore them on exit,
- // so this function is reentrant.
- MOVQ m_vdsoPC(BX), CX
- MOVQ m_vdsoSP(BX), DX
- MOVQ CX, 0(SP)
- MOVQ DX, 8(SP)
-
- LEAQ sec+0(FP), DX
- MOVQ -8(DX), CX
- MOVQ CX, m_vdsoPC(BX)
- MOVQ DX, m_vdsoSP(BX)
-
-#ifdef GOEXPERIMENT_REGABI
- CMPQ R14, m_curg(BX) // Only switch if on curg.
-#else
- CMPQ AX, m_curg(BX) // Only switch if on curg.
-#endif
- JNE noswitch
-
- MOVQ m_g0(BX), DX
- MOVQ (g_sched+gobuf_sp)(DX), SP // Set SP to g0 stack
-
-noswitch:
- SUBQ $16, SP // Space for results
- ANDQ $~15, SP // Align for C code
-
- MOVL $0, DI // CLOCK_REALTIME
- LEAQ 0(SP), SI
- MOVQ runtime·vdsoClockgettimeSym(SB), AX
- CMPQ AX, $0
- JEQ fallback
- CALL AX
-ret:
- MOVQ 0(SP), AX // sec
- MOVQ 8(SP), DX // nsec
- MOVQ R12, SP // Restore real SP
- // Restore vdsoPC, vdsoSP
- // We don't worry about being signaled between the two stores.
- // If we are not in a signal handler, we'll restore vdsoSP to 0,
- // and no one will care about vdsoPC. If we are in a signal handler,
- // we cannot receive another signal.
- MOVQ 8(SP), CX
- MOVQ CX, m_vdsoSP(BX)
- MOVQ 0(SP), CX
- MOVQ CX, m_vdsoPC(BX)
- MOVQ AX, sec+0(FP)
- MOVL DX, nsec+8(FP)
- RET
-fallback:
- MOVQ $SYS_clock_gettime, AX
- SYSCALL
- JMP ret
-
-// func nanotime1() int64
-TEXT runtime·nanotime1(SB),NOSPLIT,$16-8
- // Switch to g0 stack. See comment above in runtime·walltime.
-
- MOVQ SP, R12 // Save old SP; R12 unchanged by C code.
-
-#ifdef GOEXPERIMENT_REGABI
+#ifdef GOEXPERIMENT_regabig
MOVQ g_m(R14), BX // BX unchanged by C code.
#else
get_tls(CX)
@@ -304,7 +236,7 @@ TEXT runtime·nanotime1(SB),NOSPLIT,$16-8
MOVQ CX, m_vdsoPC(BX)
MOVQ DX, m_vdsoSP(BX)
-#ifdef GOEXPERIMENT_REGABI
+#ifdef GOEXPERIMENT_regabig
CMPQ R14, m_curg(BX) // Only switch if on curg.
#else
CMPQ AX, m_curg(BX) // Only switch if on curg.
@@ -397,29 +329,21 @@ TEXT runtime·sigfwd(SB),NOSPLIT,$0-32
RET
// Defined as ABIInternal since it does not use the stack-based Go ABI.
-TEXT runtime·sigtramp<ABIInternal>(SB),NOSPLIT,$72
- // Save callee-saved C registers, since the caller may be a C signal handler.
- MOVQ BX, bx-8(SP)
- MOVQ BP, bp-16(SP) // save in case GOEXPERIMENT=noframepointer is set
- MOVQ R12, r12-24(SP)
- MOVQ R13, r13-32(SP)
- MOVQ R14, r14-40(SP)
- MOVQ R15, r15-48(SP)
- // We don't save mxcsr or the x87 control word because sigtrampgo doesn't
- // modify them.
+// Called using C ABI.
+TEXT runtime·sigtramp<ABIInternal>(SB),NOSPLIT,$0
+ // Transition from C ABI to Go ABI.
+ PUSH_REGS_HOST_TO_ABI0()
- MOVQ DX, ctx-56(SP)
- MOVQ SI, info-64(SP)
- MOVQ DI, signum-72(SP)
- MOVQ $runtime·sigtrampgo(SB), AX
- CALL AX
+ // Call into the Go signal handler
+ NOP SP // disable vet stack checking
+ ADJSP $24
+ MOVQ DI, 0(SP) // sig
+ MOVQ SI, 8(SP) // info
+ MOVQ DX, 16(SP) // ctx
+ CALL ·sigtrampgo(SB)
+ ADJSP $-24
- MOVQ r15-48(SP), R15
- MOVQ r14-40(SP), R14
- MOVQ r13-32(SP), R13
- MOVQ r12-24(SP), R12
- MOVQ bp-16(SP), BP
- MOVQ bx-8(SP), BX
+ POP_REGS_HOST_TO_ABI0()
RET
// Used instead of sigtramp in programs that use cgo.
@@ -652,7 +576,7 @@ nog1:
CALL runtime·stackcheck(SB)
nog2:
- // Call fn
+ // Call fn. This is the PC of an ABI0 function.
CALL R12
// It shouldn't return. If it does, exit that thread.
diff --git a/src/runtime/sys_linux_arm.s b/src/runtime/sys_linux_arm.s
index 475f52344c..02a5d4a642 100644
--- a/src/runtime/sys_linux_arm.s
+++ b/src/runtime/sys_linux_arm.s
@@ -242,7 +242,7 @@ TEXT runtime·mincore(SB),NOSPLIT,$0
MOVW R0, ret+12(FP)
RET
-TEXT runtime·walltime1(SB),NOSPLIT,$8-12
+TEXT runtime·walltime(SB),NOSPLIT,$8-12
// We don't know how much stack space the VDSO code will need,
// so switch to g0.
diff --git a/src/runtime/sys_linux_arm64.s b/src/runtime/sys_linux_arm64.s
index 198a5bacef..69ac160278 100644
--- a/src/runtime/sys_linux_arm64.s
+++ b/src/runtime/sys_linux_arm64.s
@@ -206,8 +206,8 @@ TEXT runtime·mincore(SB),NOSPLIT|NOFRAME,$0-28
MOVW R0, ret+24(FP)
RET
-// func walltime1() (sec int64, nsec int32)
-TEXT runtime·walltime1(SB),NOSPLIT,$24-12
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB),NOSPLIT,$24-12
MOVD RSP, R20 // R20 is unchanged by C code
MOVD RSP, R1
diff --git a/src/runtime/sys_linux_mips64x.s b/src/runtime/sys_linux_mips64x.s
index c3e9f37694..0206cb88bd 100644
--- a/src/runtime/sys_linux_mips64x.s
+++ b/src/runtime/sys_linux_mips64x.s
@@ -213,8 +213,8 @@ TEXT runtime·mincore(SB),NOSPLIT|NOFRAME,$0-28
MOVW R2, ret+24(FP)
RET
-// func walltime1() (sec int64, nsec int32)
-TEXT runtime·walltime1(SB),NOSPLIT,$16-12
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB),NOSPLIT,$16-12
MOVV R29, R16 // R16 is unchanged by C code
MOVV R29, R1
@@ -319,7 +319,7 @@ noswitch:
BEQ R25, fallback
JAL (R25)
- // see walltime1 for detail
+ // see walltime for detail
BEQ R2, R0, finish
MOVV R0, runtime·vdsoClockgettimeSym(SB)
MOVW $1, R4 // CLOCK_MONOTONIC
diff --git a/src/runtime/sys_linux_mipsx.s b/src/runtime/sys_linux_mipsx.s
index fab2ab3892..d5317d3957 100644
--- a/src/runtime/sys_linux_mipsx.s
+++ b/src/runtime/sys_linux_mipsx.s
@@ -218,8 +218,8 @@ TEXT runtime·mincore(SB),NOSPLIT,$0-16
MOVW R2, ret+12(FP)
RET
-// func walltime1() (sec int64, nsec int32)
-TEXT runtime·walltime1(SB),NOSPLIT,$8-12
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB),NOSPLIT,$8-12
MOVW $0, R4 // CLOCK_REALTIME
MOVW $4(R29), R5
MOVW $SYS_clock_gettime, R2
diff --git a/src/runtime/sys_linux_ppc64x.s b/src/runtime/sys_linux_ppc64x.s
index fd69ee70a5..75da130357 100644
--- a/src/runtime/sys_linux_ppc64x.s
+++ b/src/runtime/sys_linux_ppc64x.s
@@ -184,8 +184,8 @@ TEXT runtime·mincore(SB),NOSPLIT|NOFRAME,$0-28
MOVW R3, ret+24(FP)
RET
-// func walltime1() (sec int64, nsec int32)
-TEXT runtime·walltime1(SB),NOSPLIT,$16-12
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB),NOSPLIT,$16-12
MOVD R1, R15 // R15 is unchanged by C code
MOVD g_m(g), R21 // R21 = m
diff --git a/src/runtime/sys_linux_riscv64.s b/src/runtime/sys_linux_riscv64.s
index 626ab3912c..2389f1cc18 100644
--- a/src/runtime/sys_linux_riscv64.s
+++ b/src/runtime/sys_linux_riscv64.s
@@ -219,8 +219,8 @@ TEXT runtime·mincore(SB),NOSPLIT|NOFRAME,$0-28
MOVW A0, ret+24(FP)
RET
-// func walltime1() (sec int64, nsec int32)
-TEXT runtime·walltime1(SB),NOSPLIT,$24-12
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB),NOSPLIT,$24-12
MOV $0, A0 // CLOCK_REALTIME
MOV $8(X2), A1
MOV $SYS_clock_gettime, A7
diff --git a/src/runtime/sys_linux_s390x.s b/src/runtime/sys_linux_s390x.s
index c15a1d5364..916dfada8d 100644
--- a/src/runtime/sys_linux_s390x.s
+++ b/src/runtime/sys_linux_s390x.s
@@ -194,8 +194,8 @@ TEXT runtime·mincore(SB),NOSPLIT|NOFRAME,$0-28
MOVW R2, ret+24(FP)
RET
-// func walltime1() (sec int64, nsec int32)
-TEXT runtime·walltime1(SB),NOSPLIT,$16
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB),NOSPLIT,$16
MOVW $0, R2 // CLOCK_REALTIME
MOVD $tp-16(SP), R3
MOVW $SYS_clock_gettime, R1
diff --git a/src/runtime/sys_mips64x.go b/src/runtime/sys_mips64x.go
index cb429c3147..842a4a7084 100644
--- a/src/runtime/sys_mips64x.go
+++ b/src/runtime/sys_mips64x.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build mips64 || mips64le
// +build mips64 mips64le
package runtime
diff --git a/src/runtime/sys_mipsx.go b/src/runtime/sys_mipsx.go
index 2819218d5f..2038eb7d79 100644
--- a/src/runtime/sys_mipsx.go
+++ b/src/runtime/sys_mipsx.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build mips || mipsle
// +build mips mipsle
package runtime
diff --git a/src/runtime/sys_netbsd_386.s b/src/runtime/sys_netbsd_386.s
index d0c470c457..8a33894892 100644
--- a/src/runtime/sys_netbsd_386.s
+++ b/src/runtime/sys_netbsd_386.s
@@ -206,8 +206,8 @@ TEXT runtime·setitimer(SB),NOSPLIT,$-4
INT $0x80
RET
-// func walltime1() (sec int64, nsec int32)
-TEXT runtime·walltime1(SB), NOSPLIT, $32
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB), NOSPLIT, $32
LEAL 12(SP), BX
MOVL $CLOCK_REALTIME, 4(SP) // arg 1 - clock_id
MOVL BX, 8(SP) // arg 2 - tp
@@ -376,6 +376,10 @@ TEXT runtime·lwp_tramp(SB),NOSPLIT,$0
MOVL $0x1234, 0x1005
RET
+TEXT ·netbsdMstart(SB),NOSPLIT|TOPFRAME,$0
+ CALL ·netbsdMstart0(SB)
+ RET // not reached
+
TEXT runtime·sigaltstack(SB),NOSPLIT,$-8
MOVL $SYS___sigaltstack14, AX
MOVL new+0(FP), BX
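
netbsdMstart is a thin ABI0 shim marked TOPFRAME so that tracebacks stop at
it; it exists only to give the new thread a proper top-of-stack frame before
entering Go. A declaration-level sketch of the assumed pairing (the Go body
lives in os_netbsd.go):

    func netbsdMstart()  // assembly shim above: NOSPLIT|TOPFRAME, does not return
    func netbsdMstart0() // Go body: runs the usual mstart sequence for the thread
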
diff --git a/src/runtime/sys_netbsd_amd64.s b/src/runtime/sys_netbsd_amd64.s
index dc9bd127d2..02f5b4ba3b 100644
--- a/src/runtime/sys_netbsd_amd64.s
+++ b/src/runtime/sys_netbsd_amd64.s
@@ -9,6 +9,7 @@
#include "go_asm.h"
#include "go_tls.h"
#include "textflag.h"
+#include "cgo/abi_amd64.h"
#define CLOCK_REALTIME 0
#define CLOCK_MONOTONIC 3
@@ -70,7 +71,7 @@ TEXT runtime·lwp_tramp(SB),NOSPLIT,$0
MOVQ R9, g(CX)
CALL runtime·stackcheck(SB)
- // Call fn
+ // Call fn. This is an ABI0 PC.
CALL R12
// It shouldn't return. If it does, exit.
@@ -78,6 +79,10 @@ TEXT runtime·lwp_tramp(SB),NOSPLIT,$0
SYSCALL
JMP -3(PC) // keep exiting
+TEXT ·netbsdMstart(SB),NOSPLIT|TOPFRAME,$0
+ CALL ·netbsdMstart0(SB)
+ RET // not reached
+
TEXT runtime·osyield(SB),NOSPLIT,$0
MOVL $SYS_sched_yield, AX
SYSCALL
@@ -233,8 +238,8 @@ TEXT runtime·setitimer(SB),NOSPLIT,$-8
SYSCALL
RET
-// func walltime1() (sec int64, nsec int32)
-TEXT runtime·walltime1(SB), NOSPLIT, $32
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB), NOSPLIT, $32
MOVQ $CLOCK_REALTIME, DI // arg 1 - clock_id
LEAQ 8(SP), SI // arg 2 - tp
MOVL $SYS___clock_gettime50, AX
@@ -314,28 +319,21 @@ TEXT runtime·sigfwd(SB),NOSPLIT,$0-32
POPQ BP
RET
-TEXT runtime·sigtramp(SB),NOSPLIT,$72
- // Save callee-saved C registers, since the caller may be a C signal handler.
- MOVQ BX, bx-8(SP)
- MOVQ BP, bp-16(SP) // save in case GOEXPERIMENT=noframepointer is set
- MOVQ R12, r12-24(SP)
- MOVQ R13, r13-32(SP)
- MOVQ R14, r14-40(SP)
- MOVQ R15, r15-48(SP)
- // We don't save mxcsr or the x87 control word because sigtrampgo doesn't
- // modify them.
+// Called using C ABI.
+TEXT runtime·sigtramp(SB),NOSPLIT,$0
+ // Transition from C ABI to Go ABI.
+ PUSH_REGS_HOST_TO_ABI0()
- MOVQ DX, ctx-56(SP)
- MOVQ SI, info-64(SP)
- MOVQ DI, signum-72(SP)
- CALL runtime·sigtrampgo(SB)
+ // Call into the Go signal handler
+ NOP SP // disable vet stack checking
+ ADJSP $24
+ MOVQ DI, 0(SP) // sig
+ MOVQ SI, 8(SP) // info
+ MOVQ DX, 16(SP) // ctx
+ CALL ·sigtrampgo(SB)
+ ADJSP $-24
- MOVQ r15-48(SP), R15
- MOVQ r14-40(SP), R14
- MOVQ r13-32(SP), R13
- MOVQ r12-24(SP), R12
- MOVQ bp-16(SP), BP
- MOVQ bx-8(SP), BX
+ POP_REGS_HOST_TO_ABI0()
RET
TEXT runtime·mmap(SB),NOSPLIT,$0
diff --git a/src/runtime/sys_netbsd_arm.s b/src/runtime/sys_netbsd_arm.s
index 678dea57c6..3a763b2a6a 100644
--- a/src/runtime/sys_netbsd_arm.s
+++ b/src/runtime/sys_netbsd_arm.s
@@ -177,6 +177,10 @@ TEXT runtime·lwp_tramp(SB),NOSPLIT,$0
MOVW R8, (R8)
RET
+TEXT ·netbsdMstart(SB),NOSPLIT|TOPFRAME,$0
+ BL ·netbsdMstart0(SB)
+ RET // not reached
+
TEXT runtime·usleep(SB),NOSPLIT,$16
MOVW usec+0(FP), R0
CALL runtime·usplitR0(SB)
@@ -212,8 +216,8 @@ TEXT runtime·setitimer(SB),NOSPLIT|NOFRAME,$0
SWI $SYS___setitimer50
RET
-// func walltime1() (sec int64, nsec int32)
-TEXT runtime·walltime1(SB), NOSPLIT, $32
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB), NOSPLIT, $32
MOVW $0, R0 // CLOCK_REALTIME
MOVW $8(R13), R1
SWI $SYS___clock_gettime50
diff --git a/src/runtime/sys_netbsd_arm64.s b/src/runtime/sys_netbsd_arm64.s
index 4d9b05478f..2d0b894d47 100644
--- a/src/runtime/sys_netbsd_arm64.s
+++ b/src/runtime/sys_netbsd_arm64.s
@@ -76,6 +76,10 @@ nog:
MOVD $0, R0 // crash (not reached)
MOVD R0, (R8)
+TEXT ·netbsdMstart(SB),NOSPLIT|TOPFRAME,$0
+ CALL ·netbsdMstart0(SB)
+ RET // not reached
+
TEXT runtime·osyield(SB),NOSPLIT,$0
SVC $SYS_sched_yield
RET
@@ -220,8 +224,8 @@ TEXT runtime·setitimer(SB),NOSPLIT,$-8
SVC $SYS___setitimer50
RET
-// func walltime1() (sec int64, nsec int32)
-TEXT runtime·walltime1(SB), NOSPLIT, $32
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB), NOSPLIT, $32
MOVW $CLOCK_REALTIME, R0 // arg 1 - clock_id
MOVD $8(RSP), R1 // arg 2 - tp
SVC $SYS___clock_gettime50
diff --git a/src/runtime/sys_nonppc64x.go b/src/runtime/sys_nonppc64x.go
index 440937498f..66821b1f76 100644
--- a/src/runtime/sys_nonppc64x.go
+++ b/src/runtime/sys_nonppc64x.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !ppc64 && !ppc64le
// +build !ppc64,!ppc64le
package runtime
diff --git a/src/runtime/sys_openbsd.go b/src/runtime/sys_openbsd.go
index fcddf4d6a5..f6146c2e1d 100644
--- a/src/runtime/sys_openbsd.go
+++ b/src/runtime/sys_openbsd.go
@@ -2,7 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build openbsd,amd64 openbsd,arm64
+//go:build (openbsd && 386) || (openbsd && amd64) || (openbsd && arm64)
+// +build openbsd,386 openbsd,amd64 openbsd,arm64
package runtime
diff --git a/src/runtime/sys_openbsd1.go b/src/runtime/sys_openbsd1.go
index 44c7871ceb..b007b6d3f5 100644
--- a/src/runtime/sys_openbsd1.go
+++ b/src/runtime/sys_openbsd1.go
@@ -2,7 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build openbsd,amd64 openbsd,arm64
+//go:build (openbsd && 386) || (openbsd && amd64) || (openbsd && arm64)
+// +build openbsd,386 openbsd,amd64 openbsd,arm64
package runtime
diff --git a/src/runtime/sys_openbsd2.go b/src/runtime/sys_openbsd2.go
index 33032596c3..91ed04fb48 100644
--- a/src/runtime/sys_openbsd2.go
+++ b/src/runtime/sys_openbsd2.go
@@ -2,7 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build openbsd,amd64 openbsd,arm64
+//go:build (openbsd && 386) || (openbsd && amd64) || (openbsd && arm64)
+// +build openbsd,386 openbsd,amd64 openbsd,arm64
package runtime
@@ -161,7 +162,7 @@ func nanotime1() int64 {
func clock_gettime_trampoline()
//go:nosplit
-func walltime1() (int64, int32) {
+func walltime() (int64, int32) {
var ts timespec
args := struct {
clock_id int32
diff --git a/src/runtime/sys_openbsd3.go b/src/runtime/sys_openbsd3.go
index 4d4c88e3ac..8fdee05733 100644
--- a/src/runtime/sys_openbsd3.go
+++ b/src/runtime/sys_openbsd3.go
@@ -2,7 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build openbsd,amd64 openbsd,arm64
+//go:build (openbsd && 386) || (openbsd && amd64) || (openbsd && arm64)
+// +build openbsd,386 openbsd,amd64 openbsd,arm64
package runtime
diff --git a/src/runtime/sys_openbsd_386.s b/src/runtime/sys_openbsd_386.s
index 24fbfd6266..7830b61b7d 100644
--- a/src/runtime/sys_openbsd_386.s
+++ b/src/runtime/sys_openbsd_386.s
@@ -3,7 +3,9 @@
// license that can be found in the LICENSE file.
//
// System calls and other sys.stuff for 386, OpenBSD
-// /usr/src/sys/kern/syscalls.master for syscall numbers.
+// System calls are implemented in libc/libpthread; this file
+// contains trampolines that convert from the Go to the C calling convention.
+// Some direct system call implementations currently remain.
//
#include "go_asm.h"
@@ -12,221 +14,44 @@
#define CLOCK_MONOTONIC $3
-// Exit the entire program (like C exit)
-TEXT runtime·exit(SB),NOSPLIT,$-4
- MOVL $1, AX
- INT $0x80
- MOVL $0xf1, 0xf1 // crash
+TEXT runtime·setldt(SB),NOSPLIT,$0
+ // Nothing to do, pthread already set thread-local storage up.
RET
-// func exitThread(wait *uint32)
-TEXT runtime·exitThread(SB),NOSPLIT,$0-4
- MOVL $302, AX // sys___threxit
- INT $0x80
- MOVL $0xf1, 0xf1 // crash
- JMP 0(PC)
-
-TEXT runtime·open(SB),NOSPLIT,$-4
- MOVL $5, AX
- INT $0x80
- JAE 2(PC)
- MOVL $-1, AX
- MOVL AX, ret+12(FP)
- RET
-
-TEXT runtime·closefd(SB),NOSPLIT,$-4
- MOVL $6, AX
- INT $0x80
- JAE 2(PC)
- MOVL $-1, AX
- MOVL AX, ret+4(FP)
- RET
-
-TEXT runtime·read(SB),NOSPLIT,$-4
- MOVL $3, AX
- INT $0x80
- JAE 2(PC)
- NEGL AX // caller expects negative errno
- MOVL AX, ret+12(FP)
- RET
-
-// func pipe() (r, w int32, errno int32)
-TEXT runtime·pipe(SB),NOSPLIT,$8-12
- MOVL $263, AX
- LEAL r+0(FP), BX
- MOVL BX, 4(SP)
- INT $0x80
- MOVL AX, errno+8(FP)
- RET
-
-// func pipe2(flags int32) (r, w int32, errno int32)
-TEXT runtime·pipe2(SB),NOSPLIT,$12-16
- MOVL $101, AX
- LEAL r+4(FP), BX
- MOVL BX, 4(SP)
- MOVL flags+0(FP), BX
- MOVL BX, 8(SP)
- INT $0x80
- MOVL AX, errno+12(FP)
- RET
+// mstart_stub is the first function executed on a new thread started by pthread_create.
+// It just does some low-level setup and then calls mstart.
+// Note: called with the C calling convention.
+TEXT runtime·mstart_stub(SB),NOSPLIT,$28
+ NOP SP // tell vet SP changed - stop checking offsets
-TEXT runtime·write1(SB),NOSPLIT,$-4
- MOVL $4, AX // sys_write
- INT $0x80
- JAE 2(PC)
- NEGL AX // caller expects negative errno
- MOVL AX, ret+12(FP)
- RET
+ // We are already on m's g0 stack.
-TEXT runtime·usleep(SB),NOSPLIT,$24
- MOVL $0, DX
- MOVL usec+0(FP), AX
- MOVL $1000000, CX
- DIVL CX
- MOVL AX, 12(SP) // tv_sec - l32
- MOVL $0, 16(SP) // tv_sec - h32
- MOVL $1000, AX
- MULL DX
- MOVL AX, 20(SP) // tv_nsec
+ // Save callee-save registers.
+ MOVL BX, bx-4(SP)
+ MOVL BP, bp-8(SP)
+ MOVL SI, si-12(SP)
+ MOVL DI, di-16(SP)
- MOVL $0, 0(SP)
- LEAL 12(SP), AX
- MOVL AX, 4(SP) // arg 1 - rqtp
- MOVL $0, 8(SP) // arg 2 - rmtp
- MOVL $91, AX // sys_nanosleep
- INT $0x80
- RET
+ MOVL 32(SP), AX // m
+ MOVL m_g0(AX), DX
+ get_tls(CX)
+ MOVL DX, g(CX)
-TEXT runtime·getthrid(SB),NOSPLIT,$0-4
- MOVL $299, AX // sys_getthrid
- INT $0x80
- MOVL AX, ret+0(FP)
- RET
+ CALL runtime·mstart(SB)
-TEXT runtime·thrkill(SB),NOSPLIT,$16-8
- MOVL $0, 0(SP)
- MOVL tid+0(FP), AX
- MOVL AX, 4(SP) // arg 1 - tid
- MOVL sig+4(FP), AX
- MOVL AX, 8(SP) // arg 2 - signum
- MOVL $0, 12(SP) // arg 3 - tcb
- MOVL $119, AX // sys_thrkill
- INT $0x80
- RET
-
-TEXT runtime·raiseproc(SB),NOSPLIT,$12
- MOVL $20, AX // sys_getpid
- INT $0x80
- MOVL $0, 0(SP)
- MOVL AX, 4(SP) // arg 1 - pid
- MOVL sig+0(FP), AX
- MOVL AX, 8(SP) // arg 2 - signum
- MOVL $122, AX // sys_kill
- INT $0x80
- RET
+ // Restore callee-save registers.
+ MOVL di-16(SP), DI
+ MOVL si-12(SP), SI
+ MOVL bp-8(SP), BP
+ MOVL bx-4(SP), BX
-TEXT runtime·mmap(SB),NOSPLIT,$36
- LEAL addr+0(FP), SI
- LEAL 4(SP), DI
- CLD
- MOVSL // arg 1 - addr
- MOVSL // arg 2 - len
- MOVSL // arg 3 - prot
- MOVSL // arg 4 - flags
- MOVSL // arg 5 - fd
+ // Go is all done with this OS thread.
+ // Tell pthread everything is ok (we never join with this thread, so
+ // the value here doesn't really matter).
MOVL $0, AX
- STOSL // arg 6 - pad
- MOVSL // arg 7 - offset
- MOVL $0, AX // top 32 bits of file offset
- STOSL
- MOVL $197, AX // sys_mmap
- INT $0x80
- JAE ok
- MOVL $0, p+24(FP)
- MOVL AX, err+28(FP)
- RET
-ok:
- MOVL AX, p+24(FP)
- MOVL $0, err+28(FP)
- RET
-
-TEXT runtime·munmap(SB),NOSPLIT,$-4
- MOVL $73, AX // sys_munmap
- INT $0x80
- JAE 2(PC)
- MOVL $0xf1, 0xf1 // crash
- RET
-
-TEXT runtime·madvise(SB),NOSPLIT,$-4
- MOVL $75, AX // sys_madvise
- INT $0x80
- JAE 2(PC)
- MOVL $-1, AX
- MOVL AX, ret+12(FP)
- RET
-
-TEXT runtime·setitimer(SB),NOSPLIT,$-4
- MOVL $69, AX
- INT $0x80
- RET
-
-// func walltime1() (sec int64, nsec int32)
-TEXT runtime·walltime1(SB), NOSPLIT, $32
- LEAL 12(SP), BX
- MOVL $0, 4(SP) // arg 1 - clock_id
- MOVL BX, 8(SP) // arg 2 - tp
- MOVL $87, AX // sys_clock_gettime
- INT $0x80
-
- MOVL 12(SP), AX // sec - l32
- MOVL AX, sec_lo+0(FP)
- MOVL 16(SP), AX // sec - h32
- MOVL AX, sec_hi+4(FP)
-
- MOVL 20(SP), BX // nsec
- MOVL BX, nsec+8(FP)
- RET
-
-// int64 nanotime1(void) so really
-// void nanotime1(int64 *nsec)
-TEXT runtime·nanotime1(SB),NOSPLIT,$32
- LEAL 12(SP), BX
- MOVL CLOCK_MONOTONIC, 4(SP) // arg 1 - clock_id
- MOVL BX, 8(SP) // arg 2 - tp
- MOVL $87, AX // sys_clock_gettime
- INT $0x80
-
- MOVL 16(SP), CX // sec - h32
- IMULL $1000000000, CX
-
- MOVL 12(SP), AX // sec - l32
- MOVL $1000000000, BX
- MULL BX // result in dx:ax
-
- MOVL 20(SP), BX // nsec
- ADDL BX, AX
- ADCL CX, DX // add high bits with carry
-
- MOVL AX, ret_lo+0(FP)
- MOVL DX, ret_hi+4(FP)
- RET
-
-TEXT runtime·sigaction(SB),NOSPLIT,$-4
- MOVL $46, AX // sys_sigaction
- INT $0x80
- JAE 2(PC)
- MOVL $0xf1, 0xf1 // crash
RET
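For context: mstart_stub above is the start routine the Go side hands to
pthread_create when it launches a new M. A minimal Go sketch of that caller,
loosely modeled on the runtime's newosproc (error handling trimmed; whether
the function pointer is taken with funcPC or a later abi helper depends on
the Go version, so treat the details as illustrative, not part of this diff):

	// newosproc starts a new OS thread running mstart_stub(mp).
	func newosproc(mp *m) {
		var attr pthreadattr
		if pthread_attr_init(&attr) != 0 {
			throw("pthread_attr_init")
		}
		// Threads are started detached; nothing ever joins on
		// mstart_stub's return value, so returning 0 is fine.
		if pthread_attr_setdetachstate(&attr, _PTHREAD_CREATE_DETACHED) != 0 {
			throw("pthread_attr_setdetachstate")
		}
		// mstart_stub receives the *m as its sole (C ABI) argument.
		if pthread_create(&attr, funcPC(mstart_stub), unsafe.Pointer(mp)) != 0 {
			throw("pthread_create")
		}
		pthread_attr_destroy(&attr)
	}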
-TEXT runtime·obsdsigprocmask(SB),NOSPLIT,$-4
- MOVL $48, AX // sys_sigprocmask
- INT $0x80
- JAE 2(PC)
- MOVL $0xf1, 0xf1 // crash
- MOVL AX, ret+8(FP)
- RET
-
-TEXT runtime·sigfwd(SB),NOSPLIT,$12-16
+TEXT runtime·sigfwd(SB),NOSPLIT,$0-16
MOVL fn+0(FP), AX
MOVL sig+4(FP), BX
MOVL info+8(FP), CX
@@ -268,194 +93,876 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$28
MOVL bx-4(SP), BX
RET
-// int32 tfork(void *param, uintptr psize, M *mp, G *gp, void (*fn)(void));
-TEXT runtime·tfork(SB),NOSPLIT,$12
+// These trampolines help convert from Go calling convention to C calling convention.
+// They should be called with asmcgocall - note that while asmcgocall does
+// stack alignment, creation of a frame undoes it again.
+// A pointer to the arguments is passed on the stack.
+// A single int32 result is returned in AX.
+// (For more results, make an args/results structure.)
+TEXT runtime·pthread_attr_init_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $4, SP
+ MOVL 12(SP), DX // pointer to args
+ MOVL 0(DX), AX
+ MOVL AX, 0(SP) // arg 1 - attr
+ CALL libc_pthread_attr_init(SB)
+ MOVL BP, SP
+ POPL BP
+ RET
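The Go side of this contract can be sketched as follows (a hypothetical
simplification: the real runtime routes the call through a libcCall helper
rather than invoking asmcgocall directly, but the shape is the same - the
trampoline gets a single pointer to an args struct and returns its int32
result in AX):

	//go:nosplit
	func pthread_attr_init(attr *pthreadattr) int32 {
		args := struct {
			attr *pthreadattr
		}{attr}
		// asmcgocall switches to the g0/system stack and uses the
		// C calling convention, as these trampolines require.
		return asmcgocall(unsafe.Pointer(funcPC(pthread_attr_init_trampoline)), unsafe.Pointer(&args))
	}

	func pthread_attr_init_trampoline() // implemented above, in assembly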
- // Copy mp, gp and fn from the parent stack onto the child stack.
- MOVL param+0(FP), AX
- MOVL 8(AX), CX // tf_stack
- SUBL $16, CX
- MOVL CX, 8(AX)
- MOVL mm+8(FP), SI
- MOVL SI, 0(CX)
- MOVL gg+12(FP), SI
- MOVL SI, 4(CX)
- MOVL fn+16(FP), SI
- MOVL SI, 8(CX)
- MOVL $1234, 12(CX)
+TEXT runtime·pthread_attr_destroy_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $4, SP
+ MOVL 12(SP), DX // pointer to args
+ MOVL 0(DX), AX
+ MOVL AX, 0(SP) // arg 1 - attr
+ CALL libc_pthread_attr_destroy(SB)
+ MOVL BP, SP
+ POPL BP
+ RET
- MOVL $0, 0(SP) // syscall gap
- MOVL param+0(FP), AX
- MOVL AX, 4(SP) // arg 1 - param
- MOVL psize+4(FP), AX
- MOVL AX, 8(SP) // arg 2 - psize
- MOVL $8, AX // sys___tfork
- INT $0x80
+TEXT runtime·pthread_attr_getstacksize_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $8, SP
+ MOVL 16(SP), DX // pointer to args
+ MOVL 0(DX), AX
+ MOVL 4(DX), BX
+ MOVL AX, 0(SP) // arg 1 - attr
+ MOVL BX, 4(SP) // arg 2 - size
+ CALL libc_pthread_attr_getstacksize(SB)
+ MOVL BP, SP
+ POPL BP
+ RET
- // Return if tfork syscall failed.
- JCC 4(PC)
- NEGL AX
- MOVL AX, ret+20(FP)
+TEXT runtime·pthread_attr_setdetachstate_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $8, SP
+ MOVL 16(SP), DX // pointer to args
+ MOVL 0(DX), AX
+ MOVL 4(DX), BX
+ MOVL AX, 0(SP) // arg 1 - attr
+ MOVL BX, 4(SP) // arg 2 - state
+ CALL libc_pthread_attr_setdetachstate(SB)
+ MOVL BP, SP
+ POPL BP
RET
- // In parent, return.
- CMPL AX, $0
- JEQ 3(PC)
- MOVL AX, ret+20(FP)
+TEXT runtime·pthread_create_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $20, SP
+ MOVL 28(SP), DX // pointer to args
+ LEAL 16(SP), AX
+ MOVL AX, 0(SP) // arg 1 - &threadid (discarded)
+ MOVL 0(DX), AX
+ MOVL 4(DX), BX
+ MOVL 8(DX), CX
+ MOVL AX, 4(SP) // arg 2 - attr
+ MOVL BX, 8(SP) // arg 3 - start
+ MOVL CX, 12(SP) // arg 4 - arg
+ CALL libc_pthread_create(SB)
+ MOVL BP, SP
+ POPL BP
RET
- // Paranoia: check that SP is as we expect.
- MOVL 12(SP), BP
- CMPL BP, $1234
- JEQ 2(PC)
- INT $3
+TEXT runtime·thrkill_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $12, SP
+ MOVL 20(SP), DX // pointer to args
+ MOVL 0(DX), AX
+ MOVL 4(DX), BX
+ MOVL AX, 0(SP) // arg 1 - tid
+ MOVL BX, 4(SP) // arg 2 - signal
+ MOVL $0, 8(SP) // arg 3 - tcb
+ CALL libc_thrkill(SB)
+ MOVL BP, SP
+ POPL BP
+ RET
- // Reload registers.
- MOVL 0(SP), BX // m
- MOVL 4(SP), DX // g
- MOVL 8(SP), SI // fn
+TEXT runtime·thrsleep_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $20, SP
+ MOVL 28(SP), DX // pointer to args
+ MOVL 0(DX), AX
+ MOVL 4(DX), BX
+ MOVL 8(DX), CX
+ MOVL AX, 0(SP) // arg 1 - id
+ MOVL BX, 4(SP) // arg 2 - clock_id
+ MOVL CX, 8(SP) // arg 3 - abstime
+ MOVL 12(DX), AX
+ MOVL 16(DX), BX
+ MOVL AX, 12(SP) // arg 4 - lock
+ MOVL BX, 16(SP) // arg 5 - abort
+ CALL libc_thrsleep(SB)
+ MOVL BP, SP
+ POPL BP
+ RET
- // Set FS to point at m->tls.
- LEAL m_tls(BX), BP
- PUSHAL // save registers
+TEXT runtime·thrwakeup_trampoline(SB),NOSPLIT,$0
PUSHL BP
- CALL set_tcb<>(SB)
- POPL AX
- POPAL
+ MOVL SP, BP
+ SUBL $8, SP
+ MOVL 16(SP), DX // pointer to args
+ MOVL 0(DX), AX
+ MOVL 4(DX), BX
+ MOVL AX, 0(SP) // arg 1 - id
+ MOVL BX, 4(SP) // arg 2 - count
+ CALL libc_thrwakeup(SB)
+ MOVL BP, SP
+ POPL BP
+ RET
- // Now segment is established. Initialize m, g.
- get_tls(AX)
- MOVL DX, g(AX)
- MOVL BX, g_m(DX)
+TEXT runtime·exit_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $4, SP
+ MOVL 12(SP), DX // pointer to args
+ MOVL 0(DX), AX
+ MOVL AX, 0(SP) // arg 1 - status
+ CALL libc_exit(SB)
+ MOVL $0xf1, 0xf1 // crash on failure
+ MOVL BP, SP
+ POPL BP
+ RET
- CALL runtime·stackcheck(SB) // smashes AX, CX
- MOVL 0(DX), DX // paranoia; check they are not nil
- MOVL 0(BX), BX
+TEXT runtime·getthrid_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ CALL libc_getthrid(SB)
+ NOP SP // tell vet SP changed - stop checking offsets
+ MOVL 8(SP), DX // pointer to return value
+ MOVL AX, 0(DX)
+ POPL BP
+ RET
- // More paranoia; check that stack splitting code works.
- PUSHAL
- CALL runtime·emptyfunc(SB)
- POPAL
+TEXT runtime·raiseproc_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $8, SP
+ MOVL 16(SP), DX
+ MOVL 0(DX), BX
+ CALL libc_getpid(SB)
+ MOVL AX, 0(SP) // arg 1 - pid
+ MOVL BX, 4(SP) // arg 2 - signal
+ CALL libc_kill(SB)
+ MOVL BP, SP
+ POPL BP
+ RET
+
+TEXT runtime·sched_yield_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ CALL libc_sched_yield(SB)
+ MOVL BP, SP
+ POPL BP
+ RET
+
+TEXT runtime·mmap_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $32, SP
+ MOVL 40(SP), DX // pointer to args
+ MOVL 0(DX), AX
+ MOVL 4(DX), BX
+ MOVL 8(DX), CX
+ MOVL AX, 0(SP) // arg 1 - addr
+ MOVL BX, 4(SP) // arg 2 - len
+ MOVL CX, 8(SP) // arg 3 - prot
+ MOVL 12(DX), AX
+ MOVL 16(DX), BX
+ MOVL 20(DX), CX
+ MOVL AX, 12(SP) // arg 4 - flags
+ MOVL BX, 16(SP) // arg 5 - fd
+ MOVL $0, 20(SP) // pad
+ MOVL CX, 24(SP) // arg 6 - offset (low 32 bits)
+ MOVL $0, 28(SP) // offset (high 32 bits)
+ CALL libc_mmap(SB)
+ MOVL $0, BX
+ CMPL AX, $-1
+ JNE ok
+ CALL libc_errno(SB)
+ MOVL (AX), BX
+ MOVL $0, AX
+ok:
+ MOVL 40(SP), DX
+ MOVL AX, 24(DX)
+ MOVL BX, 28(DX)
+ MOVL BP, SP
+ POPL BP
+ RET
+
+TEXT runtime·munmap_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $8, SP
+ MOVL 16(SP), DX // pointer to args
+ MOVL 0(DX), AX
+ MOVL 4(DX), BX
+ MOVL AX, 0(SP) // arg 1 - addr
+ MOVL BX, 4(SP) // arg 2 - len
+ CALL libc_munmap(SB)
+ CMPL AX, $-1
+ JNE 2(PC)
+ MOVL $0xf1, 0xf1 // crash on failure
+ MOVL BP, SP
+ POPL BP
+ RET
+
+TEXT runtime·madvise_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $12, SP
+ MOVL 20(SP), DX // pointer to args
+ MOVL 0(DX), AX
+ MOVL 4(DX), BX
+ MOVL 8(DX), CX
+ MOVL AX, 0(SP) // arg 1 - addr
+ MOVL BX, 4(SP) // arg 2 - len
+ MOVL CX, 8(SP) // arg 3 - advice
+ CALL libc_madvise(SB)
+ // ignore failure - maybe pages are locked
+ MOVL BP, SP
+ POPL BP
+ RET
+
+TEXT runtime·open_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $16, SP
+ MOVL 24(SP), DX // pointer to args
+ MOVL 0(DX), AX
+ MOVL 4(DX), BX
+ MOVL 8(DX), CX
+ MOVL AX, 0(SP) // arg 1 - path
+ MOVL BX, 4(SP) // arg 2 - flags
+ MOVL CX, 8(SP) // arg 3 - mode
+ MOVL $0, 12(SP) // vararg
+ CALL libc_open(SB)
+ MOVL BP, SP
+ POPL BP
+ RET
+
+TEXT runtime·close_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $4, SP
+ MOVL 12(SP), DX
+ MOVL 0(DX), AX
+ MOVL AX, 0(SP) // arg 1 - fd
+ CALL libc_close(SB)
+ MOVL BP, SP
+ POPL BP
+ RET
+
+TEXT runtime·read_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $12, SP
+ MOVL 20(SP), DX // pointer to args
+ MOVL 0(DX), AX
+ MOVL 4(DX), BX
+ MOVL 8(DX), CX
+ MOVL AX, 0(SP) // arg 1 - fd
+ MOVL BX, 4(SP) // arg 2 - buf
+ MOVL CX, 8(SP) // arg 3 - count
+ CALL libc_read(SB)
+ CMPL AX, $-1
+ JNE noerr
+ CALL libc_errno(SB)
+ MOVL (AX), AX
+ NEGL AX // caller expects negative errno
+noerr:
+ MOVL BP, SP
+ POPL BP
+ RET
- // Call fn.
- CALL SI
+TEXT runtime·write_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $12, SP
+ MOVL 20(SP), DX // pointer to args
+ MOVL 0(DX), AX
+ MOVL 4(DX), BX
+ MOVL 8(DX), CX
+ MOVL AX, 0(SP) // arg 1 - fd
+ MOVL BX, 4(SP) // arg 2 - buf
+ MOVL CX, 8(SP) // arg 3 - count
+ CALL libc_write(SB)
+ CMPL AX, $-1
+ JNE noerr
+ CALL libc_errno(SB)
+ MOVL (AX), AX
+ NEGL AX // caller expects negative errno
+noerr:
+ MOVL BP, SP
+ POPL BP
+ RET
- // fn should never return.
- MOVL $0x1234, 0x1005
+TEXT runtime·pipe2_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $8, SP
+ MOVL 16(SP), DX // pointer to args
+ MOVL 0(DX), AX
+ MOVL 4(DX), BX
+ MOVL AX, 0(SP) // arg 1 - fds
+ MOVL BX, 4(SP) // arg 2 - flags
+ CALL libc_pipe2(SB)
+ CMPL AX, $-1
+ JNE noerr
+ CALL libc_errno(SB)
+ MOVL (AX), AX
+ NEGL AX // caller expects negative errno
+noerr:
+ MOVL BP, SP
+ POPL BP
RET
-TEXT runtime·sigaltstack(SB),NOSPLIT,$-8
- MOVL $288, AX // sys_sigaltstack
- MOVL new+0(FP), BX
- MOVL old+4(FP), CX
- INT $0x80
- CMPL AX, $0xfffff001
- JLS 2(PC)
- INT $3
+TEXT runtime·setitimer_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $12, SP
+ MOVL 20(SP), DX // pointer to args
+ MOVL 0(DX), AX
+ MOVL 4(DX), BX
+ MOVL 8(DX), CX
+ MOVL AX, 0(SP) // arg 1 - which
+ MOVL BX, 4(SP) // arg 2 - new
+ MOVL CX, 8(SP) // arg 3 - old
+ CALL libc_setitimer(SB)
+ MOVL BP, SP
+ POPL BP
RET
-TEXT runtime·setldt(SB),NOSPLIT,$4
- // Under OpenBSD we set the GS base instead of messing with the LDT.
- MOVL base+4(FP), AX
+TEXT runtime·usleep_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $4, SP
+ MOVL 12(SP), DX // pointer to args
+ MOVL 0(DX), AX
MOVL AX, 0(SP)
- CALL set_tcb<>(SB)
+ CALL libc_usleep(SB)
+ MOVL BP, SP
+ POPL BP
RET
-TEXT set_tcb<>(SB),NOSPLIT,$8
- // adjust for ELF: wants to use -4(GS) for g
- MOVL tlsbase+0(FP), CX
- ADDL $4, CX
- MOVL $0, 0(SP) // syscall gap
- MOVL CX, 4(SP) // arg 1 - tcb
- MOVL $329, AX // sys___set_tcb
- INT $0x80
- JCC 2(PC)
- MOVL $0xf1, 0xf1 // crash
+TEXT runtime·sysctl_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $24, SP
+ MOVL 32(SP), DX // pointer to args
+ MOVL 0(DX), AX
+ MOVL 4(DX), BX
+ MOVL 8(DX), CX
+ MOVL AX, 0(SP) // arg 1 - name
+ MOVL BX, 4(SP) // arg 2 - namelen
+ MOVL CX, 8(SP) // arg 3 - old
+ MOVL 12(DX), AX
+ MOVL 16(DX), BX
+ MOVL 20(DX), CX
+ MOVL AX, 12(SP) // arg 4 - oldlenp
+ MOVL BX, 16(SP) // arg 5 - newp
+ MOVL CX, 20(SP) // arg 6 - newlen
+ CALL libc_sysctl(SB)
+ MOVL BP, SP
+ POPL BP
RET
-TEXT runtime·osyield(SB),NOSPLIT,$-4
- MOVL $298, AX // sys_sched_yield
- INT $0x80
+TEXT runtime·kqueue_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ CALL libc_kqueue(SB)
+ MOVL BP, SP
+ POPL BP
RET
-TEXT runtime·thrsleep(SB),NOSPLIT,$-4
- MOVL $94, AX // sys___thrsleep
- INT $0x80
- MOVL AX, ret+20(FP)
+TEXT runtime·kevent_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $24, SP
+ MOVL 32(SP), DX // pointer to args
+ MOVL 0(DX), AX
+ MOVL 4(DX), BX
+ MOVL 8(DX), CX
+ MOVL AX, 0(SP) // arg 1 - kq
+ MOVL BX, 4(SP) // arg 2 - changelist
+ MOVL CX, 8(SP) // arg 3 - nch
+ MOVL 12(DX), AX
+ MOVL 16(DX), BX
+ MOVL 20(DX), CX
+ MOVL AX, 12(SP) // arg 4 - ev
+ MOVL BX, 16(SP) // arg 5 - nev
+ MOVL CX, 20(SP) // arg 6 - ts
+ CALL libc_kevent(SB)
+ CMPL AX, $-1
+ JNE noerr
+ CALL libc_errno(SB)
+ MOVL (AX), AX
+ NEGL AX // caller expects negative errno
+noerr:
+ MOVL BP, SP
+ POPL BP
RET
-TEXT runtime·thrwakeup(SB),NOSPLIT,$-4
- MOVL $301, AX // sys___thrwakeup
- INT $0x80
- MOVL AX, ret+8(FP)
+TEXT runtime·clock_gettime_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $8, SP
+ MOVL 16(SP), DX // pointer to args
+ MOVL 0(DX), AX
+ MOVL 4(DX), BX
+ MOVL AX, 0(SP) // arg 1 - tp
+ MOVL BX, 4(SP) // arg 2 - clock_id
+ CALL libc_clock_gettime(SB)
+ CMPL AX, $-1
+ JNE 2(PC)
+ MOVL $0xf1, 0xf1 // crash on failure
+ MOVL BP, SP
+ POPL BP
RET
-TEXT runtime·sysctl(SB),NOSPLIT,$28
- LEAL mib+0(FP), SI
- LEAL 4(SP), DI
- CLD
- MOVSL // arg 1 - name
- MOVSL // arg 2 - namelen
- MOVSL // arg 3 - oldp
- MOVSL // arg 4 - oldlenp
- MOVSL // arg 5 - newp
- MOVSL // arg 6 - newlen
- MOVL $202, AX // sys___sysctl
- INT $0x80
- JCC 4(PC)
- NEGL AX
- MOVL AX, ret+24(FP)
+TEXT runtime·fcntl_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $16, SP
+ MOVL 24(SP), DX // pointer to args
+ MOVL 0(DX), AX
+ MOVL 4(DX), BX
+ MOVL 8(DX), CX
+ MOVL AX, 0(SP) // arg 1 - fd
+ MOVL BX, 4(SP) // arg 2 - cmd
+ MOVL CX, 8(SP) // arg 3 - arg
+ MOVL $0, 12(SP) // vararg
+ CALL libc_fcntl(SB)
+ MOVL BP, SP
+ POPL BP
RET
- MOVL $0, AX
- MOVL AX, ret+24(FP)
+
+TEXT runtime·sigaction_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $12, SP
+ MOVL 20(SP), DX // pointer to args
+ MOVL 0(DX), AX
+ MOVL 4(DX), BX
+ MOVL 8(DX), CX
+ MOVL AX, 0(SP) // arg 1 - sig
+ MOVL BX, 4(SP) // arg 2 - new
+ MOVL CX, 8(SP) // arg 3 - old
+ CALL libc_sigaction(SB)
+ CMPL AX, $-1
+ JNE 2(PC)
+ MOVL $0xf1, 0xf1 // crash on failure
+ MOVL BP, SP
+ POPL BP
RET
-// int32 runtime·kqueue(void);
-TEXT runtime·kqueue(SB),NOSPLIT,$0
- MOVL $269, AX
- INT $0x80
- JAE 2(PC)
- NEGL AX
- MOVL AX, ret+0(FP)
+TEXT runtime·sigprocmask_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $12, SP
+ MOVL 20(SP), DX // pointer to args
+ MOVL 0(DX), AX
+ MOVL 4(DX), BX
+ MOVL 8(DX), CX
+ MOVL AX, 0(SP) // arg 1 - how
+ MOVL BX, 4(SP) // arg 2 - new
+ MOVL CX, 8(SP) // arg 3 - old
+ CALL libc_pthread_sigmask(SB)
+ CMPL AX, $-1
+ JNE 2(PC)
+ MOVL $0xf1, 0xf1 // crash on failure
+ MOVL BP, SP
+ POPL BP
RET
-// int32 runtime·kevent(int kq, Kevent *changelist, int nchanges, Kevent *eventlist, int nevents, Timespec *timeout);
-TEXT runtime·kevent(SB),NOSPLIT,$0
- MOVL $72, AX // sys_kevent
- INT $0x80
- JAE 2(PC)
- NEGL AX
- MOVL AX, ret+24(FP)
+TEXT runtime·sigaltstack_trampoline(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+ SUBL $8, SP
+ MOVL 16(SP), DX // pointer to args
+ MOVL 0(DX), AX
+ MOVL 4(DX), BX
+ MOVL AX, 0(SP) // arg 1 - new
+ MOVL BX, 4(SP) // arg 2 - old
+ CALL libc_sigaltstack(SB)
+ CMPL AX, $-1
+ JNE 2(PC)
+ MOVL $0xf1, 0xf1 // crash on failure
+ MOVL BP, SP
+ POPL BP
RET
-// int32 runtime·closeonexec(int32 fd);
-TEXT runtime·closeonexec(SB),NOSPLIT,$32
- MOVL $92, AX // sys_fcntl
- // 0(SP) is where the caller PC would be; kernel skips it
- MOVL fd+0(FP), BX
- MOVL BX, 4(SP) // fd
- MOVL $2, 8(SP) // F_SETFD
- MOVL $1, 12(SP) // FD_CLOEXEC
- INT $0x80
- JAE 2(PC)
- NEGL AX
+// syscall calls a function in libc on behalf of the syscall package.
+// syscall takes a pointer to a struct like:
+// struct {
+// fn uintptr
+// a1 uintptr
+// a2 uintptr
+// a3 uintptr
+// r1 uintptr
+// r2 uintptr
+// err uintptr
+// }
+// syscall must be called on the g0 stack with the
+// C calling convention (use libcCall).
+//
+// syscall expects a 32-bit result and tests for 32-bit -1
+// to decide there was an error.
+TEXT runtime·syscall(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+
+ SUBL $12, SP
+ MOVL 20(SP), BX // pointer to args
+
+ MOVL (1*4)(BX), AX
+ MOVL (2*4)(BX), CX
+ MOVL (3*4)(BX), DX
+ MOVL AX, (0*4)(SP) // a1
+ MOVL CX, (1*4)(SP) // a2
+ MOVL DX, (2*4)(SP) // a3
+
+ MOVL (0*4)(BX), AX // fn
+ CALL AX
+
+ MOVL AX, (4*4)(BX) // r1
+ MOVL DX, (5*4)(BX) // r2
+
+ // Standard libc functions return -1 on error and set errno.
+ CMPL AX, $-1
+ JNE ok
+
+ // Get error code from libc.
+ CALL libc_errno(SB)
+ MOVL (AX), AX
+ MOVL AX, (6*4)(BX) // err
+
+ok:
+ MOVL $0, AX // no error (it's ignored anyway)
+ MOVL BP, SP
+ POPL BP
RET
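Matching the comment above, the caller's side can be sketched like this
(hypothetical names; the real structs live in the syscall package and the
call goes through libcCall so it runs on the g0 stack):

	// Field offsets must match the assembly: fn at 0, a1..a3 at
	// 4/8/12, r1/r2 at 16/20, err at 24 (4-byte words on 386).
	type libcArgs struct {
		fn, a1, a2, a3 uintptr
		r1, r2, err    uintptr
	}

	func syscallInternal(fn, a1, a2, a3 uintptr) (r1, r2, err uintptr) {
		args := libcArgs{fn: fn, a1: a1, a2: a2, a3: a3}
		libcCall(unsafe.Pointer(funcPC(syscall)), unsafe.Pointer(&args))
		return args.r1, args.r2, args.err
	}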
-// func runtime·setNonblock(fd int32)
-TEXT runtime·setNonblock(SB),NOSPLIT,$16-4
- MOVL $92, AX // fcntl
- MOVL fd+0(FP), BX // fd
- MOVL BX, 4(SP)
- MOVL $3, 8(SP) // F_GETFL
- MOVL $0, 12(SP)
- INT $0x80
- MOVL fd+0(FP), BX // fd
- MOVL BX, 4(SP)
- MOVL $4, 8(SP) // F_SETFL
- ORL $4, AX // O_NONBLOCK
- MOVL AX, 12(SP)
- MOVL $92, AX // fcntl
- INT $0x80
+// syscallX calls a function in libc on behalf of the syscall package.
+// syscallX takes a pointer to a struct like:
+// struct {
+// fn uintptr
+// a1 uintptr
+// a2 uintptr
+// a3 uintptr
+// r1 uintptr
+// r2 uintptr
+// err uintptr
+// }
+// syscallX must be called on the g0 stack with the
+// C calling convention (use libcCall).
+//
+// syscallX is like syscall but expects a 64-bit result
+// and tests for 64-bit -1 to decide there was an error.
+TEXT runtime·syscallX(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+
+ SUBL $12, SP
+ MOVL 20(SP), BX // pointer to args
+
+ MOVL (1*4)(BX), AX
+ MOVL (2*4)(BX), CX
+ MOVL (3*4)(BX), DX
+ MOVL AX, (0*4)(SP) // a1
+ MOVL CX, (1*4)(SP) // a2
+ MOVL DX, (2*4)(SP) // a3
+
+ MOVL (0*4)(BX), AX // fn
+ CALL AX
+
+ MOVL AX, (4*4)(BX) // r1
+ MOVL DX, (5*4)(BX) // r2
+
+ // Standard libc functions return -1 on error and set errno.
+ CMPL AX, $-1
+ JNE ok
+ CMPL DX, $-1
+ JNE ok
+
+ // Get error code from libc.
+ CALL libc_errno(SB)
+ MOVL (AX), AX
+ MOVL AX, (6*4)(BX) // err
+
+ok:
+ MOVL $0, AX // no error (it's ignored anyway)
+ MOVL BP, SP
+ POPL BP
+ RET
+
+// syscall6 calls a function in libc on behalf of the syscall package.
+// syscall6 takes a pointer to a struct like:
+// struct {
+// fn uintptr
+// a1 uintptr
+// a2 uintptr
+// a3 uintptr
+// a4 uintptr
+// a5 uintptr
+// a6 uintptr
+// r1 uintptr
+// r2 uintptr
+// err uintptr
+// }
+// syscall6 must be called on the g0 stack with the
+// C calling convention (use libcCall).
+//
+// syscall6 expects a 32-bit result and tests for 32-bit -1
+// to decide there was an error.
+TEXT runtime·syscall6(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+
+ SUBL $24, SP
+ MOVL 32(SP), BX // pointer to args
+
+ MOVL (1*4)(BX), AX
+ MOVL (2*4)(BX), CX
+ MOVL (3*4)(BX), DX
+ MOVL AX, (0*4)(SP) // a1
+ MOVL CX, (1*4)(SP) // a2
+ MOVL DX, (2*4)(SP) // a3
+ MOVL (4*4)(BX), AX
+ MOVL (5*4)(BX), CX
+ MOVL (6*4)(BX), DX
+ MOVL AX, (3*4)(SP) // a4
+ MOVL CX, (4*4)(SP) // a5
+ MOVL DX, (5*4)(SP) // a6
+
+ MOVL (0*4)(BX), AX // fn
+ CALL AX
+
+ MOVL AX, (7*4)(BX) // r1
+ MOVL DX, (8*4)(BX) // r2
+
+ // Standard libc functions return -1 on error and set errno.
+ CMPL AX, $-1
+ JNE ok
+
+ // Get error code from libc.
+ CALL libc_errno(SB)
+ MOVL (AX), AX
+ MOVL AX, (9*4)(BX) // err
+
+ok:
+ MOVL $0, AX // no error (it's ignored anyway)
+ MOVL BP, SP
+ POPL BP
+ RET
+
+// syscall6X calls a function in libc on behalf of the syscall package.
+// syscall6X takes a pointer to a struct like:
+// struct {
+// fn uintptr
+// a1 uintptr
+// a2 uintptr
+// a3 uintptr
+// a4 uintptr
+// a5 uintptr
+// a6 uintptr
+// r1 uintptr
+// r2 uintptr
+// err uintptr
+// }
+// syscall6X must be called on the g0 stack with the
+// C calling convention (use libcCall).
+//
+// syscall6X is like syscall6 but expects a 64-bit result
+// and tests for 64-bit -1 to decide there was an error.
+TEXT runtime·syscall6X(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+
+ SUBL $24, SP
+ MOVL 32(SP), BX // pointer to args
+
+ MOVL (1*4)(BX), AX
+ MOVL (2*4)(BX), CX
+ MOVL (3*4)(BX), DX
+ MOVL AX, (0*4)(SP) // a1
+ MOVL CX, (1*4)(SP) // a2
+ MOVL DX, (2*4)(SP) // a3
+ MOVL (4*4)(BX), AX
+ MOVL (5*4)(BX), CX
+ MOVL (6*4)(BX), DX
+ MOVL AX, (3*4)(SP) // a4
+ MOVL CX, (4*4)(SP) // a5
+ MOVL DX, (5*4)(SP) // a6
+
+ MOVL (0*4)(BX), AX // fn
+ CALL AX
+
+ MOVL AX, (7*4)(BX) // r1
+ MOVL DX, (8*4)(BX) // r2
+
+ // Standard libc functions return -1 on error and set errno.
+ CMPL AX, $-1
+ JNE ok
+ CMPL DX, $-1
+ JNE ok
+
+ // Get error code from libc.
+ CALL libc_errno(SB)
+ MOVL (AX), AX
+ MOVL AX, (9*4)(BX) // err
+
+ok:
+ MOVL $0, AX // no error (it's ignored anyway)
+ MOVL BP, SP
+ POPL BP
RET
-GLOBL runtime·tlsoffset(SB),NOPTR,$4
+// syscall10 calls a function in libc on behalf of the syscall package.
+// syscall10 takes a pointer to a struct like:
+// struct {
+// fn uintptr
+// a1 uintptr
+// a2 uintptr
+// a3 uintptr
+// a4 uintptr
+// a5 uintptr
+// a6 uintptr
+// a7 uintptr
+// a8 uintptr
+// a9 uintptr
+// a10 uintptr
+// r1 uintptr
+// r2 uintptr
+// err uintptr
+// }
+// syscall10 must be called on the g0 stack with the
+// C calling convention (use libcCall).
+TEXT runtime·syscall10(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+
+ SUBL $40, SP
+ MOVL 48(SP), BX // pointer to args
+
+ MOVL (1*4)(BX), AX
+ MOVL (2*4)(BX), CX
+ MOVL (3*4)(BX), DX
+ MOVL AX, (0*4)(SP) // a1
+ MOVL CX, (1*4)(SP) // a2
+ MOVL DX, (2*4)(SP) // a3
+ MOVL (4*4)(BX), AX
+ MOVL (5*4)(BX), CX
+ MOVL (6*4)(BX), DX
+ MOVL AX, (3*4)(SP) // a4
+ MOVL CX, (4*4)(SP) // a5
+ MOVL DX, (5*4)(SP) // a6
+ MOVL (7*4)(BX), AX
+ MOVL (8*4)(BX), CX
+ MOVL (9*4)(BX), DX
+ MOVL AX, (6*4)(SP) // a7
+ MOVL CX, (7*4)(SP) // a8
+ MOVL DX, (8*4)(SP) // a9
+ MOVL (10*4)(BX), AX
+ MOVL AX, (9*4)(SP) // a10
+
+ MOVL (0*4)(BX), AX // fn
+ CALL AX
+
+ MOVL AX, (11*4)(BX) // r1
+ MOVL DX, (12*4)(BX) // r2
+
+ // Standard libc functions return -1 on error and set errno.
+ CMPL AX, $-1
+ JNE ok
+
+ // Get error code from libc.
+ CALL libc_errno(SB)
+ MOVL (AX), AX
+ MOVL AX, (13*4)(BX) // err
+
+ok:
+ MOVL $0, AX // no error (it's ignored anyway)
+ MOVL BP, SP
+ POPL BP
+ RET
+
+// syscall10X calls a function in libc on behalf of the syscall package.
+// syscall10X takes a pointer to a struct like:
+// struct {
+// fn uintptr
+// a1 uintptr
+// a2 uintptr
+// a3 uintptr
+// a4 uintptr
+// a5 uintptr
+// a6 uintptr
+// a7 uintptr
+// a8 uintptr
+// a9 uintptr
+// a10 uintptr
+// r1 uintptr
+// r2 uintptr
+// err uintptr
+// }
+// syscall10X must be called on the g0 stack with the
+// C calling convention (use libcCall).
+//
+// syscall10X is like syscall10 but expects a 64-bit result
+// and tests for 64-bit -1 to decide there was an error.
+TEXT runtime·syscall10X(SB),NOSPLIT,$0
+ PUSHL BP
+ MOVL SP, BP
+
+ SUBL $40, SP
+ MOVL 48(SP), BX // pointer to args
+
+ MOVL (1*4)(BX), AX
+ MOVL (2*4)(BX), CX
+ MOVL (3*4)(BX), DX
+ MOVL AX, (0*4)(SP) // a1
+ MOVL CX, (1*4)(SP) // a2
+ MOVL DX, (2*4)(SP) // a3
+ MOVL (4*4)(BX), AX
+ MOVL (5*4)(BX), CX
+ MOVL (6*4)(BX), DX
+ MOVL AX, (3*4)(SP) // a4
+ MOVL CX, (4*4)(SP) // a5
+ MOVL DX, (5*4)(SP) // a6
+ MOVL (7*4)(BX), AX
+ MOVL (8*4)(BX), CX
+ MOVL (9*4)(BX), DX
+ MOVL AX, (6*4)(SP) // a7
+ MOVL CX, (7*4)(SP) // a8
+ MOVL DX, (8*4)(SP) // a9
+ MOVL (10*4)(BX), AX
+ MOVL AX, (9*4)(SP) // a10
+
+ MOVL (0*4)(BX), AX // fn
+ CALL AX
+
+ MOVL AX, (11*4)(BX) // r1
+ MOVL DX, (12*4)(BX) // r2
+
+ // Standard libc functions return -1 on error and set errno.
+ CMPL AX, $-1
+ JNE ok
+ CMPL DX, $-1
+ JNE ok
+
+ // Get error code from libc.
+ CALL libc_errno(SB)
+ MOVL (AX), AX
+ MOVL AX, (13*4)(BX) // err
+
+ok:
+ MOVL $0, AX // no error (it's ignored anyway)
+ MOVL BP, SP
+ POPL BP
+ RET
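One detail worth spelling out for the *X variants: on 386 a 64-bit libc
result comes back in the DX:AX register pair, so "result == -1" means both
halves are all-ones. A hypothetical Go helper expressing the check the
assembly performs with its two CMPL/JNE pairs:

	// isErr64 reports whether the 64-bit result dx:ax is exactly -1,
	// the only value that signals an error from these libc calls.
	func isErr64(ax, dx uint32) bool {
		return ax == 0xFFFFFFFF && dx == 0xFFFFFFFF
	}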
diff --git a/src/runtime/sys_openbsd_amd64.s b/src/runtime/sys_openbsd_amd64.s
index b3a76b57a3..522e98cf4f 100644
--- a/src/runtime/sys_openbsd_amd64.s
+++ b/src/runtime/sys_openbsd_amd64.s
@@ -11,6 +11,7 @@
#include "go_asm.h"
#include "go_tls.h"
#include "textflag.h"
+#include "cgo/abi_amd64.h"
#define CLOCK_MONOTONIC $3
@@ -25,39 +26,22 @@ TEXT runtime·mstart_stub(SB),NOSPLIT,$0
// DI points to the m.
// We are already on m's g0 stack.
- // Save callee-save registers.
- SUBQ $48, SP
- MOVQ BX, 0(SP)
- MOVQ BP, 8(SP)
- MOVQ R12, 16(SP)
- MOVQ R13, 24(SP)
- MOVQ R14, 32(SP)
- MOVQ R15, 40(SP)
+ // Transition from C ABI to Go ABI.
+ PUSH_REGS_HOST_TO_ABI0()
// Load g and save to TLS entry.
// See cmd/link/internal/ld/sym.go:computeTLSOffset.
MOVQ m_g0(DI), DX // g
MOVQ DX, -8(FS)
- // Someday the convention will be D is always cleared.
- CLD
-
CALL runtime·mstart(SB)
- // Restore callee-save registers.
- MOVQ 0(SP), BX
- MOVQ 8(SP), BP
- MOVQ 16(SP), R12
- MOVQ 24(SP), R13
- MOVQ 32(SP), R14
- MOVQ 40(SP), R15
+ POP_REGS_HOST_TO_ABI0()
// Go is all done with this OS thread.
// Tell pthread everything is ok (we never join with this thread, so
// the value here doesn't really matter).
XORL AX, AX
-
- ADDQ $48, SP
RET
TEXT runtime·sigfwd(SB),NOSPLIT,$0-32
@@ -73,28 +57,21 @@ TEXT runtime·sigfwd(SB),NOSPLIT,$0-32
POPQ BP
RET
-TEXT runtime·sigtramp(SB),NOSPLIT,$72
- // Save callee-saved C registers, since the caller may be a C signal handler.
- MOVQ BX, bx-8(SP)
- MOVQ BP, bp-16(SP) // save in case GOEXPERIMENT=noframepointer is set
- MOVQ R12, r12-24(SP)
- MOVQ R13, r13-32(SP)
- MOVQ R14, r14-40(SP)
- MOVQ R15, r15-48(SP)
- // We don't save mxcsr or the x87 control word because sigtrampgo doesn't
- // modify them.
+// Called using C ABI.
+TEXT runtime·sigtramp<ABIInternal>(SB),NOSPLIT,$0
+ // Transition from C ABI to Go ABI.
+ PUSH_REGS_HOST_TO_ABI0()
- MOVQ DX, ctx-56(SP)
- MOVQ SI, info-64(SP)
- MOVQ DI, signum-72(SP)
- CALL runtime·sigtrampgo(SB)
+ // Call into the Go signal handler
+ NOP SP // disable vet stack checking
+ ADJSP $24
+ MOVQ DI, 0(SP) // sig
+ MOVQ SI, 8(SP) // info
+ MOVQ DX, 16(SP) // ctx
+ CALL ·sigtrampgo(SB)
+ ADJSP $-24
- MOVQ r15-48(SP), R15
- MOVQ r14-40(SP), R14
- MOVQ r13-32(SP), R13
- MOVQ r12-24(SP), R12
- MOVQ bp-16(SP), BP
- MOVQ bx-8(SP), BX
+ POP_REGS_HOST_TO_ABI0()
RET
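The Go handler this trampoline reaches keeps its existing signature; a
sketch of sigtrampgo as declared in signal_unix.go (body elided here):

	//go:nosplit
	//go:nowritebarrierrec
	func sigtrampgo(sig uint32, info *siginfo, ctx unsafe.Pointer) {
		// Finds the current g (if any), switches to the gsignal
		// stack when needed, and dispatches to sighandler.
	}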
//
diff --git a/src/runtime/sys_openbsd_arm.s b/src/runtime/sys_openbsd_arm.s
index 9e18ce0e16..45d69a312a 100644
--- a/src/runtime/sys_openbsd_arm.s
+++ b/src/runtime/sys_openbsd_arm.s
@@ -188,8 +188,8 @@ TEXT runtime·setitimer(SB),NOSPLIT,$0
INVOKE_SYSCALL
RET
-// func walltime1() (sec int64, nsec int32)
-TEXT runtime·walltime1(SB), NOSPLIT, $32
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB), NOSPLIT, $32
MOVW CLOCK_REALTIME, R0 // arg 1 - clock_id
MOVW $8(R13), R1 // arg 2 - tp
MOVW $87, R12 // sys_clock_gettime
diff --git a/src/runtime/sys_openbsd_mips64.s b/src/runtime/sys_openbsd_mips64.s
index 3e4d209081..f8ae8e7c30 100644
--- a/src/runtime/sys_openbsd_mips64.s
+++ b/src/runtime/sys_openbsd_mips64.s
@@ -186,8 +186,8 @@ TEXT runtime·setitimer(SB),NOSPLIT,$0
SYSCALL
RET
-// func walltime1() (sec int64, nsec int32)
-TEXT runtime·walltime1(SB), NOSPLIT, $32
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB), NOSPLIT, $32
MOVW CLOCK_REALTIME, R4 // arg 1 - clock_id
MOVV $8(R29), R5 // arg 2 - tp
MOVV $87, R2 // sys_clock_gettime
diff --git a/src/runtime/sys_plan9_386.s b/src/runtime/sys_plan9_386.s
index f9969f6d10..b3d2f1376d 100644
--- a/src/runtime/sys_plan9_386.s
+++ b/src/runtime/sys_plan9_386.s
@@ -102,8 +102,8 @@ TEXT runtime·nsec(SB),NOSPLIT,$8
MOVL $-1, ret_hi+8(FP)
RET
-// func walltime1() (sec int64, nsec int32)
-TEXT runtime·walltime1(SB),NOSPLIT,$8-12
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB),NOSPLIT,$8-12
CALL runtime·nanotime1(SB)
MOVL 0(SP), AX
MOVL 4(SP), DX
diff --git a/src/runtime/sys_plan9_amd64.s b/src/runtime/sys_plan9_amd64.s
index 383622b5a6..731306ab44 100644
--- a/src/runtime/sys_plan9_amd64.s
+++ b/src/runtime/sys_plan9_amd64.s
@@ -88,8 +88,8 @@ TEXT runtime·nsec(SB),NOSPLIT,$0
MOVQ AX, ret+8(FP)
RET
-// func walltime1() (sec int64, nsec int32)
-TEXT runtime·walltime1(SB),NOSPLIT,$8-12
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB),NOSPLIT,$8-12
CALL runtime·nanotime1(SB)
MOVQ 0(SP), AX
diff --git a/src/runtime/sys_plan9_arm.s b/src/runtime/sys_plan9_arm.s
index 9fbe30536d..5343085743 100644
--- a/src/runtime/sys_plan9_arm.s
+++ b/src/runtime/sys_plan9_arm.s
@@ -138,8 +138,8 @@ TEXT runtime·nsec(SB),NOSPLIT|NOFRAME,$0-12
MOVW R0, ret_hi+8(FP)
RET
-// func walltime1() (sec int64, nsec int32)
-TEXT runtime·walltime1(SB),NOSPLIT,$12-12
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB),NOSPLIT,$12-12
// use nsec system call to get current time in nanoseconds
MOVW $sysnsec_lo-8(SP), R0 // destination addr
MOVW R0,res-12(SP)
diff --git a/src/runtime/sys_ppc64x.go b/src/runtime/sys_ppc64x.go
index 796f27c4e3..69bd99fa09 100644
--- a/src/runtime/sys_ppc64x.go
+++ b/src/runtime/sys_ppc64x.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ppc64 || ppc64le
// +build ppc64 ppc64le
package runtime
diff --git a/src/runtime/sys_wasm.s b/src/runtime/sys_wasm.s
index e7a6570095..164dd16ec9 100644
--- a/src/runtime/sys_wasm.s
+++ b/src/runtime/sys_wasm.s
@@ -185,7 +185,7 @@ TEXT ·nanotime1(SB), NOSPLIT, $0
CallImport
RET
-TEXT ·walltime1(SB), NOSPLIT, $0
+TEXT ·walltime(SB), NOSPLIT, $0
CallImport
RET
diff --git a/src/runtime/sys_windows_386.s b/src/runtime/sys_windows_386.s
index 4f00c58c16..0b3933502a 100644
--- a/src/runtime/sys_windows_386.s
+++ b/src/runtime/sys_windows_386.s
@@ -5,9 +5,10 @@
#include "go_asm.h"
#include "go_tls.h"
#include "textflag.h"
+#include "time_windows.h"
// void runtime·asmstdcall(void *c);
-TEXT runtime·asmstdcall(SB),NOSPLIT,$0
+TEXT runtime·asmstdcall<ABIInternal>(SB),NOSPLIT,$0
MOVL fn+0(FP), BX
// SetLastError(0).
@@ -57,7 +58,9 @@ TEXT runtime·badsignal2(SB),NOSPLIT,$24
MOVL DX, 12(SP)
MOVL $0, 16(SP) // overlapped
CALL *runtime·_WriteFile(SB)
- MOVL BP, SI
+
+ // Does not return.
+ CALL runtime·abort(SB)
RET
// faster get/set last error
@@ -144,90 +147,21 @@ done:
BYTE $0xC2; WORD $4
RET // unreached; make assembler happy
-TEXT runtime·exceptiontramp(SB),NOSPLIT,$0
+TEXT runtime·exceptiontramp<ABIInternal>(SB),NOSPLIT,$0
MOVL $runtime·exceptionhandler(SB), AX
JMP sigtramp<>(SB)
-TEXT runtime·firstcontinuetramp(SB),NOSPLIT,$0-0
+TEXT runtime·firstcontinuetramp<ABIInternal>(SB),NOSPLIT,$0-0
// is never called
INT $3
-TEXT runtime·lastcontinuetramp(SB),NOSPLIT,$0-0
+TEXT runtime·lastcontinuetramp<ABIInternal>(SB),NOSPLIT,$0-0
MOVL $runtime·lastcontinuehandler(SB), AX
JMP sigtramp<>(SB)
-// Called by OS using stdcall ABI: bool ctrlhandler(uint32).
-TEXT runtime·ctrlhandler(SB),NOSPLIT,$0
- PUSHL $runtime·ctrlhandler1(SB)
- NOP SP // tell vet SP changed - stop checking offsets
- CALL runtime·externalthreadhandler(SB)
- MOVL 4(SP), CX
- ADDL $12, SP
- JMP CX
-
-// Called by OS using stdcall ABI: uint32 profileloop(void*).
-TEXT runtime·profileloop(SB),NOSPLIT,$0
- PUSHL $runtime·profileloop1(SB)
- NOP SP // tell vet SP changed - stop checking offsets
- CALL runtime·externalthreadhandler(SB)
- MOVL 4(SP), CX
- ADDL $12, SP
- JMP CX
-
-TEXT runtime·externalthreadhandler(SB),NOSPLIT|TOPFRAME,$0
- PUSHL BP
- MOVL SP, BP
- PUSHL BX
- PUSHL SI
- PUSHL DI
- PUSHL 0x14(FS)
- MOVL SP, DX
-
- // setup dummy m, g
- SUBL $m__size, SP // space for M
- MOVL SP, 0(SP)
- MOVL $m__size, 4(SP)
- CALL runtime·memclrNoHeapPointers(SB) // smashes AX,BX,CX
-
- LEAL m_tls(SP), CX
- MOVL CX, 0x14(FS)
- MOVL SP, BX
- SUBL $g__size, SP // space for G
- MOVL SP, g(CX)
- MOVL SP, m_g0(BX)
-
- MOVL SP, 0(SP)
- MOVL $g__size, 4(SP)
- CALL runtime·memclrNoHeapPointers(SB) // smashes AX,BX,CX
- LEAL g__size(SP), BX
- MOVL BX, g_m(SP)
-
- LEAL -32768(SP), CX // must be less than SizeOfStackReserve set by linker
- MOVL CX, (g_stack+stack_lo)(SP)
- ADDL $const__StackGuard, CX
- MOVL CX, g_stackguard0(SP)
- MOVL CX, g_stackguard1(SP)
- MOVL DX, (g_stack+stack_hi)(SP)
-
- PUSHL AX // room for return value
- PUSHL 16(BP) // arg for handler
- CALL 8(BP)
- POPL CX
- POPL AX // pass return value to Windows in AX
-
- get_tls(CX)
- MOVL g(CX), CX
- MOVL (g_stack+stack_hi)(CX), SP
- POPL 0x14(FS)
- POPL DI
- POPL SI
- POPL BX
- POPL BP
- RET
-
GLOBL runtime·cbctxts(SB), NOPTR, $4
-TEXT runtime·callbackasm1(SB),NOSPLIT,$0
+TEXT runtime·callbackasm1<ABIInternal>(SB),NOSPLIT,$0
MOVL 0(SP), AX // will use to find our callback context
// remove return address from stack, we are not returning to callbackasm, but to its caller.
@@ -246,7 +180,7 @@ TEXT runtime·callbackasm1(SB),NOSPLIT,$0
CLD
// determine index into runtime·cbs table
- SUBL $runtime·callbackasm(SB), AX
+ SUBL $runtime·callbackasm<ABIInternal>(SB), AX
MOVL $0, DX
MOVL $5, BX // divide by 5 because each call instruction in runtime·callbacks is 5 bytes long
DIVL BX
@@ -316,7 +250,7 @@ TEXT tstart<>(SB),NOSPLIT,$0
RET
// uint32 tstart_stdcall(M *newm);
-TEXT runtime·tstart_stdcall(SB),NOSPLIT,$0
+TEXT runtime·tstart_stdcall<ABIInternal>(SB),NOSPLIT,$0
MOVL newm+0(FP), BX
PUSHL BX
@@ -400,14 +334,6 @@ TEXT runtime·switchtothread(SB),NOSPLIT,$0
MOVL BP, SP
RET
-// See https://www.dcl.hpi.uni-potsdam.de/research/WRK/2007/08/getting-os-information-the-kuser_shared_data-structure/
-// Must read hi1, then lo, then hi2. The snapshot is valid if hi1 == hi2.
-#define _INTERRUPT_TIME 0x7ffe0008
-#define _SYSTEM_TIME 0x7ffe0014
-#define time_lo 0
-#define time_hi1 4
-#define time_hi2 8
-
TEXT runtime·nanotime1(SB),NOSPLIT,$0-8
CMPB runtime·useQPCTime(SB), $0
JNE useQPC
@@ -430,78 +356,3 @@ loop:
useQPC:
JMP runtime·nanotimeQPC(SB)
RET
-
-TEXT time·now(SB),NOSPLIT,$0-20
- CMPB runtime·useQPCTime(SB), $0
- JNE useQPC
-loop:
- MOVL (_INTERRUPT_TIME+time_hi1), AX
- MOVL (_INTERRUPT_TIME+time_lo), CX
- MOVL (_INTERRUPT_TIME+time_hi2), DI
- CMPL AX, DI
- JNE loop
-
- // w = DI:CX
- // multiply by 100
- MOVL $100, AX
- MULL CX
- IMULL $100, DI
- ADDL DI, DX
- // w*100 = DX:AX
- MOVL AX, mono+12(FP)
- MOVL DX, mono+16(FP)
-
-wall:
- MOVL (_SYSTEM_TIME+time_hi1), CX
- MOVL (_SYSTEM_TIME+time_lo), AX
- MOVL (_SYSTEM_TIME+time_hi2), DX
- CMPL CX, DX
- JNE wall
-
- // w = DX:AX
- // convert to Unix epoch (but still 100ns units)
- #define delta 116444736000000000
- SUBL $(delta & 0xFFFFFFFF), AX
- SBBL $(delta >> 32), DX
-
- // nano/100 = DX:AX
- // split into two decimal halves by div 1e9.
- // (decimal point is two spots over from correct place,
- // but we avoid overflow in the high word.)
- MOVL $1000000000, CX
- DIVL CX
- MOVL AX, DI
- MOVL DX, SI
-
- // DI = nano/100/1e9 = nano/1e11 = sec/100, DX = SI = nano/100%1e9
- // split DX into seconds and nanoseconds by div 1e7 magic multiply.
- MOVL DX, AX
- MOVL $1801439851, CX
- MULL CX
- SHRL $22, DX
- MOVL DX, BX
- IMULL $10000000, DX
- MOVL SI, CX
- SUBL DX, CX
-
- // DI = sec/100 (still)
- // BX = (nano/100%1e9)/1e7 = (nano/1e9)%100 = sec%100
- // CX = (nano/100%1e9)%1e7 = (nano%1e9)/100 = nsec/100
- // store nsec for return
- IMULL $100, CX
- MOVL CX, nsec+8(FP)
-
- // DI = sec/100 (still)
- // BX = sec%100
- // construct DX:AX = 64-bit sec and store for return
- MOVL $0, DX
- MOVL $100, AX
- MULL DI
- ADDL BX, AX
- ADCL $0, DX
- MOVL AX, sec+0(FP)
- MOVL DX, sec+4(FP)
- RET
-useQPC:
- JMP runtime·nowQPC(SB)
- RET
diff --git a/src/runtime/sys_windows_amd64.s b/src/runtime/sys_windows_amd64.s
index aba2811e59..e7782846b2 100644
--- a/src/runtime/sys_windows_amd64.s
+++ b/src/runtime/sys_windows_amd64.s
@@ -5,13 +5,15 @@
#include "go_asm.h"
#include "go_tls.h"
#include "textflag.h"
+#include "time_windows.h"
+#include "cgo/abi_amd64.h"
// maxargs should be divisible by 2, as Windows stack
// must be kept 16-byte aligned on syscall entry.
-#define maxargs 16
+#define maxargs 18
// void runtime·asmstdcall(void *c);
-TEXT runtime·asmstdcall(SB),NOSPLIT|NOFRAME,$0
+TEXT runtime·asmstdcall<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
// asmcgocall will put first argument into CX.
PUSHQ CX // save for later
MOVQ libcall_fn(CX), AX
@@ -94,6 +96,8 @@ TEXT runtime·badsignal2(SB),NOSPLIT|NOFRAME,$48
MOVQ runtime·_WriteFile(SB), AX
CALL AX
+ // Does not return.
+ CALL runtime·abort(SB)
RET
// faster get/set last error
@@ -111,18 +115,10 @@ TEXT runtime·getlasterror(SB),NOSPLIT,$0
TEXT sigtramp<>(SB),NOSPLIT|NOFRAME,$0-0
// CX: PEXCEPTION_POINTERS ExceptionInfo
- // DI SI BP BX R12 R13 R14 R15 registers and DF flag are preserved
- // as required by windows callback convention.
- PUSHFQ
- SUBQ $112, SP
- MOVQ DI, 80(SP)
- MOVQ SI, 72(SP)
- MOVQ BP, 64(SP)
- MOVQ BX, 56(SP)
- MOVQ R12, 48(SP)
- MOVQ R13, 40(SP)
- MOVQ R14, 32(SP)
- MOVQ R15, 88(SP)
+ // Switch from the host ABI to the Go ABI.
+ PUSH_REGS_HOST_TO_ABI0()
+ // Make stack space for the rest of the function.
+ ADJSP $48
MOVQ AX, R15 // save handler address
@@ -138,8 +134,8 @@ TEXT sigtramp<>(SB),NOSPLIT|NOFRAME,$0-0
CALL runtime·badsignal2(SB)
// save g and SP in case of stack switch
- MOVQ DX, 96(SP) // g
- MOVQ SP, 104(SP)
+ MOVQ DX, 32(SP) // g
+ MOVQ SP, 40(SP)
// do we need to switch to the g0 stack?
MOVQ g_m(DX), BX
@@ -153,9 +149,11 @@ TEXT sigtramp<>(SB),NOSPLIT|NOFRAME,$0-0
MOVQ (g_sched+gobuf_sp)(BX), DI
// make room for sighandler arguments
// and re-save old SP for restoring later.
- // (note that the 104(DI) here must match the 104(SP) above.)
- SUBQ $120, DI
- MOVQ SP, 104(DI)
+ // Adjust g0 stack by the space we're using and
+ // save SP at the same place on the g0 stack.
+ // The 40(DI) here must match the 40(SP) above.
+ SUBQ $(REGS_HOST_TO_ABI0_STACK + 48), DI
+ MOVQ SP, 40(DI)
MOVQ DI, SP
g0:
@@ -170,102 +168,29 @@ g0:
// switch back to original stack and g
// no-op if we never left.
- MOVQ 104(SP), SP
- MOVQ 96(SP), DX
+ MOVQ 40(SP), SP
+ MOVQ 32(SP), DX
get_tls(BP)
MOVQ DX, g(BP)
done:
- // restore registers as required for windows callback
- MOVQ 88(SP), R15
- MOVQ 32(SP), R14
- MOVQ 40(SP), R13
- MOVQ 48(SP), R12
- MOVQ 56(SP), BX
- MOVQ 64(SP), BP
- MOVQ 72(SP), SI
- MOVQ 80(SP), DI
- ADDQ $112, SP
- POPFQ
+ ADJSP $-48
+ POP_REGS_HOST_TO_ABI0()
RET
-TEXT runtime·exceptiontramp(SB),NOSPLIT|NOFRAME,$0
+TEXT runtime·exceptiontramp<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
MOVQ $runtime·exceptionhandler(SB), AX
JMP sigtramp<>(SB)
-TEXT runtime·firstcontinuetramp(SB),NOSPLIT|NOFRAME,$0-0
+TEXT runtime·firstcontinuetramp<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-0
MOVQ $runtime·firstcontinuehandler(SB), AX
JMP sigtramp<>(SB)
-TEXT runtime·lastcontinuetramp(SB),NOSPLIT|NOFRAME,$0-0
+TEXT runtime·lastcontinuetramp<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-0
MOVQ $runtime·lastcontinuehandler(SB), AX
JMP sigtramp<>(SB)
-TEXT runtime·ctrlhandler(SB),NOSPLIT|NOFRAME,$8
- MOVQ CX, 16(SP) // spill
- MOVQ $runtime·ctrlhandler1(SB), CX
- MOVQ CX, 0(SP)
- CALL runtime·externalthreadhandler(SB)
- RET
-
-TEXT runtime·profileloop(SB),NOSPLIT|NOFRAME,$8
- MOVQ $runtime·profileloop1(SB), CX
- MOVQ CX, 0(SP)
- CALL runtime·externalthreadhandler(SB)
- RET
-
-TEXT runtime·externalthreadhandler(SB),NOSPLIT|NOFRAME|TOPFRAME,$0
- PUSHQ BP
- MOVQ SP, BP
- PUSHQ BX
- PUSHQ SI
- PUSHQ DI
- PUSHQ 0x28(GS)
- MOVQ SP, DX
-
- // setup dummy m, g
- SUBQ $m__size, SP // space for M
- MOVQ SP, 0(SP)
- MOVQ $m__size, 8(SP)
- CALL runtime·memclrNoHeapPointers(SB) // smashes AX,BX,CX, maybe BP
-
- LEAQ m_tls(SP), CX
- MOVQ CX, 0x28(GS)
- MOVQ SP, BX
- SUBQ $g__size, SP // space for G
- MOVQ SP, g(CX)
- MOVQ SP, m_g0(BX)
-
- MOVQ SP, 0(SP)
- MOVQ $g__size, 8(SP)
- CALL runtime·memclrNoHeapPointers(SB) // smashes AX,BX,CX, maybe BP
- LEAQ g__size(SP), BX
- MOVQ BX, g_m(SP)
-
- LEAQ -32768(SP), CX // must be less than SizeOfStackReserve set by linker
- MOVQ CX, (g_stack+stack_lo)(SP)
- ADDQ $const__StackGuard, CX
- MOVQ CX, g_stackguard0(SP)
- MOVQ CX, g_stackguard1(SP)
- MOVQ DX, (g_stack+stack_hi)(SP)
-
- PUSHQ AX // room for return value
- PUSHQ 32(BP) // arg for handler
- CALL 16(BP)
- POPQ CX
- POPQ AX // pass return value to Windows in AX
-
- get_tls(CX)
- MOVQ g(CX), CX
- MOVQ (g_stack+stack_hi)(CX), SP
- POPQ 0x28(GS)
- POPQ DI
- POPQ SI
- POPQ BX
- POPQ BP
- RET
-
GLOBL runtime·cbctxts(SB), NOPTR, $8
TEXT runtime·callbackasm1(SB),NOSPLIT,$0
@@ -287,28 +212,15 @@ TEXT runtime·callbackasm1(SB),NOSPLIT,$0
ADDQ $8, SP
// determine index into runtime·cbs table
- MOVQ $runtime·callbackasm(SB), DX
+ MOVQ $runtime·callbackasm<ABIInternal>(SB), DX
SUBQ DX, AX
MOVQ $0, DX
MOVQ $5, CX // divide by 5 because each call instruction in runtime·callbacks is 5 bytes long
DIVL CX
SUBQ $1, AX // subtract 1 because return PC is to the next slot
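The index recovery above is simple pointer arithmetic; a hypothetical Go
helper showing the same computation:

	// cbIndex recovers the callback slot from the return PC pushed by
	// the CALL in the callbackasm jump table. Each CALL instruction is
	// 5 bytes (0xE8 + rel32), and the return PC points at the slot
	// *after* the one that fired, hence the final -1.
	func cbIndex(retPC, base uintptr) uintptr {
		return (retPC-base)/5 - 1
	}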
- // DI SI BP BX R12 R13 R14 R15 registers and DF flag are preserved
- // as required by windows callback convention.
- PUSHFQ
- SUBQ $64, SP
- MOVQ DI, 56(SP)
- MOVQ SI, 48(SP)
- MOVQ BP, 40(SP)
- MOVQ BX, 32(SP)
- MOVQ R12, 24(SP)
- MOVQ R13, 16(SP)
- MOVQ R14, 8(SP)
- MOVQ R15, 0(SP)
-
- // Go ABI requires DF flag to be cleared.
- CLD
+ // Switch from the host ABI to the Go ABI.
+ PUSH_REGS_HOST_TO_ABI0()
// Create a struct callbackArgs on our stack to be passed as
// the "frame" to cgocallback and on to callbackWrap.
@@ -320,30 +232,23 @@ TEXT runtime·callbackasm1(SB),NOSPLIT,$0
// Call cgocallback, which will call callbackWrap(frame).
MOVQ $0, 16(SP) // context
MOVQ AX, 8(SP) // frame (address of callbackArgs)
- LEAQ ·callbackWrap(SB), BX
+ LEAQ ·callbackWrap<ABIInternal>(SB), BX // cgocallback takes an ABIInternal entry-point
MOVQ BX, 0(SP) // PC of function value to call (callbackWrap)
CALL ·cgocallback(SB)
// Get callback result.
MOVQ (24+callbackArgs_result)(SP), AX
ADDQ $(24+callbackArgs__size), SP
- // restore registers as required for windows callback
- MOVQ 0(SP), R15
- MOVQ 8(SP), R14
- MOVQ 16(SP), R13
- MOVQ 24(SP), R12
- MOVQ 32(SP), BX
- MOVQ 40(SP), BP
- MOVQ 48(SP), SI
- MOVQ 56(SP), DI
- ADDQ $64, SP
- POPFQ
+ POP_REGS_HOST_TO_ABI0()
// The return value was placed in AX above.
RET
// uint32 tstart_stdcall(M *newm);
-TEXT runtime·tstart_stdcall(SB),NOSPLIT,$0
+TEXT runtime·tstart_stdcall<ABIInternal>(SB),NOSPLIT,$0
+ // Switch from the host ABI to the Go ABI.
+ PUSH_REGS_HOST_TO_ABI0()
+
// CX contains first arg newm
MOVQ m_g0(CX), DX // g
@@ -362,12 +267,11 @@ TEXT runtime·tstart_stdcall(SB),NOSPLIT,$0
MOVQ CX, g_m(DX)
MOVQ DX, g(SI)
- // Someday the convention will be D is always cleared.
- CLD
-
CALL runtime·stackcheck(SB) // clobbers AX,CX
CALL runtime·mstart(SB)
+ POP_REGS_HOST_TO_ABI0()
+
XORL AX, AX // return 0 == success
RET
@@ -440,14 +344,6 @@ TEXT runtime·switchtothread(SB),NOSPLIT|NOFRAME,$0
MOVQ 32(SP), SP
RET
-// See https://www.dcl.hpi.uni-potsdam.de/research/WRK/2007/08/getting-os-information-the-kuser_shared_data-structure/
-// Must read hi1, then lo, then hi2. The snapshot is valid if hi1 == hi2.
-#define _INTERRUPT_TIME 0x7ffe0008
-#define _SYSTEM_TIME 0x7ffe0014
-#define time_lo 0
-#define time_hi1 4
-#define time_hi2 8
-
TEXT runtime·nanotime1(SB),NOSPLIT,$0-8
CMPB runtime·useQPCTime(SB), $0
JNE useQPC
@@ -467,48 +363,10 @@ useQPC:
JMP runtime·nanotimeQPC(SB)
RET
-TEXT time·now(SB),NOSPLIT,$0-24
- CMPB runtime·useQPCTime(SB), $0
- JNE useQPC
- MOVQ $_INTERRUPT_TIME, DI
-loop:
- MOVL time_hi1(DI), AX
- MOVL time_lo(DI), BX
- MOVL time_hi2(DI), CX
- CMPL AX, CX
- JNE loop
- SHLQ $32, AX
- ORQ BX, AX
- IMULQ $100, AX
- MOVQ AX, mono+16(FP)
-
- MOVQ $_SYSTEM_TIME, DI
-wall:
- MOVL time_hi1(DI), AX
- MOVL time_lo(DI), BX
- MOVL time_hi2(DI), CX
- CMPL AX, CX
- JNE wall
- SHLQ $32, AX
- ORQ BX, AX
- MOVQ $116444736000000000, DI
- SUBQ DI, AX
- IMULQ $100, AX
-
- // generated code for
- // func f(x uint64) (uint64, uint64) { return x/1000000000, x%100000000 }
- // adapted to reduce duplication
- MOVQ AX, CX
- MOVQ $1360296554856532783, AX
- MULQ CX
- ADDQ CX, DX
- RCRQ $1, DX
- SHRQ $29, DX
- MOVQ DX, sec+0(FP)
- IMULQ $1000000000, DX
- SUBQ DX, CX
- MOVL CX, nsec+8(FP)
- RET
-useQPC:
- JMP runtime·nowQPC(SB)
+// func osSetupTLS(mp *m)
+// Set up TLS, for use by needm on Windows.
+TEXT runtime·osSetupTLS(SB),NOSPLIT,$0-8
+ MOVQ mp+0(FP), AX
+ LEAQ m_tls(AX), DI
+ CALL runtime·settls(SB)
RET
diff --git a/src/runtime/sys_windows_arm.s b/src/runtime/sys_windows_arm.s
index cd230ccffd..48f8c7dedf 100644
--- a/src/runtime/sys_windows_arm.s
+++ b/src/runtime/sys_windows_arm.s
@@ -5,11 +5,12 @@
#include "go_asm.h"
#include "go_tls.h"
#include "textflag.h"
+#include "time_windows.h"
// Note: For system ABI, R0-R3 are args, R4-R11 are callee-save.
// void runtime·asmstdcall(void *c);
-TEXT runtime·asmstdcall(SB),NOSPLIT|NOFRAME,$0
+TEXT runtime·asmstdcall<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
MOVM.DB.W [R4, R5, R14], (R13) // push {r4, r5, lr}
MOVW R0, R4 // put libcall * in r4
MOVW R13, R5 // save stack pointer in r5
@@ -96,8 +97,8 @@ TEXT runtime·badsignal2(SB),NOSPLIT|NOFRAME,$0
MOVW runtime·_WriteFile(SB), R12
BL (R12)
- MOVW R4, R13 // restore SP
- MOVM.IA.W (R13), [R4, R15] // pop {r4, pc}
+ // Does not return.
+ B runtime·abort(SB)
TEXT runtime·getlasterror(SB),NOSPLIT,$0
MRC 15, 0, R0, C13, C0, 2
@@ -157,9 +158,14 @@ g0:
MOVW R2, 4(R13) // Move arg0 (ExceptionRecord) into position
MOVW R3, 8(R13) // Move arg1 (ContextRecord) into position
MOVW R5, 12(R13) // Move arg2 (original g) into position
- BL (R7) // Call the go routine
+ BL (R7) // Call the goroutine
MOVW 16(R13), R4 // Fetch return value from stack
+ // Save system stack pointer for sigresume setup below.
+ // The exact value does not matter - nothing is read or written
+ // from this address. It just needs to be on the system stack.
+ MOVW R13, R12
+
// switch back to original stack and g
MOVW 24(R13), R13
MOVW 20(R13), g
@@ -216,96 +222,21 @@ TEXT sigresume<>(SB),NOSPLIT|NOFRAME,$0
MOVW R0, R13
B (R1)
-TEXT runtime·exceptiontramp(SB),NOSPLIT|NOFRAME,$0
+TEXT runtime·exceptiontramp<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
MOVW $runtime·exceptionhandler(SB), R1
B sigtramp<>(SB)
-TEXT runtime·firstcontinuetramp(SB),NOSPLIT|NOFRAME,$0
+TEXT runtime·firstcontinuetramp<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
MOVW $runtime·firstcontinuehandler(SB), R1
B sigtramp<>(SB)
-TEXT runtime·lastcontinuetramp(SB),NOSPLIT|NOFRAME,$0
+TEXT runtime·lastcontinuetramp<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
MOVW $runtime·lastcontinuehandler(SB), R1
B sigtramp<>(SB)
-TEXT runtime·ctrlhandler(SB),NOSPLIT|NOFRAME,$0
- MOVW $runtime·ctrlhandler1(SB), R1
- B runtime·externalthreadhandler(SB)
-
-TEXT runtime·profileloop(SB),NOSPLIT|NOFRAME,$0
- MOVW $runtime·profileloop1(SB), R1
- B runtime·externalthreadhandler(SB)
-
-// int32 externalthreadhandler(uint32 arg, int (*func)(uint32))
-// stack layout:
-// +----------------+
-// | callee-save |
-// | registers |
-// +----------------+
-// | m |
-// +----------------+
-// 20| g |
-// +----------------+
-// 16| func ptr (r1) |
-// +----------------+
-// 12| argument (r0) |
-//---+----------------+
-// 8 | param1 | (also return value for called Go function)
-// +----------------+
-// 4 | param0 |
-// +----------------+
-// 0 | slot for LR |
-// +----------------+
-//
-TEXT runtime·externalthreadhandler(SB),NOSPLIT|NOFRAME|TOPFRAME,$0
- MOVM.DB.W [R4-R11, R14], (R13) // push {r4-r11, lr}
- SUB $(m__size + g__size + 20), R13 // space for locals
- MOVW R14, 0(R13) // push LR again for anything unwinding the stack
- MOVW R0, 12(R13)
- MOVW R1, 16(R13)
-
- // zero out m and g structures
- ADD $20, R13, R0 // compute pointer to g
- MOVW R0, 4(R13)
- MOVW $(m__size + g__size), R0
- MOVW R0, 8(R13)
- BL runtime·memclrNoHeapPointers(SB)
-
- // initialize m and g structures
- ADD $20, R13, R2 // R2 = g
- ADD $(20 + g__size), R13, R3 // R3 = m
- MOVW R2, m_g0(R3) // m->g0 = g
- MOVW R3, g_m(R2) // g->m = m
- MOVW R2, m_curg(R3) // m->curg = g
-
- MOVW R2, g
- BL runtime·save_g(SB)
-
- // set up stackguard stuff
- MOVW R13, R0
- MOVW R0, g_stack+stack_hi(g)
- SUB $(32*1024), R0
- MOVW R0, (g_stack+stack_lo)(g)
- MOVW R0, g_stackguard0(g)
- MOVW R0, g_stackguard1(g)
-
- // move argument into position and call function
- MOVW 12(R13), R0
- MOVW R0, 4(R13)
- MOVW 16(R13), R1
- BL (R1)
-
- // clear g
- MOVW $0, g
- BL runtime·save_g(SB)
-
- MOVW 8(R13), R0 // load return value
- ADD $(m__size + g__size + 20), R13 // free locals
- MOVM.IA.W (R13), [R4-R11, R15] // pop {r4-r11, pc}
-
GLOBL runtime·cbctxts(SB), NOPTR, $4
-TEXT runtime·callbackasm1(SB),NOSPLIT|NOFRAME,$0
+TEXT runtime·callbackasm1<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
// On entry, the trampoline in zcallback_windows_arm.s left
// the callback index in R12 (which is volatile in the C ABI).
@@ -344,7 +275,7 @@ TEXT runtime·callbackasm1(SB),NOSPLIT|NOFRAME,$0
B (R12) // return
// uint32 tstart_stdcall(M *newm);
-TEXT runtime·tstart_stdcall(SB),NOSPLIT|NOFRAME,$0
+TEXT runtime·tstart_stdcall<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
MOVM.DB.W [R4-R11, R14], (R13) // push {r4-r11, lr}
MOVW m_g0(R0), g
@@ -370,12 +301,11 @@ TEXT runtime·tstart_stdcall(SB),NOSPLIT|NOFRAME,$0
// duration (in -100ns units) is in dt+0(FP).
// g may be nil.
TEXT runtime·usleep2(SB),NOSPLIT|NOFRAME,$0-4
- MOVW dt+0(FP), R0
+ MOVW dt+0(FP), R3
MOVM.DB.W [R4, R14], (R13) // push {r4, lr}
MOVW R13, R4 // Save SP
SUB $8, R13 // R13 = R13 - 8
BIC $0x7, R13 // Align SP for ABI
- RSB $0, R0, R3 // R3 = -R0
MOVW $0, R1 // R1 = FALSE (alertable)
MOVW $-1, R0 // R0 = handle
MOVW R13, R2 // R2 = pTime
@@ -406,20 +336,12 @@ TEXT runtime·switchtothread(SB),NOSPLIT|NOFRAME,$0
TEXT ·publicationBarrier(SB),NOSPLIT|NOFRAME,$0-0
B runtime·armPublicationBarrier(SB)
-// never called (cgo not supported)
+// never called (this is a GOARM=7 platform)
TEXT runtime·read_tls_fallback(SB),NOSPLIT|NOFRAME,$0
MOVW $0xabcd, R0
MOVW R0, (R0)
RET
-// See http://www.dcl.hpi.uni-potsdam.de/research/WRK/2007/08/getting-os-information-the-kuser_shared_data-structure/
-// Must read hi1, then lo, then hi2. The snapshot is valid if hi1 == hi2.
-#define _INTERRUPT_TIME 0x7ffe0008
-#define _SYSTEM_TIME 0x7ffe0014
-#define time_lo 0
-#define time_hi1 4
-#define time_hi2 8
-
TEXT runtime·nanotime1(SB),NOSPLIT|NOFRAME,$0-8
MOVW $0, R0
MOVB runtime·useQPCTime(SB), R0
@@ -445,82 +367,6 @@ loop:
useQPC:
B runtime·nanotimeQPC(SB) // tail call
-TEXT time·now(SB),NOSPLIT|NOFRAME,$0-20
- MOVW $0, R0
- MOVB runtime·useQPCTime(SB), R0
- CMP $0, R0
- BNE useQPC
- MOVW $_INTERRUPT_TIME, R3
-loop:
- MOVW time_hi1(R3), R1
- MOVW time_lo(R3), R0
- MOVW time_hi2(R3), R2
- CMP R1, R2
- BNE loop
-
- // wintime = R1:R0, multiply by 100
- MOVW $100, R2
- MULLU R0, R2, (R4, R3) // R4:R3 = R1:R0 * R2
- MULA R1, R2, R4, R4
-
- // wintime*100 = R4:R3
- MOVW R3, mono+12(FP)
- MOVW R4, mono+16(FP)
-
- MOVW $_SYSTEM_TIME, R3
-wall:
- MOVW time_hi1(R3), R1
- MOVW time_lo(R3), R0
- MOVW time_hi2(R3), R2
- CMP R1, R2
- BNE wall
-
- // w = R1:R0 in 100ns units
- // convert to Unix epoch (but still 100ns units)
- #define delta 116444736000000000
- SUB.S $(delta & 0xFFFFFFFF), R0
- SBC $(delta >> 32), R1
-
- // Convert to nSec
- MOVW $100, R2
- MULLU R0, R2, (R4, R3) // R4:R3 = R1:R0 * R2
- MULA R1, R2, R4, R4
- // w = R2:R1 in nSec
- MOVW R3, R1 // R4:R3 -> R2:R1
- MOVW R4, R2
-
- // multiply nanoseconds by reciprocal of 10**9 (scaled by 2**61)
- // to get seconds (96 bit scaled result)
- MOVW $0x89705f41, R3 // 2**61 * 10**-9
- MULLU R1,R3,(R6,R5) // R7:R6:R5 = R2:R1 * R3
- MOVW $0,R7
- MULALU R2,R3,(R7,R6)
-
- // unscale by discarding low 32 bits, shifting the rest by 29
- MOVW R6>>29,R6 // R7:R6 = (R7:R6:R5 >> 61)
- ORR R7<<3,R6
- MOVW R7>>29,R7
-
- // subtract (10**9 * sec) from nsec to get nanosecond remainder
- MOVW $1000000000, R5 // 10**9
- MULLU R6,R5,(R9,R8) // R9:R8 = R7:R6 * R5
- MULA R7,R5,R9,R9
- SUB.S R8,R1 // R2:R1 -= R9:R8
- SBC R9,R2
-
- // because reciprocal was a truncated repeating fraction, quotient
- // may be slightly too small -- adjust to make remainder < 10**9
- CMP R5,R1 // if remainder > 10**9
- SUB.HS R5,R1 // remainder -= 10**9
- ADD.HS $1,R6 // sec += 1
-
- MOVW R6,sec_lo+0(FP)
- MOVW R7,sec_hi+4(FP)
- MOVW R1,nsec+8(FP)
- RET
-useQPC:
- B runtime·nowQPC(SB) // tail call
-
// save_g saves the g register (R10) into thread local memory
// so that we can call externally compiled
// ARM code that will overwrite those registers.
diff --git a/src/runtime/sys_windows_arm64.s b/src/runtime/sys_windows_arm64.s
index 53960488f9..7a2e11f5ae 100644
--- a/src/runtime/sys_windows_arm64.s
+++ b/src/runtime/sys_windows_arm64.s
@@ -6,6 +6,7 @@
#include "go_tls.h"
#include "textflag.h"
#include "funcdata.h"
+#include "time_windows.h"
// Offsets into Thread Environment Block (pointer in R18)
#define TEB_error 0x68
@@ -17,7 +18,7 @@
// load_g and save_g (in tls_arm64.s) clobber R27 (REGTMP) and R0.
// void runtime·asmstdcall(void *c);
-TEXT runtime·asmstdcall(SB),NOSPLIT|NOFRAME,$0
+TEXT runtime·asmstdcall<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
STP.W (R29, R30), -32(RSP) // allocate C ABI stack frame
STP (R19, R20), 16(RSP) // save old R19, R20
MOVD R0, R19 // save libcall pointer
@@ -116,7 +117,9 @@ TEXT runtime·badsignal2(SB),NOSPLIT,$16-0
MOVD runtime·_WriteFile(SB), R12
SUB $16, RSP // skip over saved frame pointer below RSP
BL (R12)
- ADD $16, RSP
+
+ // Does not return.
+ B runtime·abort(SB)
RET
@@ -287,89 +290,21 @@ TEXT sigresume<>(SB),NOSPLIT|NOFRAME,$0
MOVD R0, RSP
B (R1)
-TEXT runtime·exceptiontramp(SB),NOSPLIT|NOFRAME,$0
- MOVD $runtime·exceptionhandler(SB), R1
+TEXT runtime·exceptiontramp<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
+ MOVD $runtime·exceptionhandler<ABIInternal>(SB), R1
B sigtramp<>(SB)
-TEXT runtime·firstcontinuetramp(SB),NOSPLIT|NOFRAME,$0
- MOVD $runtime·firstcontinuehandler(SB), R1
+TEXT runtime·firstcontinuetramp<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
+ MOVD $runtime·firstcontinuehandler<ABIInternal>(SB), R1
B sigtramp<>(SB)
TEXT runtime·lastcontinuetramp(SB),NOSPLIT|NOFRAME,$0
- MOVD $runtime·lastcontinuehandler(SB), R1
+ MOVD $runtime·lastcontinuehandler<ABIInternal>(SB), R1
B sigtramp<>(SB)
-TEXT runtime·ctrlhandler(SB),NOSPLIT|NOFRAME,$0
- MOVD $runtime·ctrlhandler1(SB), R1
- B runtime·externalthreadhandler(SB)
-
-TEXT runtime·profileloop(SB),NOSPLIT|NOFRAME,$0
- MOVD $runtime·profileloop1(SB), R1
- B runtime·externalthreadhandler(SB)
-
-// externalthreadhandler called with R0 = uint32 arg, R1 = Go function f.
-// Need to call f(arg), which returns a uint32, and return it in R0.
-TEXT runtime·externalthreadhandler(SB),NOSPLIT|TOPFRAME,$96-0
- NO_LOCAL_POINTERS
-
- // Push C callee-save registers R19-R28. LR, FP already saved.
- SAVE_R19_TO_R28(-10*8)
-
- // Allocate space for args, saved R0+R1, g, and m structures.
- // Hide from nosplit check.
- #define extra ((64+g__size+m__size+15)&~15)
- SUB $extra, RSP, R2 // hide from nosplit overflow check
- MOVD R2, RSP
-
- // Save R0 and R1 (our args).
- MOVD R0, 32(RSP)
- MOVD R1, 40(RSP)
-
- // Zero out m and g structures.
- MOVD $64(RSP), R0
- MOVD R0, 8(RSP)
- MOVD $(m__size + g__size), R0
- MOVD R0, 16(RSP)
- MOVD $0, 0(RSP) // not-saved LR
- BL runtime·memclrNoHeapPointers(SB)
-
- // Initialize m and g structures.
- MOVD $64(RSP), g
- MOVD $g__size(g), R3 // m
- MOVD R3, g_m(g) // g->m = m
- MOVD g, m_g0(R3) // m->g0 = g
- MOVD g, m_curg(R3) // m->curg = g
- MOVD RSP, R0
- MOVD R0, g_stack+stack_hi(g)
- SUB $(32*1024), R0
- MOVD R0, (g_stack+stack_lo)(g)
- MOVD R0, g_stackguard0(g)
- MOVD R0, g_stackguard1(g)
- BL runtime·save_g(SB)
-
- // Call function.
- MOVD 32(RSP), R0
- MOVD 40(RSP), R1
- MOVW R0, 8(RSP)
- BL (R1)
-
- // Clear g.
- MOVD $0, g
- BL runtime·save_g(SB)
-
- // Load return value (save_g would have smashed)
- MOVW (2*8)(RSP), R0
-
- ADD $extra, RSP, R2
- MOVD R2, RSP
- #undef extra
-
- RESTORE_R19_TO_R28(-10*8)
- RET
-
GLOBL runtime·cbctxts(SB), NOPTR, $4
-TEXT runtime·callbackasm1(SB),NOSPLIT,$208-0
+TEXT runtime·callbackasm1<ABIInternal>(SB),NOSPLIT,$208-0
NO_LOCAL_POINTERS
// On entry, the trampoline in zcallback_windows_arm64.s left
@@ -377,14 +312,19 @@ TEXT runtime·callbackasm1(SB),NOSPLIT,$208-0
// Save callback register arguments R0-R7.
// We do this at the top of the frame so they're contiguous with stack arguments.
- MOVD R0, arg0-(8*8)(SP)
- MOVD R1, arg1-(7*8)(SP)
- MOVD R2, arg2-(6*8)(SP)
- MOVD R3, arg3-(5*8)(SP)
- MOVD R4, arg4-(4*8)(SP)
- MOVD R5, arg5-(3*8)(SP)
- MOVD R6, arg6-(2*8)(SP)
- MOVD R7, arg7-(1*8)(SP)
+ // The 7*8 setting up R14 looks like a bug but is not: the eighth word
+ // is the space the assembler reserved for our caller's frame pointer,
+ // but we are not called from Go so that space is ours to use,
+ // and we must be contiguous with the stack arguments.
+ MOVD $arg0-(7*8)(SP), R14
+ MOVD R0, (0*8)(R14)
+ MOVD R1, (1*8)(R14)
+ MOVD R2, (2*8)(R14)
+ MOVD R3, (3*8)(R14)
+ MOVD R4, (4*8)(R14)
+ MOVD R5, (5*8)(R14)
+ MOVD R6, (6*8)(R14)
+ MOVD R7, (7*8)(R14)
// Push C callee-save registers R19-R28.
// LR, FP already saved.
@@ -393,7 +333,7 @@ TEXT runtime·callbackasm1(SB),NOSPLIT,$208-0
// Create a struct callbackArgs on our stack.
MOVD $cbargs-(18*8+callbackArgs__size)(SP), R13
MOVD R12, callbackArgs_index(R13) // callback index
- MOVD $arg0-(8*8)(SP), R0
+ MOVD R14, R0
MOVD R0, callbackArgs_args(R13) // address of args vector
MOVD $0, R0
MOVD R0, callbackArgs_result(R13) // result
@@ -416,7 +356,7 @@ TEXT runtime·callbackasm1(SB),NOSPLIT,$208-0
RET
// uint32 tstart_stdcall(M *newm);
-TEXT runtime·tstart_stdcall(SB),NOSPLIT,$96-0
+TEXT runtime·tstart_stdcall<ABIInternal>(SB),NOSPLIT,$96-0
SAVE_R19_TO_R28(-10*8)
MOVD m_g0(R0), g
@@ -470,14 +410,6 @@ TEXT runtime·switchtothread(SB),NOSPLIT,$16-0
ADD $16, RSP
RET
-// See http://www.dcl.hpi.uni-potsdam.de/research/WRK/2007/08/getting-os-information-the-kuser_shared_data-structure/
-// Must read hi1, then lo, then hi2. The snapshot is valid if hi1 == hi2.
-#define _INTERRUPT_TIME 0x7ffe0008
-#define _SYSTEM_TIME 0x7ffe0014
-#define time_lo 0
-#define time_hi1 4
-#define time_hi2 8
-
TEXT runtime·nanotime1(SB),NOSPLIT|NOFRAME,$0-8
MOVB runtime·useQPCTime(SB), R0
CMP $0, R0
@@ -499,62 +431,6 @@ loop:
useQPC:
B runtime·nanotimeQPC(SB) // tail call
-TEXT time·now(SB),NOSPLIT|NOFRAME,$0-24
- MOVB runtime·useQPCTime(SB), R0
- CMP $0, R0
- BNE useQPC
- MOVD $_INTERRUPT_TIME, R3
-loop:
- MOVWU time_hi1(R3), R1
- MOVWU time_lo(R3), R0
- MOVWU time_hi2(R3), R2
- CMP R1, R2
- BNE loop
-
- // wintime = R1:R0, multiply by 100
- ORR R1<<32, R0
- MOVD $100, R1
- MUL R1, R0
- MOVD R0, mono+16(FP)
-
- MOVD $_SYSTEM_TIME, R3
-wall:
- MOVWU time_hi1(R3), R1
- MOVWU time_lo(R3), R0
- MOVWU time_hi2(R3), R2
- CMP R1, R2
- BNE wall
-
- // w = R1:R0 in 100ns units
- // convert to Unix epoch (but still 100ns units)
- #define delta 116444736000000000
- ORR R1<<32, R0
- SUB $delta, R0
-
- // Convert to nSec
- MOVD $100, R1
- MUL R1, R0
-
- // Code stolen from compiler output for:
- //
- // var x uint64
- // func f() (sec uint64, nsec uint32) { return x / 1000000000, uint32(x % 100000000) }
- //
- LSR $1, R0, R1
- MOVD $-8543223759426509416, R2
- UMULH R2, R1, R1
- LSR $28, R1, R1
- MOVD R1, sec+0(FP)
- MOVD $-6067343680855748867, R1
- UMULH R0, R1, R1
- LSR $26, R1, R1
- MOVD $100000000, R2
- MSUB R1, R0, R2, R0
- MOVW R0, nsec+8(FP)
- RET
-useQPC:
- B runtime·nowQPC(SB) // tail call
-
// This is called from rt0_go, which runs on the system stack
// using the initial stack allocated by the OS.
// It calls back into standard C using the BL below.
@@ -573,7 +449,7 @@ TEXT runtime·wintls(SB),NOSPLIT,$0
ok:
// Save offset from R18 into tls_g.
- LSL $3, R1
- ADD $TEB_TlsSlots, R1
- MOVD R1, runtime·tls_g(SB)
+ LSL $3, R0
+ ADD $TEB_TlsSlots, R0
+ MOVD R0, runtime·tls_g(SB)
RET
diff --git a/src/runtime/sys_x86.go b/src/runtime/sys_x86.go
index 8f21585d28..0866e3140e 100644
--- a/src/runtime/sys_x86.go
+++ b/src/runtime/sys_x86.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build amd64 || 386
// +build amd64 386
package runtime
diff --git a/src/runtime/syscall_windows.go b/src/runtime/syscall_windows.go
index 7cf9318bdb..6b9195bcd5 100644
--- a/src/runtime/syscall_windows.go
+++ b/src/runtime/syscall_windows.go
@@ -22,32 +22,180 @@ var cbs struct {
type winCallback struct {
fn *funcval // Go function
retPop uintptr // For 386 cdecl, how many bytes to pop on return
-
- // abiMap specifies how to translate from a C frame to a Go
- // frame. This does not specify how to translate back because
- // the result is always a uintptr. If the C ABI is fastcall,
- // this assumes the four fastcall registers were first spilled
- // to the shadow space.
- abiMap []abiPart
- // retOffset is the offset of the uintptr-sized result in the Go
- // frame.
- retOffset uintptr
+ abiMap abiDesc
}
+// abiPartKind is the action an abiPart should take.
+type abiPartKind int
+
+const (
+ abiPartBad abiPartKind = iota
+ abiPartStack // Move a value from memory to the stack.
+ abiPartReg // Move a value from memory to a register.
+)
+
// abiPart encodes a step in translating between calling ABIs.
type abiPart struct {
- src, dst uintptr
- len uintptr
+ kind abiPartKind
+ srcStackOffset uintptr
+ dstStackOffset uintptr // used if kind == abiPartStack
+ dstRegister int // used if kind == abiPartReg
+ len uintptr
}
func (a *abiPart) tryMerge(b abiPart) bool {
- if a.src+a.len == b.src && a.dst+a.len == b.dst {
+ if a.kind != abiPartStack || b.kind != abiPartStack {
+ return false
+ }
+ if a.srcStackOffset+a.len == b.srcStackOffset && a.dstStackOffset+a.len == b.dstStackOffset {
a.len += b.len
return true
}
return false
}
+// abiDesc specifies how to translate from a C frame to a Go
+// frame. This does not specify how to translate back because
+// the result is always a uintptr. If the C ABI is fastcall,
+// this assumes the four fastcall registers were first spilled
+// to the shadow space.
+type abiDesc struct {
+ parts []abiPart
+
+ srcStackSize uintptr // stdcall/fastcall stack space tracking
+ dstStackSize uintptr // Go stack space used
+ dstRegisters int // Go ABI int argument registers used
+
+ // retOffset is the offset of the uintptr-sized result in the Go
+ // frame.
+ retOffset uintptr
+}
+
+func (p *abiDesc) assignArg(t *_type) {
+ if t.size > sys.PtrSize {
+ // We don't support this right now. In
+ // stdcall/cdecl, 64-bit ints and doubles are
+ // passed as two words (little endian); and
+ // structs are pushed on the stack. In
+ // fastcall, arguments larger than the word
+ // size are passed by reference. On arm,
+ // 8-byte aligned arguments round up to the
+ // next even register and can be split across
+ // registers and the stack.
+ panic("compileCallback: argument size is larger than uintptr")
+ }
+ if k := t.kind & kindMask; GOARCH != "386" && (k == kindFloat32 || k == kindFloat64) {
+ // In fastcall, floating-point arguments in
+ // the first four positions are passed in
+ // floating-point registers, which we don't
+ // currently spill. arm passes floating-point
+ // arguments in VFP registers, which we also
+ // don't support.
+ // So basically we only support 386.
+ panic("compileCallback: float arguments not supported")
+ }
+
+ if t.size == 0 {
+ // The Go ABI aligns for zero-sized types.
+ p.dstStackSize = alignUp(p.dstStackSize, uintptr(t.align))
+ return
+ }
+
+ // In the C ABI, we're already on a word boundary.
+ // Also, sub-word-sized fastcall register arguments
+ // are stored to the least-significant bytes of the
+ // argument word and all supported Windows
+ // architectures are little endian, so srcStackOffset
+ // is already pointing to the right place for smaller
+ // arguments. The same is true on arm.
+
+ oldParts := p.parts
+ if !p.tryRegAssignArg(t, 0) {
+ // Register assignment failed.
+ // Undo the work and stack assign.
+ p.parts = oldParts
+
+ // The Go ABI aligns arguments.
+ p.dstStackSize = alignUp(p.dstStackSize, uintptr(t.align))
+
+ // Copy just the size of the argument. Note that this
+ // could be a small by-value struct, but C and Go
+ // struct layouts are compatible, so we can copy these
+ // directly, too.
+ part := abiPart{
+ kind: abiPartStack,
+ srcStackOffset: p.srcStackSize,
+ dstStackOffset: p.dstStackSize,
+ len: t.size,
+ }
+ // Add this step to the adapter.
+ if len(p.parts) == 0 || !p.parts[len(p.parts)-1].tryMerge(part) {
+ p.parts = append(p.parts, part)
+ }
+ // The Go ABI packs arguments.
+ p.dstStackSize += t.size
+ }
+
+ // cdecl, stdcall, fastcall, and arm pad arguments to word size.
+ // TODO(rsc): On arm and arm64 do we need to skip the caller's saved LR?
+ p.srcStackSize += sys.PtrSize
+}
+
+// tryRegAssignArg tries to register-assign a value of type t.
+// If this type is nested in an aggregate type, then offset is the
+// offset of this type within its parent type.
+// Assumes t.size <= sys.PtrSize and t.size != 0.
+//
+// Returns whether the assignment succeeded.
+func (p *abiDesc) tryRegAssignArg(t *_type, offset uintptr) bool {
+ switch k := t.kind & kindMask; k {
+ case kindBool, kindInt, kindInt8, kindInt16, kindInt32, kindUint, kindUint8, kindUint16, kindUint32, kindUintptr, kindPtr, kindUnsafePointer:
+ // Assign a register for all these types.
+ return p.assignReg(t.size, offset)
+ case kindInt64, kindUint64:
+ // Only register-assign if the registers are big enough.
+ if sys.PtrSize == 8 {
+ return p.assignReg(t.size, offset)
+ }
+ case kindArray:
+ at := (*arraytype)(unsafe.Pointer(t))
+ if at.len == 1 {
+ return p.tryRegAssignArg(at.elem, offset)
+ }
+ case kindStruct:
+ st := (*structtype)(unsafe.Pointer(t))
+ for i := range st.fields {
+ f := &st.fields[i]
+ if !p.tryRegAssignArg(f.typ, offset+f.offset()) {
+ return false
+ }
+ }
+ return true
+ }
+ // Pointer-sized types such as maps and channels are currently
+ // not supported.
+ panic("compileCallabck: type " + t.string() + " is currently not supported for use in system callbacks")
+}
+
+// assignReg attempts to assign a single register for an
+// argument with the given size, at the given offset into the
+// value in the C ABI space.
+//
+// Returns whether the assignment was successful.
+func (p *abiDesc) assignReg(size, offset uintptr) bool {
+ if p.dstRegisters >= intArgRegs {
+ return false
+ }
+ p.parts = append(p.parts, abiPart{
+ kind: abiPartReg,
+ srcStackOffset: p.srcStackSize + offset,
+ dstRegister: p.dstRegisters,
+ len: size,
+ })
+ p.dstRegisters++
+ return true
+}
+
type winCallbackKey struct {
fn *funcval
cdecl bool
@@ -101,62 +249,14 @@ func compileCallback(fn eface, cdecl bool) (code uintptr) {
ft := (*functype)(unsafe.Pointer(fn._type))
// Check arguments and construct ABI translation.
- var abiMap []abiPart
- var src, dst uintptr
+ var abiMap abiDesc
for _, t := range ft.in() {
- if t.size > sys.PtrSize {
- // We don't support this right now. In
- // stdcall/cdecl, 64-bit ints and doubles are
- // passed as two words (little endian); and
- // structs are pushed on the stack. In
- // fastcall, arguments larger than the word
- // size are passed by reference. On arm,
- // 8-byte aligned arguments round up to the
- // next even register and can be split across
- // registers and the stack.
- panic("compileCallback: argument size is larger than uintptr")
- }
- if k := t.kind & kindMask; GOARCH != "386" && (k == kindFloat32 || k == kindFloat64) {
- // In fastcall, floating-point arguments in
- // the first four positions are passed in
- // floating-point registers, which we don't
- // currently spill. arm passes floating-point
- // arguments in VFP registers, which we also
- // don't support.
- // So basically we only support 386.
- panic("compileCallback: float arguments not supported")
- }
-
- // The Go ABI aligns arguments.
- dst = alignUp(dst, uintptr(t.align))
- // In the C ABI, we're already on a word boundary.
- // Also, sub-word-sized fastcall register arguments
- // are stored to the least-significant bytes of the
- // argument word and all supported Windows
- // architectures are little endian, so src is already
- // pointing to the right place for smaller arguments.
- // The same is true on arm.
-
- // Copy just the size of the argument. Note that this
- // could be a small by-value struct, but C and Go
- // struct layouts are compatible, so we can copy these
- // directly, too.
- part := abiPart{src, dst, t.size}
- // Add this step to the adapter.
- if len(abiMap) == 0 || !abiMap[len(abiMap)-1].tryMerge(part) {
- abiMap = append(abiMap, part)
- }
-
- // cdecl, stdcall, fastcall, and arm pad arguments to word size.
- // TODO(rsc): On arm and arm64 do we need to skip the caller's saved LR?
- src += sys.PtrSize
- // The Go ABI packs arguments.
- dst += t.size
+ abiMap.assignArg(t)
}
// The Go ABI aligns the result to the word size. src is
// already aligned.
- dst = alignUp(dst, sys.PtrSize)
- retOffset := dst
+ abiMap.dstStackSize = alignUp(abiMap.dstStackSize, sys.PtrSize)
+ abiMap.retOffset = abiMap.dstStackSize
if len(ft.out()) != 1 {
panic("compileCallback: expected function with one uintptr-sized result")
@@ -170,10 +270,14 @@ func compileCallback(fn eface, cdecl bool) (code uintptr) {
// Either way, it's not AX.
panic("compileCallback: float results not supported")
}
- // Make room for the uintptr-sized result.
- dst += sys.PtrSize
+ if intArgRegs == 0 {
+ // Make room for the uintptr-sized result.
+ // If there are argument registers, the return value will
+ // be passed in the first register.
+ abiMap.dstStackSize += sys.PtrSize
+ }
- if dst > callbackMaxFrame {
+ if abiMap.dstStackSize > callbackMaxFrame {
panic("compileCallback: function argument frame too large")
}
@@ -181,7 +285,7 @@ func compileCallback(fn eface, cdecl bool) (code uintptr) {
// arguments from the C stack.
var retPop uintptr
if cdecl {
- retPop = src
+ retPop = abiMap.srcStackSize
}
key := winCallbackKey{(*funcval)(fn.data), cdecl}
@@ -203,7 +307,7 @@ func compileCallback(fn eface, cdecl bool) (code uintptr) {
unlock(&cbs.lock)
throw("too many callback functions")
}
- c := winCallback{key.fn, retPop, abiMap, retOffset}
+ c := winCallback{key.fn, retPop, abiMap}
cbs.ctxt[n] = c
cbs.index[key] = n
cbs.n++
@@ -237,22 +341,39 @@ func callbackWrap(a *callbackArgs) {
a.retPop = c.retPop
// Convert from C to Go ABI.
+ var regs abi.RegArgs
var frame [callbackMaxFrame]byte
goArgs := unsafe.Pointer(&frame)
- for _, part := range c.abiMap {
- memmove(add(goArgs, part.dst), add(a.args, part.src), part.len)
+ for _, part := range c.abiMap.parts {
+ switch part.kind {
+ case abiPartStack:
+ memmove(add(goArgs, part.dstStackOffset), add(a.args, part.srcStackOffset), part.len)
+ case abiPartReg:
+ goReg := unsafe.Pointer(&regs.Ints[part.dstRegister])
+ memmove(goReg, add(a.args, part.srcStackOffset), part.len)
+ default:
+ panic("bad ABI description")
+ }
}
// Even though this is copying back results, we can pass a nil
// type because those results must not require write barriers.
- //
- // Pass a dummy RegArgs for now.
- // TODO(mknyszek): Pass arguments in registers.
- var regs abi.RegArgs
- reflectcall(nil, unsafe.Pointer(c.fn), noescape(goArgs), uint32(c.retOffset)+sys.PtrSize, uint32(c.retOffset), uint32(c.retOffset)+sys.PtrSize, &regs)
+ reflectcall(nil, unsafe.Pointer(c.fn), noescape(goArgs), uint32(c.abiMap.dstStackSize), uint32(c.abiMap.retOffset), uint32(c.abiMap.dstStackSize), &regs)
// Extract the result.
- a.result = *(*uintptr)(unsafe.Pointer(&frame[c.retOffset]))
+ //
+ // There's always exactly one return value, one pointer in size.
+ // If it's on the stack, then we will have reserved space for it
+ // at the end of the frame, otherwise it was passed in a register.
+ if c.abiMap.dstStackSize != c.abiMap.retOffset {
+ a.result = *(*uintptr)(unsafe.Pointer(&frame[c.abiMap.retOffset]))
+ } else {
+ var zero int
+ // On architectures with no registers, Ints[0] would be a compile error,
+ // so we use a dynamic index. These architectures will never take this
+ // branch, so this won't cause a runtime panic.
+ a.result = regs.Ints[zero]
+ }
}
const _LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800
@@ -263,6 +384,7 @@ const _LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800
// to the full path inside of system32 for use with vanilla LoadLibrary.
//go:linkname syscall_loadsystemlibrary syscall.loadsystemlibrary
//go:nosplit
+//go:cgo_unsafe_args
func syscall_loadsystemlibrary(filename *uint16, absoluteFilepath *uint16) (handle, err uintptr) {
lockOSThread()
c := &getg().m.syscall
@@ -293,6 +415,7 @@ func syscall_loadsystemlibrary(filename *uint16, absoluteFilepath *uint16) (hand
//go:linkname syscall_loadlibrary syscall.loadlibrary
//go:nosplit
+//go:cgo_unsafe_args
func syscall_loadlibrary(filename *uint16) (handle, err uintptr) {
lockOSThread()
defer unlockOSThread()
@@ -310,6 +433,7 @@ func syscall_loadlibrary(filename *uint16) (handle, err uintptr) {
//go:linkname syscall_getprocaddress syscall.getprocaddress
//go:nosplit
+//go:cgo_unsafe_args
func syscall_getprocaddress(handle uintptr, procname *byte) (outhandle, err uintptr) {
lockOSThread()
defer unlockOSThread()
@@ -327,6 +451,7 @@ func syscall_getprocaddress(handle uintptr, procname *byte) (outhandle, err uint
//go:linkname syscall_Syscall syscall.Syscall
//go:nosplit
+//go:cgo_unsafe_args
func syscall_Syscall(fn, nargs, a1, a2, a3 uintptr) (r1, r2, err uintptr) {
lockOSThread()
defer unlockOSThread()
@@ -340,6 +465,7 @@ func syscall_Syscall(fn, nargs, a1, a2, a3 uintptr) (r1, r2, err uintptr) {
//go:linkname syscall_Syscall6 syscall.Syscall6
//go:nosplit
+//go:cgo_unsafe_args
func syscall_Syscall6(fn, nargs, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr) {
lockOSThread()
defer unlockOSThread()
@@ -353,52 +479,56 @@ func syscall_Syscall6(fn, nargs, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err ui
//go:linkname syscall_Syscall9 syscall.Syscall9
//go:nosplit
+//go:cgo_unsafe_args
func syscall_Syscall9(fn, nargs, a1, a2, a3, a4, a5, a6, a7, a8, a9 uintptr) (r1, r2, err uintptr) {
lockOSThread()
- defer unlockOSThread()
c := &getg().m.syscall
c.fn = fn
c.n = nargs
c.args = uintptr(noescape(unsafe.Pointer(&a1)))
cgocall(asmstdcallAddr, unsafe.Pointer(c))
+ unlockOSThread()
return c.r1, c.r2, c.err
}
//go:linkname syscall_Syscall12 syscall.Syscall12
//go:nosplit
+//go:cgo_unsafe_args
func syscall_Syscall12(fn, nargs, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12 uintptr) (r1, r2, err uintptr) {
lockOSThread()
- defer unlockOSThread()
c := &getg().m.syscall
c.fn = fn
c.n = nargs
c.args = uintptr(noescape(unsafe.Pointer(&a1)))
cgocall(asmstdcallAddr, unsafe.Pointer(c))
+ unlockOSThread()
return c.r1, c.r2, c.err
}
//go:linkname syscall_Syscall15 syscall.Syscall15
//go:nosplit
+//go:cgo_unsafe_args
func syscall_Syscall15(fn, nargs, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15 uintptr) (r1, r2, err uintptr) {
lockOSThread()
- defer unlockOSThread()
c := &getg().m.syscall
c.fn = fn
c.n = nargs
c.args = uintptr(noescape(unsafe.Pointer(&a1)))
cgocall(asmstdcallAddr, unsafe.Pointer(c))
+ unlockOSThread()
return c.r1, c.r2, c.err
}
//go:linkname syscall_Syscall18 syscall.Syscall18
//go:nosplit
+//go:cgo_unsafe_args
func syscall_Syscall18(fn, nargs, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18 uintptr) (r1, r2, err uintptr) {
lockOSThread()
- defer unlockOSThread()
c := &getg().m.syscall
c.fn = fn
c.n = nargs
c.args = uintptr(noescape(unsafe.Pointer(&a1)))
cgocall(asmstdcallAddr, unsafe.Pointer(c))
+ unlockOSThread()
return c.r1, c.r2, c.err
}
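
The new abiDesc machinery above is easiest to follow end to end outside the runtime. Below is a minimal, runnable Go sketch of the same register-first, stack-fallback assignment with adjacent-copy merging. It is an illustration only: part and desc stand in for the runtime's abiPart and abiDesc, it works on raw argument sizes instead of *_type values, and intArgRegs = 9 is an assumed value (the amd64 register ABI count), not something this CL pins down.

package main

import "fmt"

const ptrSize = 8
const intArgRegs = 9 // hypothetical: amd64 register-ABI integer register count

type partKind int

const (
    partStack partKind = iota // copy into the Go stack frame
    partReg                   // copy into an integer register slot
)

type part struct {
    kind           partKind
    srcOff, dstOff uintptr // C-stack and Go-stack offsets
    reg            int     // used if kind == partReg
    len            uintptr
}

type desc struct {
    parts        []part
    srcSize      uintptr // C stack space consumed (word-padded)
    dstStackSize uintptr // Go stack space consumed (packed)
    dstRegs      int     // Go integer argument registers consumed
}

func (d *desc) assign(size uintptr) {
    if d.dstRegs < intArgRegs {
        d.parts = append(d.parts, part{partReg, d.srcSize, 0, d.dstRegs, size})
        d.dstRegs++
    } else {
        p := part{partStack, d.srcSize, d.dstStackSize, 0, size}
        merged := false
        if n := len(d.parts); n > 0 {
            last := &d.parts[n-1]
            if last.kind == partStack && last.srcOff+last.len == p.srcOff && last.dstOff+last.len == p.dstOff {
                last.len += p.len // adjacent stack copies coalesce, as tryMerge does
                merged = true
            }
        }
        if !merged {
            d.parts = append(d.parts, p)
        }
        d.dstStackSize += size // the Go ABI packs stack arguments
    }
    d.srcSize += ptrSize // cdecl/stdcall/fastcall pad every argument to a word
}

func main() {
    var d desc
    for i := 0; i < 12; i++ { // twelve word-sized arguments
        d.assign(ptrSize)
    }
    // Nine register moves plus one merged 24-byte stack copy:
    fmt.Printf("regs=%d stackBytes=%d parts=%d\n", d.dstRegs, d.dstStackSize, len(d.parts))
}

With merging, the three stack-bound arguments collapse into a single memmove at callback time, which is exactly why tryMerge exists.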
diff --git a/src/runtime/syscall_windows_test.go b/src/runtime/syscall_windows_test.go
index fb215b3c31..5e9694d444 100644
--- a/src/runtime/syscall_windows_test.go
+++ b/src/runtime/syscall_windows_test.go
@@ -7,6 +7,7 @@ package runtime_test
import (
"bytes"
"fmt"
+ "internal/abi"
"internal/syscall/windows/sysdll"
"internal/testenv"
"io"
@@ -390,6 +391,103 @@ var cbFuncs = []cbFunc{
}},
}
+//go:registerparams
+func sum2(i1, i2 uintptr) uintptr {
+ return i1 + i2
+}
+
+//go:registerparams
+func sum3(i1, i2, i3 uintptr) uintptr {
+ return i1 + i2 + i3
+}
+
+//go:registerparams
+func sum4(i1, i2, i3, i4 uintptr) uintptr {
+ return i1 + i2 + i3 + i4
+}
+
+//go:registerparams
+func sum5(i1, i2, i3, i4, i5 uintptr) uintptr {
+ return i1 + i2 + i3 + i4 + i5
+}
+
+//go:registerparams
+func sum6(i1, i2, i3, i4, i5, i6 uintptr) uintptr {
+ return i1 + i2 + i3 + i4 + i5 + i6
+}
+
+//go:registerparams
+func sum7(i1, i2, i3, i4, i5, i6, i7 uintptr) uintptr {
+ return i1 + i2 + i3 + i4 + i5 + i6 + i7
+}
+
+//go:registerparams
+func sum8(i1, i2, i3, i4, i5, i6, i7, i8 uintptr) uintptr {
+ return i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8
+}
+
+//go:registerparams
+func sum9(i1, i2, i3, i4, i5, i6, i7, i8, i9 uintptr) uintptr {
+ return i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9
+}
+
+//go:registerparams
+func sum10(i1, i2, i3, i4, i5, i6, i7, i8, i9, i10 uintptr) uintptr {
+ return i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9 + i10
+}
+
+//go:registerparams
+func sum9uint8(i1, i2, i3, i4, i5, i6, i7, i8, i9 uint8) uintptr {
+ return uintptr(i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9)
+}
+
+//go:registerparams
+func sum9uint16(i1, i2, i3, i4, i5, i6, i7, i8, i9 uint16) uintptr {
+ return uintptr(i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9)
+}
+
+//go:registerparams
+func sum9int8(i1, i2, i3, i4, i5, i6, i7, i8, i9 int8) uintptr {
+ return uintptr(i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9)
+}
+
+//go:registerparams
+func sum5mix(i1 int8, i2 int16, i3 int32, i4, i5 uintptr) uintptr {
+ return uintptr(i1) + uintptr(i2) + uintptr(i3) + i4 + i5
+}
+
+//go:registerparams
+func sum5andPair(i1, i2, i3, i4, i5 uint8Pair) uintptr {
+ return uintptr(i1.x + i1.y + i2.x + i2.y + i3.x + i3.y + i4.x + i4.y + i5.x + i5.y)
+}
+
+// TODO(register args): Remove this once we switch to using the register
+// calling convention by default, since this is redundant with the existing
+// tests.
+var cbFuncsRegABI = []cbFunc{
+ {sum2},
+ {sum3},
+ {sum4},
+ {sum5},
+ {sum6},
+ {sum7},
+ {sum8},
+ {sum9},
+ {sum10},
+ {sum9uint8},
+ {sum9uint16},
+ {sum9int8},
+ {sum5mix},
+ {sum5andPair},
+}
+
+func getCallbackTestFuncs() []cbFunc {
+ if regs := runtime.SetIntArgRegs(-1); regs > 0 {
+ return cbFuncsRegABI
+ }
+ return cbFuncs
+}
+
type cbDLL struct {
name string
buildArgs func(out, src string) []string
@@ -406,7 +504,7 @@ func (d *cbDLL) makeSrc(t *testing.T, path string) {
#include <stdint.h>
typedef struct { uint8_t x, y; } uint8Pair_t;
`)
- for _, cbf := range cbFuncs {
+ for _, cbf := range getCallbackTestFuncs() {
cbf.cSrc(f, false)
cbf.cSrc(f, true)
}
@@ -445,18 +543,17 @@ func TestStdcallAndCDeclCallbacks(t *testing.T) {
if _, err := exec.LookPath("gcc"); err != nil {
t.Skip("skipping test: gcc is missing")
}
- tmp, err := os.MkdirTemp("", "TestCDeclCallback")
- if err != nil {
- t.Fatal("TempDir failed: ", err)
- }
- defer os.RemoveAll(tmp)
+ tmp := t.TempDir()
+
+ oldRegs := runtime.SetIntArgRegs(abi.IntArgRegs)
+ defer runtime.SetIntArgRegs(oldRegs)
for _, dll := range cbDLLs {
t.Run(dll.name, func(t *testing.T) {
dllPath := dll.build(t, tmp)
dll := syscall.MustLoadDLL(dllPath)
defer dll.Release()
- for _, cbf := range cbFuncs {
+ for _, cbf := range getCallbackTestFuncs() {
t.Run(cbf.cName(false), func(t *testing.T) {
stdcall := syscall.NewCallback(cbf.goFunc)
cbf.testOne(t, dll, false, stdcall)
@@ -601,14 +698,10 @@ uintptr_t cfunc(callback f, uintptr_t n) {
return r;
}
`
- tmpdir, err := os.MkdirTemp("", "TestReturnAfterStackGrowInCallback")
- if err != nil {
- t.Fatal("TempDir failed: ", err)
- }
- defer os.RemoveAll(tmpdir)
+ tmpdir := t.TempDir()
srcname := "mydll.c"
- err = os.WriteFile(filepath.Join(tmpdir, srcname), []byte(src), 0)
+ err := os.WriteFile(filepath.Join(tmpdir, srcname), []byte(src), 0)
if err != nil {
t.Fatal(err)
}
@@ -651,6 +744,51 @@ uintptr_t cfunc(callback f, uintptr_t n) {
}
}
+func TestSyscall18(t *testing.T) {
+ if _, err := exec.LookPath("gcc"); err != nil {
+ t.Skip("skipping test: gcc is missing")
+ }
+ if runtime.GOARCH != "amd64" {
+ t.Skipf("skipping test: GOARCH=%s", runtime.GOARCH)
+ }
+
+ const src = `
+#include <stdint.h>
+#include <windows.h>
+
+int cfunc( int a1, int a2, int a3, int a4, int a5, int a6, int a7, int a8, int a9,
+ int a10, int a11, int a12, int a13, int a14, int a15, int a16, int a17, int a18) {
+ return 1;
+}
+`
+ tmpdir := t.TempDir()
+
+ srcname := "mydll.c"
+ err := os.WriteFile(filepath.Join(tmpdir, srcname), []byte(src), 0)
+ if err != nil {
+ t.Fatal(err)
+ }
+ outname := "mydll.dll"
+ cmd := exec.Command("gcc", "-shared", "-s", "-Werror", "-o", outname, srcname)
+ cmd.Dir = tmpdir
+ out, err := cmd.CombinedOutput()
+ if err != nil {
+ t.Fatalf("failed to build dll: %v - %v", err, string(out))
+ }
+ dllpath := filepath.Join(tmpdir, outname)
+
+ dll := syscall.MustLoadDLL(dllpath)
+ defer dll.Release()
+
+ proc := dll.MustFindProc("cfunc")
+
+ // proc.Call() will call Syscall18() internally.
+ r, _, err := proc.Call(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)
+ if r != 1 {
+ t.Errorf("got %d want 1 (err=%v)", r, err)
+ }
+}
+
func TestFloatArgs(t *testing.T) {
if _, err := exec.LookPath("gcc"); err != nil {
t.Skip("skipping test: gcc is missing")
@@ -670,14 +808,10 @@ uintptr_t cfunc(uintptr_t a, double b, float c, double d) {
return 0;
}
`
- tmpdir, err := os.MkdirTemp("", "TestFloatArgs")
- if err != nil {
- t.Fatal("TempDir failed: ", err)
- }
- defer os.RemoveAll(tmpdir)
+ tmpdir := t.TempDir()
srcname := "mydll.c"
- err = os.WriteFile(filepath.Join(tmpdir, srcname), []byte(src), 0)
+ err := os.WriteFile(filepath.Join(tmpdir, srcname), []byte(src), 0)
if err != nil {
t.Fatal(err)
}
@@ -732,14 +866,10 @@ double cfuncDouble(uintptr_t a, double b, float c, double d) {
return 0;
}
`
- tmpdir, err := os.MkdirTemp("", "TestFloatReturn")
- if err != nil {
- t.Fatal("TempDir failed: ", err)
- }
- defer os.RemoveAll(tmpdir)
+ tmpdir := t.TempDir()
srcname := "mydll.c"
- err = os.WriteFile(filepath.Join(tmpdir, srcname), []byte(src), 0)
+ err := os.WriteFile(filepath.Join(tmpdir, srcname), []byte(src), 0)
if err != nil {
t.Fatal(err)
}
@@ -947,16 +1077,7 @@ func TestDLLPreloadMitigation(t *testing.T) {
t.Skip("skipping test: gcc is missing")
}
- tmpdir, err := os.MkdirTemp("", "TestDLLPreloadMitigation")
- if err != nil {
- t.Fatal("TempDir failed: ", err)
- }
- defer func() {
- err := os.RemoveAll(tmpdir)
- if err != nil {
- t.Error(err)
- }
- }()
+ tmpdir := t.TempDir()
dir0, err := os.Getwd()
if err != nil {
@@ -1034,11 +1155,7 @@ func TestBigStackCallbackSyscall(t *testing.T) {
t.Fatal("Abs failed: ", err)
}
- tmpdir, err := os.MkdirTemp("", "TestBigStackCallback")
- if err != nil {
- t.Fatal("TempDir failed: ", err)
- }
- defer os.RemoveAll(tmpdir)
+ tmpdir := t.TempDir()
outname := "mydll.dll"
cmd := exec.Command("gcc", "-shared", "-s", "-Werror", "-o", outname, srcname)
@@ -1183,14 +1300,10 @@ func BenchmarkOsYield(b *testing.B) {
}
func BenchmarkRunningGoProgram(b *testing.B) {
- tmpdir, err := os.MkdirTemp("", "BenchmarkRunningGoProgram")
- if err != nil {
- b.Fatal(err)
- }
- defer os.RemoveAll(tmpdir)
+ tmpdir := b.TempDir()
src := filepath.Join(tmpdir, "main.go")
- err = os.WriteFile(src, []byte(benchmarkRunningGoProgram), 0666)
+ err := os.WriteFile(src, []byte(benchmarkRunningGoProgram), 0666)
if err != nil {
b.Fatal(err)
}
diff --git a/src/runtime/testdata/testprog/crashdump.go b/src/runtime/testdata/testprog/crashdump.go
new file mode 100644
index 0000000000..bced397b8a
--- /dev/null
+++ b/src/runtime/testdata/testprog/crashdump.go
@@ -0,0 +1,47 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "fmt"
+ "os"
+ "runtime"
+)
+
+func init() {
+ register("CrashDumpsAllThreads", CrashDumpsAllThreads)
+}
+
+func CrashDumpsAllThreads() {
+ const count = 4
+ runtime.GOMAXPROCS(count + 1)
+
+ chans := make([]chan bool, count)
+ for i := range chans {
+ chans[i] = make(chan bool)
+ go crashDumpsAllThreadsLoop(i, chans[i])
+ }
+
+ // Wait for all the goroutines to start executing.
+ for _, c := range chans {
+ <-c
+ }
+
+ // Tell our parent that all the goroutines are executing.
+ if _, err := os.NewFile(3, "pipe").WriteString("x"); err != nil {
+ fmt.Fprintf(os.Stderr, "write to pipe failed: %v\n", err)
+ os.Exit(2)
+ }
+
+ select {}
+}
+
+func crashDumpsAllThreadsLoop(i int, c chan bool) {
+ close(c)
+ for {
+ for j := 0; j < 0x7fffffff; j++ {
+ }
+ }
+}
diff --git a/src/runtime/testdata/testprogcgo/windows/win.go b/src/runtime/testdata/testprogcgo/windows/win.go
index f2eabb9548..12488aa658 100644
--- a/src/runtime/testdata/testprogcgo/windows/win.go
+++ b/src/runtime/testdata/testprogcgo/windows/win.go
@@ -1,7 +1,7 @@
package windows
/*
-#cgo CFLAGS: -mnop-fun-dllimport
+#cgo amd64 386 CFLAGS: -mnop-fun-dllimport
#include <windows.h>
diff --git a/src/runtime/testdata/testwinlibsignal/dummy.go b/src/runtime/testdata/testwinlibsignal/dummy.go
index 82dfd91c93..e610f15d06 100644
--- a/src/runtime/testdata/testwinlibsignal/dummy.go
+++ b/src/runtime/testdata/testwinlibsignal/dummy.go
@@ -1,7 +1,10 @@
+//go:build windows
// +build windows
package main
+import "C"
+
//export Dummy
func Dummy() int {
return 42
diff --git a/src/runtime/testdata/testwinsignal/main.go b/src/runtime/testdata/testwinsignal/main.go
index d8cd884ffa..1e7c9475fd 100644
--- a/src/runtime/testdata/testwinsignal/main.go
+++ b/src/runtime/testdata/testwinsignal/main.go
@@ -1,19 +1,19 @@
-package main
-
-import (
- "fmt"
- "os"
- "os/signal"
- "time"
-)
-
-func main() {
- c := make(chan os.Signal, 1)
- signal.Notify(c)
-
- fmt.Println("ready")
- sig := <-c
-
- time.Sleep(time.Second)
- fmt.Println(sig)
-}
+package main
+
+import (
+ "fmt"
+ "os"
+ "os/signal"
+ "time"
+)
+
+func main() {
+ c := make(chan os.Signal, 1)
+ signal.Notify(c)
+
+ fmt.Println("ready")
+ sig := <-c
+
+ time.Sleep(time.Second)
+ fmt.Println(sig)
+}
diff --git a/src/runtime/time.go b/src/runtime/time.go
index 8ab2a03430..dee6a674e4 100644
--- a/src/runtime/time.go
+++ b/src/runtime/time.go
@@ -263,6 +263,9 @@ func addtimer(t *timer) {
when := t.when
+ // Disable preemption while using pp to avoid changing another P's heap.
+ mp := acquirem()
+
pp := getg().m.p.ptr()
lock(&pp.timersLock)
cleantimers(pp)
@@ -270,6 +273,8 @@ func addtimer(t *timer) {
unlock(&pp.timersLock)
wakeNetPoller(when)
+
+ releasem(mp)
}
// doaddtimer adds t to the current P's heap.
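
The addtimer change above is an instance of a general runtime pattern: acquirem increments m.locks, which disables preemption, so the P read afterwards stays this goroutine's P until releasem. A compilable toy model of that invariant follows; acquirem, releasem, and the m/p types here are stand-ins for the runtime-internal ones, not the real API.

package main

import "fmt"

type p struct {
    id     int
    timers []string
}

type m struct {
    locks int
    p     *p
}

var curm = &m{p: &p{id: 0}}

func acquirem() *m { mp := curm; mp.locks++; return mp } // models: preemption off
func releasem(mp *m) { mp.locks-- }                      // models: preemption back on

func addtimer(t string) {
    mp := acquirem() // while mp.locks > 0, mp.p cannot be taken away
    pp := mp.p
    pp.timers = append(pp.timers, t) // safe: still our own P's timer heap
    releasem(mp)
}

func main() {
    addtimer("t1")
    fmt.Println(curm.p.timers)
}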
diff --git a/src/runtime/time_fake.go b/src/runtime/time_fake.go
index c64d2994a9..c790faba3d 100644
--- a/src/runtime/time_fake.go
+++ b/src/runtime/time_fake.go
@@ -2,20 +2,13 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build faketime
-// +build !windows
+//go:build faketime && !windows
+// +build faketime,!windows
-// Faketime isn't currently supported on Windows. This would require:
-//
-// 1. Shadowing time_now, which is implemented in assembly on Windows.
-// Since that's exported directly to the time package from runtime
-// assembly, this would involve moving it from sys_windows_*.s into
-// its own assembly files build-tagged with !faketime and using the
-// implementation of time_now from timestub.go in faketime mode.
-//
-// 2. Modifying syscall.Write to call syscall.faketimeWrite,
-// translating the Stdout and Stderr handles into FDs 1 and 2.
-// (See CL 192739 PS 3.)
+// Faketime isn't currently supported on Windows. This would require
+// modifying syscall.Write to call syscall.faketimeWrite,
+// translating the Stdout and Stderr handles into FDs 1 and 2.
+// (See CL 192739 PS 3.)
package runtime
@@ -44,8 +37,9 @@ func nanotime() int64 {
return faketime
}
-func walltime() (sec int64, nsec int32) {
- return faketime / 1000000000, int32(faketime % 1000000000)
+//go:linkname time_now time.now
+func time_now() (sec int64, nsec int32, mono int64) {
+ return faketime / 1e9, int32(faketime % 1e9), faketime
}
func write(fd uintptr, p unsafe.Pointer, n int32) int32 {
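
The faketime version of time_now is just an arithmetic split of the fake clock into the three return values; a standalone illustration, with an invented value:

package main

import "fmt"

// timeNow mirrors the faketime time_now above: one fake nanosecond
// counter split into wall-clock seconds, nanoseconds, and the
// monotonic reading (which is the counter itself).
func timeNow(faketime int64) (sec int64, nsec int32, mono int64) {
    return faketime / 1e9, int32(faketime % 1e9), faketime
}

func main() {
    fmt.Println(timeNow(1_500_000_123)) // 1 500000123 1500000123
}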
diff --git a/src/runtime/time_linux_amd64.s b/src/runtime/time_linux_amd64.s
new file mode 100644
index 0000000000..0dd7919896
--- /dev/null
+++ b/src/runtime/time_linux_amd64.s
@@ -0,0 +1,97 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !faketime
+// +build !faketime
+
+#include "go_asm.h"
+#include "go_tls.h"
+#include "textflag.h"
+
+#define SYS_clock_gettime 228
+
+// func time.now() (sec int64, nsec int32, mono int64)
+TEXT time·now(SB),NOSPLIT,$16-24
+ MOVQ SP, R12 // Save old SP; R12 unchanged by C code.
+
+#ifdef GOEXPERIMENT_regabig
+ MOVQ g_m(R14), BX // BX unchanged by C code.
+#else
+ get_tls(CX)
+ MOVQ g(CX), AX
+ MOVQ g_m(AX), BX // BX unchanged by C code.
+#endif
+
+ // Store CLOCK_REALTIME results directly to return space.
+ LEAQ sec+0(FP), SI
+
+ // Set vdsoPC and vdsoSP for SIGPROF traceback.
+ // Save the old values on stack and restore them on exit,
+ // so this function is reentrant.
+ MOVQ m_vdsoPC(BX), CX
+ MOVQ m_vdsoSP(BX), DX
+ MOVQ CX, 0(SP)
+ MOVQ DX, 8(SP)
+
+ MOVQ -8(SI), CX // Sets CX to function return address.
+ MOVQ CX, m_vdsoPC(BX)
+ MOVQ SI, m_vdsoSP(BX)
+
+#ifdef GOEXPERIMENT_regabig
+ CMPQ R14, m_curg(BX) // Only switch if on curg.
+#else
+ CMPQ AX, m_curg(BX) // Only switch if on curg.
+#endif
+ JNE noswitch
+
+ MOVQ m_g0(BX), DX
+ MOVQ (g_sched+gobuf_sp)(DX), SP // Set SP to g0 stack
+
+noswitch:
+ SUBQ $16, SP // Space for monotonic time results
+ ANDQ $~15, SP // Align for C code
+
+ MOVL $0, DI // CLOCK_REALTIME
+ MOVQ runtime·vdsoClockgettimeSym(SB), AX
+ CMPQ AX, $0
+ JEQ fallback
+ CALL AX
+
+ MOVL $1, DI // CLOCK_MONOTONIC
+ LEAQ 0(SP), SI
+ MOVQ runtime·vdsoClockgettimeSym(SB), AX
+ CALL AX
+
+ret:
+ MOVQ 0(SP), AX // sec
+ MOVQ 8(SP), DX // nsec
+
+ MOVQ R12, SP // Restore real SP
+ // Restore vdsoPC, vdsoSP
+ // We don't worry about being signaled between the two stores.
+ // If we are not in a signal handler, we'll restore vdsoSP to 0,
+ // and no one will care about vdsoPC. If we are in a signal handler,
+ // we cannot receive another signal.
+ MOVQ 8(SP), CX
+ MOVQ CX, m_vdsoSP(BX)
+ MOVQ 0(SP), CX
+ MOVQ CX, m_vdsoPC(BX)
+
+ // sec is in AX, nsec in DX
+ // mono = sec*1e9 + nsec, computed in AX
+ IMULQ $1000000000, AX
+ ADDQ DX, AX
+ MOVQ AX, mono+16(FP)
+ RET
+
+fallback:
+ MOVQ $SYS_clock_gettime, AX
+ SYSCALL
+
+ MOVL $1, DI // CLOCK_MONOTONIC
+ LEAQ 0(SP), SI
+ MOVQ $SYS_clock_gettime, AX
+ SYSCALL
+
+ JMP ret
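
At the Go level, the assembly above amounts to two clock_gettime calls, one CLOCK_REALTIME read for the wall clock and one CLOCK_MONOTONIC read collapsed into a single nanosecond count. A sketch using the golang.org/x/sys/unix wrapper (an assumption for illustration; the runtime itself goes through the vDSO, and this sketch ignores errors):

package main

import (
    "fmt"

    "golang.org/x/sys/unix"
)

func now() (sec int64, nsec int32, mono int64) {
    var wall, mon unix.Timespec
    unix.ClockGettime(unix.CLOCK_REALTIME, &wall)  // errors ignored in this sketch
    unix.ClockGettime(unix.CLOCK_MONOTONIC, &mon)
    return wall.Sec, int32(wall.Nsec), mon.Sec*1e9 + mon.Nsec
}

func main() {
    fmt.Println(now())
}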
diff --git a/src/runtime/time_nofake.go b/src/runtime/time_nofake.go
index 1912a94e87..5a4ceaf43d 100644
--- a/src/runtime/time_nofake.go
+++ b/src/runtime/time_nofake.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !faketime
// +build !faketime
package runtime
@@ -19,10 +20,6 @@ func nanotime() int64 {
return nanotime1()
}
-func walltime() (sec int64, nsec int32) {
- return walltime1()
-}
-
// write must be nosplit on Windows (see write1)
//
//go:nosplit
diff --git a/src/runtime/time_windows.h b/src/runtime/time_windows.h
new file mode 100644
index 0000000000..cd16fd163b
--- /dev/null
+++ b/src/runtime/time_windows.h
@@ -0,0 +1,16 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Constants for fetching time values on Windows for use in asm code.
+
+// See https://wrkhpi.wordpress.com/2007/08/09/getting-os-information-the-kuser_shared_data-structure/
+// Archived copy at:
+// http://web.archive.org/web/20210411000829/https://wrkhpi.wordpress.com/2007/08/09/getting-os-information-the-kuser_shared_data-structure/
+
+// Must read hi1, then lo, then hi2. The snapshot is valid if hi1 == hi2.
+#define _INTERRUPT_TIME 0x7ffe0008
+#define _SYSTEM_TIME 0x7ffe0014
+#define time_lo 0
+#define time_hi1 4
+#define time_hi2 8
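
A runnable sketch of the hi1/lo/hi2 snapshot protocol the header describes, with in-process variables standing in for the three mapped KUSER_SHARED_DATA words. The writer publishes hi2, then lo, then hi1; a reader's snapshot is consistent exactly when it observes hi1 == hi2.

package main

import (
    "fmt"
    "sync/atomic"
)

var timeLo, timeHi1, timeHi2 uint32 // stand-ins for the three mapped words

func writeTime(t uint64) {
    atomic.StoreUint32(&timeHi2, uint32(t>>32)) // writer: hi2, lo, hi1
    atomic.StoreUint32(&timeLo, uint32(t))
    atomic.StoreUint32(&timeHi1, uint32(t>>32))
}

func readTime() uint64 {
    for {
        hi1 := atomic.LoadUint32(&timeHi1) // reader: hi1, lo, hi2
        lo := atomic.LoadUint32(&timeLo)
        hi2 := atomic.LoadUint32(&timeHi2)
        if hi1 == hi2 { // consistent snapshot
            return uint64(hi1)<<32 | uint64(lo)
        }
    }
}

func main() {
    writeTime(0x1_0000_0005)
    fmt.Printf("%#x\n", readTime())
}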
diff --git a/src/runtime/time_windows_386.s b/src/runtime/time_windows_386.s
new file mode 100644
index 0000000000..19ce6910d7
--- /dev/null
+++ b/src/runtime/time_windows_386.s
@@ -0,0 +1,85 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !faketime
+// +build !faketime
+
+#include "go_asm.h"
+#include "textflag.h"
+#include "time_windows.h"
+
+TEXT time·now(SB),NOSPLIT,$0-20
+ CMPB runtime·useQPCTime(SB), $0
+ JNE useQPC
+loop:
+ MOVL (_INTERRUPT_TIME+time_hi1), AX
+ MOVL (_INTERRUPT_TIME+time_lo), CX
+ MOVL (_INTERRUPT_TIME+time_hi2), DI
+ CMPL AX, DI
+ JNE loop
+
+ // w = DI:CX
+ // multiply by 100
+ MOVL $100, AX
+ MULL CX
+ IMULL $100, DI
+ ADDL DI, DX
+ // w*100 = DX:AX
+ MOVL AX, mono+12(FP)
+ MOVL DX, mono+16(FP)
+
+wall:
+ MOVL (_SYSTEM_TIME+time_hi1), CX
+ MOVL (_SYSTEM_TIME+time_lo), AX
+ MOVL (_SYSTEM_TIME+time_hi2), DX
+ CMPL CX, DX
+ JNE wall
+
+ // w = DX:AX
+ // convert to Unix epoch (but still 100ns units)
+ #define delta 116444736000000000
+ SUBL $(delta & 0xFFFFFFFF), AX
+ SBBL $(delta >> 32), DX
+
+ // nano/100 = DX:AX
+ // split into two decimal halves by div 1e9.
+ // (decimal point is two spots over from correct place,
+ // but we avoid overflow in the high word.)
+ MOVL $1000000000, CX
+ DIVL CX
+ MOVL AX, DI
+ MOVL DX, SI
+
+ // DI = nano/100/1e9 = nano/1e11 = sec/100, DX = SI = nano/100%1e9
+ // split DX into seconds and nanoseconds by div 1e7 magic multiply.
+ MOVL DX, AX
+ MOVL $1801439851, CX
+ MULL CX
+ SHRL $22, DX
+ MOVL DX, BX
+ IMULL $10000000, DX
+ MOVL SI, CX
+ SUBL DX, CX
+
+ // DI = sec/100 (still)
+ // BX = (nano/100%1e9)/1e7 = (nano/1e9)%100 = sec%100
+ // CX = (nano/100%1e9)%1e7 = (nano%1e9)/100 = nsec/100
+ // store nsec for return
+ IMULL $100, CX
+ MOVL CX, nsec+8(FP)
+
+ // DI = sec/100 (still)
+ // BX = sec%100
+ // construct DX:AX = 64-bit sec and store for return
+ MOVL $0, DX
+ MOVL $100, AX
+ MULL DI
+ ADDL BX, AX
+ ADCL $0, DX
+ MOVL AX, sec+0(FP)
+ MOVL DX, sec+4(FP)
+ RET
+useQPC:
+ JMP runtime·nowQPC(SB)
+ RET
diff --git a/src/runtime/time_windows_amd64.s b/src/runtime/time_windows_amd64.s
new file mode 100644
index 0000000000..93ab960b06
--- /dev/null
+++ b/src/runtime/time_windows_amd64.s
@@ -0,0 +1,56 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !faketime
+// +build !faketime
+
+#include "go_asm.h"
+#include "textflag.h"
+#include "time_windows.h"
+
+TEXT time·now(SB),NOSPLIT,$0-24
+ CMPB runtime·useQPCTime(SB), $0
+ JNE useQPC
+ MOVQ $_INTERRUPT_TIME, DI
+loop:
+ MOVL time_hi1(DI), AX
+ MOVL time_lo(DI), BX
+ MOVL time_hi2(DI), CX
+ CMPL AX, CX
+ JNE loop
+ SHLQ $32, AX
+ ORQ BX, AX
+ IMULQ $100, AX
+ MOVQ AX, mono+16(FP)
+
+ MOVQ $_SYSTEM_TIME, DI
+wall:
+ MOVL time_hi1(DI), AX
+ MOVL time_lo(DI), BX
+ MOVL time_hi2(DI), CX
+ CMPL AX, CX
+ JNE wall
+ SHLQ $32, AX
+ ORQ BX, AX
+ MOVQ $116444736000000000, DI
+ SUBQ DI, AX
+ IMULQ $100, AX
+
+ // generated code for
+ // func f(x uint64) (uint64, uint64) { return x/1000000000, x%1000000000 }
+ // adapted to reduce duplication
+ MOVQ AX, CX
+ MOVQ $1360296554856532783, AX
+ MULQ CX
+ ADDQ CX, DX
+ RCRQ $1, DX
+ SHRQ $29, DX
+ MOVQ DX, sec+0(FP)
+ IMULQ $1000000000, DX
+ SUBQ DX, CX
+ MOVL CX, nsec+8(FP)
+ RET
+useQPC:
+ JMP runtime·nowQPC(SB)
+ RET
diff --git a/src/runtime/time_windows_arm.s b/src/runtime/time_windows_arm.s
new file mode 100644
index 0000000000..7c763b66ed
--- /dev/null
+++ b/src/runtime/time_windows_arm.s
@@ -0,0 +1,87 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !faketime
+// +build !faketime
+
+#include "go_asm.h"
+#include "textflag.h"
+#include "time_windows.h"
+
+TEXT time·now(SB),NOSPLIT|NOFRAME,$0-20
+ MOVW $0, R0
+ MOVB runtime·useQPCTime(SB), R0
+ CMP $0, R0
+ BNE useQPC
+ MOVW $_INTERRUPT_TIME, R3
+loop:
+ MOVW time_hi1(R3), R1
+ MOVW time_lo(R3), R0
+ MOVW time_hi2(R3), R2
+ CMP R1, R2
+ BNE loop
+
+ // wintime = R1:R0, multiply by 100
+ MOVW $100, R2
+ MULLU R0, R2, (R4, R3) // R4:R3 = R1:R0 * R2
+ MULA R1, R2, R4, R4
+
+ // wintime*100 = R4:R3
+ MOVW R3, mono+12(FP)
+ MOVW R4, mono+16(FP)
+
+ MOVW $_SYSTEM_TIME, R3
+wall:
+ MOVW time_hi1(R3), R1
+ MOVW time_lo(R3), R0
+ MOVW time_hi2(R3), R2
+ CMP R1, R2
+ BNE wall
+
+ // w = R1:R0 in 100ns units
+ // convert to Unix epoch (but still 100ns units)
+ #define delta 116444736000000000
+ SUB.S $(delta & 0xFFFFFFFF), R0
+ SBC $(delta >> 32), R1
+
+ // Convert to nSec
+ MOVW $100, R2
+ MULLU R0, R2, (R4, R3) // R4:R3 = R1:R0 * R2
+ MULA R1, R2, R4, R4
+ // w = R2:R1 in nSec
+ MOVW R3, R1 // R4:R3 -> R2:R1
+ MOVW R4, R2
+
+ // multiply nanoseconds by reciprocal of 10**9 (scaled by 2**61)
+ // to get seconds (96 bit scaled result)
+ MOVW $0x89705f41, R3 // 2**61 * 10**-9
+ MULLU R1,R3,(R6,R5) // R7:R6:R5 = R2:R1 * R3
+ MOVW $0,R7
+ MULALU R2,R3,(R7,R6)
+
+ // unscale by discarding low 32 bits, shifting the rest by 29
+ MOVW R6>>29,R6 // R7:R6 = (R7:R6:R5 >> 61)
+ ORR R7<<3,R6
+ MOVW R7>>29,R7
+
+ // subtract (10**9 * sec) from nsec to get nanosecond remainder
+ MOVW $1000000000, R5 // 10**9
+ MULLU R6,R5,(R9,R8) // R9:R8 = R7:R6 * R5
+ MULA R7,R5,R9,R9
+ SUB.S R8,R1 // R2:R1 -= R9:R8
+ SBC R9,R2
+
+ // because reciprocal was a truncated repeating fraction, quotient
+ // may be slightly too small -- adjust to make remainder < 10**9
+ CMP R5,R1 // if remainder > 10**9
+ SUB.HS R5,R1 // remainder -= 10**9
+ ADD.HS $1,R6 // sec += 1
+
+ MOVW R6,sec_lo+0(FP)
+ MOVW R7,sec_hi+4(FP)
+ MOVW R1,nsec+8(FP)
+ RET
+useQPC:
+ B runtime·nowQPC(SB) // tail call
+
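The 96-bit reciprocal trick in the wall-clock path above can be checked in a few lines of Go; bits.Mul64 plays the role of MULLU/MULALU here, and the final compare-and-fix is the SUB.HS/ADD.HS pair. A sketch, not the runtime's code:

package main

import (
    "fmt"
    "math/bits"
)

// split divides a nanosecond count by 1e9 the way the ARM code above
// does: multiply by the truncated reciprocal floor(2**61/1e9), discard
// the low 61 bits of the 96-bit product, then repair the quotient if
// the truncation made it one too small.
func split(nsec uint64) (sec, rem uint64) {
    const recip = 0x89705f41 // floor(2**61 / 1e9)
    hi, lo := bits.Mul64(nsec, recip)
    sec = hi<<3 | lo>>61 // (hi:lo) >> 61
    rem = nsec - sec*1e9
    if rem >= 1e9 { // quotient was one too small
        rem -= 1e9
        sec++
    }
    return
}

func main() {
    fmt.Println(split(1_500_000_123)) // 1 500000123
    fmt.Println(split(1_000_000_000)) // 1 0 (this input needs the adjustment)
}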
diff --git a/src/runtime/time_windows_arm64.s b/src/runtime/time_windows_arm64.s
new file mode 100644
index 0000000000..ef52ce4c99
--- /dev/null
+++ b/src/runtime/time_windows_arm64.s
@@ -0,0 +1,67 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !faketime
+// +build !faketime
+
+#include "go_asm.h"
+#include "textflag.h"
+#include "time_windows.h"
+
+TEXT time·now(SB),NOSPLIT|NOFRAME,$0-24
+ MOVB runtime·useQPCTime(SB), R0
+ CMP $0, R0
+ BNE useQPC
+ MOVD $_INTERRUPT_TIME, R3
+loop:
+ MOVWU time_hi1(R3), R1
+ MOVWU time_lo(R3), R0
+ MOVWU time_hi2(R3), R2
+ CMP R1, R2
+ BNE loop
+
+ // wintime = R1:R0, multiply by 100
+ ORR R1<<32, R0
+ MOVD $100, R1
+ MUL R1, R0
+ MOVD R0, mono+16(FP)
+
+ MOVD $_SYSTEM_TIME, R3
+wall:
+ MOVWU time_hi1(R3), R1
+ MOVWU time_lo(R3), R0
+ MOVWU time_hi2(R3), R2
+ CMP R1, R2
+ BNE wall
+
+ // w = R1:R0 in 100ns units
+ // convert to Unix epoch (but still 100ns units)
+ #define delta 116444736000000000
+ ORR R1<<32, R0
+ SUB $delta, R0
+
+ // Convert to nSec
+ MOVD $100, R1
+ MUL R1, R0
+
+ // Code stolen from compiler output for:
+ //
+ // var x uint64
+ // func f() (sec uint64, nsec uint32) { return x / 1000000000, uint32(x % 100000000) }
+ //
+ LSR $1, R0, R1
+ MOVD $-8543223759426509416, R2
+ UMULH R2, R1, R1
+ LSR $28, R1, R1
+ MOVD R1, sec+0(FP)
+ MOVD $-6067343680855748867, R1
+ UMULH R0, R1, R1
+ LSR $26, R1, R1
+ MOVD $100000000, R2
+ MSUB R1, R0, R2, R0
+ MOVW R0, nsec+8(FP)
+ RET
+useQPC:
+ B runtime·nowQPC(SB) // tail call
+
diff --git a/src/runtime/timeasm.go b/src/runtime/timeasm.go
index 82cf63edff..468ff8a0d3 100644
--- a/src/runtime/timeasm.go
+++ b/src/runtime/timeasm.go
@@ -4,7 +4,9 @@
// Declarations for operating systems implementing time.now directly in assembly.
-// +build windows
+//go:build !faketime && (windows || (linux && amd64))
+// +build !faketime
+// +build windows linux,amd64
package runtime
diff --git a/src/runtime/timestub.go b/src/runtime/timestub.go
index 459bf8e543..6f16c70b81 100644
--- a/src/runtime/timestub.go
+++ b/src/runtime/timestub.go
@@ -5,7 +5,10 @@
// Declarations for operating systems implementing time.now
// indirectly, in terms of walltime and nanotime assembly.
+//go:build !faketime && !windows && !(linux && amd64)
+// +build !faketime
// +build !windows
+// +build !linux !amd64
package runtime
diff --git a/src/runtime/timestub2.go b/src/runtime/timestub2.go
index 68777ee4a9..800a2a94e0 100644
--- a/src/runtime/timestub2.go
+++ b/src/runtime/timestub2.go
@@ -2,13 +2,15 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !aix && !darwin && !freebsd && !openbsd && !solaris && !windows && !(linux && amd64)
// +build !aix
// +build !darwin
// +build !freebsd
// +build !openbsd
// +build !solaris
// +build !windows
+// +build !linux !amd64
package runtime
-func walltime1() (sec int64, nsec int32)
+func walltime() (sec int64, nsec int32)
diff --git a/src/runtime/tls_ppc64x.s b/src/runtime/tls_ppc64x.s
index c697449282..25d796fcc6 100644
--- a/src/runtime/tls_ppc64x.s
+++ b/src/runtime/tls_ppc64x.s
@@ -29,7 +29,7 @@ TEXT runtime·save_g(SB),NOSPLIT|NOFRAME,$0-0
BEQ nocgo
#endif
MOVD runtime·tls_g(SB), R31
- MOVD g, 0(R13)(R31*1)
+ MOVD g, 0(R31)
nocgo:
RET
@@ -45,7 +45,7 @@ nocgo:
// NOTE: _cgo_topofstack assumes this only clobbers g (R30), and R31.
TEXT runtime·load_g(SB),NOSPLIT|NOFRAME,$0-0
MOVD runtime·tls_g(SB), R31
- MOVD 0(R13)(R31*1), g
+ MOVD 0(R31), g
RET
GLOBL runtime·tls_g+0(SB), TLSBSS+DUPOK, $8
diff --git a/src/runtime/tls_stub.go b/src/runtime/tls_stub.go
new file mode 100644
index 0000000000..95dafd007c
--- /dev/null
+++ b/src/runtime/tls_stub.go
@@ -0,0 +1,11 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (windows && !amd64) || !windows
+// +build windows,!amd64 !windows
+
+package runtime
+
+//go:nosplit
+func osSetupTLS(mp *m) {}
diff --git a/src/runtime/tls_windows_amd64.go b/src/runtime/tls_windows_amd64.go
new file mode 100644
index 0000000000..cacaa84496
--- /dev/null
+++ b/src/runtime/tls_windows_amd64.go
@@ -0,0 +1,10 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+// osSetupTLS is called by needm to set up TLS for non-Go threads.
+//
+// Defined in assembly.
+func osSetupTLS(mp *m)
diff --git a/src/runtime/trace.go b/src/runtime/trace.go
index bcd0b9d56c..1530178c85 100644
--- a/src/runtime/trace.go
+++ b/src/runtime/trace.go
@@ -53,8 +53,8 @@ const (
traceEvGoSysBlock = 30 // syscall blocks [timestamp]
traceEvGoWaiting = 31 // denotes that goroutine is blocked when tracing starts [timestamp, goroutine id]
traceEvGoInSyscall = 32 // denotes that goroutine is in syscall when tracing starts [timestamp, goroutine id]
- traceEvHeapAlloc = 33 // memstats.heap_live change [timestamp, heap_alloc]
- traceEvNextGC = 34 // memstats.next_gc change [timestamp, next_gc]
+ traceEvHeapAlloc = 33 // gcController.heapLive change [timestamp, heap_alloc]
+ traceEvHeapGoal = 34 // gcController.heapGoal (formerly next_gc) change [timestamp, heap goal in bytes]
traceEvTimerGoroutine = 35 // not currently used; previously denoted timer goroutine [timer goroutine id]
traceEvFutileWakeup = 36 // denotes that the previous wakeup of this goroutine was futile [timestamp]
traceEvString = 37 // string dictionary entry [ID, length, string]
@@ -221,7 +221,8 @@ func StartTrace() error {
stackID := traceStackID(mp, stkBuf, 2)
releasem(mp)
- for _, gp := range allgs {
+ // World is stopped, no need to lock.
+ forEachGRace(func(gp *g) {
status := readgstatus(gp)
if status != _Gdead {
gp.traceseq = 0
@@ -241,7 +242,7 @@ func StartTrace() error {
} else {
gp.sysblocktraced = false
}
- }
+ })
traceProcStart()
traceGoStart()
// Note: ticksStart needs to be set after we emit traceEvGoInSyscall events.
@@ -1143,15 +1144,15 @@ func traceGoSysBlock(pp *p) {
}
func traceHeapAlloc() {
- traceEvent(traceEvHeapAlloc, -1, memstats.heap_live)
+ traceEvent(traceEvHeapAlloc, -1, gcController.heapLive)
}
-func traceNextGC() {
- if nextGC := atomic.Load64(&memstats.next_gc); nextGC == ^uint64(0) {
+func traceHeapGoal() {
+ if heapGoal := atomic.Load64(&gcController.heapGoal); heapGoal == ^uint64(0) {
// Heap-based triggering is disabled.
- traceEvent(traceEvNextGC, -1, 0)
+ traceEvent(traceEvHeapGoal, -1, 0)
} else {
- traceEvent(traceEvNextGC, -1, nextGC)
+ traceEvent(traceEvHeapGoal, -1, heapGoal)
}
}
diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go
index 53eb689848..167d51c452 100644
--- a/src/runtime/traceback.go
+++ b/src/runtime/traceback.go
@@ -457,17 +457,8 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in
name = "panic"
}
print(name, "(")
- argp := (*[100]uintptr)(unsafe.Pointer(frame.argp))
- for i := uintptr(0); i < frame.arglen/sys.PtrSize; i++ {
- if i >= 10 {
- print(", ...")
- break
- }
- if i != 0 {
- print(", ")
- }
- print(hex(argp[i]))
- }
+ argp := unsafe.Pointer(frame.argp)
+ printArgs(f, argp)
print(")\n")
print("\t", file, ":", line)
if frame.pc > f.entry {
@@ -579,6 +570,82 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in
return n
}
+// printArgs prints function arguments in traceback.
+func printArgs(f funcInfo, argp unsafe.Pointer) {
+ // The "instruction" of argument printing is encoded in _FUNCDATA_ArgInfo.
+ // See cmd/compile/internal/ssagen.emitArgInfo for the description of the
+ // encoding.
+ // These constants need to be in sync with the compiler.
+ const (
+ _endSeq = 0xff
+ _startAgg = 0xfe
+ _endAgg = 0xfd
+ _dotdotdot = 0xfc
+ _offsetTooLarge = 0xfb
+ )
+
+ const (
+ limit = 10 // print no more than 10 args/components
+ maxDepth = 5 // no more than 5 layers of nesting
+ maxLen = (maxDepth*3+2)*limit + 1 // max length of _FUNCDATA_ArgInfo (see the compiler side for reasoning)
+ )
+
+ p := (*[maxLen]uint8)(funcdata(f, _FUNCDATA_ArgInfo))
+ if p == nil {
+ return
+ }
+
+ print1 := func(off, sz uint8) {
+ x := readUnaligned64(add(argp, uintptr(off)))
+ // mask out irrelevant bits
+ if sz < 8 {
+ shift := 64 - sz*8
+ if sys.BigEndian {
+ x = x >> shift
+ } else {
+ x = x << shift >> shift
+ }
+ }
+ print(hex(x))
+ }
+
+ start := true
+ printcomma := func() {
+ if !start {
+ print(", ")
+ }
+ }
+ pi := 0
+printloop:
+ for {
+ o := p[pi]
+ pi++
+ switch o {
+ case _endSeq:
+ break printloop
+ case _startAgg:
+ printcomma()
+ print("{")
+ start = true
+ continue
+ case _endAgg:
+ print("}")
+ case _dotdotdot:
+ printcomma()
+ print("...")
+ case _offsetTooLarge:
+ printcomma()
+ print("_")
+ default:
+ printcomma()
+ sz := p[pi]
+ pi++
+ print1(o, sz)
+ }
+ start = false
+ }
+}
+
// reflectMethodValue is a partial duplicate of reflect.makeFuncImpl
// and reflect.methodValue.
type reflectMethodValue struct {
@@ -630,7 +697,7 @@ func getArgInfo(frame *stkframe, f funcInfo, needArgMap bool, ctxt *funcval) (ar
// Figure out whether the return values are valid.
// Reflect will update this value after it copies
// in the return values.
- retValid = *(*bool)(unsafe.Pointer(arg0 + 3*sys.PtrSize))
+ retValid = *(*bool)(unsafe.Pointer(arg0 + 4*sys.PtrSize))
}
if mv.fn != f.entry {
print("runtime: confused by ", funcname(f), "\n")
@@ -945,19 +1012,16 @@ func tracebackothers(me *g) {
traceback(^uintptr(0), ^uintptr(0), 0, curgp)
}
- // We can't take allglock here because this may be during fatal
- // throw/panic, where locking allglock could be out-of-order or a
- // direct deadlock.
+ // We can't call locking forEachG here because this may be during fatal
+ // throw/panic, where locking could be out-of-order or a direct
+ // deadlock.
//
- // Instead, use atomic access to allgs which requires no locking. We
- // don't lock against concurrent creation of new Gs, but even with
- // allglock we may miss Gs created after this loop.
- ptr, length := atomicAllG()
- for i := uintptr(0); i < length; i++ {
- gp := atomicAllGIndex(ptr, i)
-
+ // Instead, use forEachGRace, which requires no locking. We don't lock
+ // against concurrent creation of new Gs, but even with allglock we may
+ // miss Gs created after this loop.
+ forEachGRace(func(gp *g) {
if gp == me || gp == curgp || readgstatus(gp) == _Gdead || isSystemGoroutine(gp, false) && level < 2 {
- continue
+ return
}
print("\n")
goroutineheader(gp)
@@ -971,7 +1035,7 @@ func tracebackothers(me *g) {
} else {
traceback(^uintptr(0), ^uintptr(0), 0, gp)
}
- }
+ })
}
// tracebackHexdump hexdumps part of stk around frame.sp and frame.fp
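
The byte-oriented argument metadata that printArgs consumes is easiest to understand on a worked example. Below is a standalone decoder with the same opcode values and comma/start bookkeeping as the loop above; the fake frame map and the example stream (an int64 followed by a two-field int8 struct) are invented for illustration, not emitted by the compiler.

package main

import "fmt"

const (
    endSeq         = 0xff
    startAgg       = 0xfe
    endAgg         = 0xfd
    dotdotdot      = 0xfc
    offsetTooLarge = 0xfb
)

// frame is a fake argument area: offset -> value.
var frame = map[uint8]uint64{0: 0x1, 8: 0x2, 9: 0x3}

func decode(p []uint8) string {
    out := ""
    start := true
    comma := func() {
        if !start {
            out += ", "
        }
    }
    for i := 0; ; {
        o := p[i]
        i++
        switch o {
        case endSeq:
            return out
        case startAgg:
            comma()
            out += "{"
            start = true
            continue
        case endAgg:
            out += "}"
        case dotdotdot:
            comma()
            out += "..."
        case offsetTooLarge:
            comma()
            out += "_"
        default: // an offset byte followed by a size byte
            sz := p[i]
            i++
            comma()
            _ = sz // a full decoder would mask the value down to sz bytes
            out += fmt.Sprintf("%#x", frame[o])
        }
        start = false
    }
}

func main() {
    // f(a int64, b struct{ x, y int8 }) with a=1, b={2,3}:
    stream := []uint8{0, 8, startAgg, 8, 1, 9, 1, endAgg, endSeq}
    fmt.Println("(" + decode(stream) + ")") // (0x1, {0x2, 0x3})
}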
diff --git a/src/runtime/traceback_test.go b/src/runtime/traceback_test.go
new file mode 100644
index 0000000000..2a0497e9a9
--- /dev/null
+++ b/src/runtime/traceback_test.go
@@ -0,0 +1,121 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+ "bytes"
+ "runtime"
+ "testing"
+)
+
+var testTracebackArgsBuf [1000]byte
+
+func TestTracebackArgs(t *testing.T) {
+ tests := []struct {
+ fn func() int
+ expect string
+ }{
+ // simple ints
+ {
+ func() int { return testTracebackArgs1(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12) },
+ "testTracebackArgs1(0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, ...)",
+ },
+ // some aggregates
+ {
+ func() int {
+ return testTracebackArgs2(false, struct {
+ a, b, c int
+ x [2]int
+ }{1, 2, 3, [2]int{4, 5}}, [0]int{}, [3]byte{6, 7, 8})
+ },
+ "testTracebackArgs2(0x0, {0x1, 0x2, 0x3, {0x4, 0x5}}, {}, {0x6, 0x7, 0x8})",
+ },
+ {
+ func() int { return testTracebackArgs3([3]byte{1, 2, 3}, 4, 5, 6, [3]byte{7, 8, 9}) },
+ "testTracebackArgs3({0x1, 0x2, 0x3}, 0x4, 0x5, 0x6, {0x7, 0x8, 0x9})",
+ },
+ // too deeply nested type
+ {
+ func() int { return testTracebackArgs4(false, [1][1][1][1][1][1][1][1][1][1]int{}) },
+ "testTracebackArgs4(0x0, {{{{{...}}}}})",
+ },
+	// a lot of zero-sized types
+ {
+ func() int {
+ z := [0]int{}
+ return testTracebackArgs5(false, struct {
+ x int
+ y [0]int
+ z [2][0]int
+ }{1, z, [2][0]int{}}, z, z, z, z, z, z, z, z, z, z, z, z)
+ },
+ "testTracebackArgs5(0x0, {0x1, {}, {{}, {}}}, {}, {}, {}, {}, {}, ...)",
+ },
+ }
+ for _, test := range tests {
+ n := test.fn()
+ got := testTracebackArgsBuf[:n]
+ if !bytes.Contains(got, []byte(test.expect)) {
+ t.Errorf("traceback does not contain expected string: want %q, got\n%s", test.expect, got)
+ }
+ }
+}
+
+//go:noinline
+func testTracebackArgs1(a, b, c, d, e, f, g, h, i, j, k, l int) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a < 0 {
+ // use in-reg args to keep them alive
+ return a + b + c + d + e + f + g + h + i + j + k + l
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs2(a bool, b struct {
+ a, b, c int
+ x [2]int
+}, _ [0]int, d [3]byte) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a {
+ // use in-reg args to keep them alive
+ return b.a + b.b + b.c + b.x[0] + b.x[1] + int(d[0]) + int(d[1]) + int(d[2])
+ }
+ return n
+}
+
+//go:noinline
+//go:registerparams
+func testTracebackArgs3(x [3]byte, a, b, c int, y [3]byte) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a < 0 {
+ // use in-reg args to keep them alive
+ return int(x[0]) + int(x[1]) + int(x[2]) + a + b + c + int(y[0]) + int(y[1]) + int(y[2])
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs4(a bool, x [1][1][1][1][1][1][1][1][1][1]int) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a {
+ panic(x) // use args to keep them alive
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs5(a bool, x struct {
+ x int
+ y [0]int
+ z [2][0]int
+}, _, _, _, _, _, _, _, _, _, _, _, _ [0]int) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a {
+ panic(x) // use args to keep them alive
+ }
+ return n
+}
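Everything above goes through the public runtime.Stack API, so the new argument printing is easy to observe outside the test suite. A standalone example (the exact words shown depend on the ABI in use, and values the runtime cannot recover appear as placeholders such as "_"):

package main

import (
	"fmt"
	"runtime"
)

//go:noinline
func work(a, b int, s [3]byte) int {
	buf := make([]byte, 2048)
	n := runtime.Stack(buf, false) // current goroutine only
	fmt.Printf("%s\n", buf[:n])
	return a + b + int(s[0])
}

func main() {
	work(1, 2, [3]byte{3, 4, 5})
	// With this change the frame for work reads roughly:
	//   main.work(0x1, 0x2, {0x3, 0x4, 0x5})
}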
diff --git a/src/runtime/type.go b/src/runtime/type.go
index 18fc4bbfad..c0911b1dcb 100644
--- a/src/runtime/type.go
+++ b/src/runtime/type.go
@@ -262,7 +262,7 @@ func (t *_type) textOff(off textOff) unsafe.Pointer {
if off == -1 {
// -1 is the sentinel value for unreachable code.
// See cmd/link/internal/ld/data.go:relocsym.
- return unsafe.Pointer(^uintptr(0))
+ return unsafe.Pointer(funcPC(unreachableMethod))
}
base := uintptr(unsafe.Pointer(t))
var md *moduledata
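The new sentinel is safer than ^uintptr(0): a text offset the linker pruned used to resolve to an address that faults unpredictably if ever reached, while funcPC(unreachableMethod) points at a real routine that can fail with a diagnosable message. The same idea in ordinary Go, offered as a hedged analogy rather than the runtime mechanism itself:

package main

import "fmt"

// unreachable stands in for a pruned target: it is still a valid
// function pointer, so an unexpected call panics with a message that
// names the problem instead of jumping to a garbage address.
func unreachable() {
	panic("call to a method the linker deemed unreachable")
}

var methods = map[string]func(){
	"used":   func() { fmt.Println("ok") },
	"pruned": unreachable,
}

func main() {
	methods["used"]()
	defer func() { fmt.Println("recovered:", recover()) }()
	methods["pruned"]()
}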
diff --git a/src/runtime/vdso_elf32.go b/src/runtime/vdso_elf32.go
index 2720f33eed..456173b0f5 100644
--- a/src/runtime/vdso_elf32.go
+++ b/src/runtime/vdso_elf32.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build linux && (386 || arm)
// +build linux
// +build 386 arm
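This and the following header changes are the mechanical half of the Go 1.17 build-constraint migration: every file gains a single //go:build expression that must agree with the legacy // +build lines, which stay behind for older toolchains. The translation rules, shown on this file's constraint:

// The two legacy lines AND together, and "386 arm" within one line is
// an OR; gofmt keeps the //go:build expression in sync with them.
//go:build linux && (386 || arm)
// +build linux
// +build 386 arm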
diff --git a/src/runtime/vdso_elf64.go b/src/runtime/vdso_elf64.go
index 6ded9d621a..9923bd4697 100644
--- a/src/runtime/vdso_elf64.go
+++ b/src/runtime/vdso_elf64.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build linux && (amd64 || arm64 || mips64 || mips64le || ppc64 || ppc64le)
// +build linux
// +build amd64 arm64 mips64 mips64le ppc64 ppc64le
diff --git a/src/runtime/vdso_freebsd.go b/src/runtime/vdso_freebsd.go
index 122cc8b128..7ca7b2810b 100644
--- a/src/runtime/vdso_freebsd.go
+++ b/src/runtime/vdso_freebsd.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build freebsd
// +build freebsd
package runtime
@@ -105,7 +106,7 @@ func nanotime1() int64 {
return int64((1e9 * uint64(bt.sec)) + ((1e9 * uint64(bt.frac>>32)) >> 32))
}
-func walltime1() (sec int64, nsec int32) {
+func walltime() (sec int64, nsec int32) {
bt := vdsoClockGettime(_CLOCK_REALTIME)
if bt == zeroBintime {
return fallback_walltime()
diff --git a/src/runtime/vdso_freebsd_x86.go b/src/runtime/vdso_freebsd_x86.go
index 1fa5d80dcc..23a5a8c322 100644
--- a/src/runtime/vdso_freebsd_x86.go
+++ b/src/runtime/vdso_freebsd_x86.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build freebsd && (386 || amd64)
// +build freebsd
// +build 386 amd64
diff --git a/src/runtime/vdso_in_none.go b/src/runtime/vdso_in_none.go
index 7f4019c0d6..c66fbf8216 100644
--- a/src/runtime/vdso_in_none.go
+++ b/src/runtime/vdso_in_none.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build (linux && !386 && !amd64 && !arm && !arm64 && !mips64 && !mips64le && !ppc64 && !ppc64le) || !linux
// +build linux,!386,!amd64,!arm,!arm64,!mips64,!mips64le,!ppc64,!ppc64le !linux
package runtime
diff --git a/src/runtime/vdso_linux.go b/src/runtime/vdso_linux.go
index 6e2942498d..ae211f96b1 100644
--- a/src/runtime/vdso_linux.go
+++ b/src/runtime/vdso_linux.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build linux && (386 || amd64 || arm || arm64 || mips64 || mips64le || ppc64 || ppc64le)
// +build linux
// +build 386 amd64 arm arm64 mips64 mips64le ppc64 ppc64le
diff --git a/src/runtime/vdso_linux_mips64x.go b/src/runtime/vdso_linux_mips64x.go
index 3a0f947612..395ddbba69 100644
--- a/src/runtime/vdso_linux_mips64x.go
+++ b/src/runtime/vdso_linux_mips64x.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build linux && (mips64 || mips64le)
// +build linux
// +build mips64 mips64le
diff --git a/src/runtime/vdso_linux_ppc64x.go b/src/runtime/vdso_linux_ppc64x.go
index f30946e4c5..b741dbfcdc 100644
--- a/src/runtime/vdso_linux_ppc64x.go
+++ b/src/runtime/vdso_linux_ppc64x.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build linux && (ppc64 || ppc64le)
// +build linux
// +build ppc64 ppc64le
diff --git a/src/runtime/vlrt.go b/src/runtime/vlrt.go
index 996c0611fd..cf631bdcca 100644
--- a/src/runtime/vlrt.go
+++ b/src/runtime/vlrt.go
@@ -23,6 +23,7 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
+//go:build arm || 386 || mips || mipsle
// +build arm 386 mips mipsle
package runtime
diff --git a/src/runtime/wincallback.go b/src/runtime/wincallback.go
index cf3327c6fe..8411c98412 100644
--- a/src/runtime/wincallback.go
+++ b/src/runtime/wincallback.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
// Generate Windows callback assembly file.
@@ -30,7 +31,7 @@ func genasm386Amd64() {
// CALL instruction in runtime·callbackasm. This determines
// which Go callback function is executed later on.
-TEXT runtime·callbackasm(SB),7,$0
+TEXT runtime·callbackasm<ABIInternal>(SB),7,$0
`)
for i := 0; i < maxCallback; i++ {
buf.WriteString("\tCALL\truntime·callbackasm1(SB)\n")
@@ -58,7 +59,7 @@ func genasmArm() {
// It then calls the Go implementation for that callback.
#include "textflag.h"
-TEXT runtime·callbackasm(SB),NOSPLIT|NOFRAME,$0
+TEXT runtime·callbackasm<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
`)
for i := 0; i < maxCallback; i++ {
buf.WriteString(fmt.Sprintf("\tMOVW\t$%d, R12\n", i))
@@ -86,7 +87,7 @@ func genasmArm64() {
// It then calls the Go implementation for that callback.
#include "textflag.h"
-TEXT runtime·callbackasm(SB),NOSPLIT|NOFRAME,$0
+TEXT runtime·callbackasm<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
`)
for i := 0; i < maxCallback; i++ {
buf.WriteString(fmt.Sprintf("\tMOVD\t$%d, R12\n", i))
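On 386/amd64 the generated file is nothing but maxCallback identical CALL instructions, and the shared runtime·callbackasm1 recovers which stub fired from the return address the CALL pushed; on ARM the index is passed explicitly in R12 instead, as the generated MOVW/MOVD above shows. A sketch of the return-address arithmetic, where the 5-byte CALL size and the stub base are assumptions:

package main

import "fmt"

// callbackIndex recovers which of the N identical CALL stubs at
// runtime·callbackasm fired, from the return address the CALL pushed.
// This sketches the x86 scheme only.
func callbackIndex(retaddr, firstStub, stubSize uintptr) int {
	// The return address points just past the stub that fired, so the
	// quotient is one too high.
	return int((retaddr-firstStub)/stubSize) - 1
}

func main() {
	const first, size = 0x1000, 5 // hypothetical stub base and CALL size
	fmt.Println(callbackIndex(first+3*size, first, size)) // prints 2
}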
diff --git a/src/runtime/write_err.go b/src/runtime/write_err.go
index 6b1467b1c4..a4656fd728 100644
--- a/src/runtime/write_err.go
+++ b/src/runtime/write_err.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !android
// +build !android
package runtime
diff --git a/src/runtime/write_err_android.go b/src/runtime/write_err_android.go
index 2419fc8663..a876900c95 100644
--- a/src/runtime/write_err_android.go
+++ b/src/runtime/write_err_android.go
@@ -144,7 +144,7 @@ func writeLogdHeader() int {
// hdr[3:7] sec unsigned uint32, little endian.
// hdr[7:11] nsec unsigned uint32, little endian.
hdr[0] = 0 // LOG_ID_MAIN
- sec, nsec := walltime()
+ sec, nsec, _ := time_now()
packUint32(hdr[3:7], uint32(sec))
packUint32(hdr[7:11], uint32(nsec))
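time_now is the runtime's internal clock read (it also returns a monotonic reading, discarded here); the header layout around it is plain little-endian packing. A plausible packUint32 matching the hdr comments above (an assumed shape for the helper, which this hunk does not show):

package main

import "fmt"

// packUint32 writes v least-significant byte first, matching the
// little-endian layout the hdr comments describe.
func packUint32(b []byte, v uint32) {
	b[0] = byte(v)
	b[1] = byte(v >> 8)
	b[2] = byte(v >> 16)
	b[3] = byte(v >> 24)
}

func main() {
	var hdr [11]byte
	sec, nsec := uint32(1620000000), uint32(123456789)
	packUint32(hdr[3:7], sec)
	packUint32(hdr[7:11], nsec)
	fmt.Printf("% x\n", hdr)
}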
diff --git a/src/runtime/zcallback_windows.s b/src/runtime/zcallback_windows.s
index 7772eef329..37ffb38aca 100644
--- a/src/runtime/zcallback_windows.s
+++ b/src/runtime/zcallback_windows.s
@@ -9,7 +9,7 @@
// CALL instruction in runtime·callbackasm. This determines
// which Go callback function is executed later on.
-TEXT runtime·callbackasm(SB),7,$0
+TEXT runtime·callbackasm<ABIInternal>(SB),7,$0
CALL runtime·callbackasm1(SB)
CALL runtime·callbackasm1(SB)
CALL runtime·callbackasm1(SB)
diff --git a/src/runtime/zcallback_windows_arm.s b/src/runtime/zcallback_windows_arm.s
index f943d84cbf..a73a813acb 100644
--- a/src/runtime/zcallback_windows_arm.s
+++ b/src/runtime/zcallback_windows_arm.s
@@ -9,7 +9,7 @@
// It then calls the Go implementation for that callback.
#include "textflag.h"
-TEXT runtime·callbackasm(SB),NOSPLIT|NOFRAME,$0
+TEXT runtime·callbackasm<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
MOVW $0, R12
B runtime·callbackasm1(SB)
MOVW $1, R12
diff --git a/src/runtime/zcallback_windows_arm64.s b/src/runtime/zcallback_windows_arm64.s
index 69fb05788c..2a6bda0990 100644
--- a/src/runtime/zcallback_windows_arm64.s
+++ b/src/runtime/zcallback_windows_arm64.s
@@ -9,7 +9,7 @@
// It then calls the Go implementation for that callback.
#include "textflag.h"
-TEXT runtime·callbackasm(SB),NOSPLIT|NOFRAME,$0
+TEXT runtime·callbackasm<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
MOVD $0, R12
B runtime·callbackasm1(SB)
MOVD $1, R12