diff options
Diffstat (limited to 'src/runtime')
35 files changed, 3004 insertions, 176 deletions
diff --git a/src/runtime/_mkmalloc/mkmalloc.go b/src/runtime/_mkmalloc/mkmalloc.go index 986b0aa9f8..1f040c8861 100644 --- a/src/runtime/_mkmalloc/mkmalloc.go +++ b/src/runtime/_mkmalloc/mkmalloc.go @@ -254,7 +254,8 @@ func inline(config generatorConfig) []byte { } // Write out the package and import declarations. - out.WriteString("// Code generated by mkmalloc.go; DO NOT EDIT.\n\n") + out.WriteString("// Code generated by mkmalloc.go; DO NOT EDIT.\n") + out.WriteString("// See overview in malloc_stubs.go.\n\n") out.WriteString("package " + f.Name.Name + "\n\n") for _, importDecl := range importDecls { out.Write(mustFormatNode(fset, importDecl)) diff --git a/src/runtime/arena_test.go b/src/runtime/arena_test.go index ca5223b59c..0bb1950464 100644 --- a/src/runtime/arena_test.go +++ b/src/runtime/arena_test.go @@ -36,6 +36,11 @@ type largeScalar [UserArenaChunkBytes + 1]byte type largePointer [UserArenaChunkBytes/unsafe.Sizeof(&smallPointer{}) + 1]*smallPointer func TestUserArena(t *testing.T) { + if Clobberfree() { + // This test crashes with SEGV in clobberfree in mgcsweep.go with GODEBUG=clobberfree=1. + t.Skip("triggers SEGV with GODEBUG=clobberfree=1") + } + // Set GOMAXPROCS to 2 so we don't run too many of these // tests in parallel. defer GOMAXPROCS(GOMAXPROCS(2)) @@ -228,6 +233,11 @@ func runSubTestUserArenaSlice[S comparable](t *testing.T, value []S, parallel bo } func TestUserArenaLiveness(t *testing.T) { + if Clobberfree() { + // This test crashes with SEGV in clobberfree in mgcsweep.go with GODEBUG=clobberfree=1. + t.Skip("triggers SEGV with GODEBUG=clobberfree=1") + } + t.Run("Free", func(t *testing.T) { testUserArenaLiveness(t, false) }) @@ -320,6 +330,11 @@ func testUserArenaLiveness(t *testing.T, useArenaFinalizer bool) { } func TestUserArenaClearsPointerBits(t *testing.T) { + if Clobberfree() { + // This test crashes with SEGV in clobberfree in mgcsweep.go with GODEBUG=clobberfree=1. 
+ t.Skip("triggers SEGV with GODEBUG=clobberfree=1") + } + // This is a regression test for a serious issue wherein if pointer bits // aren't properly cleared, it's possible to allocate scalar data down // into a previously pointer-ful area, causing misinterpretation by the GC. diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index ea85146936..7c746803a8 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -181,6 +181,14 @@ TEXT runtime·rt0_go(SB),NOSPLIT|NOFRAME|TOPFRAME,$0 MOVQ AX, 24(SP) MOVQ BX, 32(SP) + // This is typically the entry point for Go programs. + // Call stack unwinding must not proceed past this frame. + // Set the frame pointer register to 0 so that frame pointer-based unwinders + // (which don't use debug info for performance reasons) + // won't attempt to unwind past this function. + // See go.dev/issue/63630 + MOVQ $0, BP + // create istack out of the given (operating system) stack. // _cgo_init may update stackguard. MOVQ $runtime·g0(SB), DI @@ -408,6 +416,13 @@ TEXT runtime·asminit(SB),NOSPLIT,$0-0 RET TEXT runtime·mstart(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 + // This is the root frame of new Go-created OS threads. + // Call stack unwinding must not proceed past this frame. + // Set the frame pointer register to 0 so that frame pointer-based unwinders + // (which don't use debug info for performance reasons) + // won't attempt to unwind past this function. + // See go.dev/issue/63630 + MOVD $0, BP CALL runtime·mstart0(SB) RET // not reached diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s index 902a7066aa..01f2690f4e 100644 --- a/src/runtime/asm_arm64.s +++ b/src/runtime/asm_arm64.s @@ -109,6 +109,14 @@ TEXT runtime·rt0_go(SB),NOSPLIT|TOPFRAME,$0 MOVW R0, 8(RSP) // argc MOVD R1, 16(RSP) // argv + // This is typically the entry point for Go programs. + // Call stack unwinding must not proceed past this frame. 
+ // Set the frame pointer register to 0 so that frame pointer-based unwinders + // (which don't use debug info for performance reasons) + // won't attempt to unwind past this function. + // See go.dev/issue/63630 + MOVD $0, R29 + #ifdef TLS_darwin // Initialize TLS. MOVD ZR, g // clear g, make sure it's not junk. @@ -248,6 +256,13 @@ TEXT runtime·asminit(SB),NOSPLIT|NOFRAME,$0-0 RET TEXT runtime·mstart(SB),NOSPLIT|TOPFRAME,$0 + // This is the root frame of new Go-created OS threads. + // Call stack unwinding must not proceed past this frame. + // Set the frame pointer register to 0 so that frame pointer-based unwinders + // (which don't use debug info for performance reasons) + // won't attempt to unwind past this function. + // See go.dev/issue/63630 + MOVD $0, R29 BL runtime·mstart0(SB) RET // not reached diff --git a/src/runtime/asm_riscv64.s b/src/runtime/asm_riscv64.s index 5bd16181ee..428701a503 100644 --- a/src/runtime/asm_riscv64.s +++ b/src/runtime/asm_riscv64.s @@ -623,14 +623,14 @@ TEXT _cgo_topofstack(SB),NOSPLIT,$8 RET // func goexit(neverCallThisFunction) -// The top-most function running on a goroutine -// returns to goexit+PCQuantum. +// The top-most function running on a goroutine, returns to goexit+PCQuantum*2. +// Note that the NOPs are written in a manner that will not be compressed, +// since the offset must be known by the runtime. TEXT runtime·goexit(SB),NOSPLIT|NOFRAME|TOPFRAME,$0-0 - MOV ZERO, ZERO // NOP + WORD $0x00000013 // NOP JMP runtime·goexit1(SB) // does not return // traceback from goexit1 must hit code range of goexit - MOV ZERO, ZERO // NOP - + WORD $0x00000013 // NOP // This is called from .init_array and follows the platform, not the Go ABI. 
TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0 diff --git a/src/runtime/crash_test.go b/src/runtime/crash_test.go index 2b8ca549ad..00e67aeca0 100644 --- a/src/runtime/crash_test.go +++ b/src/runtime/crash_test.go @@ -413,6 +413,15 @@ func TestRepanickedPanicSandwich(t *testing.T) { } } +func TestDoublePanicWithSameValue(t *testing.T) { + output := runTestProg(t, "testprog", "DoublePanicWithSameValue") + want := `panic: message +` + if !strings.HasPrefix(output, want) { + t.Fatalf("output does not start with %q:\n%s", want, output) + } +} + func TestGoexitCrash(t *testing.T) { // External linking brings in cgo, causing deadlock detection not working. testenv.MustInternalLink(t, deadlockBuildTypes) diff --git a/src/runtime/debuglog.go b/src/runtime/debuglog.go index e993e396c1..405f2455c6 100644 --- a/src/runtime/debuglog.go +++ b/src/runtime/debuglog.go @@ -196,7 +196,8 @@ const ( debugLogPtr debugLogString debugLogConstString - debugLogStringOverflow + debugLogHexdump + debugLogOverflow debugLogPC debugLogTraceback @@ -365,7 +366,7 @@ func (l *dloggerImpl) s(x string) *dloggerImpl { l.w.uvarint(uint64(len(b))) l.w.bytes(b) if len(b) != len(x) { - l.w.byte(debugLogStringOverflow) + l.w.byte(debugLogOverflow) l.w.uvarint(uint64(len(x) - len(b))) } } @@ -373,6 +374,32 @@ func (l *dloggerImpl) s(x string) *dloggerImpl { } //go:nosplit +func (l dloggerFake) hexdump(p unsafe.Pointer, bytes uintptr) dloggerFake { return l } + +//go:nosplit +func (l *dloggerImpl) hexdump(p unsafe.Pointer, bytes uintptr) *dloggerImpl { + var b []byte + bb := (*slice)(unsafe.Pointer(&b)) + bb.array = unsafe.Pointer(p) + bb.len, bb.cap = int(bytes), int(bytes) + if len(b) > debugLogStringLimit { + b = b[:debugLogStringLimit] + } + + l.w.byte(debugLogHexdump) + l.w.uvarint(uint64(uintptr(p))) + l.w.uvarint(uint64(len(b))) + l.w.bytes(b) + + if uintptr(len(b)) != bytes { + l.w.byte(debugLogOverflow) + l.w.uvarint(uint64(bytes) - uint64(len(b))) + } + + return l +} + +//go:nosplit func (l 
dloggerFake) pc(x uintptr) dloggerFake { return l } //go:nosplit @@ -708,9 +735,30 @@ func (r *debugLogReader) printVal() bool { s := *(*string)(unsafe.Pointer(&str)) print(s) - case debugLogStringOverflow: + case debugLogOverflow: print("..(", r.uvarint(), " more bytes)..") + case debugLogHexdump: + p := uintptr(r.uvarint()) + bl := r.uvarint() + if r.begin+bl > r.end { + r.begin = r.end + print("<hexdump length corrupted>") + break + } + println() // Start on a new line + hd := hexdumper{addr: p} + for bl > 0 { + b := r.data.b[r.begin%uint64(len(r.data.b)):] + if uint64(len(b)) > bl { + b = b[:bl] + } + r.begin += uint64(len(b)) + bl -= uint64(len(b)) + hd.write(b) + } + hd.close() + case debugLogPC: printDebugLogPC(uintptr(r.uvarint()), false) diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index 3a781b7551..6e0360aaca 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -238,6 +238,12 @@ func SetEnvs(e []string) { envs = e } const PtrSize = goarch.PtrSize +const ClobberdeadPtr = clobberdeadPtr + +func Clobberfree() bool { + return debug.clobberfree != 0 +} + var ForceGCPeriod = &forcegcperiod // SetTracebackEnv is like runtime/debug.SetTraceback, but it raises @@ -633,6 +639,34 @@ func RunGetgThreadSwitchTest() { } } +// Expose freegc for testing. +func Freegc(p unsafe.Pointer, size uintptr, noscan bool) { + freegc(p, size, noscan) +} + +// Expose gcAssistBytes for the current g for testing. +func AssistCredit() int64 { + assistG := getg() + if assistG.m.curg != nil { + assistG = assistG.m.curg + } + return assistG.gcAssistBytes +} + +// Expose gcBlackenEnabled for testing. +func GcBlackenEnable() bool { + // Note we do a non-atomic load here. + // Some checks against gcBlackenEnabled (e.g., in mallocgc) + // are currently done via non-atomic load for performance reasons, + // but other checks are done via atomic load (e.g., in mgcmark.go), + // so interpreting this value in a test may be subtle. 
+ return gcBlackenEnabled != 0 +} + +const SizeSpecializedMallocEnabled = sizeSpecializedMallocEnabled + +const RuntimeFreegcEnabled = runtimeFreegcEnabled + const ( PageSize = pageSize PallocChunkPages = pallocChunkPages @@ -1472,6 +1506,15 @@ func Releasem() { releasem(getg().m) } +// GoschedIfBusy is an explicit preemption check to call back +// into the scheduler. This is useful for tests that run code +// which spend most of their time as non-preemptible, as it +// can be placed right after becoming preemptible again to ensure +// that the scheduler gets a chance to preempt the goroutine. +func GoschedIfBusy() { + goschedIfBusy() +} + type PIController struct { piController } @@ -1988,3 +2031,36 @@ func (head *ListHeadManual) Pop() unsafe.Pointer { func (head *ListHeadManual) Remove(p unsafe.Pointer) { head.l.remove(p) } + +func Hexdumper(base uintptr, wordBytes int, mark func(addr uintptr, start func()), data ...[]byte) string { + buf := make([]byte, 0, 2048) + getg().writebuf = buf + h := hexdumper{addr: base, addrBytes: 4, wordBytes: uint8(wordBytes)} + if mark != nil { + h.mark = func(addr uintptr, m hexdumpMarker) { + mark(addr, m.start) + } + } + for _, d := range data { + h.write(d) + } + h.close() + n := len(getg().writebuf) + getg().writebuf = nil + if n == cap(buf) { + panic("Hexdumper buf too small") + } + return string(buf[:n]) +} + +func HexdumpWords(p, bytes uintptr) string { + buf := make([]byte, 0, 2048) + getg().writebuf = buf + hexdumpWords(p, bytes, nil) + n := len(getg().writebuf) + getg().writebuf = nil + if n == cap(buf) { + panic("HexdumpWords buf too small") + } + return string(buf[:n]) +} diff --git a/src/runtime/hexdump.go b/src/runtime/hexdump.go new file mode 100644 index 0000000000..0d7dbb540b --- /dev/null +++ b/src/runtime/hexdump.go @@ -0,0 +1,269 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package runtime + +import ( + "internal/goarch" + "unsafe" +) + +// hexdumpWords prints a word-oriented hex dump of [p, p+len). +// +// If mark != nil, it will be passed to hexdumper.mark. +func hexdumpWords(p, len uintptr, mark func(uintptr, hexdumpMarker)) { + printlock() + + // Provide a default annotation + symMark := func(u uintptr, hm hexdumpMarker) { + if mark != nil { + mark(u, hm) + } + + // Can we symbolize this value? + val := *(*uintptr)(unsafe.Pointer(u)) + fn := findfunc(val) + if fn.valid() { + hm.start() + print("<", funcname(fn), "+", hex(val-fn.entry()), ">\n") + } + } + + h := hexdumper{addr: p, mark: symMark} + h.write(unsafe.Slice((*byte)(unsafe.Pointer(p)), len)) + h.close() + printunlock() +} + +// hexdumper is a Swiss-army knife hex dumper. +// +// To use, optionally set addr and wordBytes, then call write repeatedly, +// followed by close. +type hexdumper struct { + // addr is the address to print for the first byte of data. + addr uintptr + + // addrBytes is the number of bytes of addr to print. If this is 0, it + // defaults to goarch.PtrSize. + addrBytes uint8 + + // wordBytes is the number of bytes in a word. If wordBytes is 1, this + // prints a byte-oriented dump. If it's > 1, this interprets the data as a + // sequence of words of the given size. If it's 0, it's treated as + // goarch.PtrSize. + wordBytes uint8 + + // mark is an optional function that can annotate values in the hex dump. + // + // If non-nil, it is called with the address of every complete, aligned word + // in the hex dump. + // + // If it decides to print an annotation, it must first call m.start(), then + // print the annotation, followed by a new line. + mark func(addr uintptr, m hexdumpMarker) + + // Below here is state + + ready int8 // 0=need to init state; 1=need to print header; 2=ready + + // dataBuf accumulates a line at a time of data, in case it's split across + // buffers. 
+ dataBuf [16]byte + dataPos uint8 + dataSkip uint8 // Skip first n bytes of buf on first line + + // toPos maps from byte offset in data to a visual offset in the printed line. + toPos [16]byte +} + +type hexdumpMarker struct { + chars int +} + +func (h *hexdumper) write(data []byte) { + if h.ready == 0 { + h.init() + } + + // Handle leading data + if h.dataPos > 0 { + n := copy(h.dataBuf[h.dataPos:], data) + h.dataPos += uint8(n) + data = data[n:] + if h.dataPos < uint8(len(h.dataBuf)) { + return + } + h.flushLine(h.dataBuf[:]) + h.dataPos = 0 + } + + // Handle full lines in data + for len(data) >= len(h.dataBuf) { + h.flushLine(data[:len(h.dataBuf)]) + data = data[len(h.dataBuf):] + } + + // Handle trailing data + h.dataPos = uint8(copy(h.dataBuf[:], data)) +} + +func (h *hexdumper) close() { + if h.dataPos > 0 { + h.flushLine(h.dataBuf[:h.dataPos]) + } +} + +func (h *hexdumper) init() { + const bytesPerLine = len(h.dataBuf) + + if h.addrBytes == 0 { + h.addrBytes = goarch.PtrSize + } else if h.addrBytes < 0 || h.addrBytes > goarch.PtrSize { + throw("invalid addrBytes") + } + + if h.wordBytes == 0 { + h.wordBytes = goarch.PtrSize + } + wb := int(h.wordBytes) + if wb < 0 || wb >= bytesPerLine || wb&(wb-1) != 0 { + throw("invalid wordBytes") + } + + // Construct position mapping. + for i := range h.toPos { + // First, calculate the "field" within the line, applying byte swizzling. + field := 0 + if goarch.BigEndian { + field = i + } else { + field = i ^ int(wb-1) + } + // Translate this field into a visual offset. + // "00112233 44556677 8899AABB CCDDEEFF" + h.toPos[i] = byte(field*2 + field/4 + field/8) + } + + // The first line may need to skip some fields to get to alignment. + // Round down the starting address. + nAddr := h.addr &^ uintptr(bytesPerLine-1) + // Skip bytes to get to alignment. + h.dataPos = uint8(h.addr - nAddr) + h.dataSkip = uint8(h.addr - nAddr) + h.addr = nAddr + + // We're ready to print the header. 
+ h.ready = 1 +} + +func (h *hexdumper) flushLine(data []byte) { + const bytesPerLine = len(h.dataBuf) + + const maxAddrChars = 2 * goarch.PtrSize + const addrSep = ": " + dataStart := int(2*h.addrBytes) + len(addrSep) + // dataChars uses the same formula to toPos above. We calculate it with the + // "last field", then add the size of the last field. + const dataChars = (bytesPerLine-1)*2 + (bytesPerLine-1)/4 + (bytesPerLine-1)/8 + 2 + const asciiSep = " " + asciiStart := dataStart + dataChars + len(asciiSep) + const asciiChars = bytesPerLine + nlPos := asciiStart + asciiChars + + var lineBuf [maxAddrChars + len(addrSep) + dataChars + len(asciiSep) + asciiChars + 1]byte + clear := func() { + for i := range lineBuf { + lineBuf[i] = ' ' + } + } + clear() + + if h.ready == 1 { + // Print column offsets header. + for offset, pos := range h.toPos { + h.fmtHex(lineBuf[dataStart+int(pos+1):][:1], uint64(offset)) + } + // Print ASCII offsets. + for offset := range asciiChars { + h.fmtHex(lineBuf[asciiStart+offset:][:1], uint64(offset)) + } + lineBuf[nlPos] = '\n' + gwrite(lineBuf[:nlPos+1]) + clear() + h.ready = 2 + } + + // Format address. + h.fmtHex(lineBuf[:2*h.addrBytes], uint64(h.addr)) + copy(lineBuf[2*h.addrBytes:], addrSep) + // Format data in hex and ASCII. + for offset, b := range data { + if offset < int(h.dataSkip) { + continue + } + + pos := h.toPos[offset] + h.fmtHex(lineBuf[dataStart+int(pos):][:2], uint64(b)) + + copy(lineBuf[dataStart+dataChars:], asciiSep) + ascii := uint8('.') + if b >= ' ' && b <= '~' { + ascii = b + } + lineBuf[asciiStart+offset] = ascii + } + // Trim buffer. + end := asciiStart + len(data) + lineBuf[end] = '\n' + buf := lineBuf[:end+1] + + // Print. + gwrite(buf) + + // Print marks. 
+ if h.mark != nil { + clear() + for offset := 0; offset+int(h.wordBytes) <= len(data); offset += int(h.wordBytes) { + if offset < int(h.dataSkip) { + continue + } + addr := h.addr + uintptr(offset) + // Find the position of the left edge of this word + caret := dataStart + int(min(h.toPos[offset], h.toPos[offset+int(h.wordBytes)-1])) + h.mark(addr, hexdumpMarker{caret}) + } + } + + h.addr += uintptr(bytesPerLine) + h.dataPos = 0 + h.dataSkip = 0 +} + +// fmtHex formats v in base 16 into buf. It fills all of buf. If buf is too +// small to represent v, it the output will start with '*'. +func (h *hexdumper) fmtHex(buf []byte, v uint64) { + const dig = "0123456789abcdef" + i := len(buf) - 1 + for ; i >= 0; i-- { + buf[i] = dig[v%16] + v /= 16 + } + if v != 0 { + // Indicate that we couldn't fit the whole number. + buf[0] = '*' + } +} + +func (m hexdumpMarker) start() { + var spaces [64]byte + for i := range spaces { + spaces[i] = ' ' + } + for m.chars > len(spaces) { + gwrite(spaces[:]) + m.chars -= len(spaces) + } + gwrite(spaces[:m.chars]) + print("^ ") +} diff --git a/src/runtime/hexdump_test.go b/src/runtime/hexdump_test.go new file mode 100644 index 0000000000..cc44e48e4b --- /dev/null +++ b/src/runtime/hexdump_test.go @@ -0,0 +1,151 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package runtime_test + +import ( + "fmt" + "internal/abi" + "internal/goarch" + "runtime" + "slices" + "strings" + "testing" + "unsafe" +) + +func TestHexdumper(t *testing.T) { + check := func(label, got, want string) { + got = strings.TrimRight(got, "\n") + want = strings.TrimPrefix(want, "\n") + want = strings.TrimRight(want, "\n") + if got != want { + t.Errorf("%s: got\n%s\nwant\n%s", label, got, want) + } + } + + data := make([]byte, 32) + for i := range data { + data[i] = 0x10 + byte(i) + } + + check("basic", runtime.Hexdumper(0, 1, nil, data), ` + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef +00000000: 10111213 14151617 18191a1b 1c1d1e1f ................ +00000010: 20212223 24252627 28292a2b 2c2d2e2f !"#$%&'()*+,-./`) + + if !goarch.BigEndian { + // Different word sizes + check("word=4", runtime.Hexdumper(0, 4, nil, data), ` + 3 2 1 0 7 6 5 4 b a 9 8 f e d c 0123456789abcdef +00000000: 13121110 17161514 1b1a1918 1f1e1d1c ................ +00000010: 23222120 27262524 2b2a2928 2f2e2d2c !"#$%&'()*+,-./`) + check("word=8", runtime.Hexdumper(0, 8, nil, data), ` + 7 6 5 4 3 2 1 0 f e d c b a 9 8 0123456789abcdef +00000000: 17161514 13121110 1f1e1d1c 1b1a1918 ................ +00000010: 27262524 23222120 2f2e2d2c 2b2a2928 !"#$%&'()*+,-./`) + } + + // Starting offset + check("offset=1", runtime.Hexdumper(1, 1, nil, data), ` + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef +00000000: 101112 13141516 1718191a 1b1c1d1e ............... +00000010: 1f202122 23242526 2728292a 2b2c2d2e . !"#$%&'()*+,-. +00000020: 2f /`) + if !goarch.BigEndian { + // ... combined with a word size + check("offset=1 and word=4", runtime.Hexdumper(1, 4, nil, data), ` + 3 2 1 0 7 6 5 4 b a 9 8 f e d c 0123456789abcdef +00000000: 121110 16151413 1a191817 1e1d1c1b ............... +00000010: 2221201f 26252423 2a292827 2e2d2c2b . !"#$%&'()*+,-. +00000020: 2f /`) + } + + // Partial data full of annoying boundaries. 
+ partials := make([][]byte, 0) + for i := 0; i < len(data); i += 2 { + partials = append(partials, data[i:i+2]) + } + check("partials", runtime.Hexdumper(1, 1, nil, partials...), ` + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef +00000000: 101112 13141516 1718191a 1b1c1d1e ............... +00000010: 1f202122 23242526 2728292a 2b2c2d2e . !"#$%&'()*+,-. +00000020: 2f /`) + + // Marks. + check("marks", runtime.Hexdumper(0, 1, func(addr uintptr, start func()) { + if addr%7 == 0 { + start() + println("mark") + } + }, data), ` + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef +00000000: 10111213 14151617 18191a1b 1c1d1e1f ................ + ^ mark + ^ mark + ^ mark +00000010: 20212223 24252627 28292a2b 2c2d2e2f !"#$%&'()*+,-./ + ^ mark + ^ mark`) + if !goarch.BigEndian { + check("marks and word=4", runtime.Hexdumper(0, 4, func(addr uintptr, start func()) { + if addr%7 == 0 { + start() + println("mark") + } + }, data), ` + 3 2 1 0 7 6 5 4 b a 9 8 f e d c 0123456789abcdef +00000000: 13121110 17161514 1b1a1918 1f1e1d1c ................ + ^ mark +00000010: 23222120 27262524 2b2a2928 2f2e2d2c !"#$%&'()*+,-./ + ^ mark`) + } +} + +func TestHexdumpWords(t *testing.T) { + if goarch.BigEndian || goarch.PtrSize != 8 { + // We could support these, but it's kind of a pain. + t.Skip("requires 64-bit little endian") + } + + // Most of this is in hexdumper. Here we just test the symbolizer. + + pc := abi.FuncPCABIInternal(TestHexdumpWords) + pcs := slices.Repeat([]uintptr{pc}, 3) + + // Make sure pcs doesn't move around on us. + var p runtime.Pinner + defer p.Unpin() + p.Pin(&pcs[0]) + // Get a 16 byte, 16-byte-aligned chunk of pcs so the hexdump is simple. + start := uintptr(unsafe.Pointer(&pcs[0])) + start = (start + 15) &^ uintptr(15) + + // Do the hex dump. + got := runtime.HexdumpWords(start, 16) + + // Construct the expected output. 
+ pcStr := fmt.Sprintf("%016x", pc) + pcStr = pcStr[:8] + " " + pcStr[8:] // Add middle space + ascii := make([]byte, 8) + for i := range ascii { + b := byte(pc >> (8 * i)) + if b >= ' ' && b <= '~' { + ascii[i] = b + } else { + ascii[i] = '.' + } + } + want := fmt.Sprintf(` + 7 6 5 4 3 2 1 0 f e d c b a 9 8 0123456789abcdef +%016x: %s %s %s%s + ^ <runtime_test.TestHexdumpWords+0x0> + ^ <runtime_test.TestHexdumpWords+0x0> +`, start, pcStr, pcStr, ascii, ascii) + want = strings.TrimPrefix(want, "\n") + + if got != want { + t.Errorf("got\n%s\nwant\n%s", got, want) + } +} diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index fc4f21b532..d49dacaf68 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -1080,7 +1080,8 @@ func (c *mcache) nextFree(spc spanClass) (v gclinkptr, s *mspan, checkGCTrigger // // We might consider turning these on by default; many of them previously were. // They account for a few % of mallocgc's cost though, which does matter somewhat -// at scale. +// at scale. (When testing changes to malloc, consider enabling this, and also +// some function-local 'doubleCheck' consts such as in mbitmap.go currently.) const doubleCheckMalloc = false // sizeSpecializedMallocEnabled is the set of conditions where we enable the size-specialized @@ -1089,6 +1090,14 @@ const doubleCheckMalloc = false // properly on plan9, so size-specialized malloc is also disabled on plan9. const sizeSpecializedMallocEnabled = goexperiment.SizeSpecializedMalloc && GOOS != "plan9" && !asanenabled && !raceenabled && !msanenabled && !valgrindenabled +// runtimeFreegcEnabled is the set of conditions where we enable the runtime.freegc +// implementation and the corresponding allocation-related changes: the experiment must be +// enabled, and none of the memory sanitizers should be enabled. We allow the race detector, +// in contrast to sizeSpecializedMallocEnabled. 
+// TODO(thepudds): it would be nice to check Valgrind integration, though there are some hints +// there might not be any canned tests in tree for Go's integration with Valgrind. +const runtimeFreegcEnabled = goexperiment.RuntimeFreegc && !asanenabled && !msanenabled && !valgrindenabled + // Allocate an object of size bytes. // Small objects are allocated from the per-P cache's free lists. // Large objects (> 32 kB) are allocated straight from the heap. @@ -1150,7 +1159,8 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { size += asanRZ } - // Assist the GC if needed. + // Assist the GC if needed. (On the reuse path, we currently compensate for this; + // changes here might require changes there.) if gcBlackenEnabled != 0 { deductAssistCredit(size) } @@ -1413,6 +1423,16 @@ func mallocgcSmallNoscan(size uintptr, typ *_type, needzero bool) (unsafe.Pointe size = uintptr(gc.SizeClassToSize[sizeclass]) spc := makeSpanClass(sizeclass, true) span := c.alloc[spc] + + // First, check for a reusable object. + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + // We have a reusable object, use it. + x := mallocgcSmallNoscanReuse(c, span, spc, size, needzero) + mp.mallocing = 0 + releasem(mp) + return x, size + } + v := nextFreeFast(span) if v == 0 { v, span, checkGCTrigger = c.nextFree(spc) @@ -1472,6 +1492,55 @@ func mallocgcSmallNoscan(size uintptr, typ *_type, needzero bool) (unsafe.Pointe return x, size } +// mallocgcSmallNoscanReuse returns a previously freed noscan object after preparing it for reuse. +// It must only be called if hasReusableNoscan returned true. +func mallocgcSmallNoscanReuse(c *mcache, span *mspan, spc spanClass, size uintptr, needzero bool) unsafe.Pointer { + // TODO(thepudds): could nextFreeFast, nextFree and nextReusable return unsafe.Pointer? + // Maybe doesn't matter. gclinkptr might be for historical reasons. 
+ v, span := c.nextReusableNoScan(span, spc) + x := unsafe.Pointer(v) + + // Compensate for the GC assist credit deducted in mallocgc (before calling us and + // after we return) because this is not a newly allocated object. We use the full slot + // size (elemsize) here because that's what mallocgc deducts overall. Note we only + // adjust this when gcBlackenEnabled is true, which follows mallocgc behavior. + // TODO(thepudds): a follow-up CL adds a more specific test of our assist credit + // handling, including for validating internal fragmentation handling. + if gcBlackenEnabled != 0 { + addAssistCredit(size) + } + + // This is a previously used object, so only check needzero (and not span.needzero) + // for clearing. + if needzero { + memclrNoHeapPointers(x, size) + } + + // See publicationBarrier comment in mallocgcSmallNoscan. + publicationBarrier() + + // Finish and return. Note that we do not update span.freeIndexForScan, profiling info, + // nor do we check gcTrigger. + // TODO(thepudds): the current approach is viable for a GOEXPERIMENT, but + // means we do not profile reused heap objects. Ultimately, we will need a better + // approach for profiling, or at least ensure we are not introducing bias in the + // profiled allocations. + // TODO(thepudds): related, we probably want to adjust how allocs and frees are counted + // in the existing stats. Currently, reused objects are not counted as allocs nor + // frees, but instead roughly appear as if the original heap object lived on. We + // probably will also want some additional runtime/metrics, and generally think about + // user-facing observability & diagnostics, though all this likely can wait for an + // official proposal. + if writeBarrier.enabled { + // Allocate black during GC. + // All slots hold nil so no scanning is needed. + // This may be racing with GC so do it atomically if there can be + // a race marking the bit. 
+ gcmarknewobject(span, uintptr(x)) + } + return x +} + func mallocgcSmallScanNoHeader(size uintptr, typ *_type) (unsafe.Pointer, uintptr) { // Set mp.mallocing to keep from being preempted by GC. mp := acquirem() @@ -1816,8 +1885,6 @@ func postMallocgcDebug(x unsafe.Pointer, elemsize uintptr, typ *_type) { // by size bytes, and assists the GC if necessary. // // Caller must be preemptible. -// -// Returns the G for which the assist credit was accounted. func deductAssistCredit(size uintptr) { // Charge the current user G for this allocation. assistG := getg() @@ -1836,6 +1903,267 @@ func deductAssistCredit(size uintptr) { } } +// addAssistCredit is like deductAssistCredit, +// but adds credit rather than removes, +// and never calls gcAssistAlloc. +func addAssistCredit(size uintptr) { + // Credit the current user G. + assistG := getg() + if assistG.m.curg != nil { // TODO(thepudds): do we need to do this? + assistG = assistG.m.curg + } + // Credit the size against the G. + assistG.gcAssistBytes += int64(size) +} + +const ( + // doubleCheckReusable enables some additional invariant checks for the + // runtime.freegc and reusable objects. Note that some of these checks alter timing, + // and it is good to test changes with and without this enabled. + doubleCheckReusable = false + + // debugReusableLog enables some printlns for runtime.freegc and reusable objects. + debugReusableLog = false +) + +// freegc records that a heap object is reusable and available for +// immediate reuse in a subsequent mallocgc allocation, without +// needing to wait for the GC cycle to progress. +// +// The information is recorded in a free list stored in the +// current P's mcache. The caller must pass in the user size +// and whether the object has pointers, which allows a faster free +// operation. +// +// freegc must be called by the effective owner of ptr who knows +// the pointer is logically dead, with no possible aliases that might +// be used past that moment. 
In other words, ptr must be the +// last and only pointer to its referent. +// +// The intended caller is the compiler. +// +// Note: please do not send changes that attempt to add freegc calls +// to the standard library. +// +// ptr must point to a heap object or into the current g's stack, +// in which case freegc is a no-op. In particular, ptr must not point +// to memory in the data or bss sections, which is partially enforced. +// For objects with a malloc header, ptr should point mallocHeaderSize bytes +// past the base; otherwise, ptr should point to the base of the heap object. +// In other words, ptr should be the same pointer that was returned by mallocgc. +// +// In addition, the caller must know that ptr's object has no specials, such +// as might have been created by a call to SetFinalizer or AddCleanup. +// (Internally, the runtime deals appropriately with internally-created +// specials, such as specials for memory profiling). +// +// If the size of ptr's object is less than 16 bytes or greater than +// 32KiB - gc.MallocHeaderSize bytes, freegc is currently a no-op. It must only +// be called in alloc-safe places. It currently throws if noscan is false +// (support for which is implemented in a later CL in our stack). +// +// Note that freegc accepts an unsafe.Pointer and hence keeps the pointer +// alive. It therefore could be a pessimization in some cases (such +// as a long-lived function) if the caller does not call freegc before +// or roughly when the liveness analysis of the compiler +// would otherwise have determined ptr's object is reclaimable by the GC. +func freegc(ptr unsafe.Pointer, size uintptr, noscan bool) bool { + if !runtimeFreegcEnabled || !reusableSize(size) { + return false + } + if sizeSpecializedMallocEnabled && !noscan { + // TODO(thepudds): temporarily disable freegc with SizeSpecializedMalloc for pointer types + // until we finish integrating. 
+ return false + } + + if ptr == nil { + throw("freegc nil") + } + + // Set mp.mallocing to keep from being preempted by GC. + // Otherwise, the GC could flush our mcache or otherwise cause problems. + mp := acquirem() + if mp.mallocing != 0 { + throw("freegc deadlock") + } + if mp.gsignal == getg() { + throw("freegc during signal") + } + mp.mallocing = 1 + + if mp.curg.stack.lo <= uintptr(ptr) && uintptr(ptr) < mp.curg.stack.hi { + // This points into our stack, so free is a no-op. + mp.mallocing = 0 + releasem(mp) + return false + } + + if doubleCheckReusable { + // TODO(thepudds): we could enforce no free on globals in bss or data. Maybe by + // checking span via spanOf or spanOfHeap, or maybe walk from firstmoduledata + // like isGoPointerWithoutSpan, or activeModules, or something. If so, we might + // be able to delay checking until reuse (e.g., check span just before reusing, + // though currently we don't always need to lookup a span on reuse). If we think + // no usage patterns could result in globals, maybe enforcement for globals could + // be behind -d=checkptr=1 or similar. The compiler can have knowledge of where + // a variable is allocated, but stdlib does not, although there are certain + // usage patterns that cannot result in a global. + // TODO(thepudds): separately, consider a local debugReusableMcacheOnly here + // to ignore freed objects if not in mspan in mcache, maybe when freeing and reading, + // by checking something like s.base() <= uintptr(v) && uintptr(v) < s.limit. Or + // maybe a GODEBUG or compiler debug flag. + span := spanOf(uintptr(ptr)) + if span == nil { + throw("nextReusable: nil span for pointer in free list") + } + if state := span.state.get(); state != mSpanInUse { + throw("nextReusable: span is not in use") + } + } + + if debug.clobberfree != 0 { + clobberfree(ptr, size) + } + + // We first check if p is still in our per-P cache. + // Get our per-P cache for small objects. 
+ c := getMCache(mp) + if c == nil { + throw("freegc called without a P or outside bootstrapping") + } + + v := uintptr(ptr) + if !noscan && !heapBitsInSpan(size) { + // mallocgcSmallScanHeader expects to get the base address of the object back + // from the findReusable funcs (as well as from nextFreeFast and nextFree), and + // not mallocHeaderSize bytes into a object, so adjust that here. + v -= mallocHeaderSize + + // The size class lookup wants size to be adjusted by mallocHeaderSize. + size += mallocHeaderSize + } + + // TODO(thepudds): should verify (behind doubleCheckReusable constant) that our calculated + // sizeclass here matches what's in span found via spanOf(ptr) or findObject(ptr). + var sizeclass uint8 + if size <= gc.SmallSizeMax-8 { + sizeclass = gc.SizeToSizeClass8[divRoundUp(size, gc.SmallSizeDiv)] + } else { + sizeclass = gc.SizeToSizeClass128[divRoundUp(size-gc.SmallSizeMax, gc.LargeSizeDiv)] + } + + spc := makeSpanClass(sizeclass, noscan) + s := c.alloc[spc] + + if debugReusableLog { + if s.base() <= uintptr(v) && uintptr(v) < s.limit { + println("freegc [in mcache]:", hex(uintptr(v)), "sweepgen:", mheap_.sweepgen, "writeBarrier.enabled:", writeBarrier.enabled) + } else { + println("freegc [NOT in mcache]:", hex(uintptr(v)), "sweepgen:", mheap_.sweepgen, "writeBarrier.enabled:", writeBarrier.enabled) + } + } + + if noscan { + c.addReusableNoscan(spc, uintptr(v)) + } else { + // TODO(thepudds): implemented in later CL in our stack. + throw("freegc called for object with pointers, not yet implemented") + } + + // For stats, for now we leave allocCount alone, roughly pretending to the rest + // of the system that this potential reuse never happened. + + mp.mallocing = 0 + releasem(mp) + + return true +} + +// nextReusableNoScan returns the next reusable object for a noscan span, +// or 0 if no reusable object is found. 
+func (c *mcache) nextReusableNoScan(s *mspan, spc spanClass) (gclinkptr, *mspan) { + if !runtimeFreegcEnabled { + return 0, s + } + + // Pop a reusable pointer from the free list for this span class. + v := c.reusableNoscan[spc] + if v == 0 { + return 0, s + } + c.reusableNoscan[spc] = v.ptr().next + + if debugReusableLog { + println("reusing from ptr free list:", hex(v), "sweepgen:", mheap_.sweepgen, "writeBarrier.enabled:", writeBarrier.enabled) + } + if doubleCheckReusable { + doubleCheckNextReusable(v) // debug only sanity check + } + + // For noscan spans, we only need the span if the write barrier is enabled (so that our caller + // can call gcmarknewobject to allocate black). If the write barrier is enabled, we can skip + // looking up the span when the pointer is in a span in the mcache. + if !writeBarrier.enabled { + return v, nil + } + if s.base() <= uintptr(v) && uintptr(v) < s.limit { + // Return the original span. + return v, s + } + + // We must find and return the span. + span := spanOf(uintptr(v)) + if span == nil { + // TODO(thepudds): construct a test that triggers this throw. + throw("nextReusableNoScan: nil span for pointer in reusable object free list") + } + + return v, span +} + +// doubleCheckNextReusable checks some invariants. +// TODO(thepudds): will probably delete some of this. Can mostly be ignored for review. +func doubleCheckNextReusable(v gclinkptr) { + // TODO(thepudds): should probably take the spanClass as well to confirm expected + // sizeclass match. 
+ _, span, objIndex := findObject(uintptr(v), 0, 0) + if span == nil { + throw("nextReusable: nil span for pointer in free list") + } + if state := span.state.get(); state != mSpanInUse { + throw("nextReusable: span is not in use") + } + if uintptr(v) < span.base() || uintptr(v) >= span.limit { + throw("nextReusable: span is not in range") + } + if span.objBase(uintptr(v)) != uintptr(v) { + print("nextReusable: v=", hex(v), " base=", hex(span.objBase(uintptr(v))), "\n") + throw("nextReusable: v is non-base-address for object found on pointer free list") + } + if span.isFree(objIndex) { + throw("nextReusable: pointer on free list is free") + } + + const debugReusableEnsureSwept = false + if debugReusableEnsureSwept { + // Currently disabled. + // Note: ensureSwept here alters behavior (not just an invariant check). + span.ensureSwept() + if span.isFree(objIndex) { + throw("nextReusable: pointer on free list is free after ensureSwept") + } + } +} + +// reusableSize reports if size is a currently supported size for a reusable object. +func reusableSize(size uintptr) bool { + if size < maxTinySize || size > maxSmallSize-mallocHeaderSize { + return false + } + return true +} + // memclrNoHeapPointersChunked repeatedly calls memclrNoHeapPointers // on chunks of the buffer to be zeroed, with opportunities for preemption // along the way. memclrNoHeapPointers contains no safepoints and also diff --git a/src/runtime/malloc_generated.go b/src/runtime/malloc_generated.go index 2215dbaddb..5abb61257a 100644 --- a/src/runtime/malloc_generated.go +++ b/src/runtime/malloc_generated.go @@ -1,4 +1,5 @@ // Code generated by mkmalloc.go; DO NOT EDIT. +// See overview in malloc_stubs.go. 
package runtime @@ -6400,6 +6401,32 @@ func mallocgcSmallNoScanSC2(size uintptr, typ *_type, needzero bool) unsafe.Poin const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -6497,6 +6524,32 @@ func mallocgcSmallNoScanSC3(size uintptr, typ *_type, needzero bool) unsafe.Poin const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -6594,6 +6647,32 @@ func mallocgcSmallNoScanSC4(size uintptr, typ *_type, needzero bool) unsafe.Poin const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; 
assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -6691,6 +6770,32 @@ func mallocgcSmallNoScanSC5(size uintptr, typ *_type, needzero bool) unsafe.Poin const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -6788,6 +6893,32 @@ func mallocgcSmallNoScanSC6(size uintptr, typ *_type, needzero bool) unsafe.Poin const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -6885,6 +7016,32 @@ func mallocgcSmallNoScanSC7(size uintptr, typ *_type, needzero bool) unsafe.Poin const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + 
+ v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -6982,6 +7139,32 @@ func mallocgcSmallNoScanSC8(size uintptr, typ *_type, needzero bool) unsafe.Poin const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -7079,6 +7262,32 @@ func mallocgcSmallNoScanSC9(size uintptr, typ *_type, needzero bool) unsafe.Poin const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := 
sys.TrailingZeros64(span.allocCache) @@ -7176,6 +7385,32 @@ func mallocgcSmallNoScanSC10(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -7273,6 +7508,32 @@ func mallocgcSmallNoScanSC11(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -7370,6 +7631,32 @@ func mallocgcSmallNoScanSC12(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG 
:= getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -7467,6 +7754,32 @@ func mallocgcSmallNoScanSC13(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -7564,6 +7877,32 @@ func mallocgcSmallNoScanSC14(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -7661,6 +8000,32 @@ func mallocgcSmallNoScanSC15(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && 
c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -7758,6 +8123,32 @@ func mallocgcSmallNoScanSC16(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -7855,6 +8246,32 @@ func mallocgcSmallNoScanSC17(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { 
theBit := sys.TrailingZeros64(span.allocCache) @@ -7952,6 +8369,32 @@ func mallocgcSmallNoScanSC18(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -8049,6 +8492,32 @@ func mallocgcSmallNoScanSC19(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -8146,6 +8615,32 @@ func mallocgcSmallNoScanSC20(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if 
assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -8243,6 +8738,32 @@ func mallocgcSmallNoScanSC21(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -8340,6 +8861,32 @@ func mallocgcSmallNoScanSC22(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -8437,6 +8984,32 @@ func mallocgcSmallNoScanSC23(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && 
c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -8534,6 +9107,32 @@ func mallocgcSmallNoScanSC24(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -8631,6 +9230,32 @@ func mallocgcSmallNoScanSC25(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { 
theBit := sys.TrailingZeros64(span.allocCache) @@ -8728,6 +9353,32 @@ func mallocgcSmallNoScanSC26(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) diff --git a/src/runtime/malloc_stubs.go b/src/runtime/malloc_stubs.go index 224746f3d4..e9752956b8 100644 --- a/src/runtime/malloc_stubs.go +++ b/src/runtime/malloc_stubs.go @@ -7,6 +7,8 @@ // to produce a full mallocgc function that's specialized for a span class // or specific size in the case of the tiny allocator. // +// To generate the specialized mallocgc functions, do 'go run .' inside runtime/_mkmalloc. +// // To assemble a mallocgc function, the mallocStub function is cloned, and the call to // inlinedMalloc is replaced with the inlined body of smallScanNoHeaderStub, // smallNoScanStub or tinyStub, depending on the parameters being specialized. @@ -71,7 +73,8 @@ func mallocStub(size uintptr, typ *_type, needzero bool) unsafe.Pointer { } } - // Assist the GC if needed. + // Assist the GC if needed. (On the reuse path, we currently compensate for this; + // changes here might require changes there.) 
if gcBlackenEnabled != 0 { deductAssistCredit(size) } @@ -242,6 +245,23 @@ func smallNoScanStub(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, u c := getMCache(mp) const spc = spanClass(sizeclass<<1) | spanClass(noscanint_) span := c.alloc[spc] + + // First, check for a reusable object. + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + // We have a reusable object, use it. + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + + // TODO(thepudds): note that the generated return path is essentially duplicated + // by the generator. For example, see the two postMallocgcDebug calls and + // related duplicated code on the return path currently in the generated + // mallocgcSmallNoScanSC2 function. One set of those correspond to this + // return here. We might be able to de-duplicate the generated return path + // by updating the generator, perhaps by jumping to a shared return or similar. + return v, elemsize + } + v := nextFreeFastStub(span) if v == 0 { v, span, checkGCTrigger = c.nextFree(spc) diff --git a/src/runtime/malloc_test.go b/src/runtime/malloc_test.go index bf58947bbc..97cf0eed54 100644 --- a/src/runtime/malloc_test.go +++ b/src/runtime/malloc_test.go @@ -16,6 +16,7 @@ import ( "runtime" . "runtime" "strings" + "sync" "sync/atomic" "testing" "time" @@ -234,6 +235,364 @@ func TestTinyAllocIssue37262(t *testing.T) { runtime.Releasem() } +// TestFreegc does basic testing of explicit frees. +func TestFreegc(t *testing.T) { + tests := []struct { + size string + f func(noscan bool) func(*testing.T) + noscan bool + }{ + // Types without pointers. 
+ {"size=16", testFreegc[[16]byte], true}, // smallest we support currently + {"size=17", testFreegc[[17]byte], true}, + {"size=64", testFreegc[[64]byte], true}, + {"size=500", testFreegc[[500]byte], true}, + {"size=512", testFreegc[[512]byte], true}, + {"size=4096", testFreegc[[4096]byte], true}, + {"size=20000", testFreegc[[20000]byte], true}, // not power of 2 or spc boundary + {"size=32KiB-8", testFreegc[[1<<15 - 8]byte], true}, // max noscan small object for 64-bit + } + + // Run the tests twice if not in -short mode or not otherwise saving test time. + // First while manually calling runtime.GC to slightly increase isolation (perhaps making + // problems more reproducible). + for _, tt := range tests { + runtime.GC() + t.Run(fmt.Sprintf("gc=yes/ptrs=%v/%s", !tt.noscan, tt.size), tt.f(tt.noscan)) + } + runtime.GC() + + if testing.Short() || !RuntimeFreegcEnabled || runtime.Raceenabled { + return + } + + // Again, but without manually calling runtime.GC in the loop (perhaps less isolation might + // trigger problems). + for _, tt := range tests { + t.Run(fmt.Sprintf("gc=no/ptrs=%v/%s", !tt.noscan, tt.size), tt.f(tt.noscan)) + } + runtime.GC() +} + +func testFreegc[T comparable](noscan bool) func(*testing.T) { + // We use stressMultiple to influence the duration of the tests. + // When testing freegc changes, stressMultiple can be increased locally + // to test longer or in some cases with more goroutines. + // It can also be helpful to test with GODEBUG=clobberfree=1 and + // with and without doubleCheckMalloc and doubleCheckReusable enabled. + stressMultiple := 10 + if testing.Short() || !RuntimeFreegcEnabled || runtime.Raceenabled { + stressMultiple = 1 + } + + return func(t *testing.T) { + alloc := func() *T { + // Force heap alloc, plus some light validation of zeroed memory. 
+ t.Helper() + p := Escape(new(T)) + var zero T + if *p != zero { + t.Fatalf("allocator returned non-zero memory: %v", *p) + } + return p + } + + free := func(p *T) { + t.Helper() + var zero T + if *p != zero { + t.Fatalf("found non-zero memory before freegc (tests do not modify memory): %v", *p) + } + runtime.Freegc(unsafe.Pointer(p), unsafe.Sizeof(*p), noscan) + } + + t.Run("basic-free", func(t *testing.T) { + // Test that freeing a live heap object doesn't crash. + for range 100 { + p := alloc() + free(p) + } + }) + + t.Run("stack-free", func(t *testing.T) { + // Test that freeing a stack object doesn't crash. + for range 100 { + var x [32]byte + var y [32]*int + runtime.Freegc(unsafe.Pointer(&x), unsafe.Sizeof(x), true) // noscan + runtime.Freegc(unsafe.Pointer(&y), unsafe.Sizeof(y), false) // !noscan + } + }) + + // Check our allocations. These tests rely on the + // current implementation treating a re-used object + // as not adding to the allocation counts seen + // by testing.AllocsPerRun. (This is not the desired + // long-term behavior, but it is the current behavior and + // makes these tests convenient). + + t.Run("allocs-baseline", func(t *testing.T) { + // Baseline result without any explicit free. + allocs := testing.AllocsPerRun(100, func() { + for range 100 { + p := alloc() + _ = p + } + }) + if allocs < 100 { + // TODO(thepudds): we get exactly 100 for almost all the tests, but investigate why + // ~101 allocs for TestFreegc/ptrs=true/size=32KiB-8. + t.Fatalf("expected >=100 allocations, got %v", allocs) + } + }) + + t.Run("allocs-with-free", func(t *testing.T) { + // Same allocations, but now using explicit free so that + // no allocs get reported. (Again, not the desired long-term behavior). + if SizeSpecializedMallocEnabled && !noscan { + // TODO(thepudds): skip at this point in the stack for size-specialized malloc + // with !noscan. Additional integration with sizespecializedmalloc is in a later CL. 
+ t.Skip("temporarily skipping alloc tests for GOEXPERIMENT=sizespecializedmalloc for pointer types") + } + if !RuntimeFreegcEnabled { + t.Skip("skipping alloc tests with runtime.freegc disabled") + } + allocs := testing.AllocsPerRun(100, func() { + for range 100 { + p := alloc() + free(p) + } + }) + if allocs != 0 { + t.Fatalf("expected 0 allocations, got %v", allocs) + } + }) + + t.Run("free-multiple", func(t *testing.T) { + // Multiple allocations outstanding before explicitly freeing, + // but still within the limit of our smallest free list size + // so that no allocs are reported. (Again, not long-term behavior). + if SizeSpecializedMallocEnabled && !noscan { + // TODO(thepudds): skip at this point in the stack for size-specialized malloc + // with !noscan. Additional integration with sizespecializedmalloc is in a later CL. + t.Skip("temporarily skipping alloc tests for GOEXPERIMENT=sizespecializedmalloc for pointer types") + } + if !RuntimeFreegcEnabled { + t.Skip("skipping alloc tests with runtime.freegc disabled") + } + const maxOutstanding = 20 + s := make([]*T, 0, maxOutstanding) + allocs := testing.AllocsPerRun(100*stressMultiple, func() { + s = s[:0] + for range maxOutstanding { + p := alloc() + s = append(s, p) + } + for _, p := range s { + free(p) + } + }) + if allocs != 0 { + t.Fatalf("expected 0 allocations, got %v", allocs) + } + }) + + if runtime.GOARCH == "wasm" { + // TODO(thepudds): for wasm, double-check if just slow, vs. some test logic problem, + // vs. something else. It might have been wasm was slowest with tests that spawn + // many goroutines, which might be expected for wasm. This skip might no longer be + // needed now that we have tuned test execution time more, or perhaps wasm should just + // always run in short mode, which might also let us remove this skip. 
+ t.Skip("skipping remaining freegc tests, was timing out on wasm") + } + + t.Run("free-many", func(t *testing.T) { + // Confirm we are graceful if we have more freed elements at once + // than the max free list size. + s := make([]*T, 0, 1000) + iterations := stressMultiple * stressMultiple // currently 1 (-short) or 100 + for range iterations { + s = s[:0] + for range 1000 { + p := alloc() + s = append(s, p) + } + for _, p := range s { + free(p) + } + } + }) + + t.Run("duplicate-check", func(t *testing.T) { + // A simple duplicate allocation test. We track what should be the set + // of live pointers in a map across a series of allocs and frees, + // and fail if a live pointer value is returned by an allocation. + // TODO: maybe add randomness? allow more live pointers? do across goroutines? + live := make(map[uintptr]bool) + for i := range 100 * stressMultiple { + var s []*T + // Alloc 10 times, tracking the live pointer values. + for j := range 10 { + p := alloc() + uptr := uintptr(unsafe.Pointer(p)) + if live[uptr] { + t.Fatalf("found duplicate pointer (0x%x). i: %d j: %d", uptr, i, j) + } + live[uptr] = true + s = append(s, p) + } + // Explicitly free those pointers, removing them from the live map. + for k := range s { + p := s[k] + s[k] = nil + uptr := uintptr(unsafe.Pointer(p)) + free(p) + delete(live, uptr) + } + } + }) + + t.Run("free-other-goroutine", func(t *testing.T) { + // Use explicit free, but the free happens on a different goroutine than the alloc. + // This also lightly simulates how the free code sees P migration or flushing + // the mcache, assuming we have > 1 P. (Not using testing.AllocsPerRun here). 
+ iterations := 10 * stressMultiple * stressMultiple // currently 10 (-short) or 1000 + for _, capacity := range []int{2} { + for range iterations { + ch := make(chan *T, capacity) + var wg sync.WaitGroup + for range 2 { + wg.Add(1) + go func() { + defer wg.Done() + for p := range ch { + free(p) + } + }() + } + for range 100 { + p := alloc() + ch <- p + } + close(ch) + wg.Wait() + } + } + }) + + t.Run("many-goroutines", func(t *testing.T) { + // Allocate across multiple goroutines, freeing on the same goroutine. + // TODO: probably remove the duplicate checking here; not that useful. + counts := []int{1, 2, 4, 8, 10 * stressMultiple} + for _, goroutines := range counts { + var wg sync.WaitGroup + for range goroutines { + wg.Add(1) + go func() { + defer wg.Done() + live := make(map[uintptr]bool) + for range 100 * stressMultiple { + p := alloc() + uptr := uintptr(unsafe.Pointer(p)) + if live[uptr] { + panic("TestFreeLive: found duplicate pointer") + } + live[uptr] = true + free(p) + delete(live, uptr) + } + }() + } + wg.Wait() + } + }) + + t.Run("assist-credit", func(t *testing.T) { + // Allocate and free using the same span class repeatedly while + // verifying it results in a net zero change in assist credit. + // This helps double-check our manipulation of the assist credit + // during mallocgc/freegc, including in cases when there is + // internal fragmentation when the requested mallocgc size is + // smaller than the size class. + // + // See https://go.dev/cl/717520 for some additional discussion, + // including how we can deliberately cause the test to fail currently + // if we purposefully introduce some assist credit bugs. + if SizeSpecializedMallocEnabled && !noscan { + // TODO(thepudds): skip this test at this point in the stack; later CL has + // integration with sizespecializedmalloc. 
+ t.Skip("temporarily skip assist credit tests for GOEXPERIMENT=sizespecializedmalloc for pointer types") + } + if !RuntimeFreegcEnabled { + t.Skip("skipping assist credit test with runtime.freegc disabled") + } + + // Use a background goroutine to continuously run the GC. + done := make(chan struct{}) + defer close(done) + go func() { + for { + select { + case <-done: + return + default: + runtime.GC() + } + } + }() + + // If making changes related to this test, consider testing locally with + // larger counts, like 100K or 1M. + counts := []int{1, 2, 10, 100 * stressMultiple} + // Dropping down to GOMAXPROCS=1 might help reduce noise. + defer GOMAXPROCS(GOMAXPROCS(1)) + size := int64(unsafe.Sizeof(*new(T))) + for _, count := range counts { + // Start by forcing a GC to reset this g's assist credit + // and perhaps help us get a cleaner measurement of GC cycle count. + runtime.GC() + for i := range count { + // We disable preemption to reduce other code's ability to adjust this g's + // assist credit or otherwise change things while we are measuring. + Acquirem() + + // We do two allocations per loop, with the second allocation being + // the one we measure. The first allocation tries to ensure at least one + // reusable object on the mspan's free list when we do our measured allocation. + p := alloc() + free(p) + + // Now do our primary allocation of interest, bracketed by measurements. + // We measure more than we strictly need (to log details in case of a failure). 
+ creditStart := AssistCredit() + blackenStart := GcBlackenEnable() + p = alloc() + blackenAfterAlloc := GcBlackenEnable() + creditAfterAlloc := AssistCredit() + free(p) + blackenEnd := GcBlackenEnable() + creditEnd := AssistCredit() + + Releasem() + GoschedIfBusy() + + delta := creditEnd - creditStart + if delta != 0 { + t.Logf("assist credit non-zero delta: %d", delta) + t.Logf("\t| size: %d i: %d count: %d", size, i, count) + t.Logf("\t| credit before: %d credit after: %d", creditStart, creditEnd) + t.Logf("\t| alloc delta: %d free delta: %d", + creditAfterAlloc-creditStart, creditEnd-creditAfterAlloc) + t.Logf("\t| gcBlackenEnable (start / after alloc / end): %v/%v/%v", + blackenStart, blackenAfterAlloc, blackenEnd) + t.FailNow() + } + } + } + }) + } +} + func TestPageCacheLeak(t *testing.T) { defer GOMAXPROCS(GOMAXPROCS(1)) leaked := PageCachePagesLeaked() @@ -337,6 +696,13 @@ func BenchmarkMalloc16(b *testing.B) { } } +func BenchmarkMalloc32(b *testing.B) { + for i := 0; i < b.N; i++ { + p := new([4]int64) + Escape(p) + } +} + func BenchmarkMallocTypeInfo8(b *testing.B) { for i := 0; i < b.N; i++ { p := new(struct { @@ -355,6 +721,15 @@ func BenchmarkMallocTypeInfo16(b *testing.B) { } } +func BenchmarkMallocTypeInfo32(b *testing.B) { + for i := 0; i < b.N; i++ { + p := new(struct { + p [32 / unsafe.Sizeof(uintptr(0))]*int + }) + Escape(p) + } +} + type LargeStruct struct { x [16][]byte } diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go index cade81031d..82872f1454 100644 --- a/src/runtime/mcache.go +++ b/src/runtime/mcache.go @@ -44,7 +44,17 @@ type mcache struct { // The rest is not accessed on every malloc. - alloc [numSpanClasses]*mspan // spans to allocate from, indexed by spanClass + // alloc contains spans to allocate from, indexed by spanClass. 
+	alloc [numSpanClasses]*mspan
+
+	// TODO(thepudds): better to interleave alloc and reusableScan/reusableNoscan so that
+	// a single malloc call can often access both in the same cache line for a given spanClass.
+	// It's not interleaved right now in part to have slightly smaller diff, and might be
+	// negligible effect on current microbenchmarks.
+
+	// reusableNoscan contains linked lists of reusable noscan heap objects, indexed by spanClass.
+	// The next pointers are stored in the first word of the heap objects.
+	reusableNoscan [numSpanClasses]gclinkptr
 
 	stackcache [_NumStackOrders]stackfreelist
 
@@ -96,6 +106,7 @@ func allocmcache() *mcache {
 		c.alloc[i] = &emptymspan
 	}
 	c.nextSample = nextSample()
+
 	return c
 }
 
@@ -153,6 +164,16 @@ func (c *mcache) refill(spc spanClass) {
 	if s.allocCount != s.nelems {
 		throw("refill of span with free space remaining")
 	}
+
+	// TODO(thepudds): we might be able to allow mallocgcTiny to reuse 16 byte objects from spc==5,
+	// but for now, just clear our reusable objects for tinySpanClass.
+	if spc == tinySpanClass {
+		c.reusableNoscan[spc] = 0
+	}
+	if c.reusableNoscan[spc] != 0 {
+		throw("refill of span with reusable pointers remaining on pointer free list")
+	}
+
 	if s != &emptymspan {
 		// Mark this span as no longer cached.
 		if s.sweepgen != mheap_.sweepgen+3 {
@@ -312,6 +333,13 @@ func (c *mcache) releaseAll() {
 	c.tinyAllocs = 0
 	memstats.heapStats.release()
 
+	// Clear the reusable linked lists.
+	// For noscan objects, the nodes of the linked lists are the reusable heap objects themselves,
+	// so we can simply clear the linked list head pointers.
+	// TODO(thepudds): consider having debug logging of non-empty reusable lists getting cleared,
+	// maybe based on the existing debugReusableLog.
+	clear(c.reusableNoscan[:])
+
 	// Update heapLive and heapScan.
gcController.update(dHeapLive, scanAlloc) } @@ -339,3 +367,25 @@ func (c *mcache) prepareForSweep() { stackcache_clear(c) c.flushGen.Store(mheap_.sweepgen) // Synchronizes with gcStart } + +// addReusableNoscan adds a noscan object pointer to the reusable pointer free list +// for a span class. +func (c *mcache) addReusableNoscan(spc spanClass, ptr uintptr) { + if !runtimeFreegcEnabled { + return + } + + // Add to the reusable pointers free list. + v := gclinkptr(ptr) + v.ptr().next = c.reusableNoscan[spc] + c.reusableNoscan[spc] = v +} + +// hasReusableNoscan reports whether there is a reusable object available for +// a noscan spc. +func (c *mcache) hasReusableNoscan(spc spanClass) bool { + if !runtimeFreegcEnabled { + return false + } + return c.reusableNoscan[spc] != 0 +} diff --git a/src/runtime/mcleanup.go b/src/runtime/mcleanup.go index 383217aa05..fc71af9f3f 100644 --- a/src/runtime/mcleanup.go +++ b/src/runtime/mcleanup.go @@ -72,8 +72,9 @@ import ( // pass the object to the [KeepAlive] function after the last point // where the object must remain reachable. func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup { - // Explicitly force ptr to escape to the heap. + // Explicitly force ptr and cleanup to escape to the heap. ptr = abi.Escape(ptr) + cleanup = abi.Escape(cleanup) // The pointer to the object must be valid. if ptr == nil { @@ -82,7 +83,8 @@ func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup { usptr := uintptr(unsafe.Pointer(ptr)) // Check that arg is not equal to ptr. 
- if kind := abi.TypeOf(arg).Kind(); kind == abi.Pointer || kind == abi.UnsafePointer { + argType := abi.TypeOf(arg) + if kind := argType.Kind(); kind == abi.Pointer || kind == abi.UnsafePointer { if unsafe.Pointer(ptr) == *((*unsafe.Pointer)(unsafe.Pointer(&arg))) { panic("runtime.AddCleanup: ptr is equal to arg, cleanup will never run") } @@ -98,12 +100,23 @@ func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup { return Cleanup{} } - fn := func() { - cleanup(arg) + // Create new storage for the argument. + var argv *S + if size := unsafe.Sizeof(arg); size < maxTinySize && argType.PtrBytes == 0 { + // Side-step the tiny allocator to avoid liveness issues, since this box + // will be treated like a root by the GC. We model the box as an array of + // uintptrs to guarantee maximum allocator alignment. + // + // TODO(mknyszek): Consider just making space in cleanupFn for this. The + // unfortunate part of this is it would grow specialCleanup by 16 bytes, so + // while there wouldn't be an allocation, *every* cleanup would take the + // memory overhead hit. + box := new([maxTinySize / goarch.PtrSize]uintptr) + argv = (*S)(unsafe.Pointer(box)) + } else { + argv = new(S) } - // Closure must escape. - fv := *(**funcval)(unsafe.Pointer(&fn)) - fv = abi.Escape(fv) + *argv = arg // Find the containing object. base, _, _ := findObject(usptr, 0, 0) @@ -120,7 +133,16 @@ func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup { gcCleanups.createGs() } - id := addCleanup(unsafe.Pointer(ptr), fv) + id := addCleanup(unsafe.Pointer(ptr), cleanupFn{ + // Instantiate a caller function to call the cleanup, that is cleanup(*argv). + // + // TODO(mknyszek): This allocates because the generic dictionary argument + // gets closed over, but callCleanup doesn't even use the dictionary argument, + // so theoretically that could be removed, eliminating an allocation. 
+		call: callCleanup[S],
+		fn:   *(**funcval)(unsafe.Pointer(&cleanup)),
+		arg:  unsafe.Pointer(argv),
+	})
 	if debug.checkfinalizers != 0 {
 		cleanupFn := *(**funcval)(unsafe.Pointer(&cleanup))
 		setCleanupContext(unsafe.Pointer(ptr), abi.TypeFor[T](), sys.GetCallerPC(), cleanupFn.fn, id)
@@ -131,6 +153,16 @@ func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup {
 	}
 }
 
+// callCleanup is a helper for calling cleanups in a polymorphic way.
+//
+// In practice, all it does is call fn(*arg). arg must be a *T.
+//
+//go:noinline
+func callCleanup[T any](fn *funcval, arg unsafe.Pointer) {
+	cleanup := *(*func(T))(unsafe.Pointer(&fn))
+	cleanup(*(*T)(arg))
+}
+
 // Cleanup is a handle to a cleanup call for a specific object.
 type Cleanup struct {
 	// id is the unique identifier for the cleanup within the arena.
@@ -216,7 +248,17 @@ const cleanupBlockSize = 512
 // that the cleanup queue does not grow during marking (but it can shrink).
 type cleanupBlock struct {
 	cleanupBlockHeader
-	cleanups [(cleanupBlockSize - unsafe.Sizeof(cleanupBlockHeader{})) / goarch.PtrSize]*funcval
+	cleanups [(cleanupBlockSize - unsafe.Sizeof(cleanupBlockHeader{})) / unsafe.Sizeof(cleanupFn{})]cleanupFn
+}
+
+var cleanupFnPtrMask = [...]uint8{0b111}
+
+// cleanupFn represents a cleanup function with its argument, yet to be called.
+type cleanupFn struct {
+	// call is an adapter function that understands how to safely call fn(*arg).
+	call func(*funcval, unsafe.Pointer)
+	fn   *funcval       // cleanup function passed to AddCleanup.
+	arg  unsafe.Pointer // pointer to argument to pass to cleanup function.
 }
 
 var cleanupBlockPtrMask [cleanupBlockSize / goarch.PtrSize / 8]byte
@@ -245,8 +287,8 @@ type cleanupBlockHeader struct {
 //
 // Must only be called if the GC is in the sweep phase (gcphase == _GCoff),
 // because it does not synchronize with the garbage collector.
-func (b *cleanupBlock) enqueue(fn *funcval) bool { - b.cleanups[b.n] = fn +func (b *cleanupBlock) enqueue(c cleanupFn) bool { + b.cleanups[b.n] = c b.n++ return b.full() } @@ -375,7 +417,7 @@ func (q *cleanupQueue) tryTakeWork() bool { // enqueue queues a single cleanup for execution. // // Called by the sweeper, and only the sweeper. -func (q *cleanupQueue) enqueue(fn *funcval) { +func (q *cleanupQueue) enqueue(c cleanupFn) { mp := acquirem() pp := mp.p.ptr() b := pp.cleanups @@ -396,7 +438,7 @@ func (q *cleanupQueue) enqueue(fn *funcval) { } pp.cleanups = b } - if full := b.enqueue(fn); full { + if full := b.enqueue(c); full { q.full.push(&b.lfnode) pp.cleanups = nil q.addWork(1) @@ -641,7 +683,8 @@ func runCleanups() { gcCleanups.beginRunningCleanups() for i := 0; i < int(b.n); i++ { - fn := b.cleanups[i] + c := b.cleanups[i] + b.cleanups[i] = cleanupFn{} var racectx uintptr if raceenabled { @@ -650,20 +693,15 @@ func runCleanups() { // the same goroutine. // // Synchronize on fn. This would fail to find races on the - // closed-over values in fn (suppose fn is passed to multiple - // AddCleanup calls) if fn was not unique, but it is. Update - // the synchronization on fn if you intend to optimize it - // and store the cleanup function and cleanup argument on the - // queue directly. - racerelease(unsafe.Pointer(fn)) + // closed-over values in fn (suppose arg is passed to multiple + // AddCleanup calls) if arg was not unique, but it is. + racerelease(unsafe.Pointer(c.arg)) racectx = raceEnterNewCtx() - raceacquire(unsafe.Pointer(fn)) + raceacquire(unsafe.Pointer(c.arg)) } // Execute the next cleanup. - cleanup := *(*func())(unsafe.Pointer(&fn)) - cleanup() - b.cleanups[i] = nil + c.call(c.fn, c.arg) if raceenabled { // Restore the old context. 
diff --git a/src/runtime/mcleanup_test.go b/src/runtime/mcleanup_test.go index 22b9eccd20..341d30afa7 100644 --- a/src/runtime/mcleanup_test.go +++ b/src/runtime/mcleanup_test.go @@ -336,3 +336,31 @@ func TestCleanupLost(t *testing.T) { t.Errorf("expected %d cleanups to be executed, got %d", got, want) } } + +// BenchmarkAddCleanupAndStop benchmarks adding and removing a cleanup +// from the same allocation. +// +// At face value, this benchmark is unrealistic, since no program would +// do this in practice. However, adding cleanups to new allocations in a +// loop is also unrealistic. It adds additional unused allocations, +// exercises uncommon performance pitfalls in AddCleanup (traversing the +// specials list, which should just be its own benchmark), and executing +// cleanups at a frequency that is unlikely to appear in real programs. +// +// This benchmark is still useful however, since we can get a low-noise +// measurement of the cost of AddCleanup and Stop all in one without the +// above pitfalls: we can measure the pure overhead. We can then separate +// out the cost of each in CPU profiles if we so choose (they're not so +// inexpensive as to make this infeasible). +func BenchmarkAddCleanupAndStop(b *testing.B) { + b.ReportAllocs() + + type T struct { + v int + p unsafe.Pointer + } + x := new(T) + for b.Loop() { + runtime.AddCleanup(x, func(int) {}, 14).Stop() + } +} diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go index 43afbc330b..febcd9558c 100644 --- a/src/runtime/mgc.go +++ b/src/runtime/mgc.go @@ -1727,7 +1727,13 @@ func gcBgMarkWorker(ready chan struct{}) { // the stack (see gopark). Prevent deadlock from recursively // starting GC by disabling preemption. gp.m.preemptoff = "GC worker init" - node := &new(gcBgMarkWorkerNodePadded).gcBgMarkWorkerNode // TODO: technically not allowed in the heap. See comment in tagptr.go. + // TODO: This is technically not allowed in the heap. See comment in tagptr.go. 
+ // + // It is kept alive simply by virtue of being used in the infinite loop + // below. gcBgMarkWorkerPool keeps pointers to nodes that are not + // GC-visible, so this must be kept alive indefinitely (even if + // GOMAXPROCS decreases). + node := &new(gcBgMarkWorkerNodePadded).gcBgMarkWorkerNode gp.m.preemptoff = "" node.gp.set(gp) diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go index dd76973c62..714b9a51df 100644 --- a/src/runtime/mgcmark.go +++ b/src/runtime/mgcmark.go @@ -204,7 +204,7 @@ func gcMarkRootCheck() { }) } -// ptrmask for an allocation containing a single pointer. +// oneptrmask for an allocation containing a single pointer. var oneptrmask = [...]uint8{1} // markroot scans the i'th root. @@ -251,7 +251,7 @@ func markroot(gcw *gcWork, i uint32, flushBgCredit bool) int64 { // N.B. This only needs to synchronize with cleanup execution, which only resets these blocks. // All cleanup queueing happens during sweep. n := uintptr(atomic.Load(&cb.n)) - scanblock(uintptr(unsafe.Pointer(&cb.cleanups[0])), n*goarch.PtrSize, &cleanupBlockPtrMask[0], gcw, nil) + scanblock(uintptr(unsafe.Pointer(&cb.cleanups[0])), n*unsafe.Sizeof(cleanupFn{}), &cleanupBlockPtrMask[0], gcw, nil) } case work.baseSpans <= i && i < work.baseStacks: @@ -489,7 +489,7 @@ func gcScanFinalizer(spf *specialfinalizer, s *mspan, gcw *gcWork) { // gcScanCleanup scans the relevant parts of a cleanup special as a root. func gcScanCleanup(spc *specialCleanup, gcw *gcWork) { // The special itself is a root. - scanblock(uintptr(unsafe.Pointer(&spc.fn)), goarch.PtrSize, &oneptrmask[0], gcw, nil) + scanblock(uintptr(unsafe.Pointer(&spc.cleanup)), unsafe.Sizeof(cleanupFn{}), &cleanupFnPtrMask[0], gcw, nil) } // gcAssistAlloc performs GC work to make gp's assist debt positive. 
@@ -1524,29 +1524,32 @@ func scanConservative(b, n uintptr, ptrmask *uint8, gcw *gcWork, state *stackSca if debugScanConservative { printlock() print("conservatively scanning [", hex(b), ",", hex(b+n), ")\n") - hexdumpWords(b, b+n, func(p uintptr) byte { + hexdumpWords(b, n, func(p uintptr, m hexdumpMarker) { if ptrmask != nil { word := (p - b) / goarch.PtrSize bits := *addb(ptrmask, word/8) if (bits>>(word%8))&1 == 0 { - return '$' + return } } val := *(*uintptr)(unsafe.Pointer(p)) if state != nil && state.stack.lo <= val && val < state.stack.hi { - return '@' + m.start() + println("ptr to stack") + return } span := spanOfHeap(val) if span == nil { - return ' ' + return } idx := span.objIndex(val) if span.isFreeOrNewlyAllocated(idx) { - return ' ' + return } - return '*' + m.start() + println("ptr to heap") }) printunlock() } diff --git a/src/runtime/mgcmark_greenteagc.go b/src/runtime/mgcmark_greenteagc.go index 3594b33cfd..fa560f9966 100644 --- a/src/runtime/mgcmark_greenteagc.go +++ b/src/runtime/mgcmark_greenteagc.go @@ -978,7 +978,9 @@ func spanSetScans(spanBase uintptr, nelems uint16, imb *spanInlineMarkBits, toSc } func scanObjectSmall(spanBase, b, objSize uintptr, gcw *gcWork) { - ptrBits := heapBitsSmallForAddrInline(spanBase, b, objSize) + hbitsBase, _ := spanHeapBitsRange(spanBase, gc.PageSize, objSize) + hbits := (*byte)(unsafe.Pointer(hbitsBase)) + ptrBits := extractHeapBitsSmall(hbits, spanBase, b, objSize) gcw.heapScanWork += int64(sys.Len64(uint64(ptrBits)) * goarch.PtrSize) nptrs := 0 n := sys.OnesCount64(uint64(ptrBits)) @@ -1017,12 +1019,14 @@ func scanObjectsSmall(base, objSize uintptr, elems uint16, gcw *gcWork, scans *g break } n := sys.OnesCount64(uint64(bits)) + hbitsBase, _ := spanHeapBitsRange(base, gc.PageSize, objSize) + hbits := (*byte)(unsafe.Pointer(hbitsBase)) for range n { j := sys.TrailingZeros64(uint64(bits)) bits &^= 1 << j b := base + uintptr(i*(goarch.PtrSize*8)+j)*objSize - ptrBits := heapBitsSmallForAddrInline(base, b, 
objSize)
+			ptrBits := extractHeapBitsSmall(hbits, base, b, objSize)
 			gcw.heapScanWork += int64(sys.Len64(uint64(ptrBits)) * goarch.PtrSize)
 
 			n := sys.OnesCount64(uint64(ptrBits))
@@ -1056,10 +1060,7 @@ func scanObjectsSmall(base, objSize uintptr, elems uint16, gcw *gcWork, scans *g
 	}
 }
 
-func heapBitsSmallForAddrInline(spanBase, addr, elemsize uintptr) uintptr {
-	hbitsBase, _ := spanHeapBitsRange(spanBase, gc.PageSize, elemsize)
-	hbits := (*byte)(unsafe.Pointer(hbitsBase))
-
+func extractHeapBitsSmall(hbits *byte, spanBase, addr, elemsize uintptr) uintptr {
 	// These objects are always small enough that their bitmaps
 	// fit in a single word, so just load the word or two we need.
 	//
diff --git a/src/runtime/mgcpacer.go b/src/runtime/mgcpacer.go
index 32c1b941e5..388cce83cd 100644
--- a/src/runtime/mgcpacer.go
+++ b/src/runtime/mgcpacer.go
@@ -10,7 +10,7 @@ import (
 	"internal/runtime/atomic"
 	"internal/runtime/math"
 	"internal/strconv"
-	_ "unsafe" // for go:linkname
+	_ "unsafe"
 )
 
 const (
@@ -749,30 +749,33 @@ func (c *gcControllerState) enlistWorker() {
 	}
 }
 
-// findRunnableGCWorker returns a background mark worker for pp if it
-// should be run. This must only be called when gcBlackenEnabled != 0.
-func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) {
+// assignWaitingGCWorker assigns a background mark worker to pp if one should
+// be run.
+//
+// If a worker is selected, it is assigned to pp.nextGCMarkWorker and the P is
+// wired as a GC mark worker. The G is still in _Gwaiting. If no worker is
+// selected, ok returns false.
+//
+// If assignWaitingGCWorker returns true, this P must either:
+// - Mark the G as runnable and run it, clearing pp.nextGCMarkWorker.
+// - Or, call c.releaseNextGCMarkWorker.
+//
+// This must only be called when gcBlackenEnabled != 0.
+func (c *gcControllerState) assignWaitingGCWorker(pp *p, now int64) (bool, int64) { if gcBlackenEnabled == 0 { throw("gcControllerState.findRunnable: blackening not enabled") } - // Since we have the current time, check if the GC CPU limiter - // hasn't had an update in a while. This check is necessary in - // case the limiter is on but hasn't been checked in a while and - // so may have left sufficient headroom to turn off again. if now == 0 { now = nanotime() } - if gcCPULimiter.needUpdate(now) { - gcCPULimiter.update(now) - } if !gcShouldScheduleWorker(pp) { // No good reason to schedule a worker. This can happen at // the end of the mark phase when there are still // assists tapering off. Don't bother running a worker // now because it'll just return immediately. - return nil, now + return false, now } if c.dedicatedMarkWorkersNeeded.Load() <= 0 && c.fractionalUtilizationGoal == 0 { @@ -783,7 +786,7 @@ func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) { // When a dedicated worker stops running, the gcBgMarkWorker loop notes // the need for the worker before returning it to the pool. If we don't // see the need now, we wouldn't have found it in the pool anyway. - return nil, now + return false, now } // Grab a worker before we commit to running below. @@ -800,7 +803,7 @@ func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) { // it will always do so with queued global work. Thus, that P // will be immediately eligible to re-run the worker G it was // just using, ensuring work can complete. - return nil, now + return false, now } decIfPositive := func(val *atomic.Int64) bool { @@ -823,7 +826,7 @@ func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) { } else if c.fractionalUtilizationGoal == 0 { // No need for fractional workers. gcBgMarkWorkerPool.push(&node.node) - return nil, now + return false, now } else { // Is this P behind on the fractional utilization // goal? 
@@ -833,12 +836,51 @@ func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) { if delta > 0 && float64(pp.gcFractionalMarkTime.Load())/float64(delta) > c.fractionalUtilizationGoal { // Nope. No need to run a fractional worker. gcBgMarkWorkerPool.push(&node.node) - return nil, now + return false, now } // Run a fractional worker. pp.gcMarkWorkerMode = gcMarkWorkerFractionalMode } + pp.nextGCMarkWorker = node + return true, now +} + +// findRunnableGCWorker returns a background mark worker for pp if it +// should be run. +// +// If findRunnableGCWorker returns a G, this P is wired as a GC mark worker and +// must run the G. +// +// This must only be called when gcBlackenEnabled != 0. +// +// This function is allowed to have write barriers because it is called from +// the portion of findRunnable that always has a P. +// +//go:yeswritebarrierrec +func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) { + // Since we have the current time, check if the GC CPU limiter + // hasn't had an update in a while. This check is necessary in + // case the limiter is on but hasn't been checked in a while and + // so may have left sufficient headroom to turn off again. + if now == 0 { + now = nanotime() + } + if gcCPULimiter.needUpdate(now) { + gcCPULimiter.update(now) + } + + // If a worker wasn't already assigned by procresize, assign one now. + if pp.nextGCMarkWorker == nil { + ok, now := c.assignWaitingGCWorker(pp, now) + if !ok { + return nil, now + } + } + + node := pp.nextGCMarkWorker + pp.nextGCMarkWorker = nil + // Run the background mark worker. gp := node.gp.ptr() trace := traceAcquire() @@ -850,6 +892,23 @@ func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) { return gp, now } +// Release an unused pp.nextGCMarkWorker, if any. +// +// This function is allowed to have write barriers because it is called from +// the portion of schedule. 
+// +//go:yeswritebarrierrec +func (c *gcControllerState) releaseNextGCMarkWorker(pp *p) { + node := pp.nextGCMarkWorker + if node == nil { + return + } + + c.markWorkerStop(pp.gcMarkWorkerMode, 0) + gcBgMarkWorkerPool.push(&node.node) + pp.nextGCMarkWorker = nil +} + // resetLive sets up the controller state for the next mark phase after the end // of the previous one. Must be called after endCycle and before commit, before // the world is started. diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go index c3d6afb90a..4eecb1cfd9 100644 --- a/src/runtime/mgcsweep.go +++ b/src/runtime/mgcsweep.go @@ -885,7 +885,7 @@ func (s *mspan) reportZombies() { if length > 1024 { length = 1024 } - hexdumpWords(addr, addr+length, nil) + hexdumpWords(addr, length, nil) } mbits.advance() abits.advance() diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 711c7790eb..d2ff063b00 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -435,7 +435,7 @@ type mspan struct { // indicating a free object. freeindex is then adjusted so that subsequent scans begin // just past the newly discovered free object. // - // If freeindex == nelems, this span has no free objects. + // If freeindex == nelems, this span has no free objects, though might have reusable objects. // // allocBits is a bitmap of objects in this span. // If n >= freeindex and allocBits[n/8] & (1<<(n%8)) is 0 @@ -2161,7 +2161,7 @@ func removefinalizer(p unsafe.Pointer) { type specialCleanup struct { _ sys.NotInHeap special special - fn *funcval + cleanup cleanupFn // Globally unique ID for the cleanup, obtained from mheap_.cleanupID. id uint64 } @@ -2170,14 +2170,18 @@ type specialCleanup struct { // cleanups are allowed on an object, and even the same pointer. // A cleanup id is returned which can be used to uniquely identify // the cleanup. 
-func addCleanup(p unsafe.Pointer, f *funcval) uint64 { +func addCleanup(p unsafe.Pointer, c cleanupFn) uint64 { + // TODO(mknyszek): Consider pooling specialCleanups on the P + // so we don't have to take the lock every time. Just locking + // is a considerable part of the cost of AddCleanup. This + // would also require reserving some cleanup IDs on the P. lock(&mheap_.speciallock) s := (*specialCleanup)(mheap_.specialCleanupAlloc.alloc()) mheap_.cleanupID++ // Increment first. ID 0 is reserved. id := mheap_.cleanupID unlock(&mheap_.speciallock) s.special.kind = _KindSpecialCleanup - s.fn = f + s.cleanup = c s.id = id mp := acquirem() @@ -2187,17 +2191,16 @@ func addCleanup(p unsafe.Pointer, f *funcval) uint64 { // situation where it's possible that markrootSpans // has already run but mark termination hasn't yet. if gcphase != _GCoff { - gcw := &mp.p.ptr().gcw // Mark the cleanup itself, since the // special isn't part of the GC'd heap. - scanblock(uintptr(unsafe.Pointer(&s.fn)), goarch.PtrSize, &oneptrmask[0], gcw, nil) + gcScanCleanup(s, &mp.p.ptr().gcw) } releasem(mp) - // Keep f alive. There's a window in this function where it's - // only reachable via the special while the special hasn't been - // added to the specials list yet. This is similar to a bug + // Keep c and its referents alive. There's a window in this function + // where it's only reachable via the special while the special hasn't + // been added to the specials list yet. This is similar to a bug // discovered for weak handles, see #70455. - KeepAlive(f) + KeepAlive(c) return id } @@ -2534,7 +2537,15 @@ func getOrAddWeakHandle(p unsafe.Pointer) *atomic.Uintptr { s := (*specialWeakHandle)(mheap_.specialWeakHandleAlloc.alloc()) unlock(&mheap_.speciallock) - handle := new(atomic.Uintptr) + // N.B. Pad the weak handle to ensure it doesn't share a tiny + // block with any other allocations. This can lead to leaks, such + // as in go.dev/issue/76007. 
As an alternative, we could consider + // using the currently-unused 8-byte noscan size class. + type weakHandleBox struct { + h atomic.Uintptr + _ [maxTinySize - unsafe.Sizeof(atomic.Uintptr{})]byte + } + handle := &(new(weakHandleBox).h) s.special.kind = _KindSpecialWeakHandle s.handle = handle handle.Store(uintptr(p)) @@ -2792,7 +2803,7 @@ func freeSpecial(s *special, p unsafe.Pointer, size uintptr) { // Cleanups, unlike finalizers, do not resurrect the objects // they're attached to, so we only need to pass the cleanup // function, not the object. - gcCleanups.enqueue(sc.fn) + gcCleanups.enqueue(sc.cleanup) lock(&mheap_.speciallock) mheap_.specialCleanupAlloc.free(unsafe.Pointer(sc)) unlock(&mheap_.speciallock) diff --git a/src/runtime/panic.go b/src/runtime/panic.go index e1105afd0f..ff2dec386f 100644 --- a/src/runtime/panic.go +++ b/src/runtime/panic.go @@ -746,7 +746,7 @@ func printpanics(p *_panic) { } print("panic: ") printpanicval(p.arg) - if p.repanicked { + if p.recovered && p.repanicked { print(" [recovered, repanicked]") } else if p.recovered { print(" [recovered]") diff --git a/src/runtime/print.go b/src/runtime/print.go index c01db9d7f9..d2733fb266 100644 --- a/src/runtime/print.go +++ b/src/runtime/print.go @@ -5,7 +5,6 @@ package runtime import ( - "internal/goarch" "internal/strconv" "unsafe" ) @@ -212,43 +211,3 @@ func printeface(e eface) { func printiface(i iface) { print("(", i.tab, ",", i.data, ")") } - -// hexdumpWords prints a word-oriented hex dump of [p, end). -// -// If mark != nil, it will be called with each printed word's address -// and should return a character mark to appear just before that -// word's value. It can return 0 to indicate no mark. 
-func hexdumpWords(p, end uintptr, mark func(uintptr) byte) { - printlock() - var markbuf [1]byte - markbuf[0] = ' ' - minhexdigits = int(unsafe.Sizeof(uintptr(0)) * 2) - for i := uintptr(0); p+i < end; i += goarch.PtrSize { - if i%16 == 0 { - if i != 0 { - println() - } - print(hex(p+i), ": ") - } - - if mark != nil { - markbuf[0] = mark(p + i) - if markbuf[0] == 0 { - markbuf[0] = ' ' - } - } - gwrite(markbuf[:]) - val := *(*uintptr)(unsafe.Pointer(p + i)) - print(hex(val)) - print(" ") - - // Can we symbolize val? - fn := findfunc(val) - if fn.valid() { - print("<", funcname(fn), "+", hex(val-fn.entry()), "> ") - } - } - minhexdigits = 0 - println() - printunlock() -} diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 21b276cabf..58fb4bd681 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -3120,7 +3120,7 @@ func startm(pp *p, spinning, lockheld bool) { //go:nowritebarrierrec func handoffp(pp *p) { // handoffp must start an M in any situation where - // findrunnable would return a G to run on pp. + // findRunnable would return a G to run on pp. // if it has local work, start it straight away if !runqempty(pp) || !sched.runq.empty() { @@ -3363,7 +3363,7 @@ func findRunnable() (gp *g, inheritTime, tryWakeP bool) { mp := getg().m // The conditions here and in handoffp must agree: if - // findrunnable would return a G to run, handoffp must start + // findRunnable would return a G to run, handoffp must start // an M. 
top: @@ -3587,7 +3587,7 @@ top: goto top } if releasep() != pp { - throw("findrunnable: wrong p") + throw("findRunnable: wrong p") } now = pidleput(pp, now) unlock(&sched.lock) @@ -3632,7 +3632,7 @@ top: if mp.spinning { mp.spinning = false if sched.nmspinning.Add(-1) < 0 { - throw("findrunnable: negative nmspinning") + throw("findRunnable: negative nmspinning") } // Note the for correctness, only the last M transitioning from @@ -3705,10 +3705,10 @@ top: if netpollinited() && (netpollAnyWaiters() || pollUntil != 0) && sched.lastpoll.Swap(0) != 0 { sched.pollUntil.Store(pollUntil) if mp.p != 0 { - throw("findrunnable: netpoll with p") + throw("findRunnable: netpoll with p") } if mp.spinning { - throw("findrunnable: netpoll with spinning") + throw("findRunnable: netpoll with spinning") } delay := int64(-1) if pollUntil != 0 { @@ -3974,7 +3974,7 @@ func checkIdleGCNoP() (*p, *g) { // timers and the network poller if there isn't one already. func wakeNetPoller(when int64) { if sched.lastpoll.Load() == 0 { - // In findrunnable we ensure that when polling the pollUntil + // In findRunnable we ensure that when polling the pollUntil // field is either zero or the time to which the current // poll is expected to run. This can have a spurious wakeup // but should never miss a wakeup. @@ -3999,7 +3999,7 @@ func resetspinning() { gp.m.spinning = false nmspinning := sched.nmspinning.Add(-1) if nmspinning < 0 { - throw("findrunnable: negative nmspinning") + throw("findRunnable: negative nmspinning") } // M wakeup policy is deliberately somewhat conservative, so check if we // need to wakeup another P here. See "Worker thread parking/unparking" @@ -4136,11 +4136,23 @@ top: gp, inheritTime, tryWakeP := findRunnable() // blocks until work is available + // May be on a new P. + pp = mp.p.ptr() + // findRunnable may have collected an allp snapshot. The snapshot is // only required within findRunnable. Clear it to all GC to collect the // slice. 
mp.clearAllpSnapshot() + // If the P was assigned a next GC mark worker but findRunnable + // selected anything else, release the worker so another P may run it. + // + // N.B. If this occurs because a higher-priority goroutine was selected + // (trace reader), then tryWakeP is set, which will wake another P to + // run the worker. If this occurs because the GC is no longer active, + // there is no need to wakep. + gcController.releaseNextGCMarkWorker(pp) + if debug.dontfreezetheworld > 0 && freezing.Load() { // See comment in freezetheworld. We don't want to perturb // scheduler state, so we didn't gcstopm in findRunnable, but @@ -4659,6 +4671,11 @@ func reentersyscall(pc, sp, bp uintptr) { gp.m.locks-- } +// debugExtendGrunningNoP is a debug mode that extends the windows in which +// we're _Grunning without a P in order to try to shake out bugs with code +// assuming this state is impossible. +const debugExtendGrunningNoP = false + // Standard syscall entry used by the go syscall library and normal cgo calls. // // This is exported via linkname to assembly in the syscall package and x/sys. @@ -4771,6 +4788,9 @@ func entersyscallblock() { // <-- // Caution: we're in a small window where we are in _Grunning without a P. // --> + if debugExtendGrunningNoP { + usleep(10) + } casgstatus(gp, _Grunning, _Gsyscall) if gp.syscallsp < gp.stack.lo || gp.stack.hi < gp.syscallsp { systemstack(func() { @@ -4853,6 +4873,9 @@ func exitsyscall() { // Caution: we're in a window where we may be in _Grunning without a P. // Either we will grab a P or call exitsyscall0, where we'll switch to // _Grunnable. + if debugExtendGrunningNoP { + usleep(10) + } // Grab and clear our old P. oldp := gp.m.oldp.ptr() @@ -6026,8 +6049,10 @@ func procresize(nprocs int32) *p { unlock(&allpLock) } + // Assign Ms to Ps with runnable goroutines. 
var runnablePs *p var runnablePsNeedM *p + var idlePs *p for i := nprocs - 1; i >= 0; i-- { pp := allp[i] if gp.m.p.ptr() == pp { @@ -6035,7 +6060,8 @@ func procresize(nprocs int32) *p { } pp.status = _Pidle if runqempty(pp) { - pidleput(pp, now) + pp.link.set(idlePs) + idlePs = pp continue } @@ -6061,6 +6087,8 @@ func procresize(nprocs int32) *p { pp.link.set(runnablePs) runnablePs = pp } + // Assign Ms to remaining runnable Ps without usable oldm. See comment + // above. for runnablePsNeedM != nil { pp := runnablePsNeedM runnablePsNeedM = pp.link.ptr() @@ -6071,6 +6099,62 @@ func procresize(nprocs int32) *p { runnablePs = pp } + // Now that we've assigned Ms to Ps with runnable goroutines, assign GC + // mark workers to remaining idle Ps, if needed. + // + // By assigning GC workers to Ps here, we slightly speed up starting + // the world, as we will start enough Ps to run all of the user + // goroutines and GC mark workers all at once, rather than using a + // sequence of wakep calls as each P's findRunnable realizes it needs + // to run a mark worker instead of a user goroutine. + // + // By assigning GC workers to Ps only _after_ previously-running Ps are + // assigned Ms, we ensure that goroutines previously running on a P + // continue to run on the same P, with GC mark workers preferring + // previously-idle Ps. This helps prevent goroutines from shuffling + // around too much across STW. + // + // N.B., if there aren't enough Ps left in idlePs for all of the GC + // mark workers, then findRunnable will still choose to run mark + // workers on Ps assigned above. + // + // N.B., we do this during any STW in the mark phase, not just the + // sweep termination STW that starts the mark phase. gcBgMarkWorker + // always preempts by removing itself from the P, so even unrelated + // STWs during the mark require that Ps reselect mark workers upon + // restart. 
+ if gcBlackenEnabled != 0 { + for idlePs != nil { + pp := idlePs + + ok, _ := gcController.assignWaitingGCWorker(pp, now) + if !ok { + // No more mark workers needed. + break + } + + // Got a worker, P is now runnable. + // + // mget may return nil if there aren't enough Ms, in + // which case startTheWorldWithSema will start one. + // + // N.B. findRunnableGCWorker will make the worker G + // itself runnable. + idlePs = pp.link.ptr() + mp := mget() + pp.m.set(mp) + pp.link.set(runnablePs) + runnablePs = pp + } + } + + // Finally, any remaining Ps are truly idle. + for idlePs != nil { + pp := idlePs + idlePs = pp.link.ptr() + pidleput(pp, now) + } + stealOrder.reset(uint32(nprocs)) var int32p *int32 = &gomaxprocs // make compiler check that gomaxprocs is an int32 atomic.Store((*uint32)(unsafe.Pointer(int32p)), uint32(nprocs)) @@ -6173,6 +6257,10 @@ func releasepNoTrace() *p { print("releasep: m=", gp.m, " m->p=", gp.m.p.ptr(), " p->m=", hex(pp.m), " p->status=", pp.status, "\n") throw("releasep: invalid p state") } + + // P must clear if nextGCMarkWorker if it stops. + gcController.releaseNextGCMarkWorker(pp) + gp.m.p = 0 pp.m = 0 pp.status = _Pidle @@ -7259,7 +7347,7 @@ func pidlegetSpinning(now int64) (*p, int64) { pp, now := pidleget(now) if pp == nil { - // See "Delicate dance" comment in findrunnable. We found work + // See "Delicate dance" comment in findRunnable. We found work // that we cannot take, we must synchronize with non-spinning // Ms that may be preparing to drop their P. sched.needspinning.Store(1) @@ -7497,23 +7585,36 @@ func runqgrab(pp *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool) // Try to steal from pp.runnext. if next := pp.runnext; next != 0 { if pp.status == _Prunning { - // Sleep to ensure that pp isn't about to run the g - // we are about to steal. - // The important use case here is when the g running - // on pp ready()s another g and then almost - // immediately blocks. 
Instead of stealing runnext - // in this window, back off to give pp a chance to - // schedule runnext. This will avoid thrashing gs - // between different Ps. - // A sync chan send/recv takes ~50ns as of time of - // writing, so 3us gives ~50x overshoot. - if !osHasLowResTimer { - usleep(3) - } else { - // On some platforms system timer granularity is - // 1-15ms, which is way too much for this - // optimization. So just yield. - osyield() + if mp := pp.m.ptr(); mp != nil { + if gp := mp.curg; gp == nil || readgstatus(gp)&^_Gscan != _Gsyscall { + // Sleep to ensure that pp isn't about to run the g + // we are about to steal. + // The important use case here is when the g running + // on pp ready()s another g and then almost + // immediately blocks. Instead of stealing runnext + // in this window, back off to give pp a chance to + // schedule runnext. This will avoid thrashing gs + // between different Ps. + // A sync chan send/recv takes ~50ns as of time of + // writing, so 3us gives ~50x overshoot. + // If curg is nil, we assume that the P is likely + // to be in the scheduler. If curg isn't nil and isn't + // in a syscall, then it's either running, waiting, or + // runnable. In this case we want to sleep because the + // P might either call into the scheduler soon (running), + // or already is (since we found a waiting or runnable + // goroutine hanging off of a running P, suggesting it + // either recently transitioned out of running, or will + // transition to running shortly). + if !osHasLowResTimer { + usleep(3) + } else { + // On some platforms system timer granularity is + // 1-15ms, which is way too much for this + // optimization. So just yield. 
+ osyield() + } + } } } if !pp.runnext.cas(next, 0) { diff --git a/src/runtime/proc_test.go b/src/runtime/proc_test.go index b3084f4895..35a1aeab1f 100644 --- a/src/runtime/proc_test.go +++ b/src/runtime/proc_test.go @@ -1221,7 +1221,7 @@ func TestTraceSTW(t *testing.T) { var errors int for i := range runs { - err := runTestTracesSTW(t, i) + err := runTestTracesSTW(t, i, "TraceSTW", "stop-the-world (read mem stats)") if err != nil { t.Logf("Run %d failed: %v", i, err) errors++ @@ -1235,7 +1235,43 @@ func TestTraceSTW(t *testing.T) { } } -func runTestTracesSTW(t *testing.T, run int) (err error) { +// TestTraceGCSTW verifies that goroutines continue running on the same M and P +// after a GC STW. +func TestTraceGCSTW(t *testing.T) { + // Very similar to TestTraceSTW, but using a STW that starts the GC. + // When the GC starts, the background GC mark workers start running, + // which provide an additional source of disturbance to the scheduler. + // + // procresize assigns GC workers to previously-idle Ps to avoid + // changing what the previously-running Ps are doing. + + if testing.Short() { + t.Skip("skipping in -short mode") + } + + if runtime.NumCPU() < 8 { + t.Skip("This test sets GOMAXPROCS=8 and wants to avoid thread descheduling as much as possible. 
Skip on machines with less than 8 CPUs") + } + + const runs = 50 + + var errors int + for i := range runs { + err := runTestTracesSTW(t, i, "TraceGCSTW", "stop-the-world (GC sweep termination)") + if err != nil { + t.Logf("Run %d failed: %v", i, err) + errors++ + } + } + + pct := float64(errors)/float64(runs) + t.Logf("Errors: %d/%d = %f%%", errors, runs, 100*pct) + if pct > 0.25 { + t.Errorf("Error rate too high") + } +} + +func runTestTracesSTW(t *testing.T, run int, name, stwType string) (err error) { t.Logf("Run %d", run) // By default, TSAN sleeps for 1s at exit to allow background @@ -1243,7 +1279,7 @@ func runTestTracesSTW(t *testing.T, run int) (err error) { // much, since we are running 50 iterations, so disable the sleep. // // Outside of race mode, GORACE does nothing. - buf := []byte(runTestProg(t, "testprog", "TraceSTW", "GORACE=atexit_sleep_ms=0")) + buf := []byte(runTestProg(t, "testprog", name, "GORACE=atexit_sleep_ms=0")) // We locally "fail" the run (return an error) if the trace exhibits // unwanted scheduling. i.e., the target goroutines did not remain on @@ -1253,7 +1289,7 @@ func runTestTracesSTW(t *testing.T, run int) (err error) { // occur, such as a trace parse error. defer func() { if err != nil || t.Failed() { - testtrace.Dump(t, fmt.Sprintf("TestTraceSTW-run%d", run), []byte(buf), false) + testtrace.Dump(t, fmt.Sprintf("Test%s-run%d", name, run), []byte(buf), false) } }() @@ -1509,12 +1545,10 @@ findEnd: break findEnd case trace.EventRangeBegin: r := ev.Range() - if r.Name == "stop-the-world (read mem stats)" { + if r.Name == stwType { // Note when we see the STW begin. This is not // load bearing; it's purpose is simply to fail - // the test if we manage to remove the STW from - // ReadMemStat, so we remember to change this - // test to add some new source of STW. + // the test if we accidentally remove the STW. 
stwSeen = true } } diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 6c955460d4..56082bf7f5 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -854,6 +854,18 @@ type p struct { // mark worker started. gcMarkWorkerStartTime int64 + // nextGCMarkWorker is the next mark worker to run. This may be set + // during start-the-world to assign a worker to this P. The P runs this + // worker on the next call to gcController.findRunnableGCWorker. If the + // P runs something else or stops, it must release this worker via + // gcController.releaseNextGCMarkWorker. + // + // See comment in gcBgMarkWorker about the lifetime of + // gcBgMarkWorkerNode. + // + // Only accessed by this P or during STW. + nextGCMarkWorker *gcBgMarkWorkerNode + // gcw is this P's GC work buffer cache. The work buffer is // filled by write barriers, drained by mutator assists, and // disposed on certain GC state transitions. @@ -1425,9 +1437,9 @@ var ( // must be set. An idle P (passed to pidleput) cannot add new timers while // idle, so if it has no timers at that time, its mask may be cleared. // - // Thus, we get the following effects on timer-stealing in findrunnable: + // Thus, we get the following effects on timer-stealing in findRunnable: // - // - Idle Ps with no timers when they go idle are never checked in findrunnable + // - Idle Ps with no timers when they go idle are never checked in findRunnable // (for work- or timer-stealing; this is the ideal case). // - Running Ps must always be checked. // - Idle Ps whose timers are stolen must continue to be checked until they run diff --git a/src/runtime/slice.go b/src/runtime/slice.go index e31d5dccb2..a9e8fc1610 100644 --- a/src/runtime/slice.go +++ b/src/runtime/slice.go @@ -399,3 +399,107 @@ func bytealg_MakeNoZero(len int) []byte { cap := roundupsize(uintptr(len), true) return unsafe.Slice((*byte)(mallocgc(cap, nil, false)), cap)[:len] } + +// moveSlice copies the input slice to the heap and returns it. 
+// et is the element type of the slice. +func moveSlice(et *_type, old unsafe.Pointer, len, cap int) (unsafe.Pointer, int, int) { + if cap == 0 { + if old != nil { + old = unsafe.Pointer(&zerobase) + } + return old, 0, 0 + } + capmem := uintptr(cap) * et.Size_ + new := mallocgc(capmem, et, true) + bulkBarrierPreWriteSrcOnly(uintptr(new), uintptr(old), capmem, et) + memmove(new, old, capmem) + return new, len, cap +} + +// moveSliceNoScan is like moveSlice except the element type is known to +// not have any pointers. We instead pass in the size of the element. +func moveSliceNoScan(elemSize uintptr, old unsafe.Pointer, len, cap int) (unsafe.Pointer, int, int) { + if cap == 0 { + if old != nil { + old = unsafe.Pointer(&zerobase) + } + return old, 0, 0 + } + capmem := uintptr(cap) * elemSize + new := mallocgc(capmem, nil, false) + memmove(new, old, capmem) + return new, len, cap +} + +// moveSliceNoCap is like moveSlice, but can pick any appropriate capacity +// for the returned slice. +// Elements between len and cap in the returned slice will be zeroed. +func moveSliceNoCap(et *_type, old unsafe.Pointer, len int) (unsafe.Pointer, int, int) { + if len == 0 { + if old != nil { + old = unsafe.Pointer(&zerobase) + } + return old, 0, 0 + } + lenmem := uintptr(len) * et.Size_ + capmem := roundupsize(lenmem, false) + new := mallocgc(capmem, et, true) + bulkBarrierPreWriteSrcOnly(uintptr(new), uintptr(old), lenmem, et) + memmove(new, old, lenmem) + return new, len, int(capmem / et.Size_) +} + +// moveSliceNoCapNoScan is a combination of moveSliceNoScan and moveSliceNoCap. 
+func moveSliceNoCapNoScan(elemSize uintptr, old unsafe.Pointer, len int) (unsafe.Pointer, int, int) { + if len == 0 { + if old != nil { + old = unsafe.Pointer(&zerobase) + } + return old, 0, 0 + } + lenmem := uintptr(len) * elemSize + capmem := roundupsize(lenmem, true) + new := mallocgc(capmem, nil, false) + memmove(new, old, lenmem) + if capmem > lenmem { + memclrNoHeapPointers(add(new, lenmem), capmem-lenmem) + } + return new, len, int(capmem / elemSize) +} + +// growsliceBuf is like growslice, but we can use the given buffer +// as a backing store if we want. bufPtr must be on the stack. +func growsliceBuf(oldPtr unsafe.Pointer, newLen, oldCap, num int, et *_type, bufPtr unsafe.Pointer, bufLen int) slice { + if newLen > bufLen { + // Doesn't fit, process like a normal growslice. + return growslice(oldPtr, newLen, oldCap, num, et) + } + oldLen := newLen - num + if oldPtr != bufPtr && oldLen != 0 { + // Move data to start of buffer. + // Note: bufPtr is on the stack, so no write barrier needed. + memmove(bufPtr, oldPtr, uintptr(oldLen)*et.Size_) + } + // Pick a new capacity. + // + // Unlike growslice, we don't need to double the size each time. + // The work done here is not proportional to the length of the slice. + // (Unless the memmove happens above, but that is rare, and in any + // case there are not many elements on this path.) + // + // Instead, we try to just bump up to the next size class. + // This will ensure that we don't waste any space when we eventually + // call moveSlice with the resulting slice. + newCap := int(roundupsize(uintptr(newLen)*et.Size_, !et.Pointers()) / et.Size_) + + // Zero slice beyond newLen. + // The buffer is stack memory, so NoHeapPointers is ok. + // Caller will overwrite [oldLen:newLen], so we don't need to zero that portion. + // If et.Pointers(), buffer is at least initialized so we don't need to + // worry about the caller overwriting junk in [oldLen:newLen]. 
+ if newLen < newCap { + memclrNoHeapPointers(add(bufPtr, uintptr(newLen)*et.Size_), uintptr(newCap-newLen)*et.Size_) + } + + return slice{bufPtr, newLen, newCap} +} diff --git a/src/runtime/slice_test.go b/src/runtime/slice_test.go index cd2bc26d1e..5463b6c02f 100644 --- a/src/runtime/slice_test.go +++ b/src/runtime/slice_test.go @@ -6,6 +6,9 @@ package runtime_test import ( "fmt" + "internal/race" + "internal/testenv" + "runtime" "testing" ) @@ -499,3 +502,319 @@ func BenchmarkAppendInPlace(b *testing.B) { }) } + +//go:noinline +func byteSlice(n int) []byte { + var r []byte + for i := range n { + r = append(r, byte(i)) + } + return r +} +func TestAppendByteInLoop(t *testing.T) { + testenv.SkipIfOptimizationOff(t) + if race.Enabled { + t.Skip("skipping in -race mode") + } + for _, test := range [][3]int{ + {0, 0, 0}, + {1, 1, 8}, + {2, 1, 8}, + {8, 1, 8}, + {9, 1, 16}, + {16, 1, 16}, + {17, 1, 24}, + {24, 1, 24}, + {25, 1, 32}, + {32, 1, 32}, + {33, 1, 64}, // If we up the stack buffer size from 32->64, this line and the next would become 48. + {48, 1, 64}, + {49, 1, 64}, + {64, 1, 64}, + {65, 2, 128}, + } { + n := test[0] + want := test[1] + wantCap := test[2] + var r []byte + got := testing.AllocsPerRun(10, func() { + r = byteSlice(n) + }) + if got != float64(want) { + t.Errorf("for size %d, got %f allocs want %d", n, got, want) + } + if cap(r) != wantCap { + t.Errorf("for size %d, got capacity %d want %d", n, cap(r), wantCap) + } + } +} + +//go:noinline +func ptrSlice(n int, p *[]*byte) { + var r []*byte + for range n { + r = append(r, nil) + } + *p = r +} +func TestAppendPtrInLoop(t *testing.T) { + testenv.SkipIfOptimizationOff(t) + if race.Enabled { + t.Skip("skipping in -race mode") + } + var tests [][3]int + if runtime.PtrSize == 8 { + tests = [][3]int{ + {0, 0, 0}, + {1, 1, 1}, + {2, 1, 2}, + {3, 1, 3}, // This is the interesting case, allocates 24 bytes when before it was 32. 
+ {4, 1, 4}, + {5, 1, 8}, + {6, 1, 8}, + {7, 1, 8}, + {8, 1, 8}, + {9, 2, 16}, + } + } else { + tests = [][3]int{ + {0, 0, 0}, + {1, 1, 2}, + {2, 1, 2}, + {3, 1, 4}, + {4, 1, 4}, + {5, 1, 6}, // These two are also 24 bytes instead of 32. + {6, 1, 6}, // + {7, 1, 8}, + {8, 1, 8}, + {9, 1, 16}, + {10, 1, 16}, + {11, 1, 16}, + {12, 1, 16}, + {13, 1, 16}, + {14, 1, 16}, + {15, 1, 16}, + {16, 1, 16}, + {17, 2, 32}, + } + } + for _, test := range tests { + n := test[0] + want := test[1] + wantCap := test[2] + var r []*byte + got := testing.AllocsPerRun(10, func() { + ptrSlice(n, &r) + }) + if got != float64(want) { + t.Errorf("for size %d, got %f allocs want %d", n, got, want) + } + if cap(r) != wantCap { + t.Errorf("for size %d, got capacity %d want %d", n, cap(r), wantCap) + } + } +} + +//go:noinline +func byteCapSlice(n int) ([]byte, int) { + var r []byte + for i := range n { + r = append(r, byte(i)) + } + return r, cap(r) +} +func TestAppendByteCapInLoop(t *testing.T) { + testenv.SkipIfOptimizationOff(t) + if race.Enabled { + t.Skip("skipping in -race mode") + } + for _, test := range [][3]int{ + {0, 0, 0}, + {1, 1, 8}, + {2, 1, 8}, + {8, 1, 8}, + {9, 1, 16}, + {16, 1, 16}, + {17, 1, 24}, + {24, 1, 24}, + {25, 1, 32}, + {32, 1, 32}, + {33, 1, 64}, + {48, 1, 64}, + {49, 1, 64}, + {64, 1, 64}, + {65, 2, 128}, + } { + n := test[0] + want := test[1] + wantCap := test[2] + var r []byte + got := testing.AllocsPerRun(10, func() { + r, _ = byteCapSlice(n) + }) + if got != float64(want) { + t.Errorf("for size %d, got %f allocs want %d", n, got, want) + } + if cap(r) != wantCap { + t.Errorf("for size %d, got capacity %d want %d", n, cap(r), wantCap) + } + } +} + +func TestAppendGeneric(t *testing.T) { + type I *int + r := testAppendGeneric[I](100) + if len(r) != 100 { + t.Errorf("bad length") + } +} + +//go:noinline +func testAppendGeneric[E any](n int) []E { + var r []E + var z E + for range n { + r = append(r, z) + } + return r +} + +func appendSomeBytes(r []byte, s []byte) 
[]byte { + for _, b := range s { + r = append(r, b) + } + return r +} + +func TestAppendOfArg(t *testing.T) { + r := make([]byte, 24) + for i := 0; i < 24; i++ { + r[i] = byte(i) + } + appendSomeBytes(r, []byte{25, 26, 27}) + // Do the same thing, trying to overwrite any + // stack-allocated buffers used above. + s := make([]byte, 24) + for i := 0; i < 24; i++ { + s[i] = 99 + } + appendSomeBytes(s, []byte{99, 99, 99}) + // Check that we still have the right data. + for i, b := range r { + if b != byte(i) { + t.Errorf("r[%d]=%d, want %d", i, b, byte(i)) + } + } + +} + +func BenchmarkAppendInLoop(b *testing.B) { + for _, size := range []int{0, 1, 8, 16, 32, 64, 128} { + b.Run(fmt.Sprintf("%d", size), + func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + byteSlice(size) + } + }) + } +} + +func TestMoveToHeapEarly(t *testing.T) { + // Just checking that this compiles. + var x []int + y := x // causes a move2heap in the entry block + for range 5 { + x = append(x, 5) + } + _ = y +} + +func TestMoveToHeapCap(t *testing.T) { + var c int + r := func() []byte { + var s []byte + for i := range 10 { + s = append(s, byte(i)) + } + c = cap(s) + return s + }() + if c != cap(r) { + t.Errorf("got cap=%d, want %d", c, cap(r)) + } + sinkSlice = r +} + +//go:noinline +func runit(f func()) { + f() +} + +func TestMoveToHeapClosure1(t *testing.T) { + var c int + r := func() []byte { + var s []byte + for i := range 10 { + s = append(s, byte(i)) + } + runit(func() { + c = cap(s) + }) + return s + }() + if c != cap(r) { + t.Errorf("got cap=%d, want %d", c, cap(r)) + } + sinkSlice = r +} +func TestMoveToHeapClosure2(t *testing.T) { + var c int + r := func() []byte { + var s []byte + for i := range 10 { + s = append(s, byte(i)) + } + c = func() int { + return cap(s) + }() + return s + }() + if c != cap(r) { + t.Errorf("got cap=%d, want %d", c, cap(r)) + } + sinkSlice = r +} + +//go:noinline +func buildClosure(t *testing.T) ([]byte, func()) { + var s []byte + for i := range 20 { + s = 
append(s, byte(i)) + } + c := func() { + for i, b := range s { + if b != byte(i) { + t.Errorf("s[%d]=%d, want %d", i, b, i) + } + } + } + return s, c +} + +func TestMoveToHeapClosure3(t *testing.T) { + _, f := buildClosure(t) + overwriteStack(0) + f() +} + +//go:noinline +func overwriteStack(n int) uint64 { + var x [100]uint64 + for i := range x { + x[i] = 0xabcdabcdabcdabcd + } + return x[n] +} + +var sinkSlice []byte diff --git a/src/runtime/sys_riscv64.go b/src/runtime/sys_riscv64.go index e710840819..65dc684c33 100644 --- a/src/runtime/sys_riscv64.go +++ b/src/runtime/sys_riscv64.go @@ -4,7 +4,12 @@ package runtime -import "unsafe" +import ( + "unsafe" + + "internal/abi" + "internal/runtime/sys" +) // adjust Gobuf as if it executed a call to fn with context ctxt // and then did an immediate Gosave. @@ -12,7 +17,9 @@ func gostartcall(buf *gobuf, fn, ctxt unsafe.Pointer) { if buf.lr != 0 { throw("invalid use of gostartcall") } - buf.lr = buf.pc + // Use double the PC quantum on riscv64, so that we retain + // four byte alignment and use non-compressed instructions. + buf.lr = abi.FuncPCABI0(goexit) + sys.PCQuantum*2 buf.pc = uintptr(fn) buf.ctxt = ctxt } diff --git a/src/runtime/testdata/testprog/crash.go b/src/runtime/testdata/testprog/crash.go index 556215a71e..fcce388871 100644 --- a/src/runtime/testdata/testprog/crash.go +++ b/src/runtime/testdata/testprog/crash.go @@ -22,6 +22,7 @@ func init() { register("RepanickedPanic", RepanickedPanic) register("RepanickedMiddlePanic", RepanickedMiddlePanic) register("RepanickedPanicSandwich", RepanickedPanicSandwich) + register("DoublePanicWithSameValue", DoublePanicWithSameValue) } func test(name string) { @@ -189,3 +190,13 @@ func RepanickedPanicSandwich() { panic("outer") }() } + +// Double panic with same value and not recovered. +// See issue 76099. 
+func DoublePanicWithSameValue() { + var e any = "message" + defer func() { + panic(e) + }() + panic(e) +} diff --git a/src/runtime/testdata/testprog/gc.go b/src/runtime/testdata/testprog/gc.go index bbe1453401..32e2c5e1b4 100644 --- a/src/runtime/testdata/testprog/gc.go +++ b/src/runtime/testdata/testprog/gc.go @@ -396,7 +396,7 @@ func gcMemoryLimit(gcPercent int) { // should do considerably better than this bound. bound := int64(myLimit + 16<<20) if runtime.GOOS == "darwin" { - bound += 16 << 20 // Be more lax on Darwin, see issue 73136. + bound += 24 << 20 // Be more lax on Darwin, see issue 73136. } start := time.Now() for time.Since(start) < 200*time.Millisecond { diff --git a/src/runtime/testdata/testprog/stw_trace.go b/src/runtime/testdata/testprog/stw_trace.go index 0fed55b875..0fa15da09e 100644 --- a/src/runtime/testdata/testprog/stw_trace.go +++ b/src/runtime/testdata/testprog/stw_trace.go @@ -7,15 +7,18 @@ package main import ( "context" "log" + "math/rand/v2" "os" "runtime" "runtime/debug" + "runtime/metrics" "runtime/trace" "sync/atomic" ) func init() { register("TraceSTW", TraceSTW) + register("TraceGCSTW", TraceGCSTW) } // The parent writes to ping and waits for the children to write back @@ -53,7 +56,7 @@ func TraceSTW() { // https://go.dev/issue/65694). Alternatively, we could just ignore the // trace if the GC runs. runtime.GOMAXPROCS(4) - debug.SetGCPercent(0) + debug.SetGCPercent(-1) if err := trace.Start(os.Stdout); err != nil { log.Fatalf("failed to start tracing: %v", err) @@ -86,6 +89,112 @@ func TraceSTW() { stop.Store(true) } +// Variant of TraceSTW for GC STWs. We want the GC mark workers to start on +// previously-idle Ps, rather than bumping the current P. +func TraceGCSTW() { + ctx := context.Background() + + // The idea here is to have 2 target goroutines that are constantly + // running. When the world restarts after STW, we expect these + // goroutines to continue execution on the same M and P. 
+ // + // Set GOMAXPROCS=8 to make room for the 2 target goroutines, 1 parent, + // 2 dedicated workers, and a bit of slack. + // + // Disable the GC initially so we can be sure it only triggers once we + // are ready. + runtime.GOMAXPROCS(8) + debug.SetGCPercent(-1) + + if err := trace.Start(os.Stdout); err != nil { + log.Fatalf("failed to start tracing: %v", err) + } + defer trace.Stop() + + for i := range 2 { + go traceSTWTarget(i) + } + + // Wait for children to start running. + ping.Store(1) + for pong[0].Load() != 1 {} + for pong[1].Load() != 1 {} + + trace.Log(ctx, "TraceSTW", "start") + + // STW + triggerGC() + + // Make sure to run long enough for the children to schedule again + // after STW. This is included for good measure, but the goroutines + // really ought to have already scheduled since the entire GC + // completed. + ping.Store(2) + for pong[0].Load() != 2 {} + for pong[1].Load() != 2 {} + + trace.Log(ctx, "TraceSTW", "end") + + stop.Store(true) +} + +func triggerGC() { + // Allocate a bunch to trigger the GC rather than using runtime.GC. The + // latter blocks until the GC is complete, which is convenient, but + // messes with scheduling as it gives this P a chance to steal the + // other goroutines before their Ps get up and running again. + + // Bring heap size up prior to enabling the GC to ensure that there is + // a decent amount of work in case the GC triggers immediately upon + // re-enabling. + for range 1000 { + alloc() + } + + sample := make([]metrics.Sample, 1) + sample[0].Name = "/gc/cycles/total:gc-cycles" + metrics.Read(sample) + + start := sample[0].Value.Uint64() + + debug.SetGCPercent(100) + + // Keep allocating until the GC is complete. We really only need to + // continue until the mark workers are scheduled, but there isn't a + // good way to measure that. 
+ for { + metrics.Read(sample) + if sample[0].Value.Uint64() != start { + return + } + + alloc() + } +} + +// Allocate a tree data structure to generate plenty of scan work for the GC. + +type node struct { + children []*node +} + +var gcSink node + +func alloc() { + // 10% chance of adding a node a each layer. + + curr := &gcSink + for { + if len(curr.children) == 0 || rand.Float32() < 0.1 { + curr.children = append(curr.children, new(node)) + return + } + + i := rand.IntN(len(curr.children)) + curr = curr.children[i] + } +} + // Manually insert a morestack call. Leaf functions can omit morestack, but // non-leaf functions should include them. diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go index 6649f72471..74aaeba876 100644 --- a/src/runtime/traceback.go +++ b/src/runtime/traceback.go @@ -1366,16 +1366,19 @@ func tracebackHexdump(stk stack, frame *stkframe, bad uintptr) { // Print the hex dump. print("stack: frame={sp:", hex(frame.sp), ", fp:", hex(frame.fp), "} stack=[", hex(stk.lo), ",", hex(stk.hi), ")\n") - hexdumpWords(lo, hi, func(p uintptr) byte { - switch p { - case frame.fp: - return '>' - case frame.sp: - return '<' - case bad: - return '!' + hexdumpWords(lo, hi-lo, func(p uintptr, m hexdumpMarker) { + if p == frame.fp { + m.start() + println("FP") + } + if p == frame.sp { + m.start() + println("SP") + } + if p == bad { + m.start() + println("bad") } - return 0 }) } |
