From 7a8d0b5d53dc7be331016b6903707aa256e22c2a Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Thu, 30 Oct 2025 17:24:47 +0000 Subject: runtime: add debug mode to extend _Grunning-without-P windows This was suggested in CL 646198, and I tested with it, I forgot to push it so it wasn't merged. I think it might be worth keeping. Change-Id: Ibf97d969fe7d0eeb365f5f7b1fbeadea3a1076ab Reviewed-on: https://go-review.googlesource.com/c/go/+/716580 Reviewed-by: Cherry Mui Auto-Submit: Michael Knyszek LUCI-TryBot-Result: Go LUCI --- src/runtime/proc.go | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'src/runtime') diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 44b64913a5..44dbb2fd44 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -4658,6 +4658,11 @@ func reentersyscall(pc, sp, bp uintptr) { gp.m.locks-- } +// debugExtendGrunningNoP is a debug mode that extends the windows in which +// we're _Grunning without a P in order to try to shake out bugs with code +// assuming this state is impossible. +const debugExtendGrunningNoP = false + // Standard syscall entry used by the go syscall library and normal cgo calls. // // This is exported via linkname to assembly in the syscall package and x/sys. @@ -4770,6 +4775,9 @@ func entersyscallblock() { // <-- // Caution: we're in a small window where we are in _Grunning without a P. // --> + if debugExtendGrunningNoP { + usleep(10) + } casgstatus(gp, _Grunning, _Gsyscall) if gp.syscallsp < gp.stack.lo || gp.stack.hi < gp.syscallsp { systemstack(func() { @@ -4852,6 +4860,9 @@ func exitsyscall() { // Caution: we're in a window where we may be in _Grunning without a P. // Either we will grab a P or call exitsyscall0, where we'll switch to // _Grunnable. + if debugExtendGrunningNoP { + usleep(10) + } // Grab and clear our old P. 
oldp := gp.m.oldp.ptr() -- cgit v1.3 From 80c91eedbbf3c041e9b0c03c28fae2c73dbb7df4 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Fri, 7 Nov 2025 18:22:29 +0000 Subject: runtime: ensure weak handles end up in their own allocation Currently weak handles are atomic.Uintptr values that may end up in a tiny block which can cause all sorts of surprising leaks. See #76007 for one example. This change pads out the underlying allocation of the atomic.Uintptr to 16 bytes to ensure we bypass the tiny allocator, and it gets its own block. This wastes 8 bytes per weak handle. We could potentially do better by using the 8 byte noscan size class, but this can be a follow-up change. For #76007. Change-Id: I3ab74dda9bf312ea0007f167093052de28134944 Reviewed-on: https://go-review.googlesource.com/c/go/+/719960 Reviewed-by: Cherry Mui Auto-Submit: Michael Knyszek LUCI-TryBot-Result: Go LUCI --- src/runtime/mheap.go | 10 +++++++++- src/weak/pointer_test.go | 40 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 3 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 711c7790eb..3334099092 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -2534,7 +2534,15 @@ func getOrAddWeakHandle(p unsafe.Pointer) *atomic.Uintptr { s := (*specialWeakHandle)(mheap_.specialWeakHandleAlloc.alloc()) unlock(&mheap_.speciallock) - handle := new(atomic.Uintptr) + // N.B. Pad the weak handle to ensure it doesn't share a tiny + // block with any other allocations. This can lead to leaks, such + // as in go.dev/issue/76007. As an alternative, we could consider + // using the currently-unused 8-byte noscan size class. 
+ type weakHandleBox struct { + h atomic.Uintptr + _ [maxTinySize - unsafe.Sizeof(atomic.Uintptr{})]byte + } + handle := &(new(weakHandleBox).h) s.special.kind = _KindSpecialWeakHandle s.handle = handle handle.Store(uintptr(p)) diff --git a/src/weak/pointer_test.go b/src/weak/pointer_test.go index da464a8d01..5e8b9bef58 100644 --- a/src/weak/pointer_test.go +++ b/src/weak/pointer_test.go @@ -16,8 +16,10 @@ import ( ) type T struct { - // N.B. This must contain a pointer, otherwise the weak handle might get placed - // in a tiny block making the tests in this package flaky. + // N.B. T is what it is to avoid having test values get tiny-allocated + // in the same block as the weak handle, but since the fix to + // go.dev/issue/76007, this should no longer be possible. + // TODO(mknyszek): Consider using tiny-allocated values for all the tests. t *T a int b int @@ -327,3 +329,37 @@ func TestImmortalPointer(t *testing.T) { t.Errorf("immortal weak pointer to %p has unexpected Value %p", want, got) } } + +func TestPointerTiny(t *testing.T) { + runtime.GC() // Clear tiny-alloc caches. + + const N = 1000 + wps := make([]weak.Pointer[int], N) + for i := range N { + // N.B. *x is just an int, so the value is very likely + // tiny-allocated alongside the weak handle, assuming bug + // from go.dev/issue/76007 exists. + x := new(int) + *x = i + wps[i] = weak.Make(x) + } + + // Get the cleanups to start running. + runtime.GC() + + // Expect at least 3/4ths of the weak pointers to have gone nil. + // + // Note that we provide some leeway since it's possible our allocation + // gets grouped with some other long-lived tiny allocation, but this + // shouldn't be the case for the vast majority of allocations. 
+ n := 0 + for _, wp := range wps { + if wp.Value() == nil { + n++ + } + } + const want = 3 * N / 4 + if n < want { + t.Fatalf("not enough weak pointers are nil: expected at least %v, got %v", want, n) + } +} -- cgit v1.3 From 9fd2e4443955127ed360338cb19a5aeba6be1a8c Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Thu, 13 Nov 2025 17:17:35 +0000 Subject: runtime: add AddCleanup benchmark Change-Id: Ia463a9b3b5980670bcf9297b4bddb60980ebfde5 Reviewed-on: https://go-review.googlesource.com/c/go/+/720320 Auto-Submit: Michael Knyszek Reviewed-by: Cherry Mui LUCI-TryBot-Result: Go LUCI Reviewed-by: Carlos Amedee --- src/runtime/mcleanup_test.go | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'src/runtime') diff --git a/src/runtime/mcleanup_test.go b/src/runtime/mcleanup_test.go index 22b9eccd20..341d30afa7 100644 --- a/src/runtime/mcleanup_test.go +++ b/src/runtime/mcleanup_test.go @@ -336,3 +336,31 @@ func TestCleanupLost(t *testing.T) { t.Errorf("expected %d cleanups to be executed, got %d", got, want) } } + +// BenchmarkAddCleanupAndStop benchmarks adding and removing a cleanup +// from the same allocation. +// +// At face value, this benchmark is unrealistic, since no program would +// do this in practice. However, adding cleanups to new allocations in a +// loop is also unrealistic. It adds additional unused allocations, +// exercises uncommon performance pitfalls in AddCleanup (traversing the +// specials list, which should just be its own benchmark), and executing +// cleanups at a frequency that is unlikely to appear in real programs. +// +// This benchmark is still useful however, since we can get a low-noise +// measurement of the cost of AddCleanup and Stop all in one without the +// above pitfalls: we can measure the pure overhead. We can then separate +// out the cost of each in CPU profiles if we so choose (they're not so +// inexpensive as to make this infeasible). 
+func BenchmarkAddCleanupAndStop(b *testing.B) { + b.ReportAllocs() + + type T struct { + v int + p unsafe.Pointer + } + x := new(T) + for b.Loop() { + runtime.AddCleanup(x, func(int) {}, 14).Stop() + } +} -- cgit v1.3 From 1bb1f2bf0c07bbc583063a21b324407f7041e316 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Thu, 13 Nov 2025 18:29:23 +0000 Subject: runtime: put AddCleanup cleanup arguments in their own allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, AddCleanup just creates a simple closure that calls `cleanup(arg)` as the actual cleanup function tracked internally. However, the argument ends up getting its own allocation. If it's tiny, then it can also end up sharing a tiny allocation slot with the object we're adding the cleanup to. Since the closure is a GC root, we can end up with cleanups that never fire. This change refactors the AddCleanup machinery to make the storage for the argument separate and explicit. With that in place, it explicitly allocates 16 bytes of storage for tiny arguments to side-step the tiny allocator. One would think this would cause an increase in memory use and more bytes allocated, but that's actually wrong! It turns out that the current "simple closure" actually creates _two_ closures. By making the argument passing explicit, we eliminate one layer of closures, so this actually results in a slightly faster AddCleanup overall and 16 bytes less memory allocated. 
goos: linux goarch: amd64 pkg: runtime cpu: AMD EPYC 7B13 │ before.bench │ after.bench │ │ sec/op │ sec/op vs base │ AddCleanupAndStop-64 124.5n ± 2% 103.7n ± 2% -16.71% (p=0.002 n=6) │ before.bench │ after.bench │ │ B/op │ B/op vs base │ AddCleanupAndStop-64 48.00 ± 0% 32.00 ± 0% -33.33% (p=0.002 n=6) │ before.bench │ after.bench │ │ allocs/op │ allocs/op vs base │ AddCleanupAndStop-64 3.000 ± 0% 2.000 ± 0% -33.33% (p=0.002 n=6) This change does, however, does add 16 bytes of overhead to the cleanup special itself, and makes each cleanup block entry 24 bytes instead of 8 bytes. This means the overall memory overhead delta with this change is neutral, and we just have a faster cleanup. (Cleanup block entries are transient, so I suspect any increase in memory overhead there is negligible.) Together with CL 719960, fixes #76007. Change-Id: I81bf3e44339e71c016c30d80bb4ee151c8263d5c Reviewed-on: https://go-review.googlesource.com/c/go/+/720321 Reviewed-by: Cherry Mui Auto-Submit: Michael Knyszek LUCI-TryBot-Result: Go LUCI --- src/runtime/mcleanup.go | 86 +++++++++++++++++++++++++++++++++++-------------- src/runtime/mgcmark.go | 6 ++-- src/runtime/mheap.go | 23 +++++++------ 3 files changed, 78 insertions(+), 37 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mcleanup.go b/src/runtime/mcleanup.go index 383217aa05..fc71af9f3f 100644 --- a/src/runtime/mcleanup.go +++ b/src/runtime/mcleanup.go @@ -72,8 +72,9 @@ import ( // pass the object to the [KeepAlive] function after the last point // where the object must remain reachable. func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup { - // Explicitly force ptr to escape to the heap. + // Explicitly force ptr and cleanup to escape to the heap. ptr = abi.Escape(ptr) + cleanup = abi.Escape(cleanup) // The pointer to the object must be valid. 
if ptr == nil { @@ -82,7 +83,8 @@ func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup { usptr := uintptr(unsafe.Pointer(ptr)) // Check that arg is not equal to ptr. - if kind := abi.TypeOf(arg).Kind(); kind == abi.Pointer || kind == abi.UnsafePointer { + argType := abi.TypeOf(arg) + if kind := argType.Kind(); kind == abi.Pointer || kind == abi.UnsafePointer { if unsafe.Pointer(ptr) == *((*unsafe.Pointer)(unsafe.Pointer(&arg))) { panic("runtime.AddCleanup: ptr is equal to arg, cleanup will never run") } @@ -98,12 +100,23 @@ func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup { return Cleanup{} } - fn := func() { - cleanup(arg) + // Create new storage for the argument. + var argv *S + if size := unsafe.Sizeof(arg); size < maxTinySize && argType.PtrBytes == 0 { + // Side-step the tiny allocator to avoid liveness issues, since this box + // will be treated like a root by the GC. We model the box as an array of + // uintptrs to guarantee maximum allocator alignment. + // + // TODO(mknyszek): Consider just making space in cleanupFn for this. The + // unfortunate part of this is it would grow specialCleanup by 16 bytes, so + // while there wouldn't be an allocation, *every* cleanup would take the + // memory overhead hit. + box := new([maxTinySize / goarch.PtrSize]uintptr) + argv = (*S)(unsafe.Pointer(box)) + } else { + argv = new(S) } - // Closure must escape. - fv := *(**funcval)(unsafe.Pointer(&fn)) - fv = abi.Escape(fv) + *argv = arg // Find the containing object. base, _, _ := findObject(usptr, 0, 0) @@ -120,7 +133,16 @@ func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup { gcCleanups.createGs() } - id := addCleanup(unsafe.Pointer(ptr), fv) + id := addCleanup(unsafe.Pointer(ptr), cleanupFn{ + // Instantiate a caller function to call the cleanup, that is cleanup(*argv). 
+ // + // TODO(mknyszek): This allocates because the generic dictionary argument + // gets closed over, but callCleanup doesn't even use the dictionary argument, + // so theoretically that could be removed, eliminating an allocation. + call: callCleanup[S], + fn: *(**funcval)(unsafe.Pointer(&cleanup)), + arg: unsafe.Pointer(argv), + }) if debug.checkfinalizers != 0 { cleanupFn := *(**funcval)(unsafe.Pointer(&cleanup)) setCleanupContext(unsafe.Pointer(ptr), abi.TypeFor[T](), sys.GetCallerPC(), cleanupFn.fn, id) @@ -131,6 +153,16 @@ func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup { } } +// callCleanup is a helper for calling cleanups in a polymorphic way. +// +// In practice, all it does is call fn(*arg). arg must be a *T. +// +//go:noinline +func callCleanup[T any](fn *funcval, arg unsafe.Pointer) { + cleanup := *(*func(T))(unsafe.Pointer(&fn)) + cleanup(*(*T)(arg)) +} + // Cleanup is a handle to a cleanup call for a specific object. type Cleanup struct { // id is the unique identifier for the cleanup within the arena. @@ -216,7 +248,17 @@ const cleanupBlockSize = 512 // that the cleanup queue does not grow during marking (but it can shrink). type cleanupBlock struct { cleanupBlockHeader - cleanups [(cleanupBlockSize - unsafe.Sizeof(cleanupBlockHeader{})) / goarch.PtrSize]*funcval + cleanups [(cleanupBlockSize - unsafe.Sizeof(cleanupBlockHeader{})) / unsafe.Sizeof(cleanupFn{})]cleanupFn +} + +var cleanupFnPtrMask = [...]uint8{0b111} + +// cleanupFn represents a cleanup function with it's argument, yet to be called. +type cleanupFn struct { + // call is an adapter function that understands how to safely call fn(*arg). + call func(*funcval, unsafe.Pointer) + fn *funcval // cleanup function passed to AddCleanup. + arg unsafe.Pointer // pointer to argument to pass to cleanup function. 
} var cleanupBlockPtrMask [cleanupBlockSize / goarch.PtrSize / 8]byte @@ -245,8 +287,8 @@ type cleanupBlockHeader struct { // // Must only be called if the GC is in the sweep phase (gcphase == _GCoff), // because it does not synchronize with the garbage collector. -func (b *cleanupBlock) enqueue(fn *funcval) bool { - b.cleanups[b.n] = fn +func (b *cleanupBlock) enqueue(c cleanupFn) bool { + b.cleanups[b.n] = c b.n++ return b.full() } @@ -375,7 +417,7 @@ func (q *cleanupQueue) tryTakeWork() bool { // enqueue queues a single cleanup for execution. // // Called by the sweeper, and only the sweeper. -func (q *cleanupQueue) enqueue(fn *funcval) { +func (q *cleanupQueue) enqueue(c cleanupFn) { mp := acquirem() pp := mp.p.ptr() b := pp.cleanups @@ -396,7 +438,7 @@ func (q *cleanupQueue) enqueue(fn *funcval) { } pp.cleanups = b } - if full := b.enqueue(fn); full { + if full := b.enqueue(c); full { q.full.push(&b.lfnode) pp.cleanups = nil q.addWork(1) @@ -641,7 +683,8 @@ func runCleanups() { gcCleanups.beginRunningCleanups() for i := 0; i < int(b.n); i++ { - fn := b.cleanups[i] + c := b.cleanups[i] + b.cleanups[i] = cleanupFn{} var racectx uintptr if raceenabled { @@ -650,20 +693,15 @@ func runCleanups() { // the same goroutine. // // Synchronize on fn. This would fail to find races on the - // closed-over values in fn (suppose fn is passed to multiple - // AddCleanup calls) if fn was not unique, but it is. Update - // the synchronization on fn if you intend to optimize it - // and store the cleanup function and cleanup argument on the - // queue directly. - racerelease(unsafe.Pointer(fn)) + // closed-over values in fn (suppose arg is passed to multiple + // AddCleanup calls) if arg was not unique, but it is. + racerelease(unsafe.Pointer(c.arg)) racectx = raceEnterNewCtx() - raceacquire(unsafe.Pointer(fn)) + raceacquire(unsafe.Pointer(c.arg)) } // Execute the next cleanup. 
- cleanup := *(*func())(unsafe.Pointer(&fn)) - cleanup() - b.cleanups[i] = nil + c.call(c.fn, c.arg) if raceenabled { // Restore the old context. diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go index dd76973c62..c9234c5084 100644 --- a/src/runtime/mgcmark.go +++ b/src/runtime/mgcmark.go @@ -204,7 +204,7 @@ func gcMarkRootCheck() { }) } -// ptrmask for an allocation containing a single pointer. +// oneptrmask for an allocation containing a single pointer. var oneptrmask = [...]uint8{1} // markroot scans the i'th root. @@ -251,7 +251,7 @@ func markroot(gcw *gcWork, i uint32, flushBgCredit bool) int64 { // N.B. This only needs to synchronize with cleanup execution, which only resets these blocks. // All cleanup queueing happens during sweep. n := uintptr(atomic.Load(&cb.n)) - scanblock(uintptr(unsafe.Pointer(&cb.cleanups[0])), n*goarch.PtrSize, &cleanupBlockPtrMask[0], gcw, nil) + scanblock(uintptr(unsafe.Pointer(&cb.cleanups[0])), n*unsafe.Sizeof(cleanupFn{}), &cleanupBlockPtrMask[0], gcw, nil) } case work.baseSpans <= i && i < work.baseStacks: @@ -489,7 +489,7 @@ func gcScanFinalizer(spf *specialfinalizer, s *mspan, gcw *gcWork) { // gcScanCleanup scans the relevant parts of a cleanup special as a root. func gcScanCleanup(spc *specialCleanup, gcw *gcWork) { // The special itself is a root. - scanblock(uintptr(unsafe.Pointer(&spc.fn)), goarch.PtrSize, &oneptrmask[0], gcw, nil) + scanblock(uintptr(unsafe.Pointer(&spc.cleanup)), unsafe.Sizeof(cleanupFn{}), &cleanupFnPtrMask[0], gcw, nil) } // gcAssistAlloc performs GC work to make gp's assist debt positive. diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 3334099092..08a0057be7 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -2161,7 +2161,7 @@ func removefinalizer(p unsafe.Pointer) { type specialCleanup struct { _ sys.NotInHeap special special - fn *funcval + cleanup cleanupFn // Globally unique ID for the cleanup, obtained from mheap_.cleanupID. 
id uint64 } @@ -2170,14 +2170,18 @@ type specialCleanup struct { // cleanups are allowed on an object, and even the same pointer. // A cleanup id is returned which can be used to uniquely identify // the cleanup. -func addCleanup(p unsafe.Pointer, f *funcval) uint64 { +func addCleanup(p unsafe.Pointer, c cleanupFn) uint64 { + // TODO(mknyszek): Consider pooling specialCleanups on the P + // so we don't have to take the lock every time. Just locking + // is a considerable part of the cost of AddCleanup. This + // would also require reserving some cleanup IDs on the P. lock(&mheap_.speciallock) s := (*specialCleanup)(mheap_.specialCleanupAlloc.alloc()) mheap_.cleanupID++ // Increment first. ID 0 is reserved. id := mheap_.cleanupID unlock(&mheap_.speciallock) s.special.kind = _KindSpecialCleanup - s.fn = f + s.cleanup = c s.id = id mp := acquirem() @@ -2187,17 +2191,16 @@ func addCleanup(p unsafe.Pointer, f *funcval) uint64 { // situation where it's possible that markrootSpans // has already run but mark termination hasn't yet. if gcphase != _GCoff { - gcw := &mp.p.ptr().gcw // Mark the cleanup itself, since the // special isn't part of the GC'd heap. - scanblock(uintptr(unsafe.Pointer(&s.fn)), goarch.PtrSize, &oneptrmask[0], gcw, nil) + gcScanCleanup(s, &mp.p.ptr().gcw) } releasem(mp) - // Keep f alive. There's a window in this function where it's - // only reachable via the special while the special hasn't been - // added to the specials list yet. This is similar to a bug + // Keep c and its referents alive. There's a window in this function + // where it's only reachable via the special while the special hasn't + // been added to the specials list yet. This is similar to a bug // discovered for weak handles, see #70455. 
- KeepAlive(f) + KeepAlive(c) return id } @@ -2800,7 +2803,7 @@ func freeSpecial(s *special, p unsafe.Pointer, size uintptr) { // Cleanups, unlike finalizers, do not resurrect the objects // they're attached to, so we only need to pass the cleanup // function, not the object. - gcCleanups.enqueue(sc.fn) + gcCleanups.enqueue(sc.cleanup) lock(&mheap_.speciallock) mheap_.specialCleanupAlloc.free(unsafe.Pointer(sc)) unlock(&mheap_.speciallock) -- cgit v1.3 From 1a03d0db3f36c1a81a184fa15a54f716569a9972 Mon Sep 17 00:00:00 2001 From: thepudds Date: Wed, 5 Nov 2025 12:18:49 -0500 Subject: runtime: skip tests for GOEXPERIMENT=arenas that do not handle clobberfree=1 When run with GODEBUG=clobberfree=1, three out of seven of the top-level tests in runtime/arena_test.go fail with a SIGSEGV inside the clobberfree function where it is overwriting freed memory with 0xdeadbeef. This is not a new problem. For example, this crashes in Go 1.20: GODEBUG=clobberfree=1 go test runtime -run=TestUserArena It would be nice for all.bash to pass with GODEBUG=clobberfree=1, including it is useful for testing the automatic reclaiming of dead memory via runtime.freegc in #74299. Given the GOEXPERIMENT=arenas in #51317 is not planned to move forward (and is hopefully slated to be replace by regions before too long), for now we just skip those three tests in order to get all.bash passing with GODEBUG=clobberfree=1. 
Updates #74299 Change-Id: I384d96791157b30c73457d582a45dd74c5607ee0 Reviewed-on: https://go-review.googlesource.com/c/go/+/715080 Reviewed-by: Michael Knyszek LUCI-TryBot-Result: Go LUCI Reviewed-by: Junyang Shao --- src/runtime/arena_test.go | 15 +++++++++++++++ src/runtime/export_test.go | 6 ++++++ 2 files changed, 21 insertions(+) (limited to 'src/runtime') diff --git a/src/runtime/arena_test.go b/src/runtime/arena_test.go index ca5223b59c..0bb1950464 100644 --- a/src/runtime/arena_test.go +++ b/src/runtime/arena_test.go @@ -36,6 +36,11 @@ type largeScalar [UserArenaChunkBytes + 1]byte type largePointer [UserArenaChunkBytes/unsafe.Sizeof(&smallPointer{}) + 1]*smallPointer func TestUserArena(t *testing.T) { + if Clobberfree() { + // This test crashes with SEGV in clobberfree in mgcsweep.go with GODEBUG=clobberfree=1. + t.Skip("triggers SEGV with GODEBUG=clobberfree=1") + } + // Set GOMAXPROCS to 2 so we don't run too many of these // tests in parallel. defer GOMAXPROCS(GOMAXPROCS(2)) @@ -228,6 +233,11 @@ func runSubTestUserArenaSlice[S comparable](t *testing.T, value []S, parallel bo } func TestUserArenaLiveness(t *testing.T) { + if Clobberfree() { + // This test crashes with SEGV in clobberfree in mgcsweep.go with GODEBUG=clobberfree=1. + t.Skip("triggers SEGV with GODEBUG=clobberfree=1") + } + t.Run("Free", func(t *testing.T) { testUserArenaLiveness(t, false) }) @@ -320,6 +330,11 @@ func testUserArenaLiveness(t *testing.T, useArenaFinalizer bool) { } func TestUserArenaClearsPointerBits(t *testing.T) { + if Clobberfree() { + // This test crashes with SEGV in clobberfree in mgcsweep.go with GODEBUG=clobberfree=1. + t.Skip("triggers SEGV with GODEBUG=clobberfree=1") + } + // This is a regression test for a serious issue wherein if pointer bits // aren't properly cleared, it's possible to allocate scalar data down // into a previously pointer-ful area, causing misinterpretation by the GC. 
diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index d17984881d..731ba5d6b9 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -238,6 +238,12 @@ func SetEnvs(e []string) { envs = e } const PtrSize = goarch.PtrSize +const ClobberdeadPtr = clobberdeadPtr + +func Clobberfree() bool { + return debug.clobberfree != 0 +} + var ForceGCPeriod = &forcegcperiod // SetTracebackEnv is like runtime/debug.SetTraceback, but it raises -- cgit v1.3 From fecfcaa4f68a220f47e2c7c8b65d55906dbf8d46 Mon Sep 17 00:00:00 2001 From: thepudds Date: Tue, 4 Nov 2025 09:33:17 -0500 Subject: runtime: add runtime.freegc to reduce GC work MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This CL is part of a set of CLs that attempt to reduce how much work the GC must do. See the design in https://go.dev/design/74299-runtime-freegc This CL adds runtime.freegc: func freegc(ptr unsafe.Pointer, uintptr size, noscan bool) Memory freed via runtime.freegc is made immediately reusable for the next allocation in the same size class, without waiting for a GC cycle, and hence can dramatically reduce pressure on the GC. A sample microbenchmark included below shows strings.Builder operating roughly 2x faster. An experimental modification to reflect to use runtime.freegc and then using that reflect with json/v2 gave reported memory allocation reductions of -43.7%, -32.9%, -21.9%, -22.0%, -1.0% for the 5 official real-world unmarshalling benchmarks from go-json-experiment/jsonbench by the authors of json/v2, covering the CanadaGeometry through TwitterStatus datasets. Note: there is no intent to modify the standard library to have explicit calls to runtime.freegc, and of course such an ability would never be exposed to end-user code. Later CLs in this stack teach the compiler how to automatically insert runtime.freegc calls when it can prove it is safe to do so. 
(The reflect modification and other experimental changes to the standard library were just that -- experiments. It was very helpful while initially developing runtime.freegc to see more complex uses and closer-to-real-world benchmark results prior to updating the compiler.) This CL only addresses noscan span classes (heap objects without pointers), such as the backing memory for a []byte or string. A follow-on CL adds support for heap objects with pointers. If we update strings.Builder to explicitly call runtime.freegc on its internal buf after a resize operation (but without freeing the usually final incarnation of buf that will be returned to the user as a string), we can see some nice benchmark results on the existing strings benchmarks that call Builder.Write N times and then call Builder.String. Here, the (uncommon) case of a single Builder.Write is not helped (given it never resizes after first alloc if there is only one Write), but the impact grows such that it is up to ~2x faster as there are more resize operations due to more strings.Builder.Write calls: │ disabled.out │ new-free-20.txt │ │ sec/op │ sec/op vs base │ BuildString_Builder/1Write_36Bytes_NoGrow-4 55.82n ± 2% 55.86n ± 2% ~ (p=0.794 n=20) BuildString_Builder/2Write_36Bytes_NoGrow-4 125.2n ± 2% 115.4n ± 1% -7.86% (p=0.000 n=20) BuildString_Builder/3Write_36Bytes_NoGrow-4 224.0n ± 1% 188.2n ± 2% -16.00% (p=0.000 n=20) BuildString_Builder/5Write_36Bytes_NoGrow-4 239.1n ± 9% 205.1n ± 1% -14.20% (p=0.000 n=20) BuildString_Builder/8Write_36Bytes_NoGrow-4 422.8n ± 3% 325.4n ± 1% -23.04% (p=0.000 n=20) BuildString_Builder/10Write_36Bytes_NoGrow-4 436.9n ± 2% 342.3n ± 1% -21.64% (p=0.000 n=20) BuildString_Builder/100Write_36Bytes_NoGrow-4 4.403µ ± 1% 2.381µ ± 2% -45.91% (p=0.000 n=20) BuildString_Builder/1000Write_36Bytes_NoGrow-4 48.28µ ± 2% 21.38µ ± 2% -55.71% (p=0.000 n=20) See the design document for more discussion of the strings.Builder case. 
For testing, we add tests that attempt to exercise different aspects of the underlying freegc and mallocgc behavior on the reuse path. Validating the assist credit manipulations turned out to be subtle, so a test for that is added in the next CL. There are also invariant checks added, controlled by consts (primarily the doubleCheckReusable const currently). This CL also adds support in runtime.freegc for GODEBUG=clobberfree=1 to immediately overwrite freed memory with 0xdeadbeef, which can help a higher-level test fail faster in the event of a bug, and also the GC specifically looks for that pattern and throws a fatal error if it unexpectedly finds it. A later CL (currently experimental) adds GODEBUG=clobberfree=2, which uses mprotect (or VirtualProtect on Windows) to set freed memory to fault if read or written, until the runtime later unprotects the memory on the mallocgc reuse path. For the cases where a normal allocation is happening without any reuse, some initial microbenchmarks suggest the impact of these changes could be small to negligible (at least with GOAMD64=v3): goos: linux goarch: amd64 pkg: runtime cpu: AMD EPYC 7B13 │ base-512M-v3.bench │ ps16-512M-goamd64-v3.bench │ │ sec/op │ sec/op vs base │ Malloc8-16 11.01n ± 1% 10.94n ± 1% -0.68% (p=0.038 n=20) Malloc16-16 17.15n ± 1% 17.05n ± 0% -0.55% (p=0.007 n=20) Malloc32-16 18.65n ± 1% 18.42n ± 0% -1.26% (p=0.000 n=20) MallocTypeInfo8-16 18.63n ± 0% 18.36n ± 0% -1.45% (p=0.000 n=20) MallocTypeInfo16-16 22.32n ± 0% 22.65n ± 0% +1.50% (p=0.000 n=20) MallocTypeInfo32-16 23.37n ± 0% 23.89n ± 0% +2.23% (p=0.000 n=20) geomean 18.02n 18.01n -0.05% These last benchmark results include the runtime updates to support span classes with pointers (which was originally part of this CL, but later split out for ease of review). 
Updates #74299 Change-Id: Icceaa0f79f85c70cd1a718f9a4e7f0cf3d77803c Reviewed-on: https://go-review.googlesource.com/c/go/+/673695 LUCI-TryBot-Result: Go LUCI Reviewed-by: Michael Knyszek Reviewed-by: Junyang Shao --- src/cmd/link/internal/loader/loader.go | 3 +- src/cmd/link/link_test.go | 1 + src/cmd/link/testdata/linkname/freegc.go | 18 ++ src/runtime/export_test.go | 9 + src/runtime/malloc.go | 329 ++++++++++++++++++++++++++++++- src/runtime/malloc_test.go | 286 +++++++++++++++++++++++++++ src/runtime/mcache.go | 52 ++++- src/runtime/mheap.go | 2 +- 8 files changed, 693 insertions(+), 7 deletions(-) create mode 100644 src/cmd/link/testdata/linkname/freegc.go (limited to 'src/runtime') diff --git a/src/cmd/link/internal/loader/loader.go b/src/cmd/link/internal/loader/loader.go index 2d386c0c65..9ab55643f6 100644 --- a/src/cmd/link/internal/loader/loader.go +++ b/src/cmd/link/internal/loader/loader.go @@ -2464,10 +2464,11 @@ var blockedLinknames = map[string][]string{ // Experimental features "runtime.goroutineLeakGC": {"runtime/pprof"}, "runtime.goroutineleakcount": {"runtime/pprof"}, + "runtime.freegc": {}, // disallow all packages // Others "net.newWindowsFile": {"net"}, // pushed from os "testing/synctest.testingSynctestTest": {"testing/synctest"}, // pushed from testing - "runtime.addmoduledata": {}, // disallow all package + "runtime.addmoduledata": {}, // disallow all packages } // check if a linkname reference to symbol s from pkg is allowed diff --git a/src/cmd/link/link_test.go b/src/cmd/link/link_test.go index 31822d21f3..6ab1246c81 100644 --- a/src/cmd/link/link_test.go +++ b/src/cmd/link/link_test.go @@ -1616,6 +1616,7 @@ func TestCheckLinkname(t *testing.T) { // pull linkname of a builtin symbol is not ok {"builtin.go", false}, {"addmoduledata.go", false}, + {"freegc.go", false}, // legacy bad linkname is ok, for now {"fastrand.go", true}, {"badlinkname.go", true}, diff --git a/src/cmd/link/testdata/linkname/freegc.go 
b/src/cmd/link/testdata/linkname/freegc.go new file mode 100644 index 0000000000..390063f8e9 --- /dev/null +++ b/src/cmd/link/testdata/linkname/freegc.go @@ -0,0 +1,18 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Linkname runtime.freegc is not allowed. + +package main + +import ( + _ "unsafe" +) + +//go:linkname freegc runtime.freegc +func freegc() + +func main() { + freegc() +} diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index 731ba5d6b9..48dcf5aa39 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -639,6 +639,15 @@ func RunGetgThreadSwitchTest() { } } +// Expose freegc for testing. +func Freegc(p unsafe.Pointer, size uintptr, noscan bool) { + freegc(p, size, noscan) +} + +const SizeSpecializedMallocEnabled = sizeSpecializedMallocEnabled + +const RuntimeFreegcEnabled = runtimeFreegcEnabled + const ( PageSize = pageSize PallocChunkPages = pallocChunkPages diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index fc4f21b532..13f5fc3081 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -1080,7 +1080,8 @@ func (c *mcache) nextFree(spc spanClass) (v gclinkptr, s *mspan, checkGCTrigger // // We might consider turning these on by default; many of them previously were. // They account for a few % of mallocgc's cost though, which does matter somewhat -// at scale. +// at scale. (When testing changes to malloc, consider enabling this, and also +// some function-local 'doubleCheck' consts such as in mbitmap.go currently.) const doubleCheckMalloc = false // sizeSpecializedMallocEnabled is the set of conditions where we enable the size-specialized @@ -1089,6 +1090,12 @@ const doubleCheckMalloc = false // properly on plan9, so size-specialized malloc is also disabled on plan9. 
const sizeSpecializedMallocEnabled = goexperiment.SizeSpecializedMalloc && GOOS != "plan9" && !asanenabled && !raceenabled && !msanenabled && !valgrindenabled +// runtimeFreegcEnabled is the set of conditions where we enable the runtime.freegc +// implementation and the corresponding allocation-related changes: the experiment must be +// enabled, and none of the memory sanitizers should be enabled. We allow the race detector, +// in contrast to sizeSpecializedMallocEnabled. +const runtimeFreegcEnabled = goexperiment.RuntimeFreegc && !asanenabled && !msanenabled && !valgrindenabled + // Allocate an object of size bytes. // Small objects are allocated from the per-P cache's free lists. // Large objects (> 32 kB) are allocated straight from the heap. @@ -1150,7 +1157,8 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { size += asanRZ } - // Assist the GC if needed. + // Assist the GC if needed. (On the reuse path, we currently compensate for this; + // changes here might require changes there.) if gcBlackenEnabled != 0 { deductAssistCredit(size) } @@ -1413,6 +1421,16 @@ func mallocgcSmallNoscan(size uintptr, typ *_type, needzero bool) (unsafe.Pointe size = uintptr(gc.SizeClassToSize[sizeclass]) spc := makeSpanClass(sizeclass, true) span := c.alloc[spc] + + // First, check for a reusable object. + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + // We have a reusable object, use it. + x := mallocgcSmallNoscanReuse(c, span, spc, size, needzero) + mp.mallocing = 0 + releasem(mp) + return x, size + } + v := nextFreeFast(span) if v == 0 { v, span, checkGCTrigger = c.nextFree(spc) @@ -1472,6 +1490,55 @@ func mallocgcSmallNoscan(size uintptr, typ *_type, needzero bool) (unsafe.Pointe return x, size } +// mallocgcSmallNoscanReuse returns a previously freed noscan object after preparing it for reuse. +// It must only be called if hasReusableNoscan returned true. 
+func mallocgcSmallNoscanReuse(c *mcache, span *mspan, spc spanClass, size uintptr, needzero bool) unsafe.Pointer { + // TODO(thepudds): could nextFreeFast, nextFree and nextReusable return unsafe.Pointer? + // Maybe doesn't matter. gclinkptr might be for historical reasons. + v, span := c.nextReusableNoScan(span, spc) + x := unsafe.Pointer(v) + + // Compensate for the GC assist credit deducted in mallocgc (before calling us and + // after we return) because this is not a newly allocated object. We use the full slot + // size (elemsize) here because that's what mallocgc deducts overall. Note we only + // adjust this when gcBlackenEnabled is true, which follows mallocgc behavior. + // TODO(thepudds): a follow-up CL adds a more specific test of our assist credit + // handling, including for validating internal fragmentation handling. + if gcBlackenEnabled != 0 { + addAssistCredit(size) + } + + // This is a previously used object, so only check needzero (and not span.needzero) + // for clearing. + if needzero { + memclrNoHeapPointers(x, size) + } + + // See publicationBarrier comment in mallocgcSmallNoscan. + publicationBarrier() + + // Finish and return. Note that we do not update span.freeIndexForScan, profiling info, + // nor do we check gcTrigger. + // TODO(thepudds): the current approach is viable for a GOEXPERIMENT, but + // means we do not profile reused heap objects. Ultimately, we will need a better + // approach for profiling, or at least ensure we are not introducing bias in the + // profiled allocations. + // TODO(thepudds): related, we probably want to adjust how allocs and frees are counted + // in the existing stats. Currently, reused objects are not counted as allocs nor + // frees, but instead roughly appear as if the original heap object lived on. We + // probably will also want some additional runtime/metrics, and generally think about + // user-facing observability & diagnostics, though all this likely can wait for an + // official proposal. 
+ if writeBarrier.enabled { + // Allocate black during GC. + // All slots hold nil so no scanning is needed. + // This may be racing with GC so do it atomically if there can be + // a race marking the bit. + gcmarknewobject(span, uintptr(x)) + } + return x +} + func mallocgcSmallScanNoHeader(size uintptr, typ *_type) (unsafe.Pointer, uintptr) { // Set mp.mallocing to keep from being preempted by GC. mp := acquirem() @@ -1816,8 +1883,6 @@ func postMallocgcDebug(x unsafe.Pointer, elemsize uintptr, typ *_type) { // by size bytes, and assists the GC if necessary. // // Caller must be preemptible. -// -// Returns the G for which the assist credit was accounted. func deductAssistCredit(size uintptr) { // Charge the current user G for this allocation. assistG := getg() @@ -1836,6 +1901,262 @@ func deductAssistCredit(size uintptr) { } } +// addAssistCredit is like deductAssistCredit, +// but adds credit rather than removes, +// and never calls gcAssistAlloc. +func addAssistCredit(size uintptr) { + // Credit the current user G. + assistG := getg() + if assistG.m.curg != nil { // TODO(thepudds): do we need to do this? + assistG = assistG.m.curg + } + // Credit the size against the G. + assistG.gcAssistBytes += int64(size) +} + +const ( + // doubleCheckReusable enables some additional invariant checks for the + // runtime.freegc and reusable objects. Note that some of these checks alter timing, + // and it is good to test changes with and without this enabled. + doubleCheckReusable = false + + // debugReusableLog enables some printlns for runtime.freegc and reusable objects. + debugReusableLog = false +) + +// freegc records that a heap object is reusable and available for +// immediate reuse in a subsequent mallocgc allocation, without +// needing to wait for the GC cycle to progress. +// +// The information is recorded in a free list stored in the +// current P's mcache. 
The caller must pass in the user size +// and whether the object has pointers, which allows a faster free +// operation. +// +// freegc must be called by the effective owner of ptr who knows +// the pointer is logically dead, with no possible aliases that might +// be used past that moment. In other words, ptr must be the +// last and only pointer to its referent. +// +// The intended caller is the compiler. +// +// Note: please do not send changes that attempt to add freegc calls +// to the standard library. +// +// ptr must point to a heap object or into the current g's stack, +// in which case freegc is a no-op. In particular, ptr must not point +// to memory in the data or bss sections, which is partially enforced. +// For objects with a malloc header, ptr should point mallocHeaderSize bytes +// past the base; otherwise, ptr should point to the base of the heap object. +// In other words, ptr should be the same pointer that was returned by mallocgc. +// +// In addition, the caller must know that ptr's object has no specials, such +// as might have been created by a call to SetFinalizer or AddCleanup. +// (Internally, the runtime deals appropriately with internally-created +// specials, such as specials for memory profiling). +// +// If the size of ptr's object is less than 16 bytes or greater than +// 32KiB - gc.MallocHeaderSize bytes, freegc is currently a no-op. It must only +// be called in alloc-safe places. It currently throws if noscan is false +// (support for which is implemented in a later CL in our stack). +// +// Note that freegc accepts an unsafe.Pointer and hence keeps the pointer +// alive. It therefore could be a pessimization in some cases (such +// as a long-lived function) if the caller does not call freegc before +// or roughly when the liveness analysis of the compiler +// would otherwise have determined ptr's object is reclaimable by the GC. 
+func freegc(ptr unsafe.Pointer, size uintptr, noscan bool) bool { + if !runtimeFreegcEnabled || sizeSpecializedMallocEnabled || !reusableSize(size) { + // TODO(thepudds): temporarily disable freegc with SizeSpecializedMalloc until we finish integrating. + return false + } + if ptr == nil { + throw("freegc nil") + } + + // Set mp.mallocing to keep from being preempted by GC. + // Otherwise, the GC could flush our mcache or otherwise cause problems. + mp := acquirem() + if mp.mallocing != 0 { + throw("freegc deadlock") + } + if mp.gsignal == getg() { + throw("freegc during signal") + } + mp.mallocing = 1 + + if mp.curg.stack.lo <= uintptr(ptr) && uintptr(ptr) < mp.curg.stack.hi { + // This points into our stack, so free is a no-op. + mp.mallocing = 0 + releasem(mp) + return false + } + + if doubleCheckReusable { + // TODO(thepudds): we could enforce no free on globals in bss or data. Maybe by + // checking span via spanOf or spanOfHeap, or maybe walk from firstmoduledata + // like isGoPointerWithoutSpan, or activeModules, or something. If so, we might + // be able to delay checking until reuse (e.g., check span just before reusing, + // though currently we don't always need to lookup a span on reuse). If we think + // no usage patterns could result in globals, maybe enforcement for globals could + // be behind -d=checkptr=1 or similar. The compiler can have knowledge of where + // a variable is allocated, but stdlib does not, although there are certain + // usage patterns that cannot result in a global. + // TODO(thepudds): separately, consider a local debugReusableMcacheOnly here + // to ignore freed objects if not in mspan in mcache, maybe when freeing and reading, + // by checking something like s.base() <= uintptr(v) && uintptr(v) < s.limit. Or + // maybe a GODEBUG or compiler debug flag. 
+ span := spanOf(uintptr(ptr)) + if span == nil { + throw("nextReusable: nil span for pointer in free list") + } + if state := span.state.get(); state != mSpanInUse { + throw("nextReusable: span is not in use") + } + } + + if debug.clobberfree != 0 { + clobberfree(ptr, size) + } + + // We first check if p is still in our per-P cache. + // Get our per-P cache for small objects. + c := getMCache(mp) + if c == nil { + throw("freegc called without a P or outside bootstrapping") + } + + v := uintptr(ptr) + if !noscan && !heapBitsInSpan(size) { + // mallocgcSmallScanHeader expects to get the base address of the object back + // from the findReusable funcs (as well as from nextFreeFast and nextFree), and + // not mallocHeaderSize bytes into a object, so adjust that here. + v -= mallocHeaderSize + + // The size class lookup wants size to be adjusted by mallocHeaderSize. + size += mallocHeaderSize + } + + // TODO(thepudds): should verify (behind doubleCheckReusable constant) that our calculated + // sizeclass here matches what's in span found via spanOf(ptr) or findObject(ptr). + var sizeclass uint8 + if size <= gc.SmallSizeMax-8 { + sizeclass = gc.SizeToSizeClass8[divRoundUp(size, gc.SmallSizeDiv)] + } else { + sizeclass = gc.SizeToSizeClass128[divRoundUp(size-gc.SmallSizeMax, gc.LargeSizeDiv)] + } + + spc := makeSpanClass(sizeclass, noscan) + s := c.alloc[spc] + + if debugReusableLog { + if s.base() <= uintptr(v) && uintptr(v) < s.limit { + println("freegc [in mcache]:", hex(uintptr(v)), "sweepgen:", mheap_.sweepgen, "writeBarrier.enabled:", writeBarrier.enabled) + } else { + println("freegc [NOT in mcache]:", hex(uintptr(v)), "sweepgen:", mheap_.sweepgen, "writeBarrier.enabled:", writeBarrier.enabled) + } + } + + if noscan { + c.addReusableNoscan(spc, uintptr(v)) + } else { + // TODO(thepudds): implemented in later CL in our stack. 
+ throw("freegc called for object with pointers, not yet implemented") + } + + // For stats, for now we leave allocCount alone, roughly pretending to the rest + // of the system that this potential reuse never happened. + + mp.mallocing = 0 + releasem(mp) + + return true +} + +// nextReusableNoScan returns the next reusable object for a noscan span, +// or 0 if no reusable object is found. +func (c *mcache) nextReusableNoScan(s *mspan, spc spanClass) (gclinkptr, *mspan) { + if !runtimeFreegcEnabled { + return 0, s + } + + // Pop a reusable pointer from the free list for this span class. + v := c.reusableNoscan[spc] + if v == 0 { + return 0, s + } + c.reusableNoscan[spc] = v.ptr().next + + if debugReusableLog { + println("reusing from ptr free list:", hex(v), "sweepgen:", mheap_.sweepgen, "writeBarrier.enabled:", writeBarrier.enabled) + } + if doubleCheckReusable { + doubleCheckNextReusable(v) // debug only sanity check + } + + // For noscan spans, we only need the span if the write barrier is enabled (so that our caller + // can call gcmarknewobject to allocate black). If the write barrier is enabled, we can skip + // looking up the span when the pointer is in a span in the mcache. + if !writeBarrier.enabled { + return v, nil + } + if s.base() <= uintptr(v) && uintptr(v) < s.limit { + // Return the original span. + return v, s + } + + // We must find and return the span. + span := spanOf(uintptr(v)) + if span == nil { + // TODO(thepudds): construct a test that triggers this throw. + throw("nextReusableNoScan: nil span for pointer in reusable object free list") + } + + return v, span +} + +// doubleCheckNextReusable checks some invariants. +// TODO(thepudds): will probably delete some of this. Can mostly be ignored for review. +func doubleCheckNextReusable(v gclinkptr) { + // TODO(thepudds): should probably take the spanClass as well to confirm expected + // sizeclass match. 
+ _, span, objIndex := findObject(uintptr(v), 0, 0) + if span == nil { + throw("nextReusable: nil span for pointer in free list") + } + if state := span.state.get(); state != mSpanInUse { + throw("nextReusable: span is not in use") + } + if uintptr(v) < span.base() || uintptr(v) >= span.limit { + throw("nextReusable: span is not in range") + } + if span.objBase(uintptr(v)) != uintptr(v) { + print("nextReusable: v=", hex(v), " base=", hex(span.objBase(uintptr(v))), "\n") + throw("nextReusable: v is non-base-address for object found on pointer free list") + } + if span.isFree(objIndex) { + throw("nextReusable: pointer on free list is free") + } + + const debugReusableEnsureSwept = false + if debugReusableEnsureSwept { + // Currently disabled. + // Note: ensureSwept here alters behavior (not just an invariant check). + span.ensureSwept() + if span.isFree(objIndex) { + throw("nextReusable: pointer on free list is free after ensureSwept") + } + } +} + +// reusableSize reports if size is a currently supported size for a reusable object. +func reusableSize(size uintptr) bool { + if size < maxTinySize || size > maxSmallSize-mallocHeaderSize { + return false + } + return true +} + // memclrNoHeapPointersChunked repeatedly calls memclrNoHeapPointers // on chunks of the buffer to be zeroed, with opportunities for preemption // along the way. memclrNoHeapPointers contains no safepoints and also diff --git a/src/runtime/malloc_test.go b/src/runtime/malloc_test.go index bf58947bbc..6285cdaff7 100644 --- a/src/runtime/malloc_test.go +++ b/src/runtime/malloc_test.go @@ -16,6 +16,7 @@ import ( "runtime" . "runtime" "strings" + "sync" "sync/atomic" "testing" "time" @@ -234,6 +235,275 @@ func TestTinyAllocIssue37262(t *testing.T) { runtime.Releasem() } +// TestFreegc does basic testing of explicit frees. +func TestFreegc(t *testing.T) { + tests := []struct { + size string + f func(noscan bool) func(*testing.T) + noscan bool + }{ + // Types without pointers. 
+ {"size=16", testFreegc[[16]byte], true}, // smallest we support currently + {"size=17", testFreegc[[17]byte], true}, + {"size=64", testFreegc[[64]byte], true}, + {"size=500", testFreegc[[500]byte], true}, + {"size=512", testFreegc[[512]byte], true}, + {"size=4096", testFreegc[[4096]byte], true}, + {"size=32KiB-8", testFreegc[[1<<15 - 8]byte], true}, // max noscan small object for 64-bit + } + + // Run the tests twice if not in -short mode or not otherwise saving test time. + // First while manually calling runtime.GC to slightly increase isolation (perhaps making + // problems more reproducible). + for _, tt := range tests { + runtime.GC() + t.Run(fmt.Sprintf("gc=yes/ptrs=%v/%s", !tt.noscan, tt.size), tt.f(tt.noscan)) + } + runtime.GC() + + if testing.Short() || !RuntimeFreegcEnabled || runtime.Raceenabled { + return + } + + // Again, but without manually calling runtime.GC in the loop (perhaps less isolation might + // trigger problems). + for _, tt := range tests { + t.Run(fmt.Sprintf("gc=no/ptrs=%v/%s", !tt.noscan, tt.size), tt.f(tt.noscan)) + } + runtime.GC() +} + +func testFreegc[T comparable](noscan bool) func(*testing.T) { + // We use stressMultiple to influence the duration of the tests. + // When testing freegc changes, stressMultiple can be increased locally + // to test longer or in some cases with more goroutines. + // It can also be helpful to test with GODEBUG=clobberfree=1 and + // with and without doubleCheckMalloc and doubleCheckReusable enabled. + stressMultiple := 10 + if testing.Short() || !RuntimeFreegcEnabled || runtime.Raceenabled { + stressMultiple = 1 + } + + return func(t *testing.T) { + alloc := func() *T { + // Force heap alloc, plus some light validation of zeroed memory. 
+ t.Helper() + p := Escape(new(T)) + var zero T + if *p != zero { + t.Fatalf("allocator returned non-zero memory: %v", *p) + } + return p + } + + free := func(p *T) { + t.Helper() + var zero T + if *p != zero { + t.Fatalf("found non-zero memory before freeing (tests do not modify memory): %v", *p) + } + runtime.Freegc(unsafe.Pointer(p), unsafe.Sizeof(*p), noscan) + } + + t.Run("basic-free", func(t *testing.T) { + // Test that freeing a live heap object doesn't crash. + for range 100 { + p := alloc() + free(p) + } + }) + + t.Run("stack-free", func(t *testing.T) { + // Test that freeing a stack object doesn't crash. + for range 100 { + var x [32]byte + var y [32]*int + runtime.Freegc(unsafe.Pointer(&x), unsafe.Sizeof(x), true) // noscan + runtime.Freegc(unsafe.Pointer(&y), unsafe.Sizeof(y), false) // !noscan + } + }) + + // Check our allocations. These tests rely on the + // current implementation treating a re-used object + // as not adding to the allocation counts seen + // by testing.AllocsPerRun. (This is not the desired + // long-term behavior, but it is the current behavior and + // makes these tests convenient). + + t.Run("allocs-baseline", func(t *testing.T) { + // Baseline result without any explicit free. + allocs := testing.AllocsPerRun(100, func() { + for range 100 { + p := alloc() + _ = p + } + }) + if allocs < 100 { + // TODO(thepudds): we get exactly 100 for almost all the tests, but investigate why + // ~101 allocs for TestFreegc/ptrs=true/size=32KiB-8. + t.Fatalf("expected >=100 allocations, got %v", allocs) + } + }) + + t.Run("allocs-with-free", func(t *testing.T) { + // Same allocations, but now using explicit free so that + // no allocs get reported. (Again, not the desired long-term behavior). 
+ if SizeSpecializedMallocEnabled { + t.Skip("temporarily skipping alloc tests for GOEXPERIMENT=sizespecializedmalloc") + } + if !RuntimeFreegcEnabled { + t.Skip("skipping alloc tests with runtime.freegc disabled") + } + allocs := testing.AllocsPerRun(100, func() { + for range 100 { + p := alloc() + free(p) + } + }) + if allocs != 0 { + t.Fatalf("expected 0 allocations, got %v", allocs) + } + }) + + t.Run("free-multiple", func(t *testing.T) { + // Multiple allocations outstanding before explicitly freeing, + // but still within the limit of our smallest free list size + // so that no allocs are reported. (Again, not long-term behavior). + if SizeSpecializedMallocEnabled { + t.Skip("temporarily skipping alloc tests for GOEXPERIMENT=sizespecializedmalloc") + } + if !RuntimeFreegcEnabled { + t.Skip("skipping alloc tests with runtime.freegc disabled") + } + const maxOutstanding = 20 + s := make([]*T, 0, maxOutstanding) + allocs := testing.AllocsPerRun(100*stressMultiple, func() { + s = s[:0] + for range maxOutstanding { + p := alloc() + s = append(s, p) + } + for _, p := range s { + free(p) + } + }) + if allocs != 0 { + t.Fatalf("expected 0 allocations, got %v", allocs) + } + }) + + if runtime.GOARCH == "wasm" { + // TODO(thepudds): for wasm, double-check if just slow, vs. some test logic problem, + // vs. something else. It might have been wasm was slowest with tests that spawn + // many goroutines, which might be expected for wasm. This skip might no longer be + // needed now that we have tuned test execution time more, or perhaps wasm should just + // always run in short mode, which might also let us remove this skip. + t.Skip("skipping remaining freegc tests, was timing out on wasm") + } + + t.Run("free-many", func(t *testing.T) { + // Confirm we are graceful if we have more freed elements at once + // than the max free list size. 
+ s := make([]*T, 0, 1000) + iterations := stressMultiple * stressMultiple // currently 1 or 100 depending on -short + for range iterations { + s = s[:0] + for range 1000 { + p := alloc() + s = append(s, p) + } + for _, p := range s { + free(p) + } + } + }) + + t.Run("duplicate-check", func(t *testing.T) { + // A simple duplicate allocation test. We track what should be the set + // of live pointers in a map across a series of allocs and frees, + // and fail if a live pointer value is returned by an allocation. + // TODO: maybe add randomness? allow more live pointers? do across goroutines? + live := make(map[uintptr]bool) + for i := range 100 * stressMultiple { + var s []*T + // Alloc 10 times, tracking the live pointer values. + for j := range 10 { + p := alloc() + uptr := uintptr(unsafe.Pointer(p)) + if live[uptr] { + t.Fatalf("TestFreeLive: found duplicate pointer (0x%x). i: %d j: %d", uptr, i, j) + } + live[uptr] = true + s = append(s, p) + } + // Explicitly free those pointers, removing them from the live map. + for k := range s { + p := s[k] + s[k] = nil + uptr := uintptr(unsafe.Pointer(p)) + free(p) + delete(live, uptr) + } + } + }) + + t.Run("free-other-goroutine", func(t *testing.T) { + // Use explicit free, but the free happens on a different goroutine than the alloc. + // This also lightly simulates how the free code sees P migration or flushing + // the mcache, assuming we have > 1 P. (Not using testing.AllocsPerRun here). + iterations := 10 * stressMultiple * stressMultiple // currently 10 or 1000 depending on -short + for _, capacity := range []int{2} { + for range iterations { + ch := make(chan *T, capacity) + var wg sync.WaitGroup + for range 2 { + wg.Add(1) + go func() { + defer wg.Done() + for p := range ch { + free(p) + } + }() + } + for range 100 { + p := alloc() + ch <- p + } + close(ch) + wg.Wait() + } + } + }) + + t.Run("many-goroutines", func(t *testing.T) { + // Allocate across multiple goroutines, freeing on the same goroutine. 
+ // TODO: probably remove the duplicate checking here; not that useful. + counts := []int{1, 2, 4, 8, 10 * stressMultiple} + for _, goroutines := range counts { + var wg sync.WaitGroup + for range goroutines { + wg.Add(1) + go func() { + defer wg.Done() + live := make(map[uintptr]bool) + for range 100 * stressMultiple { + p := alloc() + uptr := uintptr(unsafe.Pointer(p)) + if live[uptr] { + panic("TestFreeLive: found duplicate pointer") + } + live[uptr] = true + free(p) + delete(live, uptr) + } + }() + } + wg.Wait() + } + }) + } +} + func TestPageCacheLeak(t *testing.T) { defer GOMAXPROCS(GOMAXPROCS(1)) leaked := PageCachePagesLeaked() @@ -337,6 +607,13 @@ func BenchmarkMalloc16(b *testing.B) { } } +func BenchmarkMalloc32(b *testing.B) { + for i := 0; i < b.N; i++ { + p := new([4]int64) + Escape(p) + } +} + func BenchmarkMallocTypeInfo8(b *testing.B) { for i := 0; i < b.N; i++ { p := new(struct { @@ -355,6 +632,15 @@ func BenchmarkMallocTypeInfo16(b *testing.B) { } } +func BenchmarkMallocTypeInfo32(b *testing.B) { + for i := 0; i < b.N; i++ { + p := new(struct { + p [32 / unsafe.Sizeof(uintptr(0))]*int + }) + Escape(p) + } +} + type LargeStruct struct { x [16][]byte } diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go index cade81031d..82872f1454 100644 --- a/src/runtime/mcache.go +++ b/src/runtime/mcache.go @@ -44,7 +44,17 @@ type mcache struct { // The rest is not accessed on every malloc. - alloc [numSpanClasses]*mspan // spans to allocate from, indexed by spanClass + // alloc contains spans to allocate from, indexed by spanClass. + alloc [numSpanClasses]*mspan + + // TODO(thepudds): better to interleave alloc and reusableScan/reusableNoscan so that + // a single malloc call can often access both in the same cache line for a given spanClass. + // It's not interleaved right now in part to have slightly smaller diff, and might be + // negligible effect on current microbenchmarks. 
+ + // reusableNoscan contains linked lists of reusable noscan heap objects, indexed by spanClass. + // The next pointers are stored in the first word of the heap objects. + reusableNoscan [numSpanClasses]gclinkptr stackcache [_NumStackOrders]stackfreelist @@ -96,6 +106,7 @@ func allocmcache() *mcache { c.alloc[i] = &emptymspan } c.nextSample = nextSample() + return c } @@ -153,6 +164,16 @@ func (c *mcache) refill(spc spanClass) { if s.allocCount != s.nelems { throw("refill of span with free space remaining") } + + // TODO(thepudds): we might be able to allow mallocgcTiny to reuse 16 byte objects from spc==5, + // but for now, just clear our reusable objects for tinySpanClass. + if spc == tinySpanClass { + c.reusableNoscan[spc] = 0 + } + if c.reusableNoscan[spc] != 0 { + throw("refill of span with reusable pointers remaining on pointer free list") + } + if s != &emptymspan { // Mark this span as no longer cached. if s.sweepgen != mheap_.sweepgen+3 { @@ -312,6 +333,13 @@ func (c *mcache) releaseAll() { c.tinyAllocs = 0 memstats.heapStats.release() + // Clear the reusable linked lists. + // For noscan objects, the nodes of the linked lists are the reusable heap objects themselves, + // so we can simply clear the linked list head pointers. + // TODO(thepudds): consider having debug logging of a non-empty reusable lists getting cleared, + // maybe based on the existing debugReusableLog. + clear(c.reusableNoscan[:]) + // Update heapLive and heapScan. gcController.update(dHeapLive, scanAlloc) } @@ -339,3 +367,25 @@ func (c *mcache) prepareForSweep() { stackcache_clear(c) c.flushGen.Store(mheap_.sweepgen) // Synchronizes with gcStart } + +// addReusableNoscan adds a noscan object pointer to the reusable pointer free list +// for a span class. +func (c *mcache) addReusableNoscan(spc spanClass, ptr uintptr) { + if !runtimeFreegcEnabled { + return + } + + // Add to the reusable pointers free list. 
+ v := gclinkptr(ptr) + v.ptr().next = c.reusableNoscan[spc] + c.reusableNoscan[spc] = v +} + +// hasReusableNoscan reports whether there is a reusable object available for +// a noscan spc. +func (c *mcache) hasReusableNoscan(spc spanClass) bool { + if !runtimeFreegcEnabled { + return false + } + return c.reusableNoscan[spc] != 0 +} diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 08a0057be7..d2ff063b00 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -435,7 +435,7 @@ type mspan struct { // indicating a free object. freeindex is then adjusted so that subsequent scans begin // just past the newly discovered free object. // - // If freeindex == nelems, this span has no free objects. + // If freeindex == nelems, this span has no free objects, though might have reusable objects. // // allocBits is a bitmap of objects in this span. // If n >= freeindex and allocBits[n/8] & (1<<(n%8)) is 0 -- cgit v1.3 From 120f1874ef380362cf8b8c4775a327bcd417ff70 Mon Sep 17 00:00:00 2001 From: thepudds Date: Mon, 3 Nov 2025 16:40:40 -0500 Subject: runtime: add more precise test of assist credit handling for runtime.freegc This CL is part of a set of CLs that attempt to reduce how much work the GC must do. See the design in https://go.dev/design/74299-runtime-freegc This CL adds a better test of assist credit handling when heap objects are being reused after a runtime.freegc call. The main approach is bracketing alloc/free pairs with measurements of the assist credit before and after, and hoping to see a net zero change in the assist credit. However, validating the desired behavior is perhaps a bit subtle. To help stabilize the measurements, we do acquirem in the test code to avoid being preempted during the measurements to reduce other code's ability to adjust the assist credit while we are measuring, and we also reduce GOMAXPROCS to 1. 
This test currently does fail if we deliberately introduce bugs in the runtime.freegc implementation such as if we: - never adjust the assist credit when reusing an object, or - always adjust the assist credit when reusing an object, or - deliberately mishandle internal fragmentation. The two main cases of current interest for testing runtime.freegc are when over the course of our bracketed measurements gcBlackenEnabled is either true or false. The test attempts to exercise both of those cases by running the GC continually in the background (which we can see seems effective based on logging and by how our deliberate bugs fail). This passes ~10K test executions locally via stress. A small note to the future: a previous incarnation of this test (circa patchset 11 of this CL) did not do acquirem but had an approach of ignoring certain measurements, which also was able to pass ~10K runs via stress. The current version in this CL is simpler, but recording the existence of the prior version here in case it is useful in the future. (Hopefully not.) Updates #74299 Change-Id: I46c7e0295d125f5884fee0cc3d3d31aedc7e5ff4 Reviewed-on: https://go-review.googlesource.com/c/go/+/717520 Reviewed-by: Michael Knyszek Reviewed-by: Junyang Shao LUCI-TryBot-Result: Go LUCI --- src/runtime/export_test.go | 28 ++++++++++++++ src/runtime/malloc_test.go | 93 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 117 insertions(+), 4 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index 48dcf5aa39..8438603b9e 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -644,6 +644,25 @@ func Freegc(p unsafe.Pointer, size uintptr, noscan bool) { freegc(p, size, noscan) } +// Expose gcAssistBytes for the current g for testing. +func AssistCredit() int64 { + assistG := getg() + if assistG.m.curg != nil { + assistG = assistG.m.curg + } + return assistG.gcAssistBytes +} + +// Expose gcBlackenEnabled for testing. 
+func GcBlackenEnable() bool { + // Note we do a non-atomic load here. + // Some checks against gcBlackenEnabled (e.g., in mallocgc) + // are currently done via non-atomic load for performance reasons, + // but other checks are done via atomic load (e.g., in mgcmark.go), + // so interpreting this value in a test may be subtle. + return gcBlackenEnabled != 0 +} + const SizeSpecializedMallocEnabled = sizeSpecializedMallocEnabled const RuntimeFreegcEnabled = runtimeFreegcEnabled @@ -1487,6 +1506,15 @@ func Releasem() { releasem(getg().m) } +// GoschedIfBusy is an explicit preemption check to call back +// into the scheduler. This is useful for tests that run code +// which spend most of their time as non-preemptible, as it +// can be placed right after becoming preemptible again to ensure +// that the scheduler gets a chance to preempt the goroutine. +func GoschedIfBusy() { + goschedIfBusy() +} + type PIController struct { piController } diff --git a/src/runtime/malloc_test.go b/src/runtime/malloc_test.go index 6285cdaff7..10c20e6c23 100644 --- a/src/runtime/malloc_test.go +++ b/src/runtime/malloc_test.go @@ -249,6 +249,7 @@ func TestFreegc(t *testing.T) { {"size=500", testFreegc[[500]byte], true}, {"size=512", testFreegc[[512]byte], true}, {"size=4096", testFreegc[[4096]byte], true}, + {"size=20000", testFreegc[[20000]byte], true}, // not power of 2 or spc boundary {"size=32KiB-8", testFreegc[[1<<15 - 8]byte], true}, // max noscan small object for 64-bit } @@ -300,7 +301,7 @@ func testFreegc[T comparable](noscan bool) func(*testing.T) { t.Helper() var zero T if *p != zero { - t.Fatalf("found non-zero memory before freeing (tests do not modify memory): %v", *p) + t.Fatalf("found non-zero memory before freegc (tests do not modify memory): %v", *p) } runtime.Freegc(unsafe.Pointer(p), unsafe.Sizeof(*p), noscan) } @@ -405,7 +406,7 @@ func testFreegc[T comparable](noscan bool) func(*testing.T) { // Confirm we are graceful if we have more freed elements at once // than the 
max free list size. s := make([]*T, 0, 1000) - iterations := stressMultiple * stressMultiple // currently 1 or 100 depending on -short + iterations := stressMultiple * stressMultiple // currently 1 (-short) or 100 for range iterations { s = s[:0] for range 1000 { @@ -431,7 +432,7 @@ func testFreegc[T comparable](noscan bool) func(*testing.T) { p := alloc() uptr := uintptr(unsafe.Pointer(p)) if live[uptr] { - t.Fatalf("TestFreeLive: found duplicate pointer (0x%x). i: %d j: %d", uptr, i, j) + t.Fatalf("found duplicate pointer (0x%x). i: %d j: %d", uptr, i, j) } live[uptr] = true s = append(s, p) @@ -451,7 +452,7 @@ func testFreegc[T comparable](noscan bool) func(*testing.T) { // Use explicit free, but the free happens on a different goroutine than the alloc. // This also lightly simulates how the free code sees P migration or flushing // the mcache, assuming we have > 1 P. (Not using testing.AllocsPerRun here). - iterations := 10 * stressMultiple * stressMultiple // currently 10 or 1000 depending on -short + iterations := 10 * stressMultiple * stressMultiple // currently 10 (-short) or 1000 for _, capacity := range []int{2} { for range iterations { ch := make(chan *T, capacity) @@ -501,6 +502,90 @@ func testFreegc[T comparable](noscan bool) func(*testing.T) { wg.Wait() } }) + + t.Run("assist-credit", func(t *testing.T) { + // Allocate and free using the same span class repeatedly while + // verifying it results in a net zero change in assist credit. + // This helps double-check our manipulation of the assist credit + // during mallocgc/freegc, including in cases when there is + // internal fragmentation when the requested mallocgc size is + // smaller than the size class. + // + // See https://go.dev/cl/717520 for some additional discussion, + // including how we can deliberately cause the test to fail currently + // if we purposefully introduce some assist credit bugs. 
+ if SizeSpecializedMallocEnabled { + // TODO(thepudds): skip this test at this point in the stack; later CL has + // integration with sizespecializedmalloc. + t.Skip("temporarily skip assist credit test for GOEXPERIMENT=sizespecializedmalloc") + } + if !RuntimeFreegcEnabled { + t.Skip("skipping assist credit test with runtime.freegc disabled") + } + + // Use a background goroutine to continuously run the GC. + done := make(chan struct{}) + defer close(done) + go func() { + for { + select { + case <-done: + return + default: + runtime.GC() + } + } + }() + + // If making changes related to this test, consider testing locally with + // larger counts, like 100K or 1M. + counts := []int{1, 2, 10, 100 * stressMultiple} + // Dropping down to GOMAXPROCS=1 might help reduce noise. + defer GOMAXPROCS(GOMAXPROCS(1)) + size := int64(unsafe.Sizeof(*new(T))) + for _, count := range counts { + // Start by forcing a GC to reset this g's assist credit + // and perhaps help us get a cleaner measurement of GC cycle count. + runtime.GC() + for i := range count { + // We disable preemption to reduce other code's ability to adjust this g's + // assist credit or otherwise change things while we are measuring. + Acquirem() + + // We do two allocations per loop, with the second allocation being + // the one we measure. The first allocation tries to ensure at least one + // reusable object on the mspan's free list when we do our measured allocation. + p := alloc() + free(p) + + // Now do our primary allocation of interest, bracketed by measurements. + // We measure more than we strictly need (to log details in case of a failure). 
+ creditStart := AssistCredit() + blackenStart := GcBlackenEnable() + p = alloc() + blackenAfterAlloc := GcBlackenEnable() + creditAfterAlloc := AssistCredit() + free(p) + blackenEnd := GcBlackenEnable() + creditEnd := AssistCredit() + + Releasem() + GoschedIfBusy() + + delta := creditEnd - creditStart + if delta != 0 { + t.Logf("assist credit non-zero delta: %d", delta) + t.Logf("\t| size: %d i: %d count: %d", size, i, count) + t.Logf("\t| credit before: %d credit after: %d", creditStart, creditEnd) + t.Logf("\t| alloc delta: %d free delta: %d", + creditAfterAlloc-creditStart, creditEnd-creditAfterAlloc) + t.Logf("\t| gcBlackenEnable (start / after alloc / end): %v/%v/%v", + blackenStart, blackenAfterAlloc, blackenEnd) + t.FailNow() + } + } + } + }) } } -- cgit v1.3 From 50128a21541e3fd712ad717a223aaa109cb86d43 Mon Sep 17 00:00:00 2001 From: thepudds Date: Sun, 9 Nov 2025 09:24:22 -0500 Subject: runtime: support runtime.freegc in size-specialized mallocs for noscan objects This CL is part of a set of CLs that attempt to reduce how much work the GC must do. See the design in https://go.dev/design/74299-runtime-freegc This CL updates the smallNoScanStub stub in malloc_stubs.go to reuse heap objects that have been freed by runtime.freegc calls, and generates the corresponding size-specialized code in malloc_generated.go. This CL only adds support in the specialized mallocs for noscan heap objects (objects without pointers). A later CL handles objects with pointers. While we are here, we leave a couple of breadcrumbs in mkmalloc.go on how to do the generation. 
Updates #74299 Change-Id: I2657622601a27211554ee862fce057e101767a70 Reviewed-on: https://go-review.googlesource.com/c/go/+/715761 Reviewed-by: Junyang Shao LUCI-TryBot-Result: Go LUCI Reviewed-by: Michael Knyszek --- src/runtime/_mkmalloc/mkmalloc.go | 3 +- src/runtime/malloc.go | 11 +- src/runtime/malloc_generated.go | 651 ++++++++++++++++++++++++++++++++++++++ src/runtime/malloc_stubs.go | 22 +- src/runtime/malloc_test.go | 16 +- 5 files changed, 693 insertions(+), 10 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/_mkmalloc/mkmalloc.go b/src/runtime/_mkmalloc/mkmalloc.go index 986b0aa9f8..1f040c8861 100644 --- a/src/runtime/_mkmalloc/mkmalloc.go +++ b/src/runtime/_mkmalloc/mkmalloc.go @@ -254,7 +254,8 @@ func inline(config generatorConfig) []byte { } // Write out the package and import declarations. - out.WriteString("// Code generated by mkmalloc.go; DO NOT EDIT.\n\n") + out.WriteString("// Code generated by mkmalloc.go; DO NOT EDIT.\n") + out.WriteString("// See overview in malloc_stubs.go.\n\n") out.WriteString("package " + f.Name.Name + "\n\n") for _, importDecl := range importDecls { out.Write(mustFormatNode(fset, importDecl)) diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 13f5fc3081..d49dacaf68 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -1094,6 +1094,8 @@ const sizeSpecializedMallocEnabled = goexperiment.SizeSpecializedMalloc && GOOS // implementation and the corresponding allocation-related changes: the experiment must be // enabled, and none of the memory sanitizers should be enabled. We allow the race detector, // in contrast to sizeSpecializedMallocEnabled. +// TODO(thepudds): it would be nice to check Valgrind integration, though there are some hints +// there might not be any canned tests in tree for Go's integration with Valgrind. const runtimeFreegcEnabled = goexperiment.RuntimeFreegc && !asanenabled && !msanenabled && !valgrindenabled // Allocate an object of size bytes. 
@@ -1966,10 +1968,15 @@ const ( // or roughly when the liveness analysis of the compiler // would otherwise have determined ptr's object is reclaimable by the GC. func freegc(ptr unsafe.Pointer, size uintptr, noscan bool) bool { - if !runtimeFreegcEnabled || sizeSpecializedMallocEnabled || !reusableSize(size) { - // TODO(thepudds): temporarily disable freegc with SizeSpecializedMalloc until we finish integrating. + if !runtimeFreegcEnabled || !reusableSize(size) { return false } + if sizeSpecializedMallocEnabled && !noscan { + // TODO(thepudds): temporarily disable freegc with SizeSpecializedMalloc for pointer types + // until we finish integrating. + return false + } + if ptr == nil { throw("freegc nil") } diff --git a/src/runtime/malloc_generated.go b/src/runtime/malloc_generated.go index 2215dbaddb..5abb61257a 100644 --- a/src/runtime/malloc_generated.go +++ b/src/runtime/malloc_generated.go @@ -1,4 +1,5 @@ // Code generated by mkmalloc.go; DO NOT EDIT. +// See overview in malloc_stubs.go. 
package runtime @@ -6400,6 +6401,32 @@ func mallocgcSmallNoScanSC2(size uintptr, typ *_type, needzero bool) unsafe.Poin const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -6497,6 +6524,32 @@ func mallocgcSmallNoScanSC3(size uintptr, typ *_type, needzero bool) unsafe.Poin const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -6594,6 +6647,32 @@ func mallocgcSmallNoScanSC4(size uintptr, typ *_type, needzero bool) unsafe.Poin const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; 
assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -6691,6 +6770,32 @@ func mallocgcSmallNoScanSC5(size uintptr, typ *_type, needzero bool) unsafe.Poin const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -6788,6 +6893,32 @@ func mallocgcSmallNoScanSC6(size uintptr, typ *_type, needzero bool) unsafe.Poin const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -6885,6 +7016,32 @@ func mallocgcSmallNoScanSC7(size uintptr, typ *_type, needzero bool) unsafe.Poin const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + 
+ v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -6982,6 +7139,32 @@ func mallocgcSmallNoScanSC8(size uintptr, typ *_type, needzero bool) unsafe.Poin const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -7079,6 +7262,32 @@ func mallocgcSmallNoScanSC9(size uintptr, typ *_type, needzero bool) unsafe.Poin const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := 
sys.TrailingZeros64(span.allocCache) @@ -7176,6 +7385,32 @@ func mallocgcSmallNoScanSC10(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -7273,6 +7508,32 @@ func mallocgcSmallNoScanSC11(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -7370,6 +7631,32 @@ func mallocgcSmallNoScanSC12(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG 
:= getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -7467,6 +7754,32 @@ func mallocgcSmallNoScanSC13(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -7564,6 +7877,32 @@ func mallocgcSmallNoScanSC14(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -7661,6 +8000,32 @@ func mallocgcSmallNoScanSC15(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && 
c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -7758,6 +8123,32 @@ func mallocgcSmallNoScanSC16(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -7855,6 +8246,32 @@ func mallocgcSmallNoScanSC17(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { 
theBit := sys.TrailingZeros64(span.allocCache) @@ -7952,6 +8369,32 @@ func mallocgcSmallNoScanSC18(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -8049,6 +8492,32 @@ func mallocgcSmallNoScanSC19(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -8146,6 +8615,32 @@ func mallocgcSmallNoScanSC20(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if 
assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -8243,6 +8738,32 @@ func mallocgcSmallNoScanSC21(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -8340,6 +8861,32 @@ func mallocgcSmallNoScanSC22(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -8437,6 +8984,32 @@ func mallocgcSmallNoScanSC23(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && 
c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -8534,6 +9107,32 @@ func mallocgcSmallNoScanSC24(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) @@ -8631,6 +9230,32 @@ func mallocgcSmallNoScanSC25(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { 
theBit := sys.TrailingZeros64(span.allocCache) @@ -8728,6 +9353,32 @@ func mallocgcSmallNoScanSC26(size uintptr, typ *_type, needzero bool) unsafe.Poi const spc = spanClass(sizeclass<<1) | spanClass(1) span := c.alloc[spc] + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + x := v + { + + if valgrindenabled { + valgrindMalloc(x, size) + } + + if gcBlackenEnabled != 0 && elemsize != 0 { + if assistG := getg().m.curg; assistG != nil { + assistG.gcAssistBytes -= int64(elemsize - size) + } + } + + if debug.malloc { + postMallocgcDebug(x, elemsize, typ) + } + return x + } + + } + var nextFreeFastResult gclinkptr if span.allocCache != 0 { theBit := sys.TrailingZeros64(span.allocCache) diff --git a/src/runtime/malloc_stubs.go b/src/runtime/malloc_stubs.go index 224746f3d4..e9752956b8 100644 --- a/src/runtime/malloc_stubs.go +++ b/src/runtime/malloc_stubs.go @@ -7,6 +7,8 @@ // to produce a full mallocgc function that's specialized for a span class // or specific size in the case of the tiny allocator. // +// To generate the specialized mallocgc functions, do 'go run .' inside runtime/_mkmalloc. +// // To assemble a mallocgc function, the mallocStub function is cloned, and the call to // inlinedMalloc is replaced with the inlined body of smallScanNoHeaderStub, // smallNoScanStub or tinyStub, depending on the parameters being specialized. @@ -71,7 +73,8 @@ func mallocStub(size uintptr, typ *_type, needzero bool) unsafe.Pointer { } } - // Assist the GC if needed. + // Assist the GC if needed. (On the reuse path, we currently compensate for this; + // changes here might require changes there.) 
if gcBlackenEnabled != 0 { deductAssistCredit(size) } @@ -242,6 +245,23 @@ func smallNoScanStub(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, u c := getMCache(mp) const spc = spanClass(sizeclass<<1) | spanClass(noscanint_) span := c.alloc[spc] + + // First, check for a reusable object. + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + // We have a reusable object, use it. + v := mallocgcSmallNoscanReuse(c, span, spc, elemsize, needzero) + mp.mallocing = 0 + releasem(mp) + + // TODO(thepudds): note that the generated return path is essentially duplicated + // by the generator. For example, see the two postMallocgcDebug calls and + // related duplicated code on the return path currently in the generated + // mallocgcSmallNoScanSC2 function. One set of those correspond to this + // return here. We might be able to de-duplicate the generated return path + // by updating the generator, perhaps by jumping to a shared return or similar. + return v, elemsize + } + v := nextFreeFastStub(span) if v == 0 { v, span, checkGCTrigger = c.nextFree(spc) diff --git a/src/runtime/malloc_test.go b/src/runtime/malloc_test.go index 10c20e6c23..97cf0eed54 100644 --- a/src/runtime/malloc_test.go +++ b/src/runtime/malloc_test.go @@ -349,8 +349,10 @@ func testFreegc[T comparable](noscan bool) func(*testing.T) { t.Run("allocs-with-free", func(t *testing.T) { // Same allocations, but now using explicit free so that // no allocs get reported. (Again, not the desired long-term behavior). - if SizeSpecializedMallocEnabled { - t.Skip("temporarily skipping alloc tests for GOEXPERIMENT=sizespecializedmalloc") + if SizeSpecializedMallocEnabled && !noscan { + // TODO(thepudds): skip at this point in the stack for size-specialized malloc + // with !noscan. Additional integration with sizespecializedmalloc is in a later CL. 
+ t.Skip("temporarily skipping alloc tests for GOEXPERIMENT=sizespecializedmalloc for pointer types") } if !RuntimeFreegcEnabled { t.Skip("skipping alloc tests with runtime.freegc disabled") @@ -370,8 +372,10 @@ func testFreegc[T comparable](noscan bool) func(*testing.T) { // Multiple allocations outstanding before explicitly freeing, // but still within the limit of our smallest free list size // so that no allocs are reported. (Again, not long-term behavior). - if SizeSpecializedMallocEnabled { - t.Skip("temporarily skipping alloc tests for GOEXPERIMENT=sizespecializedmalloc") + if SizeSpecializedMallocEnabled && !noscan { + // TODO(thepudds): skip at this point in the stack for size-specialized malloc + // with !noscan. Additional integration with sizespecializedmalloc is in a later CL. + t.Skip("temporarily skipping alloc tests for GOEXPERIMENT=sizespecializedmalloc for pointer types") } if !RuntimeFreegcEnabled { t.Skip("skipping alloc tests with runtime.freegc disabled") @@ -514,10 +518,10 @@ func testFreegc[T comparable](noscan bool) func(*testing.T) { // See https://go.dev/cl/717520 for some additional discussion, // including how we can deliberately cause the test to fail currently // if we purposefully introduce some assist credit bugs. - if SizeSpecializedMallocEnabled { + if SizeSpecializedMallocEnabled && !noscan { // TODO(thepudds): skip this test at this point in the stack; later CL has // integration with sizespecializedmalloc. 
- t.Skip("temporarily skip assist credit test for GOEXPERIMENT=sizespecializedmalloc") + t.Skip("temporarily skip assist credit tests for GOEXPERIMENT=sizespecializedmalloc for pointer types") } if !RuntimeFreegcEnabled { t.Skip("skipping assist credit test with runtime.freegc disabled") -- cgit v1.3 From d55ecea9e5a5a4cfba30c6f35d4841ae66e05ccd Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Fri, 14 Nov 2025 17:41:58 +0000 Subject: runtime: usleep before stealing runnext only if not in syscall In the scheduler's steal path, we usleep(3) before stealing a _Prunning P's runnext slot. Before CL 646198, we would not call usleep(3) if the P was in _Psyscall. After CL 646198, Ps with Gs in syscalls stay in _Prunning until stolen, meaning we might unnecessarily usleep(3) where we didn't before. This probably isn't a huge deal in most cases, but can cause some apparent slowdowns in microbenchmarks that frequently take the steal path while there are syscalling goroutines. Change-Id: I5bf3df10fe61cf8d7f0e9fe9522102de66faf344 Reviewed-on: https://go-review.googlesource.com/c/go/+/720441 LUCI-TryBot-Result: Go LUCI Reviewed-by: Michael Pratt --- src/runtime/proc.go | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 44dbb2fd44..61364d91ff 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -7507,23 +7507,36 @@ func runqgrab(pp *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool) // Try to steal from pp.runnext. if next := pp.runnext; next != 0 { if pp.status == _Prunning { - // Sleep to ensure that pp isn't about to run the g - // we are about to steal. - // The important use case here is when the g running - // on pp ready()s another g and then almost - // immediately blocks. Instead of stealing runnext - // in this window, back off to give pp a chance to - // schedule runnext. 
This will avoid thrashing gs - // between different Ps. - // A sync chan send/recv takes ~50ns as of time of - // writing, so 3us gives ~50x overshoot. - if !osHasLowResTimer { - usleep(3) - } else { - // On some platforms system timer granularity is - // 1-15ms, which is way too much for this - // optimization. So just yield. - osyield() + if mp := pp.m.ptr(); mp != nil { + if gp := mp.curg; gp == nil || readgstatus(gp)&^_Gscan != _Gsyscall { + // Sleep to ensure that pp isn't about to run the g + // we are about to steal. + // The important use case here is when the g running + // on pp ready()s another g and then almost + // immediately blocks. Instead of stealing runnext + // in this window, back off to give pp a chance to + // schedule runnext. This will avoid thrashing gs + // between different Ps. + // A sync chan send/recv takes ~50ns as of time of + // writing, so 3us gives ~50x overshoot. + // If curg is nil, we assume that the P is likely + // to be in the scheduler. If curg isn't nil and isn't + // in a syscall, then it's either running, waiting, or + // runnable. In this case we want to sleep because the + // P might either call into the scheduler soon (running), + // or already is (since we found a waiting or runnable + // goroutine hanging off of a running P, suggesting it + // either recently transitioned out of running, or will + // transition to running shortly). + if !osHasLowResTimer { + usleep(3) + } else { + // On some platforms system timer granularity is + // 1-15ms, which is way too much for this + // optimization. So just yield. 
+ osyield() + } + } } } if !pp.runnext.cas(next, 0) { -- cgit v1.3 From 65c09eafdf316ef691f0f8eccbf860d2ef5f7c70 Mon Sep 17 00:00:00 2001 From: Archana Ravindar Date: Fri, 7 Nov 2025 13:15:02 +0530 Subject: runtime: hoist invariant code out of heapBitsSmallForAddrInline The first two instructions in heapBitsSmallForAddrInline are invariant for a given span and object and are called in a loop within ScanObjectsSmall which figures as a hot routine in profiles of some benchmark runs within sweet benchmark suite (x/benchmarks/sweet), Ideally it would have been great if the compiler hoisted this code out of the loop, Moving it out of inner loop manually gives gains (moving it entirely out of nested loop does not improve performance, in some cases it even regresses it perhaps due to the early loop exit). Tested with AMD64, ARM64, PPC64LE and S390x Fixes #76212 Change-Id: I49c3c826b9d7bf3125ffc42c8c174cce0ecc4cbf Reviewed-on: https://go-review.googlesource.com/c/go/+/718680 Reviewed-by: Cherry Mui Reviewed-by: Michael Knyszek LUCI-TryBot-Result: Go LUCI --- src/runtime/mgcmark_greenteagc.go | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mgcmark_greenteagc.go b/src/runtime/mgcmark_greenteagc.go index 3594b33cfd..fa560f9966 100644 --- a/src/runtime/mgcmark_greenteagc.go +++ b/src/runtime/mgcmark_greenteagc.go @@ -978,7 +978,9 @@ func spanSetScans(spanBase uintptr, nelems uint16, imb *spanInlineMarkBits, toSc } func scanObjectSmall(spanBase, b, objSize uintptr, gcw *gcWork) { - ptrBits := heapBitsSmallForAddrInline(spanBase, b, objSize) + hbitsBase, _ := spanHeapBitsRange(spanBase, gc.PageSize, objSize) + hbits := (*byte)(unsafe.Pointer(hbitsBase)) + ptrBits := extractHeapBitsSmall(hbits, spanBase, b, objSize) gcw.heapScanWork += int64(sys.Len64(uint64(ptrBits)) * goarch.PtrSize) nptrs := 0 n := sys.OnesCount64(uint64(ptrBits)) @@ -1017,12 +1019,14 @@ func scanObjectsSmall(base, objSize uintptr, elems 
uint16, gcw *gcWork, scans *g break } n := sys.OnesCount64(uint64(bits)) + hbitsBase, _ := spanHeapBitsRange(base, gc.PageSize, objSize) + hbits := (*byte)(unsafe.Pointer(hbitsBase)) for range n { j := sys.TrailingZeros64(uint64(bits)) bits &^= 1 << j b := base + uintptr(i*(goarch.PtrSize*8)+j)*objSize - ptrBits := heapBitsSmallForAddrInline(base, b, objSize) + ptrBits := extractHeapBitsSmall(hbits, base, b, objSize) gcw.heapScanWork += int64(sys.Len64(uint64(ptrBits)) * goarch.PtrSize) n := sys.OnesCount64(uint64(ptrBits)) @@ -1056,10 +1060,7 @@ func scanObjectsSmall(base, objSize uintptr, elems uint16, gcw *gcWork, scans *g } } -func heapBitsSmallForAddrInline(spanBase, addr, elemsize uintptr) uintptr { - hbitsBase, _ := spanHeapBitsRange(spanBase, gc.PageSize, elemsize) - hbits := (*byte)(unsafe.Pointer(hbitsBase)) - +func extractHeapBitsSmall(hbits *byte, spanBase, addr, elemsize uintptr) uintptr { // These objects are always small enough that their bitmaps // fit in a single word, so just load the word or two we need. // -- cgit v1.3 From 6919858338ae3c4f244f65ca87e9e71662a83413 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 17 Nov 2025 13:34:51 -0500 Subject: runtime: rename findrunnable references to findRunnable These cases were missed by CL 393880. 
Change-Id: I6a6a636cf0d97a4efcf4b9df766002ecef48b4de Reviewed-on: https://go-review.googlesource.com/c/go/+/721120 LUCI-TryBot-Result: Go LUCI Auto-Submit: Michael Pratt Reviewed-by: Michael Knyszek --- src/runtime/proc.go | 18 +++++++++--------- src/runtime/runtime2.go | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 61364d91ff..a378b8c39d 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -3119,7 +3119,7 @@ func startm(pp *p, spinning, lockheld bool) { //go:nowritebarrierrec func handoffp(pp *p) { // handoffp must start an M in any situation where - // findrunnable would return a G to run on pp. + // findRunnable would return a G to run on pp. // if it has local work, start it straight away if !runqempty(pp) || !sched.runq.empty() { @@ -3362,7 +3362,7 @@ func findRunnable() (gp *g, inheritTime, tryWakeP bool) { mp := getg().m // The conditions here and in handoffp must agree: if - // findrunnable would return a G to run, handoffp must start + // findRunnable would return a G to run, handoffp must start // an M. 
top: @@ -3586,7 +3586,7 @@ top: goto top } if releasep() != pp { - throw("findrunnable: wrong p") + throw("findRunnable: wrong p") } now = pidleput(pp, now) unlock(&sched.lock) @@ -3631,7 +3631,7 @@ top: if mp.spinning { mp.spinning = false if sched.nmspinning.Add(-1) < 0 { - throw("findrunnable: negative nmspinning") + throw("findRunnable: negative nmspinning") } // Note the for correctness, only the last M transitioning from @@ -3704,10 +3704,10 @@ top: if netpollinited() && (netpollAnyWaiters() || pollUntil != 0) && sched.lastpoll.Swap(0) != 0 { sched.pollUntil.Store(pollUntil) if mp.p != 0 { - throw("findrunnable: netpoll with p") + throw("findRunnable: netpoll with p") } if mp.spinning { - throw("findrunnable: netpoll with spinning") + throw("findRunnable: netpoll with spinning") } delay := int64(-1) if pollUntil != 0 { @@ -3973,7 +3973,7 @@ func checkIdleGCNoP() (*p, *g) { // timers and the network poller if there isn't one already. func wakeNetPoller(when int64) { if sched.lastpoll.Load() == 0 { - // In findrunnable we ensure that when polling the pollUntil + // In findRunnable we ensure that when polling the pollUntil // field is either zero or the time to which the current // poll is expected to run. This can have a spurious wakeup // but should never miss a wakeup. @@ -3998,7 +3998,7 @@ func resetspinning() { gp.m.spinning = false nmspinning := sched.nmspinning.Add(-1) if nmspinning < 0 { - throw("findrunnable: negative nmspinning") + throw("findRunnable: negative nmspinning") } // M wakeup policy is deliberately somewhat conservative, so check if we // need to wakeup another P here. See "Worker thread parking/unparking" @@ -7269,7 +7269,7 @@ func pidlegetSpinning(now int64) (*p, int64) { pp, now := pidleget(now) if pp == nil { - // See "Delicate dance" comment in findrunnable. We found work + // See "Delicate dance" comment in findRunnable. 
We found work // that we cannot take, we must synchronize with non-spinning // Ms that may be preparing to drop their P. sched.needspinning.Store(1) diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 6c955460d4..e415df6ec1 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -1425,9 +1425,9 @@ var ( // must be set. An idle P (passed to pidleput) cannot add new timers while // idle, so if it has no timers at that time, its mask may be cleared. // - // Thus, we get the following effects on timer-stealing in findrunnable: + // Thus, we get the following effects on timer-stealing in findRunnable: // - // - Idle Ps with no timers when they go idle are never checked in findrunnable + // - Idle Ps with no timers when they go idle are never checked in findRunnable // (for work- or timer-stealing; this is the ideal case). // - Running Ps must always be checked. // - Idle Ps whose timers are stolen must continue to be checked until they run -- cgit v1.3 From eda2e8c683798e435e725f60f0bb580eb4aa9686 Mon Sep 17 00:00:00 2001 From: Nick Ripley Date: Mon, 17 Nov 2025 11:47:20 -0500 Subject: runtime: clear frame pointer at thread entry points There are a few places in the runtime where new threads enter Go code with a possibly invalid frame pointer. mstart is the entry point for new Ms, and rt0_go is the entrypoint for the program. As we try to introduce frame pointer unwinding in more places (e.g. for heap profiling in CL 540476 or for execution trace events on the system stack in CL 593835), we see these functions on the stack. We need to ensure that they have valid frame pointers. These functions are both considered the "top" (first) frame frame of the call stack, so this CL sets the frame pointer register to 0 in these functions. 
Updates #63630 Change-Id: I6a6a6964a9ebc6f68ba23d2616e5fb6f19677f97 Reviewed-on: https://go-review.googlesource.com/c/go/+/721020 Reviewed-by: Michael Knyszek Reviewed-by: Cherry Mui LUCI-TryBot-Result: Go LUCI Auto-Submit: Michael Knyszek --- src/runtime/asm_amd64.s | 15 +++++++++++++++ src/runtime/asm_arm64.s | 15 +++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'src/runtime') diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index a4c6c53a90..f4244f6e06 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -181,6 +181,14 @@ TEXT runtime·rt0_go(SB),NOSPLIT|NOFRAME|TOPFRAME,$0 MOVQ AX, 24(SP) MOVQ BX, 32(SP) + // This is typically the entry point for Go programs. + // Call stack unwinding must not proceed past this frame. + // Set the frame pointer register to 0 so that frame pointer-based unwinders + // (which don't use debug info for performance reasons) + // won't attempt to unwind past this function. + // See go.dev/issue/63630 + MOVQ $0, BP + // create istack out of the given (operating system) stack. // _cgo_init may update stackguard. MOVQ $runtime·g0(SB), DI @@ -408,6 +416,13 @@ TEXT runtime·asminit(SB),NOSPLIT,$0-0 RET TEXT runtime·mstart(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 + // This is the root frame of new Go-created OS threads. + // Call stack unwinding must not proceed past this frame. + // Set the frame pointer register to 0 so that frame pointer-based unwinders + // (which don't use debug info for performance reasons) + // won't attempt to unwind past this function. + // See go.dev/issue/63630 + MOVD $0, BP CALL runtime·mstart0(SB) RET // not reached diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s index 902a7066aa..01f2690f4e 100644 --- a/src/runtime/asm_arm64.s +++ b/src/runtime/asm_arm64.s @@ -109,6 +109,14 @@ TEXT runtime·rt0_go(SB),NOSPLIT|TOPFRAME,$0 MOVW R0, 8(RSP) // argc MOVD R1, 16(RSP) // argv + // This is typically the entry point for Go programs. 
+ // Call stack unwinding must not proceed past this frame. + // Set the frame pointer register to 0 so that frame pointer-based unwinders + // (which don't use debug info for performance reasons) + // won't attempt to unwind past this function. + // See go.dev/issue/63630 + MOVD $0, R29 + #ifdef TLS_darwin // Initialize TLS. MOVD ZR, g // clear g, make sure it's not junk. @@ -248,6 +256,13 @@ TEXT runtime·asminit(SB),NOSPLIT|NOFRAME,$0-0 RET TEXT runtime·mstart(SB),NOSPLIT|TOPFRAME,$0 + // This is the root frame of new Go-created OS threads. + // Call stack unwinding must not proceed past this frame. + // Set the frame pointer register to 0 so that frame pointer-based unwinders + // (which don't use debug info for performance reasons) + // won't attempt to unwind past this function. + // See go.dev/issue/63630 + MOVD $0, R29 BL runtime·mstart0(SB) RET // not reached -- cgit v1.3 From 6caab99026a496107e903469d8c906be66a71896 Mon Sep 17 00:00:00 2001 From: Keith Randall Date: Thu, 2 Oct 2025 13:09:03 -0700 Subject: runtime: relax TestMemoryLimit on darwin a bit more Add 8MB more. Covers most of the failures watchflakes has seen. Fixes #73136 Change-Id: I593c599a9519b8b31ed0f401d4157d27ac692587 Reviewed-on: https://go-review.googlesource.com/c/go/+/708617 LUCI-TryBot-Result: Go LUCI Reviewed-by: Michael Knyszek Reviewed-by: Keith Randall --- src/runtime/testdata/testprog/gc.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/runtime') diff --git a/src/runtime/testdata/testprog/gc.go b/src/runtime/testdata/testprog/gc.go index bbe1453401..32e2c5e1b4 100644 --- a/src/runtime/testdata/testprog/gc.go +++ b/src/runtime/testdata/testprog/gc.go @@ -396,7 +396,7 @@ func gcMemoryLimit(gcPercent int) { // should do considerably better than this bound. bound := int64(myLimit + 16<<20) if runtime.GOOS == "darwin" { - bound += 16 << 20 // Be more lax on Darwin, see issue 73136. + bound += 24 << 20 // Be more lax on Darwin, see issue 73136. 
} start := time.Now() for time.Since(start) < 200*time.Millisecond { -- cgit v1.3 From b9ef0633f6117c74fabcd7247a76b4feb86df086 Mon Sep 17 00:00:00 2001 From: Joel Sing Date: Sun, 27 Aug 2023 19:35:33 +1000 Subject: cmd/internal/sys,internal/goarch,runtime: enable the use of compressed instructions on riscv64 Enable the use of compressed instructions on riscv64 by reducing the PC quantum to two bytes and reducing the minimum instruction length to two bytes. Change gostartcall on riscv64 to land at two times the PC quantum into goexit, so that we retain four byte alignment and revise the NOP instructions in goexit to ensure that they are never compressed. Additionally, adjust PCALIGN so that it correctly handles two byte offsets. Fixes #47560 Updates #71105 Cq-Include-Trybots: luci.golang.try:gotip-linux-riscv64 Change-Id: I4329a8fbfcb4de636aadaeadabb826bc22698640 Reviewed-on: https://go-review.googlesource.com/c/go/+/523477 Reviewed-by: Junyang Shao Reviewed-by: Mark Freeman LUCI-TryBot-Result: Go LUCI Reviewed-by: Mark Ryan --- src/cmd/internal/obj/riscv/obj.go | 11 +++++++++-- src/cmd/internal/sys/arch.go | 2 +- src/internal/goarch/goarch_riscv64.go | 2 +- src/runtime/asm_riscv64.s | 10 +++++----- src/runtime/sys_riscv64.go | 11 +++++++++-- 5 files changed, 25 insertions(+), 11 deletions(-) (limited to 'src/runtime') diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go index 3deab34d31..8b9be5d78b 100644 --- a/src/cmd/internal/obj/riscv/obj.go +++ b/src/cmd/internal/obj/riscv/obj.go @@ -4799,10 +4799,17 @@ func assemble(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { v := pcAlignPadLength(p.Pc, alignedValue) offset := p.Pc for ; v >= 4; v -= 4 { - // NOP - cursym.WriteBytes(ctxt, offset, []byte{0x13, 0, 0, 0}) + // NOP (ADDI $0, X0, X0) + cursym.WriteBytes(ctxt, offset, []byte{0x13, 0x00, 0x00, 0x00}) offset += 4 } + if v == 2 { + // CNOP + cursym.WriteBytes(ctxt, offset, []byte{0x01, 0x00}) + offset += 2 + } else if v != 
0 { + ctxt.Diag("bad PCALIGN pad length") + } continue } diff --git a/src/cmd/internal/sys/arch.go b/src/cmd/internal/sys/arch.go index 3c92a6bbf2..14b1cde22b 100644 --- a/src/cmd/internal/sys/arch.go +++ b/src/cmd/internal/sys/arch.go @@ -236,7 +236,7 @@ var ArchRISCV64 = &Arch{ ByteOrder: binary.LittleEndian, PtrSize: 8, RegSize: 8, - MinLC: 4, + MinLC: 2, Alignment: 8, // riscv unaligned loads work, but are really slow (trap + simulated by OS) CanMergeLoads: false, HasLR: true, diff --git a/src/internal/goarch/goarch_riscv64.go b/src/internal/goarch/goarch_riscv64.go index 3b6da1e02f..468f9a6374 100644 --- a/src/internal/goarch/goarch_riscv64.go +++ b/src/internal/goarch/goarch_riscv64.go @@ -7,7 +7,7 @@ package goarch const ( _ArchFamily = RISCV64 _DefaultPhysPageSize = 4096 - _PCQuantum = 4 + _PCQuantum = 2 _MinFrameSize = 8 _StackAlign = PtrSize ) diff --git a/src/runtime/asm_riscv64.s b/src/runtime/asm_riscv64.s index 5bd16181ee..428701a503 100644 --- a/src/runtime/asm_riscv64.s +++ b/src/runtime/asm_riscv64.s @@ -623,14 +623,14 @@ TEXT _cgo_topofstack(SB),NOSPLIT,$8 RET // func goexit(neverCallThisFunction) -// The top-most function running on a goroutine -// returns to goexit+PCQuantum. +// The top-most function running on a goroutine, returns to goexit+PCQuantum*2. +// Note that the NOPs are written in a manner that will not be compressed, +// since the offset must be known by the runtime. TEXT runtime·goexit(SB),NOSPLIT|NOFRAME|TOPFRAME,$0-0 - MOV ZERO, ZERO // NOP + WORD $0x00000013 // NOP JMP runtime·goexit1(SB) // does not return // traceback from goexit1 must hit code range of goexit - MOV ZERO, ZERO // NOP - + WORD $0x00000013 // NOP // This is called from .init_array and follows the platform, not the Go ABI. 
TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0 diff --git a/src/runtime/sys_riscv64.go b/src/runtime/sys_riscv64.go index e710840819..65dc684c33 100644 --- a/src/runtime/sys_riscv64.go +++ b/src/runtime/sys_riscv64.go @@ -4,7 +4,12 @@ package runtime -import "unsafe" +import ( + "unsafe" + + "internal/abi" + "internal/runtime/sys" +) // adjust Gobuf as if it executed a call to fn with context ctxt // and then did an immediate Gosave. @@ -12,7 +17,9 @@ func gostartcall(buf *gobuf, fn, ctxt unsafe.Pointer) { if buf.lr != 0 { throw("invalid use of gostartcall") } - buf.lr = buf.pc + // Use double the PC quantum on riscv64, so that we retain + // four byte alignment and use non-compressed instructions. + buf.lr = abi.FuncPCABI0(goexit) + sys.PCQuantum*2 buf.pc = uintptr(fn) buf.ctxt = ctxt } -- cgit v1.3 From c93766007dbb9c975d2f18b7c741f4804ce911c0 Mon Sep 17 00:00:00 2001 From: Youlin Feng Date: Wed, 29 Oct 2025 13:11:48 +0800 Subject: runtime: do not print recovered when double panic with the same value Show the "[recovered, repanicked]" message only when it is repanicked after recovered. For the duplicated panics that not recovered, do not show this message. 
Fixes #76099 Change-Id: I87282022ebe44c6f6efbe3239218be4a2a7b1104 Reviewed-on: https://go-review.googlesource.com/c/go/+/716020 LUCI-TryBot-Result: Go LUCI Reviewed-by: Michael Knyszek Reviewed-by: Junyang Shao Auto-Submit: Michael Pratt --- src/runtime/crash_test.go | 9 +++++++++ src/runtime/panic.go | 2 +- src/runtime/testdata/testprog/crash.go | 11 +++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) (limited to 'src/runtime') diff --git a/src/runtime/crash_test.go b/src/runtime/crash_test.go index 2b8ca549ad..00e67aeca0 100644 --- a/src/runtime/crash_test.go +++ b/src/runtime/crash_test.go @@ -413,6 +413,15 @@ func TestRepanickedPanicSandwich(t *testing.T) { } } +func TestDoublePanicWithSameValue(t *testing.T) { + output := runTestProg(t, "testprog", "DoublePanicWithSameValue") + want := `panic: message +` + if !strings.HasPrefix(output, want) { + t.Fatalf("output does not start with %q:\n%s", want, output) + } +} + func TestGoexitCrash(t *testing.T) { // External linking brings in cgo, causing deadlock detection not working. 
testenv.MustInternalLink(t, deadlockBuildTypes) diff --git a/src/runtime/panic.go b/src/runtime/panic.go index 3c967a2999..62affac5f9 100644 --- a/src/runtime/panic.go +++ b/src/runtime/panic.go @@ -739,7 +739,7 @@ func printpanics(p *_panic) { } print("panic: ") printpanicval(p.arg) - if p.repanicked { + if p.recovered && p.repanicked { print(" [recovered, repanicked]") } else if p.recovered { print(" [recovered]") diff --git a/src/runtime/testdata/testprog/crash.go b/src/runtime/testdata/testprog/crash.go index 556215a71e..fcce388871 100644 --- a/src/runtime/testdata/testprog/crash.go +++ b/src/runtime/testdata/testprog/crash.go @@ -22,6 +22,7 @@ func init() { register("RepanickedPanic", RepanickedPanic) register("RepanickedMiddlePanic", RepanickedMiddlePanic) register("RepanickedPanicSandwich", RepanickedPanicSandwich) + register("DoublePanicWithSameValue", DoublePanicWithSameValue) } func test(name string) { @@ -189,3 +190,13 @@ func RepanickedPanicSandwich() { panic("outer") }() } + +// Double panic with same value and not recovered. +// See issue 76099. +func DoublePanicWithSameValue() { + var e any = "message" + defer func() { + panic(e) + }() + panic(e) +} -- cgit v1.3 From e912618bd2de2121d6c9fed3473b5e0a47da138c Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Tue, 10 Jun 2025 19:19:08 -0400 Subject: runtime: add hexdumper Currently, we have a simple hexdumpWords facility for debugging. It's useful but pretty limited. This CL adds a much more configurable and capable "hexdumper". It can be configured for any word size (including bytes), handles unaligned data, includes an ASCII dump, and accepts data in multiple slices. It also has a much nicer "mark" facility for annotating the hexdump that isn't limited to a single character per word. We use this to improve our existing hexdumps, particularly the new mark facility. The next CL will integrate hexdumps into debuglog, which will make use of several other new capabilities. Also this adds an actual test. 
The output looks like: 7 6 5 4 3 2 1 0 f e d c b a 9 8 0123456789abcdef 000000c00006ef70: 03000000 00000000 ........ 000000c00006ef80: 00000000 0053da80 000000c0 000bc380 ..S............. ^ 000000c00006ef90: 00000000 0053dac0 000000c0 000bc380 ..S............. ^ 000000c00006efa0: 000000c0 0006ef90 000000c0 0006ef80 ................ 000000c00006efb0: 000000c0 0006efd0 00000000 0053eb65 ........e.S..... ^ 000000c00006efc0: 000000c0 000bc380 00000000 009aaae8 ................ 000000c00006efd0: 00000000 00000000 00000000 00496b01 .........kI..... ^ 000000c00006efe0: 00000000 00000000 00000000 00000000 ................ 000000c00006eff0: 00000000 00000000 ........ The header gives column labels, indicating the order of bytes within the following words. The addresses on the left are always 16-byte aligned so it's easy to combine that address with the column header to determine the full address of a byte. Annotations are no longer interleaved with the data, so the data stays in nicely aligned columns. The annotations are also now much more flexible, including support for multiple annotations on the same word (not shown). 
Change-Id: I27e83800a1f6a7bdd3cc2c59614661a810a57d4d Reviewed-on: https://go-review.googlesource.com/c/go/+/681375 Reviewed-by: Michael Pratt LUCI-TryBot-Result: Go LUCI Auto-Submit: Austin Clements --- src/runtime/export_test.go | 33 ++++++ src/runtime/hexdump.go | 269 ++++++++++++++++++++++++++++++++++++++++++++ src/runtime/hexdump_test.go | 151 +++++++++++++++++++++++++ src/runtime/mgcmark.go | 15 ++- src/runtime/mgcsweep.go | 2 +- src/runtime/print.go | 41 ------- src/runtime/traceback.go | 21 ++-- 7 files changed, 475 insertions(+), 57 deletions(-) create mode 100644 src/runtime/hexdump.go create mode 100644 src/runtime/hexdump_test.go (limited to 'src/runtime') diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index 8438603b9e..2db8add7e4 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -2029,3 +2029,36 @@ func (head *ListHeadManual) Pop() unsafe.Pointer { func (head *ListHeadManual) Remove(p unsafe.Pointer) { head.l.remove(p) } + +func Hexdumper(base uintptr, wordBytes int, mark func(addr uintptr, start func()), data ...[]byte) string { + buf := make([]byte, 0, 2048) + getg().writebuf = buf + h := hexdumper{addr: base, addrBytes: 4, wordBytes: uint8(wordBytes)} + if mark != nil { + h.mark = func(addr uintptr, m hexdumpMarker) { + mark(addr, m.start) + } + } + for _, d := range data { + h.write(d) + } + h.close() + n := len(getg().writebuf) + getg().writebuf = nil + if n == cap(buf) { + panic("Hexdumper buf too small") + } + return string(buf[:n]) +} + +func HexdumpWords(p, bytes uintptr) string { + buf := make([]byte, 0, 2048) + getg().writebuf = buf + hexdumpWords(p, bytes, nil) + n := len(getg().writebuf) + getg().writebuf = nil + if n == cap(buf) { + panic("HexdumpWords buf too small") + } + return string(buf[:n]) +} diff --git a/src/runtime/hexdump.go b/src/runtime/hexdump.go new file mode 100644 index 0000000000..0d7dbb540b --- /dev/null +++ b/src/runtime/hexdump.go @@ -0,0 +1,269 @@ +// Copyright 2025 The 
Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/goarch" + "unsafe" +) + +// hexdumpWords prints a word-oriented hex dump of [p, p+len). +// +// If mark != nil, it will be passed to hexdumper.mark. +func hexdumpWords(p, len uintptr, mark func(uintptr, hexdumpMarker)) { + printlock() + + // Provide a default annotation + symMark := func(u uintptr, hm hexdumpMarker) { + if mark != nil { + mark(u, hm) + } + + // Can we symbolize this value? + val := *(*uintptr)(unsafe.Pointer(u)) + fn := findfunc(val) + if fn.valid() { + hm.start() + print("<", funcname(fn), "+", hex(val-fn.entry()), ">\n") + } + } + + h := hexdumper{addr: p, mark: symMark} + h.write(unsafe.Slice((*byte)(unsafe.Pointer(p)), len)) + h.close() + printunlock() +} + +// hexdumper is a Swiss-army knife hex dumper. +// +// To use, optionally set addr and wordBytes, then call write repeatedly, +// followed by close. +type hexdumper struct { + // addr is the address to print for the first byte of data. + addr uintptr + + // addrBytes is the number of bytes of addr to print. If this is 0, it + // defaults to goarch.PtrSize. + addrBytes uint8 + + // wordBytes is the number of bytes in a word. If wordBytes is 1, this + // prints a byte-oriented dump. If it's > 1, this interprets the data as a + // sequence of words of the given size. If it's 0, it's treated as + // goarch.PtrSize. + wordBytes uint8 + + // mark is an optional function that can annotate values in the hex dump. + // + // If non-nil, it is called with the address of every complete, aligned word + // in the hex dump. + // + // If it decides to print an annotation, it must first call m.start(), then + // print the annotation, followed by a new line. 
+ mark func(addr uintptr, m hexdumpMarker) + + // Below here is state + + ready int8 // 0=need to init state; 1=need to print header; 2=ready + + // dataBuf accumulates a line at a time of data, in case it's split across + // buffers. + dataBuf [16]byte + dataPos uint8 + dataSkip uint8 // Skip first n bytes of buf on first line + + // toPos maps from byte offset in data to a visual offset in the printed line. + toPos [16]byte +} + +type hexdumpMarker struct { + chars int +} + +func (h *hexdumper) write(data []byte) { + if h.ready == 0 { + h.init() + } + + // Handle leading data + if h.dataPos > 0 { + n := copy(h.dataBuf[h.dataPos:], data) + h.dataPos += uint8(n) + data = data[n:] + if h.dataPos < uint8(len(h.dataBuf)) { + return + } + h.flushLine(h.dataBuf[:]) + h.dataPos = 0 + } + + // Handle full lines in data + for len(data) >= len(h.dataBuf) { + h.flushLine(data[:len(h.dataBuf)]) + data = data[len(h.dataBuf):] + } + + // Handle trailing data + h.dataPos = uint8(copy(h.dataBuf[:], data)) +} + +func (h *hexdumper) close() { + if h.dataPos > 0 { + h.flushLine(h.dataBuf[:h.dataPos]) + } +} + +func (h *hexdumper) init() { + const bytesPerLine = len(h.dataBuf) + + if h.addrBytes == 0 { + h.addrBytes = goarch.PtrSize + } else if h.addrBytes < 0 || h.addrBytes > goarch.PtrSize { + throw("invalid addrBytes") + } + + if h.wordBytes == 0 { + h.wordBytes = goarch.PtrSize + } + wb := int(h.wordBytes) + if wb < 0 || wb >= bytesPerLine || wb&(wb-1) != 0 { + throw("invalid wordBytes") + } + + // Construct position mapping. + for i := range h.toPos { + // First, calculate the "field" within the line, applying byte swizzling. + field := 0 + if goarch.BigEndian { + field = i + } else { + field = i ^ int(wb-1) + } + // Translate this field into a visual offset. + // "00112233 44556677 8899AABB CCDDEEFF" + h.toPos[i] = byte(field*2 + field/4 + field/8) + } + + // The first line may need to skip some fields to get to alignment. + // Round down the starting address. 
+ nAddr := h.addr &^ uintptr(bytesPerLine-1) + // Skip bytes to get to alignment. + h.dataPos = uint8(h.addr - nAddr) + h.dataSkip = uint8(h.addr - nAddr) + h.addr = nAddr + + // We're ready to print the header. + h.ready = 1 +} + +func (h *hexdumper) flushLine(data []byte) { + const bytesPerLine = len(h.dataBuf) + + const maxAddrChars = 2 * goarch.PtrSize + const addrSep = ": " + dataStart := int(2*h.addrBytes) + len(addrSep) + // dataChars uses the same formula to toPos above. We calculate it with the + // "last field", then add the size of the last field. + const dataChars = (bytesPerLine-1)*2 + (bytesPerLine-1)/4 + (bytesPerLine-1)/8 + 2 + const asciiSep = " " + asciiStart := dataStart + dataChars + len(asciiSep) + const asciiChars = bytesPerLine + nlPos := asciiStart + asciiChars + + var lineBuf [maxAddrChars + len(addrSep) + dataChars + len(asciiSep) + asciiChars + 1]byte + clear := func() { + for i := range lineBuf { + lineBuf[i] = ' ' + } + } + clear() + + if h.ready == 1 { + // Print column offsets header. + for offset, pos := range h.toPos { + h.fmtHex(lineBuf[dataStart+int(pos+1):][:1], uint64(offset)) + } + // Print ASCII offsets. + for offset := range asciiChars { + h.fmtHex(lineBuf[asciiStart+offset:][:1], uint64(offset)) + } + lineBuf[nlPos] = '\n' + gwrite(lineBuf[:nlPos+1]) + clear() + h.ready = 2 + } + + // Format address. + h.fmtHex(lineBuf[:2*h.addrBytes], uint64(h.addr)) + copy(lineBuf[2*h.addrBytes:], addrSep) + // Format data in hex and ASCII. + for offset, b := range data { + if offset < int(h.dataSkip) { + continue + } + + pos := h.toPos[offset] + h.fmtHex(lineBuf[dataStart+int(pos):][:2], uint64(b)) + + copy(lineBuf[dataStart+dataChars:], asciiSep) + ascii := uint8('.') + if b >= ' ' && b <= '~' { + ascii = b + } + lineBuf[asciiStart+offset] = ascii + } + // Trim buffer. + end := asciiStart + len(data) + lineBuf[end] = '\n' + buf := lineBuf[:end+1] + + // Print. + gwrite(buf) + + // Print marks. 
+ if h.mark != nil { + clear() + for offset := 0; offset+int(h.wordBytes) <= len(data); offset += int(h.wordBytes) { + if offset < int(h.dataSkip) { + continue + } + addr := h.addr + uintptr(offset) + // Find the position of the left edge of this word + caret := dataStart + int(min(h.toPos[offset], h.toPos[offset+int(h.wordBytes)-1])) + h.mark(addr, hexdumpMarker{caret}) + } + } + + h.addr += uintptr(bytesPerLine) + h.dataPos = 0 + h.dataSkip = 0 +} + +// fmtHex formats v in base 16 into buf. It fills all of buf. If buf is too +// small to represent v, it the output will start with '*'. +func (h *hexdumper) fmtHex(buf []byte, v uint64) { + const dig = "0123456789abcdef" + i := len(buf) - 1 + for ; i >= 0; i-- { + buf[i] = dig[v%16] + v /= 16 + } + if v != 0 { + // Indicate that we couldn't fit the whole number. + buf[0] = '*' + } +} + +func (m hexdumpMarker) start() { + var spaces [64]byte + for i := range spaces { + spaces[i] = ' ' + } + for m.chars > len(spaces) { + gwrite(spaces[:]) + m.chars -= len(spaces) + } + gwrite(spaces[:m.chars]) + print("^ ") +} diff --git a/src/runtime/hexdump_test.go b/src/runtime/hexdump_test.go new file mode 100644 index 0000000000..cc44e48e4b --- /dev/null +++ b/src/runtime/hexdump_test.go @@ -0,0 +1,151 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package runtime_test + +import ( + "fmt" + "internal/abi" + "internal/goarch" + "runtime" + "slices" + "strings" + "testing" + "unsafe" +) + +func TestHexdumper(t *testing.T) { + check := func(label, got, want string) { + got = strings.TrimRight(got, "\n") + want = strings.TrimPrefix(want, "\n") + want = strings.TrimRight(want, "\n") + if got != want { + t.Errorf("%s: got\n%s\nwant\n%s", label, got, want) + } + } + + data := make([]byte, 32) + for i := range data { + data[i] = 0x10 + byte(i) + } + + check("basic", runtime.Hexdumper(0, 1, nil, data), ` + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef +00000000: 10111213 14151617 18191a1b 1c1d1e1f ................ +00000010: 20212223 24252627 28292a2b 2c2d2e2f !"#$%&'()*+,-./`) + + if !goarch.BigEndian { + // Different word sizes + check("word=4", runtime.Hexdumper(0, 4, nil, data), ` + 3 2 1 0 7 6 5 4 b a 9 8 f e d c 0123456789abcdef +00000000: 13121110 17161514 1b1a1918 1f1e1d1c ................ +00000010: 23222120 27262524 2b2a2928 2f2e2d2c !"#$%&'()*+,-./`) + check("word=8", runtime.Hexdumper(0, 8, nil, data), ` + 7 6 5 4 3 2 1 0 f e d c b a 9 8 0123456789abcdef +00000000: 17161514 13121110 1f1e1d1c 1b1a1918 ................ +00000010: 27262524 23222120 2f2e2d2c 2b2a2928 !"#$%&'()*+,-./`) + } + + // Starting offset + check("offset=1", runtime.Hexdumper(1, 1, nil, data), ` + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef +00000000: 101112 13141516 1718191a 1b1c1d1e ............... +00000010: 1f202122 23242526 2728292a 2b2c2d2e . !"#$%&'()*+,-. +00000020: 2f /`) + if !goarch.BigEndian { + // ... combined with a word size + check("offset=1 and word=4", runtime.Hexdumper(1, 4, nil, data), ` + 3 2 1 0 7 6 5 4 b a 9 8 f e d c 0123456789abcdef +00000000: 121110 16151413 1a191817 1e1d1c1b ............... +00000010: 2221201f 26252423 2a292827 2e2d2c2b . !"#$%&'()*+,-. +00000020: 2f /`) + } + + // Partial data full of annoying boundaries. 
+ partials := make([][]byte, 0) + for i := 0; i < len(data); i += 2 { + partials = append(partials, data[i:i+2]) + } + check("partials", runtime.Hexdumper(1, 1, nil, partials...), ` + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef +00000000: 101112 13141516 1718191a 1b1c1d1e ............... +00000010: 1f202122 23242526 2728292a 2b2c2d2e . !"#$%&'()*+,-. +00000020: 2f /`) + + // Marks. + check("marks", runtime.Hexdumper(0, 1, func(addr uintptr, start func()) { + if addr%7 == 0 { + start() + println("mark") + } + }, data), ` + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef +00000000: 10111213 14151617 18191a1b 1c1d1e1f ................ + ^ mark + ^ mark + ^ mark +00000010: 20212223 24252627 28292a2b 2c2d2e2f !"#$%&'()*+,-./ + ^ mark + ^ mark`) + if !goarch.BigEndian { + check("marks and word=4", runtime.Hexdumper(0, 4, func(addr uintptr, start func()) { + if addr%7 == 0 { + start() + println("mark") + } + }, data), ` + 3 2 1 0 7 6 5 4 b a 9 8 f e d c 0123456789abcdef +00000000: 13121110 17161514 1b1a1918 1f1e1d1c ................ + ^ mark +00000010: 23222120 27262524 2b2a2928 2f2e2d2c !"#$%&'()*+,-./ + ^ mark`) + } +} + +func TestHexdumpWords(t *testing.T) { + if goarch.BigEndian || goarch.PtrSize != 8 { + // We could support these, but it's kind of a pain. + t.Skip("requires 64-bit little endian") + } + + // Most of this is in hexdumper. Here we just test the symbolizer. + + pc := abi.FuncPCABIInternal(TestHexdumpWords) + pcs := slices.Repeat([]uintptr{pc}, 3) + + // Make sure pcs doesn't move around on us. + var p runtime.Pinner + defer p.Unpin() + p.Pin(&pcs[0]) + // Get a 16 byte, 16-byte-aligned chunk of pcs so the hexdump is simple. + start := uintptr(unsafe.Pointer(&pcs[0])) + start = (start + 15) &^ uintptr(15) + + // Do the hex dump. + got := runtime.HexdumpWords(start, 16) + + // Construct the expected output. 
+ pcStr := fmt.Sprintf("%016x", pc) + pcStr = pcStr[:8] + " " + pcStr[8:] // Add middle space + ascii := make([]byte, 8) + for i := range ascii { + b := byte(pc >> (8 * i)) + if b >= ' ' && b <= '~' { + ascii[i] = b + } else { + ascii[i] = '.' + } + } + want := fmt.Sprintf(` + 7 6 5 4 3 2 1 0 f e d c b a 9 8 0123456789abcdef +%016x: %s %s %s%s + ^ + ^ +`, start, pcStr, pcStr, ascii, ascii) + want = strings.TrimPrefix(want, "\n") + + if got != want { + t.Errorf("got\n%s\nwant\n%s", got, want) + } +} diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go index c9234c5084..714b9a51df 100644 --- a/src/runtime/mgcmark.go +++ b/src/runtime/mgcmark.go @@ -1524,29 +1524,32 @@ func scanConservative(b, n uintptr, ptrmask *uint8, gcw *gcWork, state *stackSca if debugScanConservative { printlock() print("conservatively scanning [", hex(b), ",", hex(b+n), ")\n") - hexdumpWords(b, b+n, func(p uintptr) byte { + hexdumpWords(b, n, func(p uintptr, m hexdumpMarker) { if ptrmask != nil { word := (p - b) / goarch.PtrSize bits := *addb(ptrmask, word/8) if (bits>>(word%8))&1 == 0 { - return '$' + return } } val := *(*uintptr)(unsafe.Pointer(p)) if state != nil && state.stack.lo <= val && val < state.stack.hi { - return '@' + m.start() + println("ptr to stack") + return } span := spanOfHeap(val) if span == nil { - return ' ' + return } idx := span.objIndex(val) if span.isFreeOrNewlyAllocated(idx) { - return ' ' + return } - return '*' + m.start() + println("ptr to heap") }) printunlock() } diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go index c3d6afb90a..4eecb1cfd9 100644 --- a/src/runtime/mgcsweep.go +++ b/src/runtime/mgcsweep.go @@ -885,7 +885,7 @@ func (s *mspan) reportZombies() { if length > 1024 { length = 1024 } - hexdumpWords(addr, addr+length, nil) + hexdumpWords(addr, length, nil) } mbits.advance() abits.advance() diff --git a/src/runtime/print.go b/src/runtime/print.go index c01db9d7f9..d2733fb266 100644 --- a/src/runtime/print.go +++ b/src/runtime/print.go 
@@ -5,7 +5,6 @@ package runtime import ( - "internal/goarch" "internal/strconv" "unsafe" ) @@ -212,43 +211,3 @@ func printeface(e eface) { func printiface(i iface) { print("(", i.tab, ",", i.data, ")") } - -// hexdumpWords prints a word-oriented hex dump of [p, end). -// -// If mark != nil, it will be called with each printed word's address -// and should return a character mark to appear just before that -// word's value. It can return 0 to indicate no mark. -func hexdumpWords(p, end uintptr, mark func(uintptr) byte) { - printlock() - var markbuf [1]byte - markbuf[0] = ' ' - minhexdigits = int(unsafe.Sizeof(uintptr(0)) * 2) - for i := uintptr(0); p+i < end; i += goarch.PtrSize { - if i%16 == 0 { - if i != 0 { - println() - } - print(hex(p+i), ": ") - } - - if mark != nil { - markbuf[0] = mark(p + i) - if markbuf[0] == 0 { - markbuf[0] = ' ' - } - } - gwrite(markbuf[:]) - val := *(*uintptr)(unsafe.Pointer(p + i)) - print(hex(val)) - print(" ") - - // Can we symbolize val? - fn := findfunc(val) - if fn.valid() { - print("<", funcname(fn), "+", hex(val-fn.entry()), "> ") - } - } - minhexdigits = 0 - println() - printunlock() -} diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go index 6649f72471..74aaeba876 100644 --- a/src/runtime/traceback.go +++ b/src/runtime/traceback.go @@ -1366,16 +1366,19 @@ func tracebackHexdump(stk stack, frame *stkframe, bad uintptr) { // Print the hex dump. print("stack: frame={sp:", hex(frame.sp), ", fp:", hex(frame.fp), "} stack=[", hex(stk.lo), ",", hex(stk.hi), ")\n") - hexdumpWords(lo, hi, func(p uintptr) byte { - switch p { - case frame.fp: - return '>' - case frame.sp: - return '<' - case bad: - return '!' 
+ hexdumpWords(lo, hi-lo, func(p uintptr, m hexdumpMarker) { + if p == frame.fp { + m.start() + println("FP") + } + if p == frame.sp { + m.start() + println("SP") + } + if p == bad { + m.start() + println("bad") } - return 0 }) } -- cgit v1.3 From 8c41a482f9b7a101404cd0b417ac45abd441e598 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Tue, 10 Jun 2025 19:19:34 -0400 Subject: runtime: add dlog.hexdump Change-Id: I8149ab9314216bb8f9bf58da55633e2587d75851 Reviewed-on: https://go-review.googlesource.com/c/go/+/681376 Auto-Submit: Austin Clements Reviewed-by: Michael Pratt LUCI-TryBot-Result: Go LUCI --- src/runtime/debuglog.go | 54 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 3 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/debuglog.go b/src/runtime/debuglog.go index e993e396c1..405f2455c6 100644 --- a/src/runtime/debuglog.go +++ b/src/runtime/debuglog.go @@ -196,7 +196,8 @@ const ( debugLogPtr debugLogString debugLogConstString - debugLogStringOverflow + debugLogHexdump + debugLogOverflow debugLogPC debugLogTraceback @@ -365,13 +366,39 @@ func (l *dloggerImpl) s(x string) *dloggerImpl { l.w.uvarint(uint64(len(b))) l.w.bytes(b) if len(b) != len(x) { - l.w.byte(debugLogStringOverflow) + l.w.byte(debugLogOverflow) l.w.uvarint(uint64(len(x) - len(b))) } } return l } +//go:nosplit +func (l dloggerFake) hexdump(p unsafe.Pointer, bytes uintptr) dloggerFake { return l } + +//go:nosplit +func (l *dloggerImpl) hexdump(p unsafe.Pointer, bytes uintptr) *dloggerImpl { + var b []byte + bb := (*slice)(unsafe.Pointer(&b)) + bb.array = unsafe.Pointer(p) + bb.len, bb.cap = int(bytes), int(bytes) + if len(b) > debugLogStringLimit { + b = b[:debugLogStringLimit] + } + + l.w.byte(debugLogHexdump) + l.w.uvarint(uint64(uintptr(p))) + l.w.uvarint(uint64(len(b))) + l.w.bytes(b) + + if uintptr(len(b)) != bytes { + l.w.byte(debugLogOverflow) + l.w.uvarint(uint64(bytes) - uint64(len(b))) + } + + return l +} + //go:nosplit func (l 
dloggerFake) pc(x uintptr) dloggerFake { return l } @@ -708,9 +735,30 @@ func (r *debugLogReader) printVal() bool { s := *(*string)(unsafe.Pointer(&str)) print(s) - case debugLogStringOverflow: + case debugLogOverflow: print("..(", r.uvarint(), " more bytes)..") + case debugLogHexdump: + p := uintptr(r.uvarint()) + bl := r.uvarint() + if r.begin+bl > r.end { + r.begin = r.end + print("") + break + } + println() // Start on a new line + hd := hexdumper{addr: p} + for bl > 0 { + b := r.data.b[r.begin%uint64(len(r.data.b)):] + if uint64(len(b)) > bl { + b = b[:bl] + } + r.begin += uint64(len(b)) + bl -= uint64(len(b)) + hd.write(b) + } + hd.close() + case debugLogPC: printDebugLogPC(uintptr(r.uvarint()), false) -- cgit v1.3 From 829779f4fe7e002b959a2f4966aa9e21c59e418c Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 17 Nov 2025 15:09:50 -0500 Subject: runtime: split findRunnableGCWorker in two The first part, assignWaitingGCWorker selects a mark worker (if any) and assigns it to the P. The second part, findRunnableGCWorker, is responsible for actually marking the worker as runnable and updating the CPU limiter. The advantage of this split is that assignWaitingGCWorker is safe to do during STW, which will allow the next CL to make selections during procresize. This change is a semantic no-op in preparation for the next CL. For #65694. 
Change-Id: I6a6a636c8beb212185829946cfa1e49f706ac31a Reviewed-on: https://go-review.googlesource.com/c/go/+/721001 Reviewed-by: Michael Knyszek LUCI-TryBot-Result: Go LUCI Auto-Submit: Michael Pratt --- src/runtime/mgc.go | 8 ++++- src/runtime/mgcpacer.go | 88 ++++++++++++++++++++++++++++++++++++++++--------- src/runtime/runtime2.go | 12 +++++++ 3 files changed, 91 insertions(+), 17 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go index 43afbc330b..febcd9558c 100644 --- a/src/runtime/mgc.go +++ b/src/runtime/mgc.go @@ -1727,7 +1727,13 @@ func gcBgMarkWorker(ready chan struct{}) { // the stack (see gopark). Prevent deadlock from recursively // starting GC by disabling preemption. gp.m.preemptoff = "GC worker init" - node := &new(gcBgMarkWorkerNodePadded).gcBgMarkWorkerNode // TODO: technically not allowed in the heap. See comment in tagptr.go. + // TODO: This is technically not allowed in the heap. See comment in tagptr.go. + // + // It is kept alive simply by virtue of being used in the infinite loop + // below. gcBgMarkWorkerPool keeps pointers to nodes that are not + // GC-visible, so this must be kept alive indefinitely (even if + // GOMAXPROCS decreases). + node := &new(gcBgMarkWorkerNodePadded).gcBgMarkWorkerNode gp.m.preemptoff = "" node.gp.set(gp) diff --git a/src/runtime/mgcpacer.go b/src/runtime/mgcpacer.go index 32c1b941e5..f78fb6f636 100644 --- a/src/runtime/mgcpacer.go +++ b/src/runtime/mgcpacer.go @@ -10,7 +10,7 @@ import ( "internal/runtime/atomic" "internal/runtime/math" "internal/strconv" - _ "unsafe" // for go:linkname + _ "unsafe" ) const ( @@ -749,30 +749,33 @@ func (c *gcControllerState) enlistWorker() { } } -// findRunnableGCWorker returns a background mark worker for pp if it -// should be run. This must only be called when gcBlackenEnabled != 0. 
-func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) {
+// assignWaitingGCWorker assigns a background mark worker to pp if one should
+// be run.
+//
+// If a worker is selected, it is assigned to pp.nextGCMarkWorker and the P is
+// wired as a GC mark worker. The G is still in _Gwaiting. If no worker is
+// selected, ok returns false.
+//
+// If assignWaitingGCWorker returns true, this P must either:
+//   - Mark the G as runnable and run it, clearing pp.nextGCMarkWorker.
+//   - Or, call c.releaseNextGCMarkWorker.
+//
+// This must only be called when gcBlackenEnabled != 0.
+func (c *gcControllerState) assignWaitingGCWorker(pp *p, now int64) (bool, int64) {
 	if gcBlackenEnabled == 0 {
 		throw("gcControllerState.findRunnable: blackening not enabled")
 	}
 
-	// Since we have the current time, check if the GC CPU limiter
-	// hasn't had an update in a while. This check is necessary in
-	// case the limiter is on but hasn't been checked in a while and
-	// so may have left sufficient headroom to turn off again.
 	if now == 0 {
 		now = nanotime()
 	}
-	if gcCPULimiter.needUpdate(now) {
-		gcCPULimiter.update(now)
-	}
 
 	if !gcShouldScheduleWorker(pp) {
 		// No good reason to schedule a worker. This can happen at
 		// the end of the mark phase when there are still
 		// assists tapering off. Don't bother running a worker
 		// now because it'll just return immediately.
-		return nil, now
+		return false, now
 	}
 
 	if c.dedicatedMarkWorkersNeeded.Load() <= 0 && c.fractionalUtilizationGoal == 0 {
@@ -783,7 +786,7 @@ func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) {
 		// When a dedicated worker stops running, the gcBgMarkWorker loop notes
 		// the need for the worker before returning it to the pool. If we don't
 		// see the need now, we wouldn't have found it in the pool anyway.
-		return nil, now
+		return false, now
 	}
 
 	// Grab a worker before we commit to running below.
@@ -800,7 +803,7 @@ func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) { // it will always do so with queued global work. Thus, that P // will be immediately eligible to re-run the worker G it was // just using, ensuring work can complete. - return nil, now + return false, now } decIfPositive := func(val *atomic.Int64) bool { @@ -823,7 +826,7 @@ func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) { } else if c.fractionalUtilizationGoal == 0 { // No need for fractional workers. gcBgMarkWorkerPool.push(&node.node) - return nil, now + return false, now } else { // Is this P behind on the fractional utilization // goal? @@ -833,12 +836,48 @@ func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) { if delta > 0 && float64(pp.gcFractionalMarkTime.Load())/float64(delta) > c.fractionalUtilizationGoal { // Nope. No need to run a fractional worker. gcBgMarkWorkerPool.push(&node.node) - return nil, now + return false, now } // Run a fractional worker. pp.gcMarkWorkerMode = gcMarkWorkerFractionalMode } + pp.nextGCMarkWorker = node + return true, now +} + +// findRunnableGCWorker returns a background mark worker for pp if it +// should be run. +// +// If findRunnableGCWorker returns a G, this P is wired as a GC mark worker and +// must run the G. +// +// This must only be called when gcBlackenEnabled != 0. +// +// This function is allowed to have write barriers because it is called from +// the portion of findRunnable that always has a P. +// +//go:yeswritebarrierrec +func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) { + // Since we have the current time, check if the GC CPU limiter + // hasn't had an update in a while. This check is necessary in + // case the limiter is on but hasn't been checked in a while and + // so may have left sufficient headroom to turn off again. 
+ if now == 0 { + now = nanotime() + } + if gcCPULimiter.needUpdate(now) { + gcCPULimiter.update(now) + } + + ok, now := c.assignWaitingGCWorker(pp, now) + if !ok { + return nil, now + } + + node := pp.nextGCMarkWorker + pp.nextGCMarkWorker = nil + // Run the background mark worker. gp := node.gp.ptr() trace := traceAcquire() @@ -850,6 +889,23 @@ func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) { return gp, now } +// Release an unused pp.nextGCMarkWorker, if any. +// +// This function is allowed to have write barriers because it is called from +// the portion of schedule. +// +//go:yeswritebarrierrec +func (c *gcControllerState) releaseNextGCMarkWorker(pp *p) { + node := pp.nextGCMarkWorker + if node == nil { + return + } + + c.markWorkerStop(pp.gcMarkWorkerMode, 0) + gcBgMarkWorkerPool.push(&node.node) + pp.nextGCMarkWorker = nil +} + // resetLive sets up the controller state for the next mark phase after the end // of the previous one. Must be called after endCycle and before commit, before // the world is started. diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index e415df6ec1..56082bf7f5 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -854,6 +854,18 @@ type p struct { // mark worker started. gcMarkWorkerStartTime int64 + // nextGCMarkWorker is the next mark worker to run. This may be set + // during start-the-world to assign a worker to this P. The P runs this + // worker on the next call to gcController.findRunnableGCWorker. If the + // P runs something else or stops, it must release this worker via + // gcController.releaseNextGCMarkWorker. + // + // See comment in gcBgMarkWorker about the lifetime of + // gcBgMarkWorkerNode. + // + // Only accessed by this P or during STW. + nextGCMarkWorker *gcBgMarkWorkerNode + // gcw is this P's GC work buffer cache. The work buffer is // filled by write barriers, drained by mutator assists, and // disposed on certain GC state transitions. 
-- cgit v1.3 From a18aff805706bfdaeb9aca042111fae32f9f8b61 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 17 Nov 2025 16:08:21 -0500 Subject: runtime: select GC mark workers during start-the-world When the GC starts today, procresize and startTheWorldWithSema don't consider the additional Ps required to run the mark workers. procresize and startTheWorldWithSema resume only the Ps necessary to run the normal user goroutines. Once those Ps start, findRunnable and findRunnableGCWorker determine that a GC worker is necessary and run the worker instead, calling wakep to wake another P to run the original user goroutine. This is unfortunate because it disrupts the intentional placement of Ps on Ms that procresize does. It also has the unfortunate side effect of slightly delaying start-the-world time, as it takes several sequential wakeps to get all Ps started. To address this, procresize explicitly assigns GC mark workers to Ps before starting the world. The assignment occurs _after_ selecting runnable Ps, so that we prefer to select Ps that were previously idle. Note that if fewer than 25% of Ps are idle then we won't be able to assign all dedicated workers, and some of the Ps intended for user goroutines will convert to dedicated workers once they reach findRunnableGCWorker. Also note that stack scanning temporarily suspends the goroutine. Resume occurs through ready, which will move the goroutine to the local runq of the P that did the scan. Thus there is still a source of migration at some point during the GC. For #65694. 
Cq-Include-Trybots: luci.golang.try:gotip-linux-amd64-longtest
Change-Id: I6a6a636c51f39f4f4bc716aa87de68f6ebe163a5
Reviewed-on: https://go-review.googlesource.com/c/go/+/721002
LUCI-TryBot-Result: Go LUCI
Auto-Submit: Michael Pratt
Reviewed-by: Michael Knyszek
---
 src/runtime/mgcpacer.go                    |   9 ++-
 src/runtime/proc.go                        |  79 +++++++++++++++++++-
 src/runtime/proc_test.go                   |  50 ++++++++++---
 src/runtime/testdata/testprog/stw_trace.go | 111 ++++++++++++++++++++++++++++-
 4 files changed, 236 insertions(+), 13 deletions(-)

(limited to 'src/runtime')

diff --git a/src/runtime/mgcpacer.go b/src/runtime/mgcpacer.go
index f78fb6f636..388cce83cd 100644
--- a/src/runtime/mgcpacer.go
+++ b/src/runtime/mgcpacer.go
@@ -870,9 +870,12 @@ func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) {
 		gcCPULimiter.update(now)
 	}
 
-	ok, now := c.assignWaitingGCWorker(pp, now)
-	if !ok {
-		return nil, now
+	// If a worker wasn't already assigned by procresize, assign one now.
+	if pp.nextGCMarkWorker == nil {
+		ok, now := c.assignWaitingGCWorker(pp, now)
+		if !ok {
+			return nil, now
+		}
 	}
 
 	node := pp.nextGCMarkWorker
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index a378b8c39d..62e79e74e2 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -4135,11 +4135,23 @@ top:
 
 	gp, inheritTime, tryWakeP := findRunnable() // blocks until work is available
 
+	// May be on a new P.
+	pp = mp.p.ptr()
+
 	// findRunnable may have collected an allp snapshot. The snapshot is
 	// only required within findRunnable. Clear it to allow the GC to collect
 	// the slice.
 	mp.clearAllpSnapshot()
 
+	// If the P was assigned a next GC mark worker but findRunnable
+	// selected anything else, release the worker so another P may run it.
+	//
+	// N.B. If this occurs because a higher-priority goroutine was selected
+	// (trace reader), then tryWakeP is set, which will wake another P to
+	// run the worker. If this occurs because the GC is no longer active,
+	// there is no need to wakep.
+ gcController.releaseNextGCMarkWorker(pp) + if debug.dontfreezetheworld > 0 && freezing.Load() { // See comment in freezetheworld. We don't want to perturb // scheduler state, so we didn't gcstopm in findRunnable, but @@ -6036,8 +6048,10 @@ func procresize(nprocs int32) *p { unlock(&allpLock) } + // Assign Ms to Ps with runnable goroutines. var runnablePs *p var runnablePsNeedM *p + var idlePs *p for i := nprocs - 1; i >= 0; i-- { pp := allp[i] if gp.m.p.ptr() == pp { @@ -6045,7 +6059,8 @@ func procresize(nprocs int32) *p { } pp.status = _Pidle if runqempty(pp) { - pidleput(pp, now) + pp.link.set(idlePs) + idlePs = pp continue } @@ -6071,6 +6086,8 @@ func procresize(nprocs int32) *p { pp.link.set(runnablePs) runnablePs = pp } + // Assign Ms to remaining runnable Ps without usable oldm. See comment + // above. for runnablePsNeedM != nil { pp := runnablePsNeedM runnablePsNeedM = pp.link.ptr() @@ -6081,6 +6098,62 @@ func procresize(nprocs int32) *p { runnablePs = pp } + // Now that we've assigned Ms to Ps with runnable goroutines, assign GC + // mark workers to remaining idle Ps, if needed. + // + // By assigning GC workers to Ps here, we slightly speed up starting + // the world, as we will start enough Ps to run all of the user + // goroutines and GC mark workers all at once, rather than using a + // sequence of wakep calls as each P's findRunnable realizes it needs + // to run a mark worker instead of a user goroutine. + // + // By assigning GC workers to Ps only _after_ previously-running Ps are + // assigned Ms, we ensure that goroutines previously running on a P + // continue to run on the same P, with GC mark workers preferring + // previously-idle Ps. This helps prevent goroutines from shuffling + // around too much across STW. + // + // N.B., if there aren't enough Ps left in idlePs for all of the GC + // mark workers, then findRunnable will still choose to run mark + // workers on Ps assigned above. 
+	//
+	// N.B., we do this during any STW in the mark phase, not just the
+	// sweep termination STW that starts the mark phase. gcBgMarkWorker
+	// always preempts by removing itself from the P, so even unrelated
+	// STWs during the mark require that Ps reselect mark workers upon
+	// restart.
+	if gcBlackenEnabled != 0 {
+		for idlePs != nil {
+			pp := idlePs
+
+			ok, _ := gcController.assignWaitingGCWorker(pp, now)
+			if !ok {
+				// No more mark workers needed.
+				break
+			}
+
+			// Got a worker, P is now runnable.
+			//
+			// mget may return nil if there aren't enough Ms, in
+			// which case startTheWorldWithSema will start one.
+			//
+			// N.B. findRunnableGCWorker will make the worker G
+			// itself runnable.
+			idlePs = pp.link.ptr()
+			mp := mget()
+			pp.m.set(mp)
+			pp.link.set(runnablePs)
+			runnablePs = pp
+		}
+	}
+
+	// Finally, any remaining Ps are truly idle.
+	for idlePs != nil {
+		pp := idlePs
+		idlePs = pp.link.ptr()
+		pidleput(pp, now)
+	}
+
 	stealOrder.reset(uint32(nprocs))
 	var int32p *int32 = &gomaxprocs // make compiler check that gomaxprocs is an int32
 	atomic.Store((*uint32)(unsafe.Pointer(int32p)), uint32(nprocs))
@@ -6183,6 +6256,10 @@ func releasepNoTrace() *p {
 		print("releasep: m=", gp.m, " m->p=", gp.m.p.ptr(), " p->m=", hex(pp.m), " p->status=", pp.status, "\n")
 		throw("releasep: invalid p state")
 	}
+
+	// P must clear nextGCMarkWorker if it stops.
+ gcController.releaseNextGCMarkWorker(pp) + gp.m.p = 0 pp.m = 0 pp.status = _Pidle diff --git a/src/runtime/proc_test.go b/src/runtime/proc_test.go index b3084f4895..35a1aeab1f 100644 --- a/src/runtime/proc_test.go +++ b/src/runtime/proc_test.go @@ -1221,7 +1221,7 @@ func TestTraceSTW(t *testing.T) { var errors int for i := range runs { - err := runTestTracesSTW(t, i) + err := runTestTracesSTW(t, i, "TraceSTW", "stop-the-world (read mem stats)") if err != nil { t.Logf("Run %d failed: %v", i, err) errors++ @@ -1235,7 +1235,43 @@ func TestTraceSTW(t *testing.T) { } } -func runTestTracesSTW(t *testing.T, run int) (err error) { +// TestTraceGCSTW verifies that goroutines continue running on the same M and P +// after a GC STW. +func TestTraceGCSTW(t *testing.T) { + // Very similar to TestTraceSTW, but using a STW that starts the GC. + // When the GC starts, the background GC mark workers start running, + // which provide an additional source of disturbance to the scheduler. + // + // procresize assigns GC workers to previously-idle Ps to avoid + // changing what the previously-running Ps are doing. + + if testing.Short() { + t.Skip("skipping in -short mode") + } + + if runtime.NumCPU() < 8 { + t.Skip("This test sets GOMAXPROCS=8 and wants to avoid thread descheduling as much as possible. 
Skip on machines with less than 8 CPUs") + } + + const runs = 50 + + var errors int + for i := range runs { + err := runTestTracesSTW(t, i, "TraceGCSTW", "stop-the-world (GC sweep termination)") + if err != nil { + t.Logf("Run %d failed: %v", i, err) + errors++ + } + } + + pct := float64(errors)/float64(runs) + t.Logf("Errors: %d/%d = %f%%", errors, runs, 100*pct) + if pct > 0.25 { + t.Errorf("Error rate too high") + } +} + +func runTestTracesSTW(t *testing.T, run int, name, stwType string) (err error) { t.Logf("Run %d", run) // By default, TSAN sleeps for 1s at exit to allow background @@ -1243,7 +1279,7 @@ func runTestTracesSTW(t *testing.T, run int) (err error) { // much, since we are running 50 iterations, so disable the sleep. // // Outside of race mode, GORACE does nothing. - buf := []byte(runTestProg(t, "testprog", "TraceSTW", "GORACE=atexit_sleep_ms=0")) + buf := []byte(runTestProg(t, "testprog", name, "GORACE=atexit_sleep_ms=0")) // We locally "fail" the run (return an error) if the trace exhibits // unwanted scheduling. i.e., the target goroutines did not remain on @@ -1253,7 +1289,7 @@ func runTestTracesSTW(t *testing.T, run int) (err error) { // occur, such as a trace parse error. defer func() { if err != nil || t.Failed() { - testtrace.Dump(t, fmt.Sprintf("TestTraceSTW-run%d", run), []byte(buf), false) + testtrace.Dump(t, fmt.Sprintf("Test%s-run%d", name, run), []byte(buf), false) } }() @@ -1509,12 +1545,10 @@ findEnd: break findEnd case trace.EventRangeBegin: r := ev.Range() - if r.Name == "stop-the-world (read mem stats)" { + if r.Name == stwType { // Note when we see the STW begin. This is not // load bearing; it's purpose is simply to fail - // the test if we manage to remove the STW from - // ReadMemStat, so we remember to change this - // test to add some new source of STW. + // the test if we accidentally remove the STW. 
stwSeen = true } } diff --git a/src/runtime/testdata/testprog/stw_trace.go b/src/runtime/testdata/testprog/stw_trace.go index 0fed55b875..0fa15da09e 100644 --- a/src/runtime/testdata/testprog/stw_trace.go +++ b/src/runtime/testdata/testprog/stw_trace.go @@ -7,15 +7,18 @@ package main import ( "context" "log" + "math/rand/v2" "os" "runtime" "runtime/debug" + "runtime/metrics" "runtime/trace" "sync/atomic" ) func init() { register("TraceSTW", TraceSTW) + register("TraceGCSTW", TraceGCSTW) } // The parent writes to ping and waits for the children to write back @@ -53,7 +56,7 @@ func TraceSTW() { // https://go.dev/issue/65694). Alternatively, we could just ignore the // trace if the GC runs. runtime.GOMAXPROCS(4) - debug.SetGCPercent(0) + debug.SetGCPercent(-1) if err := trace.Start(os.Stdout); err != nil { log.Fatalf("failed to start tracing: %v", err) @@ -86,6 +89,112 @@ func TraceSTW() { stop.Store(true) } +// Variant of TraceSTW for GC STWs. We want the GC mark workers to start on +// previously-idle Ps, rather than bumping the current P. +func TraceGCSTW() { + ctx := context.Background() + + // The idea here is to have 2 target goroutines that are constantly + // running. When the world restarts after STW, we expect these + // goroutines to continue execution on the same M and P. + // + // Set GOMAXPROCS=8 to make room for the 2 target goroutines, 1 parent, + // 2 dedicated workers, and a bit of slack. + // + // Disable the GC initially so we can be sure it only triggers once we + // are ready. + runtime.GOMAXPROCS(8) + debug.SetGCPercent(-1) + + if err := trace.Start(os.Stdout); err != nil { + log.Fatalf("failed to start tracing: %v", err) + } + defer trace.Stop() + + for i := range 2 { + go traceSTWTarget(i) + } + + // Wait for children to start running. 
+ ping.Store(1) + for pong[0].Load() != 1 {} + for pong[1].Load() != 1 {} + + trace.Log(ctx, "TraceSTW", "start") + + // STW + triggerGC() + + // Make sure to run long enough for the children to schedule again + // after STW. This is included for good measure, but the goroutines + // really ought to have already scheduled since the entire GC + // completed. + ping.Store(2) + for pong[0].Load() != 2 {} + for pong[1].Load() != 2 {} + + trace.Log(ctx, "TraceSTW", "end") + + stop.Store(true) +} + +func triggerGC() { + // Allocate a bunch to trigger the GC rather than using runtime.GC. The + // latter blocks until the GC is complete, which is convenient, but + // messes with scheduling as it gives this P a chance to steal the + // other goroutines before their Ps get up and running again. + + // Bring heap size up prior to enabling the GC to ensure that there is + // a decent amount of work in case the GC triggers immediately upon + // re-enabling. + for range 1000 { + alloc() + } + + sample := make([]metrics.Sample, 1) + sample[0].Name = "/gc/cycles/total:gc-cycles" + metrics.Read(sample) + + start := sample[0].Value.Uint64() + + debug.SetGCPercent(100) + + // Keep allocating until the GC is complete. We really only need to + // continue until the mark workers are scheduled, but there isn't a + // good way to measure that. + for { + metrics.Read(sample) + if sample[0].Value.Uint64() != start { + return + } + + alloc() + } +} + +// Allocate a tree data structure to generate plenty of scan work for the GC. + +type node struct { + children []*node +} + +var gcSink node + +func alloc() { + // 10% chance of adding a node a each layer. + + curr := &gcSink + for { + if len(curr.children) == 0 || rand.Float32() < 0.1 { + curr.children = append(curr.children, new(node)) + return + } + + i := rand.IntN(len(curr.children)) + curr = curr.children[i] + } +} + // Manually insert a morestack call. Leaf functions can omit morestack, but // non-leaf functions should include them. 
-- cgit v1.3


From 32f5aadd2ffc60421c62b185fa7668012fb5e73e Mon Sep 17 00:00:00 2001
From: "khr@golang.org"
Date: Fri, 12 Sep 2025 14:43:19 -0700
Subject: cmd/compile: stack allocate backing stores during append

We can already stack allocate the backing store during append
if the resulting backing store doesn't escape. See CL 664299.

This CL enables us to often stack allocate the backing store during
append *even if* the result escapes.

Typically, for code like:

func f(n int) []int {
	var r []int
	for i := range n {
		r = append(r, i)
	}
	return r
}

the backing store for r escapes, but only by returning it. Could we
operate with r on the stack for most of its lifetime, and only move it
to the heap at the return point?

The current implementation of append will need to do an allocation
each time it calls growslice. This will happen on the 1st, 2nd, 4th,
8th, etc. append calls. The allocations done by all but the last
growslice call will then immediately be garbage. We'd like to avoid
doing some of those intermediate allocations if possible.

We rewrite the above code by introducing a move2heap operation:

func f(n int) []int {
	var r []int
	for i := range n {
		r = append(r, i)
	}
	r = move2heap(r)
	return r
}

Using the move2heap runtime function, which does:

move2heap(r):
	If r is already backed by heap storage, return r.
	Otherwise, copy r to the heap and return the copy.

Now we can treat the backing store of r allocated at the append site
as not escaping. Previous stack allocation optimizations now apply,
which can use a fixed-size stack-allocated backing store for r when
appending.

See the description in cmd/compile/internal/slice/slice.go for how we
ensure that this optimization is safe.
Change-Id: I81f36e58bade2241d07f67967d8d547fff5302b8 Reviewed-on: https://go-review.googlesource.com/c/go/+/707755 Reviewed-by: Keith Randall Reviewed-by: David Chase LUCI-TryBot-Result: Go LUCI --- src/cmd/compile/internal/deadlocals/deadlocals.go | 5 + src/cmd/compile/internal/escape/leaks.go | 18 + src/cmd/compile/internal/gc/main.go | 3 + src/cmd/compile/internal/ir/expr.go | 26 ++ src/cmd/compile/internal/ir/name.go | 2 +- src/cmd/compile/internal/ir/node.go | 1 + src/cmd/compile/internal/ir/node_gen.go | 28 ++ src/cmd/compile/internal/ir/op_string.go | 19 +- src/cmd/compile/internal/ir/stmt.go | 1 + src/cmd/compile/internal/ir/symtab.go | 5 + src/cmd/compile/internal/slice/slice.go | 455 +++++++++++++++++++++ src/cmd/compile/internal/ssa/_gen/AMD64Ops.go | 3 +- src/cmd/compile/internal/ssa/opGen.go | 2 +- src/cmd/compile/internal/ssagen/ssa.go | 266 ++++++++++-- .../compile/internal/typecheck/_builtin/runtime.go | 6 + src/cmd/compile/internal/typecheck/builtin.go | 212 +++++----- src/cmd/compile/internal/walk/expr.go | 5 + src/runtime/slice.go | 104 +++++ src/runtime/slice_test.go | 319 +++++++++++++++ test/codegen/append.go | 190 +++++++++ 20 files changed, 1519 insertions(+), 151 deletions(-) create mode 100644 src/cmd/compile/internal/slice/slice.go create mode 100644 test/codegen/append.go (limited to 'src/runtime') diff --git a/src/cmd/compile/internal/deadlocals/deadlocals.go b/src/cmd/compile/internal/deadlocals/deadlocals.go index 238450416a..55ad0387a4 100644 --- a/src/cmd/compile/internal/deadlocals/deadlocals.go +++ b/src/cmd/compile/internal/deadlocals/deadlocals.go @@ -44,6 +44,11 @@ func Funcs(fns []*ir.Func) { *as.lhs = ir.BlankNode *as.rhs = zero } + if len(assigns) > 0 { + // k.Defn might be pointing at one of the + // assignments we're overwriting. 
+ k.Defn = nil + } } } } diff --git a/src/cmd/compile/internal/escape/leaks.go b/src/cmd/compile/internal/escape/leaks.go index 942f87d2a2..176bccd847 100644 --- a/src/cmd/compile/internal/escape/leaks.go +++ b/src/cmd/compile/internal/escape/leaks.go @@ -124,3 +124,21 @@ func parseLeaks(s string) leaks { copy(l[:], s[4:]) return l } + +func ParseLeaks(s string) leaks { + return parseLeaks(s) +} + +// Any reports whether the value flows anywhere at all. +func (l leaks) Any() bool { + // TODO: do mutator/callee matter? + if l.Heap() >= 0 || l.Mutator() >= 0 || l.Callee() >= 0 { + return true + } + for i := range numEscResults { + if l.Result(i) >= 0 { + return true + } + } + return false +} diff --git a/src/cmd/compile/internal/gc/main.go b/src/cmd/compile/internal/gc/main.go index 42e2afaee4..ef6a5d6017 100644 --- a/src/cmd/compile/internal/gc/main.go +++ b/src/cmd/compile/internal/gc/main.go @@ -22,6 +22,7 @@ import ( "cmd/compile/internal/pkginit" "cmd/compile/internal/reflectdata" "cmd/compile/internal/rttype" + "cmd/compile/internal/slice" "cmd/compile/internal/ssa" "cmd/compile/internal/ssagen" "cmd/compile/internal/staticinit" @@ -266,6 +267,8 @@ func Main(archInit func(*ssagen.ArchInfo)) { base.Timer.Start("fe", "escapes") escape.Funcs(typecheck.Target.Funcs) + slice.Funcs(typecheck.Target.Funcs) + loopvar.LogTransformations(transformed) // Collect information for go:nowritebarrierrec diff --git a/src/cmd/compile/internal/ir/expr.go b/src/cmd/compile/internal/ir/expr.go index 7a75ff40f2..dd1b94aa0d 100644 --- a/src/cmd/compile/internal/ir/expr.go +++ b/src/cmd/compile/internal/ir/expr.go @@ -192,6 +192,7 @@ type CallExpr struct { IsDDD bool GoDefer bool // whether this call is part of a go or defer statement NoInline bool // whether this call must not be inlined + UseBuf bool // use stack buffer for backing store (OAPPEND only) } func NewCallExpr(pos src.XPos, op Op, fun Node, args []Node) *CallExpr { @@ -1269,3 +1270,28 @@ func MethodExprFunc(n Node) 
*types.Field { base.Fatalf("unexpected node: %v (%v)", n, n.Op()) panic("unreachable") } + +// A MoveToHeapExpr takes a slice as input and moves it to the +// heap (by copying the backing store if it is not already +// on the heap). +type MoveToHeapExpr struct { + miniExpr + Slice Node + // An expression that evaluates to a *runtime._type + // that represents the slice element type. + RType Node + // If PreserveCapacity is true, the capacity of + // the resulting slice, and all of the elements in + // [len:cap], must be preserved. + // If PreserveCapacity is false, the resulting + // slice may have any capacity >= len, with any + // elements in the resulting [len:cap] range zeroed. + PreserveCapacity bool +} + +func NewMoveToHeapExpr(pos src.XPos, slice Node) *MoveToHeapExpr { + n := &MoveToHeapExpr{Slice: slice} + n.pos = pos + n.op = OMOVE2HEAP + return n +} diff --git a/src/cmd/compile/internal/ir/name.go b/src/cmd/compile/internal/ir/name.go index 01f1c0c502..63f1b1c931 100644 --- a/src/cmd/compile/internal/ir/name.go +++ b/src/cmd/compile/internal/ir/name.go @@ -43,7 +43,7 @@ type Name struct { Func *Func // TODO(austin): nil for I.M Offset_ int64 val constant.Value - Opt any // for use by escape analysis + Opt any // for use by escape or slice analysis Embed *[]Embed // list of embedded files, for ONAME var // For a local variable (not param) or extern, the initializing assignment (OAS or OAS2). 
diff --git a/src/cmd/compile/internal/ir/node.go b/src/cmd/compile/internal/ir/node.go index 8c61bb6ed5..f26f61cb18 100644 --- a/src/cmd/compile/internal/ir/node.go +++ b/src/cmd/compile/internal/ir/node.go @@ -293,6 +293,7 @@ const ( OLINKSYMOFFSET // offset within a name OJUMPTABLE // A jump table structure for implementing dense expression switches OINTERFACESWITCH // A type switch with interface cases + OMOVE2HEAP // Promote a stack-backed slice to heap // opcodes for generics ODYNAMICDOTTYPE // x = i.(T) where T is a type parameter (or derived from a type parameter) diff --git a/src/cmd/compile/internal/ir/node_gen.go b/src/cmd/compile/internal/ir/node_gen.go index 2221045c93..4298b3a43d 100644 --- a/src/cmd/compile/internal/ir/node_gen.go +++ b/src/cmd/compile/internal/ir/node_gen.go @@ -1175,6 +1175,34 @@ func (n *MakeExpr) editChildrenWithHidden(edit func(Node) Node) { } } +func (n *MoveToHeapExpr) Format(s fmt.State, verb rune) { fmtNode(n, s, verb) } +func (n *MoveToHeapExpr) copy() Node { + c := *n + c.init = copyNodes(c.init) + return &c +} +func (n *MoveToHeapExpr) doChildren(do func(Node) bool) bool { + if doNodes(n.init, do) { + return true + } + if n.Slice != nil && do(n.Slice) { + return true + } + return false +} +func (n *MoveToHeapExpr) doChildrenWithHidden(do func(Node) bool) bool { + return n.doChildren(do) +} +func (n *MoveToHeapExpr) editChildren(edit func(Node) Node) { + editNodes(n.init, edit) + if n.Slice != nil { + n.Slice = edit(n.Slice).(Node) + } +} +func (n *MoveToHeapExpr) editChildrenWithHidden(edit func(Node) Node) { + n.editChildren(edit) +} + func (n *Name) Format(s fmt.State, verb rune) { fmtNode(n, s, verb) } func (n *NilExpr) Format(s fmt.State, verb rune) { fmtNode(n, s, verb) } diff --git a/src/cmd/compile/internal/ir/op_string.go b/src/cmd/compile/internal/ir/op_string.go index 7494beee4c..f042ad84a4 100644 --- a/src/cmd/compile/internal/ir/op_string.go +++ b/src/cmd/compile/internal/ir/op_string.go @@ -151,18 +151,19 @@ 
func _() { _ = x[OLINKSYMOFFSET-140] _ = x[OJUMPTABLE-141] _ = x[OINTERFACESWITCH-142] - _ = x[ODYNAMICDOTTYPE-143] - _ = x[ODYNAMICDOTTYPE2-144] - _ = x[ODYNAMICTYPE-145] - _ = x[OTAILCALL-146] - _ = x[OGETG-147] - _ = x[OGETCALLERSP-148] - _ = x[OEND-149] + _ = x[OMOVE2HEAP-143] + _ = x[ODYNAMICDOTTYPE-144] + _ = x[ODYNAMICDOTTYPE2-145] + _ = x[ODYNAMICTYPE-146] + _ = x[OTAILCALL-147] + _ = x[OGETG-148] + _ = x[OGETCALLERSP-149] + _ = x[OEND-150] } -const _Op_name = "XXXNAMENONAMETYPELITERALNILADDSUBORXORADDSTRADDRANDANDAPPENDBYTES2STRBYTES2STRTMPRUNES2STRSTR2BYTESSTR2BYTESTMPSTR2RUNESSLICE2ARRSLICE2ARRPTRASAS2AS2DOTTYPEAS2FUNCAS2MAPRAS2RECVASOPCALLCALLFUNCCALLMETHCALLINTERCAPCLEARCLOSECLOSURECOMPLITMAPLITSTRUCTLITARRAYLITSLICELITPTRLITCONVCONVIFACECONVNOPCOPYDCLDCLFUNCDELETEDOTDOTPTRDOTMETHDOTINTERXDOTDOTTYPEDOTTYPE2EQNELTLEGEGTDEREFINDEXINDEXMAPKEYSTRUCTKEYLENMAKEMAKECHANMAKEMAPMAKESLICEMAKESLICECOPYMULDIVMODLSHRSHANDANDNOTNEWNOTBITNOTPLUSNEGORORPANICPRINTPRINTLNPARENSENDSLICESLICEARRSLICESTRSLICE3SLICE3ARRSLICEHEADERSTRINGHEADERRECOVERRECVRUNESTRSELRECV2MINMAXREALIMAGCOMPLEXUNSAFEADDUNSAFESLICEUNSAFESLICEDATAUNSAFESTRINGUNSAFESTRINGDATAMETHEXPRMETHVALUEBLOCKBREAKCASECONTINUEDEFERFALLFORGOTOIFLABELGORANGERETURNSELECTSWITCHTYPESWINLCALLMAKEFACEITABIDATASPTRCFUNCCHECKNILRESULTINLMARKLINKSYMOFFSETJUMPTABLEINTERFACESWITCHDYNAMICDOTTYPEDYNAMICDOTTYPE2DYNAMICTYPETAILCALLGETGGETCALLERSPEND" +const _Op_name = 
"XXXNAMENONAMETYPELITERALNILADDSUBORXORADDSTRADDRANDANDAPPENDBYTES2STRBYTES2STRTMPRUNES2STRSTR2BYTESSTR2BYTESTMPSTR2RUNESSLICE2ARRSLICE2ARRPTRASAS2AS2DOTTYPEAS2FUNCAS2MAPRAS2RECVASOPCALLCALLFUNCCALLMETHCALLINTERCAPCLEARCLOSECLOSURECOMPLITMAPLITSTRUCTLITARRAYLITSLICELITPTRLITCONVCONVIFACECONVNOPCOPYDCLDCLFUNCDELETEDOTDOTPTRDOTMETHDOTINTERXDOTDOTTYPEDOTTYPE2EQNELTLEGEGTDEREFINDEXINDEXMAPKEYSTRUCTKEYLENMAKEMAKECHANMAKEMAPMAKESLICEMAKESLICECOPYMULDIVMODLSHRSHANDANDNOTNEWNOTBITNOTPLUSNEGORORPANICPRINTPRINTLNPARENSENDSLICESLICEARRSLICESTRSLICE3SLICE3ARRSLICEHEADERSTRINGHEADERRECOVERRECVRUNESTRSELRECV2MINMAXREALIMAGCOMPLEXUNSAFEADDUNSAFESLICEUNSAFESLICEDATAUNSAFESTRINGUNSAFESTRINGDATAMETHEXPRMETHVALUEBLOCKBREAKCASECONTINUEDEFERFALLFORGOTOIFLABELGORANGERETURNSELECTSWITCHTYPESWINLCALLMAKEFACEITABIDATASPTRCFUNCCHECKNILRESULTINLMARKLINKSYMOFFSETJUMPTABLEINTERFACESWITCHMOVE2HEAPDYNAMICDOTTYPEDYNAMICDOTTYPE2DYNAMICTYPETAILCALLGETGGETCALLERSPEND" -var _Op_index = [...]uint16{0, 3, 7, 13, 17, 24, 27, 30, 33, 35, 38, 44, 48, 54, 60, 69, 81, 90, 99, 111, 120, 129, 141, 143, 146, 156, 163, 170, 177, 181, 185, 193, 201, 210, 213, 218, 223, 230, 237, 243, 252, 260, 268, 274, 278, 287, 294, 298, 301, 308, 314, 317, 323, 330, 338, 342, 349, 357, 359, 361, 363, 365, 367, 369, 374, 379, 387, 390, 399, 402, 406, 414, 421, 430, 443, 446, 449, 452, 455, 458, 461, 467, 470, 473, 479, 483, 486, 490, 495, 500, 507, 512, 516, 521, 529, 537, 543, 552, 563, 575, 582, 586, 593, 601, 604, 607, 611, 615, 622, 631, 642, 657, 669, 685, 693, 702, 707, 712, 716, 724, 729, 733, 736, 740, 742, 747, 749, 754, 760, 766, 772, 778, 785, 793, 797, 802, 806, 811, 819, 825, 832, 845, 854, 869, 883, 898, 909, 917, 921, 932, 935} +var _Op_index = [...]uint16{0, 3, 7, 13, 17, 24, 27, 30, 33, 35, 38, 44, 48, 54, 60, 69, 81, 90, 99, 111, 120, 129, 141, 143, 146, 156, 163, 170, 177, 181, 185, 193, 201, 210, 213, 218, 223, 230, 237, 243, 252, 260, 268, 274, 278, 287, 294, 298, 301, 308, 314, 317, 323, 330, 338, 342, 
349, 357, 359, 361, 363, 365, 367, 369, 374, 379, 387, 390, 399, 402, 406, 414, 421, 430, 443, 446, 449, 452, 455, 458, 461, 467, 470, 473, 479, 483, 486, 490, 495, 500, 507, 512, 516, 521, 529, 537, 543, 552, 563, 575, 582, 586, 593, 601, 604, 607, 611, 615, 622, 631, 642, 657, 669, 685, 693, 702, 707, 712, 716, 724, 729, 733, 736, 740, 742, 747, 749, 754, 760, 766, 772, 778, 785, 793, 797, 802, 806, 811, 819, 825, 832, 845, 854, 869, 878, 892, 907, 918, 926, 930, 941, 944} func (i Op) String() string { if i >= Op(len(_Op_index)-1) { diff --git a/src/cmd/compile/internal/ir/stmt.go b/src/cmd/compile/internal/ir/stmt.go index 0801ecdd9e..affa5f4551 100644 --- a/src/cmd/compile/internal/ir/stmt.go +++ b/src/cmd/compile/internal/ir/stmt.go @@ -42,6 +42,7 @@ func (*Decl) isStmt() {} type Stmt interface { Node isStmt() + PtrInit() *Nodes } // A miniStmt is a miniNode with extra fields common to statements. diff --git a/src/cmd/compile/internal/ir/symtab.go b/src/cmd/compile/internal/ir/symtab.go index f8eb457880..828c3b553a 100644 --- a/src/cmd/compile/internal/ir/symtab.go +++ b/src/cmd/compile/internal/ir/symtab.go @@ -29,6 +29,11 @@ type symsStruct struct { GCWriteBarrier [8]*obj.LSym Goschedguarded *obj.LSym Growslice *obj.LSym + GrowsliceBuf *obj.LSym + MoveSlice *obj.LSym + MoveSliceNoScan *obj.LSym + MoveSliceNoCap *obj.LSym + MoveSliceNoCapNoScan *obj.LSym InterfaceSwitch *obj.LSym MallocGC *obj.LSym MallocGCSmallNoScan [27]*obj.LSym diff --git a/src/cmd/compile/internal/slice/slice.go b/src/cmd/compile/internal/slice/slice.go new file mode 100644 index 0000000000..7a32e7adbd --- /dev/null +++ b/src/cmd/compile/internal/slice/slice.go @@ -0,0 +1,455 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package slice + +// This file implements a stack-allocation optimization +// for the backing store of slices. 
+// +// Consider the code: +// +// var s []int +// for i := range ... { +// s = append(s, i) +// } +// return s +// +// Some of the append operations will need to do an allocation +// by calling growslice. This will happen on the 1st, 2nd, 4th, +// 8th, etc. append calls. The allocations done by all but the +// last growslice call will then immediately be garbage. +// +// We'd like to avoid doing some of those intermediate +// allocations if possible. +// +// If we can determine that the "return s" statement is the +// *only* way that the backing store for s escapes, then we +// can rewrite the code to something like: +// +// var s []int +// for i := range N { +// s = append(s, i) +// } +// s = move2heap(s) +// return s +// +// Using the move2heap runtime function, which does: +// +// move2heap(s): +// If s is not backed by a stackframe-allocated +// backing store, return s. Otherwise, copy s +// to the heap and return the copy. +// +// Now we can treat the backing store of s allocated at the +// append site as not escaping. Previous stack allocation +// optimizations now apply, which can use a fixed-size +// stack-allocated backing store for s when appending. +// (See ../ssagen/ssa.go:(*state).append) +// +// It is tricky to do this optimization safely. To describe +// our analysis, we first define what an "exclusive" slice +// variable is. +// +// A slice variable (a variable of slice type) is called +// "exclusive" if, when it has a reference to a +// stackframe-allocated backing store, it is the only +// variable with such a reference. +// +// In other words, a slice variable is exclusive if +// any of the following holds: +// 1) It points to a heap-allocated backing store +// 2) It points to a stack-allocated backing store +// for any parent frame. +// 3) It is the only variable that references its +// backing store. +// 4) It is nil. 
+// +// The nice thing about exclusive slice variables is that +// it is always safe to do +// s = move2heap(s) +// whenever s is an exclusive slice variable. Because no +// one else has a reference to the backing store, no one +// else can tell that we moved the backing store from one +// location to another. +// +// Note that exclusiveness is a dynamic property. A slice +// variable may be exclusive during some parts of execution +// and not exclusive during others. +// +// The following operations set or preserve the exclusivity +// of a slice variable s: +// s = nil +// s = append(s, ...) +// s = s[i:j] +// ... = s[i] +// s[i] = ... +// f(s) where f does not escape its argument +// Other operations destroy exclusivity. A non-exhaustive list includes: +// x = s +// *p = s +// f(s) where f escapes its argument +// return s +// To err on the safe side, we whitelist exclusivity-preserving +// operations and we assume that any other operations that mention s +// destroy its exclusivity. +// +// Our strategy is to move the backing store of s to the heap before +// any exclusive->nonexclusive transition. That way, s will only ever +// have a reference to a stack backing store while it is exclusive. +// +// move2heap for a variable s is implemented with: +// if s points to within the stack frame { +// s2 := make([]T, s.len, s.cap) +// copy(s2[:s.cap], s[:s.cap]) +// s = s2 +// } +// Note that in general we need to copy all of s[:cap(s)] elements when +// moving to the heap. As an optimization, we keep track of slice variables +// whose capacity, and the elements in s[len(s):cap(s)], are never accessed. +// For those slice variables, we can allocate to the next size class above +// the length, which saves memory and copying cost. 
+ +import ( + "cmd/compile/internal/base" + "cmd/compile/internal/escape" + "cmd/compile/internal/ir" + "cmd/compile/internal/reflectdata" +) + +func Funcs(all []*ir.Func) { + if base.Flag.N != 0 { + return + } + for _, fn := range all { + analyze(fn) + } +} + +func analyze(fn *ir.Func) { + type sliceInfo struct { + // Slice variable. + s *ir.Name + + // Count of uses that this pass understands. + okUses int32 + // Count of all uses found. + allUses int32 + + // A place where the slice variable transitions from + // exclusive to nonexclusive. + // We could keep track of more than one, but one is enough for now. + // Currently, this can be either a return statement or + // an assignment. + // TODO: other possible transitions? + transition ir.Stmt + + // Each s = append(s, ...) instance we found. + appends []*ir.CallExpr + + // Weight of the number of s = append(s, ...) instances we found. + // The optimizations we do are only really useful if there are at + // least weight 2. (Note: appends in loops have weight >= 2.) + appendWeight int + + // Whether we ever do cap(s), or other operations that use cap(s) + // (possibly implicitly), like s[i:j]. + capUsed bool + } + + // Every variable (*ir.Name) that we are tracking will have + // a non-nil *sliceInfo in its Opt field. + haveLocalSlice := false + maxStackSize := int64(base.Debug.VariableMakeThreshold) + var namedRets []*ir.Name + for _, s := range fn.Dcl { + if !s.Type().IsSlice() { + continue + } + if s.Type().Elem().Size() > maxStackSize { + continue + } + if !base.VariableMakeHash.MatchPos(s.Pos(), nil) { + continue + } + s.Opt = &sliceInfo{s: s} // start tracking s + haveLocalSlice = true + if s.Class == ir.PPARAMOUT { + namedRets = append(namedRets, s) + } + } + if !haveLocalSlice { + return + } + + // Keep track of loop depth while walking. + loopDepth := 0 + + // tracking returns the info for the slice variable if n is a slice + // variable that we're still considering, or nil otherwise. 
+ tracking := func(n ir.Node) *sliceInfo { + if n == nil || n.Op() != ir.ONAME { + return nil + } + s := n.(*ir.Name) + if s.Opt == nil { + return nil + } + return s.Opt.(*sliceInfo) + } + + // addTransition(n, loc) records that s experiences an exclusive->nonexclusive + // transition somewhere within loc. + addTransition := func(i *sliceInfo, loc ir.Stmt) { + if i.transition != nil { + // We only keep track of a single exclusive->nonexclusive transition + // for a slice variable. If we find more than one, give up. + // (More than one transition location would be fine, but we would + // start to get worried about introducing too much additional code.) + i.s.Opt = nil + return + } + i.transition = loc + } + + // Examine an x = y assignment that occurs somewhere within statement stmt. + assign := func(x, y ir.Node, stmt ir.Stmt) { + if i := tracking(x); i != nil { + // s = y. Check for understood patterns for y. + if y == nil || y.Op() == ir.ONIL { + // s = nil is ok. + i.okUses++ + } else if y.Op() == ir.OSLICELIT { + // s = []{...} is ok. + // Note: this reveals capacity. Should it? + i.okUses++ + i.capUsed = true + } else if y.Op() == ir.OSLICE { + y := y.(*ir.SliceExpr) + if y.X == i.s { + // s = s[...:...] is ok + i.okUses += 2 + i.capUsed = true + } + } else if y.Op() == ir.OAPPEND { + y := y.(*ir.CallExpr) + if y.Args[0] == i.s { + // s = append(s, ...) is ok + i.okUses += 2 + i.appends = append(i.appends, y) + i.appendWeight += 1 + loopDepth + } + // TODO: s = append(nil, ...)? + } + // Note that technically s = make([]T, ...) preserves exclusivity, but + // we don't track that because we assume users who wrote that know + // better than the compiler does. + + // TODO: figure out how to handle s = fn(..., s, ...) + // It would be nice to maintain exclusivity of s in this situation. + // But unfortunately, fn can return one of its other arguments, which + // may be a slice with a stack-allocated backing store other than s. 
+ // (which may have preexisting references to its backing store). + // + // Maybe we could do it if s is the only argument? + } + + if i := tracking(y); i != nil { + // ... = s + // Treat this as an exclusive->nonexclusive transition. + i.okUses++ + addTransition(i, stmt) + } + } + + var do func(ir.Node) bool + do = func(n ir.Node) bool { + if n == nil { + return false + } + switch n.Op() { + case ir.ONAME: + if i := tracking(n); i != nil { + // A use of a slice variable. Count it. + i.allUses++ + } + case ir.ODCL: + n := n.(*ir.Decl) + if i := tracking(n.X); i != nil { + i.okUses++ + } + case ir.OINDEX: + n := n.(*ir.IndexExpr) + if i := tracking(n.X); i != nil { + // s[i] is ok. + i.okUses++ + } + case ir.OLEN: + n := n.(*ir.UnaryExpr) + if i := tracking(n.X); i != nil { + // len(s) is ok + i.okUses++ + } + case ir.OCAP: + n := n.(*ir.UnaryExpr) + if i := tracking(n.X); i != nil { + // cap(s) is ok + i.okUses++ + i.capUsed = true + } + case ir.OADDR: + n := n.(*ir.AddrExpr) + if n.X.Op() == ir.OINDEX { + n := n.X.(*ir.IndexExpr) + if i := tracking(n.X); i != nil { + // &s[i] is definitely a nonexclusive transition. + // (We need this case because s[i] is ok, but &s[i] is not.) + i.s.Opt = nil + } + } + case ir.ORETURN: + n := n.(*ir.ReturnStmt) + for _, x := range n.Results { + if i := tracking(x); i != nil { + i.okUses++ + // We go exclusive->nonexclusive here + addTransition(i, n) + } + } + if len(n.Results) == 0 { + // Uses of named result variables are implicit here. + for _, x := range namedRets { + if i := tracking(x); i != nil { + addTransition(i, n) + } + } + } + case ir.OCALLFUNC: + n := n.(*ir.CallExpr) + for idx, arg := range n.Args { + if i := tracking(arg); i != nil { + if !argLeak(n, idx) { + // Passing s to a nonescaping arg is ok. + i.okUses++ + i.capUsed = true + } + } + } + case ir.ORANGE: + // Range over slice is ok. 
+ n := n.(*ir.RangeStmt) + if i := tracking(n.X); i != nil { + i.okUses++ + } + case ir.OAS: + n := n.(*ir.AssignStmt) + assign(n.X, n.Y, n) + case ir.OAS2: + n := n.(*ir.AssignListStmt) + for i := range len(n.Lhs) { + assign(n.Lhs[i], n.Rhs[i], n) + } + case ir.OCLOSURE: + n := n.(*ir.ClosureExpr) + for _, v := range n.Func.ClosureVars { + do(v.Outer) + } + } + if n.Op() == ir.OFOR || n.Op() == ir.ORANGE { + // Note: loopDepth isn't really right for init portion + // of the for statement, but that's ok. Correctness + // does not depend on depth info. + loopDepth++ + defer func() { loopDepth-- }() + } + // Check all the children. + ir.DoChildren(n, do) + return false + } + + // Run the analysis over the whole body. + for _, stmt := range fn.Body { + do(stmt) + } + + // Process accumulated info to find slice variables + // that we can allocate on the stack. + for _, s := range fn.Dcl { + if s.Opt == nil { + continue + } + i := s.Opt.(*sliceInfo) + s.Opt = nil + if i.okUses != i.allUses { + // Some use of i.s that don't understand lurks. Give up. + continue + } + + // At this point, we've decided that we *can* do + // the optimization. + + if i.transition == nil { + // Exclusive for its whole lifetime. That means it + // didn't escape. We can already handle nonescaping + // slices without this pass. + continue + } + if i.appendWeight < 2 { + // This optimization only really helps if there is + // (dynamically) more than one append. + continue + } + + // Commit point - at this point we've decided we *should* + // do the optimization. + + // Insert a move2heap operation before the exclusive->nonexclusive + // transition. 
+ move := ir.NewMoveToHeapExpr(i.transition.Pos(), i.s) + if i.capUsed { + move.PreserveCapacity = true + } + move.RType = reflectdata.AppendElemRType(i.transition.Pos(), i.appends[0]) + move.SetType(i.s.Type()) + move.SetTypecheck(1) + as := ir.NewAssignStmt(i.transition.Pos(), i.s, move) + as.SetTypecheck(1) + i.transition.PtrInit().Prepend(as) + // Note: we prepend because we need to put the move2heap + // operation first, before any other init work, as the transition + // might occur in the init work. + + // Now that we've inserted a move2heap operation before every + // exclusive -> nonexclusive transition, appends can now use + // stack backing stores. + // (This is the whole point of this pass, to enable stack + // allocation of append backing stores.) + for _, a := range i.appends { + a.SetEsc(ir.EscNone) + if i.capUsed { + a.UseBuf = true + } + } + } +} + +// argLeak reports if the idx'th argument to the call n escapes anywhere +// (to the heap, another argument, return value, etc.) +// If unknown returns true. 
+func argLeak(n *ir.CallExpr, idx int) bool { + if n.Op() != ir.OCALLFUNC { + return true + } + fn := ir.StaticCalleeName(ir.StaticValue(n.Fun)) + if fn == nil { + return true + } + fntype := fn.Type() + if recv := fntype.Recv(); recv != nil { + if idx == 0 { + return escape.ParseLeaks(recv.Note).Any() + } + idx-- + } + return escape.ParseLeaks(fntype.Params()[idx].Note).Any() +} diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go index e42b54398d..b23ccc4a70 100644 --- a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go +++ b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go @@ -118,6 +118,7 @@ func init() { gp11sb = regInfo{inputs: []regMask{gpspsbg}, outputs: gponly} gp21 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly} gp21sp = regInfo{inputs: []regMask{gpsp, gp}, outputs: gponly} + gp21sp2 = regInfo{inputs: []regMask{gp, gpsp}, outputs: gponly} gp21sb = regInfo{inputs: []regMask{gpspsbg, gpsp}, outputs: gponly} gp21shift = regInfo{inputs: []regMask{gp, cx}, outputs: []regMask{gp}} gp31shift = regInfo{inputs: []regMask{gp, gp, cx}, outputs: []regMask{gp}} @@ -262,7 +263,7 @@ func init() { {name: "ADDQconstmodify", argLength: 2, reg: gpstoreconst, asm: "ADDQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, {name: "ADDLconstmodify", argLength: 2, reg: gpstoreconst, asm: "ADDL", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, - {name: "SUBQ", argLength: 2, reg: gp21, asm: "SUBQ", resultInArg0: true, clobberFlags: true}, + {name: "SUBQ", argLength: 2, reg: gp21sp2, asm: "SUBQ", resultInArg0: true, clobberFlags: true}, {name: "SUBL", argLength: 2, reg: gp21, asm: "SUBL", resultInArg0: true, clobberFlags: true}, {name: "SUBQconst", argLength: 1, reg: gp11, asm: "SUBQ", aux: "Int32", resultInArg0: true, clobberFlags: true}, {name: "SUBLconst", argLength: 1, reg: gp11, asm: "SUBL", aux: "Int32", resultInArg0: true, clobberFlags: true}, 
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 944e1d7854..f13373d2c0 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -7643,7 +7643,7 @@ var opcodeTable = [...]opInfo{ reg: regInfo{ inputs: []inputInfo{ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 - {1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + {1, 49151}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 }, outputs: []outputInfo{ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go index 3dea733bbd..96be8ddd86 100644 --- a/src/cmd/compile/internal/ssagen/ssa.go +++ b/src/cmd/compile/internal/ssagen/ssa.go @@ -124,6 +124,11 @@ func InitConfig() { ir.Syms.GCWriteBarrier[7] = typecheck.LookupRuntimeFunc("gcWriteBarrier8") ir.Syms.Goschedguarded = typecheck.LookupRuntimeFunc("goschedguarded") ir.Syms.Growslice = typecheck.LookupRuntimeFunc("growslice") + ir.Syms.GrowsliceBuf = typecheck.LookupRuntimeFunc("growsliceBuf") + ir.Syms.MoveSlice = typecheck.LookupRuntimeFunc("moveSlice") + ir.Syms.MoveSliceNoScan = typecheck.LookupRuntimeFunc("moveSliceNoScan") + ir.Syms.MoveSliceNoCap = typecheck.LookupRuntimeFunc("moveSliceNoCap") + ir.Syms.MoveSliceNoCapNoScan = typecheck.LookupRuntimeFunc("moveSliceNoCapNoScan") ir.Syms.InterfaceSwitch = typecheck.LookupRuntimeFunc("interfaceSwitch") for i := 1; i < len(ir.Syms.MallocGCSmallNoScan); i++ { ir.Syms.MallocGCSmallNoScan[i] = typecheck.LookupRuntimeFunc(fmt.Sprintf("mallocgcSmallNoScanSC%d", i)) @@ -1091,6 +1096,23 @@ type state struct { // Block starting position, indexed by block id. blockStarts []src.XPos + + // Information for stack allocation. Indexed by the first argument + // to an append call. Normally a slice-typed variable, but not always. 
+ backingStores map[ir.Node]*backingStoreInfo +} + +type backingStoreInfo struct { + // Size of backing store array (in elements) + K int64 + // Stack-allocated backing store variable. + store *ir.Name + // Dynamic boolean variable marking the fact that we used this backing store. + used *ir.Name + // Have we used this variable statically yet? This is just a hint + // to avoid checking the dynamic variable if the answer is obvious. + // (usedStatic == true implies used == true) + usedStatic bool } type funcLine struct { @@ -3673,6 +3695,9 @@ func (s *state) exprCheckPtr(n ir.Node, checkPtrOK bool) *ssa.Value { case ir.OAPPEND: return s.append(n.(*ir.CallExpr), false) + case ir.OMOVE2HEAP: + return s.move2heap(n.(*ir.MoveToHeapExpr)) + case ir.OMIN, ir.OMAX: return s.minMax(n.(*ir.CallExpr)) @@ -3734,6 +3759,68 @@ func (s *state) resultAddrOfCall(c *ssa.Value, which int64, t *types.Type) *ssa. return addr } +// Get backing store information for an append call. +func (s *state) getBackingStoreInfoForAppend(n *ir.CallExpr) *backingStoreInfo { + if n.Esc() != ir.EscNone { + return nil + } + return s.getBackingStoreInfo(n.Args[0]) +} +func (s *state) getBackingStoreInfo(n ir.Node) *backingStoreInfo { + t := n.Type() + et := t.Elem() + maxStackSize := int64(base.Debug.VariableMakeThreshold) + if et.Size() == 0 || et.Size() > maxStackSize { + return nil + } + if base.Flag.N != 0 { + return nil + } + if !base.VariableMakeHash.MatchPos(n.Pos(), nil) { + return nil + } + i := s.backingStores[n] + if i != nil { + return i + } + + // Build type of backing store. + K := maxStackSize / et.Size() // rounds down + KT := types.NewArray(et, K) + KT.SetNoalg(true) + types.CalcArraySize(KT) + // Align more than naturally for the type KT. See issue 73199. 
+ align := types.NewArray(types.Types[types.TUINTPTR], 0) + types.CalcArraySize(align) + storeTyp := types.NewStruct([]*types.Field{ + {Sym: types.BlankSym, Type: align}, + {Sym: types.BlankSym, Type: KT}, + }) + storeTyp.SetNoalg(true) + types.CalcStructSize(storeTyp) + + // Make backing store variable. + backingStore := typecheck.TempAt(n.Pos(), s.curfn, storeTyp) + backingStore.SetAddrtaken(true) + + // Make "used" boolean. + used := typecheck.TempAt(n.Pos(), s.curfn, types.Types[types.TBOOL]) + if s.curBlock == s.f.Entry { + s.vars[used] = s.constBool(false) + } else { + // initialize this variable at end of entry block + s.defvars[s.f.Entry.ID][used] = s.constBool(false) + } + + // Initialize an info structure. + if s.backingStores == nil { + s.backingStores = map[ir.Node]*backingStoreInfo{} + } + i = &backingStoreInfo{K: K, store: backingStore, used: used, usedStatic: false} + s.backingStores[n] = i + return i +} + // append converts an OAPPEND node to SSA. // If inplace is false, it converts the OAPPEND expression n to an ssa.Value, // adds it to s, and returns the Value. @@ -3824,9 +3911,29 @@ func (s *state) append(n *ir.CallExpr, inplace bool) *ssa.Value { // A stack-allocated backing store could be used at every // append that qualifies, but we limit it in some cases to // avoid wasted code and stack space. - // TODO: handle ... append case. - maxStackSize := int64(base.Debug.VariableMakeThreshold) - if !inplace && n.Esc() == ir.EscNone && et.Size() > 0 && et.Size() <= maxStackSize && base.Flag.N == 0 && base.VariableMakeHash.MatchPos(n.Pos(), nil) && !s.appendTargets[sn] { + // + // Note that we have two different strategies. + // 1. The standard strategy is just to allocate the full + // backing store at the first append. + // 2. An alternate strategy is used when + // a. The backing store eventually escapes via move2heap + // and b. 
The capacity is used somehow + // In this case, we don't want to just allocate + // the full buffer at the first append, because when + // we move2heap the buffer to the heap when it escapes, + // we might end up wasting memory because we can't + // change the capacity. + // So in this case we use growsliceBuf to reuse the buffer + // and walk one step up the size class ladder each time. + // + // TODO: handle ... append case? Currently we handle only + // a fixed number of appended elements. + var info *backingStoreInfo + if !inplace { + info = s.getBackingStoreInfoForAppend(n) + } + + if !inplace && info != nil && !n.UseBuf && !info.usedStatic { // if l <= K { // if !used { // if oldLen == 0 { @@ -3850,43 +3957,19 @@ func (s *state) append(n *ir.CallExpr, inplace bool) *ssa.Value { // It is ok to do it more often, but it is probably helpful only for // the first instance. TODO: this could use more tuning. Using ir.Node // as the key works for *ir.Name instances but probably nothing else. - if s.appendTargets == nil { - s.appendTargets = map[ir.Node]bool{} - } - s.appendTargets[sn] = true - - K := maxStackSize / et.Size() // rounds down - KT := types.NewArray(et, K) - KT.SetNoalg(true) - types.CalcArraySize(KT) - // Align more than naturally for the type KT. See issue 73199. - align := types.NewArray(types.Types[types.TUINTPTR], 0) - types.CalcArraySize(align) - storeTyp := types.NewStruct([]*types.Field{ - {Sym: types.BlankSym, Type: align}, - {Sym: types.BlankSym, Type: KT}, - }) - storeTyp.SetNoalg(true) - types.CalcStructSize(storeTyp) + info.usedStatic = true + // TODO: unset usedStatic somehow? usedTestBlock := s.f.NewBlock(ssa.BlockPlain) oldLenTestBlock := s.f.NewBlock(ssa.BlockPlain) bodyBlock := s.f.NewBlock(ssa.BlockPlain) growSlice := s.f.NewBlock(ssa.BlockPlain) - - // Make "used" boolean. 
- tBool := types.Types[types.TBOOL] - used := typecheck.TempAt(n.Pos(), s.curfn, tBool) - s.defvars[s.f.Entry.ID][used] = s.constBool(false) // initialize this variable at fn entry - - // Make backing store variable. tInt := types.Types[types.TINT] - backingStore := typecheck.TempAt(n.Pos(), s.curfn, storeTyp) - backingStore.SetAddrtaken(true) + tBool := types.Types[types.TBOOL] // if l <= K s.startBlock(grow) - kTest := s.newValue2(s.ssaOp(ir.OLE, tInt), tBool, l, s.constInt(tInt, K)) + kTest := s.newValue2(s.ssaOp(ir.OLE, tInt), tBool, l, s.constInt(tInt, info.K)) b := s.endBlock() b.Kind = ssa.BlockIf b.SetControl(kTest) @@ -3896,7 +3979,7 @@ func (s *state) append(n *ir.CallExpr, inplace bool) *ssa.Value { // if !used s.startBlock(usedTestBlock) - usedTest := s.newValue1(ssa.OpNot, tBool, s.expr(used)) + usedTest := s.newValue1(ssa.OpNot, tBool, s.expr(info.used)) b = s.endBlock() b.Kind = ssa.BlockIf b.SetControl(usedTest) @@ -3917,18 +4000,18 @@ func (s *state) append(n *ir.CallExpr, inplace bool) *ssa.Value { // var store struct { _ [0]uintptr; arr [K]T } s.startBlock(bodyBlock) if et.HasPointers() { - s.vars[memVar] = s.newValue1A(ssa.OpVarDef, types.TypeMem, backingStore, s.mem()) + s.vars[memVar] = s.newValue1A(ssa.OpVarDef, types.TypeMem, info.store, s.mem()) } - addr := s.addr(backingStore) - s.zero(storeTyp, addr) + addr := s.addr(info.store) + s.zero(info.store.Type(), addr) // s = store.arr[:l:K] s.vars[ptrVar] = addr s.vars[lenVar] = l // nargs would also be ok because of the oldLen==0 test. 
- s.vars[capVar] = s.constInt(tInt, K) + s.vars[capVar] = s.constInt(tInt, info.K) // used = true - s.assign(used, s.constBool(true), false, 0) + s.assign(info.used, s.constBool(true), false, 0) b = s.endBlock() b.AddEdgeTo(assign) @@ -3939,7 +4022,25 @@ func (s *state) append(n *ir.CallExpr, inplace bool) *ssa.Value { // Call growslice s.startBlock(grow) taddr := s.expr(n.Fun) - r := s.rtcall(ir.Syms.Growslice, true, []*types.Type{n.Type()}, p, l, c, nargs, taddr) + var r []*ssa.Value + if info != nil && n.UseBuf { + // Use stack-allocated buffer as backing store, if we can. + if et.HasPointers() && !info.usedStatic { + // Initialize in the function header. Not the best place, + // but it makes sure we don't scan this area before it is + // initialized. + mem := s.defvars[s.f.Entry.ID][memVar] + mem = s.f.Entry.NewValue1A(n.Pos(), ssa.OpVarDef, types.TypeMem, info.store, mem) + addr := s.f.Entry.NewValue2A(n.Pos(), ssa.OpLocalAddr, types.NewPtr(info.store.Type()), info.store, s.sp, mem) + mem = s.f.Entry.NewValue2I(n.Pos(), ssa.OpZero, types.TypeMem, info.store.Type().Size(), addr, mem) + mem.Aux = info.store.Type() + s.defvars[s.f.Entry.ID][memVar] = mem + info.usedStatic = true + } + r = s.rtcall(ir.Syms.GrowsliceBuf, true, []*types.Type{n.Type()}, p, l, c, nargs, taddr, s.addr(info.store), s.constInt(types.Types[types.TINT], info.K)) + } else { + r = s.rtcall(ir.Syms.Growslice, true, []*types.Type{n.Type()}, p, l, c, nargs, taddr) + } // Decompose output slice p = s.newValue1(ssa.OpSlicePtr, pt, r[0]) @@ -4026,6 +4127,95 @@ func (s *state) append(n *ir.CallExpr, inplace bool) *ssa.Value { return s.newValue3(ssa.OpSliceMake, n.Type(), p, l, c) } +func (s *state) move2heap(n *ir.MoveToHeapExpr) *ssa.Value { + // s := n.Slice + // if s.ptr points to current stack frame { + // s2 := make([]T, s.len, s.cap) + // copy(s2[:cap], s[:cap]) + // s = s2 + // } + // return s + + slice := s.expr(n.Slice) + et := slice.Type.Elem() + pt := types.NewPtr(et) + + info := 
 s.getBackingStoreInfo(n) + if info == nil { + // Backing store will never be stack allocated, so + // move2heap is a no-op. + return slice + } + + // Decompose input slice. + p := s.newValue1(ssa.OpSlicePtr, pt, slice) + l := s.newValue1(ssa.OpSliceLen, types.Types[types.TINT], slice) + c := s.newValue1(ssa.OpSliceCap, types.Types[types.TINT], slice) + + moveBlock := s.f.NewBlock(ssa.BlockPlain) + mergeBlock := s.f.NewBlock(ssa.BlockPlain) + + s.vars[ptrVar] = p + s.vars[lenVar] = l + s.vars[capVar] = c + + // Decide if we need to move the slice backing store. + // It needs to be moved if it is currently on the stack. + sub := ssa.OpSub64 + less := ssa.OpLess64U + if s.config.PtrSize == 4 { + sub = ssa.OpSub32 + less = ssa.OpLess32U + } + callerSP := s.newValue1(ssa.OpGetCallerSP, types.Types[types.TUINTPTR], s.mem()) + frameSize := s.newValue2(sub, types.Types[types.TUINTPTR], callerSP, s.sp) + pInt := s.newValue2(ssa.OpConvert, types.Types[types.TUINTPTR], p, s.mem()) + off := s.newValue2(sub, types.Types[types.TUINTPTR], pInt, s.sp) + cond := s.newValue2(less, types.Types[types.TBOOL], off, frameSize) + + b := s.endBlock() + b.Kind = ssa.BlockIf + b.Likely = ssa.BranchUnlikely // fast path is to not have to call into runtime + b.SetControl(cond) + b.AddEdgeTo(moveBlock) + b.AddEdgeTo(mergeBlock) + + // Move the slice to heap + s.startBlock(moveBlock) + var newSlice *ssa.Value + if et.HasPointers() { + typ := s.expr(n.RType) + if n.PreserveCapacity { + newSlice = s.rtcall(ir.Syms.MoveSlice, true, []*types.Type{slice.Type}, typ, p, l, c)[0] + } else { + newSlice = s.rtcall(ir.Syms.MoveSliceNoCap, true, []*types.Type{slice.Type}, typ, p, l)[0] + } + } else { + elemSize := s.constInt(types.Types[types.TUINTPTR], et.Size()) + if n.PreserveCapacity { + newSlice = s.rtcall(ir.Syms.MoveSliceNoScan, true, []*types.Type{slice.Type}, elemSize, p, l, c)[0] + } else { + newSlice = s.rtcall(ir.Syms.MoveSliceNoCapNoScan, true, []*types.Type{slice.Type}, elemSize, p, l)[0] + } 
+ } + // Decompose output slice + s.vars[ptrVar] = s.newValue1(ssa.OpSlicePtr, pt, newSlice) + s.vars[lenVar] = s.newValue1(ssa.OpSliceLen, types.Types[types.TINT], newSlice) + s.vars[capVar] = s.newValue1(ssa.OpSliceCap, types.Types[types.TINT], newSlice) + b = s.endBlock() + b.AddEdgeTo(mergeBlock) + + // Merge fast path (no moving) and slow path (moved) + s.startBlock(mergeBlock) + p = s.variable(ptrVar, pt) // generates phi for ptr + l = s.variable(lenVar, types.Types[types.TINT]) // generates phi for len + c = s.variable(capVar, types.Types[types.TINT]) // generates phi for cap + delete(s.vars, ptrVar) + delete(s.vars, lenVar) + delete(s.vars, capVar) + return s.newValue3(ssa.OpSliceMake, slice.Type, p, l, c) +} + // minMax converts an OMIN/OMAX builtin call into SSA. func (s *state) minMax(n *ir.CallExpr) *ssa.Value { // The OMIN/OMAX builtin is variadic, but its semantics are diff --git a/src/cmd/compile/internal/typecheck/_builtin/runtime.go b/src/cmd/compile/internal/typecheck/_builtin/runtime.go index fbe8f77abd..7988ebf5b9 100644 --- a/src/cmd/compile/internal/typecheck/_builtin/runtime.go +++ b/src/cmd/compile/internal/typecheck/_builtin/runtime.go @@ -195,6 +195,7 @@ func makeslice(typ *byte, len int, cap int) unsafe.Pointer func makeslice64(typ *byte, len int64, cap int64) unsafe.Pointer func makeslicecopy(typ *byte, tolen int, fromlen int, from unsafe.Pointer) unsafe.Pointer func growslice(oldPtr *any, newLen, oldCap, num int, et *byte) (ary []any) +func growsliceBuf(oldPtr *any, newLen, oldCap, num int, et *byte, buf *any, bufLen int) (ary []any) func unsafeslicecheckptr(typ *byte, ptr unsafe.Pointer, len int64) func panicunsafeslicelen() func panicunsafeslicenilptr() @@ -202,6 +203,11 @@ func unsafestringcheckptr(ptr unsafe.Pointer, len int64) func panicunsafestringlen() func panicunsafestringnilptr() +func moveSlice(typ *byte, old *byte, len, cap int) (*byte, int, int) +func moveSliceNoScan(elemSize uintptr, old *byte, len, cap int) (*byte, int, 
int) +func moveSliceNoCap(typ *byte, old *byte, len int) (*byte, int, int) +func moveSliceNoCapNoScan(elemSize uintptr, old *byte, len int) (*byte, int, int) + func memmove(to *any, frm *any, length uintptr) func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr) func memclrHasPointers(ptr unsafe.Pointer, n uintptr) diff --git a/src/cmd/compile/internal/typecheck/builtin.go b/src/cmd/compile/internal/typecheck/builtin.go index ff72bdcf37..ee892856dd 100644 --- a/src/cmd/compile/internal/typecheck/builtin.go +++ b/src/cmd/compile/internal/typecheck/builtin.go @@ -160,80 +160,85 @@ var runtimeDecls = [...]struct { {"makeslice64", funcTag, 124}, {"makeslicecopy", funcTag, 125}, {"growslice", funcTag, 127}, - {"unsafeslicecheckptr", funcTag, 128}, + {"growsliceBuf", funcTag, 128}, + {"unsafeslicecheckptr", funcTag, 129}, {"panicunsafeslicelen", funcTag, 9}, {"panicunsafeslicenilptr", funcTag, 9}, - {"unsafestringcheckptr", funcTag, 129}, + {"unsafestringcheckptr", funcTag, 130}, {"panicunsafestringlen", funcTag, 9}, {"panicunsafestringnilptr", funcTag, 9}, - {"memmove", funcTag, 130}, - {"memclrNoHeapPointers", funcTag, 131}, - {"memclrHasPointers", funcTag, 131}, - {"memequal", funcTag, 132}, - {"memequal0", funcTag, 133}, - {"memequal8", funcTag, 133}, - {"memequal16", funcTag, 133}, - {"memequal32", funcTag, 133}, - {"memequal64", funcTag, 133}, - {"memequal128", funcTag, 133}, - {"f32equal", funcTag, 134}, - {"f64equal", funcTag, 134}, - {"c64equal", funcTag, 134}, - {"c128equal", funcTag, 134}, - {"strequal", funcTag, 134}, - {"interequal", funcTag, 134}, - {"nilinterequal", funcTag, 134}, - {"memhash", funcTag, 135}, - {"memhash0", funcTag, 136}, - {"memhash8", funcTag, 136}, - {"memhash16", funcTag, 136}, - {"memhash32", funcTag, 136}, - {"memhash64", funcTag, 136}, - {"memhash128", funcTag, 136}, - {"f32hash", funcTag, 137}, - {"f64hash", funcTag, 137}, - {"c64hash", funcTag, 137}, - {"c128hash", funcTag, 137}, - {"strhash", funcTag, 137}, - {"interhash", 
funcTag, 137}, - {"nilinterhash", funcTag, 137}, - {"int64div", funcTag, 138}, - {"uint64div", funcTag, 139}, - {"int64mod", funcTag, 138}, - {"uint64mod", funcTag, 139}, - {"float64toint64", funcTag, 140}, - {"float64touint64", funcTag, 141}, - {"float64touint32", funcTag, 142}, - {"int64tofloat64", funcTag, 143}, - {"int64tofloat32", funcTag, 144}, - {"uint64tofloat64", funcTag, 145}, - {"uint64tofloat32", funcTag, 146}, - {"uint32tofloat64", funcTag, 147}, - {"complex128div", funcTag, 148}, + {"moveSlice", funcTag, 131}, + {"moveSliceNoScan", funcTag, 132}, + {"moveSliceNoCap", funcTag, 133}, + {"moveSliceNoCapNoScan", funcTag, 134}, + {"memmove", funcTag, 135}, + {"memclrNoHeapPointers", funcTag, 136}, + {"memclrHasPointers", funcTag, 136}, + {"memequal", funcTag, 137}, + {"memequal0", funcTag, 138}, + {"memequal8", funcTag, 138}, + {"memequal16", funcTag, 138}, + {"memequal32", funcTag, 138}, + {"memequal64", funcTag, 138}, + {"memequal128", funcTag, 138}, + {"f32equal", funcTag, 139}, + {"f64equal", funcTag, 139}, + {"c64equal", funcTag, 139}, + {"c128equal", funcTag, 139}, + {"strequal", funcTag, 139}, + {"interequal", funcTag, 139}, + {"nilinterequal", funcTag, 139}, + {"memhash", funcTag, 140}, + {"memhash0", funcTag, 141}, + {"memhash8", funcTag, 141}, + {"memhash16", funcTag, 141}, + {"memhash32", funcTag, 141}, + {"memhash64", funcTag, 141}, + {"memhash128", funcTag, 141}, + {"f32hash", funcTag, 142}, + {"f64hash", funcTag, 142}, + {"c64hash", funcTag, 142}, + {"c128hash", funcTag, 142}, + {"strhash", funcTag, 142}, + {"interhash", funcTag, 142}, + {"nilinterhash", funcTag, 142}, + {"int64div", funcTag, 143}, + {"uint64div", funcTag, 144}, + {"int64mod", funcTag, 143}, + {"uint64mod", funcTag, 144}, + {"float64toint64", funcTag, 145}, + {"float64touint64", funcTag, 146}, + {"float64touint32", funcTag, 147}, + {"int64tofloat64", funcTag, 148}, + {"int64tofloat32", funcTag, 149}, + {"uint64tofloat64", funcTag, 150}, + {"uint64tofloat32", funcTag, 151}, + 
{"uint32tofloat64", funcTag, 152}, + {"complex128div", funcTag, 153}, {"racefuncenter", funcTag, 33}, {"racefuncexit", funcTag, 9}, {"raceread", funcTag, 33}, {"racewrite", funcTag, 33}, - {"racereadrange", funcTag, 149}, - {"racewriterange", funcTag, 149}, - {"msanread", funcTag, 149}, - {"msanwrite", funcTag, 149}, - {"msanmove", funcTag, 150}, - {"asanread", funcTag, 149}, - {"asanwrite", funcTag, 149}, - {"checkptrAlignment", funcTag, 151}, - {"checkptrArithmetic", funcTag, 153}, - {"libfuzzerTraceCmp1", funcTag, 154}, - {"libfuzzerTraceCmp2", funcTag, 155}, - {"libfuzzerTraceCmp4", funcTag, 156}, - {"libfuzzerTraceCmp8", funcTag, 157}, - {"libfuzzerTraceConstCmp1", funcTag, 154}, - {"libfuzzerTraceConstCmp2", funcTag, 155}, - {"libfuzzerTraceConstCmp4", funcTag, 156}, - {"libfuzzerTraceConstCmp8", funcTag, 157}, - {"libfuzzerHookStrCmp", funcTag, 158}, - {"libfuzzerHookEqualFold", funcTag, 158}, - {"addCovMeta", funcTag, 160}, + {"racereadrange", funcTag, 154}, + {"racewriterange", funcTag, 154}, + {"msanread", funcTag, 154}, + {"msanwrite", funcTag, 154}, + {"msanmove", funcTag, 155}, + {"asanread", funcTag, 154}, + {"asanwrite", funcTag, 154}, + {"checkptrAlignment", funcTag, 156}, + {"checkptrArithmetic", funcTag, 158}, + {"libfuzzerTraceCmp1", funcTag, 159}, + {"libfuzzerTraceCmp2", funcTag, 160}, + {"libfuzzerTraceCmp4", funcTag, 161}, + {"libfuzzerTraceCmp8", funcTag, 162}, + {"libfuzzerTraceConstCmp1", funcTag, 159}, + {"libfuzzerTraceConstCmp2", funcTag, 160}, + {"libfuzzerTraceConstCmp4", funcTag, 161}, + {"libfuzzerTraceConstCmp8", funcTag, 162}, + {"libfuzzerHookStrCmp", funcTag, 163}, + {"libfuzzerHookEqualFold", funcTag, 163}, + {"addCovMeta", funcTag, 165}, {"x86HasPOPCNT", varTag, 6}, {"x86HasSSE41", varTag, 6}, {"x86HasFMA", varTag, 6}, @@ -243,11 +248,11 @@ var runtimeDecls = [...]struct { {"loong64HasLAM_BH", varTag, 6}, {"loong64HasLSX", varTag, 6}, {"riscv64HasZbb", varTag, 6}, - {"asanregisterglobals", funcTag, 131}, + 
{"asanregisterglobals", funcTag, 136}, } func runtimeTypes() []*types.Type { - var typs [161]*types.Type + var typs [166]*types.Type typs[0] = types.ByteType typs[1] = types.NewPtr(typs[0]) typs[2] = types.Types[types.TANY] @@ -376,39 +381,44 @@ func runtimeTypes() []*types.Type { typs[125] = newSig(params(typs[1], typs[13], typs[13], typs[7]), params(typs[7])) typs[126] = types.NewSlice(typs[2]) typs[127] = newSig(params(typs[3], typs[13], typs[13], typs[13], typs[1]), params(typs[126])) - typs[128] = newSig(params(typs[1], typs[7], typs[22]), nil) - typs[129] = newSig(params(typs[7], typs[22]), nil) - typs[130] = newSig(params(typs[3], typs[3], typs[5]), nil) - typs[131] = newSig(params(typs[7], typs[5]), nil) - typs[132] = newSig(params(typs[3], typs[3], typs[5]), params(typs[6])) - typs[133] = newSig(params(typs[3], typs[3]), params(typs[6])) - typs[134] = newSig(params(typs[7], typs[7]), params(typs[6])) - typs[135] = newSig(params(typs[3], typs[5], typs[5]), params(typs[5])) - typs[136] = newSig(params(typs[7], typs[5]), params(typs[5])) - typs[137] = newSig(params(typs[3], typs[5]), params(typs[5])) - typs[138] = newSig(params(typs[22], typs[22]), params(typs[22])) - typs[139] = newSig(params(typs[24], typs[24]), params(typs[24])) - typs[140] = newSig(params(typs[18]), params(typs[22])) - typs[141] = newSig(params(typs[18]), params(typs[24])) - typs[142] = newSig(params(typs[18]), params(typs[67])) - typs[143] = newSig(params(typs[22]), params(typs[18])) - typs[144] = newSig(params(typs[22]), params(typs[20])) - typs[145] = newSig(params(typs[24]), params(typs[18])) - typs[146] = newSig(params(typs[24]), params(typs[20])) - typs[147] = newSig(params(typs[67]), params(typs[18])) - typs[148] = newSig(params(typs[26], typs[26]), params(typs[26])) - typs[149] = newSig(params(typs[5], typs[5]), nil) - typs[150] = newSig(params(typs[5], typs[5], typs[5]), nil) - typs[151] = newSig(params(typs[7], typs[1], typs[5]), nil) - typs[152] = types.NewSlice(typs[7]) - 
typs[153] = newSig(params(typs[7], typs[152]), nil) - typs[154] = newSig(params(typs[71], typs[71], typs[15]), nil) - typs[155] = newSig(params(typs[65], typs[65], typs[15]), nil) - typs[156] = newSig(params(typs[67], typs[67], typs[15]), nil) - typs[157] = newSig(params(typs[24], typs[24], typs[15]), nil) - typs[158] = newSig(params(typs[30], typs[30], typs[15]), nil) - typs[159] = types.NewArray(typs[0], 16) - typs[160] = newSig(params(typs[7], typs[67], typs[159], typs[30], typs[13], typs[71], typs[71]), params(typs[67])) + typs[128] = newSig(params(typs[3], typs[13], typs[13], typs[13], typs[1], typs[3], typs[13]), params(typs[126])) + typs[129] = newSig(params(typs[1], typs[7], typs[22]), nil) + typs[130] = newSig(params(typs[7], typs[22]), nil) + typs[131] = newSig(params(typs[1], typs[1], typs[13], typs[13]), params(typs[1], typs[13], typs[13])) + typs[132] = newSig(params(typs[5], typs[1], typs[13], typs[13]), params(typs[1], typs[13], typs[13])) + typs[133] = newSig(params(typs[1], typs[1], typs[13]), params(typs[1], typs[13], typs[13])) + typs[134] = newSig(params(typs[5], typs[1], typs[13]), params(typs[1], typs[13], typs[13])) + typs[135] = newSig(params(typs[3], typs[3], typs[5]), nil) + typs[136] = newSig(params(typs[7], typs[5]), nil) + typs[137] = newSig(params(typs[3], typs[3], typs[5]), params(typs[6])) + typs[138] = newSig(params(typs[3], typs[3]), params(typs[6])) + typs[139] = newSig(params(typs[7], typs[7]), params(typs[6])) + typs[140] = newSig(params(typs[3], typs[5], typs[5]), params(typs[5])) + typs[141] = newSig(params(typs[7], typs[5]), params(typs[5])) + typs[142] = newSig(params(typs[3], typs[5]), params(typs[5])) + typs[143] = newSig(params(typs[22], typs[22]), params(typs[22])) + typs[144] = newSig(params(typs[24], typs[24]), params(typs[24])) + typs[145] = newSig(params(typs[18]), params(typs[22])) + typs[146] = newSig(params(typs[18]), params(typs[24])) + typs[147] = newSig(params(typs[18]), params(typs[67])) + typs[148] = 
newSig(params(typs[22]), params(typs[18])) + typs[149] = newSig(params(typs[22]), params(typs[20])) + typs[150] = newSig(params(typs[24]), params(typs[18])) + typs[151] = newSig(params(typs[24]), params(typs[20])) + typs[152] = newSig(params(typs[67]), params(typs[18])) + typs[153] = newSig(params(typs[26], typs[26]), params(typs[26])) + typs[154] = newSig(params(typs[5], typs[5]), nil) + typs[155] = newSig(params(typs[5], typs[5], typs[5]), nil) + typs[156] = newSig(params(typs[7], typs[1], typs[5]), nil) + typs[157] = types.NewSlice(typs[7]) + typs[158] = newSig(params(typs[7], typs[157]), nil) + typs[159] = newSig(params(typs[71], typs[71], typs[15]), nil) + typs[160] = newSig(params(typs[65], typs[65], typs[15]), nil) + typs[161] = newSig(params(typs[67], typs[67], typs[15]), nil) + typs[162] = newSig(params(typs[24], typs[24], typs[15]), nil) + typs[163] = newSig(params(typs[30], typs[30], typs[15]), nil) + typs[164] = types.NewArray(typs[0], 16) + typs[165] = newSig(params(typs[7], typs[67], typs[164], typs[30], typs[13], typs[71], typs[71]), params(typs[67])) return typs[:] } diff --git a/src/cmd/compile/internal/walk/expr.go b/src/cmd/compile/internal/walk/expr.go index 989ae0a1db..2794671c73 100644 --- a/src/cmd/compile/internal/walk/expr.go +++ b/src/cmd/compile/internal/walk/expr.go @@ -351,6 +351,11 @@ func walkExpr1(n ir.Node, init *ir.Nodes) ir.Node { case ir.OMETHVALUE: return walkMethodValue(n.(*ir.SelectorExpr), init) + + case ir.OMOVE2HEAP: + n := n.(*ir.MoveToHeapExpr) + n.Slice = walkExpr(n.Slice, init) + return n } // No return! Each case must return (or panic), diff --git a/src/runtime/slice.go b/src/runtime/slice.go index e31d5dccb2..a9e8fc1610 100644 --- a/src/runtime/slice.go +++ b/src/runtime/slice.go @@ -399,3 +399,107 @@ func bytealg_MakeNoZero(len int) []byte { cap := roundupsize(uintptr(len), true) return unsafe.Slice((*byte)(mallocgc(cap, nil, false)), cap)[:len] } + +// moveSlice copies the input slice to the heap and returns it. 
+// et is the element type of the slice. +func moveSlice(et *_type, old unsafe.Pointer, len, cap int) (unsafe.Pointer, int, int) { + if cap == 0 { + if old != nil { + old = unsafe.Pointer(&zerobase) + } + return old, 0, 0 + } + capmem := uintptr(cap) * et.Size_ + new := mallocgc(capmem, et, true) + bulkBarrierPreWriteSrcOnly(uintptr(new), uintptr(old), capmem, et) + memmove(new, old, capmem) + return new, len, cap +} + +// moveSliceNoScan is like moveSlice except the element type is known to +// not have any pointers. We instead pass in the size of the element. +func moveSliceNoScan(elemSize uintptr, old unsafe.Pointer, len, cap int) (unsafe.Pointer, int, int) { + if cap == 0 { + if old != nil { + old = unsafe.Pointer(&zerobase) + } + return old, 0, 0 + } + capmem := uintptr(cap) * elemSize + new := mallocgc(capmem, nil, false) + memmove(new, old, capmem) + return new, len, cap +} + +// moveSliceNoCap is like moveSlice, but can pick any appropriate capacity +// for the returned slice. +// Elements between len and cap in the returned slice will be zeroed. +func moveSliceNoCap(et *_type, old unsafe.Pointer, len int) (unsafe.Pointer, int, int) { + if len == 0 { + if old != nil { + old = unsafe.Pointer(&zerobase) + } + return old, 0, 0 + } + lenmem := uintptr(len) * et.Size_ + capmem := roundupsize(lenmem, false) + new := mallocgc(capmem, et, true) + bulkBarrierPreWriteSrcOnly(uintptr(new), uintptr(old), lenmem, et) + memmove(new, old, lenmem) + return new, len, int(capmem / et.Size_) +} + +// moveSliceNoCapNoScan is a combination of moveSliceNoScan and moveSliceNoCap. 
+func moveSliceNoCapNoScan(elemSize uintptr, old unsafe.Pointer, len int) (unsafe.Pointer, int, int) { + if len == 0 { + if old != nil { + old = unsafe.Pointer(&zerobase) + } + return old, 0, 0 + } + lenmem := uintptr(len) * elemSize + capmem := roundupsize(lenmem, true) + new := mallocgc(capmem, nil, false) + memmove(new, old, lenmem) + if capmem > lenmem { + memclrNoHeapPointers(add(new, lenmem), capmem-lenmem) + } + return new, len, int(capmem / elemSize) +} + +// growsliceBuf is like growslice, but we can use the given buffer +// as a backing store if we want. bufPtr must be on the stack. +func growsliceBuf(oldPtr unsafe.Pointer, newLen, oldCap, num int, et *_type, bufPtr unsafe.Pointer, bufLen int) slice { + if newLen > bufLen { + // Doesn't fit, process like a normal growslice. + return growslice(oldPtr, newLen, oldCap, num, et) + } + oldLen := newLen - num + if oldPtr != bufPtr && oldLen != 0 { + // Move data to start of buffer. + // Note: bufPtr is on the stack, so no write barrier needed. + memmove(bufPtr, oldPtr, uintptr(oldLen)*et.Size_) + } + // Pick a new capacity. + // + // Unlike growslice, we don't need to double the size each time. + // The work done here is not proportional to the length of the slice. + // (Unless the memmove happens above, but that is rare, and in any + // case there are not many elements on this path.) + // + // Instead, we try to just bump up to the next size class. + // This will ensure that we don't waste any space when we eventually + // call moveSlice with the resulting slice. + newCap := int(roundupsize(uintptr(newLen)*et.Size_, !et.Pointers()) / et.Size_) + + // Zero slice beyond newLen. + // The buffer is stack memory, so NoHeapPointers is ok. + // Caller will overwrite [oldLen:newLen], so we don't need to zero that portion. + // If et.Pointers(), buffer is at least initialized so we don't need to + // worry about the caller overwriting junk in [oldLen:newLen]. 
+ if newLen < newCap { + memclrNoHeapPointers(add(bufPtr, uintptr(newLen)*et.Size_), uintptr(newCap-newLen)*et.Size_) + } + + return slice{bufPtr, newLen, newCap} +} diff --git a/src/runtime/slice_test.go b/src/runtime/slice_test.go index cd2bc26d1e..5463b6c02f 100644 --- a/src/runtime/slice_test.go +++ b/src/runtime/slice_test.go @@ -6,6 +6,9 @@ package runtime_test import ( "fmt" + "internal/race" + "internal/testenv" + "runtime" "testing" ) @@ -499,3 +502,319 @@ func BenchmarkAppendInPlace(b *testing.B) { }) } + +//go:noinline +func byteSlice(n int) []byte { + var r []byte + for i := range n { + r = append(r, byte(i)) + } + return r +} +func TestAppendByteInLoop(t *testing.T) { + testenv.SkipIfOptimizationOff(t) + if race.Enabled { + t.Skip("skipping in -race mode") + } + for _, test := range [][3]int{ + {0, 0, 0}, + {1, 1, 8}, + {2, 1, 8}, + {8, 1, 8}, + {9, 1, 16}, + {16, 1, 16}, + {17, 1, 24}, + {24, 1, 24}, + {25, 1, 32}, + {32, 1, 32}, + {33, 1, 64}, // If we up the stack buffer size from 32->64, this line and the next would become 48. + {48, 1, 64}, + {49, 1, 64}, + {64, 1, 64}, + {65, 2, 128}, + } { + n := test[0] + want := test[1] + wantCap := test[2] + var r []byte + got := testing.AllocsPerRun(10, func() { + r = byteSlice(n) + }) + if got != float64(want) { + t.Errorf("for size %d, got %f allocs want %d", n, got, want) + } + if cap(r) != wantCap { + t.Errorf("for size %d, got capacity %d want %d", n, cap(r), wantCap) + } + } +} + +//go:noinline +func ptrSlice(n int, p *[]*byte) { + var r []*byte + for range n { + r = append(r, nil) + } + *p = r +} +func TestAppendPtrInLoop(t *testing.T) { + testenv.SkipIfOptimizationOff(t) + if race.Enabled { + t.Skip("skipping in -race mode") + } + var tests [][3]int + if runtime.PtrSize == 8 { + tests = [][3]int{ + {0, 0, 0}, + {1, 1, 1}, + {2, 1, 2}, + {3, 1, 3}, // This is the interesting case, allocates 24 bytes when before it was 32. 
+ {4, 1, 4}, + {5, 1, 8}, + {6, 1, 8}, + {7, 1, 8}, + {8, 1, 8}, + {9, 2, 16}, + } + } else { + tests = [][3]int{ + {0, 0, 0}, + {1, 1, 2}, + {2, 1, 2}, + {3, 1, 4}, + {4, 1, 4}, + {5, 1, 6}, // These two are also 24 bytes instead of 32. + {6, 1, 6}, // + {7, 1, 8}, + {8, 1, 8}, + {9, 1, 16}, + {10, 1, 16}, + {11, 1, 16}, + {12, 1, 16}, + {13, 1, 16}, + {14, 1, 16}, + {15, 1, 16}, + {16, 1, 16}, + {17, 2, 32}, + } + } + for _, test := range tests { + n := test[0] + want := test[1] + wantCap := test[2] + var r []*byte + got := testing.AllocsPerRun(10, func() { + ptrSlice(n, &r) + }) + if got != float64(want) { + t.Errorf("for size %d, got %f allocs want %d", n, got, want) + } + if cap(r) != wantCap { + t.Errorf("for size %d, got capacity %d want %d", n, cap(r), wantCap) + } + } +} + +//go:noinline +func byteCapSlice(n int) ([]byte, int) { + var r []byte + for i := range n { + r = append(r, byte(i)) + } + return r, cap(r) +} +func TestAppendByteCapInLoop(t *testing.T) { + testenv.SkipIfOptimizationOff(t) + if race.Enabled { + t.Skip("skipping in -race mode") + } + for _, test := range [][3]int{ + {0, 0, 0}, + {1, 1, 8}, + {2, 1, 8}, + {8, 1, 8}, + {9, 1, 16}, + {16, 1, 16}, + {17, 1, 24}, + {24, 1, 24}, + {25, 1, 32}, + {32, 1, 32}, + {33, 1, 64}, + {48, 1, 64}, + {49, 1, 64}, + {64, 1, 64}, + {65, 2, 128}, + } { + n := test[0] + want := test[1] + wantCap := test[2] + var r []byte + got := testing.AllocsPerRun(10, func() { + r, _ = byteCapSlice(n) + }) + if got != float64(want) { + t.Errorf("for size %d, got %f allocs want %d", n, got, want) + } + if cap(r) != wantCap { + t.Errorf("for size %d, got capacity %d want %d", n, cap(r), wantCap) + } + } +} + +func TestAppendGeneric(t *testing.T) { + type I *int + r := testAppendGeneric[I](100) + if len(r) != 100 { + t.Errorf("bad length") + } +} + +//go:noinline +func testAppendGeneric[E any](n int) []E { + var r []E + var z E + for range n { + r = append(r, z) + } + return r +} + +func appendSomeBytes(r []byte, s []byte) 
[]byte { + for _, b := range s { + r = append(r, b) + } + return r +} + +func TestAppendOfArg(t *testing.T) { + r := make([]byte, 24) + for i := 0; i < 24; i++ { + r[i] = byte(i) + } + appendSomeBytes(r, []byte{25, 26, 27}) + // Do the same thing, trying to overwrite any + // stack-allocated buffers used above. + s := make([]byte, 24) + for i := 0; i < 24; i++ { + s[i] = 99 + } + appendSomeBytes(s, []byte{99, 99, 99}) + // Check that we still have the right data. + for i, b := range r { + if b != byte(i) { + t.Errorf("r[%d]=%d, want %d", i, b, byte(i)) + } + } + +} + +func BenchmarkAppendInLoop(b *testing.B) { + for _, size := range []int{0, 1, 8, 16, 32, 64, 128} { + b.Run(fmt.Sprintf("%d", size), + func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + byteSlice(size) + } + }) + } +} + +func TestMoveToHeapEarly(t *testing.T) { + // Just checking that this compiles. + var x []int + y := x // causes a move2heap in the entry block + for range 5 { + x = append(x, 5) + } + _ = y +} + +func TestMoveToHeapCap(t *testing.T) { + var c int + r := func() []byte { + var s []byte + for i := range 10 { + s = append(s, byte(i)) + } + c = cap(s) + return s + }() + if c != cap(r) { + t.Errorf("got cap=%d, want %d", c, cap(r)) + } + sinkSlice = r +} + +//go:noinline +func runit(f func()) { + f() +} + +func TestMoveToHeapClosure1(t *testing.T) { + var c int + r := func() []byte { + var s []byte + for i := range 10 { + s = append(s, byte(i)) + } + runit(func() { + c = cap(s) + }) + return s + }() + if c != cap(r) { + t.Errorf("got cap=%d, want %d", c, cap(r)) + } + sinkSlice = r +} +func TestMoveToHeapClosure2(t *testing.T) { + var c int + r := func() []byte { + var s []byte + for i := range 10 { + s = append(s, byte(i)) + } + c = func() int { + return cap(s) + }() + return s + }() + if c != cap(r) { + t.Errorf("got cap=%d, want %d", c, cap(r)) + } + sinkSlice = r +} + +//go:noinline +func buildClosure(t *testing.T) ([]byte, func()) { + var s []byte + for i := range 20 { + s = 
append(s, byte(i)) + } + c := func() { + for i, b := range s { + if b != byte(i) { + t.Errorf("s[%d]=%d, want %d", i, b, i) + } + } + } + return s, c +} + +func TestMoveToHeapClosure3(t *testing.T) { + _, f := buildClosure(t) + overwriteStack(0) + f() +} + +//go:noinline +func overwriteStack(n int) uint64 { + var x [100]uint64 + for i := range x { + x[i] = 0xabcdabcdabcdabcd + } + return x[n] +} + +var sinkSlice []byte diff --git a/test/codegen/append.go b/test/codegen/append.go new file mode 100644 index 0000000000..0e58a48c45 --- /dev/null +++ b/test/codegen/append.go @@ -0,0 +1,190 @@ +// asmcheck + +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package codegen + +func Append1(n int) []int { + var r []int + for i := range n { + // amd64:`.*growslice` + r = append(r, i) + } + // amd64:`.*moveSliceNoCapNoScan` + return r +} + +func Append2(n int) (r []int) { + for i := range n { + // amd64:`.*growslice` + r = append(r, i) + } + // amd64:`.*moveSliceNoCapNoScan` + return +} + +func Append3(n int) (r []int) { + for i := range n { + // amd64:`.*growslice` + r = append(r, i) + } + // amd64:`.*moveSliceNoCapNoScan` + return r +} + +func Append4(n int) []int { + var r []int + for i := range n { + // amd64:`.*growsliceBuf` + r = append(r, i) + } + println(cap(r)) + // amd64:`.*moveSliceNoScan` + return r +} + +func Append5(n int) []int { + var r []int + for i := range n { + // amd64:`.*growsliceBuf` + r = append(r, i) + } + useSlice(r) + // amd64:`.*moveSliceNoScan` + return r +} + +func Append6(n int) []*int { + var r []*int + for i := range n { + // amd64:`.*growslice` + r = append(r, new(i)) + } + // amd64:`.*moveSliceNoCap` + return r +} + +func Append7(n int) []*int { + var r []*int + for i := range n { + // amd64:`.*growsliceBuf` + r = append(r, new(i)) + } + println(cap(r)) + // amd64:`.*moveSlice` + return r +} + +func Append8(n int, p *[]int) { + 
var r []int + for i := range n { + // amd64:`.*growslice` + r = append(r, i) + } + // amd64:`.*moveSliceNoCapNoScan` + *p = r +} + +func Append9(n int) []int { + var r []int + for i := range n { + // amd64:`.*growslice` + r = append(r, i) + } + println(len(r)) + // amd64:`.*moveSliceNoCapNoScan` + return r +} + +func Append10(n int) []int { + var r []int + for i := range n { + // amd64:`.*growslice` + r = append(r, i) + } + println(r[3]) + // amd64:`.*moveSliceNoCapNoScan` + return r +} + +func Append11(n int) []int { + var r []int + for i := range n { + // amd64:`.*growsliceBuf` + r = append(r, i) + } + r = r[3:5] + // amd64:`.*moveSliceNoScan` + return r +} + +func Append12(n int) []int { + var r []int + r = nil + for i := range n { + // amd64:`.*growslice` + r = append(r, i) + } + // amd64:`.*moveSliceNoCapNoScan` + return r +} + +func Append13(n int) []int { + var r []int + r, r = nil, nil + for i := range n { + // amd64:`.*growslice` + r = append(r, i) + } + // amd64:`.*moveSliceNoCapNoScan` + return r +} + +func Append14(n int) []int { + var r []int + r = []int{3, 4, 5} + for i := range n { + // amd64:`.*growsliceBuf` + r = append(r, i) + } + // amd64:`.*moveSliceNoScan` + return r +} + +func Append15(n int) []int { + r := []int{3, 4, 5} + for i := range n { + // amd64:`.*growsliceBuf` + r = append(r, i) + } + // amd64:`.*moveSliceNoScan` + return r +} + +func Append16(r []int, n int) []int { + for i := range n { + // amd64:`.*growslice` + r = append(r, i) + } + // amd64:`.*moveSliceNoCapNoScan` + return r +} + +func Append17(n int) []int { + var r []int + for i := range n { + // amd64:`.*growslice` + r = append(r, i) + } + for i, x := range r { + println(i, x) + } + // amd64:`.*moveSliceNoCapNoScan` + return r +} + +//go:noinline +func useSlice(s []int) { +} -- cgit v1.3