From 1a03d0db3f36c1a81a184fa15a54f716569a9972 Mon Sep 17 00:00:00 2001
From: thepudds
Date: Wed, 5 Nov 2025 12:18:49 -0500
Subject: runtime: skip tests for GOEXPERIMENT=arenas that do not handle
 clobberfree=1

When run with GODEBUG=clobberfree=1, three out of seven of the top-level
tests in runtime/arena_test.go fail with a SIGSEGV inside the clobberfree
function where it is overwriting freed memory with 0xdeadbeef.

This is not a new problem. For example, this crashes in Go 1.20:

  GODEBUG=clobberfree=1 go test runtime -run=TestUserArena

It would be nice for all.bash to pass with GODEBUG=clobberfree=1, in part
because that is useful for testing the automatic reclaiming of dead memory
via runtime.freegc in #74299.

Given that GOEXPERIMENT=arenas (#51317) is not planned to move forward
(and is hopefully slated to be replaced by regions before too long), for
now we just skip those three tests in order to get all.bash passing with
GODEBUG=clobberfree=1.

Updates #74299

Change-Id: I384d96791157b30c73457d582a45dd74c5607ee0
Reviewed-on: https://go-review.googlesource.com/c/go/+/715080
Reviewed-by: Michael Knyszek
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Junyang Shao
---
 src/runtime/export_test.go | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'src/runtime/export_test.go')

diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index d17984881d..731ba5d6b9 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -238,6 +238,12 @@ func SetEnvs(e []string) { envs = e }
 
 const PtrSize = goarch.PtrSize
 
+const ClobberdeadPtr = clobberdeadPtr
+
+func Clobberfree() bool {
+	return debug.clobberfree != 0
+}
+
 var ForceGCPeriod = &forcegcperiod
 
 // SetTracebackEnv is like runtime/debug.SetTraceback, but it raises
-- 
cgit v1.3-5-g9baa


From fecfcaa4f68a220f47e2c7c8b65d55906dbf8d46 Mon Sep 17 00:00:00 2001
From: thepudds
Date: Tue, 4 Nov 2025 09:33:17 -0500
Subject: runtime: add runtime.freegc to reduce GC work
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This CL is part of a set of CLs that attempt to reduce how much work
the GC must do. See the design in
https://go.dev/design/74299-runtime-freegc

This CL adds runtime.freegc:

  func freegc(ptr unsafe.Pointer, size uintptr, noscan bool)

Memory freed via runtime.freegc is made immediately reusable for the next
allocation in the same size class, without waiting for a GC cycle, and
hence can dramatically reduce pressure on the GC.

A sample microbenchmark included below shows strings.Builder operating
roughly 2x faster. An experimental modification to reflect to use
runtime.freegc and then using that reflect with json/v2 gave reported
memory allocation reductions of -43.7%, -32.9%, -21.9%, -22.0%, -1.0%
for the 5 official real-world unmarshalling benchmarks from
go-json-experiment/jsonbench by the authors of json/v2, covering the
CanadaGeometry through TwitterStatus datasets.

Note: there is no intent to modify the standard library to have explicit
calls to runtime.freegc, and of course such an ability would never be
exposed to end-user code. Later CLs in this stack teach the compiler how
to automatically insert runtime.freegc calls when it can prove it is safe
to do so.

(The reflect modification and other experimental changes to the standard
library were just that -- experiments. It was very helpful while
initially developing runtime.freegc to see more complex uses and
closer-to-real-world benchmark results prior to updating the compiler.)
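For intuition before the diffs below: the reuse mechanism is a per-P,
per-span-class free list threaded through the freed objects themselves
(the mcache.go change later in this CL notes that "the next pointers are
stored in the first word of the heap objects"). Here is a rough
standalone analogue of that intrusive list; freeList, push, pop, and
blockSize are invented for the sketch, and the real runtime uses
gclinkptr and one list per span class rather than a single list:

  package main

  import (
  	"fmt"
  	"unsafe"
  )

  // freeList links dead blocks through their own first word, so the
  // list needs no side storage. This is only reasonable for noscan
  // memory (no pointers inside the object for the GC to trace).
  type freeList struct{ head unsafe.Pointer }

  func (f *freeList) push(p unsafe.Pointer) {
  	*(*unsafe.Pointer)(p) = f.head // next pointer lives in the block itself
  	f.head = p
  }

  func (f *freeList) pop() unsafe.Pointer {
  	p := f.head
  	if p != nil {
  		f.head = *(*unsafe.Pointer)(p)
  	}
  	return p
  }

  func main() {
  	const blockSize = 64 // stand-in for one size class
  	a := unsafe.Pointer(&make([]byte, blockSize)[0])
  	b := unsafe.Pointer(&make([]byte, blockSize)[0])
  	var fl freeList
  	fl.push(a)
  	fl.push(b)
  	// The locals a and b keep the blocks alive here; the real runtime
  	// owns its spans and has no such concern.
  	fmt.Println(fl.pop() == b, fl.pop() == a, fl.pop() == nil) // LIFO: true true true
  }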
This CL only addresses noscan span classes (heap objects without
pointers), such as the backing memory for a []byte or string. A follow-on
CL adds support for heap objects with pointers.

If we update strings.Builder to explicitly call runtime.freegc on its
internal buf after a resize operation (but without freeing the usually
final incarnation of buf that will be returned to the user as a string),
we can see some nice benchmark results on the existing strings benchmarks
that call Builder.Write N times and then call Builder.String.

Here, the (uncommon) case of a single Builder.Write is not helped (given
the buffer never resizes after its first allocation if there is only one
Write), but the impact grows as more strings.Builder.Write calls cause
more resize operations, making it up to ~2x faster:

                                                  │ disabled.out │ new-free-20.txt │
                                                  │    sec/op    │    sec/op      vs base               │
  BuildString_Builder/1Write_36Bytes_NoGrow-4       55.82n ± 2%    55.86n ± 2%         ~ (p=0.794 n=20)
  BuildString_Builder/2Write_36Bytes_NoGrow-4       125.2n ± 2%    115.4n ± 1%    -7.86% (p=0.000 n=20)
  BuildString_Builder/3Write_36Bytes_NoGrow-4       224.0n ± 1%    188.2n ± 2%   -16.00% (p=0.000 n=20)
  BuildString_Builder/5Write_36Bytes_NoGrow-4       239.1n ± 9%    205.1n ± 1%   -14.20% (p=0.000 n=20)
  BuildString_Builder/8Write_36Bytes_NoGrow-4       422.8n ± 3%    325.4n ± 1%   -23.04% (p=0.000 n=20)
  BuildString_Builder/10Write_36Bytes_NoGrow-4      436.9n ± 2%    342.3n ± 1%   -21.64% (p=0.000 n=20)
  BuildString_Builder/100Write_36Bytes_NoGrow-4     4.403µ ± 1%    2.381µ ± 2%   -45.91% (p=0.000 n=20)
  BuildString_Builder/1000Write_36Bytes_NoGrow-4    48.28µ ± 2%    21.38µ ± 2%   -55.71% (p=0.000 n=20)

See the design document for more discussion of the strings.Builder case.

For testing, we add tests that attempt to exercise different aspects of
the underlying freegc and mallocgc behavior on the reuse path. Validating
the assist credit manipulations turned out to be subtle, so a test for
that is added in the next CL. There are also invariant checks added,
controlled by consts (primarily the doubleCheckReusable const currently).

This CL also adds support in runtime.freegc for GODEBUG=clobberfree=1 to
immediately overwrite freed memory with 0xdeadbeef, which can help a
higher-level test fail faster in the event of a bug; the GC also
specifically looks for that pattern and throws a fatal error if it
unexpectedly finds it. A later CL (currently experimental) adds
GODEBUG=clobberfree=2, which uses mprotect (or VirtualProtect on Windows)
to set freed memory to fault if read or written, until the runtime later
unprotects the memory on the mallocgc reuse path.

For the cases where a normal allocation is happening without any reuse,
some initial microbenchmarks suggest the impact of these changes could be
small to negligible (at least with GOAMD64=v3):

  goos: linux
  goarch: amd64
  pkg: runtime
  cpu: AMD EPYC 7B13
                        │ base-512M-v3.bench │ ps16-512M-goamd64-v3.bench │
                        │       sec/op       │    sec/op       vs base    │
  Malloc8-16                    11.01n ± 1%    10.94n ± 1%   -0.68% (p=0.038 n=20)
  Malloc16-16                   17.15n ± 1%    17.05n ± 0%   -0.55% (p=0.007 n=20)
  Malloc32-16                   18.65n ± 1%    18.42n ± 0%   -1.26% (p=0.000 n=20)
  MallocTypeInfo8-16            18.63n ± 0%    18.36n ± 0%   -1.45% (p=0.000 n=20)
  MallocTypeInfo16-16           22.32n ± 0%    22.65n ± 0%   +1.50% (p=0.000 n=20)
  MallocTypeInfo32-16           23.37n ± 0%    23.89n ± 0%   +2.23% (p=0.000 n=20)
  geomean                       18.02n         18.01n        -0.05%

These last benchmark results include the runtime updates to support span
classes with pointers (which were originally part of this CL, but later
split out for ease of review).
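To make the shape of the strings.Builder experiment above concrete, here
is a toy builder that frees its old buffer after each resize. Everything
here (freeNoscan, builder) is invented for the sketch: user code cannot
reach the real runtime.freegc, whose linkname this CL explicitly blocks,
so freeNoscan is a no-op stand-in marking where the call would go:

  package main

  import (
  	"fmt"
  	"unsafe"
  )

  // freeNoscan marks where the experimental runtime.freegc call would go.
  func freeNoscan(p unsafe.Pointer, size uintptr) { _, _ = p, size }

  type builder struct{ buf []byte }

  func (b *builder) Write(p []byte) {
  	if len(b.buf)+len(p) > cap(b.buf) {
  		old := b.buf
  		nbuf := make([]byte, len(b.buf), 2*cap(b.buf)+len(p))
  		copy(nbuf, old)
  		b.buf = nbuf
  		if cap(old) != 0 {
  			// old is provably dead here: we hold its only reference,
  			// and no String result has been handed out from it.
  			freeNoscan(unsafe.Pointer(unsafe.SliceData(old)), uintptr(cap(old)))
  		}
  	}
  	b.buf = append(b.buf, p...)
  }

  // String returns the accumulated bytes; the final buf is never freed.
  func (b *builder) String() string { return string(b.buf) }

  func main() {
  	var b builder
  	for i := 0; i < 5; i++ {
  		b.Write([]byte("hello, ")) // each resize frees the previous buffer
  	}
  	fmt.Println(b.String())
  }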
Updates #74299 Change-Id: Icceaa0f79f85c70cd1a718f9a4e7f0cf3d77803c Reviewed-on: https://go-review.googlesource.com/c/go/+/673695 LUCI-TryBot-Result: Go LUCI Reviewed-by: Michael Knyszek Reviewed-by: Junyang Shao --- src/cmd/link/internal/loader/loader.go | 3 +- src/cmd/link/link_test.go | 1 + src/cmd/link/testdata/linkname/freegc.go | 18 ++ src/runtime/export_test.go | 9 + src/runtime/malloc.go | 329 ++++++++++++++++++++++++++++++- src/runtime/malloc_test.go | 286 +++++++++++++++++++++++++++ src/runtime/mcache.go | 52 ++++- src/runtime/mheap.go | 2 +- 8 files changed, 693 insertions(+), 7 deletions(-) create mode 100644 src/cmd/link/testdata/linkname/freegc.go (limited to 'src/runtime/export_test.go') diff --git a/src/cmd/link/internal/loader/loader.go b/src/cmd/link/internal/loader/loader.go index 2d386c0c65..9ab55643f6 100644 --- a/src/cmd/link/internal/loader/loader.go +++ b/src/cmd/link/internal/loader/loader.go @@ -2464,10 +2464,11 @@ var blockedLinknames = map[string][]string{ // Experimental features "runtime.goroutineLeakGC": {"runtime/pprof"}, "runtime.goroutineleakcount": {"runtime/pprof"}, + "runtime.freegc": {}, // disallow all packages // Others "net.newWindowsFile": {"net"}, // pushed from os "testing/synctest.testingSynctestTest": {"testing/synctest"}, // pushed from testing - "runtime.addmoduledata": {}, // disallow all package + "runtime.addmoduledata": {}, // disallow all packages } // check if a linkname reference to symbol s from pkg is allowed diff --git a/src/cmd/link/link_test.go b/src/cmd/link/link_test.go index 31822d21f3..6ab1246c81 100644 --- a/src/cmd/link/link_test.go +++ b/src/cmd/link/link_test.go @@ -1616,6 +1616,7 @@ func TestCheckLinkname(t *testing.T) { // pull linkname of a builtin symbol is not ok {"builtin.go", false}, {"addmoduledata.go", false}, + {"freegc.go", false}, // legacy bad linkname is ok, for now {"fastrand.go", true}, {"badlinkname.go", true}, diff --git a/src/cmd/link/testdata/linkname/freegc.go b/src/cmd/link/testdata/linkname/freegc.go new file mode 100644 index 0000000000..390063f8e9 --- /dev/null +++ b/src/cmd/link/testdata/linkname/freegc.go @@ -0,0 +1,18 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Linkname runtime.freegc is not allowed. + +package main + +import ( + _ "unsafe" +) + +//go:linkname freegc runtime.freegc +func freegc() + +func main() { + freegc() +} diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index 731ba5d6b9..48dcf5aa39 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -639,6 +639,15 @@ func RunGetgThreadSwitchTest() { } } +// Expose freegc for testing. +func Freegc(p unsafe.Pointer, size uintptr, noscan bool) { + freegc(p, size, noscan) +} + +const SizeSpecializedMallocEnabled = sizeSpecializedMallocEnabled + +const RuntimeFreegcEnabled = runtimeFreegcEnabled + const ( PageSize = pageSize PallocChunkPages = pallocChunkPages diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index fc4f21b532..13f5fc3081 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -1080,7 +1080,8 @@ func (c *mcache) nextFree(spc spanClass) (v gclinkptr, s *mspan, checkGCTrigger // // We might consider turning these on by default; many of them previously were. // They account for a few % of mallocgc's cost though, which does matter somewhat -// at scale. +// at scale. 
(When testing changes to malloc, consider enabling this, and also +// some function-local 'doubleCheck' consts such as in mbitmap.go currently.) const doubleCheckMalloc = false // sizeSpecializedMallocEnabled is the set of conditions where we enable the size-specialized @@ -1089,6 +1090,12 @@ const doubleCheckMalloc = false // properly on plan9, so size-specialized malloc is also disabled on plan9. const sizeSpecializedMallocEnabled = goexperiment.SizeSpecializedMalloc && GOOS != "plan9" && !asanenabled && !raceenabled && !msanenabled && !valgrindenabled +// runtimeFreegcEnabled is the set of conditions where we enable the runtime.freegc +// implementation and the corresponding allocation-related changes: the experiment must be +// enabled, and none of the memory sanitizers should be enabled. We allow the race detector, +// in contrast to sizeSpecializedMallocEnabled. +const runtimeFreegcEnabled = goexperiment.RuntimeFreegc && !asanenabled && !msanenabled && !valgrindenabled + // Allocate an object of size bytes. // Small objects are allocated from the per-P cache's free lists. // Large objects (> 32 kB) are allocated straight from the heap. @@ -1150,7 +1157,8 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { size += asanRZ } - // Assist the GC if needed. + // Assist the GC if needed. (On the reuse path, we currently compensate for this; + // changes here might require changes there.) if gcBlackenEnabled != 0 { deductAssistCredit(size) } @@ -1413,6 +1421,16 @@ func mallocgcSmallNoscan(size uintptr, typ *_type, needzero bool) (unsafe.Pointe size = uintptr(gc.SizeClassToSize[sizeclass]) spc := makeSpanClass(sizeclass, true) span := c.alloc[spc] + + // First, check for a reusable object. + if runtimeFreegcEnabled && c.hasReusableNoscan(spc) { + // We have a reusable object, use it. + x := mallocgcSmallNoscanReuse(c, span, spc, size, needzero) + mp.mallocing = 0 + releasem(mp) + return x, size + } + v := nextFreeFast(span) if v == 0 { v, span, checkGCTrigger = c.nextFree(spc) @@ -1472,6 +1490,55 @@ func mallocgcSmallNoscan(size uintptr, typ *_type, needzero bool) (unsafe.Pointe return x, size } +// mallocgcSmallNoscanReuse returns a previously freed noscan object after preparing it for reuse. +// It must only be called if hasReusableNoscan returned true. +func mallocgcSmallNoscanReuse(c *mcache, span *mspan, spc spanClass, size uintptr, needzero bool) unsafe.Pointer { + // TODO(thepudds): could nextFreeFast, nextFree and nextReusable return unsafe.Pointer? + // Maybe doesn't matter. gclinkptr might be for historical reasons. + v, span := c.nextReusableNoScan(span, spc) + x := unsafe.Pointer(v) + + // Compensate for the GC assist credit deducted in mallocgc (before calling us and + // after we return) because this is not a newly allocated object. We use the full slot + // size (elemsize) here because that's what mallocgc deducts overall. Note we only + // adjust this when gcBlackenEnabled is true, which follows mallocgc behavior. + // TODO(thepudds): a follow-up CL adds a more specific test of our assist credit + // handling, including for validating internal fragmentation handling. + if gcBlackenEnabled != 0 { + addAssistCredit(size) + } + + // This is a previously used object, so only check needzero (and not span.needzero) + // for clearing. + if needzero { + memclrNoHeapPointers(x, size) + } + + // See publicationBarrier comment in mallocgcSmallNoscan. + publicationBarrier() + + // Finish and return. 
Note that we do not update span.freeIndexForScan, profiling info, + // nor do we check gcTrigger. + // TODO(thepudds): the current approach is viable for a GOEXPERIMENT, but + // means we do not profile reused heap objects. Ultimately, we will need a better + // approach for profiling, or at least ensure we are not introducing bias in the + // profiled allocations. + // TODO(thepudds): related, we probably want to adjust how allocs and frees are counted + // in the existing stats. Currently, reused objects are not counted as allocs nor + // frees, but instead roughly appear as if the original heap object lived on. We + // probably will also want some additional runtime/metrics, and generally think about + // user-facing observability & diagnostics, though all this likely can wait for an + // official proposal. + if writeBarrier.enabled { + // Allocate black during GC. + // All slots hold nil so no scanning is needed. + // This may be racing with GC so do it atomically if there can be + // a race marking the bit. + gcmarknewobject(span, uintptr(x)) + } + return x +} + func mallocgcSmallScanNoHeader(size uintptr, typ *_type) (unsafe.Pointer, uintptr) { // Set mp.mallocing to keep from being preempted by GC. mp := acquirem() @@ -1816,8 +1883,6 @@ func postMallocgcDebug(x unsafe.Pointer, elemsize uintptr, typ *_type) { // by size bytes, and assists the GC if necessary. // // Caller must be preemptible. -// -// Returns the G for which the assist credit was accounted. func deductAssistCredit(size uintptr) { // Charge the current user G for this allocation. assistG := getg() @@ -1836,6 +1901,262 @@ func deductAssistCredit(size uintptr) { } } +// addAssistCredit is like deductAssistCredit, +// but adds credit rather than removes, +// and never calls gcAssistAlloc. +func addAssistCredit(size uintptr) { + // Credit the current user G. + assistG := getg() + if assistG.m.curg != nil { // TODO(thepudds): do we need to do this? + assistG = assistG.m.curg + } + // Credit the size against the G. + assistG.gcAssistBytes += int64(size) +} + +const ( + // doubleCheckReusable enables some additional invariant checks for the + // runtime.freegc and reusable objects. Note that some of these checks alter timing, + // and it is good to test changes with and without this enabled. + doubleCheckReusable = false + + // debugReusableLog enables some printlns for runtime.freegc and reusable objects. + debugReusableLog = false +) + +// freegc records that a heap object is reusable and available for +// immediate reuse in a subsequent mallocgc allocation, without +// needing to wait for the GC cycle to progress. +// +// The information is recorded in a free list stored in the +// current P's mcache. The caller must pass in the user size +// and whether the object has pointers, which allows a faster free +// operation. +// +// freegc must be called by the effective owner of ptr who knows +// the pointer is logically dead, with no possible aliases that might +// be used past that moment. In other words, ptr must be the +// last and only pointer to its referent. +// +// The intended caller is the compiler. +// +// Note: please do not send changes that attempt to add freegc calls +// to the standard library. +// +// ptr must point to a heap object or into the current g's stack, +// in which case freegc is a no-op. In particular, ptr must not point +// to memory in the data or bss sections, which is partially enforced. 
+// For objects with a malloc header, ptr should point mallocHeaderSize bytes +// past the base; otherwise, ptr should point to the base of the heap object. +// In other words, ptr should be the same pointer that was returned by mallocgc. +// +// In addition, the caller must know that ptr's object has no specials, such +// as might have been created by a call to SetFinalizer or AddCleanup. +// (Internally, the runtime deals appropriately with internally-created +// specials, such as specials for memory profiling). +// +// If the size of ptr's object is less than 16 bytes or greater than +// 32KiB - gc.MallocHeaderSize bytes, freegc is currently a no-op. It must only +// be called in alloc-safe places. It currently throws if noscan is false +// (support for which is implemented in a later CL in our stack). +// +// Note that freegc accepts an unsafe.Pointer and hence keeps the pointer +// alive. It therefore could be a pessimization in some cases (such +// as a long-lived function) if the caller does not call freegc before +// or roughly when the liveness analysis of the compiler +// would otherwise have determined ptr's object is reclaimable by the GC. +func freegc(ptr unsafe.Pointer, size uintptr, noscan bool) bool { + if !runtimeFreegcEnabled || sizeSpecializedMallocEnabled || !reusableSize(size) { + // TODO(thepudds): temporarily disable freegc with SizeSpecializedMalloc until we finish integrating. + return false + } + if ptr == nil { + throw("freegc nil") + } + + // Set mp.mallocing to keep from being preempted by GC. + // Otherwise, the GC could flush our mcache or otherwise cause problems. + mp := acquirem() + if mp.mallocing != 0 { + throw("freegc deadlock") + } + if mp.gsignal == getg() { + throw("freegc during signal") + } + mp.mallocing = 1 + + if mp.curg.stack.lo <= uintptr(ptr) && uintptr(ptr) < mp.curg.stack.hi { + // This points into our stack, so free is a no-op. + mp.mallocing = 0 + releasem(mp) + return false + } + + if doubleCheckReusable { + // TODO(thepudds): we could enforce no free on globals in bss or data. Maybe by + // checking span via spanOf or spanOfHeap, or maybe walk from firstmoduledata + // like isGoPointerWithoutSpan, or activeModules, or something. If so, we might + // be able to delay checking until reuse (e.g., check span just before reusing, + // though currently we don't always need to lookup a span on reuse). If we think + // no usage patterns could result in globals, maybe enforcement for globals could + // be behind -d=checkptr=1 or similar. The compiler can have knowledge of where + // a variable is allocated, but stdlib does not, although there are certain + // usage patterns that cannot result in a global. + // TODO(thepudds): separately, consider a local debugReusableMcacheOnly here + // to ignore freed objects if not in mspan in mcache, maybe when freeing and reading, + // by checking something like s.base() <= uintptr(v) && uintptr(v) < s.limit. Or + // maybe a GODEBUG or compiler debug flag. + span := spanOf(uintptr(ptr)) + if span == nil { + throw("nextReusable: nil span for pointer in free list") + } + if state := span.state.get(); state != mSpanInUse { + throw("nextReusable: span is not in use") + } + } + + if debug.clobberfree != 0 { + clobberfree(ptr, size) + } + + // We first check if p is still in our per-P cache. + // Get our per-P cache for small objects. 
+ c := getMCache(mp) + if c == nil { + throw("freegc called without a P or outside bootstrapping") + } + + v := uintptr(ptr) + if !noscan && !heapBitsInSpan(size) { + // mallocgcSmallScanHeader expects to get the base address of the object back + // from the findReusable funcs (as well as from nextFreeFast and nextFree), and + // not mallocHeaderSize bytes into a object, so adjust that here. + v -= mallocHeaderSize + + // The size class lookup wants size to be adjusted by mallocHeaderSize. + size += mallocHeaderSize + } + + // TODO(thepudds): should verify (behind doubleCheckReusable constant) that our calculated + // sizeclass here matches what's in span found via spanOf(ptr) or findObject(ptr). + var sizeclass uint8 + if size <= gc.SmallSizeMax-8 { + sizeclass = gc.SizeToSizeClass8[divRoundUp(size, gc.SmallSizeDiv)] + } else { + sizeclass = gc.SizeToSizeClass128[divRoundUp(size-gc.SmallSizeMax, gc.LargeSizeDiv)] + } + + spc := makeSpanClass(sizeclass, noscan) + s := c.alloc[spc] + + if debugReusableLog { + if s.base() <= uintptr(v) && uintptr(v) < s.limit { + println("freegc [in mcache]:", hex(uintptr(v)), "sweepgen:", mheap_.sweepgen, "writeBarrier.enabled:", writeBarrier.enabled) + } else { + println("freegc [NOT in mcache]:", hex(uintptr(v)), "sweepgen:", mheap_.sweepgen, "writeBarrier.enabled:", writeBarrier.enabled) + } + } + + if noscan { + c.addReusableNoscan(spc, uintptr(v)) + } else { + // TODO(thepudds): implemented in later CL in our stack. + throw("freegc called for object with pointers, not yet implemented") + } + + // For stats, for now we leave allocCount alone, roughly pretending to the rest + // of the system that this potential reuse never happened. + + mp.mallocing = 0 + releasem(mp) + + return true +} + +// nextReusableNoScan returns the next reusable object for a noscan span, +// or 0 if no reusable object is found. +func (c *mcache) nextReusableNoScan(s *mspan, spc spanClass) (gclinkptr, *mspan) { + if !runtimeFreegcEnabled { + return 0, s + } + + // Pop a reusable pointer from the free list for this span class. + v := c.reusableNoscan[spc] + if v == 0 { + return 0, s + } + c.reusableNoscan[spc] = v.ptr().next + + if debugReusableLog { + println("reusing from ptr free list:", hex(v), "sweepgen:", mheap_.sweepgen, "writeBarrier.enabled:", writeBarrier.enabled) + } + if doubleCheckReusable { + doubleCheckNextReusable(v) // debug only sanity check + } + + // For noscan spans, we only need the span if the write barrier is enabled (so that our caller + // can call gcmarknewobject to allocate black). If the write barrier is enabled, we can skip + // looking up the span when the pointer is in a span in the mcache. + if !writeBarrier.enabled { + return v, nil + } + if s.base() <= uintptr(v) && uintptr(v) < s.limit { + // Return the original span. + return v, s + } + + // We must find and return the span. + span := spanOf(uintptr(v)) + if span == nil { + // TODO(thepudds): construct a test that triggers this throw. + throw("nextReusableNoScan: nil span for pointer in reusable object free list") + } + + return v, span +} + +// doubleCheckNextReusable checks some invariants. +// TODO(thepudds): will probably delete some of this. Can mostly be ignored for review. +func doubleCheckNextReusable(v gclinkptr) { + // TODO(thepudds): should probably take the spanClass as well to confirm expected + // sizeclass match. 
+ _, span, objIndex := findObject(uintptr(v), 0, 0) + if span == nil { + throw("nextReusable: nil span for pointer in free list") + } + if state := span.state.get(); state != mSpanInUse { + throw("nextReusable: span is not in use") + } + if uintptr(v) < span.base() || uintptr(v) >= span.limit { + throw("nextReusable: span is not in range") + } + if span.objBase(uintptr(v)) != uintptr(v) { + print("nextReusable: v=", hex(v), " base=", hex(span.objBase(uintptr(v))), "\n") + throw("nextReusable: v is non-base-address for object found on pointer free list") + } + if span.isFree(objIndex) { + throw("nextReusable: pointer on free list is free") + } + + const debugReusableEnsureSwept = false + if debugReusableEnsureSwept { + // Currently disabled. + // Note: ensureSwept here alters behavior (not just an invariant check). + span.ensureSwept() + if span.isFree(objIndex) { + throw("nextReusable: pointer on free list is free after ensureSwept") + } + } +} + +// reusableSize reports if size is a currently supported size for a reusable object. +func reusableSize(size uintptr) bool { + if size < maxTinySize || size > maxSmallSize-mallocHeaderSize { + return false + } + return true +} + // memclrNoHeapPointersChunked repeatedly calls memclrNoHeapPointers // on chunks of the buffer to be zeroed, with opportunities for preemption // along the way. memclrNoHeapPointers contains no safepoints and also diff --git a/src/runtime/malloc_test.go b/src/runtime/malloc_test.go index bf58947bbc..6285cdaff7 100644 --- a/src/runtime/malloc_test.go +++ b/src/runtime/malloc_test.go @@ -16,6 +16,7 @@ import ( "runtime" . "runtime" "strings" + "sync" "sync/atomic" "testing" "time" @@ -234,6 +235,275 @@ func TestTinyAllocIssue37262(t *testing.T) { runtime.Releasem() } +// TestFreegc does basic testing of explicit frees. +func TestFreegc(t *testing.T) { + tests := []struct { + size string + f func(noscan bool) func(*testing.T) + noscan bool + }{ + // Types without pointers. + {"size=16", testFreegc[[16]byte], true}, // smallest we support currently + {"size=17", testFreegc[[17]byte], true}, + {"size=64", testFreegc[[64]byte], true}, + {"size=500", testFreegc[[500]byte], true}, + {"size=512", testFreegc[[512]byte], true}, + {"size=4096", testFreegc[[4096]byte], true}, + {"size=32KiB-8", testFreegc[[1<<15 - 8]byte], true}, // max noscan small object for 64-bit + } + + // Run the tests twice if not in -short mode or not otherwise saving test time. + // First while manually calling runtime.GC to slightly increase isolation (perhaps making + // problems more reproducible). + for _, tt := range tests { + runtime.GC() + t.Run(fmt.Sprintf("gc=yes/ptrs=%v/%s", !tt.noscan, tt.size), tt.f(tt.noscan)) + } + runtime.GC() + + if testing.Short() || !RuntimeFreegcEnabled || runtime.Raceenabled { + return + } + + // Again, but without manually calling runtime.GC in the loop (perhaps less isolation might + // trigger problems). + for _, tt := range tests { + t.Run(fmt.Sprintf("gc=no/ptrs=%v/%s", !tt.noscan, tt.size), tt.f(tt.noscan)) + } + runtime.GC() +} + +func testFreegc[T comparable](noscan bool) func(*testing.T) { + // We use stressMultiple to influence the duration of the tests. + // When testing freegc changes, stressMultiple can be increased locally + // to test longer or in some cases with more goroutines. + // It can also be helpful to test with GODEBUG=clobberfree=1 and + // with and without doubleCheckMalloc and doubleCheckReusable enabled. 
+ stressMultiple := 10 + if testing.Short() || !RuntimeFreegcEnabled || runtime.Raceenabled { + stressMultiple = 1 + } + + return func(t *testing.T) { + alloc := func() *T { + // Force heap alloc, plus some light validation of zeroed memory. + t.Helper() + p := Escape(new(T)) + var zero T + if *p != zero { + t.Fatalf("allocator returned non-zero memory: %v", *p) + } + return p + } + + free := func(p *T) { + t.Helper() + var zero T + if *p != zero { + t.Fatalf("found non-zero memory before freeing (tests do not modify memory): %v", *p) + } + runtime.Freegc(unsafe.Pointer(p), unsafe.Sizeof(*p), noscan) + } + + t.Run("basic-free", func(t *testing.T) { + // Test that freeing a live heap object doesn't crash. + for range 100 { + p := alloc() + free(p) + } + }) + + t.Run("stack-free", func(t *testing.T) { + // Test that freeing a stack object doesn't crash. + for range 100 { + var x [32]byte + var y [32]*int + runtime.Freegc(unsafe.Pointer(&x), unsafe.Sizeof(x), true) // noscan + runtime.Freegc(unsafe.Pointer(&y), unsafe.Sizeof(y), false) // !noscan + } + }) + + // Check our allocations. These tests rely on the + // current implementation treating a re-used object + // as not adding to the allocation counts seen + // by testing.AllocsPerRun. (This is not the desired + // long-term behavior, but it is the current behavior and + // makes these tests convenient). + + t.Run("allocs-baseline", func(t *testing.T) { + // Baseline result without any explicit free. + allocs := testing.AllocsPerRun(100, func() { + for range 100 { + p := alloc() + _ = p + } + }) + if allocs < 100 { + // TODO(thepudds): we get exactly 100 for almost all the tests, but investigate why + // ~101 allocs for TestFreegc/ptrs=true/size=32KiB-8. + t.Fatalf("expected >=100 allocations, got %v", allocs) + } + }) + + t.Run("allocs-with-free", func(t *testing.T) { + // Same allocations, but now using explicit free so that + // no allocs get reported. (Again, not the desired long-term behavior). + if SizeSpecializedMallocEnabled { + t.Skip("temporarily skipping alloc tests for GOEXPERIMENT=sizespecializedmalloc") + } + if !RuntimeFreegcEnabled { + t.Skip("skipping alloc tests with runtime.freegc disabled") + } + allocs := testing.AllocsPerRun(100, func() { + for range 100 { + p := alloc() + free(p) + } + }) + if allocs != 0 { + t.Fatalf("expected 0 allocations, got %v", allocs) + } + }) + + t.Run("free-multiple", func(t *testing.T) { + // Multiple allocations outstanding before explicitly freeing, + // but still within the limit of our smallest free list size + // so that no allocs are reported. (Again, not long-term behavior). + if SizeSpecializedMallocEnabled { + t.Skip("temporarily skipping alloc tests for GOEXPERIMENT=sizespecializedmalloc") + } + if !RuntimeFreegcEnabled { + t.Skip("skipping alloc tests with runtime.freegc disabled") + } + const maxOutstanding = 20 + s := make([]*T, 0, maxOutstanding) + allocs := testing.AllocsPerRun(100*stressMultiple, func() { + s = s[:0] + for range maxOutstanding { + p := alloc() + s = append(s, p) + } + for _, p := range s { + free(p) + } + }) + if allocs != 0 { + t.Fatalf("expected 0 allocations, got %v", allocs) + } + }) + + if runtime.GOARCH == "wasm" { + // TODO(thepudds): for wasm, double-check if just slow, vs. some test logic problem, + // vs. something else. It might have been wasm was slowest with tests that spawn + // many goroutines, which might be expected for wasm. 
This skip might no longer be + // needed now that we have tuned test execution time more, or perhaps wasm should just + // always run in short mode, which might also let us remove this skip. + t.Skip("skipping remaining freegc tests, was timing out on wasm") + } + + t.Run("free-many", func(t *testing.T) { + // Confirm we are graceful if we have more freed elements at once + // than the max free list size. + s := make([]*T, 0, 1000) + iterations := stressMultiple * stressMultiple // currently 1 or 100 depending on -short + for range iterations { + s = s[:0] + for range 1000 { + p := alloc() + s = append(s, p) + } + for _, p := range s { + free(p) + } + } + }) + + t.Run("duplicate-check", func(t *testing.T) { + // A simple duplicate allocation test. We track what should be the set + // of live pointers in a map across a series of allocs and frees, + // and fail if a live pointer value is returned by an allocation. + // TODO: maybe add randomness? allow more live pointers? do across goroutines? + live := make(map[uintptr]bool) + for i := range 100 * stressMultiple { + var s []*T + // Alloc 10 times, tracking the live pointer values. + for j := range 10 { + p := alloc() + uptr := uintptr(unsafe.Pointer(p)) + if live[uptr] { + t.Fatalf("TestFreeLive: found duplicate pointer (0x%x). i: %d j: %d", uptr, i, j) + } + live[uptr] = true + s = append(s, p) + } + // Explicitly free those pointers, removing them from the live map. + for k := range s { + p := s[k] + s[k] = nil + uptr := uintptr(unsafe.Pointer(p)) + free(p) + delete(live, uptr) + } + } + }) + + t.Run("free-other-goroutine", func(t *testing.T) { + // Use explicit free, but the free happens on a different goroutine than the alloc. + // This also lightly simulates how the free code sees P migration or flushing + // the mcache, assuming we have > 1 P. (Not using testing.AllocsPerRun here). + iterations := 10 * stressMultiple * stressMultiple // currently 10 or 1000 depending on -short + for _, capacity := range []int{2} { + for range iterations { + ch := make(chan *T, capacity) + var wg sync.WaitGroup + for range 2 { + wg.Add(1) + go func() { + defer wg.Done() + for p := range ch { + free(p) + } + }() + } + for range 100 { + p := alloc() + ch <- p + } + close(ch) + wg.Wait() + } + } + }) + + t.Run("many-goroutines", func(t *testing.T) { + // Allocate across multiple goroutines, freeing on the same goroutine. + // TODO: probably remove the duplicate checking here; not that useful. 
+ counts := []int{1, 2, 4, 8, 10 * stressMultiple} + for _, goroutines := range counts { + var wg sync.WaitGroup + for range goroutines { + wg.Add(1) + go func() { + defer wg.Done() + live := make(map[uintptr]bool) + for range 100 * stressMultiple { + p := alloc() + uptr := uintptr(unsafe.Pointer(p)) + if live[uptr] { + panic("TestFreeLive: found duplicate pointer") + } + live[uptr] = true + free(p) + delete(live, uptr) + } + }() + } + wg.Wait() + } + }) + } +} + func TestPageCacheLeak(t *testing.T) { defer GOMAXPROCS(GOMAXPROCS(1)) leaked := PageCachePagesLeaked() @@ -337,6 +607,13 @@ func BenchmarkMalloc16(b *testing.B) { } } +func BenchmarkMalloc32(b *testing.B) { + for i := 0; i < b.N; i++ { + p := new([4]int64) + Escape(p) + } +} + func BenchmarkMallocTypeInfo8(b *testing.B) { for i := 0; i < b.N; i++ { p := new(struct { @@ -355,6 +632,15 @@ func BenchmarkMallocTypeInfo16(b *testing.B) { } } +func BenchmarkMallocTypeInfo32(b *testing.B) { + for i := 0; i < b.N; i++ { + p := new(struct { + p [32 / unsafe.Sizeof(uintptr(0))]*int + }) + Escape(p) + } +} + type LargeStruct struct { x [16][]byte } diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go index cade81031d..82872f1454 100644 --- a/src/runtime/mcache.go +++ b/src/runtime/mcache.go @@ -44,7 +44,17 @@ type mcache struct { // The rest is not accessed on every malloc. - alloc [numSpanClasses]*mspan // spans to allocate from, indexed by spanClass + // alloc contains spans to allocate from, indexed by spanClass. + alloc [numSpanClasses]*mspan + + // TODO(thepudds): better to interleave alloc and reusableScan/reusableNoscan so that + // a single malloc call can often access both in the same cache line for a given spanClass. + // It's not interleaved right now in part to have slightly smaller diff, and might be + // negligible effect on current microbenchmarks. + + // reusableNoscan contains linked lists of reusable noscan heap objects, indexed by spanClass. + // The next pointers are stored in the first word of the heap objects. + reusableNoscan [numSpanClasses]gclinkptr stackcache [_NumStackOrders]stackfreelist @@ -96,6 +106,7 @@ func allocmcache() *mcache { c.alloc[i] = &emptymspan } c.nextSample = nextSample() + return c } @@ -153,6 +164,16 @@ func (c *mcache) refill(spc spanClass) { if s.allocCount != s.nelems { throw("refill of span with free space remaining") } + + // TODO(thepudds): we might be able to allow mallocgcTiny to reuse 16 byte objects from spc==5, + // but for now, just clear our reusable objects for tinySpanClass. + if spc == tinySpanClass { + c.reusableNoscan[spc] = 0 + } + if c.reusableNoscan[spc] != 0 { + throw("refill of span with reusable pointers remaining on pointer free list") + } + if s != &emptymspan { // Mark this span as no longer cached. if s.sweepgen != mheap_.sweepgen+3 { @@ -312,6 +333,13 @@ func (c *mcache) releaseAll() { c.tinyAllocs = 0 memstats.heapStats.release() + // Clear the reusable linked lists. + // For noscan objects, the nodes of the linked lists are the reusable heap objects themselves, + // so we can simply clear the linked list head pointers. + // TODO(thepudds): consider having debug logging of a non-empty reusable lists getting cleared, + // maybe based on the existing debugReusableLog. + clear(c.reusableNoscan[:]) + // Update heapLive and heapScan. 
 	gcController.update(dHeapLive, scanAlloc)
 }
 
@@ -339,3 +367,25 @@ func (c *mcache) prepareForSweep() {
 	stackcache_clear(c)
 	c.flushGen.Store(mheap_.sweepgen) // Synchronizes with gcStart
 }
+
+// addReusableNoscan adds a noscan object pointer to the reusable pointer free list
+// for a span class.
+func (c *mcache) addReusableNoscan(spc spanClass, ptr uintptr) {
+	if !runtimeFreegcEnabled {
+		return
+	}
+
+	// Add to the reusable pointers free list.
+	v := gclinkptr(ptr)
+	v.ptr().next = c.reusableNoscan[spc]
+	c.reusableNoscan[spc] = v
+}
+
+// hasReusableNoscan reports whether there is a reusable object available for
+// a noscan spc.
+func (c *mcache) hasReusableNoscan(spc spanClass) bool {
+	if !runtimeFreegcEnabled {
+		return false
+	}
+	return c.reusableNoscan[spc] != 0
+}
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 08a0057be7..d2ff063b00 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -435,7 +435,7 @@ type mspan struct {
 	// indicating a free object. freeindex is then adjusted so that subsequent scans begin
 	// just past the newly discovered free object.
 	//
-	// If freeindex == nelems, this span has no free objects.
+	// If freeindex == nelems, this span has no free objects, though might have reusable objects.
 	//
 	// allocBits is a bitmap of objects in this span.
 	// If n >= freeindex and allocBits[n/8] & (1<<(n%8)) is 0
-- 
cgit v1.3-5-g9baa


From 120f1874ef380362cf8b8c4775a327bcd417ff70 Mon Sep 17 00:00:00 2001
From: thepudds
Date: Mon, 3 Nov 2025 16:40:40 -0500
Subject: runtime: add more precise test of assist credit handling for
 runtime.freegc

This CL is part of a set of CLs that attempt to reduce how much work
the GC must do. See the design in
https://go.dev/design/74299-runtime-freegc

This CL adds a better test of assist credit handling when heap objects
are being reused after a runtime.freegc call.

The main approach is bracketing alloc/free pairs with measurements of
the assist credit before and after, and hoping to see a net zero change
in the assist credit.

However, validating the desired behavior is perhaps a bit subtle. To
help stabilize the measurements, we do acquirem in the test code to
avoid being preempted during the measurements, which reduces other
code's ability to adjust the assist credit while we are measuring, and
we also reduce GOMAXPROCS to 1.

This test currently does fail if we deliberately introduce bugs in the
runtime.freegc implementation, such as if we:

 - never adjust the assist credit when reusing an object, or
 - always adjust the assist credit when reusing an object, or
 - deliberately mishandle internal fragmentation.

The two main cases of current interest for testing runtime.freegc are
when gcBlackenEnabled is either true or false over the course of our
bracketed measurements. The test attempts to exercise both of those
cases by running the GC continually in the background (which we can see
seems effective based on logging and by how our deliberate bugs fail).

This passes ~10K test executions locally via stress.

A small note to the future: a previous incarnation of this test (circa
patchset 11 of this CL) did not do acquirem but had an approach of
ignoring certain measurements, which also was able to pass ~10K runs via
stress. The current version in this CL is simpler, but we record the
existence of the prior version here in case it is useful in the future.
(Hopefully not.)
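The invariant the test checks reduces to simple arithmetic. Below is a
toy model of the credit flow, with requested and elemsize as illustrative
values; per the reuse-path comments in the previous CL, mallocgc deducts
the full slot size overall and the reuse path credits the same amount
back (the exact split between the upfront deduction and the
internal-fragmentation adjustment is a reading of mallocgc, not something
spelled out in this CL):

  package main

  import "fmt"

  func main() {
  	const requested = 500 // bytes asked of mallocgc (illustrative)
  	const elemsize = 512  // size class slot actually used (illustrative)

  	credit := int64(0)
  	credit -= requested            // deductAssistCredit at the top of mallocgc
  	credit += elemsize             // addAssistCredit on the reuse path (full slot size)
  	credit -= elemsize - requested // mallocgc's internal-fragmentation adjustment
  	// freegc itself leaves the credit alone, so an alloc/free pair that
  	// hits the reuse path must net to zero, which is what the test asserts.
  	fmt.Println("net delta:", credit) // net delta: 0
  }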
Updates #74299

Change-Id: I46c7e0295d125f5884fee0cc3d3d31aedc7e5ff4
Reviewed-on: https://go-review.googlesource.com/c/go/+/717520
Reviewed-by: Michael Knyszek
Reviewed-by: Junyang Shao
LUCI-TryBot-Result: Go LUCI
---
 src/runtime/export_test.go | 28 ++++++++++++++
 src/runtime/malloc_test.go | 93 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 117 insertions(+), 4 deletions(-)

(limited to 'src/runtime/export_test.go')

diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index 48dcf5aa39..8438603b9e 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -644,6 +644,25 @@ func Freegc(p unsafe.Pointer, size uintptr, noscan bool) {
 	freegc(p, size, noscan)
 }
 
+// Expose gcAssistBytes for the current g for testing.
+func AssistCredit() int64 {
+	assistG := getg()
+	if assistG.m.curg != nil {
+		assistG = assistG.m.curg
+	}
+	return assistG.gcAssistBytes
+}
+
+// Expose gcBlackenEnabled for testing.
+func GcBlackenEnable() bool {
+	// Note we do a non-atomic load here.
+	// Some checks against gcBlackenEnabled (e.g., in mallocgc)
+	// are currently done via non-atomic load for performance reasons,
+	// but other checks are done via atomic load (e.g., in mgcmark.go),
+	// so interpreting this value in a test may be subtle.
+	return gcBlackenEnabled != 0
+}
+
 const SizeSpecializedMallocEnabled = sizeSpecializedMallocEnabled
 
 const RuntimeFreegcEnabled = runtimeFreegcEnabled
@@ -1487,6 +1506,15 @@ func Releasem() {
 	releasem(getg().m)
 }
 
+// GoschedIfBusy is an explicit preemption check to call back
+// into the scheduler. This is useful for tests that run code
+// which spends most of its time as non-preemptible, as it
+// can be placed right after becoming preemptible again to ensure
+// that the scheduler gets a chance to preempt the goroutine.
+func GoschedIfBusy() {
+	goschedIfBusy()
+}
+
 type PIController struct {
 	piController
 }
diff --git a/src/runtime/malloc_test.go b/src/runtime/malloc_test.go
index 6285cdaff7..10c20e6c23 100644
--- a/src/runtime/malloc_test.go
+++ b/src/runtime/malloc_test.go
@@ -249,6 +249,7 @@ func TestFreegc(t *testing.T) {
 		{"size=500", testFreegc[[500]byte], true},
 		{"size=512", testFreegc[[512]byte], true},
 		{"size=4096", testFreegc[[4096]byte], true},
+		{"size=20000", testFreegc[[20000]byte], true}, // not power of 2 or spc boundary
 		{"size=32KiB-8", testFreegc[[1<<15 - 8]byte], true}, // max noscan small object for 64-bit
 	}
@@ -300,7 +301,7 @@ func testFreegc[T comparable](noscan bool) func(*testing.T) {
 			t.Helper()
 			var zero T
 			if *p != zero {
-				t.Fatalf("found non-zero memory before freeing (tests do not modify memory): %v", *p)
+				t.Fatalf("found non-zero memory before freegc (tests do not modify memory): %v", *p)
 			}
 			runtime.Freegc(unsafe.Pointer(p), unsafe.Sizeof(*p), noscan)
 		}
@@ -405,7 +406,7 @@ func testFreegc[T comparable](noscan bool) func(*testing.T) {
 			// Confirm we are graceful if we have more freed elements at once
 			// than the max free list size.
 			s := make([]*T, 0, 1000)
-			iterations := stressMultiple * stressMultiple // currently 1 or 100 depending on -short
+			iterations := stressMultiple * stressMultiple // currently 1 (-short) or 100
 			for range iterations {
 				s = s[:0]
 				for range 1000 {
@@ -431,7 +432,7 @@ func testFreegc[T comparable](noscan bool) func(*testing.T) {
 				p := alloc()
 				uptr := uintptr(unsafe.Pointer(p))
 				if live[uptr] {
-					t.Fatalf("TestFreeLive: found duplicate pointer (0x%x). i: %d j: %d", uptr, i, j)
+					t.Fatalf("found duplicate pointer (0x%x).
i: %d j: %d", uptr, i, j) } live[uptr] = true s = append(s, p) @@ -451,7 +452,7 @@ func testFreegc[T comparable](noscan bool) func(*testing.T) { // Use explicit free, but the free happens on a different goroutine than the alloc. // This also lightly simulates how the free code sees P migration or flushing // the mcache, assuming we have > 1 P. (Not using testing.AllocsPerRun here). - iterations := 10 * stressMultiple * stressMultiple // currently 10 or 1000 depending on -short + iterations := 10 * stressMultiple * stressMultiple // currently 10 (-short) or 1000 for _, capacity := range []int{2} { for range iterations { ch := make(chan *T, capacity) @@ -501,6 +502,90 @@ func testFreegc[T comparable](noscan bool) func(*testing.T) { wg.Wait() } }) + + t.Run("assist-credit", func(t *testing.T) { + // Allocate and free using the same span class repeatedly while + // verifying it results in a net zero change in assist credit. + // This helps double-check our manipulation of the assist credit + // during mallocgc/freegc, including in cases when there is + // internal fragmentation when the requested mallocgc size is + // smaller than the size class. + // + // See https://go.dev/cl/717520 for some additional discussion, + // including how we can deliberately cause the test to fail currently + // if we purposefully introduce some assist credit bugs. + if SizeSpecializedMallocEnabled { + // TODO(thepudds): skip this test at this point in the stack; later CL has + // integration with sizespecializedmalloc. + t.Skip("temporarily skip assist credit test for GOEXPERIMENT=sizespecializedmalloc") + } + if !RuntimeFreegcEnabled { + t.Skip("skipping assist credit test with runtime.freegc disabled") + } + + // Use a background goroutine to continuously run the GC. + done := make(chan struct{}) + defer close(done) + go func() { + for { + select { + case <-done: + return + default: + runtime.GC() + } + } + }() + + // If making changes related to this test, consider testing locally with + // larger counts, like 100K or 1M. + counts := []int{1, 2, 10, 100 * stressMultiple} + // Dropping down to GOMAXPROCS=1 might help reduce noise. + defer GOMAXPROCS(GOMAXPROCS(1)) + size := int64(unsafe.Sizeof(*new(T))) + for _, count := range counts { + // Start by forcing a GC to reset this g's assist credit + // and perhaps help us get a cleaner measurement of GC cycle count. + runtime.GC() + for i := range count { + // We disable preemption to reduce other code's ability to adjust this g's + // assist credit or otherwise change things while we are measuring. + Acquirem() + + // We do two allocations per loop, with the second allocation being + // the one we measure. The first allocation tries to ensure at least one + // reusable object on the mspan's free list when we do our measured allocation. + p := alloc() + free(p) + + // Now do our primary allocation of interest, bracketed by measurements. + // We measure more than we strictly need (to log details in case of a failure). 
+ creditStart := AssistCredit() + blackenStart := GcBlackenEnable() + p = alloc() + blackenAfterAlloc := GcBlackenEnable() + creditAfterAlloc := AssistCredit() + free(p) + blackenEnd := GcBlackenEnable() + creditEnd := AssistCredit() + + Releasem() + GoschedIfBusy() + + delta := creditEnd - creditStart + if delta != 0 { + t.Logf("assist credit non-zero delta: %d", delta) + t.Logf("\t| size: %d i: %d count: %d", size, i, count) + t.Logf("\t| credit before: %d credit after: %d", creditStart, creditEnd) + t.Logf("\t| alloc delta: %d free delta: %d", + creditAfterAlloc-creditStart, creditEnd-creditAfterAlloc) + t.Logf("\t| gcBlackenEnable (start / after alloc / end): %v/%v/%v", + blackenStart, blackenAfterAlloc, blackenEnd) + t.FailNow() + } + } + } + }) } } -- cgit v1.3-5-g9baa From e912618bd2de2121d6c9fed3473b5e0a47da138c Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Tue, 10 Jun 2025 19:19:08 -0400 Subject: runtime: add hexdumper Currently, we have a simple hexdumpWords facility for debugging. It's useful but pretty limited. This CL adds a much more configurable and capable "hexdumper". It can be configured for any word size (including bytes), handles unaligned data, includes an ASCII dump, and accepts data in multiple slices. It also has a much nicer "mark" facility for annotating the hexdump that isn't limited to a single character per word. We use this to improve our existing hexdumps, particularly the new mark facility. The next CL will integrate hexdumps into debuglog, which will make use of several other new capabilities. Also this adds an actual test. The output looks like: 7 6 5 4 3 2 1 0 f e d c b a 9 8 0123456789abcdef 000000c00006ef70: 03000000 00000000 ........ 000000c00006ef80: 00000000 0053da80 000000c0 000bc380 ..S............. ^ 000000c00006ef90: 00000000 0053dac0 000000c0 000bc380 ..S............. ^ 000000c00006efa0: 000000c0 0006ef90 000000c0 0006ef80 ................ 000000c00006efb0: 000000c0 0006efd0 00000000 0053eb65 ........e.S..... ^ 000000c00006efc0: 000000c0 000bc380 00000000 009aaae8 ................ 000000c00006efd0: 00000000 00000000 00000000 00496b01 .........kI..... ^ 000000c00006efe0: 00000000 00000000 00000000 00000000 ................ 000000c00006eff0: 00000000 00000000 ........ The header gives column labels, indicating the order of bytes within the following words. The addresses on the left are always 16-byte aligned so it's easy to combine that address with the column header to determine the full address of a byte. Annotations are no longer interleaved with the data, so the data stays in nicely aligned columns. The annotations are also now much more flexible, including support for multiple annotations on the same word (not shown). 
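For a feel for the API, here is roughly how the test-only export added
below (runtime.Hexdumper in export_test.go) can drive it. This only
compiles inside the runtime's own test package, and demoHexdump and the
0x1000 base address are invented for the sketch:

  package runtime_test

  import (
  	"fmt"
  	"runtime"
  )

  // demoHexdump renders a byte-oriented dump of data as if it lived at
  // address 0x1000, annotating every 8-aligned address.
  func demoHexdump() {
  	data := []byte("Go hexdumper demo!")
  	out := runtime.Hexdumper(0x1000, 1, func(addr uintptr, start func()) {
  		if addr%8 == 0 {
  			start()         // emits the aligned "^ " caret for this word
  			println("mark") // the annotation, ending with a newline
  		}
  	}, data)
  	fmt.Print(out) // header, hex/ASCII dump, and the marks
  }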
Change-Id: I27e83800a1f6a7bdd3cc2c59614661a810a57d4d Reviewed-on: https://go-review.googlesource.com/c/go/+/681375 Reviewed-by: Michael Pratt LUCI-TryBot-Result: Go LUCI Auto-Submit: Austin Clements --- src/runtime/export_test.go | 33 ++++++ src/runtime/hexdump.go | 269 ++++++++++++++++++++++++++++++++++++++++++++ src/runtime/hexdump_test.go | 151 +++++++++++++++++++++++++ src/runtime/mgcmark.go | 15 ++- src/runtime/mgcsweep.go | 2 +- src/runtime/print.go | 41 ------- src/runtime/traceback.go | 21 ++-- 7 files changed, 475 insertions(+), 57 deletions(-) create mode 100644 src/runtime/hexdump.go create mode 100644 src/runtime/hexdump_test.go (limited to 'src/runtime/export_test.go') diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index 8438603b9e..2db8add7e4 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -2029,3 +2029,36 @@ func (head *ListHeadManual) Pop() unsafe.Pointer { func (head *ListHeadManual) Remove(p unsafe.Pointer) { head.l.remove(p) } + +func Hexdumper(base uintptr, wordBytes int, mark func(addr uintptr, start func()), data ...[]byte) string { + buf := make([]byte, 0, 2048) + getg().writebuf = buf + h := hexdumper{addr: base, addrBytes: 4, wordBytes: uint8(wordBytes)} + if mark != nil { + h.mark = func(addr uintptr, m hexdumpMarker) { + mark(addr, m.start) + } + } + for _, d := range data { + h.write(d) + } + h.close() + n := len(getg().writebuf) + getg().writebuf = nil + if n == cap(buf) { + panic("Hexdumper buf too small") + } + return string(buf[:n]) +} + +func HexdumpWords(p, bytes uintptr) string { + buf := make([]byte, 0, 2048) + getg().writebuf = buf + hexdumpWords(p, bytes, nil) + n := len(getg().writebuf) + getg().writebuf = nil + if n == cap(buf) { + panic("HexdumpWords buf too small") + } + return string(buf[:n]) +} diff --git a/src/runtime/hexdump.go b/src/runtime/hexdump.go new file mode 100644 index 0000000000..0d7dbb540b --- /dev/null +++ b/src/runtime/hexdump.go @@ -0,0 +1,269 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/goarch" + "unsafe" +) + +// hexdumpWords prints a word-oriented hex dump of [p, p+len). +// +// If mark != nil, it will be passed to hexdumper.mark. +func hexdumpWords(p, len uintptr, mark func(uintptr, hexdumpMarker)) { + printlock() + + // Provide a default annotation + symMark := func(u uintptr, hm hexdumpMarker) { + if mark != nil { + mark(u, hm) + } + + // Can we symbolize this value? + val := *(*uintptr)(unsafe.Pointer(u)) + fn := findfunc(val) + if fn.valid() { + hm.start() + print("<", funcname(fn), "+", hex(val-fn.entry()), ">\n") + } + } + + h := hexdumper{addr: p, mark: symMark} + h.write(unsafe.Slice((*byte)(unsafe.Pointer(p)), len)) + h.close() + printunlock() +} + +// hexdumper is a Swiss-army knife hex dumper. +// +// To use, optionally set addr and wordBytes, then call write repeatedly, +// followed by close. +type hexdumper struct { + // addr is the address to print for the first byte of data. + addr uintptr + + // addrBytes is the number of bytes of addr to print. If this is 0, it + // defaults to goarch.PtrSize. + addrBytes uint8 + + // wordBytes is the number of bytes in a word. If wordBytes is 1, this + // prints a byte-oriented dump. If it's > 1, this interprets the data as a + // sequence of words of the given size. If it's 0, it's treated as + // goarch.PtrSize. 
+ wordBytes uint8 + + // mark is an optional function that can annotate values in the hex dump. + // + // If non-nil, it is called with the address of every complete, aligned word + // in the hex dump. + // + // If it decides to print an annotation, it must first call m.start(), then + // print the annotation, followed by a new line. + mark func(addr uintptr, m hexdumpMarker) + + // Below here is state + + ready int8 // 0=need to init state; 1=need to print header; 2=ready + + // dataBuf accumulates a line at a time of data, in case it's split across + // buffers. + dataBuf [16]byte + dataPos uint8 + dataSkip uint8 // Skip first n bytes of buf on first line + + // toPos maps from byte offset in data to a visual offset in the printed line. + toPos [16]byte +} + +type hexdumpMarker struct { + chars int +} + +func (h *hexdumper) write(data []byte) { + if h.ready == 0 { + h.init() + } + + // Handle leading data + if h.dataPos > 0 { + n := copy(h.dataBuf[h.dataPos:], data) + h.dataPos += uint8(n) + data = data[n:] + if h.dataPos < uint8(len(h.dataBuf)) { + return + } + h.flushLine(h.dataBuf[:]) + h.dataPos = 0 + } + + // Handle full lines in data + for len(data) >= len(h.dataBuf) { + h.flushLine(data[:len(h.dataBuf)]) + data = data[len(h.dataBuf):] + } + + // Handle trailing data + h.dataPos = uint8(copy(h.dataBuf[:], data)) +} + +func (h *hexdumper) close() { + if h.dataPos > 0 { + h.flushLine(h.dataBuf[:h.dataPos]) + } +} + +func (h *hexdumper) init() { + const bytesPerLine = len(h.dataBuf) + + if h.addrBytes == 0 { + h.addrBytes = goarch.PtrSize + } else if h.addrBytes < 0 || h.addrBytes > goarch.PtrSize { + throw("invalid addrBytes") + } + + if h.wordBytes == 0 { + h.wordBytes = goarch.PtrSize + } + wb := int(h.wordBytes) + if wb < 0 || wb >= bytesPerLine || wb&(wb-1) != 0 { + throw("invalid wordBytes") + } + + // Construct position mapping. + for i := range h.toPos { + // First, calculate the "field" within the line, applying byte swizzling. + field := 0 + if goarch.BigEndian { + field = i + } else { + field = i ^ int(wb-1) + } + // Translate this field into a visual offset. + // "00112233 44556677 8899AABB CCDDEEFF" + h.toPos[i] = byte(field*2 + field/4 + field/8) + } + + // The first line may need to skip some fields to get to alignment. + // Round down the starting address. + nAddr := h.addr &^ uintptr(bytesPerLine-1) + // Skip bytes to get to alignment. + h.dataPos = uint8(h.addr - nAddr) + h.dataSkip = uint8(h.addr - nAddr) + h.addr = nAddr + + // We're ready to print the header. + h.ready = 1 +} + +func (h *hexdumper) flushLine(data []byte) { + const bytesPerLine = len(h.dataBuf) + + const maxAddrChars = 2 * goarch.PtrSize + const addrSep = ": " + dataStart := int(2*h.addrBytes) + len(addrSep) + // dataChars uses the same formula to toPos above. We calculate it with the + // "last field", then add the size of the last field. + const dataChars = (bytesPerLine-1)*2 + (bytesPerLine-1)/4 + (bytesPerLine-1)/8 + 2 + const asciiSep = " " + asciiStart := dataStart + dataChars + len(asciiSep) + const asciiChars = bytesPerLine + nlPos := asciiStart + asciiChars + + var lineBuf [maxAddrChars + len(addrSep) + dataChars + len(asciiSep) + asciiChars + 1]byte + clear := func() { + for i := range lineBuf { + lineBuf[i] = ' ' + } + } + clear() + + if h.ready == 1 { + // Print column offsets header. + for offset, pos := range h.toPos { + h.fmtHex(lineBuf[dataStart+int(pos+1):][:1], uint64(offset)) + } + // Print ASCII offsets. 
+ for offset := range asciiChars { + h.fmtHex(lineBuf[asciiStart+offset:][:1], uint64(offset)) + } + lineBuf[nlPos] = '\n' + gwrite(lineBuf[:nlPos+1]) + clear() + h.ready = 2 + } + + // Format address. + h.fmtHex(lineBuf[:2*h.addrBytes], uint64(h.addr)) + copy(lineBuf[2*h.addrBytes:], addrSep) + // Format data in hex and ASCII. + for offset, b := range data { + if offset < int(h.dataSkip) { + continue + } + + pos := h.toPos[offset] + h.fmtHex(lineBuf[dataStart+int(pos):][:2], uint64(b)) + + copy(lineBuf[dataStart+dataChars:], asciiSep) + ascii := uint8('.') + if b >= ' ' && b <= '~' { + ascii = b + } + lineBuf[asciiStart+offset] = ascii + } + // Trim buffer. + end := asciiStart + len(data) + lineBuf[end] = '\n' + buf := lineBuf[:end+1] + + // Print. + gwrite(buf) + + // Print marks. + if h.mark != nil { + clear() + for offset := 0; offset+int(h.wordBytes) <= len(data); offset += int(h.wordBytes) { + if offset < int(h.dataSkip) { + continue + } + addr := h.addr + uintptr(offset) + // Find the position of the left edge of this word + caret := dataStart + int(min(h.toPos[offset], h.toPos[offset+int(h.wordBytes)-1])) + h.mark(addr, hexdumpMarker{caret}) + } + } + + h.addr += uintptr(bytesPerLine) + h.dataPos = 0 + h.dataSkip = 0 +} + +// fmtHex formats v in base 16 into buf. It fills all of buf. If buf is too +// small to represent v, it the output will start with '*'. +func (h *hexdumper) fmtHex(buf []byte, v uint64) { + const dig = "0123456789abcdef" + i := len(buf) - 1 + for ; i >= 0; i-- { + buf[i] = dig[v%16] + v /= 16 + } + if v != 0 { + // Indicate that we couldn't fit the whole number. + buf[0] = '*' + } +} + +func (m hexdumpMarker) start() { + var spaces [64]byte + for i := range spaces { + spaces[i] = ' ' + } + for m.chars > len(spaces) { + gwrite(spaces[:]) + m.chars -= len(spaces) + } + gwrite(spaces[:m.chars]) + print("^ ") +} diff --git a/src/runtime/hexdump_test.go b/src/runtime/hexdump_test.go new file mode 100644 index 0000000000..cc44e48e4b --- /dev/null +++ b/src/runtime/hexdump_test.go @@ -0,0 +1,151 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + "fmt" + "internal/abi" + "internal/goarch" + "runtime" + "slices" + "strings" + "testing" + "unsafe" +) + +func TestHexdumper(t *testing.T) { + check := func(label, got, want string) { + got = strings.TrimRight(got, "\n") + want = strings.TrimPrefix(want, "\n") + want = strings.TrimRight(want, "\n") + if got != want { + t.Errorf("%s: got\n%s\nwant\n%s", label, got, want) + } + } + + data := make([]byte, 32) + for i := range data { + data[i] = 0x10 + byte(i) + } + + check("basic", runtime.Hexdumper(0, 1, nil, data), ` + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef +00000000: 10111213 14151617 18191a1b 1c1d1e1f ................ +00000010: 20212223 24252627 28292a2b 2c2d2e2f !"#$%&'()*+,-./`) + + if !goarch.BigEndian { + // Different word sizes + check("word=4", runtime.Hexdumper(0, 4, nil, data), ` + 3 2 1 0 7 6 5 4 b a 9 8 f e d c 0123456789abcdef +00000000: 13121110 17161514 1b1a1918 1f1e1d1c ................ +00000010: 23222120 27262524 2b2a2928 2f2e2d2c !"#$%&'()*+,-./`) + check("word=8", runtime.Hexdumper(0, 8, nil, data), ` + 7 6 5 4 3 2 1 0 f e d c b a 9 8 0123456789abcdef +00000000: 17161514 13121110 1f1e1d1c 1b1a1918 ................ 
+00000010: 27262524 23222120 2f2e2d2c 2b2a2928 !"#$%&'()*+,-./`)
+	}
+
+	// Starting offset
+	check("offset=1", runtime.Hexdumper(1, 1, nil, data), `
+ 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef
+00000000: 101112 13141516 1718191a 1b1c1d1e ...............
+00000010: 1f202122 23242526 2728292a 2b2c2d2e . !"#$%&'()*+,-.
+00000020: 2f /`)
+	if !goarch.BigEndian {
+		// ... combined with a word size
+		check("offset=1 and word=4", runtime.Hexdumper(1, 4, nil, data), `
+ 3 2 1 0 7 6 5 4 b a 9 8 f e d c 0123456789abcdef
+00000000: 121110 16151413 1a191817 1e1d1c1b ...............
+00000010: 2221201f 26252423 2a292827 2e2d2c2b . !"#$%&'()*+,-.
+00000020: 2f /`)
+	}
+
+	// Partial data full of annoying boundaries.
+	partials := make([][]byte, 0)
+	for i := 0; i < len(data); i += 2 {
+		partials = append(partials, data[i:i+2])
+	}
+	check("partials", runtime.Hexdumper(1, 1, nil, partials...), `
+ 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef
+00000000: 101112 13141516 1718191a 1b1c1d1e ...............
+00000010: 1f202122 23242526 2728292a 2b2c2d2e . !"#$%&'()*+,-.
+00000020: 2f /`)
+
+	// Marks.
+	check("marks", runtime.Hexdumper(0, 1, func(addr uintptr, start func()) {
+		if addr%7 == 0 {
+			start()
+			println("mark")
+		}
+	}, data), `
+ 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef
+00000000: 10111213 14151617 18191a1b 1c1d1e1f ................
+ ^ mark
+ ^ mark
+ ^ mark
+00000010: 20212223 24252627 28292a2b 2c2d2e2f !"#$%&'()*+,-./
+ ^ mark
+ ^ mark`)
+	if !goarch.BigEndian {
+		check("marks and word=4", runtime.Hexdumper(0, 4, func(addr uintptr, start func()) {
+			if addr%7 == 0 {
+				start()
+				println("mark")
+			}
+		}, data), `
+ 3 2 1 0 7 6 5 4 b a 9 8 f e d c 0123456789abcdef
+00000000: 13121110 17161514 1b1a1918 1f1e1d1c ................
+ ^ mark
+00000010: 23222120 27262524 2b2a2928 2f2e2d2c !"#$%&'()*+,-./
+ ^ mark`)
+	}
+}
+
+func TestHexdumpWords(t *testing.T) {
+	if goarch.BigEndian || goarch.PtrSize != 8 {
+		// We could support these, but it's kind of a pain.
+		t.Skip("requires 64-bit little endian")
+	}
+
+	// Most of this is in hexdumper. Here we just test the symbolizer.
+
+	pc := abi.FuncPCABIInternal(TestHexdumpWords)
+	pcs := slices.Repeat([]uintptr{pc}, 3)
+
+	// Make sure pcs doesn't move around on us.
+	var p runtime.Pinner
+	defer p.Unpin()
+	p.Pin(&pcs[0])
+	// Get a 16-byte, 16-byte-aligned chunk of pcs so the hexdump is simple.
+	start := uintptr(unsafe.Pointer(&pcs[0]))
+	start = (start + 15) &^ uintptr(15)
+
+	// Do the hex dump.
+	got := runtime.HexdumpWords(start, 16)
+
+	// Construct the expected output.
+	pcStr := fmt.Sprintf("%016x", pc)
+	pcStr = pcStr[:8] + " " + pcStr[8:] // Add middle space
+	ascii := make([]byte, 8)
+	for i := range ascii {
+		b := byte(pc >> (8 * i))
+		if b >= ' ' && b <= '~' {
+			ascii[i] = b
+		} else {
+			ascii[i] = '.'
+ } + } + want := fmt.Sprintf(` + 7 6 5 4 3 2 1 0 f e d c b a 9 8 0123456789abcdef +%016x: %s %s %s%s + ^ + ^ +`, start, pcStr, pcStr, ascii, ascii) + want = strings.TrimPrefix(want, "\n") + + if got != want { + t.Errorf("got\n%s\nwant\n%s", got, want) + } +} diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go index c9234c5084..714b9a51df 100644 --- a/src/runtime/mgcmark.go +++ b/src/runtime/mgcmark.go @@ -1524,29 +1524,32 @@ func scanConservative(b, n uintptr, ptrmask *uint8, gcw *gcWork, state *stackSca if debugScanConservative { printlock() print("conservatively scanning [", hex(b), ",", hex(b+n), ")\n") - hexdumpWords(b, b+n, func(p uintptr) byte { + hexdumpWords(b, n, func(p uintptr, m hexdumpMarker) { if ptrmask != nil { word := (p - b) / goarch.PtrSize bits := *addb(ptrmask, word/8) if (bits>>(word%8))&1 == 0 { - return '$' + return } } val := *(*uintptr)(unsafe.Pointer(p)) if state != nil && state.stack.lo <= val && val < state.stack.hi { - return '@' + m.start() + println("ptr to stack") + return } span := spanOfHeap(val) if span == nil { - return ' ' + return } idx := span.objIndex(val) if span.isFreeOrNewlyAllocated(idx) { - return ' ' + return } - return '*' + m.start() + println("ptr to heap") }) printunlock() } diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go index c3d6afb90a..4eecb1cfd9 100644 --- a/src/runtime/mgcsweep.go +++ b/src/runtime/mgcsweep.go @@ -885,7 +885,7 @@ func (s *mspan) reportZombies() { if length > 1024 { length = 1024 } - hexdumpWords(addr, addr+length, nil) + hexdumpWords(addr, length, nil) } mbits.advance() abits.advance() diff --git a/src/runtime/print.go b/src/runtime/print.go index c01db9d7f9..d2733fb266 100644 --- a/src/runtime/print.go +++ b/src/runtime/print.go @@ -5,7 +5,6 @@ package runtime import ( - "internal/goarch" "internal/strconv" "unsafe" ) @@ -212,43 +211,3 @@ func printeface(e eface) { func printiface(i iface) { print("(", i.tab, ",", i.data, ")") } - -// hexdumpWords prints a word-oriented hex dump of [p, end). -// -// If mark != nil, it will be called with each printed word's address -// and should return a character mark to appear just before that -// word's value. It can return 0 to indicate no mark. -func hexdumpWords(p, end uintptr, mark func(uintptr) byte) { - printlock() - var markbuf [1]byte - markbuf[0] = ' ' - minhexdigits = int(unsafe.Sizeof(uintptr(0)) * 2) - for i := uintptr(0); p+i < end; i += goarch.PtrSize { - if i%16 == 0 { - if i != 0 { - println() - } - print(hex(p+i), ": ") - } - - if mark != nil { - markbuf[0] = mark(p + i) - if markbuf[0] == 0 { - markbuf[0] = ' ' - } - } - gwrite(markbuf[:]) - val := *(*uintptr)(unsafe.Pointer(p + i)) - print(hex(val)) - print(" ") - - // Can we symbolize val? - fn := findfunc(val) - if fn.valid() { - print("<", funcname(fn), "+", hex(val-fn.entry()), "> ") - } - } - minhexdigits = 0 - println() - printunlock() -} diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go index 6649f72471..74aaeba876 100644 --- a/src/runtime/traceback.go +++ b/src/runtime/traceback.go @@ -1366,16 +1366,19 @@ func tracebackHexdump(stk stack, frame *stkframe, bad uintptr) { // Print the hex dump. print("stack: frame={sp:", hex(frame.sp), ", fp:", hex(frame.fp), "} stack=[", hex(stk.lo), ",", hex(stk.hi), ")\n") - hexdumpWords(lo, hi, func(p uintptr) byte { - switch p { - case frame.fp: - return '>' - case frame.sp: - return '<' - case bad: - return '!' 
+ hexdumpWords(lo, hi-lo, func(p uintptr, m hexdumpMarker) { + if p == frame.fp { + m.start() + println("FP") + } + if p == frame.sp { + m.start() + println("SP") + } + if p == bad { + m.start() + println("bad") } - return 0 }) } -- cgit v1.3-5-g9baa
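A quick illustration of the new annotation protocol (a minimal sketch, not
part of the patches above): instead of returning a one-character mark per
word as the old hexdumpWords callback did, a caller now receives a
hexdumpMarker and, for any word it wants to flag, calls m.start() -- which
pads to that word's column and prints "^ " -- and then prints a one-line
annotation. Here `interesting` is a hypothetical predicate standing in for
whatever condition a real runtime-internal caller would test:

	// Dump n bytes starting at base, annotating selected words.
	hexdumpWords(base, n, func(p uintptr, m hexdumpMarker) {
		val := *(*uintptr)(unsafe.Pointer(p))
		if !interesting(val) { // hypothetical condition
			return // print no annotation under this word
		}
		m.start()              // pad to this word's column, emit "^ "
		println("interesting") // annotation text; println ends the line
	})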