From fecfcaa4f68a220f47e2c7c8b65d55906dbf8d46 Mon Sep 17 00:00:00 2001 From: thepudds Date: Tue, 4 Nov 2025 09:33:17 -0500 Subject: runtime: add runtime.freegc to reduce GC work MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This CL is part of a set of CLs that attempt to reduce how much work the GC must do. See the design in https://go.dev/design/74299-runtime-freegc This CL adds runtime.freegc: func freegc(ptr unsafe.Pointer, size uintptr, noscan bool) Memory freed via runtime.freegc is made immediately reusable for the next allocation in the same size class, without waiting for a GC cycle, and hence can dramatically reduce pressure on the GC. A sample microbenchmark included below shows strings.Builder operating roughly 2x faster. An experimental modification to reflect to use runtime.freegc and then using that reflect with json/v2 gave reported memory allocation reductions of -43.7%, -32.9%, -21.9%, -22.0%, -1.0% for the 5 official real-world unmarshalling benchmarks from go-json-experiment/jsonbench by the authors of json/v2, covering the CanadaGeometry through TwitterStatus datasets. Note: there is no intent to modify the standard library to have explicit calls to runtime.freegc, and of course such an ability would never be exposed to end-user code. Later CLs in this stack teach the compiler how to automatically insert runtime.freegc calls when it can prove it is safe to do so. (The reflect modification and other experimental changes to the standard library were just that -- experiments. It was very helpful while initially developing runtime.freegc to see more complex uses and closer-to-real-world benchmark results prior to updating the compiler.) This CL only addresses noscan span classes (heap objects without pointers), such as the backing memory for a []byte or string. 
If we update strings.Builder to explicitly call runtime.freegc on its internal buf after a resize operation (but without freeing the usually final incarnation of buf that will be returned to the user as a string), we can see some nice benchmark results on the existing strings benchmarks that call Builder.Write N times and then call Builder.String. Here, the (uncommon) case of a single Builder.Write is not helped (given it never resizes after first alloc if there is only one Write), but the impact grows such that it is up to ~2x faster as there are more resize operations due to more strings.Builder.Write calls: │ disabled.out │ new-free-20.txt │ │ sec/op │ sec/op vs base │ BuildString_Builder/1Write_36Bytes_NoGrow-4 55.82n ± 2% 55.86n ± 2% ~ (p=0.794 n=20) BuildString_Builder/2Write_36Bytes_NoGrow-4 125.2n ± 2% 115.4n ± 1% -7.86% (p=0.000 n=20) BuildString_Builder/3Write_36Bytes_NoGrow-4 224.0n ± 1% 188.2n ± 2% -16.00% (p=0.000 n=20) BuildString_Builder/5Write_36Bytes_NoGrow-4 239.1n ± 9% 205.1n ± 1% -14.20% (p=0.000 n=20) BuildString_Builder/8Write_36Bytes_NoGrow-4 422.8n ± 3% 325.4n ± 1% -23.04% (p=0.000 n=20) BuildString_Builder/10Write_36Bytes_NoGrow-4 436.9n ± 2% 342.3n ± 1% -21.64% (p=0.000 n=20) BuildString_Builder/100Write_36Bytes_NoGrow-4 4.403µ ± 1% 2.381µ ± 2% -45.91% (p=0.000 n=20) BuildString_Builder/1000Write_36Bytes_NoGrow-4 48.28µ ± 2% 21.38µ ± 2% -55.71% (p=0.000 n=20) See the design document for more discussion of the strings.Builder case. For testing, we add tests that attempt to exercise different aspects of the underlying freegc and mallocgc behavior on the reuse path. Validating the assist credit manipulations turned out to be subtle, so a test for that is added in the next CL. There are also invariant checks added, controlled by consts (primarily the doubleCheckReusable const currently). 
This CL also adds support in runtime.freegc for GODEBUG=clobberfree=1 to immediately overwrite freed memory with 0xdeadbeef, which can help a higher-level test fail faster in the event of a bug, and also the GC specifically looks for that pattern and throws a fatal error if it unexpectedly finds it. A later CL (currently experimental) adds GODEBUG=clobberfree=2, which uses mprotect (or VirtualProtect on Windows) to set freed memory to fault if read or written, until the runtime later unprotects the memory on the mallocgc reuse path. For the cases where a normal allocation is happening without any reuse, some initial microbenchmarks suggest the impact of these changes could be small to negligible (at least with GOAMD64=v3): goos: linux goarch: amd64 pkg: runtime cpu: AMD EPYC 7B13 │ base-512M-v3.bench │ ps16-512M-goamd64-v3.bench │ │ sec/op │ sec/op vs base │ Malloc8-16 11.01n ± 1% 10.94n ± 1% -0.68% (p=0.038 n=20) Malloc16-16 17.15n ± 1% 17.05n ± 0% -0.55% (p=0.007 n=20) Malloc32-16 18.65n ± 1% 18.42n ± 0% -1.26% (p=0.000 n=20) MallocTypeInfo8-16 18.63n ± 0% 18.36n ± 0% -1.45% (p=0.000 n=20) MallocTypeInfo16-16 22.32n ± 0% 22.65n ± 0% +1.50% (p=0.000 n=20) MallocTypeInfo32-16 23.37n ± 0% 23.89n ± 0% +2.23% (p=0.000 n=20) geomean 18.02n 18.01n -0.05% These last benchmark results include the runtime updates to support span classes with pointers (which was originally part of this CL, but later split out for ease of review). 
Updates #74299 Change-Id: Icceaa0f79f85c70cd1a718f9a4e7f0cf3d77803c Reviewed-on: https://go-review.googlesource.com/c/go/+/673695 LUCI-TryBot-Result: Go LUCI Reviewed-by: Michael Knyszek Reviewed-by: Junyang Shao --- src/runtime/malloc_test.go | 286 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 286 insertions(+) (limited to 'src/runtime/malloc_test.go') diff --git a/src/runtime/malloc_test.go b/src/runtime/malloc_test.go index bf58947bbc..6285cdaff7 100644 --- a/src/runtime/malloc_test.go +++ b/src/runtime/malloc_test.go @@ -16,6 +16,7 @@ import ( "runtime" . "runtime" "strings" + "sync" "sync/atomic" "testing" "time" @@ -234,6 +235,275 @@ func TestTinyAllocIssue37262(t *testing.T) { runtime.Releasem() } +// TestFreegc does basic testing of explicit frees. +func TestFreegc(t *testing.T) { + tests := []struct { + size string + f func(noscan bool) func(*testing.T) + noscan bool + }{ + // Types without pointers. + {"size=16", testFreegc[[16]byte], true}, // smallest we support currently + {"size=17", testFreegc[[17]byte], true}, + {"size=64", testFreegc[[64]byte], true}, + {"size=500", testFreegc[[500]byte], true}, + {"size=512", testFreegc[[512]byte], true}, + {"size=4096", testFreegc[[4096]byte], true}, + {"size=32KiB-8", testFreegc[[1<<15 - 8]byte], true}, // max noscan small object for 64-bit + } + + // Run the tests twice if not in -short mode or not otherwise saving test time. + // First while manually calling runtime.GC to slightly increase isolation (perhaps making + // problems more reproducible). + for _, tt := range tests { + runtime.GC() + t.Run(fmt.Sprintf("gc=yes/ptrs=%v/%s", !tt.noscan, tt.size), tt.f(tt.noscan)) + } + runtime.GC() + + if testing.Short() || !RuntimeFreegcEnabled || runtime.Raceenabled { + return + } + + // Again, but without manually calling runtime.GC in the loop (perhaps less isolation might + // trigger problems). 
+ for _, tt := range tests { + t.Run(fmt.Sprintf("gc=no/ptrs=%v/%s", !tt.noscan, tt.size), tt.f(tt.noscan)) + } + runtime.GC() +} + +func testFreegc[T comparable](noscan bool) func(*testing.T) { + // We use stressMultiple to influence the duration of the tests. + // When testing freegc changes, stressMultiple can be increased locally + // to test longer or in some cases with more goroutines. + // It can also be helpful to test with GODEBUG=clobberfree=1 and + // with and without doubleCheckMalloc and doubleCheckReusable enabled. + stressMultiple := 10 + if testing.Short() || !RuntimeFreegcEnabled || runtime.Raceenabled { + stressMultiple = 1 + } + + return func(t *testing.T) { + alloc := func() *T { + // Force heap alloc, plus some light validation of zeroed memory. + t.Helper() + p := Escape(new(T)) + var zero T + if *p != zero { + t.Fatalf("allocator returned non-zero memory: %v", *p) + } + return p + } + + free := func(p *T) { + t.Helper() + var zero T + if *p != zero { + t.Fatalf("found non-zero memory before freeing (tests do not modify memory): %v", *p) + } + runtime.Freegc(unsafe.Pointer(p), unsafe.Sizeof(*p), noscan) + } + + t.Run("basic-free", func(t *testing.T) { + // Test that freeing a live heap object doesn't crash. + for range 100 { + p := alloc() + free(p) + } + }) + + t.Run("stack-free", func(t *testing.T) { + // Test that freeing a stack object doesn't crash. + for range 100 { + var x [32]byte + var y [32]*int + runtime.Freegc(unsafe.Pointer(&x), unsafe.Sizeof(x), true) // noscan + runtime.Freegc(unsafe.Pointer(&y), unsafe.Sizeof(y), false) // !noscan + } + }) + + // Check our allocations. These tests rely on the + // current implementation treating a re-used object + // as not adding to the allocation counts seen + // by testing.AllocsPerRun. (This is not the desired + // long-term behavior, but it is the current behavior and + // makes these tests convenient). 
+ + t.Run("allocs-baseline", func(t *testing.T) { + // Baseline result without any explicit free. + allocs := testing.AllocsPerRun(100, func() { + for range 100 { + p := alloc() + _ = p + } + }) + if allocs < 100 { + // TODO(thepudds): we get exactly 100 for almost all the tests, but investigate why + // ~101 allocs for TestFreegc/ptrs=true/size=32KiB-8. + t.Fatalf("expected >=100 allocations, got %v", allocs) + } + }) + + t.Run("allocs-with-free", func(t *testing.T) { + // Same allocations, but now using explicit free so that + // no allocs get reported. (Again, not the desired long-term behavior). + if SizeSpecializedMallocEnabled { + t.Skip("temporarily skipping alloc tests for GOEXPERIMENT=sizespecializedmalloc") + } + if !RuntimeFreegcEnabled { + t.Skip("skipping alloc tests with runtime.freegc disabled") + } + allocs := testing.AllocsPerRun(100, func() { + for range 100 { + p := alloc() + free(p) + } + }) + if allocs != 0 { + t.Fatalf("expected 0 allocations, got %v", allocs) + } + }) + + t.Run("free-multiple", func(t *testing.T) { + // Multiple allocations outstanding before explicitly freeing, + // but still within the limit of our smallest free list size + // so that no allocs are reported. (Again, not long-term behavior). + if SizeSpecializedMallocEnabled { + t.Skip("temporarily skipping alloc tests for GOEXPERIMENT=sizespecializedmalloc") + } + if !RuntimeFreegcEnabled { + t.Skip("skipping alloc tests with runtime.freegc disabled") + } + const maxOutstanding = 20 + s := make([]*T, 0, maxOutstanding) + allocs := testing.AllocsPerRun(100*stressMultiple, func() { + s = s[:0] + for range maxOutstanding { + p := alloc() + s = append(s, p) + } + for _, p := range s { + free(p) + } + }) + if allocs != 0 { + t.Fatalf("expected 0 allocations, got %v", allocs) + } + }) + + if runtime.GOARCH == "wasm" { + // TODO(thepudds): for wasm, double-check if just slow, vs. some test logic problem, + // vs. something else. 
It might have been wasm was slowest with tests that spawn + // many goroutines, which might be expected for wasm. This skip might no longer be + // needed now that we have tuned test execution time more, or perhaps wasm should just + // always run in short mode, which might also let us remove this skip. + t.Skip("skipping remaining freegc tests, was timing out on wasm") + } + + t.Run("free-many", func(t *testing.T) { + // Confirm we are graceful if we have more freed elements at once + // than the max free list size. + s := make([]*T, 0, 1000) + iterations := stressMultiple * stressMultiple // currently 1 or 100 depending on -short + for range iterations { + s = s[:0] + for range 1000 { + p := alloc() + s = append(s, p) + } + for _, p := range s { + free(p) + } + } + }) + + t.Run("duplicate-check", func(t *testing.T) { + // A simple duplicate allocation test. We track what should be the set + // of live pointers in a map across a series of allocs and frees, + // and fail if a live pointer value is returned by an allocation. + // TODO: maybe add randomness? allow more live pointers? do across goroutines? + live := make(map[uintptr]bool) + for i := range 100 * stressMultiple { + var s []*T + // Alloc 10 times, tracking the live pointer values. + for j := range 10 { + p := alloc() + uptr := uintptr(unsafe.Pointer(p)) + if live[uptr] { + t.Fatalf("TestFreeLive: found duplicate pointer (0x%x). i: %d j: %d", uptr, i, j) + } + live[uptr] = true + s = append(s, p) + } + // Explicitly free those pointers, removing them from the live map. + for k := range s { + p := s[k] + s[k] = nil + uptr := uintptr(unsafe.Pointer(p)) + free(p) + delete(live, uptr) + } + } + }) + + t.Run("free-other-goroutine", func(t *testing.T) { + // Use explicit free, but the free happens on a different goroutine than the alloc. + // This also lightly simulates how the free code sees P migration or flushing + // the mcache, assuming we have > 1 P. (Not using testing.AllocsPerRun here). 
+ iterations := 10 * stressMultiple * stressMultiple // currently 10 or 1000 depending on -short + for _, capacity := range []int{2} { + for range iterations { + ch := make(chan *T, capacity) + var wg sync.WaitGroup + for range 2 { + wg.Add(1) + go func() { + defer wg.Done() + for p := range ch { + free(p) + } + }() + } + for range 100 { + p := alloc() + ch <- p + } + close(ch) + wg.Wait() + } + } + }) + + t.Run("many-goroutines", func(t *testing.T) { + // Allocate across multiple goroutines, freeing on the same goroutine. + // TODO: probably remove the duplicate checking here; not that useful. + counts := []int{1, 2, 4, 8, 10 * stressMultiple} + for _, goroutines := range counts { + var wg sync.WaitGroup + for range goroutines { + wg.Add(1) + go func() { + defer wg.Done() + live := make(map[uintptr]bool) + for range 100 * stressMultiple { + p := alloc() + uptr := uintptr(unsafe.Pointer(p)) + if live[uptr] { + panic("TestFreeLive: found duplicate pointer") + } + live[uptr] = true + free(p) + delete(live, uptr) + } + }() + } + wg.Wait() + } + }) + } +} + func TestPageCacheLeak(t *testing.T) { defer GOMAXPROCS(GOMAXPROCS(1)) leaked := PageCachePagesLeaked() @@ -337,6 +607,13 @@ func BenchmarkMalloc16(b *testing.B) { } } +func BenchmarkMalloc32(b *testing.B) { + for i := 0; i < b.N; i++ { + p := new([4]int64) + Escape(p) + } +} + func BenchmarkMallocTypeInfo8(b *testing.B) { for i := 0; i < b.N; i++ { p := new(struct { @@ -355,6 +632,15 @@ func BenchmarkMallocTypeInfo16(b *testing.B) { } } +func BenchmarkMallocTypeInfo32(b *testing.B) { + for i := 0; i < b.N; i++ { + p := new(struct { + p [32 / unsafe.Sizeof(uintptr(0))]*int + }) + Escape(p) + } +} + type LargeStruct struct { x [16][]byte } -- cgit v1.3