From fecfcaa4f68a220f47e2c7c8b65d55906dbf8d46 Mon Sep 17 00:00:00 2001 From: thepudds Date: Tue, 4 Nov 2025 09:33:17 -0500 Subject: runtime: add runtime.freegc to reduce GC work MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This CL is part of a set of CLs that attempt to reduce how much work the GC must do. See the design in https://go.dev/design/74299-runtime-freegc This CL adds runtime.freegc: func freegc(ptr unsafe.Pointer, size uintptr, noscan bool) Memory freed via runtime.freegc is made immediately reusable for the next allocation in the same size class, without waiting for a GC cycle, and hence can dramatically reduce pressure on the GC. A sample microbenchmark included below shows strings.Builder operating roughly 2x faster. An experimental modification to reflect to use runtime.freegc and then using that reflect with json/v2 gave reported memory allocation reductions of -43.7%, -32.9%, -21.9%, -22.0%, -1.0% for the 5 official real-world unmarshalling benchmarks from go-json-experiment/jsonbench by the authors of json/v2, covering the CanadaGeometry through TwitterStatus datasets. Note: there is no intent to modify the standard library to have explicit calls to runtime.freegc, and of course such an ability would never be exposed to end-user code. Later CLs in this stack teach the compiler how to automatically insert runtime.freegc calls when it can prove it is safe to do so. (The reflect modification and other experimental changes to the standard library were just that -- experiments. It was very helpful while initially developing runtime.freegc to see more complex uses and closer-to-real-world benchmark results prior to updating the compiler.) This CL only addresses noscan span classes (heap objects without pointers), such as the backing memory for a []byte or string. 
If we update strings.Builder to explicitly call runtime.freegc on its internal buf after a resize operation (but without freeing the usually final incarnation of buf that will be returned to the user as a string), we can see some nice benchmark results on the existing strings benchmarks that call Builder.Write N times and then call Builder.String. Here, the (uncommon) case of a single Builder.Write is not helped (given it never resizes after first alloc if there is only one Write), but the impact grows such that it is up to ~2x faster as there are more resize operations due to more strings.Builder.Write calls: │ disabled.out │ new-free-20.txt │ │ sec/op │ sec/op vs base │ BuildString_Builder/1Write_36Bytes_NoGrow-4 55.82n ± 2% 55.86n ± 2% ~ (p=0.794 n=20) BuildString_Builder/2Write_36Bytes_NoGrow-4 125.2n ± 2% 115.4n ± 1% -7.86% (p=0.000 n=20) BuildString_Builder/3Write_36Bytes_NoGrow-4 224.0n ± 1% 188.2n ± 2% -16.00% (p=0.000 n=20) BuildString_Builder/5Write_36Bytes_NoGrow-4 239.1n ± 9% 205.1n ± 1% -14.20% (p=0.000 n=20) BuildString_Builder/8Write_36Bytes_NoGrow-4 422.8n ± 3% 325.4n ± 1% -23.04% (p=0.000 n=20) BuildString_Builder/10Write_36Bytes_NoGrow-4 436.9n ± 2% 342.3n ± 1% -21.64% (p=0.000 n=20) BuildString_Builder/100Write_36Bytes_NoGrow-4 4.403µ ± 1% 2.381µ ± 2% -45.91% (p=0.000 n=20) BuildString_Builder/1000Write_36Bytes_NoGrow-4 48.28µ ± 2% 21.38µ ± 2% -55.71% (p=0.000 n=20) See the design document for more discussion of the strings.Builder case. For testing, we add tests that attempt to exercise different aspects of the underlying freegc and mallocgc behavior on the reuse path. Validating the assist credit manipulations turned out to be subtle, so a test for that is added in the next CL. There are also invariant checks added, controlled by consts (primarily the doubleCheckReusable const currently). 
This CL also adds support in runtime.freegc for GODEBUG=clobberfree=1 to immediately overwrite freed memory with 0xdeadbeef, which can help a higher-level test fail faster in the event of a bug, and also the GC specifically looks for that pattern and throws a fatal error if it unexpectedly finds it. A later CL (currently experimental) adds GODEBUG=clobberfree=2, which uses mprotect (or VirtualProtect on Windows) to set freed memory to fault if read or written, until the runtime later unprotects the memory on the mallocgc reuse path. For the cases where a normal allocation is happening without any reuse, some initial microbenchmarks suggest the impact of these changes could be small to negligible (at least with GOAMD64=v3): goos: linux goarch: amd64 pkg: runtime cpu: AMD EPYC 7B13 │ base-512M-v3.bench │ ps16-512M-goamd64-v3.bench │ │ sec/op │ sec/op vs base │ Malloc8-16 11.01n ± 1% 10.94n ± 1% -0.68% (p=0.038 n=20) Malloc16-16 17.15n ± 1% 17.05n ± 0% -0.55% (p=0.007 n=20) Malloc32-16 18.65n ± 1% 18.42n ± 0% -1.26% (p=0.000 n=20) MallocTypeInfo8-16 18.63n ± 0% 18.36n ± 0% -1.45% (p=0.000 n=20) MallocTypeInfo16-16 22.32n ± 0% 22.65n ± 0% +1.50% (p=0.000 n=20) MallocTypeInfo32-16 23.37n ± 0% 23.89n ± 0% +2.23% (p=0.000 n=20) geomean 18.02n 18.01n -0.05% These last benchmark results include the runtime updates to support span classes with pointers (which was originally part of this CL, but later split out for ease of review). 
Updates #74299 Change-Id: Icceaa0f79f85c70cd1a718f9a4e7f0cf3d77803c Reviewed-on: https://go-review.googlesource.com/c/go/+/673695 LUCI-TryBot-Result: Go LUCI Reviewed-by: Michael Knyszek Reviewed-by: Junyang Shao --- src/runtime/malloc_test.go | 286 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 286 insertions(+) (limited to 'src/runtime/malloc_test.go') diff --git a/src/runtime/malloc_test.go b/src/runtime/malloc_test.go index bf58947bbc..6285cdaff7 100644 --- a/src/runtime/malloc_test.go +++ b/src/runtime/malloc_test.go @@ -16,6 +16,7 @@ import ( "runtime" . "runtime" "strings" + "sync" "sync/atomic" "testing" "time" @@ -234,6 +235,275 @@ func TestTinyAllocIssue37262(t *testing.T) { runtime.Releasem() } +// TestFreegc does basic testing of explicit frees. +func TestFreegc(t *testing.T) { + tests := []struct { + size string + f func(noscan bool) func(*testing.T) + noscan bool + }{ + // Types without pointers. + {"size=16", testFreegc[[16]byte], true}, // smallest we support currently + {"size=17", testFreegc[[17]byte], true}, + {"size=64", testFreegc[[64]byte], true}, + {"size=500", testFreegc[[500]byte], true}, + {"size=512", testFreegc[[512]byte], true}, + {"size=4096", testFreegc[[4096]byte], true}, + {"size=32KiB-8", testFreegc[[1<<15 - 8]byte], true}, // max noscan small object for 64-bit + } + + // Run the tests twice if not in -short mode or not otherwise saving test time. + // First while manually calling runtime.GC to slightly increase isolation (perhaps making + // problems more reproducible). + for _, tt := range tests { + runtime.GC() + t.Run(fmt.Sprintf("gc=yes/ptrs=%v/%s", !tt.noscan, tt.size), tt.f(tt.noscan)) + } + runtime.GC() + + if testing.Short() || !RuntimeFreegcEnabled || runtime.Raceenabled { + return + } + + // Again, but without manually calling runtime.GC in the loop (perhaps less isolation might + // trigger problems). 
+ for _, tt := range tests { + t.Run(fmt.Sprintf("gc=no/ptrs=%v/%s", !tt.noscan, tt.size), tt.f(tt.noscan)) + } + runtime.GC() +} + +func testFreegc[T comparable](noscan bool) func(*testing.T) { + // We use stressMultiple to influence the duration of the tests. + // When testing freegc changes, stressMultiple can be increased locally + // to test longer or in some cases with more goroutines. + // It can also be helpful to test with GODEBUG=clobberfree=1 and + // with and without doubleCheckMalloc and doubleCheckReusable enabled. + stressMultiple := 10 + if testing.Short() || !RuntimeFreegcEnabled || runtime.Raceenabled { + stressMultiple = 1 + } + + return func(t *testing.T) { + alloc := func() *T { + // Force heap alloc, plus some light validation of zeroed memory. + t.Helper() + p := Escape(new(T)) + var zero T + if *p != zero { + t.Fatalf("allocator returned non-zero memory: %v", *p) + } + return p + } + + free := func(p *T) { + t.Helper() + var zero T + if *p != zero { + t.Fatalf("found non-zero memory before freeing (tests do not modify memory): %v", *p) + } + runtime.Freegc(unsafe.Pointer(p), unsafe.Sizeof(*p), noscan) + } + + t.Run("basic-free", func(t *testing.T) { + // Test that freeing a live heap object doesn't crash. + for range 100 { + p := alloc() + free(p) + } + }) + + t.Run("stack-free", func(t *testing.T) { + // Test that freeing a stack object doesn't crash. + for range 100 { + var x [32]byte + var y [32]*int + runtime.Freegc(unsafe.Pointer(&x), unsafe.Sizeof(x), true) // noscan + runtime.Freegc(unsafe.Pointer(&y), unsafe.Sizeof(y), false) // !noscan + } + }) + + // Check our allocations. These tests rely on the + // current implementation treating a re-used object + // as not adding to the allocation counts seen + // by testing.AllocsPerRun. (This is not the desired + // long-term behavior, but it is the current behavior and + // makes these tests convenient). 
+ + t.Run("allocs-baseline", func(t *testing.T) { + // Baseline result without any explicit free. + allocs := testing.AllocsPerRun(100, func() { + for range 100 { + p := alloc() + _ = p + } + }) + if allocs < 100 { + // TODO(thepudds): we get exactly 100 for almost all the tests, but investigate why + // ~101 allocs for TestFreegc/ptrs=true/size=32KiB-8. + t.Fatalf("expected >=100 allocations, got %v", allocs) + } + }) + + t.Run("allocs-with-free", func(t *testing.T) { + // Same allocations, but now using explicit free so that + // no allocs get reported. (Again, not the desired long-term behavior). + if SizeSpecializedMallocEnabled { + t.Skip("temporarily skipping alloc tests for GOEXPERIMENT=sizespecializedmalloc") + } + if !RuntimeFreegcEnabled { + t.Skip("skipping alloc tests with runtime.freegc disabled") + } + allocs := testing.AllocsPerRun(100, func() { + for range 100 { + p := alloc() + free(p) + } + }) + if allocs != 0 { + t.Fatalf("expected 0 allocations, got %v", allocs) + } + }) + + t.Run("free-multiple", func(t *testing.T) { + // Multiple allocations outstanding before explicitly freeing, + // but still within the limit of our smallest free list size + // so that no allocs are reported. (Again, not long-term behavior). + if SizeSpecializedMallocEnabled { + t.Skip("temporarily skipping alloc tests for GOEXPERIMENT=sizespecializedmalloc") + } + if !RuntimeFreegcEnabled { + t.Skip("skipping alloc tests with runtime.freegc disabled") + } + const maxOutstanding = 20 + s := make([]*T, 0, maxOutstanding) + allocs := testing.AllocsPerRun(100*stressMultiple, func() { + s = s[:0] + for range maxOutstanding { + p := alloc() + s = append(s, p) + } + for _, p := range s { + free(p) + } + }) + if allocs != 0 { + t.Fatalf("expected 0 allocations, got %v", allocs) + } + }) + + if runtime.GOARCH == "wasm" { + // TODO(thepudds): for wasm, double-check if just slow, vs. some test logic problem, + // vs. something else. 
It might have been wasm was slowest with tests that spawn + // many goroutines, which might be expected for wasm. This skip might no longer be + // needed now that we have tuned test execution time more, or perhaps wasm should just + // always run in short mode, which might also let us remove this skip. + t.Skip("skipping remaining freegc tests, was timing out on wasm") + } + + t.Run("free-many", func(t *testing.T) { + // Confirm we are graceful if we have more freed elements at once + // than the max free list size. + s := make([]*T, 0, 1000) + iterations := stressMultiple * stressMultiple // currently 1 or 100 depending on -short + for range iterations { + s = s[:0] + for range 1000 { + p := alloc() + s = append(s, p) + } + for _, p := range s { + free(p) + } + } + }) + + t.Run("duplicate-check", func(t *testing.T) { + // A simple duplicate allocation test. We track what should be the set + // of live pointers in a map across a series of allocs and frees, + // and fail if a live pointer value is returned by an allocation. + // TODO: maybe add randomness? allow more live pointers? do across goroutines? + live := make(map[uintptr]bool) + for i := range 100 * stressMultiple { + var s []*T + // Alloc 10 times, tracking the live pointer values. + for j := range 10 { + p := alloc() + uptr := uintptr(unsafe.Pointer(p)) + if live[uptr] { + t.Fatalf("TestFreeLive: found duplicate pointer (0x%x). i: %d j: %d", uptr, i, j) + } + live[uptr] = true + s = append(s, p) + } + // Explicitly free those pointers, removing them from the live map. + for k := range s { + p := s[k] + s[k] = nil + uptr := uintptr(unsafe.Pointer(p)) + free(p) + delete(live, uptr) + } + } + }) + + t.Run("free-other-goroutine", func(t *testing.T) { + // Use explicit free, but the free happens on a different goroutine than the alloc. + // This also lightly simulates how the free code sees P migration or flushing + // the mcache, assuming we have > 1 P. (Not using testing.AllocsPerRun here). 
+ iterations := 10 * stressMultiple * stressMultiple // currently 10 or 1000 depending on -short + for _, capacity := range []int{2} { + for range iterations { + ch := make(chan *T, capacity) + var wg sync.WaitGroup + for range 2 { + wg.Add(1) + go func() { + defer wg.Done() + for p := range ch { + free(p) + } + }() + } + for range 100 { + p := alloc() + ch <- p + } + close(ch) + wg.Wait() + } + } + }) + + t.Run("many-goroutines", func(t *testing.T) { + // Allocate across multiple goroutines, freeing on the same goroutine. + // TODO: probably remove the duplicate checking here; not that useful. + counts := []int{1, 2, 4, 8, 10 * stressMultiple} + for _, goroutines := range counts { + var wg sync.WaitGroup + for range goroutines { + wg.Add(1) + go func() { + defer wg.Done() + live := make(map[uintptr]bool) + for range 100 * stressMultiple { + p := alloc() + uptr := uintptr(unsafe.Pointer(p)) + if live[uptr] { + panic("TestFreeLive: found duplicate pointer") + } + live[uptr] = true + free(p) + delete(live, uptr) + } + }() + } + wg.Wait() + } + }) + } +} + func TestPageCacheLeak(t *testing.T) { defer GOMAXPROCS(GOMAXPROCS(1)) leaked := PageCachePagesLeaked() @@ -337,6 +607,13 @@ func BenchmarkMalloc16(b *testing.B) { } } +func BenchmarkMalloc32(b *testing.B) { + for i := 0; i < b.N; i++ { + p := new([4]int64) + Escape(p) + } +} + func BenchmarkMallocTypeInfo8(b *testing.B) { for i := 0; i < b.N; i++ { p := new(struct { @@ -355,6 +632,15 @@ func BenchmarkMallocTypeInfo16(b *testing.B) { } } +func BenchmarkMallocTypeInfo32(b *testing.B) { + for i := 0; i < b.N; i++ { + p := new(struct { + p [32 / unsafe.Sizeof(uintptr(0))]*int + }) + Escape(p) + } +} + type LargeStruct struct { x [16][]byte } -- cgit v1.3