cmd/compile: stack allocate backing stores during append

We can already stack allocate the backing store during append if the resulting backing store doesn't escape. See CL 664299. This CL enables us to often stack allocate the backing store during append *even if* the result escapes. Typically, for code like: func f(n int) []int { var r []int for i := range n { r = append(r, i) } return r } the backing store for r escapes, but only by returning it. Could we operate with r on the stack for most of its lifeime, and only move it to the heap at the return point? The current implementation of append will need to do an allocation each time it calls growslice. This will happen on the 1st, 2nd, 4th, 8th, etc. append calls. The allocations done by all but the last growslice call will then immediately be garbage. We'd like to avoid doing some of those intermediate allocations if possible. We rewrite the above code by introducing a move2heap operation: func f(n int) []int { var r []int for i := range n { r = append(r, i) } r = move2heap(r) return r } Using the move2heap runtime function, which does: move2heap(r): If r is already backed by heap storage, return r. Otherwise, copy r to the heap and return the copy. Now we can treat the backing store of r allocated at the append site as not escaping. Previous stack allocation optimizations now apply, which can use a fixed-size stack-allocated backing store for r when appending. See the description in cmd/compile/internal/slice/slice.go for how we ensure that this optimization is safe. Change-Id: I81f36e58bade2241d07f67967d8d547fff5302b8 Reviewed-on: https://go-review.googlesource.com/c/go/+/707755 Reviewed-by: Keith Randall <khr@google.com> Reviewed-by: David Chase <drchase@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
author: khr@golang.org <khr@golang.org> 2025-09-12 14:43:19 -0700
committer: Keith Randall <khr@golang.org> 2025-11-20 09:19:39 -0800
commit: 32f5aadd2ffc60421c62b185fa7668012fb5e73e (patch)
tree: 98720009a4a90ef893980660e2152532fcaf609e /src/runtime/slice_test.go
parent: a18aff805706bfdaeb9aca042111fae32f9f8b61 (diff)
download: go-32f5aadd2ffc60421c62b185fa7668012fb5e73e.tar.xz
1 files changed, 319 insertions, 0 deletions
diff --git a/src/runtime/slice_test.go b/src/runtime/slice_test.go
index cd2bc26d1e..5463b6c02f 100644
--- a/src/runtime/slice_test.go
+++ b/src/runtime/slice_test.go
@@ -6,6 +6,9 @@ package runtime_test
 
 import (
 	"fmt"
+	"internal/race"
+	"internal/testenv"
+	"runtime"
 	"testing"
 )
 
@@ -499,3 +502,319 @@ func BenchmarkAppendInPlace(b *testing.B) {
 
 	})
 }
+
+//go:noinline
+func byteSlice(n int) []byte {
+	var r []byte
+	for i := range n {
+		r = append(r, byte(i))
+	}
+	return r
+}
+func TestAppendByteInLoop(t *testing.T) {
+	testenv.SkipIfOptimizationOff(t)
+	if race.Enabled {
+		t.Skip("skipping in -race mode")
+	}
+	for _, test := range [][3]int{
+		{0, 0, 0},
+		{1, 1, 8},
+		{2, 1, 8},
+		{8, 1, 8},
+		{9, 1, 16},
+		{16, 1, 16},
+		{17, 1, 24},
+		{24, 1, 24},
+		{25, 1, 32},
+		{32, 1, 32},
+		{33, 1, 64}, // If we up the stack buffer size from 32->64, this line and the next would become 48.
+		{48, 1, 64},
+		{49, 1, 64},
+		{64, 1, 64},
+		{65, 2, 128},
+	} {
+		n := test[0]
+		want := test[1]
+		wantCap := test[2]
+		var r []byte
+		got := testing.AllocsPerRun(10, func() {
+			r = byteSlice(n)
+		})
+		if got != float64(want) {
+			t.Errorf("for size %d, got %f allocs want %d", n, got, want)
+		}
+		if cap(r) != wantCap {
+			t.Errorf("for size %d, got capacity %d want %d", n, cap(r), wantCap)
+		}
+	}
+}
+
+//go:noinline
+func ptrSlice(n int, p *[]*byte) {
+	var r []*byte
+	for range n {
+		r = append(r, nil)
+	}
+	*p = r
+}
+func TestAppendPtrInLoop(t *testing.T) {
+	testenv.SkipIfOptimizationOff(t)
+	if race.Enabled {
+		t.Skip("skipping in -race mode")
+	}
+	var tests [][3]int
+	if runtime.PtrSize == 8 {
+		tests = [][3]int{
+			{0, 0, 0},
+			{1, 1, 1},
+			{2, 1, 2},
+			{3, 1, 3}, // This is the interesting case, allocates 24 bytes when before it was 32.
+			{4, 1, 4},
+			{5, 1, 8},
+			{6, 1, 8},
+			{7, 1, 8},
+			{8, 1, 8},
+			{9, 2, 16},
+		}
+	} else {
+		tests = [][3]int{
+			{0, 0, 0},
+			{1, 1, 2},
+			{2, 1, 2},
+			{3, 1, 4},
+			{4, 1, 4},
+			{5, 1, 6}, // These two are also 24 bytes instead of 32.
+			{6, 1, 6}, //
+			{7, 1, 8},
+			{8, 1, 8},
+			{9, 1, 16},
+			{10, 1, 16},
+			{11, 1, 16},
+			{12, 1, 16},
+			{13, 1, 16},
+			{14, 1, 16},
+			{15, 1, 16},
+			{16, 1, 16},
+			{17, 2, 32},
+		}
+	}
+	for _, test := range tests {
+		n := test[0]
+		want := test[1]
+		wantCap := test[2]
+		var r []*byte
+		got := testing.AllocsPerRun(10, func() {
+			ptrSlice(n, &r)
+		})
+		if got != float64(want) {
+			t.Errorf("for size %d, got %f allocs want %d", n, got, want)
+		}
+		if cap(r) != wantCap {
+			t.Errorf("for size %d, got capacity %d want %d", n, cap(r), wantCap)
+		}
+	}
+}
+
+//go:noinline
+func byteCapSlice(n int) ([]byte, int) {
+	var r []byte
+	for i := range n {
+		r = append(r, byte(i))
+	}
+	return r, cap(r)
+}
+func TestAppendByteCapInLoop(t *testing.T) {
+	testenv.SkipIfOptimizationOff(t)
+	if race.Enabled {
+		t.Skip("skipping in -race mode")
+	}
+	for _, test := range [][3]int{
+		{0, 0, 0},
+		{1, 1, 8},
+		{2, 1, 8},
+		{8, 1, 8},
+		{9, 1, 16},
+		{16, 1, 16},
+		{17, 1, 24},
+		{24, 1, 24},
+		{25, 1, 32},
+		{32, 1, 32},
+		{33, 1, 64},
+		{48, 1, 64},
+		{49, 1, 64},
+		{64, 1, 64},
+		{65, 2, 128},
+	} {
+		n := test[0]
+		want := test[1]
+		wantCap := test[2]
+		var r []byte
+		got := testing.AllocsPerRun(10, func() {
+			r, _ = byteCapSlice(n)
+		})
+		if got != float64(want) {
+			t.Errorf("for size %d, got %f allocs want %d", n, got, want)
+		}
+		if cap(r) != wantCap {
+			t.Errorf("for size %d, got capacity %d want %d", n, cap(r), wantCap)
+		}
+	}
+}
+
+func TestAppendGeneric(t *testing.T) {
+	type I *int
+	r := testAppendGeneric[I](100)
+	if len(r) != 100 {
+		t.Errorf("bad length")
+	}
+}
+
+//go:noinline
+func testAppendGeneric[E any](n int) []E {
+	var r []E
+	var z E
+	for range n {
+		r = append(r, z)
+	}
+	return r
+}
+
+func appendSomeBytes(r []byte, s []byte) []byte {
+	for _, b := range s {
+		r = append(r, b)
+	}
+	return r
+}
+
+func TestAppendOfArg(t *testing.T) {
+	r := make([]byte, 24)
+	for i := 0; i < 24; i++ {
+		r[i] = byte(i)
+	}
+	appendSomeBytes(r, []byte{25, 26, 27})
+	// Do the same thing, trying to overwrite any
+	// stack-allocated buffers used above.
+	s := make([]byte, 24)
+	for i := 0; i < 24; i++ {
+		s[i] = 99
+	}
+	appendSomeBytes(s, []byte{99, 99, 99})
+	// Check that we still have the right data.
+	for i, b := range r {
+		if b != byte(i) {
+			t.Errorf("r[%d]=%d, want %d", i, b, byte(i))
+		}
+	}
+
+}
+
+func BenchmarkAppendInLoop(b *testing.B) {
+	for _, size := range []int{0, 1, 8, 16, 32, 64, 128} {
+		b.Run(fmt.Sprintf("%d", size),
+			func(b *testing.B) {
+				b.ReportAllocs()
+				for b.Loop() {
+					byteSlice(size)
+				}
+			})
+	}
+}
+
+func TestMoveToHeapEarly(t *testing.T) {
+	// Just checking that this compiles.
+	var x []int
+	y := x // causes a move2heap in the entry block
+	for range 5 {
+		x = append(x, 5)
+	}
+	_ = y
+}
+
+func TestMoveToHeapCap(t *testing.T) {
+	var c int
+	r := func() []byte {
+		var s []byte
+		for i := range 10 {
+			s = append(s, byte(i))
+		}
+		c = cap(s)
+		return s
+	}()
+	if c != cap(r) {
+		t.Errorf("got cap=%d, want %d", c, cap(r))
+	}
+	sinkSlice = r
+}
+
+//go:noinline
+func runit(f func()) {
+	f()
+}
+
+func TestMoveToHeapClosure1(t *testing.T) {
+	var c int
+	r := func() []byte {
+		var s []byte
+		for i := range 10 {
+			s = append(s, byte(i))
+		}
+		runit(func() {
+			c = cap(s)
+		})
+		return s
+	}()
+	if c != cap(r) {
+		t.Errorf("got cap=%d, want %d", c, cap(r))
+	}
+	sinkSlice = r
+}
+func TestMoveToHeapClosure2(t *testing.T) {
+	var c int
+	r := func() []byte {
+		var s []byte
+		for i := range 10 {
+			s = append(s, byte(i))
+		}
+		c = func() int {
+			return cap(s)
+		}()
+		return s
+	}()
+	if c != cap(r) {
+		t.Errorf("got cap=%d, want %d", c, cap(r))
+	}
+	sinkSlice = r
+}
+
+//go:noinline
+func buildClosure(t *testing.T) ([]byte, func()) {
+	var s []byte
+	for i := range 20 {
+		s = append(s, byte(i))
+	}
+	c := func() {
+		for i, b := range s {
+			if b != byte(i) {
+				t.Errorf("s[%d]=%d, want %d", i, b, i)
+			}
+		}
+	}
+	return s, c
+}
+
+func TestMoveToHeapClosure3(t *testing.T) {
+	_, f := buildClosure(t)
+	overwriteStack(0)
+	f()
+}
+
+//go:noinline
+func overwriteStack(n int) uint64 {
+	var x [100]uint64
+	for i := range x {
+		x[i] = 0xabcdabcdabcdabcd
+	}
+	return x[n]
+}
+
+var sinkSlice []byte
author	khr@golang.org <khr@golang.org>	2025-09-12 14:43:19 -0700
committer	Keith Randall <khr@golang.org>	2025-11-20 09:19:39 -0800
commit	32f5aadd2ffc60421c62b185fa7668012fb5e73e (patch)
tree	98720009a4a90ef893980660e2152532fcaf609e /src/runtime/slice_test.go
parent	a18aff805706bfdaeb9aca042111fae32f9f8b61 (diff)
download	go-32f5aadd2ffc60421c62b185fa7668012fb5e73e.tar.xz