2 files changed, 57 insertions, 6 deletions
diff --git a/src/bytes/bytes.go b/src/bytes/bytes.go
index 119b1f62b1..e2e5d5fda7 100644
--- a/src/bytes/bytes.go
+++ b/src/bytes/bytes.go
@@ -578,8 +578,8 @@ func Map(mapping func(r rune) rune, s []byte) []byte {
 
 // Repeat returns a new byte slice consisting of count copies of b.
 //
-// It panics if count is negative or if
-// the result of (len(b) * count) overflows.
+// It panics if count is negative or if the result of (len(b) * count)
+// overflows.
 func Repeat(b []byte, count int) []byte {
 	if count == 0 {
 		return []byte{}
@@ -587,18 +587,45 @@ func Repeat(b []byte, count int) []byte {
 	// Since we cannot return an error on overflow,
 	// we should panic if the repeat will generate
 	// an overflow.
-	// See Issue golang.org/issue/16237.
+	// See golang.org/issue/16237.
 	if count < 0 {
 		panic("bytes: negative Repeat count")
 	} else if len(b)*count/count != len(b) {
 		panic("bytes: Repeat count causes overflow")
 	}
 
-	nb := make([]byte, len(b)*count)
+	if len(b) == 0 {
+		return []byte{}
+	}
+
+	n := len(b) * count
+
+	// Past a certain chunk size it is counterproductive to use
+	// larger chunks as the source of the write, as when the source
+	// is too large we are basically just thrashing the CPU D-cache.
+	// So if the result length is larger than an empirically-found
+	// limit (8KB), we stop growing the source string once the limit
+	// is reached and keep reusing the same source string - that
+	// should therefore be always resident in the L1 cache - until we
+	// have completed the construction of the result.
+	// This yields significant speedups (up to +100%) in cases where
+	// the result length is large (roughly, over L2 cache size).
+	const chunkLimit = 8 * 1024
+	chunkMax := n
+	if chunkMax > chunkLimit {
+		chunkMax = chunkLimit / len(b) * len(b)
+		if chunkMax == 0 {
+			chunkMax = len(b)
+		}
+	}
+	nb := make([]byte, n)
 	bp := copy(nb, b)
 	for bp < len(nb) {
-		copy(nb[bp:], nb[:bp])
-		bp *= 2
+		chunk := bp
+		if chunk > chunkMax {
+			chunk = chunkMax
+		}
+		bp += copy(nb[bp:], nb[:chunk])
 	}
 	return nb
 }
diff --git a/src/bytes/bytes_test.go b/src/bytes/bytes_test.go
index 7263af3ed0..f58f18c461 100644
--- a/src/bytes/bytes_test.go
+++ b/src/bytes/bytes_test.go
@@ -1159,6 +1159,8 @@ type RepeatTest struct {
 	count   int
 }
 
+var longString = "a" + string(make([]byte, 1<<16)) + "z"
+
 var RepeatTests = []RepeatTest{
 	{"", "", 0},
 	{"", "", 1},
@@ -1167,6 +1169,9 @@ var RepeatTests = []RepeatTest{
 	{"-", "-", 1},
 	{"-", "----------", 10},
 	{"abc ", "abc abc abc ", 3},
+	// Tests for results over the chunkLimit
+	{string(rune(0)), string(make([]byte, 1<<16)), 1 << 16},
+	{longString, longString + longString, 2},
 }
 
 func TestRepeat(t *testing.T) {
@@ -2048,6 +2053,25 @@ func BenchmarkRepeat(b *testing.B) {
 	}
 }
 
+func BenchmarkRepeatLarge(b *testing.B) {
+	s := Repeat([]byte("@"), 8*1024)
+	for j := 8; j <= 30; j++ {
+		for _, k := range []int{1, 16, 4097} {
+			s := s[:k]
+			n := (1 << j) / k
+			if n == 0 {
+				continue
+			}
+			b.Run(fmt.Sprintf("%d/%d", 1<<j, k), func(b *testing.B) {
+				for i := 0; i < b.N; i++ {
+					Repeat(s, n)
+				}
+				b.SetBytes(int64(n * len(s)))
+			})
+		}
+	}
+}
+
 func BenchmarkBytesCompare(b *testing.B) {
 	for n := 1; n <= 2048; n <<= 1 {
 		b.Run(fmt.Sprint(n), func(b *testing.B) {