3 files changed, 204 insertions, 116 deletions
diff --git a/src/strconv/ftoa.go b/src/strconv/ftoa.go
index acbcb9fcc9..1d4bf770be 100644
--- a/src/strconv/ftoa.go
+++ b/src/strconv/ftoa.go
@@ -483,7 +483,7 @@ func fmtB(dst []byte, neg bool, mant uint64, exp int, flt *floatInfo) []byte {
 	}
 
 	// mantissa
-	dst, _ = formatBits(dst, mant, 10, false, true)
+	dst = AppendUint(dst, mant, 10)
 
 	// p
 	dst = append(dst, 'p')
@@ -493,7 +493,7 @@ func fmtB(dst []byte, neg bool, mant uint64, exp int, flt *floatInfo) []byte {
 	if exp >= 0 {
 		dst = append(dst, '+')
 	}
-	dst, _ = formatBits(dst, uint64(exp), 10, exp < 0, true)
+	dst = AppendInt(dst, int64(exp), 10)
 
 	return dst
 }
diff --git a/src/strconv/ftoaryu.go b/src/strconv/ftoaryu.go
index c1aaa250f3..9349df955f 100644
--- a/src/strconv/ftoaryu.go
+++ b/src/strconv/ftoaryu.go
@@ -192,30 +192,9 @@ func formatDecimal(d *decimalSlice, m uint64, trunc bool, roundUp bool, prec int
 		m /= 10
 		trimmed++
 	}
-	// render digits (similar to formatBits)
-	n := uint(prec)
+	// render digits
+	formatBase10(d.d[:prec], m)
 	d.nd = prec
-	v := m
-	for v >= 100 {
-		var v1, v2 uint64
-		if v>>32 == 0 {
-			v1, v2 = uint64(uint32(v)/100), uint64(uint32(v)%100)
-		} else {
-			v1, v2 = v/100, v%100
-		}
-		n -= 2
-		d.d[n+1] = smallsString[2*v2+1]
-		d.d[n+0] = smallsString[2*v2+0]
-		v = v1
-	}
-	if v > 0 {
-		n--
-		d.d[n] = smallsString[2*v+1]
-	}
-	if v >= 10 {
-		n--
-		d.d[n] = smallsString[2*v]
-	}
 	for d.d[d.nd-1] == '0' {
 		d.nd--
 		trimmed++
@@ -448,8 +427,8 @@ func ryuDigits32(d *decimalSlice, lower, central, upper uint32,
 	n := endindex
 	for n > d.nd {
 		v1, v2 := v/100, v%100
-		d.d[n] = smallsString[2*v2+1]
-		d.d[n-1] = smallsString[2*v2+0]
+		d.d[n] = smalls[2*v2+1]
+		d.d[n-1] = smalls[2*v2+0]
 		n -= 2
 		v = v1
 	}
@@ -535,7 +514,7 @@ func divisibleByPower5(m uint64, k int) bool {
 // divmod1e9 computes quotient and remainder of division by 1e9,
 // avoiding runtime uint64 division on 32-bit platforms.
 func divmod1e9(x uint64) (uint32, uint32) {
-	if !host32bit {
+	if host64bit {
 		return uint32(x / 1e9), uint32(x % 1e9)
 	}
 	// Use the same sequence of operations as the amd64 compiler.
diff --git a/src/strconv/itoa.go b/src/strconv/itoa.go
index 928b37ffa6..7884e8e987 100644
--- a/src/strconv/itoa.go
+++ b/src/strconv/itoa.go
@@ -4,16 +4,22 @@
 
 package strconv
 
-import "math/bits"
-
-const fastSmalls = true // enable fast path for small integers
+import (
+	"math/bits"
+	"runtime"
+)
 
 // FormatUint returns the string representation of i in the given base,
 // for 2 <= base <= 36. The result uses the lower-case letters 'a' to 'z'
 // for digit values >= 10.
 func FormatUint(i uint64, base int) string {
-	if fastSmalls && i < nSmalls && base == 10 {
-		return small(int(i))
+	if base == 10 {
+		if i < nSmalls {
+			return small(int(i))
+		}
+		var a [24]byte
+		j := formatBase10(a[:], i)
+		return string(a[j:])
 	}
 	_, s := formatBits(nil, i, base, false, false)
 	return s
@@ -23,8 +29,21 @@ func FormatUint(i uint64, base int) string {
 // for 2 <= base <= 36. The result uses the lower-case letters 'a' to 'z'
 // for digit values >= 10.
 func FormatInt(i int64, base int) string {
-	if fastSmalls && 0 <= i && i < nSmalls && base == 10 {
-		return small(int(i))
+	if base == 10 {
+		if 0 <= i && i < nSmalls {
+			return small(int(i))
+		}
+		var a [24]byte
+		u := uint64(i)
+		if i < 0 {
+			u = -u
+		}
+		j := formatBase10(a[:], u)
+		if i < 0 {
+			j--
+			a[j] = '-'
+		}
+		return string(a[j:])
 	}
 	_, s := formatBits(nil, uint64(i), base, i < 0, false)
 	return s
@@ -38,46 +57,29 @@ func Itoa(i int) string {
 // AppendInt appends the string form of the integer i,
 // as generated by [FormatInt], to dst and returns the extended buffer.
 func AppendInt(dst []byte, i int64, base int) []byte {
-	if fastSmalls && 0 <= i && i < nSmalls && base == 10 {
-		return append(dst, small(int(i))...)
+	u := uint64(i)
+	if i < 0 {
+		dst = append(dst, '-')
+		u = -u
 	}
-	dst, _ = formatBits(dst, uint64(i), base, i < 0, true)
-	return dst
+	return AppendUint(dst, u, base)
 }
 
 // AppendUint appends the string form of the unsigned integer i,
 // as generated by [FormatUint], to dst and returns the extended buffer.
 func AppendUint(dst []byte, i uint64, base int) []byte {
-	if fastSmalls && i < nSmalls && base == 10 {
-		return append(dst, small(int(i))...)
+	if base == 10 {
+		if i < nSmalls {
+			return append(dst, small(int(i))...)
+		}
+		var a [24]byte
+		j := formatBase10(a[:], i)
+		return append(dst, a[j:]...)
 	}
 	dst, _ = formatBits(dst, i, base, false, true)
 	return dst
 }
 
-// small returns the string for an i with 0 <= i < nSmalls.
-func small(i int) string {
-	if i < 10 {
-		return digits[i : i+1]
-	}
-	return smallsString[i*2 : i*2+2]
-}
-
-const nSmalls = 100
-
-const smallsString = "00010203040506070809" +
-	"10111213141516171819" +
-	"20212223242526272829" +
-	"30313233343536373839" +
-	"40414243444546474849" +
-	"50515253545556575859" +
-	"60616263646566676869" +
-	"70717273747576777879" +
-	"80818283848586878889" +
-	"90919293949596979899"
-
-const host32bit = ^uint(0)>>32 == 0
-
 const digits = "0123456789abcdefghijklmnopqrstuvwxyz"
 
 // formatBits computes the string representation of u in the given base.
@@ -85,15 +87,15 @@ const digits = "0123456789abcdefghijklmnopqrstuvwxyz"
 // set, the string is appended to dst and the resulting byte slice is
 // returned as the first result value; otherwise the string is returned
 // as the second result value.
+// The caller is expected to have handled base 10 separately for speed.
 func formatBits(dst []byte, u uint64, base int, neg, append_ bool) (d []byte, s string) {
-	if base < 2 || base > len(digits) {
+	if base < 2 || base == 10 || base > len(digits) {
 		panic("strconv: illegal AppendInt/FormatInt base")
 	}
 	// 2 <= base && base <= len(digits)
 
 	var a [64 + 1]byte // +1 for sign of 64bit value in base 2
 	i := len(a)
-
 	if neg {
 		u = -u
 	}
@@ -101,56 +103,7 @@ func formatBits(dst []byte, u uint64, base int, neg, append_ bool) (d []byte, s
 	// convert bits
 	// We use uint values where we can because those will
 	// fit into a single register even on a 32bit machine.
-	if base == 10 {
-		// common case: use constants for / because
-		// the compiler can optimize it into a multiply+shift
-
-		if host32bit {
-			// convert the lower digits using 32bit operations
-			for u >= 1e9 {
-				// Avoid using r = a%b in addition to q = a/b
-				// since 64bit division and modulo operations
-				// are calculated by runtime functions on 32bit machines.
-				q := u / 1e9
-				us := uint(u - q*1e9) // u % 1e9 fits into a uint
-				for j := 4; j > 0; j-- {
-					is := us % 100 * 2
-					us /= 100
-					i -= 2
-					a[i+1] = smallsString[is+1]
-					a[i+0] = smallsString[is+0]
-				}
-
-				// us < 10, since it contains the last digit
-				// from the initial 9-digit us.
-				i--
-				a[i] = smallsString[us*2+1]
-
-				u = q
-			}
-			// u < 1e9
-		}
-
-		// u guaranteed to fit into a uint
-		us := uint(u)
-		for us >= 100 {
-			is := us % 100 * 2
-			us /= 100
-			i -= 2
-			a[i+1] = smallsString[is+1]
-			a[i+0] = smallsString[is+0]
-		}
-
-		// us < 100
-		is := us * 2
-		i--
-		a[i] = smallsString[is+1]
-		if us >= 10 {
-			i--
-			a[i] = smallsString[is]
-		}
-
-	} else if isPowerOfTwo(base) {
+	if isPowerOfTwo(base) {
 		// Use shifts and masks instead of / and %.
 		shift := uint(bits.TrailingZeros(uint(base)))
 		b := uint64(base)
@@ -197,3 +150,159 @@ func formatBits(dst []byte, u uint64, base int, neg, append_ bool) (d []byte, s
 func isPowerOfTwo(x int) bool {
 	return x&(x-1) == 0
 }
+
+const nSmalls = 100
+
+// smalls is the formatting of 00..99 concatenated.
+// It is then padded out with 56 x's to 256 bytes,
+// so that smalls[x&0xFF] has no bounds check.
+//
+// TODO(rsc): Once the compiler does a better job
+// at tracking mod bounds, the &0xFF should not be needed:
+// go.dev/issue/75954 and go.dev/issue/63110.
+const smalls = "00010203040506070809" +
+	"10111213141516171819" +
+	"20212223242526272829" +
+	"30313233343536373839" +
+	"40414243444546474849" +
+	"50515253545556575859" +
+	"60616263646566676869" +
+	"70717273747576777879" +
+	"80818283848586878889" +
+	"90919293949596979899" +
+	"xxxxxxxxxxxxxxxxxxxx" +
+	"xxxxxxxxxxxxxxxxxxxx" +
+	"xxxxxxxxxxxxxxxxxxxx" +
+	"xxxxxxxxxxxxxxxxxxxx" +
+	"xxxxxxxxxxxxxxxxxxxx" +
+	"xxxxxx"
+
+const host64bit = ^uint(0)>>32 != 0
+
+// small returns the string for an i with 0 <= i < nSmalls.
+func small(i int) string {
+	if i < 10 {
+		return digits[i : i+1]
+	}
+	return smalls[i*2 : i*2+2]
+}
+
+// formatBase10 formats the decimal representation of u into the tail of a
+// and returns the offset of the first byte written to a. That is, after
+//
+//	i := formatBase10(a, u)
+//
+// the decimal representation is in a[i:].
+func formatBase10(a []byte, u uint64) int {
+	// Decide implementation strategy based on architecture.
+	const (
+		// 64-bit systems can work in 64-bit math the whole time
+		// or can split the uint64 into uint32-sized chunks.
+		// On most systems, the uint32 math is faster, but not all.
+		// The decision here is based on benchmarking.
+		itoaPure64 = host64bit && runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" && runtime.GOARCH != "s390x"
+
+		// 64-bit systems can all use 64-bit div and mod by a constant,
+		// which the compiler rewrites to use 64x64→128-bit multiplies.
+		itoaDivMod64 = host64bit // can use 64-bit div/mod by constant
+	)
+
+	if itoaPure64 {
+		// Convert 2 digits at a time, using 64-bit math.
+		i := len(a)
+		u := uint(u)
+		for u >= 100 {
+			var dd uint
+			u, dd = u/100, (u%100)*2
+			i -= 2
+			a[i+0], a[i+1] = smalls[(dd+0)&0xFF], smalls[(dd+1)&0xFF]
+		}
+
+		dd := u * 2
+		i--
+		a[i] = smalls[(dd+1)&0xFF]
+		if u >= 10 {
+			i--
+			a[i] = smalls[(dd+0)&0xFF]
+		}
+		return i
+	}
+
+	// Convert 9-digit chunks using 32-bit math.
+	// Most numbers are small, so the comparison u >= 1e9 is usually pure overhead,
+	// so we approximate it by u>>29 != 0, which is usually faster and good enough.
+	i := len(a)
+	for (host64bit && u>>29 != 0) || (!host64bit && (u>>32 != 0 || uint32(u)>>29 != 0)) {
+		var lo uint32
+		if itoaDivMod64 {
+			u, lo = u/1e9, uint32(u%1e9)
+		} else {
+			// On 64-bit systems the compiler rewrites the div and mod above
+			// into a 64x64→128-bit multiply (https://godbolt.org/z/EPnK8zvMK):
+			//	hi, _ := bits.Mul64(u>>1, 0x89705f4136b4a598)
+			//	q := hi >> 28
+			//	lo = uint32(u - q*1e9)
+			//	u = q
+			// On 32-bit systems, the compiler invokes a uint64 software divide,
+			// which is quite slow. We could write the bits.Mul64 code above
+			// but even that is slower than we'd like, since it calls a software mul64
+			// instead of having a hardware instruction to use.
+			// Instead we inline bits.Mul64 here and change y0/y1 to constants.
+			// The compiler does use direct 32x32→64-bit multiplies for this code.
+			//
+			// For lots more about division by multiplication see Warren, _Hacker's Delight_.
+			// For a concise overview, see the first two sections of
+			// https://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html.
+			const mask32 = 1<<32 - 1
+			x0 := ((u >> 1) & mask32)
+			x1 := (u >> 1) >> 32
+			const y0 = 0x36b4a598
+			const y1 = 0x89705f41
+			w0 := x0 * y0
+			t := x1*y0 + w0>>32
+			w1 := t & mask32
+			w2 := t >> 32
+			w1 += x0 * y1
+			hi := x1*y1 + w2 + w1>>32
+			q := hi >> 28
+
+			lo = uint32(u) - uint32(q)*1e9 // uint32(u - q*1e9) but faster
+			u = q
+		}
+
+		// Convert 9 digits.
+		for range 4 {
+			var dd uint32
+			lo, dd = lo/100, (lo%100)*2
+			i -= 2
+			a[i+0], a[i+1] = smalls[(dd+0)&0xFF], smalls[(dd+1)&0xFF]
+		}
+		i--
+		a[i] = smalls[(lo*2+1)&0xFF]
+
+		// If we'd been using u >= 1e9 then we would be guaranteed that u/1e9 > 0,
+		// but since we used u>>29 != 0, u/1e9 might be 0, so we might be done.
+		// (If u is now 0, then at the start we had 2²⁹ ≤ u < 10⁹, so it was still correct
+		// to write 9 digits; we have not accidentally written any leading zeros.)
+		if u == 0 {
+			return i
+		}
+	}
+
+	// Convert final chunk, at most 8 digits.
+	lo := uint32(u)
+	for lo >= 100 {
+		var dd uint32
+		lo, dd = lo/100, (lo%100)*2
+		i -= 2
+		a[i+0], a[i+1] = smalls[(dd+0)&0xFF], smalls[(dd+1)&0xFF]
+	}
+	i--
+	dd := lo * 2
+	a[i] = smalls[(dd+1)&0xFF]
+	if lo >= 10 {
+		i--
+		a[i] = smalls[(dd+0)&0xFF]
+	}
+	return i
+}