From 691f5c34ad7b7d676644dd749bc4ddb5d20b90d0 Mon Sep 17 00:00:00 2001 From: erifan01 Date: Thu, 19 Jul 2018 09:55:17 +0000 Subject: strings, bytes: optimize function Index This change compares the first two characters instead of the first one, and if they match, the entire string is compared. Comparing the first two characters helps to filter out the case where the first character matches but the subsequent characters do not match, thereby improving the substring search speed in this case. Benchmarks with no effect or minimal impact (less than 5%) is not listed, the following are improved benchmarks: On arm64: strings: IndexPeriodic/IndexPeriodic16-8 172890.00ns +- 2% 124156.20ns +- 0% -28.19% (p=0.008 n=5+5) IndexPeriodic/IndexPeriodic32-8 78092.80ns +- 0% 65138.60ns +- 0% -16.59% (p=0.008 n=5+5) IndexPeriodic/IndexPeriodic64-8 42322.20ns +- 0% 34661.60ns +- 0% -18.10% (p=0.008 n=5+5) bytes: IndexPeriodic/IndexPeriodic16-8 183468.20ns +- 6% 123759.00ns +- 0% -32.54% (p=0.008 n=5+5) IndexPeriodic/IndexPeriodic32-8 84776.40ns +- 0% 63907.80ns +- 0% -24.62% (p=0.008 n=5+5) IndexPeriodic/IndexPeriodic64-8 45835.60ns +- 0% 34194.20ns +- 0% -25.40% (p=0.008 n=5+5) On amd64: strings: IndexPeriodic/IndexPeriodic8-16 219499.00ns +- 0% 178123.40ns +- 0% -18.85% (p=0.008 n=5+5) IndexPeriodic/IndexPeriodic16-16 109760.20ns +- 0% 88957.80ns +- 0% -18.95% (p=0.008 n=5+5) IndexPeriodic/IndexPeriodic32-16 54943.00ns +- 0% 44573.80ns +- 0% -18.87% (p=0.008 n=5+5) IndexPeriodic/IndexPeriodic64-16 29804.80ns +- 0% 24417.80ns +- 0% -18.07% (p=0.008 n=5+5) bytes: IndexPeriodic/IndexPeriodic8-16 226592.60ns +- 0% 181183.20ns +- 0% -20.04% (p=0.008 n=5+5) IndexPeriodic/IndexPeriodic16-16 111432.60ns +- 0% 90634.60ns +- 0% -18.66% (p=0.008 n=5+5) IndexPeriodic/IndexPeriodic32-16 55640.60ns +- 0% 45433.00ns +- 0% -18.35% (p=0.008 n=5+5) IndexPeriodic/IndexPeriodic64-16 30833.00ns +- 0% 24784.20ns +- 0% -19.62% (p=0.008 n=5+5) Change-Id: I2d9e7e138d29e960d20a203eb74dc2ec976a9d71 Reviewed-on: https://go-review.googlesource.com/131177 Run-TryBot: Ian Lance Taylor TryBot-Result: Gobot Gobot Reviewed-by: Ian Lance Taylor --- src/strings/strings.go | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'src/strings/strings.go') diff --git a/src/strings/strings.go b/src/strings/strings.go index df95715ec8..26aceda212 100644 --- a/src/strings/strings.go +++ b/src/strings/strings.go @@ -947,21 +947,22 @@ func Index(s, substr string) int { if len(s) <= bytealg.MaxBruteForce { return bytealg.IndexString(s, substr) } - c := substr[0] + c0 := substr[0] + c1 := substr[1] i := 0 - t := s[:len(s)-n+1] + t := len(s) - n + 1 fails := 0 - for i < len(t) { - if t[i] != c { + for i < t { + if s[i] != c0 { // IndexByte is faster than bytealg.IndexString, so use it as long as // we're not getting lots of false positives. - o := IndexByte(t[i:], c) + o := IndexByte(s[i:t], c0) if o < 0 { return -1 } i += o } - if s[i:i+n] == substr { + if s[i+1] == c1 && s[i:i+n] == substr { return i } fails++ @@ -977,24 +978,25 @@ func Index(s, substr string) int { } return -1 } - c := substr[0] + c0 := substr[0] + c1 := substr[1] i := 0 - t := s[:len(s)-n+1] + t := len(s) - n + 1 fails := 0 - for i < len(t) { - if t[i] != c { - o := IndexByte(t[i:], c) + for i < t { + if s[i] != c0 { + o := IndexByte(s[i:t], c0) if o < 0 { return -1 } i += o } - if s[i:i+n] == substr { + if s[i+1] == c1 && s[i:i+n] == substr { return i } i++ fails++ - if fails >= 4+i>>4 && i < len(t) { + if fails >= 4+i>>4 && i < t { // See comment in ../bytes/bytes_generic.go. j := indexRabinKarp(s[i:], substr) if j < 0 { -- cgit v1.3-5-g9baa From 14e7f174c1bfb80192274b049487716cdde0b4ee Mon Sep 17 00:00:00 2001 From: Tom Thorogood Date: Wed, 26 Sep 2018 11:29:18 +0000 Subject: strings: use Builder in ToUpper and ToLower MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Map was optimized to use Builder in 45c7d80832, which avoided the []byte to string converstion. This left the ToUpper and ToLower ASCII fast path with an extra allocation over Map. name old time/op new time/op delta ToUpper/#00-12 3.59ns ± 4% 3.71ns ± 1% ~ (p=0.056 n=5+5) ToUpper/ONLYUPPER-12 11.8ns ± 2% 10.5ns ± 2% -10.85% (p=0.008 n=5+5) ToUpper/abc-12 31.8ns ± 1% 25.3ns ± 1% -20.40% (p=0.008 n=5+5) ToUpper/AbC123-12 46.2ns ± 7% 31.9ns ± 8% -30.89% (p=0.008 n=5+5) ToUpper/azAZ09_-12 47.1ns ± 8% 32.6ns ± 4% -30.77% (p=0.008 n=5+5) ToUpper/longStrinGwitHmixofsmaLLandcAps-12 137ns ±15% 104ns ±11% -24.11% (p=0.008 n=5+5) ToUpper/longɐstringɐwithɐnonasciiⱯchars-12 231ns ± 1% 228ns ± 1% ~ (p=0.079 n=5+5) ToUpper/ɐɐɐɐɐ-12 207ns ± 3% 206ns ± 1% ~ (p=0.913 n=5+5) ToUpper/a\u0080\U0010ffff-12 90.8ns ± 1% 89.6ns ± 1% -1.30% (p=0.024 n=5+5) ToLower/#00-12 3.59ns ± 1% 4.26ns ± 2% +18.66% (p=0.008 n=5+5) ToLower/abc-12 6.32ns ± 1% 6.62ns ± 1% +4.72% (p=0.008 n=5+5) ToLower/AbC123-12 45.0ns ±13% 31.5ns ± 4% -29.89% (p=0.008 n=5+5) ToLower/azAZ09_-12 48.8ns ± 6% 33.2ns ± 3% -31.91% (p=0.008 n=5+5) ToLower/longStrinGwitHmixofsmaLLandcAps-12 149ns ±13% 98ns ± 8% -34.30% (p=0.008 n=5+5) ToLower/LONGⱯSTRINGⱯWITHⱯNONASCIIⱯCHARS-12 237ns ± 4% 237ns ± 2% ~ (p=0.635 n=5+5) ToLower/ⱭⱭⱭⱭⱭ-12 181ns ± 1% 181ns ± 1% ~ (p=0.762 n=5+5) ToLower/A\u0080\U0010ffff-12 90.6ns ± 1% 92.5ns ± 1% +2.05% (p=0.016 n=5+5) name old alloc/op new alloc/op delta ToUpper/#00-12 0.00B 0.00B ~ (all equal) ToUpper/ONLYUPPER-12 0.00B 0.00B ~ (all equal) ToUpper/abc-12 6.00B ± 0% 3.00B ± 0% -50.00% (p=0.008 n=5+5) ToUpper/AbC123-12 16.0B ± 0% 8.0B ± 0% -50.00% (p=0.008 n=5+5) ToUpper/azAZ09_-12 16.0B ± 0% 8.0B ± 0% -50.00% (p=0.008 n=5+5) ToUpper/longStrinGwitHmixofsmaLLandcAps-12 64.0B ± 0% 32.0B ± 0% -50.00% (p=0.008 n=5+5) ToUpper/longɐstringɐwithɐnonasciiⱯchars-12 48.0B ± 0% 48.0B ± 0% ~ (all equal) ToUpper/ɐɐɐɐɐ-12 48.0B ± 0% 48.0B ± 0% ~ (all equal) ToUpper/a\u0080\U0010ffff-12 16.0B ± 0% 16.0B ± 0% ~ (all equal) ToLower/#00-12 0.00B 0.00B ~ (all equal) ToLower/abc-12 0.00B 0.00B ~ (all equal) ToLower/AbC123-12 16.0B ± 0% 8.0B ± 0% -50.00% (p=0.008 n=5+5) ToLower/azAZ09_-12 16.0B ± 0% 8.0B ± 0% -50.00% (p=0.008 n=5+5) ToLower/longStrinGwitHmixofsmaLLandcAps-12 64.0B ± 0% 32.0B ± 0% -50.00% (p=0.008 n=5+5) ToLower/LONGⱯSTRINGⱯWITHⱯNONASCIIⱯCHARS-12 48.0B ± 0% 48.0B ± 0% ~ (all equal) ToLower/ⱭⱭⱭⱭⱭ-12 32.0B ± 0% 32.0B ± 0% ~ (all equal) ToLower/A\u0080\U0010ffff-12 16.0B ± 0% 16.0B ± 0% ~ (all equal) name old allocs/op new allocs/op delta ToUpper/#00-12 0.00 0.00 ~ (all equal) ToUpper/ONLYUPPER-12 0.00 0.00 ~ (all equal) ToUpper/abc-12 2.00 ± 0% 1.00 ± 0% -50.00% (p=0.008 n=5+5) ToUpper/AbC123-12 2.00 ± 0% 1.00 ± 0% -50.00% (p=0.008 n=5+5) ToUpper/azAZ09_-12 2.00 ± 0% 1.00 ± 0% -50.00% (p=0.008 n=5+5) ToUpper/longStrinGwitHmixofsmaLLandcAps-12 2.00 ± 0% 1.00 ± 0% -50.00% (p=0.008 n=5+5) ToUpper/longɐstringɐwithɐnonasciiⱯchars-12 1.00 ± 0% 1.00 ± 0% ~ (all equal) ToUpper/ɐɐɐɐɐ-12 2.00 ± 0% 2.00 ± 0% ~ (all equal) ToUpper/a\u0080\U0010ffff-12 1.00 ± 0% 1.00 ± 0% ~ (all equal) ToLower/#00-12 0.00 0.00 ~ (all equal) ToLower/abc-12 0.00 0.00 ~ (all equal) ToLower/AbC123-12 2.00 ± 0% 1.00 ± 0% -50.00% (p=0.008 n=5+5) ToLower/azAZ09_-12 2.00 ± 0% 1.00 ± 0% -50.00% (p=0.008 n=5+5) ToLower/longStrinGwitHmixofsmaLLandcAps-12 2.00 ± 0% 1.00 ± 0% -50.00% (p=0.008 n=5+5) ToLower/LONGⱯSTRINGⱯWITHⱯNONASCIIⱯCHARS-12 1.00 ± 0% 1.00 ± 0% ~ (all equal) ToLower/ⱭⱭⱭⱭⱭ-12 1.00 ± 0% 1.00 ± 0% ~ (all equal) ToLower/A\u0080\U0010ffff-12 1.00 ± 0% 1.00 ± 0% ~ (all equal) Updates #26304 Change-Id: I4179e21d5e60d950b925fe3ffc74b376b82812d2 GitHub-Last-Rev: 2c7c3bb75b8fb16fed5f0c8979ee9941675ed6bf GitHub-Pull-Request: golang/go#27872 Reviewed-on: https://go-review.googlesource.com/137575 Run-TryBot: Ian Lance Taylor TryBot-Result: Gobot Gobot Reviewed-by: Brad Fitzpatrick --- src/strings/strings.go | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'src/strings/strings.go') diff --git a/src/strings/strings.go b/src/strings/strings.go index 26aceda212..b033c38e91 100644 --- a/src/strings/strings.go +++ b/src/strings/strings.go @@ -561,15 +561,16 @@ func ToUpper(s string) string { if !hasLower { return s } - b := make([]byte, len(s)) + var b Builder + b.Grow(len(s)) for i := 0; i < len(s); i++ { c := s[i] if c >= 'a' && c <= 'z' { c -= 'a' - 'A' } - b[i] = c + b.WriteByte(c) } - return string(b) + return b.String() } return Map(unicode.ToUpper, s) } @@ -590,15 +591,16 @@ func ToLower(s string) string { if !hasUpper { return s } - b := make([]byte, len(s)) + var b Builder + b.Grow(len(s)) for i := 0; i < len(s); i++ { c := s[i] if c >= 'A' && c <= 'Z' { c += 'a' - 'A' } - b[i] = c + b.WriteByte(c) } - return string(b) + return b.String() } return Map(unicode.ToLower, s) } -- cgit v1.3-5-g9baa From ebdc0b8d68e04ad383088c8b3ab963de4a9b5c5d Mon Sep 17 00:00:00 2001 From: Brad Fitzpatrick Date: Wed, 26 Sep 2018 20:19:11 +0000 Subject: bytes, strings: add ReplaceAll Credit to Harald Nordgren for the proposal in https://golang.org/cl/137456 and #27864. Fixes #27864 Change-Id: I80546683b0623124fe4627a71af88add2f6c1c27 Reviewed-on: https://go-review.googlesource.com/137855 Reviewed-by: Ian Lance Taylor --- src/bytes/bytes.go | 9 +++++++++ src/bytes/bytes_test.go | 6 ++++++ src/strings/strings.go | 9 +++++++++ src/strings/strings_test.go | 6 ++++++ 4 files changed, 30 insertions(+) (limited to 'src/strings/strings.go') diff --git a/src/bytes/bytes.go b/src/bytes/bytes.go index 876fa3c1ed..6492db088a 100644 --- a/src/bytes/bytes.go +++ b/src/bytes/bytes.go @@ -774,6 +774,15 @@ func Replace(s, old, new []byte, n int) []byte { return t[0:w] } +// ReplaceAll returns a copy of the slice s with all +// non-overlapping instances of old replaced by new. +// If old is empty, it matches at the beginning of the slice +// and after each UTF-8 sequence, yielding up to k+1 replacements +// for a k-rune slice. +func ReplaceAll(s, old, new []byte) []byte { + return Replace(s, old, new, -1) +} + // EqualFold reports whether s and t, interpreted as UTF-8 strings, // are equal under Unicode case-folding. func EqualFold(s, t []byte) bool { diff --git a/src/bytes/bytes_test.go b/src/bytes/bytes_test.go index 55a22bae22..f4c0ffd2a9 100644 --- a/src/bytes/bytes_test.go +++ b/src/bytes/bytes_test.go @@ -1362,6 +1362,12 @@ func TestReplace(t *testing.T) { if cap(in) == cap(out) && &in[:1][0] == &out[:1][0] { t.Errorf("Replace(%q, %q, %q, %d) didn't copy", tt.in, tt.old, tt.new, tt.n) } + if tt.n == -1 { + out := ReplaceAll(in, []byte(tt.old), []byte(tt.new)) + if s := string(out); s != tt.out { + t.Errorf("ReplaceAll(%q, %q, %q) = %q, want %q", tt.in, tt.old, tt.new, s, tt.out) + } + } } } diff --git a/src/strings/strings.go b/src/strings/strings.go index b033c38e91..00200e4e24 100644 --- a/src/strings/strings.go +++ b/src/strings/strings.go @@ -874,6 +874,15 @@ func Replace(s, old, new string, n int) string { return string(t[0:w]) } +// ReplaceAll returns a copy of the string s with all +// non-overlapping instances of old replaced by new. +// If old is empty, it matches at the beginning of the string +// and after each UTF-8 sequence, yielding up to k+1 replacements +// for a k-rune string. +func ReplaceAll(s, old, new string) string { + return Replace(s, old, new, -1) +} + // EqualFold reports whether s and t, interpreted as UTF-8 strings, // are equal under Unicode case-folding. func EqualFold(s, t string) bool { diff --git a/src/strings/strings_test.go b/src/strings/strings_test.go index 20bc484f39..bb6a5b931b 100644 --- a/src/strings/strings_test.go +++ b/src/strings/strings_test.go @@ -1243,6 +1243,12 @@ func TestReplace(t *testing.T) { if s := Replace(tt.in, tt.old, tt.new, tt.n); s != tt.out { t.Errorf("Replace(%q, %q, %q, %d) = %q, want %q", tt.in, tt.old, tt.new, tt.n, s, tt.out) } + if tt.n == -1 { + s := ReplaceAll(tt.in, tt.old, tt.new) + if s != tt.out { + t.Errorf("ReplaceAll(%q, %q, %q) = %q, want %q", tt.in, tt.old, tt.new, s, tt.out) + } + } } } -- cgit v1.3-5-g9baa From f74de24fbd94d021b047afe0dc62eddeb65ca384 Mon Sep 17 00:00:00 2001 From: Martin Möhrmann Date: Sun, 26 Aug 2018 14:22:39 +0200 Subject: strings: correctly handle invalid utf8 sequences in Map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an invalid UTF-8 byte sequence is decoded in a range loop over a string a utf8.RuneError rune is returned. This is not distinguishable from decoding the valid '\uFFFD' sequence representing utf8.RuneError from a string without further checks within the range loop. The previous Map code did not do any extra checks and would thereby not map invalid UTF-8 byte sequences correctly when those were mapping to utf8.RuneError. Fix this by adding the extra checks necessary to distinguish the decoding of invalid utf8 byte sequences from decoding the sequence for utf8.RuneError when the mapping of a rune is utf8.RuneError. This fix does not result in a measureable performance regression: name old time/op new time/op delta ByteByteMap 1.05µs ± 3% 1.03µs ± 3% ~ (p=0.118 n=10+10) Map/identity/ASCII 169ns ± 2% 170ns ± 1% ~ (p=0.501 n=9+10) Map/identity/Greek 298ns ± 1% 303ns ± 4% ~ (p=0.338 n=10+10) Map/change/ASCII 323ns ± 3% 325ns ± 4% ~ (p=0.679 n=8+10) Map/change/Greek 628ns ± 5% 635ns ± 1% ~ (p=0.460 n=10+9) MapNoChanges 120ns ± 4% 119ns ± 1% ~ (p=0.496 n=10+9) Fixes #26305 Change-Id: I70e99fa244983c5040756fa4549ac1e8cb6022c3 Reviewed-on: https://go-review.googlesource.com/c/131495 Reviewed-by: Brad Fitzpatrick --- src/strings/strings.go | 24 ++++++++++++------------ src/strings/strings_test.go | 4 ++-- 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'src/strings/strings.go') diff --git a/src/strings/strings.go b/src/strings/strings.go index 00200e4e24..ecc8c97d9e 100644 --- a/src/strings/strings.go +++ b/src/strings/strings.go @@ -463,27 +463,27 @@ func Map(mapping func(rune) rune, s string) string { for i, c := range s { r := mapping(c) - if r == c { + if r == c && c != utf8.RuneError { continue } + var width int + if c == utf8.RuneError { + c, width = utf8.DecodeRuneInString(s[i:]) + if width != 1 && r == c { + continue + } + } else { + width = utf8.RuneLen(c) + } + b.Grow(len(s) + utf8.UTFMax) b.WriteString(s[:i]) if r >= 0 { b.WriteRune(r) } - if c == utf8.RuneError { - // RuneError is the result of either decoding - // an invalid sequence or '\uFFFD'. Determine - // the correct number of bytes we need to advance. - _, w := utf8.DecodeRuneInString(s[i:]) - i += w - } else { - i += utf8.RuneLen(c) - } - - s = s[i:] + s = s[i+width:] break } diff --git a/src/strings/strings_test.go b/src/strings/strings_test.go index bb6a5b931b..eee2dd55df 100644 --- a/src/strings/strings_test.go +++ b/src/strings/strings_test.go @@ -646,10 +646,10 @@ func TestMap(t *testing.T) { if unicode.Is(unicode.Latin, r) { return r } - return '?' + return utf8.RuneError } m = Map(replaceNotLatin, "Hello\255World") - expect = "Hello?World" + expect = "Hello\uFFFDWorld" if m != expect { t.Errorf("replace invalid sequence: expected %q got %q", expect, m) } -- cgit v1.3-5-g9baa