diff options
Diffstat (limited to 'src/bytes')
| -rw-r--r-- | src/bytes/bytes.go | 97 | ||||
| -rw-r--r-- | src/bytes/bytes_test.go | 54 |
2 files changed, 139 insertions, 12 deletions
diff --git a/src/bytes/bytes.go b/src/bytes/bytes.go index e7931387aa..ef7294d805 100644 --- a/src/bytes/bytes.go +++ b/src/bytes/bytes.go @@ -183,6 +183,29 @@ func IndexAny(s []byte, chars string) int { // Avoid scanning all of s. return -1 } + if len(s) == 1 { + r := rune(s[0]) + if r >= utf8.RuneSelf { + // search utf8.RuneError. + for _, r = range chars { + if r == utf8.RuneError { + return 0 + } + } + return -1 + } + if bytealg.IndexByteString(chars, s[0]) >= 0 { + return 0 + } + return -1 + } + if len(chars) == 1 { + r := rune(chars[0]) + if r >= utf8.RuneSelf { + r = utf8.RuneError + } + return IndexRune(s, r) + } if len(s) > 8 { if as, isASCII := makeASCIISet(chars); isASCII { for i, c := range s { @@ -197,14 +220,26 @@ func IndexAny(s []byte, chars string) int { for i := 0; i < len(s); i += width { r := rune(s[i]) if r < utf8.RuneSelf { + if bytealg.IndexByteString(chars, s[i]) >= 0 { + return i + } width = 1 - } else { - r, width = utf8.DecodeRune(s[i:]) + continue } - for _, ch := range chars { - if r == ch { - return i + r, width = utf8.DecodeRune(s[i:]) + if r == utf8.RuneError { + for _, r = range chars { + if r == utf8.RuneError { + return i + } } + continue + } + // r is 2 to 4 bytes. Using strings.Index is more reasonable, but as the bytes + // package should not import the strings package, use bytealg.IndexString + // instead. And this does not seem to lose much performance. + if chars == string(r) || bytealg.IndexString(chars, string(r)) >= 0 { + return i } } return -1 @@ -229,13 +264,59 @@ func LastIndexAny(s []byte, chars string) int { return -1 } } + if len(s) == 1 { + r := rune(s[0]) + if r >= utf8.RuneSelf { + for _, r = range chars { + if r == utf8.RuneError { + return 0 + } + } + return -1 + } + if bytealg.IndexByteString(chars, s[0]) >= 0 { + return 0 + } + return -1 + } + if len(chars) == 1 { + cr := rune(chars[0]) + if cr >= utf8.RuneSelf { + cr = utf8.RuneError + } + for i := len(s); i > 0; { + r, size := utf8.DecodeLastRune(s[:i]) + i -= size + if r == cr { + return i + } + } + return -1 + } for i := len(s); i > 0; { + r := rune(s[i-1]) + if r < utf8.RuneSelf { + if bytealg.IndexByteString(chars, s[i-1]) >= 0 { + return i - 1 + } + i-- + continue + } r, size := utf8.DecodeLastRune(s[:i]) i -= size - for _, c := range chars { - if r == c { - return i + if r == utf8.RuneError { + for _, r = range chars { + if r == utf8.RuneError { + return i + } } + continue + } + // r is 2 to 4 bytes. Using strings.Index is more reasonable, but as the bytes + // package should not import the strings package, use bytealg.IndexString + // instead. And this does not seem to lose much performance. + if chars == string(r) || bytealg.IndexString(chars, string(r)) >= 0 { + return i } } return -1 diff --git a/src/bytes/bytes_test.go b/src/bytes/bytes_test.go index a208d4ed76..544ee46f90 100644 --- a/src/bytes/bytes_test.go +++ b/src/bytes/bytes_test.go @@ -169,6 +169,7 @@ var indexAnyTests = []BinOpTest{ {"", "abc", -1}, {"a", "", -1}, {"a", "a", 0}, + {"\x80", "\xffb", 0}, {"aaa", "a", 0}, {"abc", "xyz", -1}, {"abc", "xcz", 2}, @@ -179,6 +180,7 @@ var indexAnyTests = []BinOpTest{ {dots + dots + dots, " ", -1}, {"012abcba210", "\xffb", 4}, {"012\x80bcb\x80210", "\xffb", 3}, + {"0123456\xcf\x80abc", "\xcfb\x80", 10}, } var lastIndexAnyTests = []BinOpTest{ @@ -187,6 +189,7 @@ var lastIndexAnyTests = []BinOpTest{ {"", "abc", -1}, {"a", "", -1}, {"a", "a", 0}, + {"\x80", "\xffb", 0}, {"aaa", "a", 2}, {"abc", "xyz", -1}, {"abc", "ab", 1}, @@ -197,6 +200,7 @@ var lastIndexAnyTests = []BinOpTest{ {dots + dots + dots, " ", -1}, {"012abcba210", "\xffb", 6}, {"012\x80bcb\x80210", "\xffb", 7}, + {"0123456\xcf\x80abc", "\xcfb\x80", 10}, } // Execute f on each test case. funcName should be the name of f; it's used @@ -1890,10 +1894,24 @@ func BenchmarkBytesCompare(b *testing.B) { } func BenchmarkIndexAnyASCII(b *testing.B) { - x := Repeat([]byte{'#'}, 4096) // Never matches set - cs := "0123456789abcdef" - for k := 1; k <= 4096; k <<= 4 { - for j := 1; j <= 16; j <<= 1 { + x := Repeat([]byte{'#'}, 2048) // Never matches set + cs := "0123456789abcdefghijklmnopqrstuvwxyz0123456789abcdefghijklmnopqrstuvwxyz" + for k := 1; k <= 2048; k <<= 4 { + for j := 1; j <= 64; j <<= 1 { + b.Run(fmt.Sprintf("%d:%d", k, j), func(b *testing.B) { + for i := 0; i < b.N; i++ { + IndexAny(x[:k], cs[:j]) + } + }) + } + } +} + +func BenchmarkIndexAnyUTF8(b *testing.B) { + x := Repeat([]byte{'#'}, 2048) // Never matches set + cs := "你好世界, hello world. 你好世界, hello world. 你好世界, hello world." + for k := 1; k <= 2048; k <<= 4 { + for j := 1; j <= 64; j <<= 1 { b.Run(fmt.Sprintf("%d:%d", k, j), func(b *testing.B) { for i := 0; i < b.N; i++ { IndexAny(x[:k], cs[:j]) @@ -1903,6 +1921,34 @@ func BenchmarkIndexAnyASCII(b *testing.B) { } } +func BenchmarkLastIndexAnyASCII(b *testing.B) { + x := Repeat([]byte{'#'}, 2048) // Never matches set + cs := "0123456789abcdefghijklmnopqrstuvwxyz0123456789abcdefghijklmnopqrstuvwxyz" + for k := 1; k <= 2048; k <<= 4 { + for j := 1; j <= 64; j <<= 1 { + b.Run(fmt.Sprintf("%d:%d", k, j), func(b *testing.B) { + for i := 0; i < b.N; i++ { + LastIndexAny(x[:k], cs[:j]) + } + }) + } + } +} + +func BenchmarkLastIndexAnyUTF8(b *testing.B) { + x := Repeat([]byte{'#'}, 2048) // Never matches set + cs := "你好世界, hello world. 你好世界, hello world. 你好世界, hello world." + for k := 1; k <= 2048; k <<= 4 { + for j := 1; j <= 64; j <<= 1 { + b.Run(fmt.Sprintf("%d:%d", k, j), func(b *testing.B) { + for i := 0; i < b.N; i++ { + LastIndexAny(x[:k], cs[:j]) + } + }) + } + } +} + func BenchmarkTrimASCII(b *testing.B) { cs := "0123456789abcdef" for k := 1; k <= 4096; k <<= 4 { |
