aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorRémy Oudompheng <oudomphe@phare.normalesup.org>2013-02-17 13:07:17 +0100
committerRémy Oudompheng <oudomphe@phare.normalesup.org>2013-02-17 13:07:17 +0100
commit23093f86eebbefb0cf11298c45513da360d2b48d (patch)
treeaba59fceb7730d4b50d382118df22b4019468ee8 /src
parentf1037f0e86d6f02fd29fe859ff0b5ed2ded6ecf5 (diff)
downloadgo-23093f86eebbefb0cf11298c45513da360d2b48d.tar.xz
strings: better mean complexity for Count and Index.
The O(n+m) complexity is obtained probabilistically by using Rabin-Karp algorithm, which provides the needed complexity unless exceptional collisions occur, without memory allocation. benchmark old ns/op new ns/op delta BenchmarkIndexHard1 6532331 4045886 -38.06% BenchmarkIndexHard2 8178173 4038975 -50.61% BenchmarkIndexHard3 6973687 4042591 -42.03% BenchmarkCountHard1 6270864 4071090 -35.08% BenchmarkCountHard2 7838039 4072853 -48.04% BenchmarkCountHard3 6697828 4071964 -39.20% BenchmarkIndexTorture 2730546 28934 -98.94% BenchmarkCountTorture 2729622 29064 -98.94% Fixes #4600. R=rsc, donovanhide, remyoudompheng CC=golang-dev https://golang.org/cl/7314095
Diffstat (limited to 'src')
-rw-r--r--src/pkg/strings/strings.go66
-rw-r--r--src/pkg/strings/strings_test.go51
2 files changed, 109 insertions, 8 deletions
diff --git a/src/pkg/strings/strings.go b/src/pkg/strings/strings.go
index d4b3f03473..72b8d223af 100644
--- a/src/pkg/strings/strings.go
+++ b/src/pkg/strings/strings.go
@@ -36,15 +36,35 @@ func explode(s string, n int) []string {
return a
}
+// primeRK is the prime base used in Rabin-Karp algorithm.
+const primeRK = 16777619
+
+// hashstr returns the hash and the appropriate multiplicative
+// factor for use in Rabin-Karp algorithm.
+func hashstr(sep string) (uint32, uint32) {
+ hash := uint32(0)
+ for i := 0; i < len(sep); i++ {
+ hash = hash*primeRK + uint32(sep[i])
+
+ }
+ var pow, sq uint32 = 1, primeRK
+ for i := len(sep); i > 0; i >>= 1 {
+ if i&1 != 0 {
+ pow *= sq
+ }
+ sq *= sq
+ }
+ return hash, pow
+}
+
// Count counts the number of non-overlapping instances of sep in s.
func Count(s, sep string) int {
if sep == "" {
return utf8.RuneCountInString(s) + 1
}
c := sep[0]
- l := len(sep)
n := 0
- if l == 1 {
+ if len(sep) == 1 {
// special case worth making fast
for i := 0; i < len(s); i++ {
if s[i] == c {
@@ -53,11 +73,26 @@ func Count(s, sep string) int {
}
return n
}
- for i := 0; i+l <= len(s); i++ {
- if s[i] == c && s[i:i+l] == sep {
+ if len(sep) > len(s) {
+ return 0
+ }
+ hashsep, pow := hashstr(sep)
+ h := uint32(0)
+ for i := 0; i < len(sep); i++ {
+ h = h*primeRK + uint32(s[i])
+ }
+ lastmatch := 0
+ for i := len(sep); ; i++ {
+ // Invariant: h = hash(s[i-l : i])
+ if h == hashsep && lastmatch <= i-len(sep) && s[i-len(sep):i] == sep {
n++
- i += l - 1
+ lastmatch = i
+ }
+ if i >= len(s) {
+ break
}
+ h = h*primeRK + uint32(s[i])
+ h -= pow * uint32(s[i-len(sep)])
}
return n
}
@@ -94,10 +129,25 @@ func Index(s, sep string) int {
return -1
}
// n > 1
- for i := 0; i+n <= len(s); i++ {
- if s[i] == c && s[i:i+n] == sep {
- return i
+ if n > len(s) {
+ return -1
+ }
+ // Hash sep.
+ hashsep, pow := hashstr(sep)
+ var h uint32
+ for i := 0; i < n; i++ {
+ h = h*primeRK + uint32(s[i])
+ }
+ for i := n; ; i++ {
+ // Invariant: h = hash(s[i-n : i])
+ if h == hashsep && s[i-n:i] == sep {
+ return i - n
+ }
+ if i >= len(s) {
+ break
}
+ h = h*primeRK + uint32(s[i])
+ h -= pow * uint32(s[i-n])
}
return -1
}
diff --git a/src/pkg/strings/strings_test.go b/src/pkg/strings/strings_test.go
index e222af14a7..b5bdf35d15 100644
--- a/src/pkg/strings/strings_test.go
+++ b/src/pkg/strings/strings_test.go
@@ -1001,6 +1001,57 @@ func TestEqualFold(t *testing.T) {
}
}
+func makeBenchInputHard() string {
+ tokens := [...]string{
+ "<a>", "<p>", "<b>", "<strong>",
+ "</a>", "</p>", "</b>", "</strong>",
+ "hello", "world",
+ }
+ x := make([]byte, 0, 1<<20)
+ for len(x) < 1<<20 {
+ i := rand.Intn(len(tokens))
+ x = append(x, tokens[i]...)
+ }
+ return string(x)
+}
+
+var benchInputHard = makeBenchInputHard()
+
+func benchmarkIndexHard(b *testing.B, sep string) {
+ for i := 0; i < b.N; i++ {
+ Index(benchInputHard, sep)
+ }
+}
+
+func benchmarkCountHard(b *testing.B, sep string) {
+ for i := 0; i < b.N; i++ {
+ Count(benchInputHard, sep)
+ }
+}
+
+func BenchmarkIndexHard1(b *testing.B) { benchmarkIndexHard(b, "<>") }
+func BenchmarkIndexHard2(b *testing.B) { benchmarkIndexHard(b, "</pre>") }
+func BenchmarkIndexHard3(b *testing.B) { benchmarkIndexHard(b, "<b>hello world</b>") }
+
+func BenchmarkCountHard1(b *testing.B) { benchmarkCountHard(b, "<>") }
+func BenchmarkCountHard2(b *testing.B) { benchmarkCountHard(b, "</pre>") }
+func BenchmarkCountHard3(b *testing.B) { benchmarkCountHard(b, "<b>hello world</b>") }
+
+var benchInputTorture = Repeat("ABC", 1<<10) + "123" + Repeat("ABC", 1<<10)
+var benchNeedleTorture = Repeat("ABC", 1<<10+1)
+
+func BenchmarkIndexTorture(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ Index(benchInputTorture, benchNeedleTorture)
+ }
+}
+
+func BenchmarkCountTorture(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ Count(benchInputTorture, benchNeedleTorture)
+ }
+}
+
var makeFieldsInput = func() string {
x := make([]byte, 1<<20)
// Input is ~10% space, ~10% 2-byte UTF-8, rest ASCII non-space.