aboutsummaryrefslogtreecommitdiff
path: root/src/pkg/index/suffixarray/suffixarray.go
diff options
context:
space:
mode:
authorEric Eisner <eric.d.eisner@gmail.com>2011-01-11 21:46:50 -0800
committerRobert Griesemer <gri@golang.org>2011-01-11 21:46:50 -0800
commite3c9565b438cb2eb4be6c2bd57696e378f754e46 (patch)
treef603c18865447d3291e3fc316265057e9841434b /src/pkg/index/suffixarray/suffixarray.go
parent6f62e407733cb8b340e75bc91bc2633976675d9d (diff)
downloadgo-e3c9565b438cb2eb4be6c2bd57696e378f754e46.tar.xz
suffixarray: faster creation algorithm
This implements the algorithm qsufsort using the sort package as a sorting primitive. Its worst-case performance is O(N*log(N)), and it uses only an additional slice of N ints of memory during creation. Benchmarks (seconds): old new 10k nulls 149 0.044 1M English corpus 32.0 3.6 R=gri, gri1 CC=golang-dev https://golang.org/cl/3752044
Diffstat (limited to 'src/pkg/index/suffixarray/suffixarray.go')
-rw-r--r--src/pkg/index/suffixarray/suffixarray.go25
1 files changed, 2 insertions, 23 deletions
diff --git a/src/pkg/index/suffixarray/suffixarray.go b/src/pkg/index/suffixarray/suffixarray.go
index 88cf925fc2..628e000e1d 100644
--- a/src/pkg/index/suffixarray/suffixarray.go
+++ b/src/pkg/index/suffixarray/suffixarray.go
@@ -22,11 +22,6 @@ import (
"sort"
)
-// BUG(gri): For larger data (10MB) which contains very long (say 100000)
-// contiguous sequences of identical bytes, index creation time will be extremely slow.
-
-// TODO(gri): Use a more sophisticated algorithm to create the suffix array.
-
// Index implements a suffix array for fast substring search.
type Index struct {
@@ -36,16 +31,9 @@ type Index struct {
// New creates a new Index for data.
-// Index creation time is approximately O(N*log(N)) for N = len(data).
-//
+// Index creation time is O(N*log(N)) for N = len(data).
func New(data []byte) *Index {
- sa := make([]int, len(data))
- for i := range sa {
- sa[i] = i
- }
- x := &Index{data, sa}
- sort.Sort((*index)(x))
- return x
+ return &Index{data, qsufsort(data)}
}
@@ -192,12 +180,3 @@ func (x *Index) FindAllIndex(r *regexp.Regexp, n int) (result [][]int) {
}
return
}
-
-
-// index is used to hide the sort.Interface
-type index Index
-
-func (x *index) Len() int { return len(x.sa) }
-func (x *index) Less(i, j int) bool { return bytes.Compare(x.at(i), x.at(j)) < 0 }
-func (x *index) Swap(i, j int) { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] }
-func (a *index) at(i int) []byte { return a.data[a.sa[i]:] }