-rw-r--r--  src/index/suffixarray/gen64.go              31
-rw-r--r--  src/index/suffixarray/qsufsort.go           66
-rw-r--r--  src/index/suffixarray/qsufsort64.go        173
-rw-r--r--  src/index/suffixarray/suffixarray.go       132
-rw-r--r--  src/index/suffixarray/suffixarray_test.go  207
5 files changed, 525 insertions(+), 84 deletions(-)
diff --git a/src/index/suffixarray/gen64.go b/src/index/suffixarray/gen64.go
new file mode 100644
index 0000000000..4f0e35e227
--- /dev/null
+++ b/src/index/suffixarray/gen64.go
@@ -0,0 +1,31 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+// Gen64 generates qsufsort64.go from qsufsort.go by s/32/64/g.
+package main
+
+import (
+ "bytes"
+ "io/ioutil"
+ "log"
+)
+
+func main() {
+ log.SetPrefix("gen64: ")
+ log.SetFlags(0)
+
+ data, err := ioutil.ReadFile("qsufsort.go")
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ data = bytes.Replace(data, []byte("\n\n"), []byte("\n\n// Code generated by gen64.go; DO NOT EDIT.\n//go:generate go run gen64.go\n\n"), 1)
+ data = bytes.Replace(data, []byte("32"), []byte("64"), -1)
+
+ if err := ioutil.WriteFile("qsufsort64.go", data, 0666); err != nil {
+ log.Fatal(err)
+ }
+}
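
Note: the generator depends on qsufsort.go being written so that a blind s/32/64/g is safe — every literal "32" in that file belongs to an identifier or type with a 64-bit counterpart. A minimal standalone sketch of the same two bytes.Replace steps (the input literal below is hypothetical; the real input is qsufsort.go):

	package main

	import (
		"bytes"
		"fmt"
	)

	func main() {
		src := []byte("// Copyright ...\n\nfunc qsufsort32(data []byte) []int32 { /* ... */ }\n")
		// The first blank line follows the copyright header, so the
		// "DO NOT EDIT" banner is spliced in right after it.
		src = bytes.Replace(src, []byte("\n\n"),
			[]byte("\n\n// Code generated by gen64.go; DO NOT EDIT.\n//go:generate go run gen64.go\n\n"), 1)
		// Then every "32" becomes "64": qsufsort32 -> qsufsort64, int32 -> int64, etc.
		src = bytes.Replace(src, []byte("32"), []byte("64"), -1)
		fmt.Printf("%s", src)
	}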
diff --git a/src/index/suffixarray/qsufsort.go b/src/index/suffixarray/qsufsort.go
index 9c36a98f82..71aae02c0a 100644
--- a/src/index/suffixarray/qsufsort.go
+++ b/src/index/suffixarray/qsufsort.go
@@ -24,26 +24,28 @@
package suffixarray
-import "sort"
+import (
+ "sort"
+)
-func qsufsort(data []byte) []int {
+func qsufsort32(data []byte) []int32 {
// initial sorting by first byte of suffix
- sa := sortedByFirstByte(data)
+ sa := sortedByFirstByte32(data)
if len(sa) < 2 {
return sa
}
// initialize the group lookup table
// this becomes the inverse of the suffix array when all groups are sorted
- inv := initGroups(sa, data)
+ inv := initGroups32(sa, data)
// the index starts 1-ordered
- sufSortable := &suffixSortable{sa: sa, inv: inv, h: 1}
+ sufSortable := &suffixSortable32{sa: sa, inv: inv, h: 1}
- for sa[0] > -len(sa) { // until all suffixes are one big sorted group
+ for sa[0] > -int32(len(sa)) { // until all suffixes are one big sorted group
// The suffixes are h-ordered, make them 2*h-ordered
- pi := 0 // pi is first position of first group
- sl := 0 // sl is negated length of sorted groups
- for pi < len(sa) {
+ pi := int32(0) // pi is first position of first group
+ sl := int32(0) // sl is negated length of sorted groups
+ for pi < int32(len(sa)) {
if s := sa[pi]; s < 0 { // if pi starts sorted group
pi -= s // skip over sorted group
sl += s // add negated length to sl
@@ -67,12 +69,12 @@ func qsufsort(data []byte) []int {
}
for i := range sa { // reconstruct suffix array from inverse
- sa[inv[i]] = i
+ sa[inv[i]] = int32(i)
}
return sa
}
-func sortedByFirstByte(data []byte) []int {
+func sortedByFirstByte32(data []byte) []int32 {
// total byte counts
var count [256]int
for _, b := range data {
@@ -84,20 +86,20 @@ func sortedByFirstByte(data []byte) []int {
count[b], sum = sum, count[b]+sum
}
// iterate through bytes, placing index into the correct spot in sa
- sa := make([]int, len(data))
+ sa := make([]int32, len(data))
for i, b := range data {
- sa[count[b]] = i
+ sa[count[b]] = int32(i)
count[b]++
}
return sa
}
-func initGroups(sa []int, data []byte) []int {
+func initGroups32(sa []int32, data []byte) []int32 {
// label contiguous same-letter groups with the same group number
- inv := make([]int, len(data))
- prevGroup := len(sa) - 1
+ inv := make([]int32, len(data))
+ prevGroup := int32(len(sa)) - 1
groupByte := data[sa[prevGroup]]
- for i := len(sa) - 1; i >= 0; i-- {
+ for i := int32(len(sa)) - 1; i >= 0; i-- {
if b := data[sa[i]]; b < groupByte {
if prevGroup == i+1 {
sa[i+1] = -1
@@ -114,13 +116,13 @@ func initGroups(sa []int, data []byte) []int {
// This is necessary to ensure the suffix "a" is before "aba"
// when using a potentially unstable sort.
lastByte := data[len(data)-1]
- s := -1
+ s := int32(-1)
for i := range sa {
if sa[i] >= 0 {
if data[sa[i]] == lastByte && s == -1 {
- s = i
+ s = int32(i)
}
- if sa[i] == len(sa)-1 {
+ if sa[i] == int32(len(sa))-1 {
sa[i], sa[s] = sa[s], sa[i]
inv[sa[s]] = s
sa[s] = -1 // mark it as an isolated sorted group
@@ -131,31 +133,31 @@ func initGroups(sa []int, data []byte) []int {
return inv
}
-type suffixSortable struct {
- sa []int
- inv []int
- h int
- buf []int // common scratch space
+type suffixSortable32 struct {
+ sa []int32
+ inv []int32
+ h int32
+ buf []int32 // common scratch space
}
-func (x *suffixSortable) Len() int { return len(x.sa) }
-func (x *suffixSortable) Less(i, j int) bool { return x.inv[x.sa[i]+x.h] < x.inv[x.sa[j]+x.h] }
-func (x *suffixSortable) Swap(i, j int) { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] }
+func (x *suffixSortable32) Len() int { return len(x.sa) }
+func (x *suffixSortable32) Less(i, j int) bool { return x.inv[x.sa[i]+x.h] < x.inv[x.sa[j]+x.h] }
+func (x *suffixSortable32) Swap(i, j int) { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] }
-func (x *suffixSortable) updateGroups(offset int) {
+func (x *suffixSortable32) updateGroups(offset int32) {
bounds := x.buf[0:0]
group := x.inv[x.sa[0]+x.h]
for i := 1; i < len(x.sa); i++ {
if g := x.inv[x.sa[i]+x.h]; g > group {
- bounds = append(bounds, i)
+ bounds = append(bounds, int32(i))
group = g
}
}
- bounds = append(bounds, len(x.sa))
+ bounds = append(bounds, int32(len(x.sa)))
x.buf = bounds
// update the group numberings after all new groups are determined
- prev := 0
+ prev := int32(0)
for _, b := range bounds {
for i := prev; i < b; i++ {
x.inv[x.sa[i]] = offset + b - 1
diff --git a/src/index/suffixarray/qsufsort64.go b/src/index/suffixarray/qsufsort64.go
new file mode 100644
index 0000000000..907d6e3726
--- /dev/null
+++ b/src/index/suffixarray/qsufsort64.go
@@ -0,0 +1,173 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Code generated by gen64.go; DO NOT EDIT.
+//go:generate go run gen64.go
+
+// This algorithm is based on "Faster Suffix Sorting"
+// by N. Jesper Larsson and Kunihiko Sadakane
+// paper: http://www.larsson.dogma.net/ssrev-tr.pdf
+// code: http://www.larsson.dogma.net/qsufsort.c
+
+// This algorithm computes the suffix array sa by computing its inverse.
+// Consecutive groups of suffixes in sa are labeled as sorted groups or
+// unsorted groups. For a given pass of the sorter, all suffixes are ordered
+// up to their first h characters, and sa is h-ordered. Suffixes in their
+// final positions and unambiguously sorted in h-order are in a sorted group.
+// Consecutive groups of suffixes with identical first h characters are an
+// unsorted group. In each pass of the algorithm, unsorted groups are sorted
+// according to the group number of their following suffix.
+
+// In the implementation, if sa[i] is negative, it indicates that i is
+// the first element of a sorted group of length -sa[i], and can be skipped.
+// An unsorted group sa[i:k] is given the group number of the index of its
+// last element, k-1. The group numbers are stored in the inverse slice (inv),
+// and when all groups are sorted, this slice is the inverse suffix array.
+
+package suffixarray
+
+import (
+ "sort"
+)
+
+func qsufsort64(data []byte) []int64 {
+ // initial sorting by first byte of suffix
+ sa := sortedByFirstByte64(data)
+ if len(sa) < 2 {
+ return sa
+ }
+ // initialize the group lookup table
+ // this becomes the inverse of the suffix array when all groups are sorted
+ inv := initGroups64(sa, data)
+
+ // the index starts 1-ordered
+ sufSortable := &suffixSortable64{sa: sa, inv: inv, h: 1}
+
+ for sa[0] > -int64(len(sa)) { // until all suffixes are one big sorted group
+ // The suffixes are h-ordered, make them 2*h-ordered
+ pi := int64(0) // pi is first position of first group
+ sl := int64(0) // sl is negated length of sorted groups
+ for pi < int64(len(sa)) {
+ if s := sa[pi]; s < 0 { // if pi starts sorted group
+ pi -= s // skip over sorted group
+ sl += s // add negated length to sl
+ } else { // if pi starts unsorted group
+ if sl != 0 {
+ sa[pi+sl] = sl // combine sorted groups before pi
+ sl = 0
+ }
+ pk := inv[s] + 1 // pk-1 is last position of unsorted group
+ sufSortable.sa = sa[pi:pk]
+ sort.Sort(sufSortable)
+ sufSortable.updateGroups(pi)
+ pi = pk // next group
+ }
+ }
+ if sl != 0 { // if the array ends with a sorted group
+ sa[pi+sl] = sl // combine sorted groups at end of sa
+ }
+
+ sufSortable.h *= 2 // double sorted depth
+ }
+
+ for i := range sa { // reconstruct suffix array from inverse
+ sa[inv[i]] = int64(i)
+ }
+ return sa
+}
+
+func sortedByFirstByte64(data []byte) []int64 {
+ // total byte counts
+ var count [256]int
+ for _, b := range data {
+ count[b]++
+ }
+ // make count[b] equal index of first occurrence of b in sorted array
+ sum := 0
+ for b := range count {
+ count[b], sum = sum, count[b]+sum
+ }
+ // iterate through bytes, placing index into the correct spot in sa
+ sa := make([]int64, len(data))
+ for i, b := range data {
+ sa[count[b]] = int64(i)
+ count[b]++
+ }
+ return sa
+}
+
+func initGroups64(sa []int64, data []byte) []int64 {
+ // label contiguous same-letter groups with the same group number
+ inv := make([]int64, len(data))
+ prevGroup := int64(len(sa)) - 1
+ groupByte := data[sa[prevGroup]]
+ for i := int64(len(sa)) - 1; i >= 0; i-- {
+ if b := data[sa[i]]; b < groupByte {
+ if prevGroup == i+1 {
+ sa[i+1] = -1
+ }
+ groupByte = b
+ prevGroup = i
+ }
+ inv[sa[i]] = prevGroup
+ if prevGroup == 0 {
+ sa[0] = -1
+ }
+ }
+ // Separate out the final suffix to the start of its group.
+ // This is necessary to ensure the suffix "a" is before "aba"
+ // when using a potentially unstable sort.
+ lastByte := data[len(data)-1]
+ s := int64(-1)
+ for i := range sa {
+ if sa[i] >= 0 {
+ if data[sa[i]] == lastByte && s == -1 {
+ s = int64(i)
+ }
+ if sa[i] == int64(len(sa))-1 {
+ sa[i], sa[s] = sa[s], sa[i]
+ inv[sa[s]] = s
+ sa[s] = -1 // mark it as an isolated sorted group
+ break
+ }
+ }
+ }
+ return inv
+}
+
+type suffixSortable64 struct {
+ sa []int64
+ inv []int64
+ h int64
+ buf []int64 // common scratch space
+}
+
+func (x *suffixSortable64) Len() int { return len(x.sa) }
+func (x *suffixSortable64) Less(i, j int) bool { return x.inv[x.sa[i]+x.h] < x.inv[x.sa[j]+x.h] }
+func (x *suffixSortable64) Swap(i, j int) { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] }
+
+func (x *suffixSortable64) updateGroups(offset int64) {
+ bounds := x.buf[0:0]
+ group := x.inv[x.sa[0]+x.h]
+ for i := 1; i < len(x.sa); i++ {
+ if g := x.inv[x.sa[i]+x.h]; g > group {
+ bounds = append(bounds, int64(i))
+ group = g
+ }
+ }
+ bounds = append(bounds, int64(len(x.sa)))
+ x.buf = bounds
+
+ // update the group numberings after all new groups are determined
+ prev := int64(0)
+ for _, b := range bounds {
+ for i := prev; i < b; i++ {
+ x.inv[x.sa[i]] = offset + b - 1
+ }
+ if b-prev == 1 {
+ x.sa[prev] = -1
+ }
+ prev = b
+ }
+}
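
Once the outer loop ends, every group is sorted and the final loop inverts inv back into sa, so sa[i] is the start offset of the i'th-smallest suffix. A hedged sketch of calling the package-internal sorter (from inside the package; the 32-bit variant behaves identically):

	sa := qsufsort64([]byte("banana"))
	// sorted suffixes: "a"(5), "ana"(3), "anana"(1), "banana"(0), "na"(4), "nana"(2)
	fmt.Println(sa) // [5 3 1 0 4 2]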
diff --git a/src/index/suffixarray/suffixarray.go b/src/index/suffixarray/suffixarray.go
index 0961ac4fb2..339643db4d 100644
--- a/src/index/suffixarray/suffixarray.go
+++ b/src/index/suffixarray/suffixarray.go
@@ -19,21 +19,68 @@ package suffixarray
import (
"bytes"
"encoding/binary"
+ "errors"
"io"
+ "math"
"regexp"
"sort"
)
+// Can change for testing
+var maxData32 int = realMaxData32
+
+const realMaxData32 = math.MaxInt32
+
// Index implements a suffix array for fast substring search.
type Index struct {
data []byte
- sa []int // suffix array for data; len(sa) == len(data)
+ sa ints // suffix array for data; sa.len() == len(data)
+}
+
+// An ints is either an []int32 or an []int64.
+// That is, one of them is empty, and one is the real data.
+// The int64 form is used when len(data) > maxData32
+type ints struct {
+ int32 []int32
+ int64 []int64
+}
+
+func (a *ints) len() int {
+ return len(a.int32) + len(a.int64)
+}
+
+func (a *ints) get(i int) int64 {
+ if a.int32 != nil {
+ return int64(a.int32[i])
+ }
+ return a.int64[i]
+}
+
+func (a *ints) set(i int, v int64) {
+ if a.int32 != nil {
+ a.int32[i] = int32(v)
+ } else {
+ a.int64[i] = v
+ }
+}
+
+func (a *ints) slice(i, j int) ints {
+ if a.int32 != nil {
+ return ints{a.int32[i:j], nil}
+ }
+ return ints{nil, a.int64[i:j]}
}
// New creates a new Index for data.
// Index creation time is O(N*log(N)) for N = len(data).
func New(data []byte) *Index {
- return &Index{data, qsufsort(data)}
+ ix := &Index{data: data}
+ if len(data) <= maxData32 {
+ ix.sa.int32 = qsufsort32(data)
+ } else {
+ ix.sa.int64 = qsufsort64(data)
+ }
+ return ix
}
// writeInt writes an int x to w using buf to buffer the write.
@@ -44,19 +91,20 @@ func writeInt(w io.Writer, buf []byte, x int) error {
}
// readInt reads an int x from r using buf to buffer the read and returns x.
-func readInt(r io.Reader, buf []byte) (int, error) {
+func readInt(r io.Reader, buf []byte) (int64, error) {
_, err := io.ReadFull(r, buf[0:binary.MaxVarintLen64]) // ok to continue with error
x, _ := binary.Varint(buf)
- return int(x), err
+ return x, err
}
// writeSlice writes data[:n] to w and returns n.
// It uses buf to buffer the write.
-func writeSlice(w io.Writer, buf []byte, data []int) (n int, err error) {
+func writeSlice(w io.Writer, buf []byte, data ints) (n int, err error) {
// encode as many elements as fit into buf
p := binary.MaxVarintLen64
- for ; n < len(data) && p+binary.MaxVarintLen64 <= len(buf); n++ {
- p += binary.PutUvarint(buf[p:], uint64(data[n]))
+ m := data.len()
+ for ; n < m && p+binary.MaxVarintLen64 <= len(buf); n++ {
+ p += binary.PutUvarint(buf[p:], uint64(data.get(n)))
}
// update buffer size
@@ -67,15 +115,22 @@ func writeSlice(w io.Writer, buf []byte, data []int) (n int, err error) {
return
}
+var errTooBig = errors.New("suffixarray: data too large")
+
// readSlice reads data[:n] from r and returns n.
// It uses buf to buffer the read.
-func readSlice(r io.Reader, buf []byte, data []int) (n int, err error) {
+func readSlice(r io.Reader, buf []byte, data ints) (n int, err error) {
// read buffer size
- var size int
- size, err = readInt(r, buf)
+ var size64 int64
+ size64, err = readInt(r, buf)
if err != nil {
return
}
+ if int64(int(size64)) != size64 || int(size64) < 0 {
+ // We never write chunks this big anyway.
+ return 0, errTooBig
+ }
+ size := int(size64)
// read buffer w/o the size
if _, err = io.ReadFull(r, buf[binary.MaxVarintLen64:size]); err != nil {
@@ -85,7 +140,7 @@ func readSlice(r io.Reader, buf []byte, data []int) (n int, err error) {
// decode as many elements as present in buf
for p := binary.MaxVarintLen64; p < size; n++ {
x, w := binary.Uvarint(buf[p:])
- data[n] = int(x)
+ data.set(n, int64(x))
p += w
}
@@ -100,21 +155,31 @@ func (x *Index) Read(r io.Reader) error {
buf := make([]byte, bufSize)
// read length
- n, err := readInt(r, buf)
+ n64, err := readInt(r, buf)
if err != nil {
return err
}
+ if int64(int(n64)) != n64 || int(n64) < 0 {
+ return errTooBig
+ }
+ n := int(n64)
// allocate space
- if 2*n < cap(x.data) || cap(x.data) < n {
+ if 2*n < cap(x.data) || cap(x.data) < n || x.sa.int32 != nil && n > maxData32 || x.sa.int64 != nil && n <= maxData32 {
// new data is significantly smaller or larger than
// existing buffers - allocate new ones
x.data = make([]byte, n)
- x.sa = make([]int, n)
+ x.sa.int32 = nil
+ x.sa.int64 = nil
+ if n <= maxData32 {
+ x.sa.int32 = make([]int32, n)
+ } else {
+ x.sa.int64 = make([]int64, n)
+ }
} else {
// re-use existing buffers
x.data = x.data[0:n]
- x.sa = x.sa[0:n]
+ x.sa = x.sa.slice(0, n)
}
// read data
@@ -123,12 +188,13 @@ func (x *Index) Read(r io.Reader) error {
}
// read index
- for sa := x.sa; len(sa) > 0; {
+ sa := x.sa
+ for sa.len() > 0 {
n, err := readSlice(r, buf, sa)
if err != nil {
return err
}
- sa = sa[n:]
+ sa = sa.slice(n, sa.len())
}
return nil
}
@@ -149,12 +215,13 @@ func (x *Index) Write(w io.Writer) error {
}
// write index
- for sa := x.sa; len(sa) > 0; {
+ sa := x.sa
+ for sa.len() > 0 {
n, err := writeSlice(w, buf, sa)
if err != nil {
return err
}
- sa = sa[n:]
+ sa = sa.slice(n, sa.len())
}
return nil
}
@@ -167,18 +234,18 @@ func (x *Index) Bytes() []byte {
}
func (x *Index) at(i int) []byte {
- return x.data[x.sa[i]:]
+ return x.data[x.sa.get(i):]
}
// lookupAll returns a slice into the matching region of the index.
// The runtime is O(log(N)*len(s)).
-func (x *Index) lookupAll(s []byte) []int {
+func (x *Index) lookupAll(s []byte) ints {
// find matching suffix index range [i:j]
// find the first index where s would be the prefix
- i := sort.Search(len(x.sa), func(i int) bool { return bytes.Compare(x.at(i), s) >= 0 })
+ i := sort.Search(x.sa.len(), func(i int) bool { return bytes.Compare(x.at(i), s) >= 0 })
// starting at i, find the first index at which s is not a prefix
- j := i + sort.Search(len(x.sa)-i, func(j int) bool { return !bytes.HasPrefix(x.at(j+i), s) })
- return x.sa[i:j]
+ j := i + sort.Search(x.sa.len()-i, func(j int) bool { return !bytes.HasPrefix(x.at(j+i), s) })
+ return x.sa.slice(i, j)
}
// Lookup returns an unsorted list of at most n indices where the byte string s
@@ -190,13 +257,22 @@ func (x *Index) lookupAll(s []byte) []int {
func (x *Index) Lookup(s []byte, n int) (result []int) {
if len(s) > 0 && n != 0 {
matches := x.lookupAll(s)
- if n < 0 || len(matches) < n {
- n = len(matches)
+ count := matches.len()
+ if n < 0 || count < n {
+ n = count
}
- // 0 <= n <= len(matches)
+ // 0 <= n <= count
if n > 0 {
result = make([]int, n)
- copy(result, matches)
+ if matches.int32 != nil {
+ for i := range result {
+ result[i] = int(matches.int32[i])
+ }
+ } else {
+ for i := range result {
+ result[i] = int(matches.int64[i])
+ }
+ }
}
}
return
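
The int32/int64 split stays invisible at the exported API: Lookup copies whichever backing slice is live into a plain []int, so callers are unaffected. For example:

	x := suffixarray.New([]byte("banana"))
	offsets := x.Lookup([]byte("ana"), -1) // all matches, unsorted; contains 1 and 3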
diff --git a/src/index/suffixarray/suffixarray_test.go b/src/index/suffixarray/suffixarray_test.go
index 644f00c757..3cb8450105 100644
--- a/src/index/suffixarray/suffixarray_test.go
+++ b/src/index/suffixarray/suffixarray_test.go
@@ -6,7 +6,11 @@ package suffixarray
import (
"bytes"
+ "fmt"
+ "io/ioutil"
"math/rand"
+ "os"
+ "path/filepath"
"regexp"
"sort"
"strings"
@@ -207,10 +211,19 @@ func testLookups(t *testing.T, tc *testCase, x *Index, n int) {
// index is used to hide the sort.Interface
type index Index
-func (x *index) Len() int { return len(x.sa) }
+func (x *index) Len() int { return x.sa.len() }
func (x *index) Less(i, j int) bool { return bytes.Compare(x.at(i), x.at(j)) < 0 }
-func (x *index) Swap(i, j int) { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] }
-func (a *index) at(i int) []byte { return a.data[a.sa[i]:] }
+func (x *index) Swap(i, j int) {
+ if x.sa.int32 != nil {
+ x.sa.int32[i], x.sa.int32[j] = x.sa.int32[j], x.sa.int32[i]
+ } else {
+ x.sa.int64[i], x.sa.int64[j] = x.sa.int64[j], x.sa.int64[i]
+ }
+}
+
+func (x *index) at(i int) []byte {
+ return x.data[x.sa.get(i):]
+}
func testConstruction(t *testing.T, tc *testCase, x *Index) {
if !sort.IsSorted((*index)(x)) {
@@ -222,8 +235,12 @@ func equal(x, y *Index) bool {
if !bytes.Equal(x.data, y.data) {
return false
}
- for i, j := range x.sa {
- if j != y.sa[i] {
+ if x.sa.len() != y.sa.len() {
+ return false
+ }
+ n := x.sa.len()
+ for i := 0; i < n; i++ {
+ if x.sa.get(i) != y.sa.get(i) {
return false
}
}
@@ -238,16 +255,41 @@ func testSaveRestore(t *testing.T, tc *testCase, x *Index) int {
}
size := buf.Len()
var y Index
- if err := y.Read(&buf); err != nil {
+ if err := y.Read(bytes.NewReader(buf.Bytes())); err != nil {
+ t.Errorf("failed reading index %s (%s)", tc.name, err)
+ }
+ if !equal(x, &y) {
+ t.Errorf("restored index doesn't match saved index %s", tc.name)
+ }
+
+ old := maxData32
+ defer func() {
+ maxData32 = old
+ }()
+ // Reread as forced 32.
+ y = Index{}
+ maxData32 = realMaxData32
+ if err := y.Read(bytes.NewReader(buf.Bytes())); err != nil {
+ t.Errorf("failed reading index %s (%s)", tc.name, err)
+ }
+ if !equal(x, &y) {
+ t.Errorf("restored index doesn't match saved index %s", tc.name)
+ }
+
+ // Reread as forced 64.
+ y = Index{}
+ maxData32 = -1
+ if err := y.Read(bytes.NewReader(buf.Bytes())); err != nil {
t.Errorf("failed reading index %s (%s)", tc.name, err)
}
if !equal(x, &y) {
t.Errorf("restored index doesn't match saved index %s", tc.name)
}
+
return size
}
-func TestIndex(t *testing.T) {
+func testIndex(t *testing.T) {
for _, tc := range testCases {
x := New([]byte(tc.source))
testConstruction(t, &tc, x)
@@ -260,45 +302,162 @@ func TestIndex(t *testing.T) {
}
}
+func TestIndex32(t *testing.T) {
+ testIndex(t)
+}
+
+func TestIndex64(t *testing.T) {
+ maxData32 = -1
+ defer func() {
+ maxData32 = realMaxData32
+ }()
+ testIndex(t)
+}
+
+var (
+ benchdata = make([]byte, 1e6)
+ benchrand = make([]byte, 1e6)
+)
+
// Of all possible inputs, the random bytes have the least amount of substring
// repetition, and the repeated bytes have the most. For most algorithms,
// the running time of every input will be between these two.
func benchmarkNew(b *testing.B, random bool) {
+ b.ReportAllocs()
b.StopTimer()
- data := make([]byte, 1e6)
+ data := benchdata
if random {
- for i := range data {
- data[i] = byte(rand.Intn(256))
+ data = benchrand
+ if data[0] == 0 {
+ for i := range data {
+ data[i] = byte(rand.Intn(256))
+ }
}
}
b.StartTimer()
+ b.SetBytes(int64(len(data)))
for i := 0; i < b.N; i++ {
New(data)
}
}
-func BenchmarkNewIndexRandom(b *testing.B) {
- benchmarkNew(b, true)
+func makeText(name string) ([]byte, error) {
+ var data []byte
+ switch name {
+ case "opticks":
+ var err error
+ data, err = ioutil.ReadFile("../../testdata/Isaac.Newton-Opticks.txt")
+ if err != nil {
+ return nil, err
+ }
+ case "go":
+ err := filepath.Walk("../..", func(path string, info os.FileInfo, err error) error {
+ if err == nil && strings.HasSuffix(path, ".go") && !info.IsDir() {
+ file, err := ioutil.ReadFile(path)
+ if err != nil {
+ return err
+ }
+ data = append(data, file...)
+ }
+ return nil
+ })
+ if err != nil {
+ return nil, err
+ }
+ case "zero":
+ data = make([]byte, 50e6)
+ case "rand":
+ data = make([]byte, 50e6)
+ for i := range data {
+ data[i] = byte(rand.Intn(256))
+ }
+ }
+ return data, nil
}
-func BenchmarkNewIndexRepeat(b *testing.B) {
- benchmarkNew(b, false)
+
+func setBits(bits int) (cleanup func()) {
+ if bits == 32 {
+ maxData32 = realMaxData32
+ } else {
+ maxData32 = -1 // force use of 64-bit code
+ }
+ return func() {
+ maxData32 = realMaxData32
+ }
+}
+
+func BenchmarkNew(b *testing.B) {
+ for _, text := range []string{"opticks", "go", "zero", "rand"} {
+ b.Run("text="+text, func(b *testing.B) {
+ data, err := makeText(text)
+ if err != nil {
+ b.Fatal(err)
+ }
+ if testing.Short() && len(data) > 5e6 {
+ data = data[:5e6]
+ }
+ for _, size := range []int{100e3, 500e3, 1e6, 5e6, 10e6, 50e6} {
+ if len(data) < size {
+ continue
+ }
+ data := data[:size]
+ name := fmt.Sprintf("%dK", size/1e3)
+ if size >= 1e6 {
+ name = fmt.Sprintf("%dM", size/1e6)
+ }
+ b.Run("size="+name, func(b *testing.B) {
+ for _, bits := range []int{32, 64} {
+ if ^uint(0) == 0xffffffff && bits == 64 {
+ continue
+ }
+ b.Run(fmt.Sprintf("bits=%d", bits), func(b *testing.B) {
+ cleanup := setBits(bits)
+ defer cleanup()
+
+ b.SetBytes(int64(len(data)))
+ b.ReportAllocs()
+ for i := 0; i < b.N; i++ {
+ New(data)
+ }
+ })
+ }
+ })
+ }
+ })
+ }
}
func BenchmarkSaveRestore(b *testing.B) {
- b.StopTimer()
r := rand.New(rand.NewSource(0x5a77a1)) // guarantee always same sequence
data := make([]byte, 1<<20) // 1MB of data to index
for i := range data {
data[i] = byte(r.Intn(256))
}
- x := New(data)
- size := testSaveRestore(nil, nil, x) // verify correctness
- buf := bytes.NewBuffer(make([]byte, size)) // avoid growing
- b.SetBytes(int64(size))
- b.StartTimer()
- for i := 0; i < b.N; i++ {
- x.Write(buf)
- var y Index
- y.Read(buf)
+ for _, bits := range []int{32, 64} {
+ if ^uint(0) == 0xffffffff && bits == 64 {
+ continue
+ }
+ b.Run(fmt.Sprintf("bits=%d", bits), func(b *testing.B) {
+ cleanup := setBits(bits)
+ defer cleanup()
+
+ b.StopTimer()
+ x := New(data)
+ size := testSaveRestore(nil, nil, x) // verify correctness
+ buf := bytes.NewBuffer(make([]byte, size)) // avoid growing
+ b.SetBytes(int64(size))
+ b.StartTimer()
+ b.ReportAllocs()
+ for i := 0; i < b.N; i++ {
+ buf.Reset()
+ if err := x.Write(buf); err != nil {
+ b.Fatal(err)
+ }
+ var y Index
+ if err := y.Read(buf); err != nil {
+ b.Fatal(err)
+ }
+ }
+ })
}
}
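
For completeness, a round trip through the exported serialization API: Write records the element count up front, so Read can choose the 32- or 64-bit representation on its own. A short sketch, where data stands for any []byte:

	var buf bytes.Buffer
	x := suffixarray.New(data)
	if err := x.Write(&buf); err != nil {
		log.Fatal(err)
	}
	var y suffixarray.Index
	if err := y.Read(&buf); err != nil {
		log.Fatal(err)
	}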