diff options
Diffstat (limited to 'src/index/suffixarray/suffixarray_test.go')
| -rw-r--r-- | src/index/suffixarray/suffixarray_test.go | 207 |
1 files changed, 183 insertions, 24 deletions
diff --git a/src/index/suffixarray/suffixarray_test.go b/src/index/suffixarray/suffixarray_test.go index 644f00c757..3cb8450105 100644 --- a/src/index/suffixarray/suffixarray_test.go +++ b/src/index/suffixarray/suffixarray_test.go @@ -6,7 +6,11 @@ package suffixarray import ( "bytes" + "fmt" + "io/ioutil" "math/rand" + "os" + "path/filepath" "regexp" "sort" "strings" @@ -207,10 +211,19 @@ func testLookups(t *testing.T, tc *testCase, x *Index, n int) { // index is used to hide the sort.Interface type index Index -func (x *index) Len() int { return len(x.sa) } +func (x *index) Len() int { return x.sa.len() } func (x *index) Less(i, j int) bool { return bytes.Compare(x.at(i), x.at(j)) < 0 } -func (x *index) Swap(i, j int) { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] } -func (a *index) at(i int) []byte { return a.data[a.sa[i]:] } +func (x *index) Swap(i, j int) { + if x.sa.int32 != nil { + x.sa.int32[i], x.sa.int32[j] = x.sa.int32[j], x.sa.int32[i] + } else { + x.sa.int64[i], x.sa.int64[j] = x.sa.int64[j], x.sa.int64[i] + } +} + +func (x *index) at(i int) []byte { + return x.data[x.sa.get(i):] +} func testConstruction(t *testing.T, tc *testCase, x *Index) { if !sort.IsSorted((*index)(x)) { @@ -222,8 +235,12 @@ func equal(x, y *Index) bool { if !bytes.Equal(x.data, y.data) { return false } - for i, j := range x.sa { - if j != y.sa[i] { + if x.sa.len() != y.sa.len() { + return false + } + n := x.sa.len() + for i := 0; i < n; i++ { + if x.sa.get(i) != y.sa.get(i) { return false } } @@ -238,16 +255,41 @@ func testSaveRestore(t *testing.T, tc *testCase, x *Index) int { } size := buf.Len() var y Index - if err := y.Read(&buf); err != nil { + if err := y.Read(bytes.NewReader(buf.Bytes())); err != nil { + t.Errorf("failed reading index %s (%s)", tc.name, err) + } + if !equal(x, &y) { + t.Errorf("restored index doesn't match saved index %s", tc.name) + } + + old := maxData32 + defer func() { + maxData32 = old + }() + // Reread as forced 32. + y = Index{} + maxData32 = realMaxData32 + if err := y.Read(bytes.NewReader(buf.Bytes())); err != nil { + t.Errorf("failed reading index %s (%s)", tc.name, err) + } + if !equal(x, &y) { + t.Errorf("restored index doesn't match saved index %s", tc.name) + } + + // Reread as forced 64. + y = Index{} + maxData32 = -1 + if err := y.Read(bytes.NewReader(buf.Bytes())); err != nil { t.Errorf("failed reading index %s (%s)", tc.name, err) } if !equal(x, &y) { t.Errorf("restored index doesn't match saved index %s", tc.name) } + return size } -func TestIndex(t *testing.T) { +func testIndex(t *testing.T) { for _, tc := range testCases { x := New([]byte(tc.source)) testConstruction(t, &tc, x) @@ -260,45 +302,162 @@ func TestIndex(t *testing.T) { } } +func TestIndex32(t *testing.T) { + testIndex(t) +} + +func TestIndex64(t *testing.T) { + maxData32 = -1 + defer func() { + maxData32 = realMaxData32 + }() + testIndex(t) +} + +var ( + benchdata = make([]byte, 1e6) + benchrand = make([]byte, 1e6) +) + // Of all possible inputs, the random bytes have the least amount of substring // repetition, and the repeated bytes have the most. For most algorithms, // the running time of every input will be between these two. func benchmarkNew(b *testing.B, random bool) { + b.ReportAllocs() b.StopTimer() - data := make([]byte, 1e6) + data := benchdata if random { - for i := range data { - data[i] = byte(rand.Intn(256)) + data = benchrand + if data[0] == 0 { + for i := range data { + data[i] = byte(rand.Intn(256)) + } } } b.StartTimer() + b.SetBytes(int64(len(data))) for i := 0; i < b.N; i++ { New(data) } } -func BenchmarkNewIndexRandom(b *testing.B) { - benchmarkNew(b, true) +func makeText(name string) ([]byte, error) { + var data []byte + switch name { + case "opticks": + var err error + data, err = ioutil.ReadFile("../../testdata/Isaac.Newton-Opticks.txt") + if err != nil { + return nil, err + } + case "go": + err := filepath.Walk("../..", func(path string, info os.FileInfo, err error) error { + if err == nil && strings.HasSuffix(path, ".go") && !info.IsDir() { + file, err := ioutil.ReadFile(path) + if err != nil { + return err + } + data = append(data, file...) + } + return nil + }) + if err != nil { + return nil, err + } + case "zero": + data = make([]byte, 50e6) + case "rand": + data = make([]byte, 50e6) + for i := range data { + data[i] = byte(rand.Intn(256)) + } + } + return data, nil } -func BenchmarkNewIndexRepeat(b *testing.B) { - benchmarkNew(b, false) + +func setBits(bits int) (cleanup func()) { + if bits == 32 { + maxData32 = realMaxData32 + } else { + maxData32 = -1 // force use of 64-bit code + } + return func() { + maxData32 = realMaxData32 + } +} + +func BenchmarkNew(b *testing.B) { + for _, text := range []string{"opticks", "go", "zero", "rand"} { + b.Run("text="+text, func(b *testing.B) { + data, err := makeText(text) + if err != nil { + b.Fatal(err) + } + if testing.Short() && len(data) > 5e6 { + data = data[:5e6] + } + for _, size := range []int{100e3, 500e3, 1e6, 5e6, 10e6, 50e6} { + if len(data) < size { + continue + } + data := data[:size] + name := fmt.Sprintf("%dK", size/1e3) + if size >= 1e6 { + name = fmt.Sprintf("%dM", size/1e6) + } + b.Run("size="+name, func(b *testing.B) { + for _, bits := range []int{32, 64} { + if ^uint(0) == 0xffffffff && bits == 64 { + continue + } + b.Run(fmt.Sprintf("bits=%d", bits), func(b *testing.B) { + cleanup := setBits(bits) + defer cleanup() + + b.SetBytes(int64(len(data))) + b.ReportAllocs() + for i := 0; i < b.N; i++ { + New(data) + } + }) + } + }) + } + }) + } } func BenchmarkSaveRestore(b *testing.B) { - b.StopTimer() r := rand.New(rand.NewSource(0x5a77a1)) // guarantee always same sequence data := make([]byte, 1<<20) // 1MB of data to index for i := range data { data[i] = byte(r.Intn(256)) } - x := New(data) - size := testSaveRestore(nil, nil, x) // verify correctness - buf := bytes.NewBuffer(make([]byte, size)) // avoid growing - b.SetBytes(int64(size)) - b.StartTimer() - for i := 0; i < b.N; i++ { - x.Write(buf) - var y Index - y.Read(buf) + for _, bits := range []int{32, 64} { + if ^uint(0) == 0xffffffff && bits == 64 { + continue + } + b.Run(fmt.Sprintf("bits=%d", bits), func(b *testing.B) { + cleanup := setBits(bits) + defer cleanup() + + b.StopTimer() + x := New(data) + size := testSaveRestore(nil, nil, x) // verify correctness + buf := bytes.NewBuffer(make([]byte, size)) // avoid growing + b.SetBytes(int64(size)) + b.StartTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + buf.Reset() + if err := x.Write(buf); err != nil { + b.Fatal(err) + } + var y Index + if err := y.Read(buf); err != nil { + b.Fatal(err) + } + } + }) } } |
