about | summary | refs | log | tree | commit | diff
path: root/src/bytes
diff options
context:
space:
mode:
Diffstat (limited to 'src/bytes')
-rw-r--r-- src/bytes/bytes.go | 114
-rw-r--r-- src/bytes/bytes_test.go | 53
2 files changed, 132 insertions, 35 deletions
diff --git a/src/bytes/bytes.go b/src/bytes/bytes.go
index 7c878af688..457e149410 100644
--- a/src/bytes/bytes.go
+++ b/src/bytes/bytes.go
@@ -265,9 +265,57 @@ func SplitAfter(s, sep []byte) [][]byte {
return genSplit(s, sep, len(sep), -1)
}
+var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1}
+
// Fields splits the slice s around each instance of one or more consecutive white space
-// characters, returning a slice of subslices of s or an empty list if s contains only white space.
+// characters, as defined by unicode.IsSpace, returning a slice of subslices of s or an
+// empty slice if s contains only white space.
func Fields(s []byte) [][]byte {
+ // First count the fields.
+ // This is an exact count if s is ASCII, otherwise it is an approximation.
+ n := 0
+ wasSpace := 1
+ // setBits is used to track which bits are set in the bytes of s.
+ setBits := uint8(0)
+ for i := 0; i < len(s); i++ {
+ r := s[i]
+ setBits |= r
+ isSpace := int(asciiSpace[r])
+ n += wasSpace & ^isSpace
+ wasSpace = isSpace
+ }
+
+ if setBits < utf8.RuneSelf { // ASCII fast path
+ a := make([][]byte, n)
+ na := 0
+ fieldStart := 0
+ i := 0
+ // Skip spaces in the front of the input.
+ for i < len(s) && asciiSpace[s[i]] != 0 {
+ i++
+ }
+ fieldStart = i
+ for i < len(s) {
+ if asciiSpace[s[i]] == 0 {
+ i++
+ continue
+ }
+ a[na] = s[fieldStart:i]
+ na++
+ i++
+ // Skip spaces in between fields.
+ for i < len(s) && asciiSpace[s[i]] != 0 {
+ i++
+ }
+ fieldStart = i
+ }
+ if fieldStart < len(s) { // Last field might end at EOF.
+ a[na] = s[fieldStart:]
+ }
+ return a
+ }
+
+ // Some runes in the input slice are not ASCII.
return FieldsFunc(s, unicode.IsSpace)
}
@@ -278,39 +326,49 @@ func Fields(s []byte) [][]byte {
// FieldsFunc makes no guarantees about the order in which it calls f(c).
// If f does not return consistent results for a given c, FieldsFunc may crash.
func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
- n := 0
- inField := false
- for i := 0; i < len(s); {
- r, size := utf8.DecodeRune(s[i:])
- wasInField := inField
- inField = !f(r)
- if inField && !wasInField {
- n++
- }
- i += size
+ // A span is used to record a slice of s of the form s[start:end].
+ // The start index is inclusive and the end index is exclusive.
+ type span struct {
+ start int
+ end int
}
+ spans := make([]span, 0, 32)
- a := make([][]byte, n)
- na := 0
- fieldStart := -1
- for i := 0; i <= len(s) && na < n; {
- r, size := utf8.DecodeRune(s[i:])
- if fieldStart < 0 && size > 0 && !f(r) {
- fieldStart = i
- i += size
- continue
- }
- if fieldStart >= 0 && (size == 0 || f(r)) {
- a[na] = s[fieldStart:i]
- na++
- fieldStart = -1
+ // Find the field start and end indices.
+ wasField := false
+ fromIndex := 0
+ for i := 0; i < len(s); {
+ size := 1
+ r := rune(s[i])
+ if r >= utf8.RuneSelf {
+ r, size = utf8.DecodeRune(s[i:])
}
- if size == 0 {
- break
+ if f(r) {
+ if wasField {
+ spans = append(spans, span{start: fromIndex, end: i})
+ wasField = false
+ }
+ } else {
+ if !wasField {
+ fromIndex = i
+ wasField = true
+ }
}
i += size
}
- return a[0:na]
+
+ // Last field might end at EOF.
+ if wasField {
+ spans = append(spans, span{fromIndex, len(s)})
+ }
+
+ // Create subslices from recorded field indices.
+ a := make([][]byte, len(spans))
+ for i, span := range spans {
+ a[i] = s[span.start:span.end]
+ }
+
+ return a
}
// Join concatenates the elements of s to create a new byte slice. The separator
diff --git a/src/bytes/bytes_test.go b/src/bytes/bytes_test.go
index ca0cdbb7c9..db28497e39 100644
--- a/src/bytes/bytes_test.go
+++ b/src/bytes/bytes_test.go
@@ -1502,19 +1502,58 @@ var makeFieldsInput = func() []byte {
return x
}
-var fieldsInput = makeFieldsInput()
+var makeFieldsInputASCII = func() []byte {
+ x := make([]byte, 1<<20)
+ // Input is ~10% space, rest ASCII non-space.
+ for i := range x {
+ if rand.Intn(10) == 0 {
+ x[i] = ' '
+ } else {
+ x[i] = 'x'
+ }
+ }
+ return x
+}
+
+var bytesdata = []struct {
+ name string
+ data []byte
+}{
+ {"ASCII", makeFieldsInputASCII()},
+ {"Mixed", makeFieldsInput()},
+}
func BenchmarkFields(b *testing.B) {
- b.SetBytes(int64(len(fieldsInput)))
- for i := 0; i < b.N; i++ {
- Fields(fieldsInput)
+ for _, sd := range bytesdata {
+ b.Run(sd.name, func(b *testing.B) {
+ for j := 1 << 4; j <= 1<<20; j <<= 4 {
+ b.Run(fmt.Sprintf("%d", j), func(b *testing.B) {
+ b.ReportAllocs()
+ b.SetBytes(int64(j))
+ data := sd.data[:j]
+ for i := 0; i < b.N; i++ {
+ Fields(data)
+ }
+ })
+ }
+ })
}
}
func BenchmarkFieldsFunc(b *testing.B) {
- b.SetBytes(int64(len(fieldsInput)))
- for i := 0; i < b.N; i++ {
- FieldsFunc(fieldsInput, unicode.IsSpace)
+ for _, sd := range bytesdata {
+ b.Run(sd.name, func(b *testing.B) {
+ for j := 1 << 4; j <= 1<<20; j <<= 4 {
+ b.Run(fmt.Sprintf("%d", j), func(b *testing.B) {
+ b.ReportAllocs()
+ b.SetBytes(int64(j))
+ data := sd.data[:j]
+ for i := 0; i < b.N; i++ {
+ FieldsFunc(data, unicode.IsSpace)
+ }
+ })
+ }
+ })
}
}