aboutsummaryrefslogtreecommitdiff
path: root/src/encoding/json/fold.go
diff options
context:
space:
mode:
authorJoe Tsai <joetsai@digital-static.net>2023-02-20 11:26:10 -0800
committerGopher Robot <gobot@golang.org>2023-02-27 17:37:27 +0000
commitb9b8cecbfc72168ca03ad586cc2ed52b0e8db409 (patch)
tree3c40d1289a66411daf422f15c27640dff524610d /src/encoding/json/fold.go
parent2de406bb9e26df19a31b5f6111bb221b60964d48 (diff)
downloadgo-b9b8cecbfc72168ca03ad586cc2ed52b0e8db409.tar.xz
encoding/json: simplify folded name logic
The folded name logic (despite all attempts to optimize it) was fundamentally an O(n) operation where every field in a struct needed to be linearly scanned in order to find a match. This made unmarshaling of unknown fields always O(n). Instead of optimizing the comparison for each field, make it such that we can look up a name in O(1). We accomplish this by maintaining a map keyed by pre-folded names, which we can pre-calculate when processing the struct type. Using a stack-allocated buffer, we can fold the input name and look up its presence in the map. Also, instead of mapping from names to indexes, map directly to a pointer to the field information. The memory cost of this is the same and avoids an extra slice index. The new logic is both simpler and faster. Performance: name old time/op new time/op delta CodeDecoder 2.47ms ± 4% 2.42ms ± 2% -1.83% (p=0.022 n=10+9) UnicodeDecoder 259ns ± 2% 248ns ± 1% -4.32% (p=0.000 n=10+10) DecoderStream 150ns ± 1% 149ns ± 1% ~ (p=0.516 n=10+10) CodeUnmarshal 3.13ms ± 2% 3.09ms ± 2% -1.37% (p=0.022 n=10+9) CodeUnmarshalReuse 2.50ms ± 1% 2.45ms ± 1% -1.96% (p=0.001 n=8+9) UnmarshalString 67.1ns ± 5% 64.5ns ± 5% -3.90% (p=0.005 n=10+10) UnmarshalFloat64 60.1ns ± 4% 58.4ns ± 2% -2.89% (p=0.002 n=10+8) UnmarshalInt64 51.0ns ± 4% 49.2ns ± 1% -3.53% (p=0.001 n=10+8) Issue10335 80.7ns ± 2% 79.2ns ± 1% -1.82% (p=0.016 n=10+8) Issue34127 28.6ns ± 3% 28.8ns ± 3% ~ (p=0.388 n=9+10) Unmapped 177ns ± 2% 177ns ± 2% ~ (p=0.956 n=10+10) Change-Id: I478b2b958f5a63a69c9a991a39cd5ffb43244a2a Reviewed-on: https://go-review.googlesource.com/c/go/+/471196 Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Run-TryBot: Joseph Tsai <joetsai@digital-static.net> Auto-Submit: Joseph Tsai <joetsai@digital-static.net> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Johan Brandhorst-Satzkorn <johan.brandhorst@gmail.com> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Daniel Martí <mvdan@mvdan.cc>
Diffstat (limited to 'src/encoding/json/fold.go')
-rw-r--r--src/encoding/json/fold.go147
1 files changed, 27 insertions, 120 deletions
diff --git a/src/encoding/json/fold.go b/src/encoding/json/fold.go
index 0f9b09d712..c4c671b527 100644
--- a/src/encoding/json/fold.go
+++ b/src/encoding/json/fold.go
@@ -5,137 +5,44 @@
package json
import (
- "bytes"
+ "unicode"
"unicode/utf8"
)
-const (
- caseMask = ^byte(0x20) // Mask to ignore case in ASCII.
- kelvin = '\u212a'
- smallLongEss = '\u017f'
-)
-
-// foldFunc returns one of four different case folding equivalence
-// functions, from most general (and slow) to fastest:
-//
-// 1) bytes.EqualFold, if the key s contains any non-ASCII UTF-8
-// 2) equalFoldRight, if s contains special folding ASCII ('k', 'K', 's', 'S')
-// 3) asciiEqualFold, no special, but includes non-letters (including _)
-// 4) simpleLetterEqualFold, no specials, no non-letters.
-//
-// The letters S and K are special because they map to 3 runes, not just 2:
-// - S maps to s and to U+017F 'ſ' Latin small letter long s
-// - k maps to K and to U+212A 'K' Kelvin sign
-//
-// See https://play.golang.org/p/tTxjOc0OGo
-//
-// The returned function is specialized for matching against s and
-// should only be given s. It's not curried for performance reasons.
-func foldFunc(s []byte) func(s, t []byte) bool {
- nonLetter := false
- special := false // special letter
- for _, b := range s {
- if b >= utf8.RuneSelf {
- return bytes.EqualFold
- }
- upper := b & caseMask
- if upper < 'A' || upper > 'Z' {
- nonLetter = true
- } else if upper == 'K' || upper == 'S' {
- // See above for why these letters are special.
- special = true
- }
- }
- if special {
- return equalFoldRight
- }
- if nonLetter {
- return asciiEqualFold
- }
- return simpleLetterEqualFold
+// foldName returns a folded string such that foldName(x) == foldName(y)
+// is identical to bytes.EqualFold(x, y).
+func foldName(in []byte) []byte {
+ // This is inlinable to take advantage of "function outlining".
+ var arr [32]byte // large enough for most JSON names
+ return appendFoldedName(arr[:0], in)
}
-// equalFoldRight is a specialization of bytes.EqualFold when s is
-// known to be all ASCII (including punctuation), but contains an 's',
-// 'S', 'k', or 'K', requiring a Unicode fold on the bytes in t.
-// See comments on foldFunc.
-func equalFoldRight(s, t []byte) bool {
- for _, sb := range s {
- if len(t) == 0 {
- return false
- }
- tb := t[0]
- if tb < utf8.RuneSelf {
- if sb != tb {
- sbUpper := sb & caseMask
- if 'A' <= sbUpper && sbUpper <= 'Z' {
- if sbUpper != tb&caseMask {
- return false
- }
- } else {
- return false
- }
+func appendFoldedName(out, in []byte) []byte {
+ for i := 0; i < len(in); {
+ // Handle single-byte ASCII.
+ if c := in[i]; c < utf8.RuneSelf {
+ if 'a' <= c && c <= 'z' {
+ c -= 'a' - 'A'
}
- t = t[1:]
+ out = append(out, c)
+ i++
continue
}
- // sb is ASCII and t is not. t must be either kelvin
- // sign or long s; sb must be s, S, k, or K.
- tr, size := utf8.DecodeRune(t)
- switch sb {
- case 's', 'S':
- if tr != smallLongEss {
- return false
- }
- case 'k', 'K':
- if tr != kelvin {
- return false
- }
- default:
- return false
- }
- t = t[size:]
-
- }
- return len(t) == 0
-}
-
-// asciiEqualFold is a specialization of bytes.EqualFold for use when
-// s is all ASCII (but may contain non-letters) and contains no
-// special-folding letters.
-// See comments on foldFunc.
-func asciiEqualFold(s, t []byte) bool {
- if len(s) != len(t) {
- return false
- }
- for i, sb := range s {
- tb := t[i]
- if sb == tb {
- continue
- }
- if ('a' <= sb && sb <= 'z') || ('A' <= sb && sb <= 'Z') {
- if sb&caseMask != tb&caseMask {
- return false
- }
- } else {
- return false
- }
+ // Handle multi-byte Unicode.
+ r, n := utf8.DecodeRune(in[i:])
+ out = utf8.AppendRune(out, foldRune(r))
+ i += n
}
- return true
+ return out
}
-// simpleLetterEqualFold is a specialization of bytes.EqualFold for
-// use when s is all ASCII letters (no underscores, etc) and also
-// doesn't contain 'k', 'K', 's', or 'S'.
-// See comments on foldFunc.
-func simpleLetterEqualFold(s, t []byte) bool {
- if len(s) != len(t) {
- return false
- }
- for i, b := range s {
- if b&caseMask != t[i]&caseMask {
- return false
+// foldRune returns the smallest rune for all runes in the same fold set.
+func foldRune(r rune) rune {
+ for {
+ r2 := unicode.SimpleFold(r)
+ if r2 <= r {
+ return r2
}
+ r = r2
}
- return true
}