From b9b8cecbfc72168ca03ad586cc2ed52b0e8db409 Mon Sep 17 00:00:00 2001 From: Joe Tsai Date: Mon, 20 Feb 2023 11:26:10 -0800 Subject: encoding/json: simplify folded name logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The folded name logic (despite all attempts to optimize it) was fundamentally an O(n) operation where every field in a struct needed to be linearly scanned in order to find a match. This made unmarshaling of unknown fields always O(n). Instead of optimizing the comparison for each field, make it such that we can look up a name in O(1). We accomplish this by maintaining a map keyed by pre-folded names, which we can pre-calculate when processing the struct type. Using a stack-allocated buffer, we can fold the input name and look up its presence in the map. Also, instead of mapping from names to indexes, map directly to a pointer to the field information. The memory cost of this is the same and avoids an extra slice index. The new logic is both simpler and faster. 
Performance: name old time/op new time/op delta CodeDecoder 2.47ms ± 4% 2.42ms ± 2% -1.83% (p=0.022 n=10+9) UnicodeDecoder 259ns ± 2% 248ns ± 1% -4.32% (p=0.000 n=10+10) DecoderStream 150ns ± 1% 149ns ± 1% ~ (p=0.516 n=10+10) CodeUnmarshal 3.13ms ± 2% 3.09ms ± 2% -1.37% (p=0.022 n=10+9) CodeUnmarshalReuse 2.50ms ± 1% 2.45ms ± 1% -1.96% (p=0.001 n=8+9) UnmarshalString 67.1ns ± 5% 64.5ns ± 5% -3.90% (p=0.005 n=10+10) UnmarshalFloat64 60.1ns ± 4% 58.4ns ± 2% -2.89% (p=0.002 n=10+8) UnmarshalInt64 51.0ns ± 4% 49.2ns ± 1% -3.53% (p=0.001 n=10+8) Issue10335 80.7ns ± 2% 79.2ns ± 1% -1.82% (p=0.016 n=10+8) Issue34127 28.6ns ± 3% 28.8ns ± 3% ~ (p=0.388 n=9+10) Unmapped 177ns ± 2% 177ns ± 2% ~ (p=0.956 n=10+10) Change-Id: I478b2b958f5a63a69c9a991a39cd5ffb43244a2a Reviewed-on: https://go-review.googlesource.com/c/go/+/471196 Reviewed-by: Dmitri Shuralyov Run-TryBot: Joseph Tsai Auto-Submit: Joseph Tsai TryBot-Result: Gopher Robot Reviewed-by: Johan Brandhorst-Satzkorn Reviewed-by: Than McIntosh Reviewed-by: Daniel Martí --- src/encoding/json/fold_test.go | 132 +++++++++++------------------------------ 1 file changed, 36 insertions(+), 96 deletions(-) (limited to 'src/encoding/json/fold_test.go') diff --git a/src/encoding/json/fold_test.go b/src/encoding/json/fold_test.go index 4daa3590f5..9d6fd0559d 100644 --- a/src/encoding/json/fold_test.go +++ b/src/encoding/json/fold_test.go @@ -6,105 +6,45 @@ package json import ( "bytes" - "strings" "testing" - "unicode/utf8" ) -var foldTests = []struct { - fn func(s, t []byte) bool - s, t string - want bool -}{ - {equalFoldRight, "", "", true}, - {equalFoldRight, "a", "a", true}, - {equalFoldRight, "", "a", false}, - {equalFoldRight, "a", "", false}, - {equalFoldRight, "a", "A", true}, - {equalFoldRight, "AB", "ab", true}, - {equalFoldRight, "AB", "ac", false}, - {equalFoldRight, "sbkKc", "ſbKKc", true}, - {equalFoldRight, "SbKkc", "ſbKKc", true}, - {equalFoldRight, "SbKkc", "ſbKK", false}, - {equalFoldRight, "e", "é", false}, - 
{equalFoldRight, "s", "S", true}, - - {simpleLetterEqualFold, "", "", true}, - {simpleLetterEqualFold, "abc", "abc", true}, - {simpleLetterEqualFold, "abc", "ABC", true}, - {simpleLetterEqualFold, "abc", "ABCD", false}, - {simpleLetterEqualFold, "abc", "xxx", false}, - - {asciiEqualFold, "a_B", "A_b", true}, - {asciiEqualFold, "aa@", "aa`", false}, // verify 0x40 and 0x60 aren't case-equivalent -} - -func TestFold(t *testing.T) { - for i, tt := range foldTests { - if got := tt.fn([]byte(tt.s), []byte(tt.t)); got != tt.want { - t.Errorf("%d. %q, %q = %v; want %v", i, tt.s, tt.t, got, tt.want) - } - truth := strings.EqualFold(tt.s, tt.t) - if truth != tt.want { - t.Errorf("strings.EqualFold doesn't agree with case %d", i) - } +func FuzzEqualFold(f *testing.F) { + for _, ss := range [][2]string{ + {"", ""}, + {"123abc", "123ABC"}, + {"αβδ", "ΑΒΔ"}, + {"abc", "xyz"}, + {"abc", "XYZ"}, + {"1", "2"}, + {"hello, world!", "hello, world!"}, + {"hello, world!", "Hello, World!"}, + {"hello, world!", "HELLO, WORLD!"}, + {"hello, world!", "jello, world!"}, + {"γειά, κόσμε!", "γειά, κόσμε!"}, + {"γειά, κόσμε!", "Γειά, Κόσμε!"}, + {"γειά, κόσμε!", "ΓΕΙΆ, ΚΌΣΜΕ!"}, + {"γειά, κόσμε!", "ΛΕΙΆ, ΚΌΣΜΕ!"}, + {"AESKey", "aesKey"}, + {"AESKEY", "aes_key"}, + {"aes_key", "AES_KEY"}, + {"AES_KEY", "aes-key"}, + {"aes-key", "AES-KEY"}, + {"AES-KEY", "aesKey"}, + {"aesKey", "AesKey"}, + {"AesKey", "AESKey"}, + {"AESKey", "aeskey"}, + {"DESKey", "aeskey"}, + {"AES Key", "aeskey"}, + } { + f.Add([]byte(ss[0]), []byte(ss[1])) } -} - -func TestFoldAgainstUnicode(t *testing.T) { - var buf1, buf2 []byte - var runes []rune - for i := 0x20; i <= 0x7f; i++ { - runes = append(runes, rune(i)) - } - runes = append(runes, kelvin, smallLongEss) - - funcs := []struct { - name string - fold func(s, t []byte) bool - letter bool // must be ASCII letter - simple bool // must be simple ASCII letter (not 'S' or 'K') - }{ - { - name: "equalFoldRight", - fold: equalFoldRight, - }, - { - name: "asciiEqualFold", - 
fold: asciiEqualFold, - simple: true, - }, - { - name: "simpleLetterEqualFold", - fold: simpleLetterEqualFold, - simple: true, - letter: true, - }, - } - - for _, ff := range funcs { - for _, r := range runes { - if r >= utf8.RuneSelf { - continue - } - if ff.letter && !isASCIILetter(byte(r)) { - continue - } - if ff.simple && (r == 's' || r == 'S' || r == 'k' || r == 'K') { - continue - } - for _, r2 := range runes { - buf1 = append(utf8.AppendRune(append(buf1[:0], 'x'), r), 'x') - buf2 = append(utf8.AppendRune(append(buf2[:0], 'x'), r2), 'x') - want := bytes.EqualFold(buf1, buf2) - if got := ff.fold(buf1, buf2); got != want { - t.Errorf("%s(%q, %q) = %v; want %v", ff.name, buf1, buf2, got, want) - } - } + equalFold := func(x, y []byte) bool { return string(foldName(x)) == string(foldName(y)) } + f.Fuzz(func(t *testing.T, x, y []byte) { + got := equalFold(x, y) + want := bytes.EqualFold(x, y) + if got != want { + t.Errorf("equalFold(%q, %q) = %v, want %v", x, y, got, want) } - } -} - -func isASCIILetter(b byte) bool { - return ('A' <= b && b <= 'Z') || ('a' <= b && b <= 'z') + }) } -- cgit v1.3