diff options
| author | Martin Möhrmann <moehrmann@google.com> | 2018-10-13 22:40:23 +0200 |
|---|---|---|
| committer | Brad Fitzpatrick <bradfitz@golang.org> | 2019-05-01 18:31:47 +0000 |
| commit | 3259bc441957bf74f069cf7df961367a3472afb2 (patch) | |
| tree | 893219a535c83700f7bcc9df6b7e3ad43764a45c /src/bytes/bytes_test.go | |
| parent | 07f689420a552841270c6b751b0a890f6d27cd30 (diff) | |
| download | go-3259bc441957bf74f069cf7df961367a3472afb2.tar.xz | |
strings, bytes: add ToValidUTF8
The newly added functions create a copy of their input with all bytes in
invalid UTF-8 byte sequences mapped to the UTF-8 byte sequence
given as replacement parameter.
Fixes #25805
Change-Id: Iaf65f65b40c0581c6bb000f1590408d6628321d0
Reviewed-on: https://go-review.googlesource.com/c/go/+/142003
Run-TryBot: Martin Möhrmann <moehrmann@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Diffstat (limited to 'src/bytes/bytes_test.go')
| -rw-r--r-- | src/bytes/bytes_test.go | 50 |
1 files changed, 50 insertions, 0 deletions
```diff
diff --git a/src/bytes/bytes_test.go b/src/bytes/bytes_test.go
index 4c50755e7c..2dbbb99f37 100644
--- a/src/bytes/bytes_test.go
+++ b/src/bytes/bytes_test.go
@@ -1061,6 +1061,36 @@ func BenchmarkToLower(b *testing.B) {
 	}
 }
 
+var toValidUTF8Tests = []struct {
+	in   string
+	repl string
+	out  string
+}{
+	{"", "\uFFFD", ""},
+	{"abc", "\uFFFD", "abc"},
+	{"\uFDDD", "\uFFFD", "\uFDDD"},
+	{"a\xffb", "\uFFFD", "a\uFFFDb"},
+	{"a\xffb\uFFFD", "X", "aXb\uFFFD"},
+	{"a☺\xffb☺\xC0\xAFc☺\xff", "", "a☺b☺c☺"},
+	{"a☺\xffb☺\xC0\xAFc☺\xff", "日本語", "a☺日本語b☺日本語c☺日本語"},
+	{"\xC0\xAF", "\uFFFD", "\uFFFD"},
+	{"\xE0\x80\xAF", "\uFFFD", "\uFFFD"},
+	{"\xed\xa0\x80", "abc", "abc"},
+	{"\xed\xbf\xbf", "\uFFFD", "\uFFFD"},
+	{"\xF0\x80\x80\xaf", "☺", "☺"},
+	{"\xF8\x80\x80\x80\xAF", "\uFFFD", "\uFFFD"},
+	{"\xFC\x80\x80\x80\x80\xAF", "\uFFFD", "\uFFFD"},
+}
+
+func TestToValidUTF8(t *testing.T) {
+	for _, tc := range toValidUTF8Tests {
+		got := ToValidUTF8([]byte(tc.in), []byte(tc.repl))
+		if !Equal(got, []byte(tc.out)) {
+			t.Errorf("ToValidUTF8(%q, %q) = %q; want %q", tc.in, tc.repl, got, tc.out)
+		}
+	}
+}
+
 func TestTrimSpace(t *testing.T) { runStringTests(t, TrimSpace, "TrimSpace", trimSpaceTests) }
 
 type RepeatTest struct {
@@ -1703,6 +1733,26 @@ func BenchmarkTrimSpace(b *testing.B) {
 	}
 }
 
+func BenchmarkToValidUTF8(b *testing.B) {
+	tests := []struct {
+		name  string
+		input []byte
+	}{
+		{"Valid", []byte("typical")},
+		{"InvalidASCII", []byte("foo\xffbar")},
+		{"InvalidNonASCII", []byte("日本語\xff日本語")},
+	}
+	replacement := []byte("\uFFFD")
+	b.ResetTimer()
+	for _, test := range tests {
+		b.Run(test.name, func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				ToValidUTF8(test.input, replacement)
+			}
+		})
+	}
+}
+
 func makeBenchInputHard() []byte {
 	tokens := [...]string{
 		"<a>", "<p>", "<b>", "<strong>",
```
