about | summary | refs | log | tree | commit | diff
path: root/src/bytes/bytes_test.go
diff options
context:
space:
mode:
author: Martin Möhrmann <moehrmann@google.com> 2018-10-13 22:40:23 +0200
committer: Brad Fitzpatrick <bradfitz@golang.org> 2019-05-01 18:31:47 +0000
commit: 3259bc441957bf74f069cf7df961367a3472afb2 (patch)
tree: 893219a535c83700f7bcc9df6b7e3ad43764a45c /src/bytes/bytes_test.go
parent: 07f689420a552841270c6b751b0a890f6d27cd30 (diff)
download: go-3259bc441957bf74f069cf7df961367a3472afb2.tar.xz
strings, bytes: add ToValidUTF8
The newly added functions create a copy of their input with all bytes in
invalid UTF-8 byte sequences mapped to the UTF-8 byte sequence
given as replacement parameter.

Fixes #25805

Change-Id: Iaf65f65b40c0581c6bb000f1590408d6628321d0
Reviewed-on: https://go-review.googlesource.com/c/go/+/142003
Run-TryBot: Martin Möhrmann <moehrmann@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Diffstat (limited to 'src/bytes/bytes_test.go')
-rw-r--r-- src/bytes/bytes_test.go 50
1 files changed, 50 insertions, 0 deletions
diff --git a/src/bytes/bytes_test.go b/src/bytes/bytes_test.go
index 4c50755e7c..2dbbb99f37 100644
--- a/src/bytes/bytes_test.go
+++ b/src/bytes/bytes_test.go
@@ -1061,6 +1061,36 @@ func BenchmarkToLower(b *testing.B) {
}
}
+var toValidUTF8Tests = []struct { // table of cases consumed by TestToValidUTF8
+ in string // input, converted to []byte and passed as the first argument
+ repl string // replacement, converted to []byte and passed as the second argument
+ out string // expected result
+}{
+ {"", "\uFFFD", ""}, // empty input
+ {"abc", "\uFFFD", "abc"}, // valid ASCII is unchanged
+ {"\uFDDD", "\uFFFD", "\uFDDD"}, // U+FDDD is a noncharacter but validly encoded, so it is kept
+ {"a\xffb", "\uFFFD", "a\uFFFDb"}, // lone invalid byte between ASCII
+ {"a\xffb\uFFFD", "X", "aXb\uFFFD"}, // a U+FFFD already present in the input is left alone
+ {"a☺\xffb☺\xC0\xAFc☺\xff", "", "a☺b☺c☺"}, // empty replacement drops the invalid bytes
+ {"a☺\xffb☺\xC0\xAFc☺\xff", "日本語", "a☺日本語b☺日本語c☺日本語"}, // multi-rune replacement; the two-byte invalid run yields one copy
+ {"\xC0\xAF", "\uFFFD", "\uFFFD"}, // overlong 2-byte encoding of '/'; the whole run becomes one replacement
+ {"\xE0\x80\xAF", "\uFFFD", "\uFFFD"}, // overlong 3-byte encoding of '/'
+ {"\xed\xa0\x80", "abc", "abc"}, // encoded surrogate U+D800
+ {"\xed\xbf\xbf", "\uFFFD", "\uFFFD"}, // encoded surrogate U+DFFF
+ {"\xF0\x80\x80\xaf", "☺", "☺"}, // overlong 4-byte encoding of '/'
+ {"\xF8\x80\x80\x80\xAF", "\uFFFD", "\uFFFD"}, // 5-byte form, not valid UTF-8
+ {"\xFC\x80\x80\x80\x80\xAF", "\uFFFD", "\uFFFD"}, // 6-byte form, not valid UTF-8
+}
+
+func TestToValidUTF8(t *testing.T) { // checks ToValidUTF8 against every case in toValidUTF8Tests
+ for _, tc := range toValidUTF8Tests {
+ got := ToValidUTF8([]byte(tc.in), []byte(tc.repl)) // the table holds strings; the function under test takes byte slices
+ if !Equal(got, []byte(tc.out)) {
+ t.Errorf("ToValidUTF8(%q, %q) = %q; want %q", tc.in, tc.repl, got, tc.out) // Errorf, not Fatalf, so the remaining cases still run
+ }
+ }
+}
+
// TestTrimSpace passes TrimSpace, its name, and the trimSpaceTests table to runStringTests.
func TestTrimSpace(t *testing.T) { runStringTests(t, TrimSpace, "TrimSpace", trimSpaceTests) }
type RepeatTest struct {
@@ -1703,6 +1733,26 @@ func BenchmarkTrimSpace(b *testing.B) {
}
}
+func BenchmarkToValidUTF8(b *testing.B) { // runs ToValidUTF8 as one sub-benchmark per input below
+ tests := []struct {
+ name string // sub-benchmark name passed to b.Run
+ input []byte // bytes handed to ToValidUTF8
+ }{
+ {"Valid", []byte("typical")}, // no invalid bytes
+ {"InvalidASCII", []byte("foo\xffbar")}, // one invalid byte surrounded by ASCII
+ {"InvalidNonASCII", []byte("日本語\xff日本語")}, // one invalid byte surrounded by multi-byte runes
+ }
+ replacement := []byte("\uFFFD") // built once, outside the b.N loops
+ b.ResetTimer() // NOTE(review): each b.Run sub-benchmark is timed separately, so this reset probably has no effect — confirm
+ for _, test := range tests {
+ b.Run(test.name, func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ ToValidUTF8(test.input, replacement) // result is discarded
+ }
+ })
+ }
+}
+
func makeBenchInputHard() []byte {
tokens := [...]string{
"<a>", "<p>", "<b>", "<strong>",