From 3259bc441957bf74f069cf7df961367a3472afb2 Mon Sep 17 00:00:00 2001 From: Martin Möhrmann Date: Sat, 13 Oct 2018 22:40:23 +0200 Subject: strings, bytes: add ToValidUTF8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The newly added functions create a copy of their input with all bytes in invalid UTF-8 byte sequences mapped to the UTF-8 byte sequence given as replacement parameter. Fixes #25805 Change-Id: Iaf65f65b40c0581c6bb000f1590408d6628321d0 Reviewed-on: https://go-review.googlesource.com/c/go/+/142003 Run-TryBot: Martin Möhrmann TryBot-Result: Gobot Gobot Reviewed-by: Brad Fitzpatrick --- src/strings/strings.go | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'src/strings/strings.go') diff --git a/src/strings/strings.go b/src/strings/strings.go index e3fdd9feaf..7337481380 100644 --- a/src/strings/strings.go +++ b/src/strings/strings.go @@ -631,6 +631,56 @@ func ToTitleSpecial(c unicode.SpecialCase, s string) string { return Map(c.ToTitle, s) } +// ToValidUTF8 returns a copy of the string s with each run of invalid UTF-8 byte sequences +// replaced by the replacement string, which may be empty. +func ToValidUTF8(s, replacement string) string { + var b Builder + + for i, c := range s { + if c != utf8.RuneError { + continue + } + + _, wid := utf8.DecodeRuneInString(s[i:]) + if wid == 1 { + b.Grow(len(s) + len(replacement)) + b.WriteString(s[:i]) + s = s[i:] + break + } + } + + // Fast path for unchanged input + if b.Cap() == 0 { // didn't call b.Grow above + return s + } + + invalid := false // previous byte was from an invalid UTF-8 sequence + for i := 0; i < len(s); { + c := s[i] + if c < utf8.RuneSelf { + i++ + invalid = false + b.WriteByte(c) + continue + } + _, wid := utf8.DecodeRuneInString(s[i:]) + if wid == 1 { + i++ + if !invalid { + invalid = true + b.WriteString(replacement) + } + continue + } + invalid = false + b.WriteString(s[i : i+wid]) + i += wid + } + + return b.String() +} + // isSeparator reports whether the rune could mark a word boundary. // TODO: update when package unicode captures more of the properties. func isSeparator(r rune) bool { -- cgit v1.3-5-g45d5