strings, bytes: add ToValidUTF8

The newly added functions create a copy of their input with each run of bytes in invalid UTF-8 byte sequences replaced by the UTF-8 byte sequence given as the replacement parameter.

Fixes #25805

Change-Id: Iaf65f65b40c0581c6bb000f1590408d6628321d0
Reviewed-on: https://go-review.googlesource.com/c/go/+/142003
Run-TryBot: Martin Möhrmann <moehrmann@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
author: Martin Möhrmann <moehrmann@google.com> 2018-10-13 22:40:23 +0200
committer: Brad Fitzpatrick <bradfitz@golang.org> 2019-05-01 18:31:47 +0000
commit: 3259bc441957bf74f069cf7df961367a3472afb2 (patch)
tree: 893219a535c83700f7bcc9df6b7e3ad43764a45c /src/strings/strings.go
parent: 07f689420a552841270c6b751b0a890f6d27cd30 (diff)
download: go-3259bc441957bf74f069cf7df961367a3472afb2.tar.xz
1 files changed, 50 insertions, 0 deletions
diff --git a/src/strings/strings.go b/src/strings/strings.go
index e3fdd9feaf..7337481380 100644
--- a/src/strings/strings.go
+++ b/src/strings/strings.go
@@ -631,6 +631,56 @@ func ToTitleSpecial(c unicode.SpecialCase, s string) string {
 	return Map(c.ToTitle, s)
 }
 
+// ToValidUTF8 returns a copy of the string s with each run of invalid UTF-8 byte sequences
+// replaced by the replacement string, which may be empty. If s has no invalid bytes, s itself is returned.
+func ToValidUTF8(s, replacement string) string {
+	var b Builder
+
+	for i, c := range s { // look for the first invalid byte
+		if c != utf8.RuneError {
+			continue
+		}
+
+		_, wid := utf8.DecodeRuneInString(s[i:]) // width 1 with RuneError means invalid UTF-8, not an encoded U+FFFD
+		if wid == 1 {
+			b.Grow(len(s) + len(replacement)) // nonzero capacity also signals that invalid input was found
+			b.WriteString(s[:i])
+			s = s[i:] // the valid prefix is already copied; resume at the first invalid byte
+			break
+		}
+	}
+
+	// Fast path for unchanged input: no invalid byte was found, so s can be returned as is.
+	if b.Cap() == 0 { // didn't call b.Grow above
+		return s
+	}
+
+	invalid := false // previous byte was from an invalid UTF-8 sequence
+	for i := 0; i < len(s); {
+		c := s[i]
+		if c < utf8.RuneSelf { // single-byte (ASCII) rune: copy it directly
+			i++
+			invalid = false
+			b.WriteByte(c)
+			continue
+		}
+		_, wid := utf8.DecodeRuneInString(s[i:])
+		if wid == 1 { // invalid byte: valid runes at or above RuneSelf are wider than one byte
+			i++
+			if !invalid { // write replacement only once per run of invalid bytes
+				invalid = true
+				b.WriteString(replacement)
+			}
+			continue
+		}
+		invalid = false // valid multi-byte rune ends any invalid run
+		b.WriteString(s[i : i+wid])
+		i += wid
+	}
+
+	return b.String()
+}
+
 // isSeparator reports whether the rune could mark a word boundary.
 // TODO: update when package unicode captures more of the properties.
 func isSeparator(r rune) bool {
author	Martin Möhrmann <moehrmann@google.com>	2018-10-13 22:40:23 +0200
committer	Brad Fitzpatrick <bradfitz@golang.org>	2019-05-01 18:31:47 +0000
commit	3259bc441957bf74f069cf7df961367a3472afb2 (patch)
tree	893219a535c83700f7bcc9df6b7e3ad43764a45c /src/strings/strings.go
parent	07f689420a552841270c6b751b0a890f6d27cd30 (diff)
download	go-3259bc441957bf74f069cf7df961367a3472afb2.tar.xz