diff options
| author | Martin Möhrmann <moehrmann@google.com> | 2018-10-13 22:40:23 +0200 |
|---|---|---|
| committer | Brad Fitzpatrick <bradfitz@golang.org> | 2019-05-01 18:31:47 +0000 |
| commit | 3259bc441957bf74f069cf7df961367a3472afb2 (patch) | |
| tree | 893219a535c83700f7bcc9df6b7e3ad43764a45c /src/bytes/bytes.go | |
| parent | 07f689420a552841270c6b751b0a890f6d27cd30 (diff) | |
| download | go-3259bc441957bf74f069cf7df961367a3472afb2.tar.xz | |
strings, bytes: add ToValidUTF8
The newly added functions create a copy of their input with all bytes in
invalid UTF-8 byte sequences mapped to the UTF-8 byte sequence
given as replacement parameter.
Fixes #25805
Change-Id: Iaf65f65b40c0581c6bb000f1590408d6628321d0
Reviewed-on: https://go-review.googlesource.com/c/go/+/142003
Run-TryBot: Martin Möhrmann <moehrmann@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Diffstat (limited to 'src/bytes/bytes.go')
| -rw-r--r-- | src/bytes/bytes.go | 29 |
1 files changed, 29 insertions, 0 deletions
diff --git a/src/bytes/bytes.go b/src/bytes/bytes.go index 9d586581f5..eb13212384 100644 --- a/src/bytes/bytes.go +++ b/src/bytes/bytes.go @@ -592,6 +592,35 @@ func ToTitleSpecial(c unicode.SpecialCase, s []byte) []byte { return Map(c.ToTitle, s) } +// ToValidUTF8 treats s as UTF-8-encoded bytes and returns a copy with each run of bytes +// representing invalid UTF-8 replaced with the bytes in replacement, which may be empty. +func ToValidUTF8(s, replacement []byte) []byte { + b := make([]byte, 0, len(s)+len(replacement)) + invalid := false // previous byte was from an invalid UTF-8 sequence + for i := 0; i < len(s); { + c := s[i] + if c < utf8.RuneSelf { + i++ + invalid = false + b = append(b, byte(c)) + continue + } + _, wid := utf8.DecodeRune(s[i:]) + if wid == 1 { + i++ + if !invalid { + invalid = true + b = append(b, replacement...) + } + continue + } + invalid = false + b = append(b, s[i:i+wid]...) + i += wid + } + return b +} + // isSeparator reports whether the rune could mark a word boundary. // TODO: update when package unicode captures more of the properties. func isSeparator(r rune) bool { |
