diff options
| author | Jes Cok <xigua67damn@gmail.com> | 2024-03-07 21:36:47 +0800 |
|---|---|---|
| committer | Gopher Robot <gobot@golang.org> | 2024-03-07 19:08:48 +0000 |
| commit | ef4f2a05972f9b729f5edb897d581f496675f588 (patch) | |
| tree | 3fd678ebc37afce98536a25a35c6fd7a70296dd8 /src/unicode | |
| parent | e0ba596c15dd82aad021c0c812fda6ca14ce118a (diff) | |
| download | go-ef4f2a05972f9b729f5edb897d581f496675f588.tar.xz | |
unicode/utf16: add func RuneLen
This CL adds func RuneLen, while here, also uses RuneLen to simplify
code in Encode.
Fixes #44940
Change-Id: Ifd3b537f69880dfd32a69a6733d8d3c2b5d4ecba
Reviewed-on: https://go-review.googlesource.com/c/go/+/569755
Reviewed-by: Ian Lance Taylor <iant@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Commit-Queue: Ian Lance Taylor <iant@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Ian Lance Taylor <iant@google.com>
Diffstat (limited to 'src/unicode')
| -rw-r--r-- | src/unicode/utf16/export_test.go | 3 | ||||
| -rw-r--r-- | src/unicode/utf16/utf16.go | 21 | ||||
| -rw-r--r-- | src/unicode/utf16/utf16_test.go | 20 |
3 files changed, 39 insertions, 5 deletions
diff --git a/src/unicode/utf16/export_test.go b/src/unicode/utf16/export_test.go index e0c57f52ae..74a89bf39a 100644 --- a/src/unicode/utf16/export_test.go +++ b/src/unicode/utf16/export_test.go @@ -6,6 +6,9 @@ package utf16 // Extra names for constants so we can validate them during testing. const ( + Surr1 = surr1 + Surr3 = surr3 + SurrSelf = surrSelf MaxRune = maxRune ReplacementChar = replacementChar ) diff --git a/src/unicode/utf16/utf16.go b/src/unicode/utf16/utf16.go index 1c6d2c66c3..0293bbf639 100644 --- a/src/unicode/utf16/utf16.go +++ b/src/unicode/utf16/utf16.go @@ -52,6 +52,19 @@ func EncodeRune(r rune) (r1, r2 rune) { return surr1 + (r>>10)&0x3ff, surr2 + r&0x3ff } +// RuneLen returns the number of 16-bit words in the UTF-16 encoding of the rune. +// It returns -1 if the rune is not a valid value to encode in UTF-16. +func RuneLen(r rune) int { + switch { + case 0 <= r && r < surr1, surr3 <= r && r < surrSelf: + return 1 + case surrSelf <= r && r <= maxRune: + return 2 + default: + return -1 + } +} + // Encode returns the UTF-16 encoding of the Unicode code point sequence s. func Encode(s []rune) []uint16 { n := len(s) @@ -64,13 +77,11 @@ func Encode(s []rune) []uint16 { a := make([]uint16, n) n = 0 for _, v := range s { - switch { - case 0 <= v && v < surr1, surr3 <= v && v < surrSelf: - // normal rune + switch RuneLen(v) { + case 1: // normal rune a[n] = uint16(v) n++ - case surrSelf <= v && v <= maxRune: - // needs surrogate sequence + case 2: // needs surrogate sequence r1, r2 := EncodeRune(v) a[n] = uint16(r1) a[n+1] = uint16(r2) diff --git a/src/unicode/utf16/utf16_test.go b/src/unicode/utf16/utf16_test.go index a5a503d387..74a4a6746b 100644 --- a/src/unicode/utf16/utf16_test.go +++ b/src/unicode/utf16/utf16_test.go @@ -22,6 +22,26 @@ func TestConstants(t *testing.T) { } } +func TestRuneLen(t *testing.T) { + for _, tt := range []struct { + r rune + length int + }{ + {0, 1}, + {Surr1 - 1, 1}, + {Surr3, 1}, + {SurrSelf - 1, 1}, + {SurrSelf, 2}, + {MaxRune, 2}, + {MaxRune + 1, -1}, + {-1, -1}, + } { + if length := RuneLen(tt.r); length != tt.length { + t.Errorf("RuneLen(%#U) = %d, want %d", tt.r, length, tt.length) + } + } +} + type encodeTest struct { in []rune out []uint16 |
