diff options
Diffstat (limited to 'src/runtime')
| -rw-r--r-- | src/runtime/rune.go | 204 | ||||
| -rw-r--r-- | src/runtime/string.go | 12 | ||||
| -rw-r--r-- | src/runtime/string_test.go | 1 | ||||
| -rw-r--r-- | src/runtime/utf8.go | 123 |
4 files changed, 130 insertions, 210 deletions
diff --git a/src/runtime/rune.go b/src/runtime/rune.go deleted file mode 100644 index 84f7bbf1c0..0000000000 --- a/src/runtime/rune.go +++ /dev/null @@ -1,204 +0,0 @@ -/* - * The authors of this software are Rob Pike and Ken Thompson. - * Copyright (c) 2002 by Lucent Technologies. - * Portions Copyright 2009 The Go Authors. All rights reserved. - * Permission to use, copy, modify, and distribute this software for any - * purpose without fee is hereby granted, provided that this entire notice - * is included in all copies of any software which is or includes a copy - * or modification of this software and in all copies of the supporting - * documentation for such software. - * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED - * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY - * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY - * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. - */ - -/* - * This code is copied, with slight editing due to type differences, - * from a subset of ../lib9/utf/rune.c [which no longer exists] - */ - -package runtime - -const ( - bit1 = 7 - bitx = 6 - bit2 = 5 - bit3 = 4 - bit4 = 3 - bit5 = 2 - - t1 = ((1 << (bit1 + 1)) - 1) ^ 0xFF /* 0000 0000 */ - tx = ((1 << (bitx + 1)) - 1) ^ 0xFF /* 1000 0000 */ - t2 = ((1 << (bit2 + 1)) - 1) ^ 0xFF /* 1100 0000 */ - t3 = ((1 << (bit3 + 1)) - 1) ^ 0xFF /* 1110 0000 */ - t4 = ((1 << (bit4 + 1)) - 1) ^ 0xFF /* 1111 0000 */ - t5 = ((1 << (bit5 + 1)) - 1) ^ 0xFF /* 1111 1000 */ - - rune1 = (1 << (bit1 + 0*bitx)) - 1 /* 0000 0000 0111 1111 */ - rune2 = (1 << (bit2 + 1*bitx)) - 1 /* 0000 0111 1111 1111 */ - rune3 = (1 << (bit3 + 2*bitx)) - 1 /* 1111 1111 1111 1111 */ - rune4 = (1 << (bit4 + 3*bitx)) - 1 /* 0001 1111 1111 1111 1111 1111 */ - - maskx = (1 << bitx) - 1 /* 0011 1111 */ - testx = maskx ^ 0xFF /* 1100 0000 */ - - runeerror = 0xFFFD - runeself = 0x80 - - surrogateMin = 0xD800 - surrogateMax = 0xDFFF - - runemax = 0x10FFFF /* maximum rune value */ -) - -// charntorune returns the rune at the start of -// s[k:] and the index after the rune in s. -// -// If the string appears to be incomplete or decoding problems -// are encountered (runeerror, k + 1) is returned to ensure -// progress when charntorune is used to iterate over a string. -// -// Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24 -func charntorune(s string, k int) (rune, int) { - // When we're not allowed to read anything */ - if len(s) <= k { - return runeerror, k + 1 - } - - s = s[k:] - - // one character sequence (7-bit value) - // 00000-0007F => T1 - c := s[0] - if c < tx { - return rune(c), k + 1 - } - - // If we can't read more than one character we must stop - if len(s) <= 1 { - return runeerror, k + 1 - } - - // two character sequence (11-bit value) - // 0080-07FF => t2 tx - c1 := s[1] ^ tx - if (c1 & testx) != 0 { - return runeerror, k + 1 - } - if c < t3 { - if c < t2 { - return runeerror, k + 1 - } - l := ((rune(c) << bitx) | rune(c1)) & rune2 - if l <= rune1 { - return runeerror, k + 1 - } - return l, k + 2 - } - - // If we can't read more than two characters we must stop - if len(s) <= 2 { - return runeerror, k + 1 - } - - // three character sequence (16-bit value) - // 0800-FFFF => t3 tx tx - c2 := s[2] ^ tx - if (c2 & testx) != 0 { - return runeerror, k + 1 - } - if c < t4 { - l := ((((rune(c) << bitx) | rune(c1)) << bitx) | rune(c2)) & rune3 - if l <= rune2 { - return runeerror, k + 1 - } - if surrogateMin <= l && l <= surrogateMax { - return runeerror, k + 1 - } - return l, k + 3 - } - - if len(s) <= 3 { - return runeerror, k + 1 - } - - // four character sequence (21-bit value) - // 10000-1FFFFF => t4 tx tx tx - c3 := s[3] ^ tx - if (c3 & testx) != 0 { - return runeerror, k + 1 - } - if c < t5 { - l := ((((((rune(c) << bitx) | rune(c1)) << bitx) | rune(c2)) << bitx) | rune(c3)) & rune4 - if l <= rune3 || l > runemax { - return runeerror, k + 1 - } - return l, k + 4 - } - - // Support for 5-byte or longer UTF-8 would go here, but - // since we don't have that, we'll just return runeerror. - return runeerror, k + 1 -} - -// runetochar converts r to bytes and writes the result to str. -// returns the number of bytes generated. -func runetochar(str []byte, r rune) int { - /* runes are signed, so convert to unsigned for range check. */ - c := uint32(r) - /* - * one character sequence - * 00000-0007F => 00-7F - */ - if c <= rune1 { - str[0] = byte(c) - return 1 - } - /* - * two character sequence - * 0080-07FF => t2 tx - */ - if c <= rune2 { - _ = str[1] - str[0] = byte(t2 | (c >> (1 * bitx))) - str[1] = byte(tx | (c & maskx)) - return 2 - } - - /* - * If the rune is out of range or a surrogate half, convert it to the error rune. - * Do this test here because the error rune encodes to three bytes. - * Doing it earlier would duplicate work, since an out of range - * rune wouldn't have fit in one or two bytes. - */ - if c > runemax { - c = runeerror - } - if surrogateMin <= c && c <= surrogateMax { - c = runeerror - } - - /* - * three character sequence - * 0800-FFFF => t3 tx tx - */ - if c <= rune3 { - _ = str[2] - str[0] = byte(t3 | (c >> (2 * bitx))) - str[1] = byte(tx | ((c >> (1 * bitx)) & maskx)) - str[2] = byte(tx | (c & maskx)) - return 3 - } - - /* - * four character sequence (21-bit value) - * 10000-1FFFFF => t4 tx tx tx - */ - _ = str[3] - str[0] = byte(t4 | (c >> (3 * bitx))) - str[1] = byte(tx | ((c >> (2 * bitx)) & maskx)) - str[2] = byte(tx | ((c >> (1 * bitx)) & maskx)) - str[3] = byte(tx | (c & maskx)) - return 4 -} diff --git a/src/runtime/string.go b/src/runtime/string.go index 4c4b736c63..c7a9d27711 100644 --- a/src/runtime/string.go +++ b/src/runtime/string.go @@ -196,7 +196,7 @@ func slicerunetostring(buf *tmpBuf, a []rune) string { var dum [4]byte size1 := 0 for _, r := range a { - size1 += runetochar(dum[:], r) + size1 += encoderune(dum[:], r) } s, b := rawstringtmp(buf, size1+3) size2 := 0 @@ -205,7 +205,7 @@ func slicerunetostring(buf *tmpBuf, a []rune) string { if size2 >= size1 { break } - size2 += runetochar(b[size2:], r) + size2 += encoderune(b[size2:], r) } return s[:size2] } @@ -235,9 +235,9 @@ func intstring(buf *[4]byte, v int64) string { s, b = rawstring(4) } if int64(rune(v)) != v { - v = runeerror + v = runeError } - n := runetochar(b, rune(v)) + n := encoderune(b, rune(v)) return s[:n] } @@ -378,7 +378,7 @@ func gostringw(strw *uint16) string { str := (*[_MaxMem/2/2 - 1]uint16)(unsafe.Pointer(strw)) n1 := 0 for i := 0; str[i] != 0; i++ { - n1 += runetochar(buf[:], rune(str[i])) + n1 += encoderune(buf[:], rune(str[i])) } s, b := rawstring(n1 + 4) n2 := 0 @@ -387,7 +387,7 @@ func gostringw(strw *uint16) string { if n2 >= n1 { break } - n2 += runetochar(b[n2:], rune(str[i])) + n2 += encoderune(b[n2:], rune(str[i])) } b[n2] = 0 // for luck return s[:n2] diff --git a/src/runtime/string_test.go b/src/runtime/string_test.go index 4ee32ea671..ef0b01c237 100644 --- a/src/runtime/string_test.go +++ b/src/runtime/string_test.go @@ -92,6 +92,7 @@ func BenchmarkConcatStringAndBytes(b *testing.B) { var stringdata = []struct{ name, data string }{ {"ASCII", "01234567890"}, {"Japanese", "日本語日本語日本語"}, + {"MixedLength", "$Ѐࠀက퀀𐀀\U00040000\U0010FFFF"}, } func BenchmarkRuneIterate(b *testing.B) { diff --git a/src/runtime/utf8.go b/src/runtime/utf8.go new file mode 100644 index 0000000000..24ef179214 --- /dev/null +++ b/src/runtime/utf8.go @@ -0,0 +1,123 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +// Numbers fundamental to the encoding. +const ( + runeError = '\uFFFD' // the "error" Rune or "Unicode replacement character" + runeSelf = 0x80 // characters below Runeself are represented as themselves in a single byte. + maxRune = '\U0010FFFF' // Maximum valid Unicode code point. +) + +// Code points in the surrogate range are not valid for UTF-8. +const ( + surrogateMin = 0xD800 + surrogateMax = 0xDFFF +) + +const ( + t1 = 0x00 // 0000 0000 + tx = 0x80 // 1000 0000 + t2 = 0xC0 // 1100 0000 + t3 = 0xE0 // 1110 0000 + t4 = 0xF0 // 1111 0000 + t5 = 0xF8 // 1111 1000 + + maskx = 0x3F // 0011 1111 + mask2 = 0x1F // 0001 1111 + mask3 = 0x0F // 0000 1111 + mask4 = 0x07 // 0000 0111 + + rune1Max = 1<<7 - 1 + rune2Max = 1<<11 - 1 + rune3Max = 1<<16 - 1 + + // The default lowest and highest continuation byte. + locb = 0x80 // 1000 0000 + hicb = 0xBF // 1011 1111 +) + +// decoderune returns the non-ASCII rune at the start of +// s[k:] and the index after the rune in s. +// +// decoderune assumes that caller has checked that +// the to be decoded rune is a non-ASCII rune. +// +// If the string appears to be incomplete or decoding problems +// are encountered (runeerror, k + 1) is returned to ensure +// progress when decoderune is used to iterate over a string. +func decoderune(s string, k int) (r rune, pos int) { + pos = k + + if k >= len(s) { + return runeError, k + 1 + } + + s = s[k:] + + switch { + case t2 <= s[0] && s[0] < t3: + // 0080-07FF two byte sequence + if len(s) > 1 && (locb <= s[1] && s[1] <= hicb) { + r = rune(s[0]&mask2)<<6 | rune(s[1]&maskx) + pos += 2 + if rune1Max < r { + return + } + } + case t3 <= s[0] && s[0] < t4: + // 0800-FFFF three byte sequence + if len(s) > 2 && (locb <= s[1] && s[1] <= hicb) && (locb <= s[2] && s[2] <= hicb) { + r = rune(s[0]&mask3)<<12 | rune(s[1]&maskx)<<6 | rune(s[2]&maskx) + pos += 3 + if rune2Max < r && !(surrogateMin <= r && r <= surrogateMax) { + return + } + } + case t4 <= s[0] && s[0] < t5: + // 10000-1FFFFF four byte sequence + if len(s) > 3 && (locb <= s[1] && s[1] <= hicb) && (locb <= s[2] && s[2] <= hicb) && (locb <= s[3] && s[3] <= hicb) { + r = rune(s[0]&mask4)<<18 | rune(s[1]&maskx)<<12 | rune(s[2]&maskx)<<6 | rune(s[3]&maskx) + pos += 4 + if rune3Max < r && r <= maxRune { + return + } + } + } + + return runeError, k + 1 +} + +// encoderune writes into p (which must be large enough) the UTF-8 encoding of the rune. +// It returns the number of bytes written. +func encoderune(p []byte, r rune) int { + // Negative values are erroneous. Making it unsigned addresses the problem. + switch i := uint32(r); { + case i <= rune1Max: + p[0] = byte(r) + return 1 + case i <= rune2Max: + _ = p[1] // eliminate bounds checks + p[0] = t2 | byte(r>>6) + p[1] = tx | byte(r)&maskx + return 2 + case i > maxRune, surrogateMin <= i && i <= surrogateMax: + r = runeError + fallthrough + case i <= rune3Max: + _ = p[2] // eliminate bounds checks + p[0] = t3 | byte(r>>12) + p[1] = tx | byte(r>>6)&maskx + p[2] = tx | byte(r)&maskx + return 3 + default: + _ = p[3] // eliminate bounds checks + p[0] = t4 | byte(r>>18) + p[1] = tx | byte(r>>12)&maskx + p[2] = tx | byte(r>>6)&maskx + p[3] = tx | byte(r)&maskx + return 4 + } +} |
