aboutsummaryrefslogtreecommitdiff
path: root/src/runtime
diff options
context:
space:
mode:
Diffstat (limited to 'src/runtime')
-rw-r--r--src/runtime/rune.go204
-rw-r--r--src/runtime/string.go12
-rw-r--r--src/runtime/string_test.go1
-rw-r--r--src/runtime/utf8.go123
4 files changed, 130 insertions, 210 deletions
diff --git a/src/runtime/rune.go b/src/runtime/rune.go
deleted file mode 100644
index 84f7bbf1c0..0000000000
--- a/src/runtime/rune.go
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- * The authors of this software are Rob Pike and Ken Thompson.
- * Copyright (c) 2002 by Lucent Technologies.
- * Portions Copyright 2009 The Go Authors. All rights reserved.
- * Permission to use, copy, modify, and distribute this software for any
- * purpose without fee is hereby granted, provided that this entire notice
- * is included in all copies of any software which is or includes a copy
- * or modification of this software and in all copies of the supporting
- * documentation for such software.
- * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
- * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
- * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
- * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
- */
-
-/*
- * This code is copied, with slight editing due to type differences,
- * from a subset of ../lib9/utf/rune.c [which no longer exists]
- */
-
-package runtime
-
-const (
- bit1 = 7
- bitx = 6
- bit2 = 5
- bit3 = 4
- bit4 = 3
- bit5 = 2
-
- t1 = ((1 << (bit1 + 1)) - 1) ^ 0xFF /* 0000 0000 */
- tx = ((1 << (bitx + 1)) - 1) ^ 0xFF /* 1000 0000 */
- t2 = ((1 << (bit2 + 1)) - 1) ^ 0xFF /* 1100 0000 */
- t3 = ((1 << (bit3 + 1)) - 1) ^ 0xFF /* 1110 0000 */
- t4 = ((1 << (bit4 + 1)) - 1) ^ 0xFF /* 1111 0000 */
- t5 = ((1 << (bit5 + 1)) - 1) ^ 0xFF /* 1111 1000 */
-
- rune1 = (1 << (bit1 + 0*bitx)) - 1 /* 0000 0000 0111 1111 */
- rune2 = (1 << (bit2 + 1*bitx)) - 1 /* 0000 0111 1111 1111 */
- rune3 = (1 << (bit3 + 2*bitx)) - 1 /* 1111 1111 1111 1111 */
- rune4 = (1 << (bit4 + 3*bitx)) - 1 /* 0001 1111 1111 1111 1111 1111 */
-
- maskx = (1 << bitx) - 1 /* 0011 1111 */
- testx = maskx ^ 0xFF /* 1100 0000 */
-
- runeerror = 0xFFFD
- runeself = 0x80
-
- surrogateMin = 0xD800
- surrogateMax = 0xDFFF
-
- runemax = 0x10FFFF /* maximum rune value */
-)
-
-// charntorune returns the rune at the start of
-// s[k:] and the index after the rune in s.
-//
-// If the string appears to be incomplete or decoding problems
-// are encountered (runeerror, k + 1) is returned to ensure
-// progress when charntorune is used to iterate over a string.
-//
-// Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
-func charntorune(s string, k int) (rune, int) {
- // When we're not allowed to read anything */
- if len(s) <= k {
- return runeerror, k + 1
- }
-
- s = s[k:]
-
- // one character sequence (7-bit value)
- // 00000-0007F => T1
- c := s[0]
- if c < tx {
- return rune(c), k + 1
- }
-
- // If we can't read more than one character we must stop
- if len(s) <= 1 {
- return runeerror, k + 1
- }
-
- // two character sequence (11-bit value)
- // 0080-07FF => t2 tx
- c1 := s[1] ^ tx
- if (c1 & testx) != 0 {
- return runeerror, k + 1
- }
- if c < t3 {
- if c < t2 {
- return runeerror, k + 1
- }
- l := ((rune(c) << bitx) | rune(c1)) & rune2
- if l <= rune1 {
- return runeerror, k + 1
- }
- return l, k + 2
- }
-
- // If we can't read more than two characters we must stop
- if len(s) <= 2 {
- return runeerror, k + 1
- }
-
- // three character sequence (16-bit value)
- // 0800-FFFF => t3 tx tx
- c2 := s[2] ^ tx
- if (c2 & testx) != 0 {
- return runeerror, k + 1
- }
- if c < t4 {
- l := ((((rune(c) << bitx) | rune(c1)) << bitx) | rune(c2)) & rune3
- if l <= rune2 {
- return runeerror, k + 1
- }
- if surrogateMin <= l && l <= surrogateMax {
- return runeerror, k + 1
- }
- return l, k + 3
- }
-
- if len(s) <= 3 {
- return runeerror, k + 1
- }
-
- // four character sequence (21-bit value)
- // 10000-1FFFFF => t4 tx tx tx
- c3 := s[3] ^ tx
- if (c3 & testx) != 0 {
- return runeerror, k + 1
- }
- if c < t5 {
- l := ((((((rune(c) << bitx) | rune(c1)) << bitx) | rune(c2)) << bitx) | rune(c3)) & rune4
- if l <= rune3 || l > runemax {
- return runeerror, k + 1
- }
- return l, k + 4
- }
-
- // Support for 5-byte or longer UTF-8 would go here, but
- // since we don't have that, we'll just return runeerror.
- return runeerror, k + 1
-}
-
-// runetochar converts r to bytes and writes the result to str.
-// returns the number of bytes generated.
-func runetochar(str []byte, r rune) int {
- /* runes are signed, so convert to unsigned for range check. */
- c := uint32(r)
- /*
- * one character sequence
- * 00000-0007F => 00-7F
- */
- if c <= rune1 {
- str[0] = byte(c)
- return 1
- }
- /*
- * two character sequence
- * 0080-07FF => t2 tx
- */
- if c <= rune2 {
- _ = str[1]
- str[0] = byte(t2 | (c >> (1 * bitx)))
- str[1] = byte(tx | (c & maskx))
- return 2
- }
-
- /*
- * If the rune is out of range or a surrogate half, convert it to the error rune.
- * Do this test here because the error rune encodes to three bytes.
- * Doing it earlier would duplicate work, since an out of range
- * rune wouldn't have fit in one or two bytes.
- */
- if c > runemax {
- c = runeerror
- }
- if surrogateMin <= c && c <= surrogateMax {
- c = runeerror
- }
-
- /*
- * three character sequence
- * 0800-FFFF => t3 tx tx
- */
- if c <= rune3 {
- _ = str[2]
- str[0] = byte(t3 | (c >> (2 * bitx)))
- str[1] = byte(tx | ((c >> (1 * bitx)) & maskx))
- str[2] = byte(tx | (c & maskx))
- return 3
- }
-
- /*
- * four character sequence (21-bit value)
- * 10000-1FFFFF => t4 tx tx tx
- */
- _ = str[3]
- str[0] = byte(t4 | (c >> (3 * bitx)))
- str[1] = byte(tx | ((c >> (2 * bitx)) & maskx))
- str[2] = byte(tx | ((c >> (1 * bitx)) & maskx))
- str[3] = byte(tx | (c & maskx))
- return 4
-}
diff --git a/src/runtime/string.go b/src/runtime/string.go
index 4c4b736c63..c7a9d27711 100644
--- a/src/runtime/string.go
+++ b/src/runtime/string.go
@@ -196,7 +196,7 @@ func slicerunetostring(buf *tmpBuf, a []rune) string {
var dum [4]byte
size1 := 0
for _, r := range a {
- size1 += runetochar(dum[:], r)
+ size1 += encoderune(dum[:], r)
}
s, b := rawstringtmp(buf, size1+3)
size2 := 0
@@ -205,7 +205,7 @@ func slicerunetostring(buf *tmpBuf, a []rune) string {
if size2 >= size1 {
break
}
- size2 += runetochar(b[size2:], r)
+ size2 += encoderune(b[size2:], r)
}
return s[:size2]
}
@@ -235,9 +235,9 @@ func intstring(buf *[4]byte, v int64) string {
s, b = rawstring(4)
}
if int64(rune(v)) != v {
- v = runeerror
+ v = runeError
}
- n := runetochar(b, rune(v))
+ n := encoderune(b, rune(v))
return s[:n]
}
@@ -378,7 +378,7 @@ func gostringw(strw *uint16) string {
str := (*[_MaxMem/2/2 - 1]uint16)(unsafe.Pointer(strw))
n1 := 0
for i := 0; str[i] != 0; i++ {
- n1 += runetochar(buf[:], rune(str[i]))
+ n1 += encoderune(buf[:], rune(str[i]))
}
s, b := rawstring(n1 + 4)
n2 := 0
@@ -387,7 +387,7 @@ func gostringw(strw *uint16) string {
if n2 >= n1 {
break
}
- n2 += runetochar(b[n2:], rune(str[i]))
+ n2 += encoderune(b[n2:], rune(str[i]))
}
b[n2] = 0 // for luck
return s[:n2]
diff --git a/src/runtime/string_test.go b/src/runtime/string_test.go
index 4ee32ea671..ef0b01c237 100644
--- a/src/runtime/string_test.go
+++ b/src/runtime/string_test.go
@@ -92,6 +92,7 @@ func BenchmarkConcatStringAndBytes(b *testing.B) {
var stringdata = []struct{ name, data string }{
{"ASCII", "01234567890"},
{"Japanese", "日本語日本語日本語"},
+ {"MixedLength", "$Ѐࠀက퀀𐀀\U00040000\U0010FFFF"},
}
func BenchmarkRuneIterate(b *testing.B) {
diff --git a/src/runtime/utf8.go b/src/runtime/utf8.go
new file mode 100644
index 0000000000..24ef179214
--- /dev/null
+++ b/src/runtime/utf8.go
@@ -0,0 +1,123 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+// Numbers fundamental to the encoding.
+const (
+ runeError = '\uFFFD' // the "error" Rune or "Unicode replacement character"
+ runeSelf = 0x80 // characters below runeSelf are represented as themselves in a single byte.
+ maxRune = '\U0010FFFF' // Maximum valid Unicode code point.
+)
+
+// Code points in the surrogate range are not valid for UTF-8.
+const (
+ surrogateMin = 0xD800
+ surrogateMax = 0xDFFF
+)
+
+const (
+ t1 = 0x00 // 0000 0000
+ tx = 0x80 // 1000 0000
+ t2 = 0xC0 // 1100 0000
+ t3 = 0xE0 // 1110 0000
+ t4 = 0xF0 // 1111 0000
+ t5 = 0xF8 // 1111 1000
+
+ maskx = 0x3F // 0011 1111
+ mask2 = 0x1F // 0001 1111
+ mask3 = 0x0F // 0000 1111
+ mask4 = 0x07 // 0000 0111
+
+ rune1Max = 1<<7 - 1
+ rune2Max = 1<<11 - 1
+ rune3Max = 1<<16 - 1
+
+ // The default lowest and highest continuation byte.
+ locb = 0x80 // 1000 0000
+ hicb = 0xBF // 1011 1111
+)
+
+// decoderune returns the non-ASCII rune at the start of
+// s[k:] and the index after the rune in s.
+//
+// decoderune assumes that the caller has checked that
+// the rune to be decoded is a non-ASCII rune.
+//
+// If the string appears to be incomplete or decoding problems
+// are encountered (runeError, k + 1) is returned to ensure
+// progress when decoderune is used to iterate over a string.
+func decoderune(s string, k int) (r rune, pos int) {
+ pos = k
+
+ if k >= len(s) {
+ return runeError, k + 1
+ }
+
+ s = s[k:]
+
+ switch {
+ case t2 <= s[0] && s[0] < t3:
+ // 0080-07FF two byte sequence; overlong encodings (r <= rune1Max) are rejected
+ if len(s) > 1 && (locb <= s[1] && s[1] <= hicb) {
+ r = rune(s[0]&mask2)<<6 | rune(s[1]&maskx)
+ pos += 2
+ if rune1Max < r {
+ return
+ }
+ }
+ case t3 <= s[0] && s[0] < t4:
+ // 0800-FFFF three byte sequence; overlong encodings and surrogates are rejected
+ if len(s) > 2 && (locb <= s[1] && s[1] <= hicb) && (locb <= s[2] && s[2] <= hicb) {
+ r = rune(s[0]&mask3)<<12 | rune(s[1]&maskx)<<6 | rune(s[2]&maskx)
+ pos += 3
+ if rune2Max < r && !(surrogateMin <= r && r <= surrogateMax) {
+ return
+ }
+ }
+ case t4 <= s[0] && s[0] < t5:
+ // 10000-10FFFF four byte sequence; overlong encodings and values above maxRune are rejected
+ if len(s) > 3 && (locb <= s[1] && s[1] <= hicb) && (locb <= s[2] && s[2] <= hicb) && (locb <= s[3] && s[3] <= hicb) {
+ r = rune(s[0]&mask4)<<18 | rune(s[1]&maskx)<<12 | rune(s[2]&maskx)<<6 | rune(s[3]&maskx)
+ pos += 4
+ if rune3Max < r && r <= maxRune {
+ return
+ }
+ }
+ }
+
+ return runeError, k + 1
+}
+
+// encoderune writes into p (which must be large enough) the UTF-8 encoding of the rune.
+// It returns the number of bytes written. Runes above maxRune or in the surrogate range are encoded as runeError.
+func encoderune(p []byte, r rune) int {
+ // Negative values are erroneous. As a uint32 they exceed maxRune, so they are encoded as runeError below.
+ switch i := uint32(r); {
+ case i <= rune1Max:
+ p[0] = byte(r)
+ return 1
+ case i <= rune2Max:
+ _ = p[1] // eliminate bounds checks
+ p[0] = t2 | byte(r>>6)
+ p[1] = tx | byte(r)&maskx
+ return 2
+ case i > maxRune, surrogateMin <= i && i <= surrogateMax:
+ r = runeError
+ fallthrough
+ case i <= rune3Max:
+ _ = p[2] // eliminate bounds checks
+ p[0] = t3 | byte(r>>12)
+ p[1] = tx | byte(r>>6)&maskx
+ p[2] = tx | byte(r)&maskx
+ return 3
+ default:
+ _ = p[3] // eliminate bounds checks
+ p[0] = t4 | byte(r>>18)
+ p[1] = tx | byte(r>>12)&maskx
+ p[2] = tx | byte(r>>6)&maskx
+ p[3] = tx | byte(r)&maskx
+ return 4
+ }
+}