summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWu Tingfeng <wu.tingfeng@u.nus.edu>2023-06-06 11:14:27 +0800
committerShulhan <ms@kilabit.info>2023-06-06 10:46:57 +0700
commitb6510f668837a17401f0b64fc208d809058f7ffd (patch)
tree5ab3443a383fcd72b1e7f78dd89ac395ba0ca180
parent2d57327dbe2d2d2d819d699f543daac9a55e48e9 (diff)
downloadpakakeh.go-b6510f668837a17401f0b64fc208d809058f7ffd.tar.xz
internal: add package asciiset
The package asciiset add new type ASCIISet, a bitmap to represent list of ASCII characters for faster lookup. ASCIISet is a 36-byte value, where each bit in the first 32-bytes represents the presence of a given ASCII character in the set. The remaining 4-bytes is a counter for the number of ASCII characters in the set. The 128-bits of the first 16 bytes, starting with the least-significant bit of the lowest word to the most-significant bit of the highest word, map to the full range of all 128 ASCII characters. The 128-bits of the next 16 bytes will be zeroed, ensuring that any non-ASCII character will be reported as not in the set.
-rw-r--r--internal/asciiset/asciiset.go120
-rw-r--r--lib/email/is.go8
-rw-r--r--lib/email/is_test.go5
3 files changed, 129 insertions, 4 deletions
diff --git a/internal/asciiset/asciiset.go b/internal/asciiset/asciiset.go
new file mode 100644
index 00000000..64914151
--- /dev/null
+++ b/internal/asciiset/asciiset.go
@@ -0,0 +1,120 @@
+// Copyright (c) 2022, Wu Tingfeng <wutingfeng@outlook.com>
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package asciiset is an ASCII character bitset
+package asciiset
+
+import (
+ "unicode/utf8"
+)
+
+// ASCIISet is a 36-byte value, where each bit in the first 32-bytes
+// represents the presence of a given ASCII character in the set.
+// The remaining 4-bytes is a counter for the number of ASCII characters in the set.
+// The 128-bits of the first 16 bytes, starting with the least-significant bit
+// of the lowest word to the most-significant bit of the highest word,
+// map to the full range of all 128 ASCII characters.
+// The 128-bits of the next 16 bytes will be zeroed,
+// ensuring that any non-ASCII character will be reported as not in the set.
+// Rejecting non-ASCII characters in this way avoids bounds checks in ASCIISet.Contains.
+type ASCIISet [9]uint32
+
+// MakeASCIISet creates a set of ASCII characters and reports whether all
+// characters in chars are ASCII.
+func MakeASCIISet(chars string) (as ASCIISet, ok bool) {
+ for i := 0; i < len(chars); i++ {
+ c := chars[i]
+ if c >= utf8.RuneSelf {
+ return as, false
+ }
+ as.Add(c)
+ }
+ return as, true
+}
+
+// Add inserts character c into the set.
+func (as *ASCIISet) Add(c byte) {
+ if c < utf8.RuneSelf { // ensure that c is an ASCII byte
+ before := as[c/32]
+ as[c/32] |= 1 << (c % 32)
+ if before != as[c/32] {
+ as[8]++
+ }
+ }
+}
+
+// Contains reports whether c is inside the set.
+func (as *ASCIISet) Contains(c byte) bool {
+ return (as[c/32] & (1 << (c % 32))) != 0
+}
+
+// Remove removes c from the set
+//
+// if c is not in the set, the set contents will remain unchanged.
+func (as *ASCIISet) Remove(c byte) {
+ if c < utf8.RuneSelf { // ensure that c is an ASCII byte
+ before := as[c/32]
+ as[c/32] &^= 1 << (c % 32)
+ if before != as[c/32] {
+ as[8]--
+ }
+ }
+}
+
+// Size returns the number of characters in the set.
+func (as *ASCIISet) Size() int {
+ return int(as[8])
+}
+
+// Union returns a new set containing all characters that belong to either as and as2.
+func (as *ASCIISet) Union(as2 ASCIISet) (as3 ASCIISet) {
+ as3[0] = as[0] | as2[0]
+ as3[1] = as[1] | as2[1]
+ as3[2] = as[2] | as2[2]
+ as3[3] = as[3] | as2[3]
+ return
+}
+
+// Intersection returns a new set containing all characters that belong to both as and as2.
+func (as *ASCIISet) Intersection(as2 ASCIISet) (as3 ASCIISet) {
+ as3[0] = as[0] & as2[0]
+ as3[1] = as[1] & as2[1]
+ as3[2] = as[2] & as2[2]
+ as3[3] = as[3] & as2[3]
+ return
+}
+
+// Subtract returns a new set containing all characters that belong to as but not as2.
+func (as *ASCIISet) Subtract(as2 ASCIISet) (as3 ASCIISet) {
+ as3[0] = as[0] &^ as2[0]
+ as3[1] = as[1] &^ as2[1]
+ as3[2] = as[2] &^ as2[2]
+ as3[3] = as[3] &^ as2[3]
+ return
+}
+
+// Equal reports whether as contains the same characters as as2.
+func (as *ASCIISet) Equal(as2 ASCIISet) bool {
+ return as[0] == as2[0] && as[1] == as2[1] && as[2] == as2[2] && as[3] == as2[3]
+}
+
+// Visit calls the do function for each character of as in ascending numerical order.
+// If do returns true, Visit returns immediately, skipping any remaining
+// characters, and returns true. It is safe for do to Add or Remove
+// characters. The behavior of Visit is undefined if do changes
+// the set in any other way.
+func (as *ASCIISet) Visit(do func(n byte) (skip bool)) (aborted bool) {
+ var currentChar byte
+ for i := uint(0); i < 4; i++ {
+ for j := uint(0); j < 32; j++ {
+ if (as[i] & (1 << j)) != 0 {
+ if do(currentChar) {
+ return true
+ }
+ }
+ currentChar++
+ }
+ }
+ return false
+}
diff --git a/lib/email/is.go b/lib/email/is.go
index f0922ec1..739ae493 100644
--- a/lib/email/is.go
+++ b/lib/email/is.go
@@ -4,7 +4,10 @@
package email
-var specialChars = map[byte]struct{}{
+import "github.com/shuLhan/share/internal/asciiset"
+
+var specialChars, _ = asciiset.MakeASCIISet(`()<>[]:;@\,"`)
+var specialCharsOld = map[byte]struct{}{
'(': {}, ')': {},
'<': {}, '>': {},
'[': {}, ']': {},
@@ -43,7 +46,8 @@ func IsValidLocal(local []byte) bool {
continue
}
dot = false
- if _, ok := specialChars[local[x]]; ok {
+ // if _, ok := specialCharsOld[local[x]]; ok {
+ if specialChars.Contains(local[x]) {
return false
}
}
diff --git a/lib/email/is_test.go b/lib/email/is_test.go
index a649f143..55ef3b42 100644
--- a/lib/email/is_test.go
+++ b/lib/email/is_test.go
@@ -39,7 +39,7 @@ func TestIsValidLocal(t *testing.T) {
test.Assert(t, "IsValidLocal", c.exp, got)
}
- for k := range specialChars {
+ specialChars.Visit(func(k byte) bool {
local := []byte("loc")
local = append(local, k)
local = append(local, "al"...)
@@ -49,5 +49,6 @@ func TestIsValidLocal(t *testing.T) {
got := IsValidLocal(local)
test.Assert(t, "IsValidLocal", false, got)
- }
+ return false
+ })
}