diff options
| author | Wu Tingfeng <wu.tingfeng@u.nus.edu> | 2023-06-06 11:14:27 +0800 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2023-06-06 10:46:57 +0700 |
| commit | b6510f668837a17401f0b64fc208d809058f7ffd (patch) | |
| tree | 5ab3443a383fcd72b1e7f78dd89ac395ba0ca180 | |
| parent | 2d57327dbe2d2d2d819d699f543daac9a55e48e9 (diff) | |
| download | pakakeh.go-b6510f668837a17401f0b64fc208d809058f7ffd.tar.xz | |
internal: add package asciiset
The package asciiset add new type ASCIISet, a bitmap to represent list of ASCII characters
for faster lookup.
ASCIISet is a 36-byte value, where each bit in the first 32-bytes represents the presence of
a given ASCII character in the set.
The remaining 4-bytes is a counter for the number of ASCII characters in the set.
The 128-bits of the first 16 bytes, starting with the least-significant bit of the lowest word
to the most-significant bit of the highest word, map to the full range of all 128 ASCII characters.
The 128-bits of the next 16 bytes will be zeroed, ensuring that any non-ASCII character will
be reported as not in the set.
| -rw-r--r-- | internal/asciiset/asciiset.go | 120 | ||||
| -rw-r--r-- | lib/email/is.go | 8 | ||||
| -rw-r--r-- | lib/email/is_test.go | 5 |
3 files changed, 129 insertions, 4 deletions
diff --git a/internal/asciiset/asciiset.go b/internal/asciiset/asciiset.go new file mode 100644 index 00000000..64914151 --- /dev/null +++ b/internal/asciiset/asciiset.go @@ -0,0 +1,120 @@ +// Copyright (c) 2022, Wu Tingfeng <wutingfeng@outlook.com> +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package asciiset is an ASCII character bitset +package asciiset + +import ( + "unicode/utf8" +) + +// ASCIISet is a 36-byte value, where each bit in the first 32-bytes +// represents the presence of a given ASCII character in the set. +// The remaining 4-bytes is a counter for the number of ASCII characters in the set. +// The 128-bits of the first 16 bytes, starting with the least-significant bit +// of the lowest word to the most-significant bit of the highest word, +// map to the full range of all 128 ASCII characters. +// The 128-bits of the next 16 bytes will be zeroed, +// ensuring that any non-ASCII character will be reported as not in the set. +// Rejecting non-ASCII characters in this way avoids bounds checks in ASCIISet.Contains. +type ASCIISet [9]uint32 + +// MakeASCIISet creates a set of ASCII characters and reports whether all +// characters in chars are ASCII. +func MakeASCIISet(chars string) (as ASCIISet, ok bool) { + for i := 0; i < len(chars); i++ { + c := chars[i] + if c >= utf8.RuneSelf { + return as, false + } + as.Add(c) + } + return as, true +} + +// Add inserts character c into the set. +func (as *ASCIISet) Add(c byte) { + if c < utf8.RuneSelf { // ensure that c is an ASCII byte + before := as[c/32] + as[c/32] |= 1 << (c % 32) + if before != as[c/32] { + as[8]++ + } + } +} + +// Contains reports whether c is inside the set. +func (as *ASCIISet) Contains(c byte) bool { + return (as[c/32] & (1 << (c % 32))) != 0 +} + +// Remove removes c from the set +// +// if c is not in the set, the set contents will remain unchanged. +func (as *ASCIISet) Remove(c byte) { + if c < utf8.RuneSelf { // ensure that c is an ASCII byte + before := as[c/32] + as[c/32] &^= 1 << (c % 32) + if before != as[c/32] { + as[8]-- + } + } +} + +// Size returns the number of characters in the set. +func (as *ASCIISet) Size() int { + return int(as[8]) +} + +// Union returns a new set containing all characters that belong to either as and as2. +func (as *ASCIISet) Union(as2 ASCIISet) (as3 ASCIISet) { + as3[0] = as[0] | as2[0] + as3[1] = as[1] | as2[1] + as3[2] = as[2] | as2[2] + as3[3] = as[3] | as2[3] + return +} + +// Intersection returns a new set containing all characters that belong to both as and as2. +func (as *ASCIISet) Intersection(as2 ASCIISet) (as3 ASCIISet) { + as3[0] = as[0] & as2[0] + as3[1] = as[1] & as2[1] + as3[2] = as[2] & as2[2] + as3[3] = as[3] & as2[3] + return +} + +// Subtract returns a new set containing all characters that belong to as but not as2. +func (as *ASCIISet) Subtract(as2 ASCIISet) (as3 ASCIISet) { + as3[0] = as[0] &^ as2[0] + as3[1] = as[1] &^ as2[1] + as3[2] = as[2] &^ as2[2] + as3[3] = as[3] &^ as2[3] + return +} + +// Equal reports whether as contains the same characters as as2. +func (as *ASCIISet) Equal(as2 ASCIISet) bool { + return as[0] == as2[0] && as[1] == as2[1] && as[2] == as2[2] && as[3] == as2[3] +} + +// Visit calls the do function for each character of as in ascending numerical order. +// If do returns true, Visit returns immediately, skipping any remaining +// characters, and returns true. It is safe for do to Add or Remove +// characters. The behavior of Visit is undefined if do changes +// the set in any other way. +func (as *ASCIISet) Visit(do func(n byte) (skip bool)) (aborted bool) { + var currentChar byte + for i := uint(0); i < 4; i++ { + for j := uint(0); j < 32; j++ { + if (as[i] & (1 << j)) != 0 { + if do(currentChar) { + return true + } + } + currentChar++ + } + } + return false +} diff --git a/lib/email/is.go b/lib/email/is.go index f0922ec1..739ae493 100644 --- a/lib/email/is.go +++ b/lib/email/is.go @@ -4,7 +4,10 @@ package email -var specialChars = map[byte]struct{}{ +import "github.com/shuLhan/share/internal/asciiset" + +var specialChars, _ = asciiset.MakeASCIISet(`()<>[]:;@\,"`) +var specialCharsOld = map[byte]struct{}{ '(': {}, ')': {}, '<': {}, '>': {}, '[': {}, ']': {}, @@ -43,7 +46,8 @@ func IsValidLocal(local []byte) bool { continue } dot = false - if _, ok := specialChars[local[x]]; ok { + // if _, ok := specialCharsOld[local[x]]; ok { + if specialChars.Contains(local[x]) { return false } } diff --git a/lib/email/is_test.go b/lib/email/is_test.go index a649f143..55ef3b42 100644 --- a/lib/email/is_test.go +++ b/lib/email/is_test.go @@ -39,7 +39,7 @@ func TestIsValidLocal(t *testing.T) { test.Assert(t, "IsValidLocal", c.exp, got) } - for k := range specialChars { + specialChars.Visit(func(k byte) bool { local := []byte("loc") local = append(local, k) local = append(local, "al"...) @@ -49,5 +49,6 @@ func TestIsValidLocal(t *testing.T) { got := IsValidLocal(local) test.Assert(t, "IsValidLocal", false, got) - } + return false + }) } |
