summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Shulhan <ms@kilabit.info> 2022-06-27 23:30:03 +0700
committer Shulhan <ms@kilabit.info> 2022-07-02 12:54:13 +0700
commit 21f2f2f64ab9f846800730003cf83643dd69b066 (patch)
tree 473b216111cb602a290a9f30b821472b78874024
parent 0f8ba5965c82c2c912f3c937e78917e20cd1eeb7 (diff)
download pakakeh.go-21f2f2f64ab9f846800730003cf83643dd69b066.tar.xz
net/html: use inline replacement to clean up white spaces
Instead of calling bytes.Replace three times, iterate over the plain text manually, in place, to clean up white space and collapse multiple spaces.

Benchmark result,

name        old time/op    new time/op    delta
Sanitize-8  4.27µs ±10%    2.64µs ±13%    -38.21%  (p=0.000 n=10+10)

name        old alloc/op   new alloc/op   delta
Sanitize-8  4.84kB ± 0%    4.45kB ± 0%    -7.94%  (p=0.000 n=10+10)

name        old allocs/op  new allocs/op  delta
Sanitize-8  13.0 ± 0%      6.0 ± 0%       -53.85%  (p=0.000 n=10+10)
-rw-r--r-- lib/net/html/example_test.go 2
-rw-r--r-- lib/net/html/html.go 42
2 files changed, 29 insertions, 15 deletions
diff --git a/lib/net/html/example_test.go b/lib/net/html/example_test.go
index 5554ef90..e06f6da4 100644
--- a/lib/net/html/example_test.go
+++ b/lib/net/html/example_test.go
@@ -13,6 +13,7 @@ func ExampleNormalizeForID() {
fmt.Println(NormalizeForID("_id.1"))
fmt.Println(NormalizeForID("1-d"))
fmt.Println(NormalizeForID(".123 ABC def"))
+ fmt.Println(NormalizeForID("test 123"))
fmt.Println(NormalizeForID("⌘"))
//Output:
//_
@@ -21,6 +22,7 @@ func ExampleNormalizeForID() {
//_id_1
//_1-d
//_123_abc_def
+ //test_123
//___
}
diff --git a/lib/net/html/html.go b/lib/net/html/html.go
index c149794e..807eae9c 100644
--- a/lib/net/html/html.go
+++ b/lib/net/html/html.go
@@ -71,14 +71,15 @@ func Sanitize(in []byte) (plain []byte) {
}
var (
- r = bytes.NewReader(in)
- twoSpaces = []byte(" ")
+ r = bytes.NewReader(in)
- w bytes.Buffer
- htmlToken *html.Tokenizer
- tokenType html.TokenType
- tagName []byte
- x int
+ w bytes.Buffer
+ htmlToken *html.Tokenizer
+ tokenType html.TokenType
+ tagName []byte
+ x, y int
+ c byte
+ prevIsSpace bool
)
htmlToken = html.NewTokenizer(r)
@@ -102,16 +103,27 @@ func Sanitize(in []byte) (plain []byte) {
}
out:
plain = w.Bytes()
- plain = bytes.Replace(plain, []byte("\r"), nil, -1)
- plain = bytes.Replace(plain, []byte("\n"), []byte(" "), -1)
- plain = bytes.Replace(plain, []byte("\t"), []byte(" "), -1)
- for {
- x = bytes.Index(plain, twoSpaces)
- if x < 0 {
- break
+
+ // Remove CR ('\r') and VT ('\v'), replace LF and TAB with a single
+ // space, and collapse consecutive spaces into one.
+ for y, c = range plain {
+ if c == '\r' || c == '\v' {
+ continue
}
- plain = bytes.Replace(plain, twoSpaces, []byte(" "), -1)
+ if c == '\n' || c == '\t' || c == ' ' {
+ if !prevIsSpace {
+ plain[x] = ' '
+ x++
+ prevIsSpace = true
+ }
+ continue
+ }
+ plain[x] = plain[y]
+ x++
+ prevIsSpace = false
}
+ plain = plain[:x]
+
return plain
}