diff options
| author | Shulhan <ms@kilabit.info> | 2022-06-27 23:30:03 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2022-07-02 12:54:13 +0700 |
| commit | 21f2f2f64ab9f846800730003cf83643dd69b066 (patch) | |
| tree | 473b216111cb602a290a9f30b821472b78874024 | |
| parent | 0f8ba5965c82c2c912f3c937e78917e20cd1eeb7 (diff) | |
| download | pakakeh.go-21f2f2f64ab9f846800730003cf83643dd69b066.tar.xz | |
net/html: use inline replacement to clean up white spaces
Instead of calling bytes.Replace three times, iterate over the plain text
once, in place, to clean up white space and collapse multiple spaces.
Benchmark result,
name old time/op new time/op delta
Sanitize-8 4.27µs ±10% 2.64µs ±13% -38.21% (p=0.000 n=10+10)
name old alloc/op new alloc/op delta
Sanitize-8 4.84kB ± 0% 4.45kB ± 0% -7.94% (p=0.000 n=10+10)
name old allocs/op new allocs/op delta
Sanitize-8 13.0 ± 0% 6.0 ± 0% -53.85% (p=0.000 n=10+10)
| -rw-r--r-- | lib/net/html/example_test.go | 2 | ||||
| -rw-r--r-- | lib/net/html/html.go | 42 |
2 files changed, 29 insertions, 15 deletions
diff --git a/lib/net/html/example_test.go b/lib/net/html/example_test.go index 5554ef90..e06f6da4 100644 --- a/lib/net/html/example_test.go +++ b/lib/net/html/example_test.go @@ -13,6 +13,7 @@ func ExampleNormalizeForID() { fmt.Println(NormalizeForID("_id.1")) fmt.Println(NormalizeForID("1-d")) fmt.Println(NormalizeForID(".123 ABC def")) + fmt.Println(NormalizeForID("test 123")) fmt.Println(NormalizeForID("⌘")) //Output: //_ @@ -21,6 +22,7 @@ func ExampleNormalizeForID() { //_id_1 //_1-d //_123_abc_def + //test_123 //___ } diff --git a/lib/net/html/html.go b/lib/net/html/html.go index c149794e..807eae9c 100644 --- a/lib/net/html/html.go +++ b/lib/net/html/html.go @@ -71,14 +71,15 @@ func Sanitize(in []byte) (plain []byte) { } var ( - r = bytes.NewReader(in) - twoSpaces = []byte(" ") + r = bytes.NewReader(in) - w bytes.Buffer - htmlToken *html.Tokenizer - tokenType html.TokenType - tagName []byte - x int + w bytes.Buffer + htmlToken *html.Tokenizer + tokenType html.TokenType + tagName []byte + x, y int + c byte + prevIsSpace bool ) htmlToken = html.NewTokenizer(r) @@ -102,16 +103,27 @@ func Sanitize(in []byte) (plain []byte) { } out: plain = w.Bytes() - plain = bytes.Replace(plain, []byte("\r"), nil, -1) - plain = bytes.Replace(plain, []byte("\n"), []byte(" "), -1) - plain = bytes.Replace(plain, []byte("\t"), []byte(" "), -1) - for { - x = bytes.Index(plain, twoSpaces) - if x < 0 { - break + + // Remove CR ('\r'), replace LF and TAB with space and trim multiple + // spaces. + for y, c = range plain { + if c == '\r' || c == '\v' { + continue } - plain = bytes.Replace(plain, twoSpaces, []byte(" "), -1) + if c == '\n' || c == '\t' || c == ' ' { + if !prevIsSpace { + plain[x] = ' ' + x++ + prevIsSpace = true + } + continue + } + plain[x] = plain[y] + x++ + prevIsSpace = false } + plain = plain[:x] + return plain } |
