net/html: use inline replacement to clean up white spaces

Instead of calling bytes.Replace three times, iterate over the plain text manually, in place, to clean up whitespace and collapse multiple spaces.

Benchmark result,

name        old time/op    new time/op    delta
Sanitize-8  4.27µs ±10%    2.64µs ±13%    -38.21%  (p=0.000 n=10+10)

name        old alloc/op   new alloc/op   delta
Sanitize-8  4.84kB ± 0%    4.45kB ± 0%    -7.94%  (p=0.000 n=10+10)

name        old allocs/op  new allocs/op  delta
Sanitize-8  13.0 ± 0%      6.0 ± 0%       -53.85%  (p=0.000 n=10+10)
author: Shulhan <ms@kilabit.info> 2022-06-27 23:30:03 +0700
committer: Shulhan <ms@kilabit.info> 2022-07-02 12:54:13 +0700
commit: 21f2f2f64ab9f846800730003cf83643dd69b066 (patch)
tree: 473b216111cb602a290a9f30b821472b78874024
parent: 0f8ba5965c82c2c912f3c937e78917e20cd1eeb7 (diff)
download: pakakeh.go-21f2f2f64ab9f846800730003cf83643dd69b066.tar.xz
2 files changed, 29 insertions, 15 deletions
diff --git a/lib/net/html/example_test.go b/lib/net/html/example_test.go
index 5554ef90..e06f6da4 100644
--- a/lib/net/html/example_test.go
+++ b/lib/net/html/example_test.go
@@ -13,6 +13,7 @@ func ExampleNormalizeForID() {
 	fmt.Println(NormalizeForID("_id.1"))
 	fmt.Println(NormalizeForID("1-d"))
 	fmt.Println(NormalizeForID(".123 ABC def"))
+	fmt.Println(NormalizeForID("test 123"))
 	fmt.Println(NormalizeForID("⌘"))
 	//Output:
 	//_
@@ -21,6 +22,7 @@ func ExampleNormalizeForID() {
 	//_id_1
 	//_1-d
 	//_123_abc_def
+	//test_123
 	//___
 }
 
diff --git a/lib/net/html/html.go b/lib/net/html/html.go
index c149794e..807eae9c 100644
--- a/lib/net/html/html.go
+++ b/lib/net/html/html.go
@@ -71,14 +71,15 @@ func Sanitize(in []byte) (plain []byte) {
 	}
 
 	var (
-		r         = bytes.NewReader(in)
-		twoSpaces = []byte("  ")
+		r = bytes.NewReader(in)
 
-		w         bytes.Buffer
-		htmlToken *html.Tokenizer
-		tokenType html.TokenType
-		tagName   []byte
-		x         int
+		w           bytes.Buffer
+		htmlToken   *html.Tokenizer
+		tokenType   html.TokenType
+		tagName     []byte
+		x, y        int
+		c           byte
+		prevIsSpace bool
 	)
 
 	htmlToken = html.NewTokenizer(r)
@@ -102,16 +103,27 @@ func Sanitize(in []byte) (plain []byte) {
 	}
 out:
 	plain = w.Bytes()
-	plain = bytes.Replace(plain, []byte("\r"), nil, -1)
-	plain = bytes.Replace(plain, []byte("\n"), []byte(" "), -1)
-	plain = bytes.Replace(plain, []byte("\t"), []byte(" "), -1)
-	for {
-		x = bytes.Index(plain, twoSpaces)
-		if x < 0 {
-			break
+
+	// Remove CR ('\r') and VT ('\v'), and collapse each run of LF, TAB,
+	// and space characters into a single space.
+	for y, c = range plain {
+		if c == '\r' || c == '\v' {
+			continue
 		}
-		plain = bytes.Replace(plain, twoSpaces, []byte(" "), -1)
+		if c == '\n' || c == '\t' || c == ' ' {
+			if !prevIsSpace {
+				plain[x] = ' '
+				x++
+				prevIsSpace = true
+			}
+			continue
+		}
+		plain[x] = plain[y]
+		x++
+		prevIsSpace = false
 	}
 
+	plain = plain[:x]
+
 	return plain
 }
author	Shulhan <ms@kilabit.info>	2022-06-27 23:30:03 +0700
committer	Shulhan <ms@kilabit.info>	2022-07-02 12:54:13 +0700
commit	21f2f2f64ab9f846800730003cf83643dd69b066 (patch)
tree	473b216111cb602a290a9f30b821472b78874024
parent	0f8ba5965c82c2c912f3c937e78917e20cd1eeb7 (diff)
download	pakakeh.go-21f2f2f64ab9f846800730003cf83643dd69b066.tar.xz