lib: move package "net/html" to "lib/html"

Putting "html" under the "net" package makes no sense. Another reason is to keep the packages flat under the "lib/" directory.
author: Shulhan <ms@kilabit.info> 2024-03-06 03:42:00 +0700
committer: Shulhan <ms@kilabit.info> 2024-03-09 01:10:17 +0700
commit: e730418289d80985c5d48946353961a357a4b532 (patch)
tree: 9e573bf4fca975775eb25f32448f755a325880a8 /lib/html
parent: 1e7cb99f42bcd41e98326bd9406d3cecfb2a4542 (diff)
download: pakakeh.go-e730418289d80985c5d48946353961a357a4b532.tar.xz
6 files changed, 492 insertions, 0 deletions
diff --git a/lib/html/benchmark_test.go b/lib/html/benchmark_test.go
new file mode 100644
index 00000000..05aa62d2
--- /dev/null
+++ b/lib/html/benchmark_test.go
@@ -0,0 +1,45 @@
+// Copyright 2022, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import "testing"
+
+func BenchmarkNormalizeForID(b *testing.B) { // Times NormalizeForID on an empty and a non-empty input.
+	var (
+		cases = []string{
+			"",
+			".123 ABC def",
+		}
+		x int
+	)
+	for ; x < b.N; x++ { // Each iteration makes two calls, one per case; the results are discarded.
+		NormalizeForID(cases[0])
+		NormalizeForID(cases[1])
+	}
+}
+
+func BenchmarkSanitize(b *testing.B) { // Times Sanitize on one small, fixed HTML document.
+	var (
+		input = []byte(`
+<html>
+	<title>Test</title>
+	<head>
+	</head>
+	<body>
+		This
+		<p> is </p>
+		a
+		<a href="/">link</a>.
+		An another
+		<a href="/">link</a>.
+	</body>
+</html>`)
+		x int
+	)
+
+	for ; x < b.N; x++ { // The same input slice is passed on every iteration; the result is discarded.
+		Sanitize(input)
+	}
+}
diff --git a/lib/html/example_node_iterator_test.go b/lib/html/example_node_iterator_test.go
new file mode 100644
index 00000000..94ece050
--- /dev/null
+++ b/lib/html/example_node_iterator_test.go
@@ -0,0 +1,96 @@
+// Copyright 2020, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+	"fmt"
+	"log"
+	"strings"
+)
+
+func ExampleParse() { // Prints every node that Next yields for a small list.
+	rawHTML := `
+<ul>
+	<li>
+		<b>item</b>
+		<span>one</span>
+	</li>
+</ul>
+`
+
+	r := strings.NewReader(rawHTML)
+
+	iter, err := Parse(r)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	for node := iter.Next(); node != nil; node = iter.Next() { // Next returns nil once the walk is finished.
+		if node.IsElement() {
+			fmt.Printf("%s\n", node.Data) // An element with children is printed twice: on entry and again after its children.
+		} else {
+			fmt.Printf("\t%s\n", node.Data)
+		}
+	}
+
+	// Output:
+	// html
+	// head
+	// body
+	// ul
+	// li
+	// b
+	// 	item
+	// b
+	// span
+	// 	one
+	// span
+	// li
+	// ul
+	// body
+	// html
+}
+
+func ExampleNodeIterator_SetNext() { // Shows how SetNext lets the loop skip a whole subtree.
+	rawHTML := `
+<ul>
+	<li>
+		<b>item</b>
+		<span>one</span>
+	</li>
+</ul>
+<h2>Jump here</h2>
+`
+
+	r := strings.NewReader(rawHTML)
+
+	iter, err := Parse(r)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	for node := iter.Next(); node != nil; node = iter.Next() {
+		if node.IsElement() {
+			if node.Data == "ul" {
+				// Skip the "ul" subtree by jumping to its next non-empty sibling.
+				iter.SetNext(node.GetNextSibling())
+				continue
+			}
+			fmt.Printf("%s\n", node.Data)
+		} else {
+			fmt.Printf("\t%s\n", node.Data)
+		}
+	}
+
+	// Output:
+	// html
+	// head
+	// body
+	// h2
+	// 	Jump here
+	// h2
+	// body
+	// html
+}
diff --git a/lib/html/example_test.go b/lib/html/example_test.go
new file mode 100644
index 00000000..d0582555
--- /dev/null
+++ b/lib/html/example_test.go
@@ -0,0 +1,51 @@
+// Copyright 2022, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import "fmt"
+
+func ExampleNormalizeForID() { // Prints the normalized ID for a range of inputs, one per line.
+	fmt.Println(NormalizeForID(""))
+	fmt.Println(NormalizeForID(" id "))
+	fmt.Println(NormalizeForID(" ID "))
+	fmt.Println(NormalizeForID("_id.1"))
+	fmt.Println(NormalizeForID("1-d"))
+	fmt.Println(NormalizeForID(".123 ABC def"))
+	fmt.Println(NormalizeForID("test 123"))
+	fmt.Println(NormalizeForID("⌘")) // Replacement is per byte, so this multi-byte character yields three '_'.
+	// Output:
+	// _
+	// _id_
+	// _id_
+	// _id_1
+	// _1-d
+	// _123_abc_def
+	// test_123
+	// ___
+}
+
+func ExampleSanitize() { // Shows the plain text that Sanitize extracts from a small HTML document.
+	input := `
+<html>
+	<title>Test</title>
+	<head>
+	</head>
+	<body>
+		This
+		<p> is </p>
+		a
+		<a href="/">link</a>.
+		An another
+		<a href="/">link</a>.
+	</body>
+</html>
+`
+
+	out := Sanitize([]byte(input)) // The title text "Test" does not appear in the output below.
+	fmt.Printf("%s", out)
+
+	// Output:
+	// This is a link. An another link.
+}
diff --git a/lib/html/html.go b/lib/html/html.go
new file mode 100644
index 00000000..9f5324ab
--- /dev/null
+++ b/lib/html/html.go
@@ -0,0 +1,132 @@
+// Copyright 2020, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package html extends the golang.org/x/net/html by providing simplified
+// methods to Node.
+//
+// The x/net/html package currently provides only bare functionality to
+// iterate the tree: there is no check for empty nodes and no function to
+// get an attribute by name without looping over the attributes manually.
+//
+// This package extends x/net/html by adding methods to get a node's
+// attribute by name, get the first non-empty child, get the next
+// non-empty sibling, and an iterator to walk the tree.
+package html
+
+import (
+	"bytes"
+
+	"golang.org/x/net/html"
+
+	"git.sr.ht/~shulhan/pakakeh.go/lib/ascii"
+)
+
+// NormalizeForID converts the input string into a value usable as an HTML ID.
+// The normalization follows the [Mozilla specification] rules,
+//
+//   - it must not contain whitespace (spaces, tabs etc.),
+//   - only ASCII letters, digits, '_', and '-' should be used, and
+//   - it should start with a letter.
+//
+// This function,
+//
+//   - returns "_" if the input is an empty string,
+//   - replaces every byte that is not a letter, digit, '-', or '_' with '_',
+//   - prefixes the output with '_' unless it starts with '_' or a letter, and
+//   - converts upper case letters to lower case.
+//
+// [Mozilla specification]: https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes/id
+func NormalizeForID(in string) (out string) {
+	var (
+		bin = []byte(in)
+		x   int
+		b   byte
+	)
+
+	for x, b = range bin {
+		if ascii.IsAlpha(b) {
+			if b >= 'A' && b <= 'Z' {
+				bin[x] = b + 32 // 'a' - 'A' == 32, so this lower-cases the letter.
+			}
+		} else if !(ascii.IsDigit(b) || b == '-' || b == '_') {
+			bin[x] = '_'
+		}
+	}
+	if len(bin) == 0 {
+		bin = append(bin, '_')
+	} else if !ascii.IsAlpha(bin[0]) && bin[0] != '_' { // At this point that means a leading digit or '-'.
+		bin = append(bin, '_')
+		copy(bin[1:], bin) // Shift everything right by one byte to free bin[0] for the prefix.
+		bin[0] = '_'
+	}
+
+	return string(bin)
+}
+
+// Sanitize strips the markup from HTML "in" and returns its text with whitespace collapsed.
+func Sanitize(in []byte) (plain []byte) {
+	if len(in) == 0 {
+		return plain // plain is still nil here.
+	}
+
+	var (
+		r = bytes.NewReader(in)
+
+		w           bytes.Buffer
+		htmlToken   *html.Tokenizer
+		tokenType   html.TokenType
+		tagName     []byte
+		x, y        int
+		c           byte
+		prevIsSpace bool
+	)
+
+	htmlToken = html.NewTokenizer(r)
+	for {
+		tokenType = htmlToken.Next()
+		switch tokenType {
+		case html.EndTagToken, html.SelfClosingTagToken, html.CommentToken, html.DoctypeToken:
+			// NOOP: these tokens add nothing to the output.
+
+		case html.ErrorToken:
+			goto out // Stop on any error token; presumably this includes reaching the end of input.
+
+		case html.TextToken:
+			w.Write(htmlToken.Text())
+
+		case html.StartTagToken:
+			tagName, _ = htmlToken.TagName()
+
+			if bytes.Equal(tagName, []byte("title")) ||
+				bytes.Equal(tagName, []byte("script")) {
+				htmlToken.Next() // Consume the token after the tag (presumably its content) without writing it.
+			}
+		}
+	}
+out:
+	plain = w.Bytes()
+
+	// Drop CR ('\r') and VT ('\v'), turn LF and TAB into a space, and collapse
+	// each run of spaces into one, compacting plain in place.
+	for y, c = range plain { // y is the read index; x, never ahead of y, is the write index.
+		if c == '\r' || c == '\v' {
+			continue
+		}
+		if c == '\n' || c == '\t' || c == ' ' {
+			if !prevIsSpace {
+				plain[x] = ' '
+				x++
+				prevIsSpace = true
+			}
+			continue
+		}
+		plain[x] = plain[y]
+		x++
+		prevIsSpace = false
+	}
+
+	plain = plain[:x]
+
+	return plain
+}
diff --git a/lib/html/node.go b/lib/html/node.go
new file mode 100644
index 00000000..ac00ac48
--- /dev/null
+++ b/lib/html/node.go
@@ -0,0 +1,75 @@
+// Copyright 2020, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+	"strings"
+
+	"golang.org/x/net/html"
+)
+
+// Node extends html.Node by embedding a pointer to it, so new methods can be added.
+type Node struct {
+	*html.Node
+}
+
+// NewNode creates a new Node that embeds html.Node "el"; "el" is stored as is, even if nil.
+func NewNode(el *html.Node) *Node {
+	return &Node{Node: el}
+}
+
+// GetAttrValue returns the value of the first attribute whose key equals
+// "key", or an empty string if no attribute matches.
+func (node *Node) GetAttrValue(key string) string {
+	for _, attr := range node.Attr {
+		if key == attr.Key {
+			return attr.Val
+		}
+	}
+	return "" // Indistinguishable from a matching attribute whose value is empty.
+}
+
+// GetFirstChild returns the first child that is not a whitespace-only text
+// node, or nil if there is no such child.
+func (node *Node) GetFirstChild() *Node {
+	el := node.FirstChild
+	for el != nil {
+		if el.Type == html.TextNode {
+			if len(strings.TrimSpace(el.Data)) == 0 {
+				el = el.NextSibling // Whitespace-only text: try the following sibling instead.
+				continue
+			}
+		}
+		break // el is a non-text node, or a text node with content.
+	}
+	if el == nil {
+		return nil
+	}
+	return NewNode(el)
+}
+
+// GetNextSibling returns the next sibling that is not a whitespace-only text
+// node, or nil if there is no such sibling.
+func (node *Node) GetNextSibling() *Node {
+	el := node.NextSibling
+	for el != nil {
+		if el.Type == html.TextNode {
+			if len(strings.TrimSpace(el.Data)) == 0 {
+				el = el.NextSibling // Whitespace-only text: try the following sibling instead.
+				continue
+			}
+		}
+		break // el is a non-text node, or a text node with content.
+	}
+	if el == nil {
+		return nil
+	}
+	return NewNode(el)
+}
+
+// IsElement returns true if the node type is html.ElementNode.
+func (node *Node) IsElement() bool {
+	return node.Type == html.ElementNode
+}
diff --git a/lib/html/node_iterator.go b/lib/html/node_iterator.go
new file mode 100644
index 00000000..ca1819d4
--- /dev/null
+++ b/lib/html/node_iterator.go
@@ -0,0 +1,93 @@
+// Copyright 2020, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+	"io"
+	"strings"
+
+	"golang.org/x/net/html"
+)
+
+// NodeIterator simplifies iterating over every node of a tree, from top to bottom.
+type NodeIterator struct {
+	current  *Node      // Node most recently returned by Next; set to the parsed root by Parse.
+	previous *Node      // Node last left when Next stepped up to a parent; stops Next from descending again.
+	next     *html.Node // Node that Next returns on its next call when hasNext is true.
+	hasNext  bool       // True after SetNext or SetNextNode queued a node; cleared by Next.
+}
+
+// Parse parses the HTML read from r and returns a NodeIterator positioned at the parsed root.
+func Parse(r io.Reader) (iter *NodeIterator, err error) {
+	node, err := html.Parse(r)
+	if err != nil {
+		return nil, err
+	}
+	iter = &NodeIterator{
+		current:  NewNode(node),
+		previous: NewNode(nil), // No node has been left yet.
+	}
+	return iter, nil
+}
+
+// Next returns the node queued by SetNext or SetNextNode, if any; otherwise the
+// first child, next sibling, or parent of the current node, or nil when done.
+func (iter *NodeIterator) Next() *Node {
+	if iter.hasNext {
+		iter.current.Node = iter.next // A queued node overrides the normal traversal order, once.
+		iter.next = nil
+		iter.hasNext = false
+		return iter.current
+	}
+	if iter.current.Node == nil {
+		return nil
+	}
+
+	for {
+		switch {
+		case iter.current.FirstChild != nil &&
+			iter.current.FirstChild != iter.previous.Node &&
+			iter.current.LastChild != iter.previous.Node:
+			iter.current.Node = iter.current.FirstChild // Descend, unless previous is this node's first or last child.
+		case iter.current.NextSibling != nil:
+			iter.current.Node = iter.current.NextSibling
+		default:
+			iter.previous.Node = iter.current.Node // Remember the node being left before stepping up to its parent.
+			iter.current.Node = iter.current.Parent
+		}
+		if iter.current.Node == nil {
+			return nil
+		}
+
+		// Keep looping only while on a whitespace-only text node.
+		if iter.current.Type != html.TextNode {
+			break
+		}
+		text := strings.TrimSpace(iter.current.Data)
+		if len(text) != 0 {
+			break
+		}
+	}
+	return iter.current // The same *Node is returned on every call; only its embedded html.Node changes.
+}
+
+// SetNext queues Node "el" to be returned by the next call to Next; a nil "el" is ignored.
+func (iter *NodeIterator) SetNext(el *Node) {
+	if el == nil {
+		return
+	}
+	iter.hasNext = true
+	iter.next = el.Node
+}
+
+// SetNextNode queues html.Node "el" to be returned by the next call to Next;
+// a nil "el" is ignored.
+func (iter *NodeIterator) SetNextNode(el *html.Node) {
+	if el == nil {
+		return
+	}
+	iter.hasNext = true
+	iter.next = el
+}
author	Shulhan <ms@kilabit.info>	2024-03-06 03:42:00 +0700
committer	Shulhan <ms@kilabit.info>	2024-03-09 01:10:17 +0700
commit	e730418289d80985c5d48946353961a357a4b532 (patch)
tree	9e573bf4fca975775eb25f32448f755a325880a8 /lib/html
parent	1e7cb99f42bcd41e98326bd9406d3cecfb2a4542 (diff)
download	pakakeh.go-e730418289d80985c5d48946353961a357a4b532.tar.xz