aboutsummaryrefslogtreecommitdiff
path: root/lib/html
diff options
context:
space:
mode:
authorShulhan <ms@kilabit.info>2024-03-06 03:42:00 +0700
committerShulhan <ms@kilabit.info>2024-03-09 01:10:17 +0700
commite730418289d80985c5d48946353961a357a4b532 (patch)
tree9e573bf4fca975775eb25f32448f755a325880a8 /lib/html
parent1e7cb99f42bcd41e98326bd9406d3cecfb2a4542 (diff)
downloadpakakeh.go-e730418289d80985c5d48946353961a357a4b532.tar.xz
lib: move package "net/html" to "lib/html"
Putting "html" under the "net" package makes no sense. Another reason is to keep the packages flat under the "lib/" directory.
Diffstat (limited to 'lib/html')
-rw-r--r--lib/html/benchmark_test.go45
-rw-r--r--lib/html/example_node_iterator_test.go96
-rw-r--r--lib/html/example_test.go51
-rw-r--r--lib/html/html.go132
-rw-r--r--lib/html/node.go75
-rw-r--r--lib/html/node_iterator.go93
6 files changed, 492 insertions, 0 deletions
diff --git a/lib/html/benchmark_test.go b/lib/html/benchmark_test.go
new file mode 100644
index 00000000..05aa62d2
--- /dev/null
+++ b/lib/html/benchmark_test.go
@@ -0,0 +1,45 @@
+// Copyright 2022, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import "testing"
+
+func BenchmarkNormalizeForID(b *testing.B) {
+ var (
+ cases = []string{
+ "",
+ ".123 ABC def",
+ }
+ x int
+ )
+ for ; x < b.N; x++ {
+ NormalizeForID(cases[0])
+ NormalizeForID(cases[1])
+ }
+}
+
+// BenchmarkSanitize measures Sanitize on a small HTML document that
+// contains a title, an empty head, and a body with text and links.
+func BenchmarkSanitize(b *testing.B) {
+	const doc = `
+<html>
+ <title>Test</title>
+ <head>
+ </head>
+ <body>
+ This
+ <p> is </p>
+ a
+ <a href="/">link</a>.
+ An another
+ <a href="/">link</a>.
+ </body>
+</html>`
+
+	input := []byte(doc)
+
+	for n := 0; n < b.N; n++ {
+		Sanitize(input)
+	}
+}
diff --git a/lib/html/example_node_iterator_test.go b/lib/html/example_node_iterator_test.go
new file mode 100644
index 00000000..94ece050
--- /dev/null
+++ b/lib/html/example_node_iterator_test.go
@@ -0,0 +1,96 @@
+// Copyright 2020, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+ "fmt"
+ "log"
+ "strings"
+)
+
+func ExampleParse() {
+ rawHTML := `
+<ul>
+ <li>
+ <b>item</b>
+ <span>one</span>
+ </li>
+</ul>
+`
+
+ r := strings.NewReader(rawHTML)
+
+ iter, err := Parse(r)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ for node := iter.Next(); node != nil; node = iter.Next() {
+ if node.IsElement() {
+ fmt.Printf("%s\n", node.Data)
+ } else {
+ fmt.Printf("\t%s\n", node.Data)
+ }
+ }
+
+ // Output:
+ // html
+ // head
+ // body
+ // ul
+ // li
+ // b
+ // item
+ // b
+ // span
+ // one
+ // span
+ // li
+ // ul
+ // body
+ // html
+}
+
+func ExampleNodeIterator_SetNext() {
+ rawHTML := `
+<ul>
+ <li>
+ <b>item</b>
+ <span>one</span>
+ </li>
+</ul>
+<h2>Jump here</h2>
+`
+
+ r := strings.NewReader(rawHTML)
+
+ iter, err := Parse(r)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ for node := iter.Next(); node != nil; node = iter.Next() {
+ if node.IsElement() {
+ if node.Data == "ul" {
+ // Skip iterating the "ul" element.
+ iter.SetNext(node.GetNextSibling())
+ continue
+ }
+ fmt.Printf("%s\n", node.Data)
+ } else {
+ fmt.Printf("\t%s\n", node.Data)
+ }
+ }
+
+ // Output:
+ // html
+ // head
+ // body
+ // h2
+ // Jump here
+ // h2
+ // body
+ // html
+}
diff --git a/lib/html/example_test.go b/lib/html/example_test.go
new file mode 100644
index 00000000..d0582555
--- /dev/null
+++ b/lib/html/example_test.go
@@ -0,0 +1,51 @@
+// Copyright 2022, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import "fmt"
+
+func ExampleNormalizeForID() {
+	inputs := []string{
+		"",
+		" id ",
+		" ID ",
+		"_id.1",
+		"1-d",
+		".123 ABC def",
+		"test 123",
+		"⌘",
+	}
+	for _, in := range inputs {
+		fmt.Println(NormalizeForID(in))
+	}
+	// Output:
+	// _
+	// _id_
+	// _id_
+	// _id_1
+	// _1-d
+	// _123_abc_def
+	// test_123
+	// ___
+}
+
+func ExampleSanitize() {
+ input := `
+<html>
+ <title>Test</title>
+ <head>
+ </head>
+ <body>
+ This
+ <p> is </p>
+ a
+ <a href="/">link</a>.
+ An another
+ <a href="/">link</a>.
+ </body>
+</html>
+`
+
+ out := Sanitize([]byte(input))
+ fmt.Printf("%s", out)
+
+ // Output:
+ // This is a link. An another link.
+}
diff --git a/lib/html/html.go b/lib/html/html.go
new file mode 100644
index 00000000..9f5324ab
--- /dev/null
+++ b/lib/html/html.go
@@ -0,0 +1,132 @@
+// Copyright 2020, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package html extends the golang.org/x/net/html by providing simplified
+// methods to Node.
+//
+// The x/net/html package currently provides only bare, low-level
+// functionality to iterate the tree; there is no check for empty nodes, and
+// no function to get an attribute by name without looping manually.
+//
+// This package extends the parent package by adding methods to get node's
+// attribute by name, get the first non-empty child, get the next
+// non-empty sibling, and method to iterate the tree.
+package html
+
+import (
+ "bytes"
+
+ "golang.org/x/net/html"
+
+ "git.sr.ht/~shulhan/pakakeh.go/lib/ascii"
+)
+
+// NormalizeForID normalize the input string so it can be used as HTML ID.
+// The normalization follow the [Mozilla specification] rules,
+//
+//   - it must not contain whitespace (spaces, tabs etc.),
+//   - only ASCII letters, digits, '_', and '-' should be used, and
+//   - it should start with a letter.
+//
+// This function,
+//
+//   - return "_" if input is an empty string,
+//   - replace every byte that is not an ASCII letter, digit, '-', or '_'
+//     with '_' (a multi-byte character yields one '_' per byte),
+//   - convert upper case letters to lower case, and
+//   - prefix the output with '_' if the input start with a digit or '-',
+//     so the output always start with a letter or '_'.
+//
+// [Mozilla specification]: https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes/id
+func NormalizeForID(in string) (out string) {
+	if len(in) == 0 {
+		return "_"
+	}
+
+	var (
+		// Reserve one extra byte for the optional '_' prefix so the
+		// output is built with a single allocation.
+		bout = make([]byte, 0, len(in)+1)
+		x    int
+		b    byte
+	)
+
+	// Digits and '-' are the only valid ID characters that are not
+	// allowed as the first character; every other leading byte will
+	// become either a letter or '_' in the loop below.
+	if ascii.IsDigit(in[0]) || in[0] == '-' {
+		bout = append(bout, '_')
+	}
+
+	for x = 0; x < len(in); x++ {
+		b = in[x]
+		switch {
+		case ascii.IsAlpha(b):
+			if b >= 'A' && b <= 'Z' {
+				b += 'a' - 'A'
+			}
+		case ascii.IsDigit(b), b == '-', b == '_':
+			// Valid as is.
+		default:
+			b = '_'
+		}
+		bout = append(bout, b)
+	}
+
+	return string(bout)
+}
+
+// Sanitize convert the HTML content in "in" into plain text.
+//
+// Only the text between tags is kept; tags, comments, and doctype are
+// dropped.
+// The content of "title", "script", and "style" elements is dropped too,
+// since it is not part of the readable document text.
+// In the result, CR ('\r') and vertical tab ('\v') are removed, and each
+// run of LF, TAB, and space is collapsed into a single space.
+//
+// It returns nil if "in" is empty.
+func Sanitize(in []byte) (plain []byte) {
+	if len(in) == 0 {
+		return plain
+	}
+
+	var (
+		tokenizer = html.NewTokenizer(bytes.NewReader(in))
+
+		w           bytes.Buffer
+		tagName     []byte
+		x, y        int
+		c           byte
+		prevIsSpace bool
+	)
+
+tokens:
+	for {
+		switch tokenizer.Next() {
+		case html.ErrorToken:
+			// End of input or a tokenizer error; in both cases we
+			// stop and return the text collected so far.
+			break tokens
+
+		case html.TextToken:
+			w.Write(tokenizer.Text())
+
+		case html.StartTagToken:
+			tagName, _ = tokenizer.TagName()
+
+			switch string(tagName) {
+			case "title", "script", "style":
+				// Discard the token that follows the start
+				// tag, which is the element's raw content (or
+				// its end tag if the element is empty).
+				tokenizer.Next()
+			}
+		}
+		// End tag, self-closing tag, comment, and doctype tokens
+		// contribute nothing to the plain text.
+	}
+
+	plain = w.Bytes()
+
+	// Compact the text in place: "y" is the read index and "x" is the
+	// write index, with x <= y at all times.
+	// Remove CR ('\r') and VT ('\v'), replace LF and TAB with space, and
+	// collapse multiple spaces into one.
+	for y, c = range plain {
+		if c == '\r' || c == '\v' {
+			continue
+		}
+		if c == '\n' || c == '\t' || c == ' ' {
+			if !prevIsSpace {
+				plain[x] = ' '
+				x++
+				prevIsSpace = true
+			}
+			continue
+		}
+		plain[x] = plain[y]
+		x++
+		prevIsSpace = false
+	}
+
+	plain = plain[:x]
+
+	return plain
+}
diff --git a/lib/html/node.go b/lib/html/node.go
new file mode 100644
index 00000000..ac00ac48
--- /dev/null
+++ b/lib/html/node.go
@@ -0,0 +1,75 @@
+// Copyright 2020, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+ "strings"
+
+ "golang.org/x/net/html"
+)
+
+// Node extends the html.Node by embedding a pointer to it, so the fields
+// of html.Node can be accessed directly from Node, in addition to the
+// helper methods declared on Node.
+type Node struct {
+ *html.Node
+}
+
+// NewNode wrap the html.Node "el" inside a newly allocated Node.
+// The "el" is stored as is, without any check.
+func NewNode(el *html.Node) *Node {
+	node := new(Node)
+	node.Node = el
+	return node
+}
+
+// GetAttrValue return the value of the first attribute in the node whose
+// key equal to "key".
+// It returns an empty string if no such attribute exists.
+func (node *Node) GetAttrValue(key string) string {
+	for x := 0; x < len(node.Attr); x++ {
+		if node.Attr[x].Key == key {
+			return node.Attr[x].Val
+		}
+	}
+	return ""
+}
+
+// GetFirstChild return the first child of node that is not a text node
+// containing only white spaces.
+// It returns nil if the node has no such child.
+func (node *Node) GetFirstChild() *Node {
+	for el := node.FirstChild; el != nil; el = el.NextSibling {
+		if el.Type != html.TextNode {
+			return NewNode(el)
+		}
+		if strings.TrimSpace(el.Data) != "" {
+			return NewNode(el)
+		}
+	}
+	return nil
+}
+
+// GetNextSibling return the closest following sibling of node that is
+// not a text node containing only white spaces.
+// It returns nil if there is no such sibling.
+func (node *Node) GetNextSibling() *Node {
+	sibling := node.NextSibling
+	for sibling != nil {
+		isEmptyText := sibling.Type == html.TextNode &&
+			strings.TrimSpace(sibling.Data) == ""
+		if !isEmptyText {
+			return NewNode(sibling)
+		}
+		sibling = sibling.NextSibling
+	}
+	return nil
+}
+
+// IsElement report whether the type of node is html.ElementNode.
+func (node *Node) IsElement() bool {
+	nodeType := node.Node.Type
+	return nodeType == html.ElementNode
+}
diff --git a/lib/html/node_iterator.go b/lib/html/node_iterator.go
new file mode 100644
index 00000000..ca1819d4
--- /dev/null
+++ b/lib/html/node_iterator.go
@@ -0,0 +1,93 @@
+// Copyright 2020, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+ "io"
+ "strings"
+
+ "golang.org/x/net/html"
+)
+
+// NodeIterator simplify iterating each node from top to bottom.
+// An iterator is created by Parse and advanced by calling Next.
+type NodeIterator struct {
+ // current is the position of the iterator.
+ // Parse set it to the root of the parsed tree, and Next move it and
+ // return it.
+ current *Node
+ // previous record the node that Next left when it moved up to the
+ // parent, so Next does not descend into the same children again.
+ previous *Node
+ // next is the node set by SetNext or SetNextNode; it is returned by
+ // the following call to Next if hasNext is true.
+ next *html.Node
+ // hasNext is true if next has been set and not yet consumed by Next.
+ hasNext bool
+}
+
+// Parse returns the NodeIterator to iterate through HTML tree.
+func Parse(r io.Reader) (iter *NodeIterator, err error) {
+ node, err := html.Parse(r)
+ if err != nil {
+ return nil, err
+ }
+ iter = &NodeIterator{
+ current: NewNode(node),
+ previous: NewNode(nil),
+ }
+ return iter, nil
+}
+
+// Next move the iterator to the following node and return it.
+//
+// If a node has been set using SetNext or SetNextNode, that node become
+// the current node and is returned.
+// Otherwise the iterator move to the first child, or to the next sibling,
+// or back up to the parent of the current node, in that order.
+// Since moving up return the parent again, an element with children is
+// returned twice: once before and once after its children.
+// Text nodes that contain only white spaces are skipped.
+//
+// If no more node in the tree, it will return nil.
+// The returned Node is the iterator's own current node, which is updated
+// in place on each call.
+func (iter *NodeIterator) Next() *Node {
+ if iter.hasNext {
+ // Consume the node that was set by SetNext or SetNextNode.
+ iter.current.Node = iter.next
+ iter.next = nil
+ iter.hasNext = false
+ return iter.current
+ }
+ if iter.current.Node == nil {
+ return nil
+ }
+
+ for {
+ switch {
+ // Descend to the first child, unless the node recorded in previous
+ // is the first or the last child of the current node, which means
+ // we have just moved up from its children.
+ case iter.current.FirstChild != nil &&
+ iter.current.FirstChild != iter.previous.Node &&
+ iter.current.LastChild != iter.previous.Node:
+ iter.current.Node = iter.current.FirstChild
+ case iter.current.NextSibling != nil:
+ iter.current.Node = iter.current.NextSibling
+ default:
+ // No child to visit and no more sibling: move up to the parent
+ // and remember the node that we left.
+ iter.previous.Node = iter.current.Node
+ iter.current.Node = iter.current.Parent
+ }
+ if iter.current.Node == nil {
+ // We moved past a node without parent; the iteration is done.
+ return nil
+ }
+
+ // Skip empty text node.
+ if iter.current.Type != html.TextNode {
+ break
+ }
+ text := strings.TrimSpace(iter.current.Data)
+ if len(text) != 0 {
+ break
+ }
+ }
+ return iter.current
+}
+
+// SetNext set the node that will be returned by the following call to
+// Next to the Node "el".
+//
+// It does nothing if "el" is nil or if "el" wraps a nil html.Node;
+// without the second check, Next would return a Node with a nil html.Node
+// and any access to its fields by the caller would panic.
+func (iter *NodeIterator) SetNext(el *Node) {
+	if el == nil || el.Node == nil {
+		return
+	}
+	iter.hasNext = true
+	iter.next = el.Node
+}
+
+// SetNextNode set the node that will be returned by the following call
+// to Next to the html.Node "el".
+// It does nothing if "el" is nil.
+func (iter *NodeIterator) SetNextNode(el *html.Node) {
+	if el != nil {
+		iter.next = el
+		iter.hasNext = true
+	}
+}