diff options
| author | Shulhan <ms@kilabit.info> | 2024-03-06 03:42:00 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2024-03-09 01:10:17 +0700 |
| commit | e730418289d80985c5d48946353961a357a4b532 (patch) | |
| tree | 9e573bf4fca975775eb25f32448f755a325880a8 /lib/html | |
| parent | 1e7cb99f42bcd41e98326bd9406d3cecfb2a4542 (diff) | |
| download | pakakeh.go-e730418289d80985c5d48946353961a357a4b532.tar.xz | |
lib: move package "net/html" to "lib/html"
Putting "html" under the "net" package makes no sense.
Another reason is to make the package layout flat under the "lib/" directory.
Diffstat (limited to 'lib/html')
| -rw-r--r-- | lib/html/benchmark_test.go | 45 | ||||
| -rw-r--r-- | lib/html/example_node_iterator_test.go | 96 | ||||
| -rw-r--r-- | lib/html/example_test.go | 51 | ||||
| -rw-r--r-- | lib/html/html.go | 132 | ||||
| -rw-r--r-- | lib/html/node.go | 75 | ||||
| -rw-r--r-- | lib/html/node_iterator.go | 93 |
6 files changed, 492 insertions, 0 deletions
diff --git a/lib/html/benchmark_test.go b/lib/html/benchmark_test.go new file mode 100644 index 00000000..05aa62d2 --- /dev/null +++ b/lib/html/benchmark_test.go @@ -0,0 +1,45 @@ +// Copyright 2022, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package html + +import "testing" + +func BenchmarkNormalizeForID(b *testing.B) { + var ( + cases = []string{ + "", + ".123 ABC def", + } + x int + ) + for ; x < b.N; x++ { + NormalizeForID(cases[0]) + NormalizeForID(cases[1]) + } +} + +func BenchmarkSanitize(b *testing.B) { + var ( + input = []byte(` +<html> + <title>Test</title> + <head> + </head> + <body> + This + <p> is </p> + a + <a href="/">link</a>. + An another + <a href="/">link</a>. + </body> +</html>`) + x int + ) + + for ; x < b.N; x++ { + Sanitize(input) + } +} diff --git a/lib/html/example_node_iterator_test.go b/lib/html/example_node_iterator_test.go new file mode 100644 index 00000000..94ece050 --- /dev/null +++ b/lib/html/example_node_iterator_test.go @@ -0,0 +1,96 @@ +// Copyright 2020, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package html + +import ( + "fmt" + "log" + "strings" +) + +func ExampleParse() { + rawHTML := ` +<ul> + <li> + <b>item</b> + <span>one</span> + </li> +</ul> +` + + r := strings.NewReader(rawHTML) + + iter, err := Parse(r) + if err != nil { + log.Fatal(err) + } + + for node := iter.Next(); node != nil; node = iter.Next() { + if node.IsElement() { + fmt.Printf("%s\n", node.Data) + } else { + fmt.Printf("\t%s\n", node.Data) + } + } + + // Output: + // html + // head + // body + // ul + // li + // b + // item + // b + // span + // one + // span + // li + // ul + // body + // html +} + +func ExampleNodeIterator_SetNext() { + rawHTML := ` +<ul> + <li> + <b>item</b> + <span>one</span> + </li> +</ul> +<h2>Jump here</h2> +` + + r := strings.NewReader(rawHTML) + + iter, err := Parse(r) + if err != nil { + log.Fatal(err) + } + + for node := iter.Next(); node != nil; node = iter.Next() { + if node.IsElement() { + if node.Data == "ul" { + // Skip iterating the "ul" element. + iter.SetNext(node.GetNextSibling()) + continue + } + fmt.Printf("%s\n", node.Data) + } else { + fmt.Printf("\t%s\n", node.Data) + } + } + + // Output: + // html + // head + // body + // h2 + // Jump here + // h2 + // body + // html +} diff --git a/lib/html/example_test.go b/lib/html/example_test.go new file mode 100644 index 00000000..d0582555 --- /dev/null +++ b/lib/html/example_test.go @@ -0,0 +1,51 @@ +// Copyright 2022, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package html + +import "fmt" + +func ExampleNormalizeForID() { + fmt.Println(NormalizeForID("")) + fmt.Println(NormalizeForID(" id ")) + fmt.Println(NormalizeForID(" ID ")) + fmt.Println(NormalizeForID("_id.1")) + fmt.Println(NormalizeForID("1-d")) + fmt.Println(NormalizeForID(".123 ABC def")) + fmt.Println(NormalizeForID("test 123")) + fmt.Println(NormalizeForID("⌘")) + // Output: + // _ + // _id_ + // _id_ + // _id_1 + // _1-d + // _123_abc_def + // test_123 + // ___ +} + +func ExampleSanitize() { + input := ` +<html> + <title>Test</title> + <head> + </head> + <body> + This + <p> is </p> + a + <a href="/">link</a>. + An another + <a href="/">link</a>. + </body> +</html> +` + + out := Sanitize([]byte(input)) + fmt.Printf("%s", out) + + // Output: + // This is a link. An another link. +} diff --git a/lib/html/html.go b/lib/html/html.go new file mode 100644 index 00000000..9f5324ab --- /dev/null +++ b/lib/html/html.go @@ -0,0 +1,132 @@ +// Copyright 2020, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package html extends the golang.org/x/net/html by providing simplified +// methods to Node. +// +// The x/net/html package currently only provide bare raw functionalities +// to iterate tree, there is no check for empty node, and no function to +// get attribute by name without looping it manually. +// +// This package extends the parent package by adding methods to get node's +// attribute by name, get the first non-empty child, get the next +// non-empty sibling, and method to iterate the tree. +package html + +import ( + "bytes" + + "golang.org/x/net/html" + + "git.sr.ht/~shulhan/pakakeh.go/lib/ascii" +) + +// NormalizeForID given an input string normalize it to HTML ID. 
+// The normalization follow [Mozilla specification] rules, +// +// - it must not contain whitespace (spaces, tabs etc.), +// - only ASCII letters, digits, '_', and '-' should be used, and +// - it should start with a letter. +// +// This function, +// +// - Return "_" if input is empty string, +// - replace unknown characters with '_', +// - prefix output with '_' unless it start with '-', '_', or letters, and +// - convert letters to lower cases. +// +// [Mozilla specification]: https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes/id. +func NormalizeForID(in string) (out string) { + var ( + bin = []byte(in) + x int + b byte + ) + + for x, b = range bin { + if ascii.IsAlpha(b) { + if b >= 'A' && b <= 'Z' { + bin[x] = b + 32 + } + } else if !(ascii.IsDigit(b) || b == '-' || b == '_') { + bin[x] = '_' + } + } + if len(bin) == 0 { + bin = append(bin, '_') + } else if !ascii.IsAlpha(bin[0]) && bin[0] != '_' { + bin = append(bin, '_') + copy(bin[1:], bin) + bin[0] = '_' + } + + return string(bin) +} + +// Sanitize the content of HTML into plain text. +func Sanitize(in []byte) (plain []byte) { + if len(in) == 0 { + return plain + } + + var ( + r = bytes.NewReader(in) + + w bytes.Buffer + htmlToken *html.Tokenizer + tokenType html.TokenType + tagName []byte + x, y int + c byte + prevIsSpace bool + ) + + htmlToken = html.NewTokenizer(r) + for { + tokenType = htmlToken.Next() + switch tokenType { + case html.EndTagToken, html.SelfClosingTagToken, html.CommentToken, html.DoctypeToken: + // NOOP. + + case html.ErrorToken: + goto out + + case html.TextToken: + w.Write(htmlToken.Text()) + + case html.StartTagToken: + tagName, _ = htmlToken.TagName() + + if bytes.Equal(tagName, []byte("title")) || + bytes.Equal(tagName, []byte("script")) { + htmlToken.Next() + } + } + } +out: + plain = w.Bytes() + + // Remove CR ('\r'), replace LF and TAB with space and trim multiple + // spaces. 
+ for y, c = range plain { + if c == '\r' || c == '\v' { + continue + } + if c == '\n' || c == '\t' || c == ' ' { + if !prevIsSpace { + plain[x] = ' ' + x++ + prevIsSpace = true + } + continue + } + plain[x] = plain[y] + x++ + prevIsSpace = false + } + + plain = plain[:x] + + return plain +} diff --git a/lib/html/node.go b/lib/html/node.go new file mode 100644 index 00000000..ac00ac48 --- /dev/null +++ b/lib/html/node.go @@ -0,0 +1,75 @@ +// Copyright 2020, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package html + +import ( + "strings" + + "golang.org/x/net/html" +) + +// Node extends the html.Node. +type Node struct { + *html.Node +} + +// NewNode create new node by embedding html.Node "el". +func NewNode(el *html.Node) *Node { + return &Node{Node: el} +} + +// GetAttrValue get the value of node's attribute with specific key or empty +// if key not found. +func (node *Node) GetAttrValue(key string) string { + for _, attr := range node.Attr { + if key == attr.Key { + return attr.Val + } + } + return "" +} + +// GetFirstChild get the first non-empty child of node or nil if no child +// left. +func (node *Node) GetFirstChild() *Node { + el := node.FirstChild + for el != nil { + if el.Type == html.TextNode { + if len(strings.TrimSpace(el.Data)) == 0 { + el = el.NextSibling + continue + } + } + break + } + if el == nil { + return nil + } + return NewNode(el) +} + +// GetNextSibling get the next non-empty sibling of node or nil if no more +// sibling left. +func (node *Node) GetNextSibling() *Node { + el := node.NextSibling + for el != nil { + if el.Type == html.TextNode { + if len(strings.TrimSpace(el.Data)) == 0 { + el = el.NextSibling + continue + } + } + break + } + if el == nil { + return nil + } + return NewNode(el) +} + +// IsElement will return true if node type is html.ElementNode. 
+func (node *Node) IsElement() bool { + return node.Type == html.ElementNode +} diff --git a/lib/html/node_iterator.go b/lib/html/node_iterator.go new file mode 100644 index 00000000..ca1819d4 --- /dev/null +++ b/lib/html/node_iterator.go @@ -0,0 +1,93 @@ +// Copyright 2020, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package html + +import ( + "io" + "strings" + + "golang.org/x/net/html" +) + +// NodeIterator simplify iterating each node from top to bottom. +type NodeIterator struct { + current *Node + previous *Node + next *html.Node + hasNext bool +} + +// Parse returns the NodeIterator to iterate through HTML tree. +func Parse(r io.Reader) (iter *NodeIterator, err error) { + node, err := html.Parse(r) + if err != nil { + return nil, err + } + iter = &NodeIterator{ + current: NewNode(node), + previous: NewNode(nil), + } + return iter, nil +} + +// Next return the first child or the next sibling of current node. +// If no more node in the tree, it will return nil. +func (iter *NodeIterator) Next() *Node { + if iter.hasNext { + iter.current.Node = iter.next + iter.next = nil + iter.hasNext = false + return iter.current + } + if iter.current.Node == nil { + return nil + } + + for { + switch { + case iter.current.FirstChild != nil && + iter.current.FirstChild != iter.previous.Node && + iter.current.LastChild != iter.previous.Node: + iter.current.Node = iter.current.FirstChild + case iter.current.NextSibling != nil: + iter.current.Node = iter.current.NextSibling + default: + iter.previous.Node = iter.current.Node + iter.current.Node = iter.current.Parent + } + if iter.current.Node == nil { + return nil + } + + // Skip empty text node. 
+ if iter.current.Type != html.TextNode { + break + } + text := strings.TrimSpace(iter.current.Data) + if len(text) != 0 { + break + } + } + return iter.current +} + +// SetNext set the node for iteration to Node "el" only if its not nil. +func (iter *NodeIterator) SetNext(el *Node) { + if el == nil { + return + } + iter.hasNext = true + iter.next = el.Node +} + +// SetNextNode set the next iteration node to html.Node "el" only if its not +// nil. +func (iter *NodeIterator) SetNextNode(el *html.Node) { + if el == nil { + return + } + iter.hasNext = true + iter.next = el +} |
