diff options
| author | Shulhan <m.shulhan@gmail.com> | 2019-09-20 10:10:12 +0700 |
|---|---|---|
| committer | Shulhan <m.shulhan@gmail.com> | 2019-09-23 23:47:48 +0700 |
| commit | 7599e469bd4bf2c8508ce4f4a4c27356706378b6 (patch) | |
| tree | cc9d1dd3584ecde31ebfc6403d7919d0c283dda8 | |
| parent | d98c77b8225b3e68205d0ec26e1995760d88f119 (diff) | |
| download | pakakeh.go-7599e469bd4bf2c8508ce4f4a4c27356706378b6.tar.xz | |
memfs: add method to Search content of files
| -rw-r--r-- | CHANGELOG.adoc | 2 | ||||
| -rw-r--r-- | CHANGELOG.html | 3 | ||||
| -rw-r--r-- | lib/memfs/memfs.go | 80 | ||||
| -rw-r--r-- | lib/memfs/memfs_example_test.go | 26 | ||||
| -rw-r--r-- | lib/memfs/node.go | 38 | ||||
| -rw-r--r-- | lib/memfs/search_result.go | 11 |
6 files changed, 160 insertions, 0 deletions
diff --git a/CHANGELOG.adoc b/CHANGELOG.adoc index e824e0a3..a9bc6761 100644 --- a/CHANGELOG.adoc +++ b/CHANGELOG.adoc @@ -9,6 +9,8 @@ This library is released each month, usually at the first week of month. === New Features +* memfs: add method to Search content of files + * sanitize: new package to sanitize markup document into plain text + Current implementation have a function to sanitize the content of HTML. diff --git a/CHANGELOG.html b/CHANGELOG.html index 5a2beac0..d2ec8137 100644 --- a/CHANGELOG.html +++ b/CHANGELOG.html @@ -114,6 +114,9 @@ <div class="ulist"> <ul> <li> +<p>memfs: add method to Search content of files</p> +</li> +<li> <p>sanitize: new package to sanitize markup document into plain text</p> <div class="paragraph"> <p>Current implementation have a function to sanitize the content of HTML.</p> diff --git a/lib/memfs/memfs.go b/lib/memfs/memfs.go index 25c8362e..40b5e550 100644 --- a/lib/memfs/memfs.go +++ b/lib/memfs/memfs.go @@ -16,6 +16,8 @@ import ( "regexp" "sort" "strings" + + "github.com/shuLhan/share/lib/sanitize" ) // @@ -479,3 +481,81 @@ func (mfs *MemFS) refresh(url string) (node *Node, err error) { return node, nil } + +// +// Search the string "q" in each content of files. +// +func (mfs *MemFS) Search(q string, snippetLen int) (results []SearchResult) { + if len(q) == 0 { + return nil + } + if snippetLen <= 0 { + snippetLen = 60 + } + + sep := bytes.ToLower([]byte(q)) + for _, node := range mfs.pn.v { + var v []byte + + if node.Mode.IsDir() { + continue + } + + if !strings.HasPrefix(node.ContentType, "text/") { + continue + } + + if len(node.plainv) == 0 { + err := node.decode() + if err != nil { + log.Printf("memfs.Search: " + err.Error()) + } + + if strings.HasPrefix(node.ContentType, "text/html") { + node.plainv = sanitize.HTML(node.plainv) + } + + node.lowerv = bytes.ToLower(node.plainv) + } + + result := SearchResult{ + Path: node.Path, + } + + offset := 0 + v = node.lowerv + for { + s := bytes.Index(v, sep) + if s == -1 { + break + } + + start := offset + end := offset + + if s > snippetLen { + start += s - snippetLen + } + if s+len(q)+snippetLen > len(v) { + end += len(v) + } else { + end += s + len(q) + snippetLen + } + + snippet := strings.TrimSpace(string(node.plainv[start:end])) + snippet = strings.ReplaceAll(snippet, "\r", "") + snippet = strings.ReplaceAll(snippet, "\n", " ") + offset += s + len(q) + + v = v[s+len(q):] + + result.Snippets = append(result.Snippets, snippet) + } + + if len(result.Snippets) > 0 { + results = append(results, result) + } + } + + return results +} diff --git a/lib/memfs/memfs_example_test.go b/lib/memfs/memfs_example_test.go index d2d4d75f..4cd7ba0d 100644 --- a/lib/memfs/memfs_example_test.go +++ b/lib/memfs/memfs_example_test.go @@ -33,3 +33,29 @@ func ExampleNew() { // Output: // <html></html> } + +func ExampleMemFS_Search() { + mfs, err := New(nil, nil, true) + if err != nil { + log.Fatal(err) + } + + err = mfs.Mount("./testdata") + if err != nil { + log.Fatal(err) + } + + results := mfs.Search("body", 0) + + for _, result := range results { + fmt.Printf("Path: %s\n", result.Path) + fmt.Printf("Snippets: %q\n", result.Snippets) + } + // Unordered output: + // Path: /include/index.css + // Snippets: ["body { }"] + // Path: /exclude/index.css + // Snippets: ["body { }"] + // Path: /index.css + // Snippets: ["body { }"] +} diff --git a/lib/memfs/node.go b/lib/memfs/node.go index 94be0a2c..c461481f 100644 --- a/lib/memfs/node.go +++ b/lib/memfs/node.go @@ -5,7 +5,10 @@ package memfs import ( + "bytes" + "compress/gzip" "fmt" + "io" "io/ioutil" "mime" "net/http" @@ -30,6 +33,8 @@ type Node struct { V []byte // Content of file. Parent *Node // Pointer to parent directory. Childs []*Node // List of files in directory. + plainv []byte // Content of file in plain text. + lowerv []byte // Content of file in lower cases. } // @@ -85,6 +90,39 @@ func NewNode(parent *Node, fi os.FileInfo, withContent bool) (node *Node, err er return node, nil } +func (leaf *Node) decode() (err error) { + if len(leaf.ContentEncoding) == 0 { + leaf.plainv = leaf.V + return nil + } + + leaf.plainv = leaf.plainv[:0] + + if leaf.ContentEncoding == EncodingGzip { + r, err := gzip.NewReader(bytes.NewReader(leaf.V)) + if err != nil { + return err + } + + buf := make([]byte, 1024) + + for { + n, err := r.Read(buf) + if n > 0 { + leaf.plainv = append(leaf.plainv, buf...) + } + if err != nil { + if err == io.EOF { + break + } + return err + } + } + } + + return nil +} + // // removeChild remove a children node from list. If child is not exist, it // will return nil. diff --git a/lib/memfs/search_result.go b/lib/memfs/search_result.go new file mode 100644 index 00000000..236284a4 --- /dev/null +++ b/lib/memfs/search_result.go @@ -0,0 +1,11 @@ +package memfs + +// +// SearchResult containts the result of searching where the Path will be +// filled with absolute path of file system in memory and the Snippet will +// filled with part of the text before and after the search string. +// +type SearchResult struct { + Path string + Snippets []string +} |
