aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorShulhan <m.shulhan@gmail.com>2019-09-20 10:10:12 +0700
committerShulhan <m.shulhan@gmail.com>2019-09-23 23:47:48 +0700
commit7599e469bd4bf2c8508ce4f4a4c27356706378b6 (patch)
treecc9d1dd3584ecde31ebfc6403d7919d0c283dda8
parentd98c77b8225b3e68205d0ec26e1995760d88f119 (diff)
downloadpakakeh.go-7599e469bd4bf2c8508ce4f4a4c27356706378b6.tar.xz
memfs: add method to Search content of files
-rw-r--r--CHANGELOG.adoc2
-rw-r--r--CHANGELOG.html3
-rw-r--r--lib/memfs/memfs.go80
-rw-r--r--lib/memfs/memfs_example_test.go26
-rw-r--r--lib/memfs/node.go38
-rw-r--r--lib/memfs/search_result.go11
6 files changed, 160 insertions, 0 deletions
diff --git a/CHANGELOG.adoc b/CHANGELOG.adoc
index e824e0a3..a9bc6761 100644
--- a/CHANGELOG.adoc
+++ b/CHANGELOG.adoc
@@ -9,6 +9,8 @@ This library is released each month, usually at the first week of month.
=== New Features
+* memfs: add method to Search content of files
+
* sanitize: new package to sanitize markup document into plain text
+
Current implementation have a function to sanitize the content of HTML.
diff --git a/CHANGELOG.html b/CHANGELOG.html
index 5a2beac0..d2ec8137 100644
--- a/CHANGELOG.html
+++ b/CHANGELOG.html
@@ -114,6 +114,9 @@
<div class="ulist">
<ul>
<li>
+<p>memfs: add method to Search content of files</p>
+</li>
+<li>
<p>sanitize: new package to sanitize markup document into plain text</p>
<div class="paragraph">
<p>Current implementation have a function to sanitize the content of HTML.</p>
diff --git a/lib/memfs/memfs.go b/lib/memfs/memfs.go
index 25c8362e..40b5e550 100644
--- a/lib/memfs/memfs.go
+++ b/lib/memfs/memfs.go
@@ -16,6 +16,8 @@ import (
"regexp"
"sort"
"strings"
+
+ "github.com/shuLhan/share/lib/sanitize"
)
//
@@ -479,3 +481,81 @@ func (mfs *MemFS) refresh(url string) (node *Node, err error) {
return node, nil
}
+
+//
+// Search the string "q", case-insensitively, in the content of each file
+// whose content type begins with "text/".
+// Directories and files with other content types are skipped.
+//
+// On the first search of a file its content is decoded using
+// Node.decode; if the content type begins with "text/html" the decoded
+// content is passed through sanitize.HTML.
+// Both the plain content and its lower-case form are cached in the node
+// for subsequent searches.
+//
+// For each match a snippet is taken from the plain content: at most
+// snippetLen bytes before the match (never reaching back past the end of
+// the previous match), the match itself, and at most snippetLen bytes
+// after it.
+// The snippet is trimmed from leading and trailing white spaces, "\r" is
+// removed, and "\n" is replaced with a single space.
+// If snippetLen is zero or negative, it is set to 60.
+//
+// It will return nil if "q" is empty or if no file matched; otherwise it
+// will return one SearchResult for each file that has at least one match.
+//
+func (mfs *MemFS) Search(q string, snippetLen int) (results []SearchResult) {
+	if len(q) == 0 {
+		return nil
+	}
+	if snippetLen <= 0 {
+		snippetLen = 60
+	}
+
+	// Lower-casing may change the length in bytes, so all offsets below
+	// are computed from len(sep), not len(q).
+	sep := bytes.ToLower([]byte(q))
+
+	for _, node := range mfs.pn.v {
+		if node.Mode.IsDir() {
+			continue
+		}
+		if !strings.HasPrefix(node.ContentType, "text/") {
+			continue
+		}
+
+		if len(node.plainv) == 0 {
+			err := node.decode()
+			if err != nil {
+				// Pass the error as an argument, not as the
+				// format, so a '%' inside the message is
+				// printed as is.
+				log.Printf("memfs.Search: %s", err)
+
+				// Do not cache or search a partially decoded
+				// content; the next Search will try to decode
+				// it again.
+				node.plainv = nil
+				node.lowerv = nil
+				continue
+			}
+
+			if strings.HasPrefix(node.ContentType, "text/html") {
+				node.plainv = sanitize.HTML(node.plainv)
+			}
+
+			node.lowerv = bytes.ToLower(node.plainv)
+		}
+
+		result := SearchResult{
+			Path: node.Path,
+		}
+
+		lower := node.lowerv
+
+		// pos is the offset in lower where the next scan begins, which
+		// is right after the previous match.
+		pos := 0
+		for {
+			idx := bytes.Index(lower[pos:], sep)
+			if idx == -1 {
+				break
+			}
+
+			match := pos + idx
+
+			start := pos
+			if idx > snippetLen {
+				start = match - snippetLen
+			}
+
+			end := match + len(sep) + snippetLen
+			if end > len(lower) {
+				end = len(lower)
+			}
+
+			// The offsets are computed on the lower-case content
+			// but the snippet is cut from the plain content.
+			// Their lengths can differ when the case folding of a
+			// character changes its size in bytes, so clamp the
+			// offsets to prevent slicing out of range.
+			if end > len(node.plainv) {
+				end = len(node.plainv)
+			}
+			if start > end {
+				start = end
+			}
+
+			snippet := strings.TrimSpace(string(node.plainv[start:end]))
+			snippet = strings.ReplaceAll(snippet, "\r", "")
+			snippet = strings.ReplaceAll(snippet, "\n", " ")
+
+			result.Snippets = append(result.Snippets, snippet)
+
+			pos = match + len(sep)
+		}
+
+		if len(result.Snippets) > 0 {
+			results = append(results, result)
+		}
+	}
+
+	return results
+}
diff --git a/lib/memfs/memfs_example_test.go b/lib/memfs/memfs_example_test.go
index d2d4d75f..4cd7ba0d 100644
--- a/lib/memfs/memfs_example_test.go
+++ b/lib/memfs/memfs_example_test.go
@@ -33,3 +33,29 @@ func ExampleNew() {
// Output:
// <html></html>
}
+
+// ExampleMemFS_Search mounts the "./testdata" directory and prints the
+// path and the snippets of each file whose content matches "body".
+func ExampleMemFS_Search() {
+	mfs, err := New(nil, nil, true)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	if err = mfs.Mount("./testdata"); err != nil {
+		log.Fatal(err)
+	}
+
+	// Passing 0 as snippetLen lets Search use its default length.
+	for _, found := range mfs.Search("body", 0) {
+		fmt.Printf("Path: %s\n", found.Path)
+		fmt.Printf("Snippets: %q\n", found.Snippets)
+	}
+	// Unordered output:
+	// Path: /include/index.css
+	// Snippets: ["body { }"]
+	// Path: /exclude/index.css
+	// Snippets: ["body { }"]
+	// Path: /index.css
+	// Snippets: ["body { }"]
+}
diff --git a/lib/memfs/node.go b/lib/memfs/node.go
index 94be0a2c..c461481f 100644
--- a/lib/memfs/node.go
+++ b/lib/memfs/node.go
@@ -5,7 +5,10 @@
package memfs
import (
+ "bytes"
+ "compress/gzip"
"fmt"
+ "io"
"io/ioutil"
"mime"
"net/http"
@@ -30,6 +33,8 @@ type Node struct {
V []byte // Content of file.
Parent *Node // Pointer to parent directory.
Childs []*Node // List of files in directory.
+ plainv []byte // Content of file in plain text.
+ lowerv []byte // Content of file in lower cases.
}
//
@@ -85,6 +90,39 @@ func NewNode(parent *Node, fi os.FileInfo, withContent bool) (node *Node, err er
return node, nil
}
+//
+// decode fill the plainv field with the decoded content of V, based on
+// the value of ContentEncoding.
+//
+// If ContentEncoding is empty, plainv will share the same slice as V, no
+// copy is made.
+// If ContentEncoding is EncodingGzip, V is decompressed into plainv.
+// For any other encoding plainv is left empty and no error is returned.
+//
+// On error, plainv may contains the part of content that has been
+// decompressed before the error occurred.
+//
+func (leaf *Node) decode() (err error) {
+	if len(leaf.ContentEncoding) == 0 {
+		leaf.plainv = leaf.V
+		return nil
+	}
+
+	leaf.plainv = leaf.plainv[:0]
+
+	if leaf.ContentEncoding != EncodingGzip {
+		return nil
+	}
+
+	r, err := gzip.NewReader(bytes.NewReader(leaf.V))
+	if err != nil {
+		return err
+	}
+	// Any decompression error is already reported by Read, so the error
+	// from Close is not checked.
+	defer r.Close()
+
+	buf := make([]byte, 1024)
+
+	for {
+		n, rerr := r.Read(buf)
+		if n > 0 {
+			// Only the first n bytes of buf are valid on each
+			// read; the rest is leftover from previous read.
+			leaf.plainv = append(leaf.plainv, buf[:n]...)
+		}
+		if rerr == io.EOF {
+			return nil
+		}
+		if rerr != nil {
+			return rerr
+		}
+	}
+}
+
//
// removeChild remove a children node from list. If child is not exist, it
// will return nil.
diff --git a/lib/memfs/search_result.go b/lib/memfs/search_result.go
new file mode 100644
index 00000000..236284a4
--- /dev/null
+++ b/lib/memfs/search_result.go
@@ -0,0 +1,11 @@
+package memfs
+
+//
+// SearchResult contains the result of searching, where the Path will be
+// filled with the absolute path of the file in the memory file system and
+// the Snippets will be filled with parts of the text before and after
+// each occurrence of the search string.
+//
+type SearchResult struct {
+ Path string // Path of the file that contains the search string.
+ Snippets []string // One snippet of text for each match in the file.
+}