aboutsummaryrefslogtreecommitdiff
path: root/lib/strings
diff options
context:
space:
mode:
authorShulhan <ms@kilabit.info>2023-04-05 23:04:04 +0700
committerShulhan <ms@kilabit.info>2023-04-05 23:07:55 +0700
commit98a256e57f2df1b10b7cd75808bee68d8abd1dcd (patch)
tree606c360e5d74388dc7207b912d91c6c88a1b5f70 /lib/strings
parent144ceef8e29f8780c84bc5ae2589dab3895c5ee6 (diff)
downloadpakakeh.go-98a256e57f2df1b10b7cd75808bee68d8abd1dcd.tar.xz
lib/strings: merge lib/parser here
The first idea of parser was to provide a generic parser for both bytes and string. Since we introduced lib/parser there have not been many changes to that package. Also, since we created another Parser in lib/bytes that accepts and returns tokens as []byte, lib/parser is not unique anymore. The following functions/methods are renamed to minimize future conflicts: * Lines becomes LinesOfFile * New becomes NewParser * Open becomes OpenForParser * Token becomes Read * TokenEscaped becomes ReadEscaped * TokenTrimSpace becomes ReadNoSpace
Diffstat (limited to 'lib/strings')
-rw-r--r--lib/strings/parser.go409
-rw-r--r--lib/strings/parser_benchmark_test.go22
-rw-r--r--lib/strings/parser_example_test.go53
-rw-r--r--lib/strings/parser_test.go357
-rw-r--r--lib/strings/testdata/parser/open_test.txt1
5 files changed, 842 insertions, 0 deletions
diff --git a/lib/strings/parser.go b/lib/strings/parser.go
new file mode 100644
index 00000000..9efac353
--- /dev/null
+++ b/lib/strings/parser.go
@@ -0,0 +1,409 @@
+// Copyright 2019, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package strings
+
+import (
+ "fmt"
+ "os"
+
+ libascii "github.com/shuLhan/share/lib/ascii"
+)
+
+// Parser implements text parsing over a string.
+type Parser struct {
+ file string // file is the path the content was loaded from, or empty.
+ delims string // delims is the set of characters that terminate a token.
+ v string // v contains the text to be parsed.
+ token []rune // token that has been parsed.
+ x int // x is the position of read in v.
+ d rune // d is the delims character that terminated the last parsing, or 0.
+}
+
+// LinesOfFile parses the content of file and returns its non-empty lines,
+// with surrounding horizontal spaces removed from each line (see Lines).
+func LinesOfFile(file string) ([]string, error) {
+ p, err := OpenForParser(file, ``)
+ if err != nil {
+ return nil, fmt.Errorf(`LinesOfFile: %w`, err)
+ }
+ return p.Lines(), nil
+}
+
+// NewParser creates and initializes a Parser from content and delimiters.
+// If delims is empty, it defaults to all whitespace characters (see Load).
+func NewParser(content, delims string) (p *Parser) {
+ p = &Parser{
+ token: make([]rune, 0, 16),
+ }
+
+ p.Load(content, delims)
+
+ return p
+}
+
+// OpenForParser creates and initializes the Parser using content read from
+// file.
+// If delims is empty, it defaults to all whitespace characters.
+func OpenForParser(file, delims string) (p *Parser, err error) {
+ v, err := os.ReadFile(file)
+ if err != nil {
+ return nil, err
+ }
+
+ p = NewParser(string(v), delims)
+ p.file = file
+
+ return p, nil
+}
+
+// AddDelimiters appends each delimiter in delims that is not already
+// registered on the parser; duplicates are silently skipped.
+func (p *Parser) AddDelimiters(delims string) {
+ var found bool
+ for _, newd := range delims {
+ found = false
+ for _, oldd := range p.delims {
+ if oldd == newd {
+ found = true
+ break
+ }
+ }
+ if !found {
+ p.delims += string(newd)
+ }
+ }
+}
+
+// Close the parser by resetting all of its internal state to the zero
+// value, so the Parser can be reused with Load.
+func (p *Parser) Close() {
+ p.file = ``
+ p.delims = ``
+ p.x = 0
+ p.v = ``
+ p.token = p.token[:0]
+ p.d = 0
+}
+
+// isDelim returns true if r is one of the registered delimiters.
+func (p *Parser) isDelim(r rune) bool {
+ var d rune
+ for _, d = range p.delims {
+ if r == d {
+ return true
+ }
+ }
+ return false
+}
+
+// Lines returns all non-empty lines from the remaining content, with
+// surrounding horizontal spaces (" ", "\t", "\r", "\f") removed from each
+// line.
+// After Lines returns, all content has been consumed (p.x == len(p.v)).
+// The result is always a non-nil slice, possibly empty.
+func (p *Parser) Lines() []string {
+ var start, end int
+
+ lines := make([]string, 0)
+
+ for x := p.x; x < len(p.v); x++ {
+ // Skip white spaces on beginning ...
+ for ; x < len(p.v); x++ {
+ if p.v[x] == ' ' || p.v[x] == '\t' || p.v[x] == '\r' || p.v[x] == '\f' {
+ continue
+ }
+ break
+ }
+ start = x
+ // Advance until the end of line or end of content.
+ for ; x < len(p.v); x++ {
+ if p.v[x] != '\n' {
+ continue
+ }
+ break
+ }
+
+ // Skip white spaces at the end ...
+ for end = x - 1; end > start; end-- {
+ if p.v[end] == ' ' || p.v[end] == '\t' ||
+ p.v[end] == '\r' || p.v[end] == '\f' {
+ continue
+ }
+ break
+ }
+ end++
+ if start == end {
+ // Skip empty lines
+ continue
+ }
+
+ line := p.v[start:end]
+ lines = append(lines, line)
+ }
+
+ p.x = len(p.v)
+
+ return lines
+}
+
+// Load the new content and delimiters, discarding all previous state
+// (see Close).
+// If delims is empty, it defaults to all whitespace characters.
+func (p *Parser) Load(content, delims string) {
+ p.Close()
+ p.v = content
+ if len(delims) == 0 {
+ p.delims = string(libascii.Spaces)
+ } else {
+ p.delims = delims
+ }
+}
+
+// Line reads and returns a single line without the trailing '\n'
+// character, along with '\n' as the delimiter.
+// At the end of content it returns the last line with delimiter 0.
+func (p *Parser) Line() (string, rune) {
+ p.d = 0
+ p.token = p.token[:0]
+
+ for x, r := range p.v[p.x:] {
+ if r == '\n' {
+ p.d = r
+ p.x += x + 1
+ return string(p.token), p.d
+ }
+ p.token = append(p.token, r)
+ }
+ p.x = len(p.v)
+ return string(p.token), 0
+}
+
+// SetDelimiters replaces the current delimiters with delims.
+func (p *Parser) SetDelimiters(delims string) {
+ p.delims = delims
+}
+
+// Stop the parser, returning the remaining unparsed content and its last
+// read position, and then call Close to reset the internal state back to
+// zero.
+func (p *Parser) Stop() (remain string, pos int) {
+ pos = p.x
+ remain = p.v[pos:]
+ p.Close()
+ return remain, pos
+}
+
+// Read reads the next token from content until one of the delimiters is
+// found.
+// If no delimiter is found, it means all of the content has been read and
+// the returned delimiter will be 0.
+func (p *Parser) Read() (string, rune) {
+ p.d = 0
+ p.token = p.token[:0]
+
+ if p.x >= len(p.v) {
+ return ``, 0
+ }
+
+ for x, r := range p.v[p.x:] {
+ for _, d := range p.delims {
+ if r == d {
+ p.d = d
+ p.x += x + 1
+ return string(p.token), p.d
+ }
+ }
+
+ p.token = append(p.token, r)
+ }
+
+ p.x = len(p.v)
+ return string(p.token), 0
+}
+
+// ReadEscaped reads the next token from content until one of the
+// delimiters is found, unless the delimiter is escaped with the esc
+// character.
+//
+// For example, if the content is `a\ b` and one of the delimiters is " ",
+// escaping it with "\" will return "a b" instead of "a".
+// A doubled escape character (`\\`) is emitted as a single literal esc.
+func (p *Parser) ReadEscaped(esc rune) (string, rune) {
+ var isEscaped bool
+
+ p.token = p.token[:0]
+
+ if p.x >= len(p.v) {
+ p.d = 0
+ return ``, 0
+ }
+
+ for x, r := range p.v[p.x:] {
+ if r == esc {
+ if isEscaped {
+ p.token = append(p.token, r)
+ isEscaped = false
+ continue
+ }
+ isEscaped = true
+ continue
+ }
+ for _, d := range p.delims {
+ if r == d {
+ if isEscaped {
+ // The delimiter is escaped; keep it as token content.
+ isEscaped = false
+ break
+ }
+
+ p.d = d
+ p.x += x + 1
+ return string(p.token), p.d
+ }
+ }
+
+ p.token = append(p.token, r)
+ }
+
+ p.d = 0
+ p.x = len(p.v)
+ return string(p.token), p.d
+}
+
+// ReadNoSpace reads the next token until one of the delimiters is found,
+// with leading and trailing horizontal spaces removed from the token.
+func (p *Parser) ReadNoSpace() (v string, r rune) {
+ p.d = 0
+ p.token = p.token[:0]
+
+ if p.x >= len(p.v) {
+ return ``, 0
+ }
+
+ var x int
+
+ // Skip leading spaces.
+ for x, r = range p.v[p.x:] {
+ if isHorizontalSpace(r) {
+ continue
+ }
+ break
+ }
+ p.x += x
+
+ for x, r = range p.v[p.x:] {
+ if p.isDelim(r) {
+ p.d = r
+ break
+ }
+ p.token = append(p.token, r)
+ }
+
+ p.x += x + 1 // +1 to skip the delimiter.
+
+ // Remove trailing spaces.
+ for x = len(p.token) - 1; x >= 0; x-- {
+ if isHorizontalSpace(p.token[x]) {
+ continue
+ }
+ break
+ }
+ if x < 0 {
+ // Empty token.
+ return ``, p.d
+ }
+ p.token = p.token[:x+1]
+
+ return string(p.token), p.d
+}
+
+// ReadEnclosed reads the token inside the open and closed characters,
+// ignoring all delimiters that were previously set.
+//
+// It returns the parsed token and the closed character if closed is
+// found; otherwise it returns the whole content with delimiter 0.
+//
+// NOTE(review): unlike Read and ReadEscaped, this method does not reset
+// p.token (or p.d) on entry, so a previously parsed token is prepended to
+// the result; and when closed is not found, the return value is p.v (the
+// entire content, including already-consumed text), not the accumulated
+// token — confirm this is intended.
+func (p *Parser) ReadEnclosed(open, closed rune) (string, rune) {
+ for x, r := range p.v[p.x:] {
+ if x == 0 {
+ // Skip the opening character, but only when it is the
+ // first rune read; any other first rune is kept.
+ if r == open {
+ continue
+ }
+ }
+ if r == closed {
+ p.d = closed
+ p.x += x + 1
+ return string(p.token), p.d
+ }
+
+ p.token = append(p.token, r)
+ }
+
+ p.d = 0
+ p.x = len(p.v)
+ return p.v, 0
+}
+
+// RemoveDelimiters removes each delimiter listed in dels from the current
+// parser; delimiters not in dels are kept in their original order.
+func (p *Parser) RemoveDelimiters(dels string) {
+ var (
+ newdelims string
+ found bool
+ )
+
+ for _, oldd := range p.delims {
+ found = false
+ for _, r := range dels {
+ if r == oldd {
+ found = true
+ break
+ }
+ }
+ if !found {
+ newdelims += string(oldd)
+ }
+ }
+
+ p.delims = newdelims
+}
+
+// Skip parsing n characters, or move to EOF if n is greater than the
+// length of the remaining content.
+func (p *Parser) Skip(n int) {
+ if p.x+n >= len(p.v) {
+ p.x = len(p.v)
+ p.d = 0
+ } else {
+ p.x += n
+ }
+}
+
+// SkipHorizontalSpaces skips all space (" "), tab ("\t"), carriage return
+// ("\r"), and form feed ("\f") characters; and returns the first
+// non-space character found (possibly a new line) without consuming it.
+// It returns 0 when the rest of the content is all horizontal spaces.
+func (p *Parser) SkipHorizontalSpaces() rune {
+ for x, r := range p.v[p.x:] {
+ switch r {
+ case ' ', '\t', '\r', '\f':
+ default:
+ // Leave p.x pointing at the non-space character.
+ p.x += x
+ p.d = r
+ return r
+ }
+ }
+
+ p.d = 0
+ p.x = len(p.v)
+
+ return 0
+}
+
+// SkipLine skips all characters until after the next new line.
+// It returns '\n' when content remains after the new line, or 0 when the
+// new line was the last character or no new line was found (EOF).
+// NOTE(review): the value returned is the new-line delimiter itself, not
+// the first character after it.
+func (p *Parser) SkipLine() rune {
+ for x, r := range p.v[p.x:] {
+ if r == '\n' {
+ p.x += x + 1
+ if p.x >= len(p.v) {
+ p.d = 0
+ } else {
+ p.d = r
+ }
+ return p.d
+ }
+ }
+
+ // All contents has been read, no new line found.
+ p.x = len(p.v)
+ p.d = 0
+
+ return 0
+}
+
+// isHorizontalSpace returns true if r is space, tab, carriage return, or
+// form feed.
+func isHorizontalSpace(r rune) bool {
+ return r == ' ' || r == '\t' || r == '\r' || r == '\f'
+}
diff --git a/lib/strings/parser_benchmark_test.go b/lib/strings/parser_benchmark_test.go
new file mode 100644
index 00000000..a41c2128
--- /dev/null
+++ b/lib/strings/parser_benchmark_test.go
@@ -0,0 +1,22 @@
+// Copyright 2019, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package strings
+
+import "testing"
+
+// Output:
+//
+// BenchmarkParser_Read-4 59117898 20.2 ns/op 0 B/op 0 allocs/op
+//
+// NOTE(review): each iteration also calls Load, so the reported time
+// includes the cost of resetting and reloading the content, not only
+// Read itself.
+func BenchmarkParser_Read(b *testing.B) {
+ content := `abc;def`
+ delims := ` /;`
+
+ p := NewParser(content, delims)
+
+ for x := 0; x < b.N; x++ {
+ p.Read()
+ p.Load(content, delims)
+ }
+}
diff --git a/lib/strings/parser_example_test.go b/lib/strings/parser_example_test.go
new file mode 100644
index 00000000..726609cb
--- /dev/null
+++ b/lib/strings/parser_example_test.go
@@ -0,0 +1,53 @@
+// Copyright 2019, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package strings
+
+import (
+ "fmt"
+ "strings"
+)
+
+// ExampleNewParser demonstrates reading tokens and their delimiters until
+// the content is exhausted (delimiter 0).
+func ExampleNewParser() {
+ content := "[test]\nkey = value"
+ p := NewParser(content, `=[]`)
+
+ for {
+ token, del := p.Read()
+ token = strings.TrimSpace(token)
+ fmt.Printf("%q %q\n", token, del)
+ if del == 0 {
+ break
+ }
+ }
+ // Output:
+ // "" '['
+ // "test" ']'
+ // "key" '='
+ // "value" '\x00'
+}
+
+// ExampleParser_ReadNoSpace demonstrates that surrounding spaces are
+// stripped from each token, and all-space fields become empty tokens.
+func ExampleParser_ReadNoSpace() {
+ var (
+ content = " 1 , \r\t\f, 2 , 3 , 4 , "
+ p = NewParser(content, `,`)
+
+ tok string
+ r rune
+ )
+ for {
+ tok, r = p.ReadNoSpace()
+ fmt.Printf("%q\n", tok)
+ if r == 0 {
+ break
+ }
+ }
+ // Output:
+ // "1"
+ // ""
+ // "2"
+ // "3"
+ // "4"
+ // ""
+}
diff --git a/lib/strings/parser_test.go b/lib/strings/parser_test.go
new file mode 100644
index 00000000..28ccb66d
--- /dev/null
+++ b/lib/strings/parser_test.go
@@ -0,0 +1,357 @@
+// Copyright 2019, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package strings
+
+import (
+ "testing"
+
+ "github.com/shuLhan/share/lib/test"
+)
+
+// TestParser_AddDelimiters verifies that AddDelimiters appends only
+// delimiters that are not already registered.
+func TestParser_AddDelimiters(t *testing.T) {
+ p := &Parser{
+ delims: `/:`,
+ }
+
+ cases := []struct {
+ delims string
+ exp string
+ }{{
+ exp: `/:`,
+ }, {
+ delims: " \t",
+ exp: "/: \t",
+ }, {
+ delims: " \t",
+ exp: "/: \t",
+ }}
+
+ for _, c := range cases {
+ p.AddDelimiters(c.delims)
+ test.Assert(t, `p.delims`, c.exp, p.delims)
+ }
+}
+
+// expLine is the expected result of a single Parser.Line call: the line
+// content and its terminating character.
+type expLine struct {
+ line string
+ c rune
+}
+
+// TestParser_Line verifies Line over empty content, a trailing new line,
+// and embedded empty lines.
+func TestParser_Line(t *testing.T) {
+ p := NewParser(``, "\n")
+
+ cases := []struct {
+ content string
+ exp []expLine
+ }{{
+ content: ``,
+ exp: []expLine{{}},
+ }, {
+ content: `a
+`,
+ exp: []expLine{
+ {`a`, '\n'},
+ {``, 0},
+ },
+ }, {
+ content: `a
+
+b`,
+ exp: []expLine{
+ {`a`, '\n'},
+ {``, '\n'},
+ {`b`, 0},
+ {``, 0},
+ },
+ }}
+
+ for _, c := range cases {
+ p.Load(c.content, "\n")
+
+ for x := 0; x < len(c.exp); x++ {
+ gotLine, gotC := p.Line()
+ test.Assert(t, ``, c.exp[x].line, gotLine)
+ test.Assert(t, ``, c.exp[x].c, gotC)
+ }
+ }
+}
+
+// TestParser_Lines verifies that Lines skips empty lines and trims
+// horizontal spaces from each returned line.
+func TestParser_Lines(t *testing.T) {
+ cases := []struct {
+ desc string
+ content string
+ exp []string
+ }{{
+ desc: `With empty content`,
+ exp: []string{},
+ }, {
+ desc: `With single empty line`,
+ content: "\n",
+ exp: []string{},
+ }, {
+ desc: `With single empty line`,
+ content: " \t\r\f\n",
+ exp: []string{},
+ }, {
+ desc: `With one line, at the end`,
+ content: " \t\r\f\ntest",
+ exp: []string{
+ `test`,
+ },
+ }, {
+ desc: `With one line, in the middle`,
+ content: " \t\r\f\ntest \t\r\f\n",
+ exp: []string{
+ `test`,
+ },
+ }, {
+ desc: `With two lines`,
+ content: "A \t\f\r\n \nB \t\f\r\n",
+ exp: []string{
+ `A`,
+ `B`,
+ },
+ }, {
+ desc: `With three lines`,
+ content: "A \t\f\r\n \n\n\nB\n \t\f\rC",
+ exp: []string{
+ `A`,
+ `B`,
+ `C`,
+ },
+ }}
+
+ p := NewParser(``, ``)
+
+ for _, c := range cases {
+ t.Log(c.desc)
+
+ p.Load(c.content, ``)
+
+ got := p.Lines()
+
+ test.Assert(t, `Lines()`, c.exp, got)
+ }
+}
+
+// TestParser_Stop verifies that Stop returns the remaining unparsed
+// content after each Read, and that the parser can be reloaded from it.
+func TestParser_Stop(t *testing.T) {
+ p := NewParser("\t test \ntest", ``)
+
+ cases := []struct {
+ exp string
+ }{{
+ exp: " test \ntest",
+ }, {
+ exp: "test \ntest",
+ }, {
+ exp: "\ntest",
+ }, {
+ exp: `test`,
+ }, {
+ exp: ``,
+ }}
+
+ var got string
+ for _, c := range cases {
+ _, _ = p.Read()
+ got, _ = p.Stop()
+ test.Assert(t, `Stop`, c.exp, got)
+ p.Load(got, ``)
+ }
+}
+
+// TestParser_Read verifies that Read returns each token and its
+// terminating delimiter in sequence, using the default (whitespace)
+// delimiters.
+//
+// Renamed from TestParserRead for consistency with the TestParser_X
+// naming used by every other test in this file.
+func TestParser_Read(t *testing.T) {
+ p := NewParser("\t test \ntest", ``)
+
+ cases := []struct {
+ expToken string
+ expDelim rune
+ }{{
+ expDelim: '\t',
+ }, {
+ expDelim: ' ',
+ }, {
+ expToken: `test`,
+ expDelim: ' ',
+ }, {
+ expDelim: '\n',
+ }, {
+ expToken: `test`,
+ }}
+
+ for _, c := range cases {
+ gotToken, gotDelim := p.Read()
+
+ test.Assert(t, `token`, c.expToken, gotToken)
+ test.Assert(t, `delim`, c.expDelim, gotDelim)
+ }
+}
+
+// TestParser_ReadEscaped verifies that ReadEscaped keeps an escaped
+// delimiter as token content instead of terminating the token.
+//
+// Renamed from TestParser_TokenEscaped: this commit renames TokenEscaped
+// to ReadEscaped, so the old test name was stale.
+func TestParser_ReadEscaped(t *testing.T) {
+ p := NewParser("\t te\\ st \ntest", ``)
+
+ cases := []struct {
+ expToken string
+ expDelim rune
+ }{{
+ expDelim: '\t',
+ }, {
+ expDelim: ' ',
+ }, {
+ expToken: `te st`,
+ expDelim: ' ',
+ }, {
+ expDelim: '\n',
+ }, {
+ expToken: `test`,
+ }}
+
+ for _, c := range cases {
+ gotToken, gotDelim := p.ReadEscaped('\\')
+
+ test.Assert(t, `token`, c.expToken, gotToken)
+ test.Assert(t, `delim`, c.expDelim, gotDelim)
+ }
+}
+
+// TestParser_SkipLine verifies the first token read after skipping one
+// line, for empty, single-line, and multi-line content.
+func TestParser_SkipLine(t *testing.T) {
+ cases := []struct {
+ desc string
+ content string
+ expToken string
+ expDelim rune
+ }{{
+ desc: `With empty content`,
+ }, {
+ desc: `With empty line`,
+ content: "\ntest\n",
+ expToken: `test`,
+ expDelim: '\n',
+ }, {
+ desc: `With single line`,
+ content: "test\n",
+ }, {
+ desc: `With two lines`,
+ content: "test 1\ntest 2",
+ expToken: `test`,
+ expDelim: ' ',
+ }}
+
+ p := NewParser(``, ``)
+
+ for _, c := range cases {
+ t.Log(c.desc)
+
+ p.Load(c.content, ``)
+
+ p.SkipLine()
+
+ gotToken, gotDelim := p.Read()
+
+ test.Assert(t, `token`, c.expToken, gotToken)
+ test.Assert(t, `delim`, c.expDelim, gotDelim)
+ }
+}
+
+// TestOpenForParser verifies the error for a missing file and the loaded
+// content for an existing file.
+func TestOpenForParser(t *testing.T) {
+ cases := []struct {
+ desc string
+ file string
+ expError string
+ expContent string
+ }{{
+ desc: `With not existing file`,
+ file: `testdata/xxx`,
+ expError: `open testdata/xxx: no such file or directory`,
+ }, {
+ desc: `With file exist`,
+ file: `testdata/parser/open_test.txt`,
+ expContent: "test\n",
+ }}
+
+ for _, c := range cases {
+ t.Log(c.desc)
+
+ p, err := OpenForParser(c.file, ``)
+ if err != nil {
+ test.Assert(t, `error`, c.expError, err.Error())
+ continue
+ }
+
+ test.Assert(t, `content`, c.expContent, p.v)
+ }
+}
+
+// TestParser_RemoveDelimiters verifies removal of single, duplicated, and
+// multiple delimiters from the parser.
+func TestParser_RemoveDelimiters(t *testing.T) {
+ p := &Parser{
+ delims: "/: \t",
+ }
+ cases := []struct {
+ delims string
+ exp string
+ }{{
+ exp: "/: \t",
+ }, {
+ delims: `/`,
+ exp: ": \t",
+ }, {
+ delims: `///`,
+ exp: ": \t",
+ }, {
+ delims: "\t :",
+ exp: ``,
+ }}
+
+ for _, c := range cases {
+ p.RemoveDelimiters(c.delims)
+ test.Assert(t, `p.delims`, c.exp, p.delims)
+ }
+}
+
+// TestParser_SkipHorizontalSpaces verifies the rune returned after
+// skipping spaces, and the token/delimiter produced by the next Read.
+func TestParser_SkipHorizontalSpaces(t *testing.T) {
+ cases := []struct {
+ desc string
+ content string
+ expToken string
+ expRune rune
+ expDelim rune
+ }{{
+ desc: `With empty content`,
+ }, {
+ desc: `With empty line`,
+ content: " \t\r\f\n",
+ expRune: '\n',
+ expDelim: '\n',
+ }, {
+ desc: `With single line`,
+ content: "test\n",
+ expRune: 't',
+ expToken: `test`,
+ expDelim: '\n',
+ }, {
+ desc: `With space in the beginning`,
+ content: " \t\f\rtest 1\ntest 2",
+ expRune: 't',
+ expToken: `test`,
+ expDelim: ' ',
+ }}
+
+ p := NewParser(``, ``)
+
+ for _, c := range cases {
+ t.Log(c.desc)
+
+ p.Load(c.content, ``)
+
+ got := p.SkipHorizontalSpaces()
+
+ test.Assert(t, `rune`, c.expRune, got)
+
+ gotToken, gotDelim := p.Read()
+
+ test.Assert(t, `token`, c.expToken, gotToken)
+ test.Assert(t, `delim`, c.expDelim, gotDelim)
+ }
+}
diff --git a/lib/strings/testdata/parser/open_test.txt b/lib/strings/testdata/parser/open_test.txt
new file mode 100644
index 00000000..9daeafb9
--- /dev/null
+++ b/lib/strings/testdata/parser/open_test.txt
@@ -0,0 +1 @@
+test