diff options
| author | Shulhan <ms@kilabit.info> | 2023-04-05 23:04:04 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2023-04-05 23:07:55 +0700 |
| commit | 98a256e57f2df1b10b7cd75808bee68d8abd1dcd (patch) | |
| tree | 606c360e5d74388dc7207b912d91c6c88a1b5f70 | |
| parent | 144ceef8e29f8780c84bc5ae2589dab3895c5ee6 (diff) | |
| download | pakakeh.go-98a256e57f2df1b10b7cd75808bee68d8abd1dcd.tar.xz | |
lib/strings: merge lib/parser here
The original idea of the parser was to provide a generic parser for both
bytes and strings.
Since we introduced lib/parser, there have not been many changes to that package.
Also, since we created another Parser in lib/bytes that accepts and
returns tokens as []byte, lib/parser is not unique anymore.
The following functions/methods are renamed to minimize conflicts in the future:
* Lines become LinesOfFile
* New become NewParser
* Open become OpenForParser
* Token become Read
* TokenEscaped become ReadEscaped
* TokenTrimSpace become ReadNoSpace
| -rw-r--r-- | lib/parser/parser.go | 3 | ||||
| -rw-r--r-- | lib/strings/parser.go | 409 | ||||
| -rw-r--r-- | lib/strings/parser_benchmark_test.go | 22 | ||||
| -rw-r--r-- | lib/strings/parser_example_test.go | 53 | ||||
| -rw-r--r-- | lib/strings/parser_test.go | 357 | ||||
| -rw-r--r-- | lib/strings/testdata/parser/open_test.txt | 1 |
6 files changed, 845 insertions, 0 deletions
diff --git a/lib/parser/parser.go b/lib/parser/parser.go index 7c084d6d..3562d9c2 100644 --- a/lib/parser/parser.go +++ b/lib/parser/parser.go @@ -3,6 +3,9 @@ // license that can be found in the LICENSE file. // Package parser provide a common text parser, using delimiters. +// +// DEPRECATED: this package has been merged with package lib/strings and will +// be removed in the next six release v0.51.0. package parser import ( diff --git a/lib/strings/parser.go b/lib/strings/parser.go new file mode 100644 index 00000000..9efac353 --- /dev/null +++ b/lib/strings/parser.go @@ -0,0 +1,409 @@ +// Copyright 2019, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings + +import ( + "fmt" + "os" + + libascii "github.com/shuLhan/share/lib/ascii" +) + +// Parser implement text parsing over string. +type Parser struct { + file string + delims string + v string // v contains the text to be parsed. + token []rune // token that has been parsed. + x int // x is the position of read in v. + d rune // d is one of delims character that terminated parsing. +} + +// LinesOfFile parse the content of file and return non-empty lines. +func LinesOfFile(file string) ([]string, error) { + p, err := OpenForParser(file, ``) + if err != nil { + return nil, fmt.Errorf(`LinesOfFile: %w`, err) + } + return p.Lines(), nil +} + +// NewParser create and initialize parser from content and delimiters. +func NewParser(content, delims string) (p *Parser) { + p = &Parser{ + token: make([]rune, 0, 16), + } + + p.Load(content, delims) + + return p +} + +// OpenForParser create and initialize the Parser using content from file. +// If delimiters is empty, it would default to all whitespaces characters. 
+func OpenForParser(file, delims string) (p *Parser, err error) { + v, err := os.ReadFile(file) + if err != nil { + return nil, err + } + + p = NewParser(string(v), delims) + p.file = file + + return p, nil +} + +// AddDelimiters append new delimiter to existing parser. +func (p *Parser) AddDelimiters(delims string) { + var found bool + for _, newd := range delims { + found = false + for _, oldd := range p.delims { + if oldd == newd { + found = true + break + } + } + if !found { + p.delims += string(newd) + } + } +} + +// Close the parser by resetting all its internal state to zero value. +func (p *Parser) Close() { + p.file = `` + p.delims = `` + p.x = 0 + p.v = `` + p.token = p.token[:0] + p.d = 0 +} + +// isDelim true if r is one of delimiters. +func (p *Parser) isDelim(r rune) bool { + var d rune + for _, d = range p.delims { + if r == d { + return true + } + } + return false +} + +// Lines return all non-empty lines from the content. +func (p *Parser) Lines() []string { + var start, end int + + lines := make([]string, 0) + + for x := p.x; x < len(p.v); x++ { + // Skip white spaces on beginning ... + for ; x < len(p.v); x++ { + if p.v[x] == ' ' || p.v[x] == '\t' || p.v[x] == '\r' || p.v[x] == '\f' { + continue + } + break + } + start = x + for ; x < len(p.v); x++ { + if p.v[x] != '\n' { + continue + } + break + } + + // Skip white spaces at the end ... + for end = x - 1; end > start; end-- { + if p.v[end] == ' ' || p.v[end] == '\t' || + p.v[end] == '\r' || p.v[end] == '\f' { + continue + } + break + } + end++ + if start == end { + // Skip empty lines + continue + } + + line := p.v[start:end] + lines = append(lines, line) + } + + p.x = len(p.v) + + return lines +} + +// Load the new content and delimiters. +func (p *Parser) Load(content, delims string) { + p.Close() + p.v = content + if len(delims) == 0 { + p.delims = string(libascii.Spaces) + } else { + p.delims = delims + } +} + +// Line read and return a single line. 
+// On success it will return a string without '\n' and new line character. +// In case of EOF it will return the last line and 0. +func (p *Parser) Line() (string, rune) { + p.d = 0 + p.token = p.token[:0] + + for x, r := range p.v[p.x:] { + if r == '\n' { + p.d = r + p.x += x + 1 + return string(p.token), p.d + } + p.token = append(p.token, r) + } + p.x = len(p.v) + return string(p.token), 0 +} + +// SetDelimiters replace the current delimiters with delims. +func (p *Parser) SetDelimiters(delims string) { + p.delims = delims +} + +// Stop the parser, return the remaining unparsed content and its last +// position, and then call Close to reset the internal state back to zero. +func (p *Parser) Stop() (remain string, pos int) { + pos = p.x + remain = p.v[pos:] + p.Close() + return remain, pos +} + +// Read read the next token from content until one of the delimiter found. +// if no delimiter found, its mean all of content has been read, the returned +// delimiter will be 0. +func (p *Parser) Read() (string, rune) { + p.d = 0 + p.token = p.token[:0] + + if p.x >= len(p.v) { + return ``, 0 + } + + for x, r := range p.v[p.x:] { + for _, d := range p.delims { + if r == d { + p.d = d + p.x += x + 1 + return string(p.token), p.d + } + } + + p.token = append(p.token, r) + } + + p.x = len(p.v) + return string(p.token), 0 +} + +// ReadEscaped read the next token from content until one of the delimiter +// found, unless its escaped with value of esc character. +// +// For example, if the content is "a b" and one of the delimiter is " ", +// escaping it with "\" will return as "a b" not "a". 
+func (p *Parser) ReadEscaped(esc rune) (string, rune) { + var isEscaped bool + + p.token = p.token[:0] + + if p.x >= len(p.v) { + p.d = 0 + return ``, 0 + } + + for x, r := range p.v[p.x:] { + if r == esc { + if isEscaped { + p.token = append(p.token, r) + isEscaped = false + continue + } + isEscaped = true + continue + } + for _, d := range p.delims { + if r == d { + if isEscaped { + isEscaped = false + break + } + + p.d = d + p.x += x + 1 + return string(p.token), p.d + } + } + + p.token = append(p.token, r) + } + + p.d = 0 + p.x = len(p.v) + return string(p.token), p.d +} + +// ReadNoSpace read the next token until one of the delimiter found, with +// leading and trailing spaces are ignored. +func (p *Parser) ReadNoSpace() (v string, r rune) { + p.d = 0 + p.token = p.token[:0] + + if p.x >= len(p.v) { + return ``, 0 + } + + var x int + + // Skip leading spaces. + for x, r = range p.v[p.x:] { + if isHorizontalSpace(r) { + continue + } + break + } + p.x += x + + for x, r = range p.v[p.x:] { + if p.isDelim(r) { + p.d = r + break + } + p.token = append(p.token, r) + } + + p.x += x + 1 // +1 to skip the delimiter. + + // Remove trailing spaces. + for x = len(p.token) - 1; x >= 0; x-- { + if isHorizontalSpace(p.token[x]) { + continue + } + break + } + if x < 0 { + // Empty token. + return ``, p.d + } + p.token = p.token[:x+1] + + return string(p.token), p.d +} + +// ReadEnclosed read the token inside opening and closing characters, ignoring +// all delimiters that previously set. +// +// It will return the parsed token and closed character if closed character +// found, otherwise it will token with 0. +func (p *Parser) ReadEnclosed(open, closed rune) (string, rune) { + for x, r := range p.v[p.x:] { + if x == 0 { + if r == open { + continue + } + } + if r == closed { + p.d = closed + p.x += x + 1 + return string(p.token), p.d + } + + p.token = append(p.token, r) + } + + p.d = 0 + p.x = len(p.v) + return p.v, 0 +} + +// RemoveDelimiters from current parser. 
+func (p *Parser) RemoveDelimiters(dels string) { + var ( + newdelims string + found bool + ) + + for _, oldd := range p.delims { + found = false + for _, r := range dels { + if r == oldd { + found = true + break + } + } + if !found { + newdelims += string(oldd) + } + } + + p.delims = newdelims +} + +// Skip parsing n characters or EOF if n is greater then length of content. +func (p *Parser) Skip(n int) { + if p.x+n >= len(p.v) { + p.x = len(p.v) + p.d = 0 + } else { + p.x += n + } +} + +// SkipHorizontalSpaces skip all space (" "), tab ("\t"), carriage return +// ("\r"), and form feed ("\f") characters; and return the first character +// found, probably new line. +func (p *Parser) SkipHorizontalSpaces() rune { + for x, r := range p.v[p.x:] { + switch r { + case ' ', '\t', '\r', '\f': + default: + p.x += x + p.d = r + return r + } + } + + p.d = 0 + p.x = len(p.v) + + return 0 +} + +// SkipLine skip all characters until new line. +// It will return the first character after new line or 0 if EOF. +func (p *Parser) SkipLine() rune { + for x, r := range p.v[p.x:] { + if r == '\n' { + p.x += x + 1 + if p.x >= len(p.v) { + p.d = 0 + } else { + p.d = r + } + return p.d + } + } + + // All contents has been read, no new line found. + p.x = len(p.v) + p.d = 0 + + return 0 +} + +// isHorizontalSpace true if r is space, tab, carriage return, or form feed. +func isHorizontalSpace(r rune) bool { + return r == ' ' || r == '\t' || r == '\r' || r == '\f' +} diff --git a/lib/strings/parser_benchmark_test.go b/lib/strings/parser_benchmark_test.go new file mode 100644 index 00000000..a41c2128 --- /dev/null +++ b/lib/strings/parser_benchmark_test.go @@ -0,0 +1,22 @@ +// Copyright 2019, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package strings + +import "testing" + +// Output: +// +// BenchmarkParser_Read-4 59117898 20.2 ns/op 0 B/op 0 allocs/op +func BenchmarkParser_Read(b *testing.B) { + content := `abc;def` + delims := ` /;` + + p := NewParser(content, delims) + + for x := 0; x < b.N; x++ { + p.Read() + p.Load(content, delims) + } +} diff --git a/lib/strings/parser_example_test.go b/lib/strings/parser_example_test.go new file mode 100644 index 00000000..726609cb --- /dev/null +++ b/lib/strings/parser_example_test.go @@ -0,0 +1,53 @@ +// Copyright 2019, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings + +import ( + "fmt" + "strings" +) + +func ExampleNewParser() { + content := "[test]\nkey = value" + p := NewParser(content, `=[]`) + + for { + token, del := p.Read() + token = strings.TrimSpace(token) + fmt.Printf("%q %q\n", token, del) + if del == 0 { + break + } + } + // Output: + // "" '[' + // "test" ']' + // "key" '=' + // "value" '\x00' +} + +func ExampleParser_ReadNoSpace() { + var ( + content = " 1 , \r\t\f, 2 , 3 , 4 , " + p = NewParser(content, `,`) + + tok string + r rune + ) + for { + tok, r = p.ReadNoSpace() + fmt.Printf("%q\n", tok) + if r == 0 { + break + } + } + // Output: + // "1" + // "" + // "2" + // "3" + // "4" + // "" +} diff --git a/lib/strings/parser_test.go b/lib/strings/parser_test.go new file mode 100644 index 00000000..28ccb66d --- /dev/null +++ b/lib/strings/parser_test.go @@ -0,0 +1,357 @@ +// Copyright 2019, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package strings + +import ( + "testing" + + "github.com/shuLhan/share/lib/test" +) + +func TestParser_AddDelimiters(t *testing.T) { + p := &Parser{ + delims: `/:`, + } + + cases := []struct { + delims string + exp string + }{{ + exp: `/:`, + }, { + delims: " \t", + exp: "/: \t", + }, { + delims: " \t", + exp: "/: \t", + }} + + for _, c := range cases { + p.AddDelimiters(c.delims) + test.Assert(t, `p.delims`, c.exp, p.delims) + } +} + +type expLine struct { + line string + c rune +} + +func TestParser_Line(t *testing.T) { + p := NewParser(``, "\n") + + cases := []struct { + content string + exp []expLine + }{{ + content: ``, + exp: []expLine{{}}, + }, { + content: `a +`, + exp: []expLine{ + {`a`, '\n'}, + {``, 0}, + }, + }, { + content: `a + +b`, + exp: []expLine{ + {`a`, '\n'}, + {``, '\n'}, + {`b`, 0}, + {``, 0}, + }, + }} + + for _, c := range cases { + p.Load(c.content, "\n") + + for x := 0; x < len(c.exp); x++ { + gotLine, gotC := p.Line() + test.Assert(t, ``, c.exp[x].line, gotLine) + test.Assert(t, ``, c.exp[x].c, gotC) + } + } +} + +func TestParser_Lines(t *testing.T) { + cases := []struct { + desc string + content string + exp []string + }{{ + desc: `With empty content`, + exp: []string{}, + }, { + desc: `With single empty line`, + content: "\n", + exp: []string{}, + }, { + desc: `With single empty line`, + content: " \t\r\f\n", + exp: []string{}, + }, { + desc: `With one line, at the end`, + content: " \t\r\f\ntest", + exp: []string{ + `test`, + }, + }, { + desc: `With one line, in the middle`, + content: " \t\r\f\ntest \t\r\f\n", + exp: []string{ + `test`, + }, + }, { + desc: `With two lines`, + content: "A \t\f\r\n \nB \t\f\r\n", + exp: []string{ + `A`, + `B`, + }, + }, { + desc: `With three lines`, + content: "A \t\f\r\n \n\n\nB\n \t\f\r\nC", + exp: []string{ + `A`, + `B`, + `C`, + }, + }} + + p := NewParser(``, ``) + + for _, c := range cases { + t.Log(c.desc) + + p.Load(c.content, ``) + + got := p.Lines() + + test.Assert(t, `Lines()`, c.exp, got) + 
} +} + +func TestParser_Stop(t *testing.T) { + p := NewParser("\t test \ntest", ``) + + cases := []struct { + exp string + }{{ + exp: " test \ntest", + }, { + exp: "test \ntest", + }, { + exp: "\ntest", + }, { + exp: `test`, + }, { + exp: ``, + }} + + var got string + for _, c := range cases { + _, _ = p.Read() + got, _ = p.Stop() + test.Assert(t, `Stop`, c.exp, got) + p.Load(got, ``) + } +} + +func TestParserRead(t *testing.T) { + p := NewParser("\t test \ntest", ``) + + cases := []struct { + expToken string + expDelim rune + }{{ + expDelim: '\t', + }, { + expDelim: ' ', + }, { + expToken: `test`, + expDelim: ' ', + }, { + expDelim: '\n', + }, { + expToken: `test`, + }} + + for _, c := range cases { + gotToken, gotDelim := p.Read() + + test.Assert(t, `token`, c.expToken, gotToken) + test.Assert(t, `delim`, c.expDelim, gotDelim) + } +} + +func TestParser_TokenEscaped(t *testing.T) { + p := NewParser("\t te\\ st \ntest", ``) + + cases := []struct { + expToken string + expDelim rune + }{{ + expDelim: '\t', + }, { + expDelim: ' ', + }, { + expToken: `te st`, + expDelim: ' ', + }, { + expDelim: '\n', + }, { + expToken: `test`, + }} + + for _, c := range cases { + gotToken, gotDelim := p.ReadEscaped('\\') + + test.Assert(t, `token`, c.expToken, gotToken) + test.Assert(t, `delim`, c.expDelim, gotDelim) + } +} + +func TestParser_SkipLine(t *testing.T) { + cases := []struct { + desc string + content string + expToken string + expDelim rune + }{{ + desc: `With empty content`, + }, { + desc: `With empty line`, + content: "\ntest\n", + expToken: `test`, + expDelim: '\n', + }, { + desc: `With single line`, + content: "test\n", + }, { + desc: `With two lines`, + content: "test 1\ntest 2", + expToken: `test`, + expDelim: ' ', + }} + + p := NewParser(``, ``) + + for _, c := range cases { + t.Log(c.desc) + + p.Load(c.content, ``) + + p.SkipLine() + + gotToken, gotDelim := p.Read() + + test.Assert(t, `token`, c.expToken, gotToken) + test.Assert(t, `delim`, c.expDelim, gotDelim) + } 
+} + +func TestOpenForParser(t *testing.T) { + cases := []struct { + desc string + file string + expError string + expContent string + }{{ + desc: `With not existing file`, + file: `testdata/xxx`, + expError: `open testdata/xxx: no such file or directory`, + }, { + desc: `With file exist`, + file: `testdata/parser/open_test.txt`, + expContent: "test\n", + }} + + for _, c := range cases { + t.Log(c.desc) + + p, err := OpenForParser(c.file, ``) + if err != nil { + test.Assert(t, `error`, c.expError, err.Error()) + continue + } + + test.Assert(t, `content`, c.expContent, p.v) + } +} + +func TestParser_RemoveDelimiters(t *testing.T) { + p := &Parser{ + delims: "/: \t", + } + cases := []struct { + delims string + exp string + }{{ + exp: "/: \t", + }, { + delims: `/`, + exp: ": \t", + }, { + delims: `///`, + exp: ": \t", + }, { + delims: "\t :", + exp: ``, + }} + + for _, c := range cases { + p.RemoveDelimiters(c.delims) + test.Assert(t, `p.delims`, c.exp, p.delims) + } +} + +func TestParser_SkipHorizontalSpaces(t *testing.T) { + cases := []struct { + desc string + content string + expToken string + expRune rune + expDelim rune + }{{ + desc: `With empty content`, + }, { + desc: `With empty line`, + content: " \t\r\f\n", + expRune: '\n', + expDelim: '\n', + }, { + desc: `With single line`, + content: "test\n", + expRune: 't', + expToken: `test`, + expDelim: '\n', + }, { + desc: `With space in the beginning`, + content: " \t\f\rtest 1\ntest 2", + expRune: 't', + expToken: `test`, + expDelim: ' ', + }} + + p := NewParser(``, ``) + + for _, c := range cases { + t.Log(c.desc) + + p.Load(c.content, ``) + + got := p.SkipHorizontalSpaces() + + test.Assert(t, `rune`, c.expRune, got) + + gotToken, gotDelim := p.Read() + + test.Assert(t, `token`, c.expToken, gotToken) + test.Assert(t, `delim`, c.expDelim, gotDelim) + } +} diff --git a/lib/strings/testdata/parser/open_test.txt b/lib/strings/testdata/parser/open_test.txt new file mode 100644 index 00000000..9daeafb9 --- /dev/null +++ 
b/lib/strings/testdata/parser/open_test.txt @@ -0,0 +1 @@ +test |
