diff options
| author | Shulhan <ms@kilabit.info> | 2023-04-05 23:04:04 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2023-04-05 23:07:55 +0700 |
| commit | 98a256e57f2df1b10b7cd75808bee68d8abd1dcd (patch) | |
| tree | 606c360e5d74388dc7207b912d91c6c88a1b5f70 /lib/strings/parser.go | |
| parent | 144ceef8e29f8780c84bc5ae2589dab3895c5ee6 (diff) | |
| download | pakakeh.go-98a256e57f2df1b10b7cd75808bee68d8abd1dcd.tar.xz | |
lib/strings: merge lib/parser here
The first idea of parser is to provide generic parser for both bytes and
string.
After we introduce lib/parser there is not much changes to that package.
Also, since we create another Parser in lib/bytes that accept and
return token as []byte, the lib/parser is not unique anymore.
The following function/methods changes to minimize conflict in the future,
* Lines become LinesOfFile
* New become NewParser
* Open become OpenForParser
* Token become Read
* TokenEscaped become ReadEscaped
* TokenTrimSpace become ReadNoSpace
Diffstat (limited to 'lib/strings/parser.go')
| -rw-r--r-- | lib/strings/parser.go | 409 |
1 files changed, 409 insertions, 0 deletions
diff --git a/lib/strings/parser.go b/lib/strings/parser.go new file mode 100644 index 00000000..9efac353 --- /dev/null +++ b/lib/strings/parser.go @@ -0,0 +1,409 @@ +// Copyright 2019, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package strings + +import ( + "fmt" + "os" + + libascii "github.com/shuLhan/share/lib/ascii" +) + +// Parser implement text parsing over string. +type Parser struct { + file string + delims string + v string // v contains the text to be parsed. + token []rune // token that has been parsed. + x int // x is the position of read in v. + d rune // d is one of delims character that terminated parsing. +} + +// LinesOfFile parse the content of file and return non-empty lines. +func LinesOfFile(file string) ([]string, error) { + p, err := OpenForParser(file, ``) + if err != nil { + return nil, fmt.Errorf(`LinesOfFile: %w`, err) + } + return p.Lines(), nil +} + +// NewParser create and initialize parser from content and delimiters. +func NewParser(content, delims string) (p *Parser) { + p = &Parser{ + token: make([]rune, 0, 16), + } + + p.Load(content, delims) + + return p +} + +// OpenForParser create and initialize the Parser using content from file. +// If delimiters is empty, it would default to all whitespaces characters. +func OpenForParser(file, delims string) (p *Parser, err error) { + v, err := os.ReadFile(file) + if err != nil { + return nil, err + } + + p = NewParser(string(v), delims) + p.file = file + + return p, nil +} + +// AddDelimiters append new delimiter to existing parser. +func (p *Parser) AddDelimiters(delims string) { + var found bool + for _, newd := range delims { + found = false + for _, oldd := range p.delims { + if oldd == newd { + found = true + break + } + } + if !found { + p.delims += string(newd) + } + } +} + +// Close the parser by resetting all its internal state to zero value. +func (p *Parser) Close() { + p.file = `` + p.delims = `` + p.x = 0 + p.v = `` + p.token = p.token[:0] + p.d = 0 +} + +// isDelim true if r is one of delimiters. +func (p *Parser) isDelim(r rune) bool { + var d rune + for _, d = range p.delims { + if r == d { + return true + } + } + return false +} + +// Lines return all non-empty lines from the content. +func (p *Parser) Lines() []string { + var start, end int + + lines := make([]string, 0) + + for x := p.x; x < len(p.v); x++ { + // Skip white spaces on beginning ... + for ; x < len(p.v); x++ { + if p.v[x] == ' ' || p.v[x] == '\t' || p.v[x] == '\r' || p.v[x] == '\f' { + continue + } + break + } + start = x + for ; x < len(p.v); x++ { + if p.v[x] != '\n' { + continue + } + break + } + + // Skip white spaces at the end ... + for end = x - 1; end > start; end-- { + if p.v[end] == ' ' || p.v[end] == '\t' || + p.v[end] == '\r' || p.v[end] == '\f' { + continue + } + break + } + end++ + if start == end { + // Skip empty lines + continue + } + + line := p.v[start:end] + lines = append(lines, line) + } + + p.x = len(p.v) + + return lines +} + +// Load the new content and delimiters. +func (p *Parser) Load(content, delims string) { + p.Close() + p.v = content + if len(delims) == 0 { + p.delims = string(libascii.Spaces) + } else { + p.delims = delims + } +} + +// Line read and return a single line. +// On success it will return a string without '\n' and new line character. +// In case of EOF it will return the last line and 0. +func (p *Parser) Line() (string, rune) { + p.d = 0 + p.token = p.token[:0] + + for x, r := range p.v[p.x:] { + if r == '\n' { + p.d = r + p.x += x + 1 + return string(p.token), p.d + } + p.token = append(p.token, r) + } + p.x = len(p.v) + return string(p.token), 0 +} + +// SetDelimiters replace the current delimiters with delims. +func (p *Parser) SetDelimiters(delims string) { + p.delims = delims +} + +// Stop the parser, return the remaining unparsed content and its last +// position, and then call Close to reset the internal state back to zero. +func (p *Parser) Stop() (remain string, pos int) { + pos = p.x + remain = p.v[pos:] + p.Close() + return remain, pos +} + +// Read read the next token from content until one of the delimiter found. +// if no delimiter found, its mean all of content has been read, the returned +// delimiter will be 0. +func (p *Parser) Read() (string, rune) { + p.d = 0 + p.token = p.token[:0] + + if p.x >= len(p.v) { + return ``, 0 + } + + for x, r := range p.v[p.x:] { + for _, d := range p.delims { + if r == d { + p.d = d + p.x += x + 1 + return string(p.token), p.d + } + } + + p.token = append(p.token, r) + } + + p.x = len(p.v) + return string(p.token), 0 +} + +// ReadEscaped read the next token from content until one of the delimiter +// found, unless its escaped with value of esc character. +// +// For example, if the content is "a b" and one of the delimiter is " ", +// escaping it with "\" will return as "a b" not "a". +func (p *Parser) ReadEscaped(esc rune) (string, rune) { + var isEscaped bool + + p.token = p.token[:0] + + if p.x >= len(p.v) { + p.d = 0 + return ``, 0 + } + + for x, r := range p.v[p.x:] { + if r == esc { + if isEscaped { + p.token = append(p.token, r) + isEscaped = false + continue + } + isEscaped = true + continue + } + for _, d := range p.delims { + if r == d { + if isEscaped { + isEscaped = false + break + } + + p.d = d + p.x += x + 1 + return string(p.token), p.d + } + } + + p.token = append(p.token, r) + } + + p.d = 0 + p.x = len(p.v) + return string(p.token), p.d +} + +// ReadNoSpace read the next token until one of the delimiter found, with +// leading and trailing spaces are ignored. +func (p *Parser) ReadNoSpace() (v string, r rune) { + p.d = 0 + p.token = p.token[:0] + + if p.x >= len(p.v) { + return ``, 0 + } + + var x int + + // Skip leading spaces. + for x, r = range p.v[p.x:] { + if isHorizontalSpace(r) { + continue + } + break + } + p.x += x + + for x, r = range p.v[p.x:] { + if p.isDelim(r) { + p.d = r + break + } + p.token = append(p.token, r) + } + + p.x += x + 1 // +1 to skip the delimiter. + + // Remove trailing spaces. + for x = len(p.token) - 1; x >= 0; x-- { + if isHorizontalSpace(p.token[x]) { + continue + } + break + } + if x < 0 { + // Empty token. + return ``, p.d + } + p.token = p.token[:x+1] + + return string(p.token), p.d +} + +// ReadEnclosed read the token inside opening and closing characters, ignoring +// all delimiters that previously set. +// +// It will return the parsed token and closed character if closed character +// found, otherwise it will token with 0. +func (p *Parser) ReadEnclosed(open, closed rune) (string, rune) { + for x, r := range p.v[p.x:] { + if x == 0 { + if r == open { + continue + } + } + if r == closed { + p.d = closed + p.x += x + 1 + return string(p.token), p.d + } + + p.token = append(p.token, r) + } + + p.d = 0 + p.x = len(p.v) + return p.v, 0 +} + +// RemoveDelimiters from current parser. +func (p *Parser) RemoveDelimiters(dels string) { + var ( + newdelims string + found bool + ) + + for _, oldd := range p.delims { + found = false + for _, r := range dels { + if r == oldd { + found = true + break + } + } + if !found { + newdelims += string(oldd) + } + } + + p.delims = newdelims +} + +// Skip parsing n characters or EOF if n is greater then length of content. +func (p *Parser) Skip(n int) { + if p.x+n >= len(p.v) { + p.x = len(p.v) + p.d = 0 + } else { + p.x += n + } +} + +// SkipHorizontalSpaces skip all space (" "), tab ("\t"), carriage return +// ("\r"), and form feed ("\f") characters; and return the first character +// found, probably new line. +func (p *Parser) SkipHorizontalSpaces() rune { + for x, r := range p.v[p.x:] { + switch r { + case ' ', '\t', '\r', '\f': + default: + p.x += x + p.d = r + return r + } + } + + p.d = 0 + p.x = len(p.v) + + return 0 +} + +// SkipLine skip all characters until new line. +// It will return the first character after new line or 0 if EOF. +func (p *Parser) SkipLine() rune { + for x, r := range p.v[p.x:] { + if r == '\n' { + p.x += x + 1 + if p.x >= len(p.v) { + p.d = 0 + } else { + p.d = r + } + return p.d + } + } + + // All contents has been read, no new line found. + p.x = len(p.v) + p.d = 0 + + return 0 +} + +// isHorizontalSpace true if r is space, tab, carriage return, or form feed. +func isHorizontalSpace(r rune) bool { + return r == ' ' || r == '\t' || r == '\r' || r == '\f' +} |
