From 2bca5ef4f69b8f33e76660dfd784fa2b3e2251c4 Mon Sep 17 00:00:00 2001 From: Shulhan Date: Tue, 5 Nov 2019 22:16:46 +0700 Subject: lib/parser: a general parser library --- lib/parser/parser.go | 349 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 349 insertions(+) create mode 100644 lib/parser/parser.go (limited to 'lib/parser/parser.go') diff --git a/lib/parser/parser.go b/lib/parser/parser.go new file mode 100644 index 00000000..8c5d07e7 --- /dev/null +++ b/lib/parser/parser.go @@ -0,0 +1,349 @@ +// Copyright 2019, Shulhan . All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +// Package parser provide a common text parser, using delimiters. +// +package parser + +import ( + "fmt" + "io/ioutil" + + libascii "github.com/shuLhan/share/lib/ascii" +) + +// +// Parser implement text parsing. +// +type Parser struct { + file string + delims string + x int // x is the position of read in v. + v string // v contains the text to be parsed. + token []rune // token that has been parsed. + d rune // d is one of delims character that terminated parsing. +} + +// +// Lines parse the content of path and return non-empty lines. +// +func Lines(file string) ([]string, error) { + p, err := Open(file, "") + if err != nil { + return nil, fmt.Errorf("Lines: %w", err) + } + return p.Lines(), nil +} + +// +// New create and initialize parser from content and delimiters. +// +func New(content, delims string) (p *Parser) { + p = &Parser{ + token: make([]rune, 0, 16), + } + + p.Load(content, delims) + + return p +} + +// +// Open create and initialize the parser using predefined delimiters. +// All the content of file will be loaded first. +// If delimiters is empty, it would default to all whitespaces characters. +// +func Open(file, delims string) (p *Parser, err error) { + v, err := ioutil.ReadFile(file) + if err != nil { + return nil, err + } + + p = New(string(v), delims) + p.file = file + + return p, nil +} + +// +// AddDelimiters append new delimiter to existing parser. +// +func (p *Parser) AddDelimiters(delims string) { + var found bool + for _, newd := range delims { + found = false + for _, oldd := range p.delims { + if oldd == newd { + found = true + break + } + } + if !found { + p.delims += string(newd) + } + } +} + +// +// Close the parser by resetting all its internal state to zero value. +// +func (p *Parser) Close() { + p.file = "" + p.delims = "" + p.x = 0 + p.v = "" + p.token = p.token[:0] + p.d = 0 +} + +// +// Lines return all non-empty lines from the content. +// +func (p *Parser) Lines() []string { + var start, end int + + lines := make([]string, 0) + + for x := p.x; x < len(p.v); x++ { + // Skip white spaces on beginning ... + for ; x < len(p.v); x++ { + if p.v[x] == ' ' || p.v[x] == '\t' || p.v[x] == '\r' || p.v[x] == '\f' { + continue + } + break + } + start = x + for ; x < len(p.v); x++ { + if p.v[x] != '\n' { + continue + } + break + } + + // Skip white spaces at the end ... + for end = x - 1; end > start; end-- { + if p.v[end] == ' ' || p.v[end] == '\t' || + p.v[end] == '\r' || p.v[end] == '\f' { + continue + } + break + } + end++ + if start == end { + // Skip empty lines + continue + } + + line := p.v[start:end] + lines = append(lines, line) + } + + p.x = len(p.v) + + return lines +} + +// +// Load the new content and delimiters. +// +func (p *Parser) Load(content, delims string) { + p.Close() + p.v = content + if len(delims) == 0 { + p.delims = string(libascii.Spaces) + } else { + p.delims = delims + } +} + +// +// Token read the next token from content until one of the delimiter found. +// if no delimiter found, its mean all of content has been read, the returned +// delimiter will be 0. +// +func (p *Parser) Token() (string, rune) { + p.token = p.token[:0] + + if p.x >= len(p.v) { + p.d = 0 + return "", 0 + } + + for x, r := range p.v[p.x:] { + for _, d := range p.delims { + if r == d { + p.d = d + p.x += x + 1 + return string(p.token), p.d + } + } + + p.token = append(p.token, r) + } + + p.d = 0 + p.x = len(p.v) + return string(p.token), p.d +} + +// +// TokenEscaped read the next token from content until one of the delimiter +// found, unless its escaped with value of esc character. +// +// For example, if the content is "a b" and one of the delimiter is " ", +// escaping it with "\" will return as "a b" not "a". +// +func (p *Parser) TokenEscaped(esc rune) (string, rune) { + var isEscaped bool + + p.token = p.token[:0] + + if p.x >= len(p.v) { + p.d = 0 + return "", 0 + } + + for x, r := range p.v[p.x:] { + if r == esc { + if isEscaped { + p.token = append(p.token, r) + isEscaped = false + continue + } + isEscaped = true + continue + } + for _, d := range p.delims { + if r == d { + if isEscaped { + isEscaped = false + break + } + + p.d = d + p.x += x + 1 + return string(p.token), p.d + } + } + + p.token = append(p.token, r) + } + + p.d = 0 + p.x = len(p.v) + return string(p.token), p.d + +} + +// +// ReadEnclosed read the token inside opening and closing characters, ignoring +// all delimiters that previously set. +// +// It will return the parsed token and closed character if closed character +// found, otherwise it will token with 0. +// +func (p *Parser) ReadEnclosed(open, closed rune) (string, rune) { + for x, r := range p.v[p.x:] { + if x == 0 { + if r == open { + continue + } + } + if r == closed { + p.d = closed + p.x += x + 1 + return string(p.token), p.d + } + + p.token = append(p.token, r) + } + + p.d = 0 + p.x = len(p.v) + return string(p.v), 0 +} + +// +// RemoveDelimiters from current parser. +// +func (p *Parser) RemoveDelimiters(dels string) { + var ( + newdelims string + found bool + ) + + for _, oldd := range p.delims { + found = false + for _, r := range dels { + if r == oldd { + found = true + break + } + } + if !found { + newdelims += string(oldd) + } + } + + p.delims = newdelims +} + +// +// Skip parsing n characters or EOF if n is greater then length of content. +// +func (p *Parser) Skip(n int) { + if p.x+n >= len(p.v) { + p.x = len(p.v) + p.d = 0 + } else { + p.x += n + } +} + +// +// SkipHorizontalSpaces skip all space (" "), tab ("\t"), carriage return +// ("\r"), and form feed ("\f") characters; and return the first character +// found, probably new line. +// +func (p *Parser) SkipHorizontalSpaces() rune { + for x, r := range p.v[p.x:] { + switch r { + case ' ', '\t', '\r', '\f': + default: + p.x += x + p.d = r + return r + } + } + + p.d = 0 + p.x = len(p.v) + + return 0 +} + +// +// SkipLine skip all characters until new line. +// It will return the first character after new line or 0 if EOF. +// +func (p *Parser) SkipLine() rune { + for x, r := range p.v[p.x:] { + if r == '\n' { + p.x += x + 1 + if p.x >= len(p.v) { + p.d = 0 + } else { + p.d = r + } + return p.d + } + } + + // All contents has been read, no new line found. + p.x = len(p.v) + p.d = 0 + + return 0 +} -- cgit v1.3