aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorShulhan <ms@kilabit.info>2018-09-17 03:48:37 +0700
committerShulhan <ms@kilabit.info>2018-09-18 01:50:21 +0700
commit446fef94cd712861221c0098dcdd9ae52aaed0eb (patch)
tree63167d5a90b27121b552ab428f337717bcf2b01f
parent44b26edf7f390db383fe025454be0c4e30cfbd9b (diff)
downloadpakakeh.go-446fef94cd712861221c0098dcdd9ae52aaed0eb.tar.xz
Merge package "github.com/shuLhan/dsv"
-rw-r--r--lib/dsv/.gitignore8
-rw-r--r--lib/dsv/LICENSE38
-rw-r--r--lib/dsv/README.md350
-rw-r--r--lib/dsv/claset_test.go34
-rw-r--r--lib/dsv/common_test.go163
-rw-r--r--lib/dsv/config.go27
-rw-r--r--lib/dsv/configinterface.go57
-rw-r--r--lib/dsv/data_test.go58
-rw-r--r--lib/dsv/dsv.go100
-rw-r--r--lib/dsv/dsv_test.go96
-rw-r--r--lib/dsv/dsvinterface.go85
-rw-r--r--lib/dsv/metadata.go163
-rw-r--r--lib/dsv/metadata_test.go48
-rw-r--r--lib/dsv/metadatainterface.go45
-rw-r--r--lib/dsv/reader.go632
-rw-r--r--lib/dsv/reader_test.go601
-rw-r--r--lib/dsv/readererror.go52
-rw-r--r--lib/dsv/readerinterface.go434
-rw-r--r--lib/dsv/testdata/claset.dsv7
-rw-r--r--lib/dsv/testdata/config.dsv50
-rw-r--r--lib/dsv/testdata/config_simpleread.dsv50
-rw-r--r--lib/dsv/testdata/config_skip.dsv54
-rw-r--r--lib/dsv/testdata/expected.dat11
-rw-r--r--lib/dsv/testdata/expected_merge_columns.dat22
-rw-r--r--lib/dsv/testdata/expected_merge_rows.dat22
-rw-r--r--lib/dsv/testdata/expected_simplemerge.dat22
-rw-r--r--lib/dsv/testdata/expected_skip.dat11
-rw-r--r--lib/dsv/testdata/input.dat15
-rw-r--r--lib/dsv/testdata/writeraw.exp10
-rw-r--r--lib/dsv/writer.go515
-rw-r--r--lib/dsv/writer_test.go126
-rw-r--r--lib/dsv/writerinterface.go45
32 files changed, 3951 insertions, 0 deletions
diff --git a/lib/dsv/.gitignore b/lib/dsv/.gitignore
new file mode 100644
index 00000000..17cf232b
--- /dev/null
+++ b/lib/dsv/.gitignore
@@ -0,0 +1,8 @@
+rejected.dat
+testdata/output.dat
+testdata/output_merge_columns.dat
+testdata/output_merge_rows.dat
+testdata/output_skip.dat
+testdata/rejected.dat
+testdata/writerawcolumns.out
+testdata/writerawrows.out
diff --git a/lib/dsv/LICENSE b/lib/dsv/LICENSE
new file mode 100644
index 00000000..100cc757
--- /dev/null
+++ b/lib/dsv/LICENSE
@@ -0,0 +1,38 @@
+Copyright 2015-2018, Shulhan (ms@kilabit.info). All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+* Neither the name of Kilabit nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY M.SHULHAN "AS IS" AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ --- --- --- --- --- --- ---
+
+ TT TT II BB AAAA LLLLLL II KKKKKKKK
+ TT TT II BB AA AA LL LL II KK
+ TTTT II BB AA AA LL LL II KK
+ TT TT II BB AAAAAAAA LLLLLL II KK
+ TT TT II BB AA AA LL LL II KK
+ TT TT II BBBBBBBB AA AA LLLLLL II KK
+
+Website: http://kilabit.info
+Contact: ms@kilabit.info
diff --git a/lib/dsv/README.md b/lib/dsv/README.md
new file mode 100644
index 00000000..1a930c3b
--- /dev/null
+++ b/lib/dsv/README.md
@@ -0,0 +1,350 @@
+[![GoDoc](https://godoc.org/github.com/shuLhan/share/lib/dsv?status.svg)](https://godoc.org/github.com/shuLhan/share/lib/dsv)
+[![Go Report Card](https://goreportcard.com/badge/github.com/shuLhan/share/lib/dsv)](https://goreportcard.com/report/github.com/shuLhan/share/lib/dsv)
+
+Package `dsv` is a Go library for working with delimited separated value (DSV).
+
+DSV is a free-style form of CSV format of text data, where each record is
+separated by newline, and each column can be separated by any string, not just
+comma.
+
+- [Example](#example)
+- [Terminology](#terminology)
+- [Configuration](#configuration)
+ - [Metadata](#metadata)
+ - [Input](#input)
+ - [DatasetMode Explained](#datasetmode-explained)
+ - [Output](#output)
+- [Working with DSV](#working-with-dsv)
+ - [Processing each Rows/Columns](#processing-each-rowscolumns)
+ - [Using different Dataset](#using-different-dataset)
+ - [Builtin Functions for Dataset](#builtin-functions-for-dataset)
+- [Limitations](#limitations)
+
+---
+
+## Example
+
+Let's process this input file `input.dat`,
+
+ Mon Dt HH MM SS Process
+ Nov 29 23:14:36 process-1
+ Nov 29 23:14:37 process-2
+ Nov 29 23:14:38 process-3
+
+and generate output file `output.dat` which format like this,
+
+ "process_1","29-Nov"
+ "process_2","29-Nov"
+ "process_3","29-Nov"
+
+How do we do it?
+
+First, create file metadata for input and output, name it `config.dsv`,
+
+ {
+ "Input" :"input.dat"
+ , "Skip" :1
+ , "InputMetadata" :
+ [{
+ "Name" :"month"
+ , "Separator" :" "
+ },{
+ "Name" :"date"
+ , "Separator" :" "
+ , "Type" :"integer"
+ },{
+ "Name" :"hour"
+ , "Separator" :":"
+ , "Type" :"integer"
+ },{
+ "Name" :"minute"
+ , "Separator" :":"
+ , "Type" :"integer"
+ },{
+ "Name" :"second"
+ , "Separator" :" "
+ , "Type" :"integer"
+ },{
+ "Name" :"process_name"
+ , "Separator" :"-"
+ },{
+ "Name" :"process_id"
+ }]
+ , "Output" :"output.dat"
+ , "OutputMetadata":
+ [{
+ "Name" :"process_name"
+ , "LeftQuote" :"\""
+ , "Separator" :"_"
+ },{
+ "Name" :"process_id"
+ , "RightQuote":"\""
+ , "Separator" :","
+ },{
+ "Name" :"date"
+ , "LeftQuote" :"\""
+ , "Separator" :"-"
+ },{
+ "Name" :"month"
+ , "RightQuote":"\""
+ }]
+ }
+
+The metadata is using JSON format. For more information see `metadata.go`
+and `reader.go`.
+
+Second, we create a reader to read the input file.
+
+ dsvReader, e := dsv.NewReader("config.dsv", nil)
+
+ if nil != e {
+ t.Fatal(e)
+ }
+
+Third, we create a writer to write our output data,
+
+ dsvWriter, e := dsv.NewWriter("config.dsv")
+
+ if nil != e {
+ t.Error(e)
+ }
+
+Last action, we process them: read input records and pass them to writer.
+
+ for {
+ n, e := dsv.Read(dsvReader)
+
+ if n > 0 {
+ dsvWriter.Write(dsvReader)
+
+ // EOF, no more record.
+ } else if e == io.EOF {
+ break
+ }
+ }
+
+ // we will make sure all open descriptor is closed.
+ _ = dsvReader.Close()
+
+Easy enough? We can combine the reader and writer using `dsv.New()`, which will
+create reader and writer,
+
+ rw, e := dsv.New("config.dsv", nil)
+
+ if nil != e {
+ t.Error(e)
+ }
+
+ // do usual process like in the last step.
+
+That's it!
+
+## Terminology
+
+Here is some terminology that we use in developing this library, which may
+help readers understand the configuration and API.
+
+- Dataset: is a content of file
+- Record: a single cell in row or column, or the smallest building block of
+ dataset
+- Row: is a horizontal representation of records in dataset
+- Column: is a vertical representation of records in dataset
+
+```
+ COL-0 COL-1 ... COL-x
+ROW-0: record record ... record
+ROW-1: record record ... record
+...
+ROW-y: record record ... record
+```
+
+## Configuration
+
+We choose and use JSON for configuration because,
+
+1. No additional source to test.
+2. Easy to extend. Users can embed the current metadata, add additional
+   configuration, and create another reader to work with it.
+
+### Metadata
+
+Metadata contain information about each column when reading input file and
+writing to output file,
+
+- `Name`: mandatory, the name of column
+- `Type`: optional, type of record when reading input file. Valid value are
+ "integer", "real", or "string" (default)
+- `Separator`: optional, default to `"\n"`. Separator is a string that
+ separate the current record with the next record.
+- `LeftQuote`: optional, default is empty `""`. LeftQuote is a string that
+ start at the beginning of record.
+- `RightQuote`: optional, default is empty `""`. RightQuote is a string at the
+ end of record.
+- `Skip`: optional, boolean, default is `false`. If true the column will be
+  ignored, not saved in dataset, when reading input file; otherwise it is saved.
+- `ValueSpace`: optional, slice of string, default is empty. This contain the
+ string representation of all possible value in column.
+
+### Input
+
+Input configuration contain information about input file.
+
+- `Input`: mandatory, the name of input file, could use relative or absolute
+ path. If no path is given then it assumed that the input file is in the same
+ directory with configuration file.
+- `InputMetadata`: mandatory, list of metadata.
+- `Skip`: optional, number, default 0. Skip define the number of line that will
+ be skipped when first input file is opened.
+- `TrimSpace`: optional, boolean, default is true. If it is true, the white
+  space at the beginning and end of each input line will be removed before
+  parsing, otherwise the line is left unmodified.
+- `Rejected`: optional, default to `rejected.dat`. Rejected is the file where
+  data that does not match the metadata will be saved. One can inspect the
+  rejected file, fix it for re-processing, or ignore it.
+- `MaxRows`: optional, default to `256`. Maximum number of rows for one read
+  operation that will be saved in memory. If it is negative, i.e. `-1`, all
+  data in input file will be processed.
+- `DatasetMode`: optional, default to "rows". Mode of dataset in memory.
+ Valid values are "rows", "columns", or "matrix". Matrix mode is combination of
+ rows and columns, it give more flexibility when processing the dataset but
+ will require additional memory.
+
+#### `DatasetMode` Explained
+
+For example, given input data file,
+
+ col1,col2,col3
+ a,b,c
+ 1,2,3
+
+"rows" mode is where each line saved in its own slice, resulting in Rows:
+
+ Rows[0]: [a b c]
+ Rows[1]: [1 2 3]
+
+"columns" mode is where each line saved by columns, resulting in Columns:
+
+ Columns[0]: {col1 0 0 [] [a 1]}
+ Columns[1]: {col2 0 0 [] [b 2]}
+    Columns[2]: {col3 0 0 [] [c 3]}
+
+Unlike rows mode, each column contain metadata including column name, type,
+flag, and value space (all possible value that _may_ contain in column value).
+
+"matrix" mode is where each record saved both in row and column.
+
+### Output
+
+Output configuration contain information about output file when writing the
+dataset.
+
+- `Output`: mandatory, the name of output file, could use relative or absolute
+ path. If no path is given then it assumed that the output file is in the same
+ directory with configuration file.
+- `OutputMetadata`: mandatory, list of metadata.
+
+## Working with DSV
+
+### Processing each Rows/Columns
+
+After opening the input file, we can process the dataset based on rows/columns
+mode using simple `for` loop. Example,
+
+```
+// Save dataset object for later use.
+dataset := dsvReader.GetDataset().(tabula.DatasetInterface)
+
+for {
+ n, e := dsv.Read(dsvReader)
+
+ if n > 0 {
+ // Process each row ...
+		for x, row := range dataset.GetDataAsRows() {
+
+ for y, record := range row.Records {
+ // process each record in row
+ }
+ }
+
+ // Or, process each columns
+		for x, column := range dataset.GetDataAsColumns() {
+
+ for y, record := range column.Records {
+ // process each record in column
+ }
+ }
+
+ // Write the dataset to file after processed
+ dsvWriter.Write(dsvReader)
+ }
+ if e == io.EOF {
+ break
+ }
+ if e != nil {
+ // handle error
+ }
+}
+```
+
+### Using different Dataset
+
+Default dataset used by Reader is
+[tabula.Dataset](https://godoc.org/github.com/shuLhan/share/lib/tabula#Dataset).
+
+You can extend and implement
+[DatasetInterface](https://godoc.org/github.com/shuLhan/share/lib/tabula#DatasetInterface)
+and use it in reader object, either by
+
+- passing it in the second parameter in `NewReader`, for example,
+
+ ```
+ myset := MySet{
+ ...
+ }
+ reader, e := dsv.NewReader("config.dsv", &myset)
+ ```
+
+- or by calling `reader.Init` after creating new Reader,
+
+ ```
+ myset := MySet{
+ ...
+ }
+ reader := dsv.Reader{
+ ...
+ }
+ reader.Init("config.dsv", &myset)
+ ```
+
+### Builtin Functions for Dataset
+
+Since we use tabula package to manage data, any features in those package
+can be used in our dataset.
+For more information see [tabula
+package](https://godoc.org/github.com/shuLhan/share/lib/tabula).
+
+## Limitations
+
+- New line is `\n` for each row.
+
+- Reader and Writer operate in ASCII (8 bit or char type), UTF-8 is not
+ supported yet, since we can not test it. Patch for supporting UTF-8 (or
+ runes type) are welcome.
+
+- About escaped character in content of data.
+
+  Since we said that we handle a free-style form of CSV, what we mean is that
+  the left-quote, right-quote and separator can be strings. It is not only one
+  single character like a single quote or double quote, but literally one or
+  more characters without space. Any escaped character will be read as is
+  (along with `'\'`) unless it is followed by a right-quote or separator.
+ For example,
+
+ "test\'"
+
+  will be read as `test\'`. But
+
+ "test\""
+
+  will be read as `test"`, since the right-quote is matched with the escaped
+  token.
diff --git a/lib/dsv/claset_test.go b/lib/dsv/claset_test.go
new file mode 100644
index 00000000..21ed197c
--- /dev/null
+++ b/lib/dsv/claset_test.go
@@ -0,0 +1,34 @@
+// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package dsv
+
+import (
+ "testing"
+
+ "github.com/shuLhan/share/lib/tabula"
+ "github.com/shuLhan/share/lib/test"
+)
+
+func TestReaderWithClaset(t *testing.T) {
+ fcfg := "testdata/claset.dsv"
+
+ claset := tabula.Claset{}
+
+ _, e := NewReader(fcfg, &claset)
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ test.Assert(t, "", 3, claset.GetClassIndex(), true)
+
+ claset.SetMajorityClass("regular")
+ claset.SetMinorityClass("vandalism")
+
+ clone := claset.Clone().(tabula.ClasetInterface)
+
+ test.Assert(t, "", 3, clone.GetClassIndex(), true)
+ test.Assert(t, "", "regular", clone.MajorityClass(), true)
+ test.Assert(t, "", "vandalism", clone.MinorityClass(), true)
+}
diff --git a/lib/dsv/common_test.go b/lib/dsv/common_test.go
new file mode 100644
index 00000000..239fdf5e
--- /dev/null
+++ b/lib/dsv/common_test.go
@@ -0,0 +1,163 @@
+// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package dsv
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "runtime/debug"
+ "testing"
+
+ "github.com/shuLhan/share/lib/tabula"
+ "github.com/shuLhan/share/lib/test"
+)
+
+//
+// assertFile compare content of two file, print error message and exit
+// when both are different.
+//
+func assertFile(t *testing.T, a, b string, equal bool) {
+ out, e := ioutil.ReadFile(a)
+
+ if nil != e {
+ debug.PrintStack()
+ t.Error(e)
+ }
+
+ exp, e := ioutil.ReadFile(b)
+
+ if nil != e {
+ debug.PrintStack()
+ t.Error(e)
+ }
+
+ r := bytes.Compare(out, exp)
+
+ if equal && 0 != r {
+ debug.PrintStack()
+ t.Fatal("Comparing", a, "with", b, ": result is different (",
+ r, ")")
+ }
+}
+
+func checkDataset(t *testing.T, r *Reader, exp string) {
+ var got string
+ ds := r.GetDataset().(tabula.DatasetInterface)
+ data := ds.GetData()
+
+ switch data.(type) {
+ case *tabula.Rows:
+ rows := data.(*tabula.Rows)
+ got = fmt.Sprint(*rows)
+ case *tabula.Columns:
+ cols := data.(*tabula.Columns)
+ got = fmt.Sprint(*cols)
+ case *tabula.Matrix:
+ matrix := data.(*tabula.Matrix)
+ got = fmt.Sprint(*matrix)
+ default:
+ fmt.Println("data type unknown")
+ }
+
+ test.Assert(t, "", exp, got, true)
+}
+
+//
+// doReadWrite test reading and writing the DSV data.
+//
+func doReadWrite(t *testing.T, dsvReader *Reader, dsvWriter *Writer,
+ expectation []string, check bool) {
+ i := 0
+
+ for {
+ n, e := Read(dsvReader)
+
+ if e == io.EOF || n == 0 {
+ _, e = dsvWriter.Write(dsvReader)
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ break
+ }
+
+ if e != nil {
+ continue
+ }
+
+ if check {
+ checkDataset(t, dsvReader, expectation[i])
+ i++
+ }
+
+ _, e = dsvWriter.Write(dsvReader)
+ if e != nil {
+ t.Fatal(e)
+ }
+ }
+
+ e := dsvWriter.Flush()
+ if e != nil {
+ t.Fatal(e)
+ }
+}
+
+var datasetRows = [][]string{
+ {"0", "1", "A"},
+ {"1", "1.1", "B"},
+ {"2", "1.2", "A"},
+ {"3", "1.3", "B"},
+ {"4", "1.4", "C"},
+ {"5", "1.5", "D"},
+ {"6", "1.6", "C"},
+ {"7", "1.7", "D"},
+ {"8", "1.8", "E"},
+ {"9", "1.9", "F"},
+}
+
+var datasetCols = [][]string{
+ {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"},
+ {"1", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "1.8", "1.9"},
+ {"A", "B", "A", "B", "C", "D", "C", "D", "E", "F"},
+}
+
+var datasetTypes = []int{
+ tabula.TInteger,
+ tabula.TReal,
+ tabula.TString,
+}
+
+var datasetNames = []string{"int", "real", "string"}
+
+func populateWithRows(t *testing.T, dataset *tabula.Dataset) {
+ for _, rowin := range datasetRows {
+ row := make(tabula.Row, len(rowin))
+
+ for x, recin := range rowin {
+ rec, e := tabula.NewRecordBy(recin, datasetTypes[x])
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ row[x] = rec
+ }
+
+ dataset.PushRow(&row)
+ }
+}
+
+func populateWithColumns(t *testing.T, dataset *tabula.Dataset) {
+ for x := range datasetCols {
+ col, e := tabula.NewColumnString(datasetCols[x], datasetTypes[x],
+ datasetNames[x])
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ dataset.PushColumn(*col)
+ }
+}
diff --git a/lib/dsv/config.go b/lib/dsv/config.go
new file mode 100644
index 00000000..a74fc315
--- /dev/null
+++ b/lib/dsv/config.go
@@ -0,0 +1,27 @@
+// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package dsv
+
+//
+// Config for working with DSV configuration.
+//
+type Config struct {
+ // ConfigPath path to configuration file.
+ ConfigPath string
+}
+
+//
+// GetConfigPath return the base path of configuration file.
+//
+func (cfg *Config) GetConfigPath() string {
+ return cfg.ConfigPath
+}
+
+//
+// SetConfigPath for reading input and writing rejected file.
+//
+func (cfg *Config) SetConfigPath(dir string) {
+ cfg.ConfigPath = dir
+}
diff --git a/lib/dsv/configinterface.go b/lib/dsv/configinterface.go
new file mode 100644
index 00000000..2c6fd3ae
--- /dev/null
+++ b/lib/dsv/configinterface.go
@@ -0,0 +1,57 @@
+package dsv
+
+import (
+ "encoding/json"
+ "io/ioutil"
+ "path"
+)
+
+//
+// ConfigInterface for reader and writer for initializing the config from JSON.
+//
+type ConfigInterface interface {
+ GetConfigPath() string
+ SetConfigPath(dir string)
+}
+
+//
+// ConfigOpen configuration file and initialize the attributes.
+//
+func ConfigOpen(rw interface{}, fcfg string) error {
+ cfg, e := ioutil.ReadFile(fcfg)
+
+ if nil != e {
+ return e
+ }
+
+ // Get directory where the config reside.
+ rwconfig := rw.(ConfigInterface)
+ rwconfig.SetConfigPath(path.Dir(fcfg))
+
+ return ConfigParse(rw, cfg)
+}
+
+//
+// ConfigParse from JSON string.
+//
+func ConfigParse(rw interface{}, cfg []byte) error {
+ return json.Unmarshal(cfg, rw)
+}
+
+//
+// ConfigCheckPath if no path in file, return the config path plus file name,
+// otherwise leave it unchanged.
+//
+func ConfigCheckPath(comin ConfigInterface, file string) string {
+ dir := path.Dir(file)
+
+ if dir == "." {
+ cfgPath := comin.GetConfigPath()
+ if cfgPath != "" && cfgPath != "." {
+ return cfgPath + "/" + file
+ }
+ }
+
+ // nothing happen.
+ return file
+}
diff --git a/lib/dsv/data_test.go b/lib/dsv/data_test.go
new file mode 100644
index 00000000..41f6cf7a
--- /dev/null
+++ b/lib/dsv/data_test.go
@@ -0,0 +1,58 @@
+// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package dsv
+
+var expectation = []string{
+ "&[1 A-B AB 1 0.1]",
+ "&[2 A-B-C BCD 2 0.02]",
+ "&[3 A;B-C,D A;B C,D 3 0.003]",
+ "&[4 A;B-C,D A;B C D 4 0.0004]",
+ "&[6 6 0.000006]",
+ "&[8 ok missing right-quote];8;0.00000008\n9;\"ok\"-[[ok 9 0.000000009]",
+ "&[10 test integer 10 0.101]",
+ "&[12 test real 123456789 0.123456789]",
+ "&[13 string with string with 13 13]",
+ "&[14 string with\" quote string with]] escape 14 14]",
+}
+
+var expSkip = []string{
+ "&[A-B AB 1 0.1]",
+ "&[A-B-C BCD 2 0.02]",
+ "&[A;B-C,D A;B C,D 3 0.003]",
+ "&[A;B-C,D A;B C D 4 0.0004]",
+ "&[ 6 0.000006]",
+ "&[ok missing right-quote];8;0.00000008\n9;\"ok\"-[[ok 9 0.000000009]",
+ "&[test integer 10 0.101]",
+ "&[test real 123456789 0.123456789]",
+ "&[string with string with 13 13]",
+ "&[string with\" quote string with]] escape 14 14]",
+}
+
+var expSkipColumns = []string{
+ "[{name 0 0 [] [A-B]} {value 0 0 [] [AB]} {integer 1 0 [] [1]} {real 2 0 [] [0.1]}]",
+ "[{name 0 0 [] [A-B-C]} {value 0 0 [] [BCD]} {integer 1 0 [] [2]} {real 2 0 [] [0.02]}]",
+ "[{name 0 0 [] [A;B-C,D]} {value 0 0 [] [A;B C,D]} {integer 1 0 [] [3]} {real 2 0 [] [0.003]}]",
+ "[{name 0 0 [] [A;B-C,D]} {value 0 0 [] [A;B C D]} {integer 1 0 [] [4]} {real 2 0 [] [0.0004]}]",
+ "[{name 0 0 [] []} {value 0 0 [] []} {integer 1 0 [] [6]} {real 2 0 [] [0.000006]}]",
+ "[{name 0 0 [] [ok]} {value 0 0 [] [missing right-quote];8;0.00000008\n9;\"ok\"-[[ok]} {integer 1 0 [] [9]} {real 2 0 [] [0.000000009]}]",
+ "[{name 0 0 [] [test]} {value 0 0 [] [integer]} {integer 1 0 [] [10]} {real 2 0 [] [0.101]}]",
+ "[{name 0 0 [] [test]} {value 0 0 [] [real]} {integer 1 0 [] [123456789]} {real 2 0 [] [0.123456789]}]",
+ "[{name 0 0 [] [string with]} {value 0 0 [] [string with]} {integer 1 0 [] [13]} {real 2 0 [] [13]}]",
+ "[{name 0 0 [] [string with\" quote]} {value 0 0 [] [string with]] escape]} {integer 1 0 [] [14]} {real 2 0 [] [14]}]",
+}
+
+var expSkipColumnsAll = []string{
+ "{name 0 0 [] [A-B A-B-C A;B-C,D A;B-C,D ok test test string with string with\" quote]}",
+ "{value 0 0 [] [AB BCD A;B C,D A;B C D missing right-quote];8;0.00000008\n9;\"ok\"-[[ok integer real string with string with]] escape]}",
+ "{integer 1 0 [] [1 2 3 4 6 9 10 123456789 13 14]}",
+ "{real 2 0 [] [0.1 0.02 0.003 0.0004 0.000006 0.000000009 0.101 0.123456789 13 14]}",
+}
+
+var expSkipColumnsAllRev = []string{
+ "{name 0 0 [] [string with\" quote string with test test ok A;B-C,D A;B-C,D A-B-C A-B]}",
+ "{value 0 0 [] [string with]] escape string with real integer missing right-quote];8;0.00000008\n9;\"ok\"-[[ok A;B C D A;B C,D BCD AB]}",
+ "{integer 1 0 [] [14 13 123456789 10 9 6 4 3 2 1]}",
+ "{real 2 0 [] [14 13 0.123456789 0.101 0.000000009 0.000006 0.0004 0.003 0.02 0.1]}",
+}
diff --git a/lib/dsv/dsv.go b/lib/dsv/dsv.go
new file mode 100644
index 00000000..b777264f
--- /dev/null
+++ b/lib/dsv/dsv.go
@@ -0,0 +1,100 @@
+// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//
+// Package dsv is a library for working with delimited separated value (DSV).
+//
+// DSV is a free-style form of Comma Separated Value (CSV) format of text data,
+// where each row is separated by newline, and each column can be separated by
+// any string enclosed with left-quote and right-quote.
+//
+package dsv
+
+import (
+ "errors"
+)
+
+const (
+ // DefaultRejected define the default file which will contain the
+ // rejected row.
+ DefaultRejected = "rejected.dat"
+
+ // DefaultMaxRows define default maximum row that will be saved
+ // in memory for each read if input data is too large and can not be
+ // consumed in one read operation.
+ DefaultMaxRows = 256
+
+ // DefDatasetMode default output mode is rows.
+ DefDatasetMode = DatasetModeROWS
+
+ // DefEOL default end-of-line
+ DefEOL = '\n'
+)
+
+var (
+ // ErrNoInput define an error when no Input file is given to Reader.
+ ErrNoInput = errors.New("dsv: No input file is given in config")
+
+ // ErrMissRecordsLen define an error when trying to push Row
+ // to Field, when their length is not equal.
+ // See reader.PushRowToColumns().
+ ErrMissRecordsLen = errors.New("dsv: Mismatch between number of record in row and columns length")
+
+ // ErrNoOutput define an error when no output file is given to Writer.
+ ErrNoOutput = errors.New("dsv: No output file is given in config")
+
+ // ErrNotOpen define an error when output file has not been opened
+ // by Writer.
+ ErrNotOpen = errors.New("dsv: Output file is not opened")
+
+ // ErrNilReader define an error when Reader object is nil when passed
+ // to Write function.
+ ErrNilReader = errors.New("dsv: Reader object is nil")
+)
+
+//
+// ReadWriter combine reader and writer.
+//
+type ReadWriter struct {
+ Reader
+ Writer
+}
+
+//
+// New create a new ReadWriter object.
+//
+func New(config string, dataset interface{}) (rw *ReadWriter, e error) {
+ rw = &ReadWriter{}
+
+ e = rw.Reader.Init(config, dataset)
+ if e != nil {
+ return nil, e
+ }
+
+ e = OpenWriter(&rw.Writer, config)
+ if e != nil {
+ return nil, e
+ }
+
+ return
+}
+
+//
+// SetConfigPath of input and output file.
+//
+func (dsv *ReadWriter) SetConfigPath(dir string) {
+ dsv.Reader.SetConfigPath(dir)
+ dsv.Writer.SetConfigPath(dir)
+}
+
+//
+// Close reader and writer.
+//
+func (dsv *ReadWriter) Close() (e error) {
+ e = dsv.Writer.Close()
+ if e != nil {
+ return
+ }
+ return dsv.Reader.Close()
+}
diff --git a/lib/dsv/dsv_test.go b/lib/dsv/dsv_test.go
new file mode 100644
index 00000000..f4661cdf
--- /dev/null
+++ b/lib/dsv/dsv_test.go
@@ -0,0 +1,96 @@
+// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package dsv
+
+import (
+ "testing"
+)
+
+//
+// doInit create read-write object.
+//
+func doInit(t *testing.T, fcfg string) (rw *ReadWriter, e error) {
+ // Initialize dsv
+ rw, e = New(fcfg, nil)
+
+ if nil != e {
+ t.Fatal(e)
+ }
+
+ return
+}
+
+//
+// TestReadWriter test reading and writing DSV.
+//
+func TestReadWriter(t *testing.T) {
+ rw, _ := doInit(t, "testdata/config.dsv")
+
+ doReadWrite(t, &rw.Reader, &rw.Writer, expectation, true)
+
+ e := rw.Close()
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ assertFile(t, rw.GetOutput(), "testdata/expected.dat", true)
+}
+
+//
+// TestReadWriter test reading and writing DSV.
+//
+func TestReadWriterAll(t *testing.T) {
+ rw, _ := doInit(t, "testdata/config.dsv")
+
+ rw.SetMaxRows(-1)
+
+ doReadWrite(t, &rw.Reader, &rw.Writer, expectation, false)
+
+ e := rw.Close()
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ assertFile(t, rw.GetOutput(), "testdata/expected.dat", true)
+}
+
+func TestSimpleReadWrite(t *testing.T) {
+ fcfg := "testdata/config_simpleread.dsv"
+
+ reader, e := SimpleRead(fcfg, nil)
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ fout := "testdata/output.dat"
+ fexp := "testdata/expected.dat"
+
+ _, e = SimpleWrite(reader, fcfg)
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ assertFile(t, fexp, fout, true)
+}
+
+func TestSimpleMerge(t *testing.T) {
+ fcfg1 := "testdata/config_simpleread.dsv"
+ fcfg2 := "testdata/config_simpleread.dsv"
+
+ reader, e := SimpleMerge(fcfg1, fcfg2, nil, nil)
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ _, e = SimpleWrite(reader, fcfg1)
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ fexp := "testdata/expected_simplemerge.dat"
+ fout := "testdata/output.dat"
+
+ assertFile(t, fexp, fout, true)
+}
diff --git a/lib/dsv/dsvinterface.go b/lib/dsv/dsvinterface.go
new file mode 100644
index 00000000..6de50d08
--- /dev/null
+++ b/lib/dsv/dsvinterface.go
@@ -0,0 +1,85 @@
+// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package dsv
+
+import (
+ "io"
+)
+
+//
+// SimpleRead provide a shortcut to read data from file using configuration file
+// from `fcfg`.
+// Return the reader contained data or error if failed.
+// Reader object upon returned has been closed, so if one need to read all
+// data in it simply set the `MaxRows` to `-1` in config file.
+//
+func SimpleRead(fcfg string, dataset interface{}) (
+ reader ReaderInterface,
+ e error,
+) {
+ reader, e = NewReader(fcfg, dataset)
+
+ if e != nil {
+ return
+ }
+
+ _, e = Read(reader)
+ if e != nil && e != io.EOF {
+ return nil, e
+ }
+
+ e = reader.Close()
+
+ return
+}
+
+//
+// SimpleWrite provide a shortcut to write data from reader using output metadata
+// format and output file defined in file `fcfg`.
+//
+func SimpleWrite(reader ReaderInterface, fcfg string) (nrows int, e error) {
+ writer, e := NewWriter(fcfg)
+ if e != nil {
+ return
+ }
+
+ nrows, e = writer.Write(reader)
+ if e != nil {
+ return
+ }
+
+ e = writer.Close()
+
+ return
+}
+
+//
+// SimpleMerge provide a shortcut to merge two dsv files using configuration
+// files passed in parameters.
+//
+// One must remember to set,
+// - "MaxRows" to -1 to be able to read all rows, in both input configuration, and
+// - "DatasetMode" to "columns" to speeding up process.
+//
+// This function return the merged reader or error if failed.
+//
+func SimpleMerge(fin1, fin2 string, dataset1, dataset2 interface{}) (
+ ReaderInterface,
+ error,
+) {
+ reader1, e := SimpleRead(fin1, dataset1)
+ if e != nil {
+ return nil, e
+ }
+
+ reader2, e := SimpleRead(fin2, dataset2)
+ if e != nil {
+ return nil, e
+ }
+
+ reader1.MergeColumns(reader2)
+
+ return reader1, nil
+}
diff --git a/lib/dsv/metadata.go b/lib/dsv/metadata.go
new file mode 100644
index 00000000..6e457080
--- /dev/null
+++ b/lib/dsv/metadata.go
@@ -0,0 +1,163 @@
+// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package dsv
+
+import (
+ "encoding/json"
+ "log"
+ "strings"
+
+ "github.com/shuLhan/share/lib/tabula"
+)
+
//
// Metadata represent on how to parse each column in record.
//
type Metadata struct {
	// Name of the column, optional.
	Name string `json:"Name"`
	// Type of the column, default to "string".
	// Valid value are: "string", "integer", "real"
	Type string `json:"Type"`
	// T type of column in integer, derived from Type by Init() using the
	// tabula.T* constants.
	T int
	// Separator for column in record.
	Separator string `json:"Separator"`
	// LeftQuote define the characters that enclosed the column in the left
	// side.
	LeftQuote string `json:"LeftQuote"`
	// RightQuote define the characters that enclosed the column in the
	// right side.
	RightQuote string `json:"RightQuote"`
	// Skip, if its true this column will be ignored, not saved in reader
	// object. Default to false.
	Skip bool `json:"Skip"`
	// ValueSpace contain the possible value in records, optional.
	ValueSpace []string `json:"ValueSpace"`
}
+
+//
+// NewMetadata create and return new metadata.
+//
+func NewMetadata(name, tipe, sep, leftq, rightq string, vs []string) (
+ md *Metadata,
+) {
+ md = &Metadata{
+ Name: name,
+ Type: tipe,
+ Separator: sep,
+ LeftQuote: leftq,
+ RightQuote: rightq,
+ ValueSpace: vs,
+ }
+
+ md.Init()
+
+ return
+}
+
+//
+// Init initialize metadata column, i.e. check and set column type.
+//
+// If type is unknown it will default to string.
+//
+func (md *Metadata) Init() {
+ switch strings.ToUpper(md.Type) {
+ case "INTEGER", "INT":
+ md.T = tabula.TInteger
+ case "REAL":
+ md.T = tabula.TReal
+ default:
+ md.T = tabula.TString
+ md.Type = "string"
+ }
+}
+
//
// GetName return the name of metadata.
//
func (md *Metadata) GetName() string {
	return md.Name
}

//
// GetType return type of metadata in integer form (one of the tabula.T*
// constants set by Init).
//
func (md *Metadata) GetType() int {
	return md.T
}

//
// GetTypeName return string representation of type.
//
func (md *Metadata) GetTypeName() string {
	return md.Type
}

//
// GetSeparator return the field separator.
//
func (md *Metadata) GetSeparator() string {
	return md.Separator
}

//
// GetLeftQuote return the string used in the beginning of record value.
//
func (md *Metadata) GetLeftQuote() string {
	return md.LeftQuote
}

//
// GetRightQuote return string that end in record value.
//
func (md *Metadata) GetRightQuote() string {
	return md.RightQuote
}

//
// GetSkip return true if this column must be ignored when reading data,
// false otherwise.
//
func (md *Metadata) GetSkip() bool {
	return md.Skip
}

//
// GetValueSpace return value space.
//
func (md *Metadata) GetValueSpace() []string {
	return md.ValueSpace
}
+
+//
+// IsEqual return true if this metadata equal with other instance, return false
+// otherwise.
+//
+func (md *Metadata) IsEqual(o MetadataInterface) bool {
+ if md.Name != o.GetName() {
+ return false
+ }
+ if md.Separator != o.GetSeparator() {
+ return false
+ }
+ if md.LeftQuote != o.GetLeftQuote() {
+ return false
+ }
+ if md.RightQuote != o.GetRightQuote() {
+ return false
+ }
+ return true
+}
+
+//
+// String yes, it will print it JSON like format.
+//
+func (md *Metadata) String() string {
+ r, e := json.MarshalIndent(md, "", "\t")
+ if nil != e {
+ log.Print(e)
+ }
+ return string(r)
+}
diff --git a/lib/dsv/metadata_test.go b/lib/dsv/metadata_test.go
new file mode 100644
index 00000000..46630c11
--- /dev/null
+++ b/lib/dsv/metadata_test.go
@@ -0,0 +1,48 @@
+// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package dsv
+
+import (
+ "testing"
+)
+
+func TestMetadataIsEqual(t *testing.T) {
+ cases := []struct {
+ in Metadata
+ out Metadata
+ result bool
+ }{
+ {
+ Metadata{
+ Name: "A",
+ Separator: ",",
+ },
+ Metadata{
+ Name: "A",
+ Separator: ",",
+ },
+ true,
+ },
+ {
+ Metadata{
+ Name: "A",
+ Separator: ",",
+ },
+ Metadata{
+ Name: "A",
+ Separator: ";",
+ },
+ false,
+ },
+ }
+
+ for _, c := range cases {
+ r := c.in.IsEqual(&c.out)
+
+ if r != c.result {
+ t.Error("Test failed on ", c.in, c.out)
+ }
+ }
+}
diff --git a/lib/dsv/metadatainterface.go b/lib/dsv/metadatainterface.go
new file mode 100644
index 00000000..a0425b2e
--- /dev/null
+++ b/lib/dsv/metadatainterface.go
@@ -0,0 +1,45 @@
+// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package dsv
+
//
// MetadataInterface is the interface for field metadata.
// This is to make anyone can extend the DSV library including the metadata.
//
type MetadataInterface interface {
	// Init derive any internal state (e.g. the integer type) from the
	// configured fields.
	Init()
	GetName() string
	GetType() int
	GetTypeName() string
	GetLeftQuote() string
	GetRightQuote() string
	GetSeparator() string
	// GetSkip return true when the column must be ignored on read.
	GetSkip() bool
	GetValueSpace() []string

	IsEqual(MetadataInterface) bool
}
+
+//
+// FindMetadata Given a slice of metadata, find `mdin` in the slice which has the
+// same name, ignoring metadata where Skip value is true.
+// If found, return the index and metadata object of matched metadata name.
+// If not found return -1 as index and nil in `mdout`.
+//
+func FindMetadata(mdin MetadataInterface, mds []MetadataInterface) (
+ idx int,
+ mdout MetadataInterface,
+) {
+ for _, md := range mds {
+ if md.GetName() == mdin.GetName() {
+ mdout = md
+ break
+ }
+ if !md.GetSkip() {
+ idx++
+ }
+ }
+ return idx, mdout
+}
diff --git a/lib/dsv/reader.go b/lib/dsv/reader.go
new file mode 100644
index 00000000..1e78352e
--- /dev/null
+++ b/lib/dsv/reader.go
@@ -0,0 +1,632 @@
+// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package dsv
+
+import (
+ "bufio"
+ "log"
+ "os"
+ "strings"
+
+ "github.com/shuLhan/share/lib/tabula"
+)
+
const (
	// DatasetModeROWS is a string representation of output mode rows.
	DatasetModeROWS = "ROWS"
	// DatasetModeCOLUMNS is a string representation of output mode columns.
	DatasetModeCOLUMNS = "COLUMNS"
	// DatasetModeMATRIX will save data in rows and columns. This mode will
	// consume more memory than "rows" and "columns" but give greater
	// flexibility when working with data.
	DatasetModeMATRIX = "MATRIX"
)
+
//
// Reader hold all configuration, metadata and input data.
//
// DSV Reader work like this,
//
// (1) Initialize new dsv reader object
//
//	dsvReader, e := dsv.NewReader(configfile)
//
// (2) Do not forget to check for error ...
//
//	if e != nil {
//		// handle error
//	}
//
// (3) Make sure to close all files after finished
//
//	defer dsvReader.Close ()
//
// (4) Create loop to read input data
//
//	for {
//		n, e := dsv.Read (dsvReader)
//
//		if e == io.EOF {
//			break
//		}
//
// (4.1) Iterate through rows
//
//		for row := range dsvReader.GetDataAsRows() {
//			// work with row ...
//		}
//	}
//
// That's it.
//
//
type Reader struct {
	// Config define path of configuration file.
	//
	// If the configuration located in other directory, e.g.
	// "../../config.dsv", and the Input option is set with name only, like
	// "input.dat", we assume that its in the same directory where the
	// configuration file belong.
	Config
	// dataset contains the content of input file after read.
	// It is expected to implement tabula.DatasetInterface (see Init).
	dataset interface{}
	// Input file, mandatory.
	Input string `json:"Input"`
	// Skip n lines from the head.
	Skip int `json:"Skip"`
	// TrimSpace or not. If its true, before parsing the line, the white
	// space in the beginning and end of each input line will be removed,
	// otherwise it will leave unmodified. Default is true.
	TrimSpace bool `json:"TrimSpace"`
	// Rejected is the file name where row that does not fit
	// with metadata will be saved.
	Rejected string `json:"Rejected"`
	// InputMetadata define format for each column in input data.
	InputMetadata []Metadata `json:"InputMetadata"`
	// MaxRows define maximum row that this reader will read and
	// saved in the memory at one read operation.
	// If the value is -1, all rows will read.
	MaxRows int `json:"MaxRows"`
	// DatasetMode define on how do you want the result is saved. There are
	// three options: either in "rows", "columns", or "matrix" mode.
	// For example, input data file,
	//
	//	a,b,c
	//	1,2,3
	//
	// "rows" mode is where each line saved in its own slice, resulting
	// in Rows:
	//
	//	[a b c]
	//	[1 2 3]
	//
	// "columns" mode is where each line saved by columns, resulting in
	// Columns:
	//
	//	[a 1]
	//	[b 2]
	//	[c 3]
	//
	// "matrix" mode is where each record saved in their own row and column.
	//
	DatasetMode string `json:"DatasetMode"`
	// fRead is read descriptor.
	fRead *os.File
	// fReject is reject descriptor.
	fReject *os.File
	// bufRead is a buffer for working with input file.
	bufRead *bufio.Reader
	// bufReject is a buffer for working with rejected file.
	bufReject *bufio.Writer
}
+
+//
+// NewReader create and initialize new instance of DSV Reader with default values.
+//
+func NewReader(config string, dataset interface{}) (reader *Reader, e error) {
+ reader = &Reader{
+ Input: "",
+ Skip: 0,
+ TrimSpace: true,
+ Rejected: DefaultRejected,
+ InputMetadata: nil,
+ MaxRows: DefaultMaxRows,
+ DatasetMode: DefDatasetMode,
+ dataset: dataset,
+ fRead: nil,
+ fReject: nil,
+ bufRead: nil,
+ bufReject: nil,
+ }
+
+ e = reader.Init(config, dataset)
+ if e != nil {
+ return nil, e
+ }
+
+ return
+}
+
//
// Init will initialize reader object by
//
// (1) Check if dataset is not empty.
// (2) Read config file.
// (3) Set reader object default value.
// (4) Check if output mode is valid and initialize it if valid.
// (5) Check and initialize metadata and columns attributes.
// (6) Check if Input is name only without path, so we can prefix it with
// config path.
// (7) Open rejected file.
// (8) Open input file.
//
func (reader *Reader) Init(fcfg string, dataset interface{}) (e error) {
	// (1) Fall back to the reader's own dataset, or create an empty
	// tabula.Dataset when none was given.
	if dataset == nil {
		dataset = reader.GetDataset()
		if dataset == nil {
			dataset = &tabula.Dataset{}
			reader.dataset = dataset
		}
	}

	// (2) An empty config path skips config loading entirely.
	fcfg = strings.TrimSpace(fcfg)
	if fcfg != "" {
		e = ConfigOpen(reader, fcfg)
		if e != nil {
			return e
		}

		e = tabula.ReadDatasetConfig(dataset, fcfg)
		if e != nil {
			return e
		}
	}

	// (3)
	reader.SetDefault()

	// (4)
	reader.SetDatasetMode(reader.GetDatasetMode())

	// (5) dataset must implement tabula.DatasetInterface; a column is
	// created only for metadata that is not skipped.
	ds := dataset.(tabula.DatasetInterface)
	md := reader.GetInputMetadata()
	for i := range md {
		md[i].Init()

		// Count number of output columns.
		if !md[i].GetSkip() {
			// add type of metadata to list of type
			col := tabula.Column{
				Type:       md[i].GetType(),
				Name:       md[i].GetName(),
				ValueSpace: md[i].GetValueSpace(),
			}
			ds.PushColumn(col)
		}
	}

	// (6) Resolve Input and Rejected relative to the config directory.
	reader.SetInput(ConfigCheckPath(reader, reader.GetInput()))
	reader.SetRejected(ConfigCheckPath(reader, reader.GetRejected()))

	// (7)
	e = reader.OpenRejected()
	if nil != e {
		return
	}

	// (8)
	e = reader.OpenInput()
	if nil != e {
		return
	}

	return
}
+
+//
+// SetDefault options for global config and each metadata.
+//
+func (reader *Reader) SetDefault() {
+ if "" == strings.TrimSpace(reader.Rejected) {
+ reader.Rejected = DefaultRejected
+ }
+ if 0 == reader.MaxRows {
+ reader.MaxRows = DefaultMaxRows
+ }
+ if "" == strings.TrimSpace(reader.DatasetMode) {
+ reader.DatasetMode = DefDatasetMode
+ }
+ if nil == reader.dataset {
+ reader.dataset = &tabula.Dataset{}
+ }
+}
+
+//
+// CopyConfig copy configuration from other reader object not including data
+// and metadata.
+//
+func (reader *Reader) CopyConfig(src *Reader) {
+ reader.ConfigPath = src.GetConfigPath()
+ reader.Input = src.GetInput()
+ reader.Skip = src.GetSkip()
+ reader.TrimSpace = src.IsTrimSpace()
+ reader.Rejected = src.GetRejected()
+ reader.MaxRows = src.GetMaxRows()
+ reader.DatasetMode = src.GetDatasetMode()
+}
+
//
// GetInput return the input file.
//
func (reader *Reader) GetInput() string {
	return reader.Input
}

//
// SetInput file.
//
func (reader *Reader) SetInput(path string) {
	reader.Input = path
}

//
// GetSkip return number of line that will be skipped.
//
func (reader *Reader) GetSkip() int {
	return reader.Skip
}

//
// SetSkip set number of lines that will be skipped before reading actual data.
//
func (reader *Reader) SetSkip(n int) {
	reader.Skip = n
}

//
// IsTrimSpace return value of TrimSpace option.
//
func (reader *Reader) IsTrimSpace() bool {
	return reader.TrimSpace
}

//
// GetRejected return name of rejected file.
//
func (reader *Reader) GetRejected() string {
	return reader.Rejected
}

//
// SetRejected file.
//
func (reader *Reader) SetRejected(path string) {
	reader.Rejected = path
}
+
//
// AddInputMetadata add new input metadata to reader and push a matching
// column into the dataset.
//
func (reader *Reader) AddInputMetadata(md *Metadata) {
	reader.InputMetadata = append(reader.InputMetadata, *md)
	ds := reader.dataset.(tabula.DatasetInterface)
	ds.AddColumn(md.GetType(), md.GetName(), md.GetValueSpace())
}

//
// AppendMetadata will append new metadata `md` to list of reader input metadata.
// Unlike AddInputMetadata it does not touch the dataset columns.
// NOTE: the type assertion panics when `mdi` is not a *Metadata.
//
func (reader *Reader) AppendMetadata(mdi MetadataInterface) {
	md := mdi.(*Metadata)
	reader.InputMetadata = append(reader.InputMetadata, *md)
}
+
+//
+// GetInputMetadata return pointer to slice of metadata.
+//
+func (reader *Reader) GetInputMetadata() []MetadataInterface {
+ md := make([]MetadataInterface, len(reader.InputMetadata))
+ for i := range reader.InputMetadata {
+ md[i] = &reader.InputMetadata[i]
+ }
+
+ return md
+}
+
+//
+// GetInputMetadataAt return pointer to metadata at index 'idx'.
+//
+func (reader *Reader) GetInputMetadataAt(idx int) MetadataInterface {
+ return &reader.InputMetadata[idx]
+}
+
//
// GetMaxRows return number of maximum rows for reading.
//
func (reader *Reader) GetMaxRows() int {
	return reader.MaxRows
}

//
// SetMaxRows will set maximum rows that will be read from input file.
// Use -1 to read all rows in one Read call.
//
func (reader *Reader) SetMaxRows(max int) {
	reader.MaxRows = max
}

//
// GetDatasetMode return output mode of data.
//
func (reader *Reader) GetDatasetMode() string {
	return reader.DatasetMode
}
+
+//
+// SetDatasetMode to `mode`.
+//
+func (reader *Reader) SetDatasetMode(mode string) {
+ ds := reader.dataset.(tabula.DatasetInterface)
+ switch strings.ToUpper(mode) {
+ case DatasetModeROWS:
+ ds.SetMode(tabula.DatasetModeRows)
+ case DatasetModeCOLUMNS:
+ ds.SetMode(tabula.DatasetModeColumns)
+ case DatasetModeMATRIX:
+ fallthrough
+ default:
+ ds.SetMode(tabula.DatasetModeMatrix)
+ mode = DatasetModeMATRIX
+ }
+ reader.DatasetMode = mode
+}
+
//
// GetNColumnIn return number of input columns, or number of metadata, including
// column with Skip=true.
//
func (reader *Reader) GetNColumnIn() int {
	return len(reader.InputMetadata)
}
+
+//
+// OpenInput open the input file, metadata must have been initialize.
+//
+func (reader *Reader) OpenInput() (e error) {
+ reader.fRead, e = os.OpenFile(reader.Input, os.O_RDONLY, 0600)
+ if nil != e {
+ return e
+ }
+
+ reader.bufRead = bufio.NewReader(reader.fRead)
+
+ // Skip lines
+ if reader.GetSkip() > 0 {
+ e = reader.SkipLines()
+
+ if nil != e {
+ return
+ }
+ }
+
+ return nil
+}
+
+//
+// OpenRejected open rejected file, for saving unparseable line.
+//
+func (reader *Reader) OpenRejected() (e error) {
+ reader.fReject, e = os.OpenFile(reader.Rejected,
+ os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0600)
+ if nil != e {
+ return e
+ }
+
+ reader.bufReject = bufio.NewWriter(reader.fReject)
+
+ return nil
+}
+
+//
+// Open input and rejected file.
+//
+func (reader *Reader) Open() (e error) {
+ // do not let file descriptor leaked
+ e = reader.Close()
+ if e != nil {
+ return
+ }
+
+ e = reader.OpenInput()
+ if e != nil {
+ return
+ }
+
+ e = reader.OpenRejected()
+
+ return
+}
+
+//
+// SkipLines skip parsing n lines from input file.
+// The n is defined in the attribute "Skip"
+//
+func (reader *Reader) SkipLines() (e error) {
+ for i := 0; i < reader.Skip; i++ {
+ _, e = reader.ReadLine()
+
+ if nil != e {
+ log.Print("dsv: ", e)
+ return
+ }
+ }
+ return
+}
+
+//
+// Reset all variables for next read operation. Number of rows will be 0, and
+// Rows will be empty again.
+//
+func (reader *Reader) Reset() (e error) {
+ e = reader.Flush()
+ if e != nil {
+ return
+ }
+ e = reader.dataset.(tabula.DatasetInterface).Reset()
+ return
+}
+
//
// Flush all output buffer, i.e. the buffered writer of the rejected file.
// Assumes OpenRejected has been called so bufReject is non-nil.
//
func (reader *Reader) Flush() error {
	return reader.bufReject.Flush()
}
+
+//
+// ReadLine will read one line from input file.
+//
+func (reader *Reader) ReadLine() (line []byte, e error) {
+ line, e = reader.bufRead.ReadBytes(DefEOL)
+
+ if e == nil {
+ // remove EOL
+ line = line[:len(line)-1]
+ }
+
+ return
+}
+
+//
+// FetchNextLine read the next line and combine it with the `lastline`.
+//
+func (reader *Reader) FetchNextLine(lastline []byte) (line []byte, e error) {
+ line, e = reader.ReadLine()
+
+ lastline = append(lastline, DefEOL)
+ lastline = append(lastline, line...)
+
+ return lastline, e
+}
+
//
// Reject the line and save it to the reject file.
// The line is buffered; it reaches disk on Flush or Close.
//
func (reader *Reader) Reject(line []byte) (int, error) {
	return reader.bufReject.Write(line)
}
+
+//
+// deleteEmptyRejected if rejected file is empty, delete it.
+//
+func (reader *Reader) deleteEmptyRejected() {
+ finfo, e := os.Stat(reader.Rejected)
+ if e != nil {
+ return
+ }
+
+ if finfo.Size() >= 0 {
+ _ = os.Remove(reader.Rejected)
+ }
+}
+
//
// Close all open descriptors: flush and close the rejected file first,
// remove it when empty, then close the input file.
//
func (reader *Reader) Close() (e error) {
	// Flush buffered rejected lines before closing their file.
	if nil != reader.bufReject {
		e = reader.bufReject.Flush()
		if e != nil {
			return
		}
	}
	if nil != reader.fReject {
		e = reader.fReject.Close()
		if e != nil {
			return
		}
	}

	// Must happen after the rejected file is flushed and closed so its
	// size on disk is final.
	reader.deleteEmptyRejected()

	if nil != reader.fRead {
		e = reader.fRead.Close()
	}
	return
}
+
+//
+// IsEqual compare only the configuration and metadata with other instance.
+//
+func (reader *Reader) IsEqual(other *Reader) bool {
+ if reader == other {
+ return true
+ }
+ if reader.Input != other.Input {
+ return false
+ }
+
+ l, r := len(reader.InputMetadata), len(other.InputMetadata)
+
+ if l != r {
+ return false
+ }
+
+ for a := 0; a < l; a++ {
+ if !reader.InputMetadata[a].IsEqual(&other.InputMetadata[a]) {
+ return false
+ }
+ }
+
+ return true
+}
+
//
// GetDataset return reader dataset. The concrete value is expected to
// implement tabula.DatasetInterface.
//
func (reader *Reader) GetDataset() interface{} {
	return reader.dataset
}
+
+//
+// MergeColumns append metadata and columns from another reader if not exist in
+// current metadata set.
+//
+func (reader *Reader) MergeColumns(other ReaderInterface) {
+ for _, md := range other.GetInputMetadata() {
+ if md.GetSkip() {
+ continue
+ }
+
+ // Check if the same metadata name exist in current dataset.
+ found := false
+ for _, lmd := range reader.GetInputMetadata() {
+ if lmd.GetName() == md.GetName() {
+ found = true
+ break
+ }
+ }
+
+ if found {
+ continue
+ }
+
+ reader.AppendMetadata(md)
+ }
+
+ reader.dataset.(tabula.DatasetInterface).MergeColumns(
+ other.GetDataset().(tabula.DatasetInterface))
+}
+
+//
+// MergeRows append rows from another reader.
+//
+func (reader *Reader) MergeRows(other *Reader) {
+ reader.dataset.(tabula.DatasetInterface).MergeRows(
+ other.GetDataset().(tabula.DatasetInterface))
+}
diff --git a/lib/dsv/reader_test.go b/lib/dsv/reader_test.go
new file mode 100644
index 00000000..d8d724b4
--- /dev/null
+++ b/lib/dsv/reader_test.go
@@ -0,0 +1,601 @@
+// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package dsv
+
+import (
+ "fmt"
+ "io"
+ "strings"
+ "testing"
+
+ "github.com/shuLhan/share/lib/tabula"
+ "github.com/shuLhan/share/lib/test"
+)
+
// jsonSample contains reader configurations in JSON, from an empty config
// (index 0) up to full metadata definitions used by the read tests.
var jsonSample = []string{
	`{}`,
	`{
		"Input"		:"testdata/input.dat"
	}`,
	`{
		"Input"		:"testdata/input.dat"
	}`,
	`{
		"Input"		:"testdata/input.dat"
	,	"InputMetadata"	:
		[{
			"Name"		:"A"
		,	"Separator"	:","
		},{
			"Name"		:"B"
		,	"Separator"	:";"
		}]
	}`,
	`{
		"Input"		:"testdata/input.dat"
	,	"Skip"		:1
	,	"MaxRows"	:1
	,	"InputMetadata"	:
		[{
			"Name"		:"id"
		,	"Separator"	:";"
		,	"Type"		:"integer"
		},{
			"Name"		:"name"
		,	"Separator"	:"-"
		,	"LeftQuote"	:"\""
		,	"RightQuote"	:"\""
		},{
			"Name"		:"value"
		,	"Separator"	:";"
		,	"LeftQuote"	:"[["
		,	"RightQuote"	:"]]"
		},{
			"Name"		:"integer"
		,	"Type"		:"integer"
		,	"Separator"	:";"
		},{
			"Name"		:"real"
		,	"Type"		:"real"
		}]
	}`,
	`{
		"Input"		:"testdata/input.dat"
	,	"Skip"		:1
	,	"MaxRows"	:1
	,	"InputMetadata"	:
		[{
			"Name"		:"id"
		},{
			"Name"		:"editor"
		},{
			"Name"		:"old_rev_id"
		},{
			"Name"		:"new_rev_id"
		},{
			"Name"		:"diff_url"
		},{
			"Name"		:"edit_time"
		},{
			"Name"		:"edit_comment"
		},{
			"Name"		:"article_id"
		},{
			"Name"		:"article_title"
		}]
	}`,
}
+
// readers are pre-built Reader values mirroring some of the jsonSample
// configurations; used as expected values in the parse/equality tests.
var readers = []*Reader{
	{},
	{
		Input: "testdata/input.dat",
	},
	{
		Input: "test-another.dsv",
	},
	{
		Input: "testdata/input.dat",
		InputMetadata: []Metadata{
			{
				Name:      "A",
				Separator: ",",
			},
			{
				Name:      "B",
				Separator: ";",
			},
		},
	},
}
+
//
// TestReaderNoInput will print error that the input is not defined.
//
func TestReaderNoInput(t *testing.T) {
	dsvReader := &Reader{}

	// jsonSample[0] is an empty config, so Input stays empty.
	e := ConfigParse(dsvReader, []byte(jsonSample[0]))

	if nil != e {
		t.Fatal(e)
	}

	// Init must fail because no input file is configured.
	e = dsvReader.Init("", nil)

	if nil == e {
		t.Fatal("TestReaderNoInput: failed, should return non nil!")
	}
}
+
//
// TestConfigParse test parsing metadata.
//
func TestConfigParse(t *testing.T) {
	cases := []struct {
		in  string
		out *Reader
	}{
		{
			jsonSample[1],
			readers[1],
		},
		{
			jsonSample[3],
			readers[3],
		},
	}

	// The same reader is reused; ConfigParse overwrites its fields on
	// each iteration.
	dsvReader := &Reader{}

	for _, c := range cases {
		e := ConfigParse(dsvReader, []byte(c.in))

		if e != nil {
			t.Fatal(e)
		}
		if !dsvReader.IsEqual(c.out) {
			t.Fatal("Test failed on ", c.in)
		}
	}
}
+
// TestReaderIsEqual check Reader equality on input path and metadata.
func TestReaderIsEqual(t *testing.T) {
	cases := []struct {
		in     *Reader
		out    *Reader
		result bool
	}{
		{
			readers[1],
			&Reader{
				Input: "testdata/input.dat",
			},
			true,
		},
		{
			readers[1],
			readers[2],
			false,
		},
	}

	var r bool

	for _, c := range cases {
		r = c.in.IsEqual(c.out)

		if r != c.result {
			t.Fatal("Test failed on equality between ", c.in,
				"\n and ", c.out)
		}
	}
}
+
//
// doRead test reading the DSV data, comparing each Read batch (as rows)
// against the expected string in `exp`.
//
// NOTE(review): if Read kept returning a non-EOF error with n == 0 this
// loop would never terminate; it relies on Read eventually returning io.EOF.
//
func doRead(t *testing.T, dsvReader *Reader, exp []string) {
	i := 0
	var n int
	var e error

	for {
		n, e = Read(dsvReader)

		if n > 0 {
			r := fmt.Sprint(dsvReader.
				GetDataset().(tabula.DatasetInterface).
				GetDataAsRows())

			test.Assert(t, "", exp[i], r, true)

			i++
		} else if e == io.EOF {
			// EOF
			break
		}
	}
}
+
//
// TestReaderRead test reading using config from jsonSample[4]
// (Skip=1, MaxRows=1), comparing each batch with `expectation`.
//
func TestReaderRead(t *testing.T) {
	dsvReader := &Reader{}

	e := ConfigParse(dsvReader, []byte(jsonSample[4]))

	if nil != e {
		t.Fatal(e)
	}

	e = dsvReader.Init("", nil)
	if nil != e {
		t.Fatal(e)
	}

	doRead(t, dsvReader, expectation)

	e = dsvReader.Close()
	if e != nil {
		t.Fatal(e)
	}
}
+
//
// TestReaderOpen real example from the start: configuration is loaded from
// a file instead of an inline JSON string.
//
func TestReaderOpen(t *testing.T) {
	dsvReader, e := NewReader("testdata/config.dsv", nil)
	if nil != e {
		t.Fatal(e)
	}

	doRead(t, dsvReader, expectation)

	e = dsvReader.Close()
	if e != nil {
		t.Fatal(e)
	}
}
+
// TestDatasetMode exercise Init with several DatasetMode values.
//
// NOTE(review): `value` is set but never used, and the loop only fails when
// Init returns an error while `status` is true; a successful Init with
// `status` false is silently accepted — verify this is intended.
func TestDatasetMode(t *testing.T) {
	var e error
	var config = []string{`{
		"Input"		:"testdata/input.dat"
	,	"DatasetMode"	:"row"
	}`, `{
		"Input"		:"testdata/input.dat"
	,	"DatasetMode"	:"rows"
	}`, `{
		"Input"		:"testdata/input.dat"
	,	"DatasetMode"	:"columns"
	}`}

	var exps = []struct {
		status bool
		value  string
	}{{
		false,
		string(config[0]),
	}, {
		true,
		string(config[1]),
	}, {
		true,
		string(config[2]),
	}}

	reader := &Reader{}

	for k, v := range exps {
		e = ConfigParse(reader, []byte(config[k]))

		if e != nil {
			t.Fatal(e)
		}

		e = reader.Init("", nil)
		if e != nil {
			if v.status {
				t.Fatal(e)
			}
		}
	}
}
+
// TestReaderToColumns read in columns mode, transpose the result back to
// rows, and compare with the row-mode expectation.
func TestReaderToColumns(t *testing.T) {
	reader := &Reader{}

	e := ConfigParse(reader, []byte(jsonSample[4]))
	if nil != e {
		t.Fatal(e)
	}

	e = reader.Init("", nil)
	if nil != e {
		t.Fatal(e)
	}

	reader.SetDatasetMode(DatasetModeCOLUMNS)

	var n, i int
	for {
		n, e = Read(reader)

		if n > 0 {
			ds := reader.GetDataset().(tabula.DatasetInterface)
			ds.TransposeToRows()

			r := fmt.Sprint(ds.GetData())

			test.Assert(t, "", expectation[i], r, true)

			i++
		} else if e == io.EOF {
			// EOF
			break
		}
	}
}
+
//
// TestReaderSkip will test the 'Skip' option in Metadata.
//
func TestReaderSkip(t *testing.T) {
	dsvReader, e := NewReader("testdata/config_skip.dsv", nil)
	if nil != e {
		t.Fatal(e)
	}

	doRead(t, dsvReader, expSkip)

	e = dsvReader.Close()
	if e != nil {
		t.Fatal(e)
	}
}
+
// TestTransposeToColumns read all rows then transpose the dataset to
// columns and compare with expSkipColumnsAll.
func TestTransposeToColumns(t *testing.T) {
	reader, e := NewReader("testdata/config_skip.dsv", nil)
	if nil != e {
		t.Fatal(e)
	}

	// -1 makes Read consume the whole input in one call.
	reader.SetMaxRows(-1)

	_, e = Read(reader)

	if e != io.EOF {
		t.Fatal(e)
	}

	ds := reader.GetDataset().(tabula.DatasetInterface)
	ds.TransposeToColumns()

	exp := fmt.Sprint(expSkipColumnsAll)

	columns := ds.GetDataAsColumns()

	got := fmt.Sprint(*columns)

	test.Assert(t, "", exp, got, true)

	e = reader.Close()
	if e != nil {
		t.Fatal(e)
	}
}
+
// TestSortColumnsByIndex reverse the rows via an index list and check both
// the row view and the column view of the sorted dataset.
func TestSortColumnsByIndex(t *testing.T) {
	reader, e := NewReader("testdata/config_skip.dsv", nil)
	if nil != e {
		t.Fatal(e)
	}

	reader.SetMaxRows(-1)

	_, e = Read(reader)
	if e != io.EOF {
		t.Fatal(e)
	}

	// reverse the data
	var idxReverse []int
	var expReverse []string

	for x := len(expSkip) - 1; x >= 0; x-- {
		idxReverse = append(idxReverse, x)
		expReverse = append(expReverse, expSkip[x])
	}

	ds := reader.GetDataset().(tabula.DatasetInterface)

	tabula.SortColumnsByIndex(ds, idxReverse)

	exp := strings.Join(expReverse, "")
	got := fmt.Sprint(ds.GetDataAsRows())

	test.Assert(t, "", exp, got, true)

	exp = "[" + strings.Join(expSkipColumnsAllRev, " ") + "]"

	columns := ds.GetDataAsColumns()

	got = fmt.Sprint(*columns)

	test.Assert(t, "", exp, got, true)

	e = reader.Close()
	if e != nil {
		t.Fatal(e)
	}
}
+
// TestSplitRowsByValue split the dataset on column 0 at value 6 and verify
// both halves against the global `expectation`.
func TestSplitRowsByValue(t *testing.T) {
	reader, e := NewReader("testdata/config.dsv", nil)
	if nil != e {
		t.Fatal(e)
	}

	reader.SetMaxRows(256)

	_, e = Read(reader)

	if e != nil && e != io.EOF {
		t.Fatal(e)
	}

	ds := reader.GetDataset().(tabula.DatasetInterface)
	splitL, splitR, e := tabula.SplitRowsByValue(ds, 0, 6)

	if e != nil {
		t.Fatal(e)
	}

	// test left split: rows with column 0 below the split value.
	exp := ""
	for x := 0; x < 4; x++ {
		exp += expectation[x]
	}

	got := fmt.Sprint(splitL.GetDataAsRows())

	test.Assert(t, "", exp, got, true)

	// test right split: the remaining rows.
	exp = ""
	for x := 4; x < len(expectation); x++ {
		exp += expectation[x]
	}

	got = fmt.Sprint(splitR.GetDataAsRows())

	test.Assert(t, "", exp, got, true)

	e = reader.Close()
	if e != nil {
		t.Fatal(e)
	}
}
+
//
// testWriteOutput will write merged reader and check with expected file output.
// The dataset is written raw, tab-separated, to `outfile` and then compared
// byte-wise against `expfile`.
//
func testWriteOutput(t *testing.T, r *Reader, outfile, expfile string) {

	writer, e := NewWriter("")
	if e != nil {
		t.Fatal(e)
	}

	e = writer.OpenOutput(outfile)

	if e != nil {
		t.Fatal(e)
	}

	sep := "\t"
	ds := r.GetDataset().(tabula.DatasetInterface)

	_, e = writer.WriteRawDataset(ds, &sep)
	if e != nil {
		t.Fatal(e)
	}

	e = writer.Close()
	if e != nil {
		t.Fatal(e)
	}

	assertFile(t, outfile, expfile, true)
}
+
// TestMergeColumns merge the columns of two fully-read readers and compare
// the raw output with the expected file.
func TestMergeColumns(t *testing.T) {
	reader1, e := NewReader("testdata/config.dsv", nil)
	if nil != e {
		t.Fatal(e)
	}

	reader2, e := NewReader("testdata/config_skip.dsv", nil)
	if nil != e {
		t.Fatal(e)
	}

	reader1.SetMaxRows(-1)
	reader2.SetMaxRows(-1)

	_, e = Read(reader1)
	if e != io.EOF {
		t.Fatal(e)
	}

	_, e = Read(reader2)
	if e != io.EOF {
		t.Fatal(e)
	}

	e = reader1.Close()
	if e != nil {
		t.Fatal(e)
	}

	e = reader2.Close()
	if e != nil {
		t.Fatal(e)
	}

	// Change the last column separator so the merged raw output matches
	// the expected file format.
	reader1.InputMetadata[len(reader1.InputMetadata)-1].Separator = ";"

	reader1.MergeColumns(reader2)

	outfile := "testdata/output_merge_columns.dat"
	expfile := "testdata/expected_merge_columns.dat"

	testWriteOutput(t, reader1, outfile, expfile)
}
+
// TestMergeRows merge the rows of two fully-read readers and compare the
// raw output with the expected file.
func TestMergeRows(t *testing.T) {
	reader1, e := NewReader("testdata/config.dsv", nil)
	if nil != e {
		t.Fatal(e)
	}

	reader2, e := NewReader("testdata/config_skip.dsv", nil)
	if nil != e {
		t.Fatal(e)
	}

	reader1.SetMaxRows(-1)
	reader2.SetMaxRows(-1)

	_, e = Read(reader1)
	if e != io.EOF {
		t.Fatal(e)
	}

	_, e = Read(reader2)
	if e != io.EOF {
		t.Fatal(e)
	}

	e = reader1.Close()
	if e != nil {
		t.Fatal(e)
	}

	e = reader2.Close()
	if e != nil {
		t.Fatal(e)
	}

	reader1.MergeRows(reader2)

	outfile := "testdata/output_merge_rows.dat"
	expfile := "testdata/expected_merge_rows.dat"

	testWriteOutput(t, reader1, outfile, expfile)
}
diff --git a/lib/dsv/readererror.go b/lib/dsv/readererror.go
new file mode 100644
index 00000000..6bd7616e
--- /dev/null
+++ b/lib/dsv/readererror.go
@@ -0,0 +1,52 @@
+// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package dsv
+
+import (
+ "fmt"
+)
+
const (
	// Skip zero so every error type value is non-zero and can be tested
	// with bitwise AND.
	_ = iota
	// EReadMissLeftQuote read error when no left-quote found on line.
	EReadMissLeftQuote
	// EReadMissRightQuote read error when no right-quote found on line.
	EReadMissRightQuote
	// EReadMissSeparator read error when no separator found on line.
	EReadMissSeparator
	// EReadLine error when reading line from file.
	EReadLine
	// EReadEOF error which indicated end-of-file.
	EReadEOF
	// ETypeConversion error when converting type from string to numeric or
	// vice versa.
	ETypeConversion
)
+
//
// ReaderError to handle error data and message.
//
type ReaderError struct {
	// T define type of error.
	T int
	// Func where error happened
	Func string
	// What cause the error?
	What string
	// Line define the line which cause error
	Line string
	// Pos character position which cause error
	Pos int
	// N line number
	N int
}

//
// Error implement the error interface, formatting the function name, line
// number, position, cause, and offending line into a single string.
//
func (e *ReaderError) Error() string {
	return fmt.Sprintf("dsv.Reader.%-20s [%d:%d]: %-30s data:|%s|",
		e.Func, e.N, e.Pos, e.What, e.Line)
}
diff --git a/lib/dsv/readerinterface.go b/lib/dsv/readerinterface.go
new file mode 100644
index 00000000..b7bc489f
--- /dev/null
+++ b/lib/dsv/readerinterface.go
@@ -0,0 +1,434 @@
+// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package dsv
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+ "os"
+
+ libbytes "github.com/shuLhan/share/lib/bytes"
+ "github.com/shuLhan/share/lib/tabula"
+)
+
//
// ReaderInterface is the interface for reading DSV file.
//
type ReaderInterface interface {
	ConfigInterface
	// Configuration accessors for input, rejected file, metadata,
	// dataset mode, skip count, and trimming behavior.
	AddInputMetadata(*Metadata)
	AppendMetadata(MetadataInterface)
	GetInputMetadata() []MetadataInterface
	GetInputMetadataAt(idx int) MetadataInterface
	GetMaxRows() int
	SetMaxRows(max int)
	GetDatasetMode() string
	SetDatasetMode(mode string)
	GetNColumnIn() int
	GetInput() string
	SetInput(path string)
	GetRejected() string
	SetRejected(path string)
	GetSkip() int
	SetSkip(n int)
	IsTrimSpace() bool
	SetDefault()
	OpenInput() error
	OpenRejected() error
	SkipLines() error

	// Line-level operations used by Read, ReadRow, and ParseLine.
	Reset() error
	Flush() error
	ReadLine() ([]byte, error)
	FetchNextLine([]byte) ([]byte, error)
	Reject(line []byte) (int, error)
	Close() error

	// Dataset access and merging.
	GetDataset() interface{}
	MergeColumns(ReaderInterface)
}
+
+//
+// Read row from input file.
+//
+func Read(reader ReaderInterface) (n int, e error) {
+ var (
+ row *tabula.Row
+ line []byte
+ linenum int
+ eRead *ReaderError
+ )
+ maxrows := reader.GetMaxRows()
+
+ e = reader.Reset()
+ if e != nil {
+ return
+ }
+
+ dataset := reader.GetDataset().(tabula.DatasetInterface)
+
+ // Loop until we reached MaxRows (> 0) or when all rows has been
+ // read (= -1)
+ for {
+ row, line, linenum, eRead = ReadRow(reader, linenum)
+ if nil == eRead {
+ dataset.PushRow(row)
+
+ n++
+ if maxrows > 0 && n >= maxrows {
+ break
+ }
+ continue
+ }
+
+ if eRead.T&EReadEOF == EReadEOF {
+ _ = reader.Flush()
+ e = io.EOF
+ return
+ }
+
+ eRead.N = linenum
+ fmt.Fprintf(os.Stderr, "%s\n", eRead)
+
+ // If error, save the rejected line.
+ line = append(line, DefEOL)
+
+ _, e = reader.Reject(line)
+ if e != nil {
+ break
+ }
+ }
+
+ // remember to flush if we have rejected rows.
+ e = reader.Flush()
+
+ return n, e
+}
+
+//
+// parsingLeftQuote parse the left-quote string from line.
+//
+func parsingLeftQuote(lq, line []byte, startAt int) (
+ p int, eRead *ReaderError,
+) {
+ p = startAt
+
+ // parsing until we found left quote token
+ p, found := libbytes.SkipAfterToken(line, lq, p, false)
+
+ if found {
+ return p, nil
+ }
+
+ eRead = &ReaderError{
+ T: EReadMissLeftQuote,
+ Func: "parsingLeftQuote",
+ What: "Missing left-quote '" + string(lq) + "'",
+ Line: string(line),
+ Pos: p,
+ N: 0,
+ }
+
+ return p, eRead
+}
+
+//
+// parsingSeparator parsing the line until we found the separator.
+//
+// Return the data and index of last parsed line, or error if separator is not
+// found or not match with specification.
+//
+func parsingSeparator(sep, line []byte, startAt int) (
+ v []byte, p int, eRead *ReaderError,
+) {
+ p = startAt
+
+ v, p, found := libbytes.CutUntilToken(line, sep, p, false)
+
+ if found {
+ return v, p, nil
+ }
+
+ eRead = &ReaderError{
+ Func: "parsingSeparator",
+ What: "Missing separator '" + string(sep) + "'",
+ Line: string(line),
+ Pos: p,
+ N: 0,
+ }
+
+ return v, p, eRead
+}
+
+//
+// parsingRightQuote parsing the line until we found the right quote or separator.
+//
+// Return the data and index of last parsed line, or error if right-quote is not
+// found or not match with specification.
+//
+func parsingRightQuote(reader ReaderInterface, rq, line []byte, startAt int) (
+ v, lines []byte, p int, eRead *ReaderError,
+) {
+ var e error
+ var content []byte
+ p = startAt
+ var found bool
+
+ // (2.2.1)
+ for {
+ content, p, found = libbytes.CutUntilToken(line, rq, p, true)
+
+ v = append(v, content...)
+
+ if found {
+ return v, line, p, nil
+ }
+
+ // EOL before finding right-quote.
+ // Read and join with the next line.
+ line, e = reader.FetchNextLine(line)
+
+ if e != nil {
+ break
+ }
+ }
+
+ eRead = &ReaderError{
+ T: EReadMissRightQuote,
+ Func: "parsingRightQuote",
+ What: "Missing right-quote '" + string(rq) + "'",
+ Line: string(line),
+ Pos: p,
+ N: 0,
+ }
+
+ if e == io.EOF {
+ eRead.T &= EReadEOF
+ }
+
+ return v, line, p, eRead
+}
+
+//
+// parsingSkipSeparator parse until we found separator or EOF
+//
+func parsingSkipSeparator(sep, line []byte, startAt int) (
+ p int, eRead *ReaderError,
+) {
+ p = startAt
+
+ p, found := libbytes.SkipAfterToken(line, sep, p, false)
+
+ if found {
+ return p, nil
+ }
+
+ eRead = &ReaderError{
+ T: EReadMissSeparator,
+ Func: "parsingSkipSeparator",
+ What: "Missing separator '" + string(sep) + "'",
+ Line: string(line),
+ Pos: p,
+ N: 0,
+ }
+
+ return p, eRead
+}
+
//
// parsingSkipSpace return the index of the first byte in line, at or
// after `startAt`, that is not a space, tab, carriage-return, or
// newline.  When the rest of the line is all whitespace it returns
// len(line).
//
func parsingSkipSpace(line []byte, startAt int) (p int) {
	for p = startAt; p < len(line); p++ {
		c := line[p]
		if c != ' ' && c != '\t' && c != '\n' && c != '\r' {
			break
		}
	}
	return p
}
+
//
// ParseLine parse a line containing records. The output is array of record
// (or single row).
//
// This is how the algorithm works
// (1) create n slice of record, where n is number of column metadata
// (2) for each metadata
// (2.0) Check if the next sequence matched with separator.
// (2.0.1) If its match, create empty record
// (2.1) If using left quote, skip until we found left-quote
// (2.2) If using right quote, append byte to buffer until right-quote
// (2.2.1) If using separator, skip until separator
// (2.3) If using separator, append byte to buffer until separator
// (2.4) else append all byte to buffer.
// (3) save buffer to record
//
// Note that `line` may grow during parsing: parsingRightQuote joins
// continuation lines when a right-quote spans multiple input lines.
//
func ParseLine(reader ReaderInterface, line []byte) (
	prow *tabula.Row, eRead *ReaderError,
) {
	p := 0
	rIdx := 0
	inputMd := reader.GetInputMetadata()
	row := make(tabula.Row, 0)

	for _, md := range inputMd {
		lq := md.GetLeftQuote()
		rq := md.GetRightQuote()
		sep := md.GetSeparator()
		v := []byte{}

		// (2.0)
		if sep != "" && sep != lq {
			match := libbytes.IsTokenAt(line, []byte(sep), p)

			// (2.0.1) Separator found immediately: record value
			// stays empty; jump straight to record creation.
			if match {
				p += len(sep)
				goto empty
			}
		}

		// (2.1)
		if lq != "" {
			p, eRead = parsingLeftQuote([]byte(lq), line, p)

			if eRead != nil {
				return
			}
		}

		// (2.2)
		if rq != "" {
			v, line, p, eRead = parsingRightQuote(reader, []byte(rq),
				line, p)

			if eRead != nil {
				return
			}

			if sep != "" {
				p, eRead = parsingSkipSeparator([]byte(sep),
					line, p)

				if eRead != nil {
					return
				}

				// Handle multi space if separator is a single
				// space.
				if sep == " " {
					p = parsingSkipSpace(line, p)
				}
			}
		} else {
			// (2.3)
			if sep != "" {
				// Skip space at beginning if separator is a
				// single space.
				if sep == " " {
					p = parsingSkipSpace(line, p)
				}

				v, p, eRead = parsingSeparator([]byte(sep),
					line, p)

				if eRead != nil {
					return
				}

				// Handle multi space if separator is a single
				// space.
				if sep == " " {
					p = parsingSkipSpace(line, p)
				}
			} else {
				// (2.4) No quoting and no separator: the rest
				// of the line is the value.
				v = line[p:]
				p = p + len(line)
			}
		}

		// A column marked Skip is parsed (to advance p) but not
		// stored in the row.
		if md.GetSkip() {
			continue
		}
	empty:
		// (3) Convert the raw bytes into a typed record.
		r, e := tabula.NewRecordBy(string(v), md.GetType())

		if nil != e {
			msg := fmt.Sprintf("md %s: Type convertion error from %q to %s",
				md.GetName(), string(v), md.GetTypeName())

			return nil, &ReaderError{
				T:    ETypeConversion,
				Func: "ParseLine",
				What: msg,
				Line: string(line),
				Pos:  p,
				N:    0,
			}
		}

		row = append(row, r)
		rIdx++
	}

	return &row, nil
}
+
+//
+// ReadRow read one line at a time until we get one row or error when parsing the
+// data.
+//
+func ReadRow(reader ReaderInterface, linenum int) (
+ row *tabula.Row,
+ line []byte,
+ n int,
+ eRead *ReaderError,
+) {
+ var e error
+ n = linenum
+
+ // Read one line, skip empty line.
+ for {
+ line, e = reader.ReadLine()
+ n++
+
+ if e != nil {
+ goto err
+ }
+
+ // check for empty line
+ linetrimed := bytes.TrimSpace(line)
+
+ if len(linetrimed) > 0 {
+ break
+ }
+ }
+
+ if reader.IsTrimSpace() {
+ line = bytes.TrimSpace(line)
+ }
+
+ row, eRead = ParseLine(reader, line)
+
+ return row, line, n, eRead
+
+err:
+ eRead = &ReaderError{
+ Func: "ReadRow",
+ What: fmt.Sprint(e),
+ }
+
+ if e == io.EOF {
+ eRead.T = EReadEOF
+ } else {
+ eRead.T = EReadLine
+ }
+
+ return nil, line, n, eRead
+}
diff --git a/lib/dsv/testdata/claset.dsv b/lib/dsv/testdata/claset.dsv
new file mode 100644
index 00000000..98082e09
--- /dev/null
+++ b/lib/dsv/testdata/claset.dsv
@@ -0,0 +1,7 @@
+{
+ "Input" :"input.dat"
+, "Rejected" :"rejected.dat"
+, "Skip" :1
+, "MaxRows" :2
+, "ClassIndex" :3
+}
diff --git a/lib/dsv/testdata/config.dsv b/lib/dsv/testdata/config.dsv
new file mode 100644
index 00000000..b8b8bd52
--- /dev/null
+++ b/lib/dsv/testdata/config.dsv
@@ -0,0 +1,50 @@
+{
+ "Input" :"input.dat"
+, "Rejected" :"rejected.dat"
+, "Skip" :1
+, "MaxRows" :1
+, "InputMetadata" :
+ [{
+ "Name" :"id"
+ , "Separator" :";"
+ , "Type" :"integer"
+ },{
+ "Name" :"name"
+ , "Separator" :"-"
+ , "LeftQuote" :"\""
+ , "RightQuote" :"\""
+ },{
+ "Name" :"value"
+ , "Separator" :";"
+ , "LeftQuote" :"[["
+ , "RightQuote" :"]]"
+ },{
+ "Name" :"integer"
+ , "Type" :"integer"
+ , "Separator" :";"
+ },{
+ "Name" :"real"
+ , "Type" :"real"
+ }]
+, "Output" :"output.dat"
+, "OutputMetadata":
+ [{
+ "Name" :"id"
+ , "LeftQuote" :"ID "
+ , "Separator" :"/"
+ },{
+ "Name" :"name"
+ , "RightQuote" :"#"
+ , "Separator" :"\t"
+ },{
+ "Name" :"value"
+ , "Separator" :";"
+ , "LeftQuote" :"{{"
+ , "RightQuote" :"}}"
+ },{
+ "Name" :"integer"
+ , "Separator" :";"
+ },{
+ "Name" :"real"
+ }]
+}
diff --git a/lib/dsv/testdata/config_simpleread.dsv b/lib/dsv/testdata/config_simpleread.dsv
new file mode 100644
index 00000000..5a877a4c
--- /dev/null
+++ b/lib/dsv/testdata/config_simpleread.dsv
@@ -0,0 +1,50 @@
+{
+ "Input" :"input.dat"
+, "Rejected" :"rejected.dat"
+, "Skip" :1
+, "MaxRows" :-1
+, "InputMetadata" :
+ [{
+ "Name" :"id"
+ , "Separator" :";"
+ , "Type" :"integer"
+ },{
+ "Name" :"name"
+ , "Separator" :"-"
+ , "LeftQuote" :"\""
+ , "RightQuote" :"\""
+ },{
+ "Name" :"value"
+ , "Separator" :";"
+ , "LeftQuote" :"[["
+ , "RightQuote" :"]]"
+ },{
+ "Name" :"integer"
+ , "Type" :"integer"
+ , "Separator" :";"
+ },{
+ "Name" :"real"
+ , "Type" :"real"
+ }]
+, "Output" :"output.dat"
+, "OutputMetadata":
+ [{
+ "Name" :"id"
+ , "LeftQuote" :"ID "
+ , "Separator" :"/"
+ },{
+ "Name" :"name"
+ , "RightQuote" :"#"
+ , "Separator" :"\t"
+ },{
+ "Name" :"value"
+ , "Separator" :";"
+ , "LeftQuote" :"{{"
+ , "RightQuote" :"}}"
+ },{
+ "Name" :"integer"
+ , "Separator" :";"
+ },{
+ "Name" :"real"
+ }]
+}
diff --git a/lib/dsv/testdata/config_skip.dsv b/lib/dsv/testdata/config_skip.dsv
new file mode 100644
index 00000000..d3a3aa13
--- /dev/null
+++ b/lib/dsv/testdata/config_skip.dsv
@@ -0,0 +1,54 @@
+{
+ "Input" :"input.dat"
+, "Rejected" :"rejected.dat"
+, "Skip" :1
+, "MaxRows" :1
+, "InputMetadata" :
+ [{
+ "Name" :"id"
+ , "Separator" :";"
+ , "Type" :"integer"
+ , "Skip" :true
+ },{
+ "Name" :"name"
+ , "Separator" :"-"
+ , "LeftQuote" :"\""
+ , "RightQuote" :"\""
+ },{
+ "Name" :"value"
+ , "Separator" :";"
+ , "LeftQuote" :"[["
+ , "RightQuote" :"]]"
+ },{
+ "Name" :"integer"
+ , "Type" :"integer"
+ , "Separator" :";"
+ },{
+ "Name" :"real"
+ , "Type" :"real"
+ }]
+, "Output" :"testdata/output_skip.dat"
+, "OutputMetadata":
+ [{
+ "Name" :"real"
+ , "Separator" :";"
+ },{
+ "Name" :"integer"
+ , "Separator" :";"
+ },{
+ "Name" :"value"
+ , "Separator" :";"
+ , "LeftQuote" :"{{"
+ , "RightQuote" :"}}"
+ },{
+ "Name" :"name"
+ , "RightQuote" :"#"
+ },{
+ "Name" :"id"
+ , "LeftQuote" :"ID "
+ , "Separator" :"/"
+ },{
+ "Name" :"invalid"
+ , "Separator" :";"
+ }]
+}
diff --git a/lib/dsv/testdata/expected.dat b/lib/dsv/testdata/expected.dat
new file mode 100644
index 00000000..6aeb2f94
--- /dev/null
+++ b/lib/dsv/testdata/expected.dat
@@ -0,0 +1,11 @@
+ID 1/A-B# {{AB}};1;0.1
+ID 2/A-B-C# {{BCD}};2;0.02
+ID 3/A;B-C,D# {{A;B C,D}};3;0.003
+ID 4/A;B-C,D# {{A;B C D}};4;0.0004
+ID 6/# {{}};6;0.000006
+ID 8/ok# {{missing right-quote];8;0.00000008
+9;"ok"-[[ok}};9;0.000000009
+ID 10/test# {{integer}};10;0.101
+ID 12/test# {{real}};123456789;0.123456789
+ID 13/string with# {{string with}};13;13
+ID 14/string with" quote# {{string with]] escape}};14;14
diff --git a/lib/dsv/testdata/expected_merge_columns.dat b/lib/dsv/testdata/expected_merge_columns.dat
new file mode 100644
index 00000000..5c279bab
--- /dev/null
+++ b/lib/dsv/testdata/expected_merge_columns.dat
@@ -0,0 +1,22 @@
+1 A-B AB 1 0.1
+2 A-B-C BCD 2 0.02
+3 A;B-C,D A;B C,D 3 0.003
+4 A;B-C,D A;B C D 4 0.0004
+6 6 0.000006
+8 ok missing right-quote];8;0.00000008
+9;"ok"-[[ok 9 0.000000009
+10 test integer 10 0.101
+12 test real 123456789 0.123456789
+13 string with string with 13 13
+14 string with" quote string with]] escape 14 14
+ A-B AB 1 0.1
+ A-B-C BCD 2 0.02
+ A;B-C,D A;B C,D 3 0.003
+ A;B-C,D A;B C D 4 0.0004
+ 6 0.000006
+ ok missing right-quote];8;0.00000008
+9;"ok"-[[ok 9 0.000000009
+ test integer 10 0.101
+ test real 123456789 0.123456789
+ string with string with 13 13
+ string with" quote string with]] escape 14 14
diff --git a/lib/dsv/testdata/expected_merge_rows.dat b/lib/dsv/testdata/expected_merge_rows.dat
new file mode 100644
index 00000000..a5880c76
--- /dev/null
+++ b/lib/dsv/testdata/expected_merge_rows.dat
@@ -0,0 +1,22 @@
+1 A-B AB 1 0.1
+2 A-B-C BCD 2 0.02
+3 A;B-C,D A;B C,D 3 0.003
+4 A;B-C,D A;B C D 4 0.0004
+6 6 0.000006
+8 ok missing right-quote];8;0.00000008
+9;"ok"-[[ok 9 0.000000009
+10 test integer 10 0.101
+12 test real 123456789 0.123456789
+13 string with string with 13 13
+14 string with" quote string with]] escape 14 14
+A-B AB 1 0.1
+A-B-C BCD 2 0.02
+A;B-C,D A;B C,D 3 0.003
+A;B-C,D A;B C D 4 0.0004
+ 6 0.000006
+ok missing right-quote];8;0.00000008
+9;"ok"-[[ok 9 0.000000009
+test integer 10 0.101
+test real 123456789 0.123456789
+string with string with 13 13
+string with" quote string with]] escape 14 14
diff --git a/lib/dsv/testdata/expected_simplemerge.dat b/lib/dsv/testdata/expected_simplemerge.dat
new file mode 100644
index 00000000..481c7f80
--- /dev/null
+++ b/lib/dsv/testdata/expected_simplemerge.dat
@@ -0,0 +1,22 @@
+ID 1/A-B# {{AB}};1;0.1
+ID 2/A-B-C# {{BCD}};2;0.02
+ID 3/A;B-C,D# {{A;B C,D}};3;0.003
+ID 4/A;B-C,D# {{A;B C D}};4;0.0004
+ID 6/# {{}};6;0.000006
+ID 8/ok# {{missing right-quote];8;0.00000008
+9;"ok"-[[ok}};9;0.000000009
+ID 10/test# {{integer}};10;0.101
+ID 12/test# {{real}};123456789;0.123456789
+ID 13/string with# {{string with}};13;13
+ID 14/string with" quote# {{string with]] escape}};14;14
+ID 1/A-B# {{AB}};1;0.1
+ID 2/A-B-C# {{BCD}};2;0.02
+ID 3/A;B-C,D# {{A;B C,D}};3;0.003
+ID 4/A;B-C,D# {{A;B C D}};4;0.0004
+ID 6/# {{}};6;0.000006
+ID 8/ok# {{missing right-quote];8;0.00000008
+9;"ok"-[[ok}};9;0.000000009
+ID 10/test# {{integer}};10;0.101
+ID 12/test# {{real}};123456789;0.123456789
+ID 13/string with# {{string with}};13;13
+ID 14/string with" quote# {{string with]] escape}};14;14
diff --git a/lib/dsv/testdata/expected_skip.dat b/lib/dsv/testdata/expected_skip.dat
new file mode 100644
index 00000000..10286f1a
--- /dev/null
+++ b/lib/dsv/testdata/expected_skip.dat
@@ -0,0 +1,11 @@
+0.1;1;{{AB}};A-B#
+0.02;2;{{BCD}};A-B-C#
+0.003;3;{{A;B C,D}};A;B-C,D#
+0.0004;4;{{A;B C D}};A;B-C,D#
+0.000006;6;{{}};#
+0.000000009;9;{{missing right-quote];8;0.00000008
+9;"ok"-[[ok}};ok#
+0.101;10;{{integer}};test#
+0.123456789;123456789;{{real}};test#
+13;13;{{string with}};string with#
+14;14;{{string with]] escape}};string with" quote#
diff --git a/lib/dsv/testdata/input.dat b/lib/dsv/testdata/input.dat
new file mode 100644
index 00000000..6774d376
--- /dev/null
+++ b/lib/dsv/testdata/input.dat
@@ -0,0 +1,15 @@
+"id","name","value","integer";"real"
+1;"A-B"-[[AB]];1;0.1
+2;"A-B-C"-[[BCD]];2;0.02
+3;"A;B-C,D"-[[A;B C,D]];3;0.003
+4;"A;B-C,D"-[[A;B C D]];4;0.0004
+5;"A;B-C,D-"[[A;B C D]];5;0.00005
+6;""-[[]];6;0.000006
+7;"ok"-[missing left-quote]];7;0.0000007
+8;"ok"-[[missing right-quote];8;0.00000008
+9;"ok"-[[ok]];9;0.000000009
+10;"test"-[[integer]];010;0.101
+11;"test"-[[integer]];1a;0.1001
+12;"test"-[[real]];123456789;00.123456789
+13;"string with" quote"-[[string with]];13;13.0
+14;"string with\" quote"-[[string with\]] escape]];14;14.0
diff --git a/lib/dsv/testdata/writeraw.exp b/lib/dsv/testdata/writeraw.exp
new file mode 100644
index 00000000..05f968ee
--- /dev/null
+++ b/lib/dsv/testdata/writeraw.exp
@@ -0,0 +1,10 @@
+0,1,A
+1,1.1,B
+2,1.2,A
+3,1.3,B
+4,1.4,C
+5,1.5,D
+6,1.6,C
+7,1.7,D
+8,1.8,E
+9,1.9,F
diff --git a/lib/dsv/writer.go b/lib/dsv/writer.go
new file mode 100644
index 00000000..1801ad36
--- /dev/null
+++ b/lib/dsv/writer.go
@@ -0,0 +1,515 @@
+// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package dsv
+
+import (
+ "bufio"
+ "encoding/json"
+ "log"
+ "os"
+
+ libbytes "github.com/shuLhan/share/lib/bytes"
+ "github.com/shuLhan/share/lib/tabula"
+)
+
// Default values used by Writer when the configuration does not provide
// them.
const (
	// DefSeparator default separator that will be used if its not given
	// in config file.
	DefSeparator = ","
	// DefOutput default output file name, used when no output path is
	// configured.
	DefOutput = "output.dat"
	// DefEscape default string to escape the right quote or separator.
	DefEscape = "\\"
)
+
//
// Writer write records from reader or slice using format configuration in
// metadata.
//
type Writer struct {
	// Config embedded for shared configuration handling; excluded from
	// the JSON representation.
	Config `json:"-"`
	// Output file where the records will be written.
	Output string `json:"Output"`
	// OutputMetadata define format for each column.
	OutputMetadata []Metadata `json:"OutputMetadata"`
	// fWriter as write descriptor.
	fWriter *os.File
	// BufWriter for buffered writer wrapping fWriter.
	BufWriter *bufio.Writer
}
+
+//
+// NewWriter create a writer object.
+// User must call Open after that to populate the output and metadata.
+//
+func NewWriter(config string) (writer *Writer, e error) {
+ writer = &Writer{
+ Output: "",
+ OutputMetadata: nil,
+ fWriter: nil,
+ BufWriter: nil,
+ }
+
+ if config == "" {
+ return
+ }
+
+ e = OpenWriter(writer, config)
+ if e != nil {
+ return nil, e
+ }
+
+ return
+}
+
+//
+// GetOutput return output filename.
+//
+func (writer *Writer) GetOutput() string {
+ return writer.Output
+}
+
+//
+// SetOutput will set the output file to path.
+//
+func (writer *Writer) SetOutput(path string) {
+ writer.Output = path
+}
+
+//
+// AddMetadata will add new output metadata to writer.
+//
+func (writer *Writer) AddMetadata(md Metadata) {
+ writer.OutputMetadata = append(writer.OutputMetadata, md)
+}
+
+//
+// open a generic method to open output file with specific flag.
+//
+func (writer *Writer) open(file string, flag int) (e error) {
+ if file == "" {
+ if writer.Output == "" {
+ file = DefOutput
+ } else {
+ file = writer.Output
+ }
+ }
+
+ writer.fWriter, e = os.OpenFile(file, flag, 0600)
+ if nil != e {
+ return e
+ }
+
+ writer.BufWriter = bufio.NewWriter(writer.fWriter)
+
+ return nil
+}
+
+//
+// OpenOutput file and buffered writer.
+// File will be truncated if its exist.
+//
+func (writer *Writer) OpenOutput(file string) (e error) {
+ return writer.open(file, os.O_CREATE|os.O_TRUNC|os.O_WRONLY)
+}
+
+//
+// ReopenOutput will open the output file back without truncating the content.
+//
+func (writer *Writer) ReopenOutput(file string) (e error) {
+ if e = writer.Close(); e != nil {
+ return
+ }
+ return writer.open(file, os.O_CREATE|os.O_APPEND|os.O_WRONLY)
+}
+
+//
+// Flush output buffer to disk.
+//
+func (writer *Writer) Flush() error {
+ return writer.BufWriter.Flush()
+}
+
+//
+// Close all open descriptor.
+//
+func (writer *Writer) Close() (e error) {
+ if nil != writer.BufWriter {
+ e = writer.BufWriter.Flush()
+ if e != nil {
+ return
+ }
+ }
+ if nil != writer.fWriter {
+ e = writer.fWriter.Close()
+ }
+ return
+}
+
//
// WriteRow dump content of Row to file using format in metadata.
//
// Each output column is matched against the input metadata `recordMd`
// by name (via FindMetadata); a column whose matched input index falls
// outside the row, or whose matched input is marked Skip, is left out
// of the output.
//
func (writer *Writer) WriteRow(row *tabula.Row, recordMd []MetadataInterface) (
	e error,
) {
	nRecord := row.Len()
	v := []byte{}
	esc := []byte(DefEscape)

	for i := range writer.OutputMetadata {
		md := writer.OutputMetadata[i]

		// find the input index based on name on record metadata.
		rIdx, mdMatch := FindMetadata(&md, recordMd)

		// No input metadata matched? skip it too.
		if rIdx >= nRecord {
			continue
		}

		// If input column is ignored, continue to next record.
		if mdMatch != nil && mdMatch.GetSkip() {
			continue
		}

		recV := (*row)[rIdx].Bytes()
		lq := md.GetLeftQuote()

		if "" != lq {
			v = append(v, []byte(lq)...)
		}

		rq := md.GetRightQuote()
		sep := md.GetSeparator()

		// Escape the escape character itself.
		// Escaping only applies to string-typed columns.
		if md.T == tabula.TString {
			recV, _ = libbytes.EncloseToken(recV, esc, esc, nil)
		}

		// Escape the right quote in field content before writing it.
		if "" != rq && md.T == tabula.TString {
			recV, _ = libbytes.EncloseToken(recV, []byte(rq), esc, nil)
		} else {
			// Escape the separator.  Only done when there is no
			// right-quote, since a quoted field cannot be broken
			// by a separator inside it.
			if "" != sep && md.T == tabula.TString {
				recV, _ = libbytes.EncloseToken(recV, []byte(sep), esc, nil)
			}
		}

		v = append(v, recV...)

		if "" != rq {
			v = append(v, []byte(rq)...)
		}

		if "" != sep {
			v = append(v, []byte(sep)...)
		}
	}

	v = append(v, DefEOL)

	_, e = writer.BufWriter.Write(v)

	return e
}
+
+//
+// WriteRows will loop each row in the list of rows and write their content to
+// output file.
+// Return n for number of row written, and e if error happened.
+//
+func (writer *Writer) WriteRows(rows tabula.Rows, recordMd []MetadataInterface) (
+ n int,
+ e error,
+) {
+ for n = range rows {
+ e = writer.WriteRow(rows[n], recordMd)
+ if nil != e {
+ break
+ }
+ }
+
+ _ = writer.Flush()
+ return
+}
+
//
// WriteColumns will write content of columns to output file.
// Return n for number of row written, and e if error happened.
//
// When `colMd` is nil the metadata is generated from the column names
// and types.
//
func (writer *Writer) WriteColumns(columns tabula.Columns,
	colMd []MetadataInterface,
) (
	n int,
	e error,
) {
	nColumns := len(columns)
	if nColumns <= 0 {
		return
	}

	// Used to pad columns that run out of records in the second loop.
	emptyRec := tabula.NewRecordString("")

	// Get minimum and maximum length of all columns.
	// In case one of the column have different length (shorter or longer),
	// we will take the column with minimum length first and continue with
	// the maximum length.

	minlen, maxlen := columns.GetMinMaxLength()

	// If metadata is nil, generate it from column name.
	if colMd == nil {
		for _, col := range columns {
			md := &Metadata{
				Name: col.Name,
				T:    col.Type,
			}

			colMd = append(colMd, md)
		}
	}

	// First loop, iterate until minimum column length.
	row := make(tabula.Row, nColumns)

	for ; n < minlen; n++ {
		// Convert columns to record.
		for y, col := range columns {
			row[y] = col.Records[n]
		}

		e = writer.WriteRow(&row, colMd)
		if e != nil {
			goto err
		}
	}

	// Second loop, iterate until maximum column length.
	for ; n < maxlen; n++ {
		// Convert columns to record.
		for y, col := range columns {
			if col.Len() > n {
				row[y] = col.Records[n]
			} else {
				// Column exhausted: substitute an empty record.
				row[y] = emptyRec
			}
		}

		e = writer.WriteRow(&row, colMd)
		if e != nil {
			goto err
		}
	}

err:
	// Flush whether we completed or stopped on error; the label is
	// reached by normal fall-through on success as well.
	_ = writer.Flush()
	return n, e
}
+
+//
+// WriteRawRow will write row data using separator `sep` for each record.
+//
+func (writer *Writer) WriteRawRow(row *tabula.Row, sep, esc []byte) (e error) {
+ if sep == nil {
+ sep = []byte(DefSeparator)
+ }
+ if esc == nil {
+ esc = []byte(DefEscape)
+ }
+
+ v := []byte{}
+ for x, rec := range *row {
+ if x > 0 {
+ v = append(v, sep...)
+ }
+
+ recV := rec.Bytes()
+
+ if rec.Type() == tabula.TString {
+ recV, _ = libbytes.EncloseToken(recV, sep, esc, nil)
+ }
+
+ v = append(v, recV...)
+ }
+
+ v = append(v, DefEOL)
+
+ _, e = writer.BufWriter.Write(v)
+
+ _ = writer.Flush()
+
+ return e
+}
+
+//
+// WriteRawRows write rows data using separator `sep` for each record.
+// We use pointer in separator parameter, so we can use empty string as
+// separator.
+//
+func (writer *Writer) WriteRawRows(rows *tabula.Rows, sep *string) (
+ nrow int,
+ e error,
+) {
+ nrow = len(*rows)
+ if nrow <= 0 {
+ return
+ }
+
+ if sep == nil {
+ sep = new(string)
+ *sep = DefSeparator
+ }
+
+ escbytes := []byte(DefEscape)
+ sepbytes := []byte(*sep)
+ x := 0
+
+ for ; x < nrow; x++ {
+ e = writer.WriteRawRow((*rows)[x], sepbytes, escbytes)
+ if nil != e {
+ break
+ }
+ }
+
+ return x, e
+}
+
//
// WriteRawColumns write raw columns using separator `sep` for each record to
// file.
//
// We use pointer in separator parameter, so we can use empty string as
// separator.
//
func (writer *Writer) WriteRawColumns(cols *tabula.Columns, sep *string) (
	nrow int,
	e error,
) {
	ncol := len(*cols)
	if ncol <= 0 {
		return
	}

	if sep == nil {
		sep = new(string)
		*sep = DefSeparator
	}

	// Find minimum and maximum column length.
	minlen, maxlen := cols.GetMinMaxLength()

	esc := []byte(DefEscape)
	sepbytes := []byte(*sep)
	x := 0

	// First, write until minimum column length.
	for ; x < minlen; x++ {
		v := cols.Join(x, sepbytes, esc)
		v = append(v, DefEOL)

		_, e = writer.BufWriter.Write(v)

		if nil != e {
			// NOTE(review): this early return skips the Flush at
			// the bottom, unlike the error path of the second
			// loop which breaks and flushes — confirm whether
			// buffered lines should be flushed here too.
			return x, e
		}
	}

	// and then write column until max length.
	for ; x < maxlen; x++ {
		v := cols.Join(x, sepbytes, esc)
		v = append(v, DefEOL)

		_, e = writer.BufWriter.Write(v)

		if nil != e {
			break
		}
	}

	_ = writer.Flush()
	return x, e
}
+
+//
+// WriteRawDataset will write content of dataset to file without metadata but
+// using separator `sep` for each record.
+//
+// We use pointer in separator parameter, so we can use empty string as
+// separator.
+//
+func (writer *Writer) WriteRawDataset(dataset tabula.DatasetInterface,
+ sep *string,
+) (
+ int, error,
+) {
+ if nil == writer.fWriter {
+ return 0, ErrNotOpen
+ }
+ if nil == dataset {
+ return 0, nil
+ }
+ if sep == nil {
+ sep = new(string)
+ *sep = DefSeparator
+ }
+
+ var rows *tabula.Rows
+
+ switch dataset.GetMode() {
+ case tabula.DatasetModeColumns:
+ cols := dataset.GetDataAsColumns()
+ return writer.WriteRawColumns(cols, sep)
+ case tabula.DatasetModeRows, tabula.DatasetModeMatrix:
+ fallthrough
+ default:
+ rows = dataset.GetDataAsRows()
+ }
+
+ return writer.WriteRawRows(rows, sep)
+}
+
+//
+// Write rows from Reader to file.
+// Return n for number of row written, or e if error happened.
+//
+func (writer *Writer) Write(reader ReaderInterface) (int, error) {
+ if nil == reader {
+ return 0, ErrNilReader
+ }
+ if nil == writer.fWriter {
+ return 0, ErrNotOpen
+ }
+
+ ds := reader.GetDataset().(tabula.DatasetInterface)
+
+ var rows *tabula.Rows
+
+ switch ds.GetMode() {
+ case tabula.DatasetModeColumns:
+ cols := ds.GetDataAsColumns()
+ return writer.WriteColumns(*cols, reader.GetInputMetadata())
+ case tabula.DatasetModeRows, tabula.DatasetModeMatrix:
+ fallthrough
+ default:
+ rows = ds.GetDataAsRows()
+ }
+
+ return writer.WriteRows(*rows, reader.GetInputMetadata())
+}
+
+//
+// String yes, it will print it in JSON like format.
+//
+func (writer *Writer) String() string {
+ r, e := json.MarshalIndent(writer, "", "\t")
+
+ if nil != e {
+ log.Print(e)
+ }
+
+ return string(r)
+}
diff --git a/lib/dsv/writer_test.go b/lib/dsv/writer_test.go
new file mode 100644
index 00000000..f03a1842
--- /dev/null
+++ b/lib/dsv/writer_test.go
@@ -0,0 +1,126 @@
+// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package dsv
+
+import (
+ "testing"
+
+ "github.com/shuLhan/share/lib/tabula"
+)
+
+//
+// TestWriter test reading and writing DSV.
+//
+func TestWriter(t *testing.T) {
+ rw, e := New("testdata/config.dsv", nil)
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ doReadWrite(t, &rw.Reader, &rw.Writer, expectation, true)
+
+ e = rw.Close()
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ assertFile(t, rw.GetOutput(), "testdata/expected.dat", true)
+}
+
+//
+// TestWriterWithSkip test reading and writing DSV with some column in input being
+// skipped.
+//
+func TestWriterWithSkip(t *testing.T) {
+ rw, e := New("testdata/config_skip.dsv", nil)
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ doReadWrite(t, &rw.Reader, &rw.Writer, expSkip, true)
+
+ e = rw.Close()
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ assertFile(t, rw.GetOutput(), "testdata/expected_skip.dat", true)
+}
+
+//
+// TestWriterWithColumns test reading and writing DSV with where each row
+// is saved in DatasetMode = 'columns'.
+//
+func TestWriterWithColumns(t *testing.T) {
+ rw, e := New("testdata/config_skip.dsv", nil)
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ rw.SetDatasetMode(DatasetModeCOLUMNS)
+
+ doReadWrite(t, &rw.Reader, &rw.Writer, expSkipColumns, true)
+
+ e = rw.Close()
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ assertFile(t, "testdata/expected_skip.dat", rw.GetOutput(), true)
+}
+
+func TestWriteRawRows(t *testing.T) {
+ dataset := tabula.NewDataset(tabula.DatasetModeRows, nil, nil)
+
+ populateWithRows(t, dataset)
+
+ writer, e := NewWriter("")
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ outfile := "testdata/writerawrows.out"
+ expfile := "testdata/writeraw.exp"
+
+ e = writer.OpenOutput(outfile)
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ _, e = writer.WriteRawDataset(dataset, nil)
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ assertFile(t, outfile, expfile, true)
+}
+
+func TestWriteRawColumns(t *testing.T) {
+ var e error
+
+ dataset := tabula.NewDataset(tabula.DatasetModeColumns, nil, nil)
+
+ populateWithColumns(t, dataset)
+
+ writer, e := NewWriter("")
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ outfile := "testdata/writerawcolumns.out"
+ expfile := "testdata/writeraw.exp"
+
+ e = writer.OpenOutput(outfile)
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ _, e = writer.WriteRawDataset(dataset, nil)
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ assertFile(t, outfile, expfile, true)
+}
diff --git a/lib/dsv/writerinterface.go b/lib/dsv/writerinterface.go
new file mode 100644
index 00000000..e2b8856c
--- /dev/null
+++ b/lib/dsv/writerinterface.go
@@ -0,0 +1,45 @@
+// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package dsv
+
//
// WriterInterface is an interface for writing DSV data to file.
//
type WriterInterface interface {
	ConfigInterface
	// GetOutput return the output file path.
	GetOutput() string
	// SetOutput set the output file path.
	SetOutput(path string)
	// OpenOutput open the output file for writing.
	OpenOutput(file string) error
	// Flush write buffered data to storage.
	Flush() error
	// Close flush and close the output file.
	Close() error
}
+
+//
+// OpenWriter configuration file and initialize the attributes.
+//
+func OpenWriter(writer WriterInterface, fcfg string) (e error) {
+ e = ConfigOpen(writer, fcfg)
+ if e != nil {
+ return
+ }
+
+ return InitWriter(writer)
+}
+
+//
+// InitWriter initialize writer by opening output file.
+//
+func InitWriter(writer WriterInterface) error {
+ out := writer.GetOutput()
+
+ // Exit immediately if no output file is defined in config.
+ if "" == out {
+ return ErrNoOutput
+ }
+
+ writer.SetOutput(ConfigCheckPath(writer, out))
+
+ return writer.OpenOutput("")
+}