diff options
| author | Shulhan <ms@kilabit.info> | 2018-09-17 03:48:37 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2018-09-18 01:50:21 +0700 |
| commit | 446fef94cd712861221c0098dcdd9ae52aaed0eb (patch) | |
| tree | 63167d5a90b27121b552ab428f337717bcf2b01f | |
| parent | 44b26edf7f390db383fe025454be0c4e30cfbd9b (diff) | |
| download | pakakeh.go-446fef94cd712861221c0098dcdd9ae52aaed0eb.tar.xz | |
Merge package "github.com/shuLhan/dsv"
32 files changed, 3951 insertions, 0 deletions
diff --git a/lib/dsv/.gitignore b/lib/dsv/.gitignore new file mode 100644 index 00000000..17cf232b --- /dev/null +++ b/lib/dsv/.gitignore @@ -0,0 +1,8 @@ +rejected.dat +testdata/output.dat +testdata/output_merge_columns.dat +testdata/output_merge_rows.dat +testdata/output_skip.dat +testdata/rejected.dat +testdata/writerawcolumns.out +testdata/writerawrows.out diff --git a/lib/dsv/LICENSE b/lib/dsv/LICENSE new file mode 100644 index 00000000..100cc757 --- /dev/null +++ b/lib/dsv/LICENSE @@ -0,0 +1,38 @@ +Copyright 2015-2018, Shulhan (ms@kilabit.info). All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of Kilabit nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY M.SHULHAN "AS IS" AND ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + --- --- --- --- --- --- --- + + TT TT II BB AAAA LLLLLL II KKKKKKKK + TT TT II BB AA AA LL LL II KK + TTTT II BB AA AA LL LL II KK + TT TT II BB AAAAAAAA LLLLLL II KK + TT TT II BB AA AA LL LL II KK + TT TT II BBBBBBBB AA AA LLLLLL II KK + +Website: http://kilabit.info +Contact: ms@kilabit.info diff --git a/lib/dsv/README.md b/lib/dsv/README.md new file mode 100644 index 00000000..1a930c3b --- /dev/null +++ b/lib/dsv/README.md @@ -0,0 +1,350 @@ +[](https://godoc.org/github.com/shuLhan/share/lib/dsv) +[](https://goreportcard.com/report/github.com/shuLhan/share/lib/dsv) + +Package `dsv` is a Go library for working with delimited separated value (DSV). + +DSV is a free-style form of CSV format of text data, where each record is +separated by newline, and each column can be separated by any string, not just +comma. + +- [Example](#example) +- [Terminology](#terminology) +- [Configuration](#configuration) + - [Metadata](#metadata) + - [Input](#input) + - [DatasetMode Explained](#datasetmode-explained) + - [Output](#output) +- [Working with DSV](#working-with-dsv) + - [Processing each Rows/Columns](#processing-each-rowscolumns) + - [Using different Dataset](#using-different-dataset) + - [Builtin Functions for Dataset](#builtin-functions-for-dataset) +- [Limitations](#limitations) + +--- + +## Example + +Lets process this input file `input.dat`, + + Mon Dt HH MM SS Process + Nov 29 23:14:36 process-1 + Nov 29 23:14:37 process-2 + Nov 29 23:14:38 process-3 + +and generate output file `output.dat` which format like this, + + "process_1","29-Nov" + "process_2","29-Nov" + "process_3","29-Nov" + +How do we do it? 
+ +First, create file metadata for input and output, name it `config.dsv`, + + { + "Input" :"input.dat" + , "Skip" :1 + , "InputMetadata" : + [{ + "Name" :"month" + , "Separator" :" " + },{ + "Name" :"date" + , "Separator" :" " + , "Type" :"integer" + },{ + "Name" :"hour" + , "Separator" :":" + , "Type" :"integer" + },{ + "Name" :"minute" + , "Separator" :":" + , "Type" :"integer" + },{ + "Name" :"second" + , "Separator" :" " + , "Type" :"integer" + },{ + "Name" :"process_name" + , "Separator" :"-" + },{ + "Name" :"process_id" + }] + , "Output" :"output.dat" + , "OutputMetadata": + [{ + "Name" :"process_name" + , "LeftQuote" :"\"" + , "Separator" :"_" + },{ + "Name" :"process_id" + , "RightQuote":"\"" + , "Separator" :"," + },{ + "Name" :"date" + , "LeftQuote" :"\"" + , "Separator" :"-" + },{ + "Name" :"month" + , "RightQuote":"\"" + }] + } + +The metadata is using JSON format. For more information see `metadata.go` +and `reader.go`. + +Second, we create a reader to read the input file. + + dsvReader, e := dsv.NewReader("config.dsv", nil) + + if nil != e { + t.Fatal(e) + } + +Third, we create a writer to write our output data, + + dsvWriter, e := dsv.NewWriter("config.dsv") + + if nil != e { + t.Error(e) + } + +Last action, we process them: read input records and pass them to writer. + + for { + n, e := dsv.Read(dsvReader) + + if n > 0 { + dsvWriter.Write(dsvReader) + + // EOF, no more record. + } else if e == io.EOF { + break + } + } + + // we will make sure all open descriptor is closed. + _ = dsvReader.Close() + +Easy enough? We can combine the reader and writer using `dsv.New()`, which will +create reader and writer, + + rw, e := dsv.New("config.dsv", nil) + + if nil != e { + t.Error(e) + } + + // do usual process like in the last step. + +Thats it! + +## Terminology + +Here are some terminology that we used in developing this library, which may +help reader understanding the configuration and API. 
+ +- Dataset: is a content of file +- Record: a single cell in row or column, or the smallest building block of + dataset +- Row: is a horizontal representation of records in dataset +- Column: is a vertical representation of records in dataset + +``` + COL-0 COL-1 ... COL-x +ROW-0: record record ... record +ROW-1: record record ... record +... +ROW-y: record record ... record +``` + +## Configuration + +We choose and use JSON for configuration because, + +1. No additional source to test. +2. Easy to extended. User can embed the current metadata, add additional + configuration, and create another reader to work with it. + +### Metadata + +Metadata contain information about each column when reading input file and +writing to output file, + +- `Name`: mandatory, the name of column +- `Type`: optional, type of record when reading input file. Valid value are + "integer", "real", or "string" (default) +- `Separator`: optional, default to `"\n"`. Separator is a string that + separate the current record with the next record. +- `LeftQuote`: optional, default is empty `""`. LeftQuote is a string that + start at the beginning of record. +- `RightQuote`: optional, default is empty `""`. RightQuote is a string at the + end of record. +- `Skip`: optional, boolean, default is `false`. If true the column will be + saved in dataset when reading input file, otherwise it will be ignored. +- `ValueSpace`: optional, slice of string, default is empty. This contain the + string representation of all possible value in column. + +### Input + +Input configuration contain information about input file. + +- `Input`: mandatory, the name of input file, could use relative or absolute + path. If no path is given then it assumed that the input file is in the same + directory with configuration file. +- `InputMetadata`: mandatory, list of metadata. +- `Skip`: optional, number, default 0. Skip define the number of line that will + be skipped when first input file is opened. 
+- `TrimSpace`: optional, boolean, default is true. If its true, before parsed, the + white space in the beginning and end of each input line will be removed, + otherwise it will leave unmodified. +- `Rejected`: optional, default to `rejected.dat`. Rejected is file where + data that does not match with metadata will be saved. One can inspect the + rejected file fix it for re-process or ignore it. +- `MaxRows`: optional, default to `256`. Maximum number of rows for one read + operation that will be saved in memory. If its negative, i.e. `-1`, all data + in input file will be processed. +- `DatasetMode`: optional, default to "rows". Mode of dataset in memory. + Valid values are "rows", "columns", or "matrix". Matrix mode is combination of + rows and columns, it give more flexibility when processing the dataset but + will require additional memory. + +#### `DatasetMode` Explained + +For example, given input data file, + + col1,col2,col3 + a,b,c + 1,2,3 + +"rows" mode is where each line saved in its own slice, resulting in Rows: + + Rows[0]: [a b c] + Rows[1]: [1 2 3] + +"columns" mode is where each line saved by columns, resulting in Columns: + + Columns[0]: {col1 0 0 [] [a 1]} + Columns[1]: {col2 0 0 [] [b 2]} + Columns[1]: {col3 0 0 [] [c 3]} + +Unlike rows mode, each column contain metadata including column name, type, +flag, and value space (all possible value that _may_ contain in column value). + +"matrix" mode is where each record saved both in row and column. + +### Output + +Output configuration contain information about output file when writing the +dataset. + +- `Output`: mandatory, the name of output file, could use relative or absolute + path. If no path is given then it assumed that the output file is in the same + directory with configuration file. +- `OutputMetadata`: mandatory, list of metadata. 
+ +## Working with DSV + +### Processing each Rows/Columns + +After opening the input file, we can process the dataset based on rows/columns +mode using simple `for` loop. Example, + +``` +// Save dataset object for used later. +dataset := dsvReader.GetDataset().(tabula.DatasetInterface) + +for { + n, e := dsv.Read(dsvReader) + + if n > 0 { + // Process each row ... + for x, row := dataset.GetDataAsRows() { + + for y, record := range row.Records { + // process each record in row + } + } + + // Or, process each columns + for x, column := dataset.GetDataAsColumns() { + + for y, record := range column.Records { + // process each record in column + } + } + + // Write the dataset to file after processed + dsvWriter.Write(dsvReader) + } + if e == io.EOF { + break + } + if e != nil { + // handle error + } +} +``` + +### Using different Dataset + +Default dataset used by Reader is +[tabula.Dataset](https://godoc.org/github.com/shuLhan/share/lib/tabula#Dataset). + +You can extend and implement +[DatasetInterface](https://godoc.org/github.com/shuLhan/share/lib/tabula#DatasetInterface) +and use it in reader object, either by + +- passing it in the second parameter in `NewReader`, for example, + + ``` + myset := MySet{ + ... + } + reader, e := dsv.NewReader("config.dsv", &myset) + ``` + +- or by calling `reader.Init` after creating new Reader, + + ``` + myset := MySet{ + ... + } + reader := dsv.Reader{ + ... + } + reader.Init("config.dsv", &myset) + ``` + +### Builtin Functions for Dataset + +Since we use tabula package to manage data, any features in those package +can be used in our dataset. +For more information see [tabula +package](https://godoc.org/github.com/shuLhan/share/lib/tabula). + +## Limitations + +- New line is `\n` for each row. + +- Reader and Writer operate in ASCII (8 bit or char type), UTF-8 is not + supported yet, since we can not test it. Patch for supporting UTF-8 (or + runes type) are welcome. + +- About escaped character in content of data. 
+ + Since we said that we handle free-style form of CSV, what we mean was the + left-quote, right-quote and separator can be string. Its not only one single + character like single quote or double quote or any single character, but + literally one or more characters without space. Any escaped character will be + read as is (along with `'\'`) unless its followed by right-quote or separator. + For example, + + "test\'" + + will be readed as `test\'`. But + + "test\"" + + will be readed as `test"`, since the right-quote is matched with escaped + token. diff --git a/lib/dsv/claset_test.go b/lib/dsv/claset_test.go new file mode 100644 index 00000000..21ed197c --- /dev/null +++ b/lib/dsv/claset_test.go @@ -0,0 +1,34 @@ +// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package dsv + +import ( + "testing" + + "github.com/shuLhan/share/lib/tabula" + "github.com/shuLhan/share/lib/test" +) + +func TestReaderWithClaset(t *testing.T) { + fcfg := "testdata/claset.dsv" + + claset := tabula.Claset{} + + _, e := NewReader(fcfg, &claset) + if e != nil { + t.Fatal(e) + } + + test.Assert(t, "", 3, claset.GetClassIndex(), true) + + claset.SetMajorityClass("regular") + claset.SetMinorityClass("vandalism") + + clone := claset.Clone().(tabula.ClasetInterface) + + test.Assert(t, "", 3, clone.GetClassIndex(), true) + test.Assert(t, "", "regular", clone.MajorityClass(), true) + test.Assert(t, "", "vandalism", clone.MinorityClass(), true) +} diff --git a/lib/dsv/common_test.go b/lib/dsv/common_test.go new file mode 100644 index 00000000..239fdf5e --- /dev/null +++ b/lib/dsv/common_test.go @@ -0,0 +1,163 @@ +// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package dsv + +import ( + "bytes" + "fmt" + "io" + "io/ioutil" + "runtime/debug" + "testing" + + "github.com/shuLhan/share/lib/tabula" + "github.com/shuLhan/share/lib/test" +) + +// +// assertFile compare content of two file, print error message and exit +// when both are different. +// +func assertFile(t *testing.T, a, b string, equal bool) { + out, e := ioutil.ReadFile(a) + + if nil != e { + debug.PrintStack() + t.Error(e) + } + + exp, e := ioutil.ReadFile(b) + + if nil != e { + debug.PrintStack() + t.Error(e) + } + + r := bytes.Compare(out, exp) + + if equal && 0 != r { + debug.PrintStack() + t.Fatal("Comparing", a, "with", b, ": result is different (", + r, ")") + } +} + +func checkDataset(t *testing.T, r *Reader, exp string) { + var got string + ds := r.GetDataset().(tabula.DatasetInterface) + data := ds.GetData() + + switch data.(type) { + case *tabula.Rows: + rows := data.(*tabula.Rows) + got = fmt.Sprint(*rows) + case *tabula.Columns: + cols := data.(*tabula.Columns) + got = fmt.Sprint(*cols) + case *tabula.Matrix: + matrix := data.(*tabula.Matrix) + got = fmt.Sprint(*matrix) + default: + fmt.Println("data type unknown") + } + + test.Assert(t, "", exp, got, true) +} + +// +// doReadWrite test reading and writing the DSV data. 
+// +func doReadWrite(t *testing.T, dsvReader *Reader, dsvWriter *Writer, + expectation []string, check bool) { + i := 0 + + for { + n, e := Read(dsvReader) + + if e == io.EOF || n == 0 { + _, e = dsvWriter.Write(dsvReader) + if e != nil { + t.Fatal(e) + } + + break + } + + if e != nil { + continue + } + + if check { + checkDataset(t, dsvReader, expectation[i]) + i++ + } + + _, e = dsvWriter.Write(dsvReader) + if e != nil { + t.Fatal(e) + } + } + + e := dsvWriter.Flush() + if e != nil { + t.Fatal(e) + } +} + +var datasetRows = [][]string{ + {"0", "1", "A"}, + {"1", "1.1", "B"}, + {"2", "1.2", "A"}, + {"3", "1.3", "B"}, + {"4", "1.4", "C"}, + {"5", "1.5", "D"}, + {"6", "1.6", "C"}, + {"7", "1.7", "D"}, + {"8", "1.8", "E"}, + {"9", "1.9", "F"}, +} + +var datasetCols = [][]string{ + {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}, + {"1", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "1.8", "1.9"}, + {"A", "B", "A", "B", "C", "D", "C", "D", "E", "F"}, +} + +var datasetTypes = []int{ + tabula.TInteger, + tabula.TReal, + tabula.TString, +} + +var datasetNames = []string{"int", "real", "string"} + +func populateWithRows(t *testing.T, dataset *tabula.Dataset) { + for _, rowin := range datasetRows { + row := make(tabula.Row, len(rowin)) + + for x, recin := range rowin { + rec, e := tabula.NewRecordBy(recin, datasetTypes[x]) + if e != nil { + t.Fatal(e) + } + + row[x] = rec + } + + dataset.PushRow(&row) + } +} + +func populateWithColumns(t *testing.T, dataset *tabula.Dataset) { + for x := range datasetCols { + col, e := tabula.NewColumnString(datasetCols[x], datasetTypes[x], + datasetNames[x]) + if e != nil { + t.Fatal(e) + } + + dataset.PushColumn(*col) + } +} diff --git a/lib/dsv/config.go b/lib/dsv/config.go new file mode 100644 index 00000000..a74fc315 --- /dev/null +++ b/lib/dsv/config.go @@ -0,0 +1,27 @@ +// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package dsv + +// +// Config for working with DSV configuration. +// +type Config struct { + // ConfigPath path to configuration file. + ConfigPath string +} + +// +// GetConfigPath return the base path of configuration file. +// +func (cfg *Config) GetConfigPath() string { + return cfg.ConfigPath +} + +// +// SetConfigPath for reading input and writing rejected file. +// +func (cfg *Config) SetConfigPath(dir string) { + cfg.ConfigPath = dir +} diff --git a/lib/dsv/configinterface.go b/lib/dsv/configinterface.go new file mode 100644 index 00000000..2c6fd3ae --- /dev/null +++ b/lib/dsv/configinterface.go @@ -0,0 +1,57 @@ +package dsv + +import ( + "encoding/json" + "io/ioutil" + "path" +) + +// +// ConfigInterface for reader and writer for initializing the config from JSON. +// +type ConfigInterface interface { + GetConfigPath() string + SetConfigPath(dir string) +} + +// +// ConfigOpen configuration file and initialize the attributes. +// +func ConfigOpen(rw interface{}, fcfg string) error { + cfg, e := ioutil.ReadFile(fcfg) + + if nil != e { + return e + } + + // Get directory where the config reside. + rwconfig := rw.(ConfigInterface) + rwconfig.SetConfigPath(path.Dir(fcfg)) + + return ConfigParse(rw, cfg) +} + +// +// ConfigParse from JSON string. +// +func ConfigParse(rw interface{}, cfg []byte) error { + return json.Unmarshal(cfg, rw) +} + +// +// ConfigCheckPath if no path in file, return the config path plus file name, +// otherwise leave it unchanged. +// +func ConfigCheckPath(comin ConfigInterface, file string) string { + dir := path.Dir(file) + + if dir == "." { + cfgPath := comin.GetConfigPath() + if cfgPath != "" && cfgPath != "." { + return cfgPath + "/" + file + } + } + + // nothing happen. 
+ return file +} diff --git a/lib/dsv/data_test.go b/lib/dsv/data_test.go new file mode 100644 index 00000000..41f6cf7a --- /dev/null +++ b/lib/dsv/data_test.go @@ -0,0 +1,58 @@ +// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package dsv + +var expectation = []string{ + "&[1 A-B AB 1 0.1]", + "&[2 A-B-C BCD 2 0.02]", + "&[3 A;B-C,D A;B C,D 3 0.003]", + "&[4 A;B-C,D A;B C D 4 0.0004]", + "&[6 6 0.000006]", + "&[8 ok missing right-quote];8;0.00000008\n9;\"ok\"-[[ok 9 0.000000009]", + "&[10 test integer 10 0.101]", + "&[12 test real 123456789 0.123456789]", + "&[13 string with string with 13 13]", + "&[14 string with\" quote string with]] escape 14 14]", +} + +var expSkip = []string{ + "&[A-B AB 1 0.1]", + "&[A-B-C BCD 2 0.02]", + "&[A;B-C,D A;B C,D 3 0.003]", + "&[A;B-C,D A;B C D 4 0.0004]", + "&[ 6 0.000006]", + "&[ok missing right-quote];8;0.00000008\n9;\"ok\"-[[ok 9 0.000000009]", + "&[test integer 10 0.101]", + "&[test real 123456789 0.123456789]", + "&[string with string with 13 13]", + "&[string with\" quote string with]] escape 14 14]", +} + +var expSkipColumns = []string{ + "[{name 0 0 [] [A-B]} {value 0 0 [] [AB]} {integer 1 0 [] [1]} {real 2 0 [] [0.1]}]", + "[{name 0 0 [] [A-B-C]} {value 0 0 [] [BCD]} {integer 1 0 [] [2]} {real 2 0 [] [0.02]}]", + "[{name 0 0 [] [A;B-C,D]} {value 0 0 [] [A;B C,D]} {integer 1 0 [] [3]} {real 2 0 [] [0.003]}]", + "[{name 0 0 [] [A;B-C,D]} {value 0 0 [] [A;B C D]} {integer 1 0 [] [4]} {real 2 0 [] [0.0004]}]", + "[{name 0 0 [] []} {value 0 0 [] []} {integer 1 0 [] [6]} {real 2 0 [] [0.000006]}]", + "[{name 0 0 [] [ok]} {value 0 0 [] [missing right-quote];8;0.00000008\n9;\"ok\"-[[ok]} {integer 1 0 [] [9]} {real 2 0 [] [0.000000009]}]", + "[{name 0 0 [] [test]} {value 0 0 [] [integer]} {integer 1 0 [] [10]} {real 2 0 [] [0.101]}]", + "[{name 0 0 [] [test]} {value 0 0 [] [real]} {integer 1 0 [] 
[123456789]} {real 2 0 [] [0.123456789]}]", + "[{name 0 0 [] [string with]} {value 0 0 [] [string with]} {integer 1 0 [] [13]} {real 2 0 [] [13]}]", + "[{name 0 0 [] [string with\" quote]} {value 0 0 [] [string with]] escape]} {integer 1 0 [] [14]} {real 2 0 [] [14]}]", +} + +var expSkipColumnsAll = []string{ + "{name 0 0 [] [A-B A-B-C A;B-C,D A;B-C,D ok test test string with string with\" quote]}", + "{value 0 0 [] [AB BCD A;B C,D A;B C D missing right-quote];8;0.00000008\n9;\"ok\"-[[ok integer real string with string with]] escape]}", + "{integer 1 0 [] [1 2 3 4 6 9 10 123456789 13 14]}", + "{real 2 0 [] [0.1 0.02 0.003 0.0004 0.000006 0.000000009 0.101 0.123456789 13 14]}", +} + +var expSkipColumnsAllRev = []string{ + "{name 0 0 [] [string with\" quote string with test test ok A;B-C,D A;B-C,D A-B-C A-B]}", + "{value 0 0 [] [string with]] escape string with real integer missing right-quote];8;0.00000008\n9;\"ok\"-[[ok A;B C D A;B C,D BCD AB]}", + "{integer 1 0 [] [14 13 123456789 10 9 6 4 3 2 1]}", + "{real 2 0 [] [14 13 0.123456789 0.101 0.000000009 0.000006 0.0004 0.003 0.02 0.1]}", +} diff --git a/lib/dsv/dsv.go b/lib/dsv/dsv.go new file mode 100644 index 00000000..b777264f --- /dev/null +++ b/lib/dsv/dsv.go @@ -0,0 +1,100 @@ +// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +// Package dsv is a library for working with delimited separated value (DSV). +// +// DSV is a free-style form of Comma Separated Value (CSV) format of text data, +// where each row is separated by newline, and each column can be separated by +// any string enclosed with left-quote and right-quote. +// +package dsv + +import ( + "errors" +) + +const ( + // DefaultRejected define the default file which will contain the + // rejected row. 
+ DefaultRejected = "rejected.dat" + + // DefaultMaxRows define default maximum row that will be saved + // in memory for each read if input data is too large and can not be + // consumed in one read operation. + DefaultMaxRows = 256 + + // DefDatasetMode default output mode is rows. + DefDatasetMode = DatasetModeROWS + + // DefEOL default end-of-line + DefEOL = '\n' +) + +var ( + // ErrNoInput define an error when no Input file is given to Reader. + ErrNoInput = errors.New("dsv: No input file is given in config") + + // ErrMissRecordsLen define an error when trying to push Row + // to Field, when their length is not equal. + // See reader.PushRowToColumns(). + ErrMissRecordsLen = errors.New("dsv: Mismatch between number of record in row and columns length") + + // ErrNoOutput define an error when no output file is given to Writer. + ErrNoOutput = errors.New("dsv: No output file is given in config") + + // ErrNotOpen define an error when output file has not been opened + // by Writer. + ErrNotOpen = errors.New("dsv: Output file is not opened") + + // ErrNilReader define an error when Reader object is nil when passed + // to Write function. + ErrNilReader = errors.New("dsv: Reader object is nil") +) + +// +// ReadWriter combine reader and writer. +// +type ReadWriter struct { + Reader + Writer +} + +// +// New create a new ReadWriter object. +// +func New(config string, dataset interface{}) (rw *ReadWriter, e error) { + rw = &ReadWriter{} + + e = rw.Reader.Init(config, dataset) + if e != nil { + return nil, e + } + + e = OpenWriter(&rw.Writer, config) + if e != nil { + return nil, e + } + + return +} + +// +// SetConfigPath of input and output file. +// +func (dsv *ReadWriter) SetConfigPath(dir string) { + dsv.Reader.SetConfigPath(dir) + dsv.Writer.SetConfigPath(dir) +} + +// +// Close reader and writer. 
+// +func (dsv *ReadWriter) Close() (e error) { + e = dsv.Writer.Close() + if e != nil { + return + } + return dsv.Reader.Close() +} diff --git a/lib/dsv/dsv_test.go b/lib/dsv/dsv_test.go new file mode 100644 index 00000000..f4661cdf --- /dev/null +++ b/lib/dsv/dsv_test.go @@ -0,0 +1,96 @@ +// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package dsv + +import ( + "testing" +) + +// +// doInit create read-write object. +// +func doInit(t *testing.T, fcfg string) (rw *ReadWriter, e error) { + // Initialize dsv + rw, e = New(fcfg, nil) + + if nil != e { + t.Fatal(e) + } + + return +} + +// +// TestReadWriter test reading and writing DSV. +// +func TestReadWriter(t *testing.T) { + rw, _ := doInit(t, "testdata/config.dsv") + + doReadWrite(t, &rw.Reader, &rw.Writer, expectation, true) + + e := rw.Close() + if e != nil { + t.Fatal(e) + } + + assertFile(t, rw.GetOutput(), "testdata/expected.dat", true) +} + +// +// TestReadWriter test reading and writing DSV. 
+// +func TestReadWriterAll(t *testing.T) { + rw, _ := doInit(t, "testdata/config.dsv") + + rw.SetMaxRows(-1) + + doReadWrite(t, &rw.Reader, &rw.Writer, expectation, false) + + e := rw.Close() + if e != nil { + t.Fatal(e) + } + + assertFile(t, rw.GetOutput(), "testdata/expected.dat", true) +} + +func TestSimpleReadWrite(t *testing.T) { + fcfg := "testdata/config_simpleread.dsv" + + reader, e := SimpleRead(fcfg, nil) + if e != nil { + t.Fatal(e) + } + + fout := "testdata/output.dat" + fexp := "testdata/expected.dat" + + _, e = SimpleWrite(reader, fcfg) + if e != nil { + t.Fatal(e) + } + + assertFile(t, fexp, fout, true) +} + +func TestSimpleMerge(t *testing.T) { + fcfg1 := "testdata/config_simpleread.dsv" + fcfg2 := "testdata/config_simpleread.dsv" + + reader, e := SimpleMerge(fcfg1, fcfg2, nil, nil) + if e != nil { + t.Fatal(e) + } + + _, e = SimpleWrite(reader, fcfg1) + if e != nil { + t.Fatal(e) + } + + fexp := "testdata/expected_simplemerge.dat" + fout := "testdata/output.dat" + + assertFile(t, fexp, fout, true) +} diff --git a/lib/dsv/dsvinterface.go b/lib/dsv/dsvinterface.go new file mode 100644 index 00000000..6de50d08 --- /dev/null +++ b/lib/dsv/dsvinterface.go @@ -0,0 +1,85 @@ +// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package dsv + +import ( + "io" +) + +// +// SimpleRead provide a shortcut to read data from file using configuration file +// from `fcfg`. +// Return the reader contained data or error if failed. +// Reader object upon returned has been closed, so if one need to read all +// data in it simply set the `MaxRows` to `-1` in config file. 
+// +func SimpleRead(fcfg string, dataset interface{}) ( + reader ReaderInterface, + e error, +) { + reader, e = NewReader(fcfg, dataset) + + if e != nil { + return + } + + _, e = Read(reader) + if e != nil && e != io.EOF { + return nil, e + } + + e = reader.Close() + + return +} + +// +// SimpleWrite provide a shortcut to write data from reader using output metadata +// format and output file defined in file `fcfg`. +// +func SimpleWrite(reader ReaderInterface, fcfg string) (nrows int, e error) { + writer, e := NewWriter(fcfg) + if e != nil { + return + } + + nrows, e = writer.Write(reader) + if e != nil { + return + } + + e = writer.Close() + + return +} + +// +// SimpleMerge provide a shortcut to merge two dsv files using configuration +// files passed in parameters. +// +// One must remember to set, +// - "MaxRows" to -1 to be able to read all rows, in both input configuration, and +// - "DatasetMode" to "columns" to speeding up process. +// +// This function return the merged reader or error if failed. +// +func SimpleMerge(fin1, fin2 string, dataset1, dataset2 interface{}) ( + ReaderInterface, + error, +) { + reader1, e := SimpleRead(fin1, dataset1) + if e != nil { + return nil, e + } + + reader2, e := SimpleRead(fin2, dataset2) + if e != nil { + return nil, e + } + + reader1.MergeColumns(reader2) + + return reader1, nil +} diff --git a/lib/dsv/metadata.go b/lib/dsv/metadata.go new file mode 100644 index 00000000..6e457080 --- /dev/null +++ b/lib/dsv/metadata.go @@ -0,0 +1,163 @@ +// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package dsv + +import ( + "encoding/json" + "log" + "strings" + + "github.com/shuLhan/share/lib/tabula" +) + +// +// Metadata represent on how to parse each column in record. +// +type Metadata struct { + // Name of the column, optional. 
+ Name string `json:"Name"` + // Type of the column, default to "string". + // Valid value are: "string", "integer", "real" + Type string `json:"Type"` + // T type of column in integer. + T int + // Separator for column in record. + Separator string `json:"Separator"` + // LeftQuote define the characters that enclosed the column in the left + // side. + LeftQuote string `json:"LeftQuote"` + // RightQuote define the characters that enclosed the column in the + // right side. + RightQuote string `json:"RightQuote"` + // Skip, if its true this column will be ignored, not saved in reader + // object. Default to false. + Skip bool `json:"Skip"` + // ValueSpace contain the possible value in records + ValueSpace []string `json:"ValueSpace"` +} + +// +// NewMetadata create and return new metadata. +// +func NewMetadata(name, tipe, sep, leftq, rightq string, vs []string) ( + md *Metadata, +) { + md = &Metadata{ + Name: name, + Type: tipe, + Separator: sep, + LeftQuote: leftq, + RightQuote: rightq, + ValueSpace: vs, + } + + md.Init() + + return +} + +// +// Init initialize metadata column, i.e. check and set column type. +// +// If type is unknown it will default to string. +// +func (md *Metadata) Init() { + switch strings.ToUpper(md.Type) { + case "INTEGER", "INT": + md.T = tabula.TInteger + case "REAL": + md.T = tabula.TReal + default: + md.T = tabula.TString + md.Type = "string" + } +} + +// +// GetName return the name of metadata. +// +func (md *Metadata) GetName() string { + return md.Name +} + +// +// GetType return type of metadata. +// +func (md *Metadata) GetType() int { + return md.T +} + +// +// GetTypeName return string representation of type. +// +func (md *Metadata) GetTypeName() string { + return md.Type +} + +// +// GetSeparator return the field separator. +// +func (md *Metadata) GetSeparator() string { + return md.Separator +} + +// +// GetLeftQuote return the string used in the beginning of record value. 
+// +func (md *Metadata) GetLeftQuote() string { + return md.LeftQuote +} + +// +// GetRightQuote return string that end in record value. +// +func (md *Metadata) GetRightQuote() string { + return md.RightQuote +} + +// +// GetSkip return number of rows that will be skipped when reading data. +// +func (md *Metadata) GetSkip() bool { + return md.Skip +} + +// +// GetValueSpace return value space. +// +func (md *Metadata) GetValueSpace() []string { + return md.ValueSpace +} + +// +// IsEqual return true if this metadata equal with other instance, return false +// otherwise. +// +func (md *Metadata) IsEqual(o MetadataInterface) bool { + if md.Name != o.GetName() { + return false + } + if md.Separator != o.GetSeparator() { + return false + } + if md.LeftQuote != o.GetLeftQuote() { + return false + } + if md.RightQuote != o.GetRightQuote() { + return false + } + return true +} + +// +// String yes, it will print it JSON like format. +// +func (md *Metadata) String() string { + r, e := json.MarshalIndent(md, "", "\t") + if nil != e { + log.Print(e) + } + return string(r) +} diff --git a/lib/dsv/metadata_test.go b/lib/dsv/metadata_test.go new file mode 100644 index 00000000..46630c11 --- /dev/null +++ b/lib/dsv/metadata_test.go @@ -0,0 +1,48 @@ +// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package dsv + +import ( + "testing" +) + +func TestMetadataIsEqual(t *testing.T) { + cases := []struct { + in Metadata + out Metadata + result bool + }{ + { + Metadata{ + Name: "A", + Separator: ",", + }, + Metadata{ + Name: "A", + Separator: ",", + }, + true, + }, + { + Metadata{ + Name: "A", + Separator: ",", + }, + Metadata{ + Name: "A", + Separator: ";", + }, + false, + }, + } + + for _, c := range cases { + r := c.in.IsEqual(&c.out) + + if r != c.result { + t.Error("Test failed on ", c.in, c.out) + } + } +} diff --git a/lib/dsv/metadatainterface.go b/lib/dsv/metadatainterface.go new file mode 100644 index 00000000..a0425b2e --- /dev/null +++ b/lib/dsv/metadatainterface.go @@ -0,0 +1,45 @@ +// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package dsv + +// +// MetadataInterface is the interface for field metadata. +// This is to make anyone can extend the DSV library including the metadata. +// +type MetadataInterface interface { + Init() + GetName() string + GetType() int + GetTypeName() string + GetLeftQuote() string + GetRightQuote() string + GetSeparator() string + GetSkip() bool + GetValueSpace() []string + + IsEqual(MetadataInterface) bool +} + +// +// FindMetadata Given a slice of metadata, find `mdin` in the slice which has the +// same name, ignoring metadata where Skip value is true. +// If found, return the index and metadata object of matched metadata name. +// If not found return -1 as index and nil in `mdout`. 
+//
+func FindMetadata(mdin MetadataInterface, mds []MetadataInterface) (
+	idx int,
+	mdout MetadataInterface,
+) {
+	for _, md := range mds {
+		if md.GetName() == mdin.GetName() {
+			// Found: idx is the match position counted over
+			// non-skipped metadata only.
+			return idx, md
+		}
+		if !md.GetSkip() {
+			idx++
+		}
+	}
+	// Not found: return -1 and nil as documented (previously the
+	// accumulated count of non-skipped metadata was returned, which
+	// is indistinguishable from a valid index).
+	return -1, nil
+}
diff --git a/lib/dsv/reader.go b/lib/dsv/reader.go
new file mode 100644
index 00000000..1e78352e
--- /dev/null
+++ b/lib/dsv/reader.go
@@ -0,0 +1,632 @@
+// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package dsv
+
+import (
+	"bufio"
+	"log"
+	"os"
+	"strings"
+
+	"github.com/shuLhan/share/lib/tabula"
+)
+
+const (
+	// DatasetModeROWS is a string representation of output mode rows.
+	DatasetModeROWS = "ROWS"
+	// DatasetModeCOLUMNS is a string representation of output mode columns.
+	DatasetModeCOLUMNS = "COLUMNS"
+	// DatasetModeMATRIX will save data in rows and columns. This mode will
+	// consume more memory that "rows" and "columns" but give greater
+	// flexibility when working with data.
+	DatasetModeMATRIX = "MATRIX"
+)
+
+//
+// Reader hold all configuration, metadata and input data.
+//
+// DSV Reader work like this,
+//
+// (1) Initialize new dsv reader object
+//
+//	dsvReader, e := dsv.NewReader(configfile)
+//
+// (2) Do not forget to check for error ...
+//
+//	if e != nil {
+//		// handle error
+//	}
+//
+// (3) Make sure to close all files after finished
+//
+//	defer dsvReader.Close ()
+//
+// (4) Create loop to read input data
+//
+//	for {
+//		n, e := dsv.Read (dsvReader)
+//
+//		if e == io.EOF {
+//			break
+//		}
+//
+// (4.1) Iterate through rows
+//
+//		for row := range dsvReader.GetDataAsRows() {
+//			// work with row ...
+//		}
+//	}
+//
+// Thats it.
+//
+//
+type Reader struct {
+	// Config define path of configuration file.
+	//
+	// If the configuration located in other directory, e.g. 
+ // "../../config.dsv", and the Input option is set with name only, like + // "input.dat", we assume that its in the same directory where the + // configuration file belong. + Config + // Dataset contains the content of input file after read. + dataset interface{} + // Input file, mandatory. + Input string `json:"Input"` + // Skip n lines from the head. + Skip int `json:"Skip"` + // TrimSpace or not. If its true, before parsing the line, the white + // space in the beginning and end of each input line will be removed, + // otherwise it will leave unmodified. Default is true. + TrimSpace bool `json:"TrimSpace"` + // Rejected is the file name where row that does not fit + // with metadata will be saved. + Rejected string `json:"Rejected"` + // InputMetadata define format for each column in input data. + InputMetadata []Metadata `json:"InputMetadata"` + // MaxRows define maximum row that this reader will read and + // saved in the memory at one read operation. + // If the value is -1, all rows will read. + MaxRows int `json:"MaxRows"` + // DatasetMode define on how do you want the result is saved. There are + // three options: either in "rows", "columns", or "matrix" mode. + // For example, input data file, + // + // a,b,c + // 1,2,3 + // + // "rows" mode is where each line saved in its own slice, resulting + // in Rows: + // + // [a b c] + // [1 2 3] + // + // "columns" mode is where each line saved by columns, resulting in + // Columns: + // + // [a 1] + // [b 2] + // [c 3] + // + // "matrix" mode is where each record saved in their own row and column. + // + DatasetMode string `json:"DatasetMode"` + // fRead is read descriptor. + fRead *os.File + // fReject is reject descriptor. + fReject *os.File + // bufRead is a buffer for working with input file. + bufRead *bufio.Reader + // bufReject is a buffer for working with rejected file. + bufReject *bufio.Writer +} + +// +// NewReader create and initialize new instance of DSV Reader with default values. 
+// +func NewReader(config string, dataset interface{}) (reader *Reader, e error) { + reader = &Reader{ + Input: "", + Skip: 0, + TrimSpace: true, + Rejected: DefaultRejected, + InputMetadata: nil, + MaxRows: DefaultMaxRows, + DatasetMode: DefDatasetMode, + dataset: dataset, + fRead: nil, + fReject: nil, + bufRead: nil, + bufReject: nil, + } + + e = reader.Init(config, dataset) + if e != nil { + return nil, e + } + + return +} + +// +// Init will initialize reader object by +// +// (1) Check if dataset is not empty. +// (2) Read config file. +// (3) Set reader object default value. +// (4) Check if output mode is valid and initialize it if valid. +// (5) Check and initialize metadata and columns attributes. +// (6) Check if Input is name only without path, so we can prefix it with +// config path. +// (7) Open rejected file. +// (8) Open input file. +// +func (reader *Reader) Init(fcfg string, dataset interface{}) (e error) { + // (1) + if dataset == nil { + dataset = reader.GetDataset() + if dataset == nil { + dataset = &tabula.Dataset{} + reader.dataset = dataset + } + } + + // (2) + fcfg = strings.TrimSpace(fcfg) + if fcfg != "" { + e = ConfigOpen(reader, fcfg) + if e != nil { + return e + } + + e = tabula.ReadDatasetConfig(dataset, fcfg) + if e != nil { + return e + } + } + + // (3) + reader.SetDefault() + + // (4) + reader.SetDatasetMode(reader.GetDatasetMode()) + + // (5) + ds := dataset.(tabula.DatasetInterface) + md := reader.GetInputMetadata() + for i := range md { + md[i].Init() + + // Count number of output columns. 
+ if !md[i].GetSkip() { + // add type of metadata to list of type + col := tabula.Column{ + Type: md[i].GetType(), + Name: md[i].GetName(), + ValueSpace: md[i].GetValueSpace(), + } + ds.PushColumn(col) + } + } + + // (6) + reader.SetInput(ConfigCheckPath(reader, reader.GetInput())) + reader.SetRejected(ConfigCheckPath(reader, reader.GetRejected())) + + // (7) + e = reader.OpenRejected() + if nil != e { + return + } + + // (8) + e = reader.OpenInput() + if nil != e { + return + } + + return +} + +// +// SetDefault options for global config and each metadata. +// +func (reader *Reader) SetDefault() { + if "" == strings.TrimSpace(reader.Rejected) { + reader.Rejected = DefaultRejected + } + if 0 == reader.MaxRows { + reader.MaxRows = DefaultMaxRows + } + if "" == strings.TrimSpace(reader.DatasetMode) { + reader.DatasetMode = DefDatasetMode + } + if nil == reader.dataset { + reader.dataset = &tabula.Dataset{} + } +} + +// +// CopyConfig copy configuration from other reader object not including data +// and metadata. +// +func (reader *Reader) CopyConfig(src *Reader) { + reader.ConfigPath = src.GetConfigPath() + reader.Input = src.GetInput() + reader.Skip = src.GetSkip() + reader.TrimSpace = src.IsTrimSpace() + reader.Rejected = src.GetRejected() + reader.MaxRows = src.GetMaxRows() + reader.DatasetMode = src.GetDatasetMode() +} + +// +// GetInput return the input file. +// +func (reader *Reader) GetInput() string { + return reader.Input +} + +// +// SetInput file. +// +func (reader *Reader) SetInput(path string) { + reader.Input = path +} + +// +// GetSkip return number of line that will be skipped. +// +func (reader *Reader) GetSkip() int { + return reader.Skip +} + +// +// SetSkip set number of lines that will be skipped before reading actual data. +// +func (reader *Reader) SetSkip(n int) { + reader.Skip = n +} + +// +// IsTrimSpace return value of TrimSpace option. 
+// +func (reader *Reader) IsTrimSpace() bool { + return reader.TrimSpace +} + +// +// GetRejected return name of rejected file. +// +func (reader *Reader) GetRejected() string { + return reader.Rejected +} + +// +// SetRejected file. +// +func (reader *Reader) SetRejected(path string) { + reader.Rejected = path +} + +// +// AddInputMetadata add new input metadata to reader. +// +func (reader *Reader) AddInputMetadata(md *Metadata) { + reader.InputMetadata = append(reader.InputMetadata, *md) + ds := reader.dataset.(tabula.DatasetInterface) + ds.AddColumn(md.GetType(), md.GetName(), md.GetValueSpace()) +} + +// +// AppendMetadata will append new metadata `md` to list of reader input metadata. +// +func (reader *Reader) AppendMetadata(mdi MetadataInterface) { + md := mdi.(*Metadata) + reader.InputMetadata = append(reader.InputMetadata, *md) +} + +// +// GetInputMetadata return pointer to slice of metadata. +// +func (reader *Reader) GetInputMetadata() []MetadataInterface { + md := make([]MetadataInterface, len(reader.InputMetadata)) + for i := range reader.InputMetadata { + md[i] = &reader.InputMetadata[i] + } + + return md +} + +// +// GetInputMetadataAt return pointer to metadata at index 'idx'. +// +func (reader *Reader) GetInputMetadataAt(idx int) MetadataInterface { + return &reader.InputMetadata[idx] +} + +// +// GetMaxRows return number of maximum rows for reading. +// +func (reader *Reader) GetMaxRows() int { + return reader.MaxRows +} + +// +// SetMaxRows will set maximum rows that will be read from input file. +// +func (reader *Reader) SetMaxRows(max int) { + reader.MaxRows = max +} + +// +// GetDatasetMode return output mode of data. +// +func (reader *Reader) GetDatasetMode() string { + return reader.DatasetMode +} + +// +// SetDatasetMode to `mode`. 
+// +func (reader *Reader) SetDatasetMode(mode string) { + ds := reader.dataset.(tabula.DatasetInterface) + switch strings.ToUpper(mode) { + case DatasetModeROWS: + ds.SetMode(tabula.DatasetModeRows) + case DatasetModeCOLUMNS: + ds.SetMode(tabula.DatasetModeColumns) + case DatasetModeMATRIX: + fallthrough + default: + ds.SetMode(tabula.DatasetModeMatrix) + mode = DatasetModeMATRIX + } + reader.DatasetMode = mode +} + +// +// GetNColumnIn return number of input columns, or number of metadata, including +// column with Skip=true. +// +func (reader *Reader) GetNColumnIn() int { + return len(reader.InputMetadata) +} + +// +// OpenInput open the input file, metadata must have been initialize. +// +func (reader *Reader) OpenInput() (e error) { + reader.fRead, e = os.OpenFile(reader.Input, os.O_RDONLY, 0600) + if nil != e { + return e + } + + reader.bufRead = bufio.NewReader(reader.fRead) + + // Skip lines + if reader.GetSkip() > 0 { + e = reader.SkipLines() + + if nil != e { + return + } + } + + return nil +} + +// +// OpenRejected open rejected file, for saving unparseable line. +// +func (reader *Reader) OpenRejected() (e error) { + reader.fReject, e = os.OpenFile(reader.Rejected, + os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0600) + if nil != e { + return e + } + + reader.bufReject = bufio.NewWriter(reader.fReject) + + return nil +} + +// +// Open input and rejected file. +// +func (reader *Reader) Open() (e error) { + // do not let file descriptor leaked + e = reader.Close() + if e != nil { + return + } + + e = reader.OpenInput() + if e != nil { + return + } + + e = reader.OpenRejected() + + return +} + +// +// SkipLines skip parsing n lines from input file. +// The n is defined in the attribute "Skip" +// +func (reader *Reader) SkipLines() (e error) { + for i := 0; i < reader.Skip; i++ { + _, e = reader.ReadLine() + + if nil != e { + log.Print("dsv: ", e) + return + } + } + return +} + +// +// Reset all variables for next read operation. 
Number of rows will be 0, and +// Rows will be empty again. +// +func (reader *Reader) Reset() (e error) { + e = reader.Flush() + if e != nil { + return + } + e = reader.dataset.(tabula.DatasetInterface).Reset() + return +} + +// +// Flush all output buffer. +// +func (reader *Reader) Flush() error { + return reader.bufReject.Flush() +} + +// +// ReadLine will read one line from input file. +// +func (reader *Reader) ReadLine() (line []byte, e error) { + line, e = reader.bufRead.ReadBytes(DefEOL) + + if e == nil { + // remove EOL + line = line[:len(line)-1] + } + + return +} + +// +// FetchNextLine read the next line and combine it with the `lastline`. +// +func (reader *Reader) FetchNextLine(lastline []byte) (line []byte, e error) { + line, e = reader.ReadLine() + + lastline = append(lastline, DefEOL) + lastline = append(lastline, line...) + + return lastline, e +} + +// +// Reject the line and save it to the reject file. +// +func (reader *Reader) Reject(line []byte) (int, error) { + return reader.bufReject.Write(line) +} + +// +// deleteEmptyRejected if rejected file is empty, delete it. +// +func (reader *Reader) deleteEmptyRejected() { + finfo, e := os.Stat(reader.Rejected) + if e != nil { + return + } + + if finfo.Size() >= 0 { + _ = os.Remove(reader.Rejected) + } +} + +// +// Close all open descriptors. +// +func (reader *Reader) Close() (e error) { + if nil != reader.bufReject { + e = reader.bufReject.Flush() + if e != nil { + return + } + } + if nil != reader.fReject { + e = reader.fReject.Close() + if e != nil { + return + } + } + + reader.deleteEmptyRejected() + + if nil != reader.fRead { + e = reader.fRead.Close() + } + return +} + +// +// IsEqual compare only the configuration and metadata with other instance. 
+// +func (reader *Reader) IsEqual(other *Reader) bool { + if reader == other { + return true + } + if reader.Input != other.Input { + return false + } + + l, r := len(reader.InputMetadata), len(other.InputMetadata) + + if l != r { + return false + } + + for a := 0; a < l; a++ { + if !reader.InputMetadata[a].IsEqual(&other.InputMetadata[a]) { + return false + } + } + + return true +} + +// +// GetDataset return reader dataset. +// +func (reader *Reader) GetDataset() interface{} { + return reader.dataset +} + +// +// MergeColumns append metadata and columns from another reader if not exist in +// current metadata set. +// +func (reader *Reader) MergeColumns(other ReaderInterface) { + for _, md := range other.GetInputMetadata() { + if md.GetSkip() { + continue + } + + // Check if the same metadata name exist in current dataset. + found := false + for _, lmd := range reader.GetInputMetadata() { + if lmd.GetName() == md.GetName() { + found = true + break + } + } + + if found { + continue + } + + reader.AppendMetadata(md) + } + + reader.dataset.(tabula.DatasetInterface).MergeColumns( + other.GetDataset().(tabula.DatasetInterface)) +} + +// +// MergeRows append rows from another reader. +// +func (reader *Reader) MergeRows(other *Reader) { + reader.dataset.(tabula.DatasetInterface).MergeRows( + other.GetDataset().(tabula.DatasetInterface)) +} diff --git a/lib/dsv/reader_test.go b/lib/dsv/reader_test.go new file mode 100644 index 00000000..d8d724b4 --- /dev/null +++ b/lib/dsv/reader_test.go @@ -0,0 +1,601 @@ +// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package dsv + +import ( + "fmt" + "io" + "strings" + "testing" + + "github.com/shuLhan/share/lib/tabula" + "github.com/shuLhan/share/lib/test" +) + +var jsonSample = []string{ + `{}`, + `{ + "Input" :"testdata/input.dat" + }`, + `{ + "Input" :"testdata/input.dat" + }`, + `{ + "Input" :"testdata/input.dat" + , "InputMetadata" : + [{ + "Name" :"A" + , "Separator" :"," + },{ + "Name" :"B" + , "Separator" :";" + }] + }`, + `{ + "Input" :"testdata/input.dat" + , "Skip" :1 + , "MaxRows" :1 + , "InputMetadata" : + [{ + "Name" :"id" + , "Separator" :";" + , "Type" :"integer" + },{ + "Name" :"name" + , "Separator" :"-" + , "LeftQuote" :"\"" + , "RightQuote" :"\"" + },{ + "Name" :"value" + , "Separator" :";" + , "LeftQuote" :"[[" + , "RightQuote" :"]]" + },{ + "Name" :"integer" + , "Type" :"integer" + , "Separator" :";" + },{ + "Name" :"real" + , "Type" :"real" + }] + }`, + `{ + "Input" :"testdata/input.dat" + , "Skip" :1 + , "MaxRows" :1 + , "InputMetadata" : + [{ + "Name" :"id" + },{ + "Name" :"editor" + },{ + "Name" :"old_rev_id" + },{ + "Name" :"new_rev_id" + },{ + "Name" :"diff_url" + },{ + "Name" :"edit_time" + },{ + "Name" :"edit_comment" + },{ + "Name" :"article_id" + },{ + "Name" :"article_title" + }] + }`, +} + +var readers = []*Reader{ + {}, + { + Input: "testdata/input.dat", + }, + { + Input: "test-another.dsv", + }, + { + Input: "testdata/input.dat", + InputMetadata: []Metadata{ + { + Name: "A", + Separator: ",", + }, + { + Name: "B", + Separator: ";", + }, + }, + }, +} + +// +// TestReaderNoInput will print error that the input is not defined. +// +func TestReaderNoInput(t *testing.T) { + dsvReader := &Reader{} + + e := ConfigParse(dsvReader, []byte(jsonSample[0])) + + if nil != e { + t.Fatal(e) + } + + e = dsvReader.Init("", nil) + + if nil == e { + t.Fatal("TestReaderNoInput: failed, should return non nil!") + } +} + +// +// TestConfigParse test parsing metadata. 
+// +func TestConfigParse(t *testing.T) { + cases := []struct { + in string + out *Reader + }{ + { + jsonSample[1], + readers[1], + }, + { + jsonSample[3], + readers[3], + }, + } + + dsvReader := &Reader{} + + for _, c := range cases { + e := ConfigParse(dsvReader, []byte(c.in)) + + if e != nil { + t.Fatal(e) + } + if !dsvReader.IsEqual(c.out) { + t.Fatal("Test failed on ", c.in) + } + } +} + +func TestReaderIsEqual(t *testing.T) { + cases := []struct { + in *Reader + out *Reader + result bool + }{ + { + readers[1], + &Reader{ + Input: "testdata/input.dat", + }, + true, + }, + { + readers[1], + readers[2], + false, + }, + } + + var r bool + + for _, c := range cases { + r = c.in.IsEqual(c.out) + + if r != c.result { + t.Fatal("Test failed on equality between ", c.in, + "\n and ", c.out) + } + } +} + +// +// doRead test reading the DSV data. +// +func doRead(t *testing.T, dsvReader *Reader, exp []string) { + i := 0 + var n int + var e error + + for { + n, e = Read(dsvReader) + + if n > 0 { + r := fmt.Sprint(dsvReader. + GetDataset().(tabula.DatasetInterface). + GetDataAsRows()) + + test.Assert(t, "", exp[i], r, true) + + i++ + } else if e == io.EOF { + // EOF + break + } + } +} + +// +// TestReader test reading. +// +func TestReaderRead(t *testing.T) { + dsvReader := &Reader{} + + e := ConfigParse(dsvReader, []byte(jsonSample[4])) + + if nil != e { + t.Fatal(e) + } + + e = dsvReader.Init("", nil) + if nil != e { + t.Fatal(e) + } + + doRead(t, dsvReader, expectation) + + e = dsvReader.Close() + if e != nil { + t.Fatal(e) + } +} + +// +// TestReaderOpen real example from the start. 
+// +func TestReaderOpen(t *testing.T) { + dsvReader, e := NewReader("testdata/config.dsv", nil) + if nil != e { + t.Fatal(e) + } + + doRead(t, dsvReader, expectation) + + e = dsvReader.Close() + if e != nil { + t.Fatal(e) + } +} + +func TestDatasetMode(t *testing.T) { + var e error + var config = []string{`{ + "Input" :"testdata/input.dat" + , "DatasetMode" :"row" + }`, `{ + "Input" :"testdata/input.dat" + , "DatasetMode" :"rows" + }`, `{ + "Input" :"testdata/input.dat" + , "DatasetMode" :"columns" + }`} + + var exps = []struct { + status bool + value string + }{{ + false, + string(config[0]), + }, { + true, + string(config[1]), + }, { + true, + string(config[2]), + }} + + reader := &Reader{} + + for k, v := range exps { + e = ConfigParse(reader, []byte(config[k])) + + if e != nil { + t.Fatal(e) + } + + e = reader.Init("", nil) + if e != nil { + if v.status { + t.Fatal(e) + } + } + } +} + +func TestReaderToColumns(t *testing.T) { + reader := &Reader{} + + e := ConfigParse(reader, []byte(jsonSample[4])) + if nil != e { + t.Fatal(e) + } + + e = reader.Init("", nil) + if nil != e { + t.Fatal(e) + } + + reader.SetDatasetMode(DatasetModeCOLUMNS) + + var n, i int + for { + n, e = Read(reader) + + if n > 0 { + ds := reader.GetDataset().(tabula.DatasetInterface) + ds.TransposeToRows() + + r := fmt.Sprint(ds.GetData()) + + test.Assert(t, "", expectation[i], r, true) + + i++ + } else if e == io.EOF { + // EOF + break + } + } +} + +// +// TestReaderSkip will test the 'Skip' option in Metadata. 
+// +func TestReaderSkip(t *testing.T) { + dsvReader, e := NewReader("testdata/config_skip.dsv", nil) + if nil != e { + t.Fatal(e) + } + + doRead(t, dsvReader, expSkip) + + e = dsvReader.Close() + if e != nil { + t.Fatal(e) + } +} + +func TestTransposeToColumns(t *testing.T) { + reader, e := NewReader("testdata/config_skip.dsv", nil) + if nil != e { + t.Fatal(e) + } + + reader.SetMaxRows(-1) + + _, e = Read(reader) + + if e != io.EOF { + t.Fatal(e) + } + + ds := reader.GetDataset().(tabula.DatasetInterface) + ds.TransposeToColumns() + + exp := fmt.Sprint(expSkipColumnsAll) + + columns := ds.GetDataAsColumns() + + got := fmt.Sprint(*columns) + + test.Assert(t, "", exp, got, true) + + e = reader.Close() + if e != nil { + t.Fatal(e) + } +} + +func TestSortColumnsByIndex(t *testing.T) { + reader, e := NewReader("testdata/config_skip.dsv", nil) + if nil != e { + t.Fatal(e) + } + + reader.SetMaxRows(-1) + + _, e = Read(reader) + if e != io.EOF { + t.Fatal(e) + } + + // reverse the data + var idxReverse []int + var expReverse []string + + for x := len(expSkip) - 1; x >= 0; x-- { + idxReverse = append(idxReverse, x) + expReverse = append(expReverse, expSkip[x]) + } + + ds := reader.GetDataset().(tabula.DatasetInterface) + + tabula.SortColumnsByIndex(ds, idxReverse) + + exp := strings.Join(expReverse, "") + got := fmt.Sprint(ds.GetDataAsRows()) + + test.Assert(t, "", exp, got, true) + + exp = "[" + strings.Join(expSkipColumnsAllRev, " ") + "]" + + columns := ds.GetDataAsColumns() + + got = fmt.Sprint(*columns) + + test.Assert(t, "", exp, got, true) + + e = reader.Close() + if e != nil { + t.Fatal(e) + } +} + +func TestSplitRowsByValue(t *testing.T) { + reader, e := NewReader("testdata/config.dsv", nil) + if nil != e { + t.Fatal(e) + } + + reader.SetMaxRows(256) + + _, e = Read(reader) + + if e != nil && e != io.EOF { + t.Fatal(e) + } + + ds := reader.GetDataset().(tabula.DatasetInterface) + splitL, splitR, e := tabula.SplitRowsByValue(ds, 0, 6) + + if e != nil { + 
t.Fatal(e) + } + + // test left split + exp := "" + for x := 0; x < 4; x++ { + exp += expectation[x] + } + + got := fmt.Sprint(splitL.GetDataAsRows()) + + test.Assert(t, "", exp, got, true) + + // test right split + exp = "" + for x := 4; x < len(expectation); x++ { + exp += expectation[x] + } + + got = fmt.Sprint(splitR.GetDataAsRows()) + + test.Assert(t, "", exp, got, true) + + e = reader.Close() + if e != nil { + t.Fatal(e) + } +} + +// +// testWriteOutput will write merged reader and check with expected file output. +// +func testWriteOutput(t *testing.T, r *Reader, outfile, expfile string) { + + writer, e := NewWriter("") + if e != nil { + t.Fatal(e) + } + + e = writer.OpenOutput(outfile) + + if e != nil { + t.Fatal(e) + } + + sep := "\t" + ds := r.GetDataset().(tabula.DatasetInterface) + + _, e = writer.WriteRawDataset(ds, &sep) + if e != nil { + t.Fatal(e) + } + + e = writer.Close() + if e != nil { + t.Fatal(e) + } + + assertFile(t, outfile, expfile, true) +} + +func TestMergeColumns(t *testing.T) { + reader1, e := NewReader("testdata/config.dsv", nil) + if nil != e { + t.Fatal(e) + } + + reader2, e := NewReader("testdata/config_skip.dsv", nil) + if nil != e { + t.Fatal(e) + } + + reader1.SetMaxRows(-1) + reader2.SetMaxRows(-1) + + _, e = Read(reader1) + if e != io.EOF { + t.Fatal(e) + } + + _, e = Read(reader2) + if e != io.EOF { + t.Fatal(e) + } + + e = reader1.Close() + if e != nil { + t.Fatal(e) + } + + e = reader2.Close() + if e != nil { + t.Fatal(e) + } + + reader1.InputMetadata[len(reader1.InputMetadata)-1].Separator = ";" + + reader1.MergeColumns(reader2) + + outfile := "testdata/output_merge_columns.dat" + expfile := "testdata/expected_merge_columns.dat" + + testWriteOutput(t, reader1, outfile, expfile) +} + +func TestMergeRows(t *testing.T) { + reader1, e := NewReader("testdata/config.dsv", nil) + if nil != e { + t.Fatal(e) + } + + reader2, e := NewReader("testdata/config_skip.dsv", nil) + if nil != e { + t.Fatal(e) + } + + reader1.SetMaxRows(-1) + 
reader2.SetMaxRows(-1) + + _, e = Read(reader1) + if e != io.EOF { + t.Fatal(e) + } + + _, e = Read(reader2) + if e != io.EOF { + t.Fatal(e) + } + + e = reader1.Close() + if e != nil { + t.Fatal(e) + } + + e = reader2.Close() + if e != nil { + t.Fatal(e) + } + + reader1.MergeRows(reader2) + + outfile := "testdata/output_merge_rows.dat" + expfile := "testdata/expected_merge_rows.dat" + + testWriteOutput(t, reader1, outfile, expfile) +} diff --git a/lib/dsv/readererror.go b/lib/dsv/readererror.go new file mode 100644 index 00000000..6bd7616e --- /dev/null +++ b/lib/dsv/readererror.go @@ -0,0 +1,52 @@ +// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package dsv + +import ( + "fmt" +) + +const ( + _ = iota + // EReadMissLeftQuote read error when no left-quote found on line. + EReadMissLeftQuote + // EReadMissRightQuote read error when no right-quote found on line. + EReadMissRightQuote + // EReadMissSeparator read error when no separator found on line. + EReadMissSeparator + // EReadLine error when reading line from file. + EReadLine + // EReadEOF error which indicated end-of-file. + EReadEOF + // ETypeConversion error when converting type from string to numeric or + // vice versa. + ETypeConversion +) + +// +// ReaderError to handle error data and message. +// +type ReaderError struct { + // T define type of error. + T int + // Func where error happened + Func string + // What cause the error? + What string + // Line define the line which cause error + Line string + // Pos character position which cause error + Pos int + // N line number + N int +} + +// +// Error to string. 
+// +func (e *ReaderError) Error() string { + return fmt.Sprintf("dsv.Reader.%-20s [%d:%d]: %-30s data:|%s|", e.Func, e.N, + e.Pos, e.What, e.Line) +} diff --git a/lib/dsv/readerinterface.go b/lib/dsv/readerinterface.go new file mode 100644 index 00000000..b7bc489f --- /dev/null +++ b/lib/dsv/readerinterface.go @@ -0,0 +1,434 @@ +// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package dsv + +import ( + "bytes" + "fmt" + "io" + "os" + + libbytes "github.com/shuLhan/share/lib/bytes" + "github.com/shuLhan/share/lib/tabula" +) + +// +// ReaderInterface is the interface for reading DSV file. +// +type ReaderInterface interface { + ConfigInterface + AddInputMetadata(*Metadata) + AppendMetadata(MetadataInterface) + GetInputMetadata() []MetadataInterface + GetInputMetadataAt(idx int) MetadataInterface + GetMaxRows() int + SetMaxRows(max int) + GetDatasetMode() string + SetDatasetMode(mode string) + GetNColumnIn() int + GetInput() string + SetInput(path string) + GetRejected() string + SetRejected(path string) + GetSkip() int + SetSkip(n int) + IsTrimSpace() bool + SetDefault() + OpenInput() error + OpenRejected() error + SkipLines() error + + Reset() error + Flush() error + ReadLine() ([]byte, error) + FetchNextLine([]byte) ([]byte, error) + Reject(line []byte) (int, error) + Close() error + + GetDataset() interface{} + MergeColumns(ReaderInterface) +} + +// +// Read row from input file. 
+// +func Read(reader ReaderInterface) (n int, e error) { + var ( + row *tabula.Row + line []byte + linenum int + eRead *ReaderError + ) + maxrows := reader.GetMaxRows() + + e = reader.Reset() + if e != nil { + return + } + + dataset := reader.GetDataset().(tabula.DatasetInterface) + + // Loop until we reached MaxRows (> 0) or when all rows has been + // read (= -1) + for { + row, line, linenum, eRead = ReadRow(reader, linenum) + if nil == eRead { + dataset.PushRow(row) + + n++ + if maxrows > 0 && n >= maxrows { + break + } + continue + } + + if eRead.T&EReadEOF == EReadEOF { + _ = reader.Flush() + e = io.EOF + return + } + + eRead.N = linenum + fmt.Fprintf(os.Stderr, "%s\n", eRead) + + // If error, save the rejected line. + line = append(line, DefEOL) + + _, e = reader.Reject(line) + if e != nil { + break + } + } + + // remember to flush if we have rejected rows. + e = reader.Flush() + + return n, e +} + +// +// parsingLeftQuote parse the left-quote string from line. +// +func parsingLeftQuote(lq, line []byte, startAt int) ( + p int, eRead *ReaderError, +) { + p = startAt + + // parsing until we found left quote token + p, found := libbytes.SkipAfterToken(line, lq, p, false) + + if found { + return p, nil + } + + eRead = &ReaderError{ + T: EReadMissLeftQuote, + Func: "parsingLeftQuote", + What: "Missing left-quote '" + string(lq) + "'", + Line: string(line), + Pos: p, + N: 0, + } + + return p, eRead +} + +// +// parsingSeparator parsing the line until we found the separator. +// +// Return the data and index of last parsed line, or error if separator is not +// found or not match with specification. 
+//
+func parsingSeparator(sep, line []byte, startAt int) (
+	v []byte, p int, eRead *ReaderError,
+) {
+	p = startAt
+
+	v, p, found := libbytes.CutUntilToken(line, sep, p, false)
+
+	if found {
+		return v, p, nil
+	}
+
+	eRead = &ReaderError{
+		// Set the error type so callers can classify the failure
+		// (it was previously left at the zero value).
+		T:    EReadMissSeparator,
+		Func: "parsingSeparator",
+		What: "Missing separator '" + string(sep) + "'",
+		Line: string(line),
+		Pos:  p,
+		N:    0,
+	}
+
+	return v, p, eRead
+}
+
+//
+// parsingRightQuote parsing the line until we found the right quote or separator.
+//
+// Return the data and index of last parsed line, or error if right-quote is not
+// found or not match with specification.
+//
+func parsingRightQuote(reader ReaderInterface, rq, line []byte, startAt int) (
+	v, lines []byte, p int, eRead *ReaderError,
+) {
+	var e error
+	var content []byte
+	p = startAt
+	var found bool
+
+	// (2.2.1)
+	for {
+		content, p, found = libbytes.CutUntilToken(line, rq, p, true)
+
+		v = append(v, content...)
+
+		if found {
+			return v, line, p, nil
+		}
+
+		// EOL before finding right-quote.
+		// Read and join with the next line.
+		line, e = reader.FetchNextLine(line)
+
+		if e != nil {
+			break
+		}
+	}
+
+	eRead = &ReaderError{
+		T:    EReadMissRightQuote,
+		Func: "parsingRightQuote",
+		What: "Missing right-quote '" + string(rq) + "'",
+		Line: string(line),
+		Pos:  p,
+		N:    0,
+	}
+
+	if e == io.EOF {
+		// OR the EOF flag into the type so Read() can detect it
+		// via "eRead.T&EReadEOF == EReadEOF". The previous "&="
+		// cleared T (2 & 5 == 0) and made EOF undetectable here.
+		eRead.T |= EReadEOF
+	}
+
+	return v, line, p, eRead
+}
+
+//
+// parsingSkipSeparator parse until we found separator or EOF
+//
+func parsingSkipSeparator(sep, line []byte, startAt int) (
+	p int, eRead *ReaderError,
+) {
+	p = startAt
+
+	p, found := libbytes.SkipAfterToken(line, sep, p, false)
+
+	if found {
+		return p, nil
+	}
+
+	eRead = &ReaderError{
+		T:    EReadMissSeparator,
+		Func: "parsingSkipSeparator",
+		What: "Missing separator '" + string(sep) + "'",
+		Line: string(line),
+		Pos:  p,
+		N:    0,
+	}
+
+	return p, eRead
+}
+
+//
+// parsingSkipSpace skip all space starting from `startAt`. 
+// +func parsingSkipSpace(line []byte, startAt int) (p int) { + linelen := len(line) + + for p = startAt; p < linelen; p++ { + if line[p] == ' ' || line[p] == '\t' || line[p] == '\n' || + line[p] == '\r' { + continue + } + break + } + return +} + +// +// ParseLine parse a line containing records. The output is array of record +// (or single row). +// +// This is how the algorithm works +// (1) create n slice of record, where n is number of column metadata +// (2) for each metadata +// (2.0) Check if the next sequence matched with separator. +// (2.0.1) If its match, create empty record +// (2.1) If using left quote, skip until we found left-quote +// (2.2) If using right quote, append byte to buffer until right-quote +// (2.2.1) If using separator, skip until separator +// (2.3) If using separator, append byte to buffer until separator +// (2.4) else append all byte to buffer. +// (3) save buffer to record +// +func ParseLine(reader ReaderInterface, line []byte) ( + prow *tabula.Row, eRead *ReaderError, +) { + p := 0 + rIdx := 0 + inputMd := reader.GetInputMetadata() + row := make(tabula.Row, 0) + + for _, md := range inputMd { + lq := md.GetLeftQuote() + rq := md.GetRightQuote() + sep := md.GetSeparator() + v := []byte{} + + // (2.0) + if sep != "" && sep != lq { + match := libbytes.IsTokenAt(line, []byte(sep), p) + + // (2.0.1) + if match { + p += len(sep) + goto empty + } + } + + // (2.1) + if lq != "" { + p, eRead = parsingLeftQuote([]byte(lq), line, p) + + if eRead != nil { + return + } + } + + // (2.2) + if rq != "" { + v, line, p, eRead = parsingRightQuote(reader, []byte(rq), + line, p) + + if eRead != nil { + return + } + + if sep != "" { + p, eRead = parsingSkipSeparator([]byte(sep), + line, p) + + if eRead != nil { + return + } + + // Handle multi space if separator is a single + // space. + if sep == " " { + p = parsingSkipSpace(line, p) + } + } + } else { + if sep != "" { + // Skip space at beginning if separator is a + // single space. 
+ if sep == " " { + p = parsingSkipSpace(line, p) + } + + v, p, eRead = parsingSeparator([]byte(sep), + line, p) + + if eRead != nil { + return + } + + // Handle multi space if separator is a single + // space. + if sep == " " { + p = parsingSkipSpace(line, p) + } + } else { + v = line[p:] + p = p + len(line) + } + } + + if md.GetSkip() { + continue + } + empty: + r, e := tabula.NewRecordBy(string(v), md.GetType()) + + if nil != e { + msg := fmt.Sprintf("md %s: Type convertion error from %q to %s", + md.GetName(), string(v), md.GetTypeName()) + + return nil, &ReaderError{ + T: ETypeConversion, + Func: "ParseLine", + What: msg, + Line: string(line), + Pos: p, + N: 0, + } + } + + row = append(row, r) + rIdx++ + } + + return &row, nil +} + +// +// ReadRow read one line at a time until we get one row or error when parsing the +// data. +// +func ReadRow(reader ReaderInterface, linenum int) ( + row *tabula.Row, + line []byte, + n int, + eRead *ReaderError, +) { + var e error + n = linenum + + // Read one line, skip empty line. 
+ for { + line, e = reader.ReadLine() + n++ + + if e != nil { + goto err + } + + // check for empty line + linetrimed := bytes.TrimSpace(line) + + if len(linetrimed) > 0 { + break + } + } + + if reader.IsTrimSpace() { + line = bytes.TrimSpace(line) + } + + row, eRead = ParseLine(reader, line) + + return row, line, n, eRead + +err: + eRead = &ReaderError{ + Func: "ReadRow", + What: fmt.Sprint(e), + } + + if e == io.EOF { + eRead.T = EReadEOF + } else { + eRead.T = EReadLine + } + + return nil, line, n, eRead +} diff --git a/lib/dsv/testdata/claset.dsv b/lib/dsv/testdata/claset.dsv new file mode 100644 index 00000000..98082e09 --- /dev/null +++ b/lib/dsv/testdata/claset.dsv @@ -0,0 +1,7 @@ +{ + "Input" :"input.dat" +, "Rejected" :"rejected.dat" +, "Skip" :1 +, "MaxRows" :2 +, "ClassIndex" :3 +} diff --git a/lib/dsv/testdata/config.dsv b/lib/dsv/testdata/config.dsv new file mode 100644 index 00000000..b8b8bd52 --- /dev/null +++ b/lib/dsv/testdata/config.dsv @@ -0,0 +1,50 @@ +{ + "Input" :"input.dat" +, "Rejected" :"rejected.dat" +, "Skip" :1 +, "MaxRows" :1 +, "InputMetadata" : + [{ + "Name" :"id" + , "Separator" :";" + , "Type" :"integer" + },{ + "Name" :"name" + , "Separator" :"-" + , "LeftQuote" :"\"" + , "RightQuote" :"\"" + },{ + "Name" :"value" + , "Separator" :";" + , "LeftQuote" :"[[" + , "RightQuote" :"]]" + },{ + "Name" :"integer" + , "Type" :"integer" + , "Separator" :";" + },{ + "Name" :"real" + , "Type" :"real" + }] +, "Output" :"output.dat" +, "OutputMetadata": + [{ + "Name" :"id" + , "LeftQuote" :"ID " + , "Separator" :"/" + },{ + "Name" :"name" + , "RightQuote" :"#" + , "Separator" :"\t" + },{ + "Name" :"value" + , "Separator" :";" + , "LeftQuote" :"{{" + , "RightQuote" :"}}" + },{ + "Name" :"integer" + , "Separator" :";" + },{ + "Name" :"real" + }] +} diff --git a/lib/dsv/testdata/config_simpleread.dsv b/lib/dsv/testdata/config_simpleread.dsv new file mode 100644 index 00000000..5a877a4c --- /dev/null +++ b/lib/dsv/testdata/config_simpleread.dsv @@ 
-0,0 +1,50 @@ +{ + "Input" :"input.dat" +, "Rejected" :"rejected.dat" +, "Skip" :1 +, "MaxRows" :-1 +, "InputMetadata" : + [{ + "Name" :"id" + , "Separator" :";" + , "Type" :"integer" + },{ + "Name" :"name" + , "Separator" :"-" + , "LeftQuote" :"\"" + , "RightQuote" :"\"" + },{ + "Name" :"value" + , "Separator" :";" + , "LeftQuote" :"[[" + , "RightQuote" :"]]" + },{ + "Name" :"integer" + , "Type" :"integer" + , "Separator" :";" + },{ + "Name" :"real" + , "Type" :"real" + }] +, "Output" :"output.dat" +, "OutputMetadata": + [{ + "Name" :"id" + , "LeftQuote" :"ID " + , "Separator" :"/" + },{ + "Name" :"name" + , "RightQuote" :"#" + , "Separator" :"\t" + },{ + "Name" :"value" + , "Separator" :";" + , "LeftQuote" :"{{" + , "RightQuote" :"}}" + },{ + "Name" :"integer" + , "Separator" :";" + },{ + "Name" :"real" + }] +} diff --git a/lib/dsv/testdata/config_skip.dsv b/lib/dsv/testdata/config_skip.dsv new file mode 100644 index 00000000..d3a3aa13 --- /dev/null +++ b/lib/dsv/testdata/config_skip.dsv @@ -0,0 +1,54 @@ +{ + "Input" :"input.dat" +, "Rejected" :"rejected.dat" +, "Skip" :1 +, "MaxRows" :1 +, "InputMetadata" : + [{ + "Name" :"id" + , "Separator" :";" + , "Type" :"integer" + , "Skip" :true + },{ + "Name" :"name" + , "Separator" :"-" + , "LeftQuote" :"\"" + , "RightQuote" :"\"" + },{ + "Name" :"value" + , "Separator" :";" + , "LeftQuote" :"[[" + , "RightQuote" :"]]" + },{ + "Name" :"integer" + , "Type" :"integer" + , "Separator" :";" + },{ + "Name" :"real" + , "Type" :"real" + }] +, "Output" :"testdata/output_skip.dat" +, "OutputMetadata": + [{ + "Name" :"real" + , "Separator" :";" + },{ + "Name" :"integer" + , "Separator" :";" + },{ + "Name" :"value" + , "Separator" :";" + , "LeftQuote" :"{{" + , "RightQuote" :"}}" + },{ + "Name" :"name" + , "RightQuote" :"#" + },{ + "Name" :"id" + , "LeftQuote" :"ID " + , "Separator" :"/" + },{ + "Name" :"invalid" + , "Separator" :";" + }] +} diff --git a/lib/dsv/testdata/expected.dat b/lib/dsv/testdata/expected.dat new file mode 
100644 index 00000000..6aeb2f94 --- /dev/null +++ b/lib/dsv/testdata/expected.dat @@ -0,0 +1,11 @@ +ID 1/A-B# {{AB}};1;0.1 +ID 2/A-B-C# {{BCD}};2;0.02 +ID 3/A;B-C,D# {{A;B C,D}};3;0.003 +ID 4/A;B-C,D# {{A;B C D}};4;0.0004 +ID 6/# {{}};6;0.000006 +ID 8/ok# {{missing right-quote];8;0.00000008 +9;"ok"-[[ok}};9;0.000000009 +ID 10/test# {{integer}};10;0.101 +ID 12/test# {{real}};123456789;0.123456789 +ID 13/string with# {{string with}};13;13 +ID 14/string with" quote# {{string with]] escape}};14;14 diff --git a/lib/dsv/testdata/expected_merge_columns.dat b/lib/dsv/testdata/expected_merge_columns.dat new file mode 100644 index 00000000..5c279bab --- /dev/null +++ b/lib/dsv/testdata/expected_merge_columns.dat @@ -0,0 +1,22 @@ +1 A-B AB 1 0.1 +2 A-B-C BCD 2 0.02 +3 A;B-C,D A;B C,D 3 0.003 +4 A;B-C,D A;B C D 4 0.0004 +6 6 0.000006 +8 ok missing right-quote];8;0.00000008 +9;"ok"-[[ok 9 0.000000009 +10 test integer 10 0.101 +12 test real 123456789 0.123456789 +13 string with string with 13 13 +14 string with" quote string with]] escape 14 14 + A-B AB 1 0.1 + A-B-C BCD 2 0.02 + A;B-C,D A;B C,D 3 0.003 + A;B-C,D A;B C D 4 0.0004 + 6 0.000006 + ok missing right-quote];8;0.00000008 +9;"ok"-[[ok 9 0.000000009 + test integer 10 0.101 + test real 123456789 0.123456789 + string with string with 13 13 + string with" quote string with]] escape 14 14 diff --git a/lib/dsv/testdata/expected_merge_rows.dat b/lib/dsv/testdata/expected_merge_rows.dat new file mode 100644 index 00000000..a5880c76 --- /dev/null +++ b/lib/dsv/testdata/expected_merge_rows.dat @@ -0,0 +1,22 @@ +1 A-B AB 1 0.1 +2 A-B-C BCD 2 0.02 +3 A;B-C,D A;B C,D 3 0.003 +4 A;B-C,D A;B C D 4 0.0004 +6 6 0.000006 +8 ok missing right-quote];8;0.00000008 +9;"ok"-[[ok 9 0.000000009 +10 test integer 10 0.101 +12 test real 123456789 0.123456789 +13 string with string with 13 13 +14 string with" quote string with]] escape 14 14 +A-B AB 1 0.1 +A-B-C BCD 2 0.02 +A;B-C,D A;B C,D 3 0.003 +A;B-C,D A;B C D 4 0.0004 + 6 0.000006 +ok missing 
right-quote];8;0.00000008 +9;"ok"-[[ok 9 0.000000009 +test integer 10 0.101 +test real 123456789 0.123456789 +string with string with 13 13 +string with" quote string with]] escape 14 14 diff --git a/lib/dsv/testdata/expected_simplemerge.dat b/lib/dsv/testdata/expected_simplemerge.dat new file mode 100644 index 00000000..481c7f80 --- /dev/null +++ b/lib/dsv/testdata/expected_simplemerge.dat @@ -0,0 +1,22 @@ +ID 1/A-B# {{AB}};1;0.1 +ID 2/A-B-C# {{BCD}};2;0.02 +ID 3/A;B-C,D# {{A;B C,D}};3;0.003 +ID 4/A;B-C,D# {{A;B C D}};4;0.0004 +ID 6/# {{}};6;0.000006 +ID 8/ok# {{missing right-quote];8;0.00000008 +9;"ok"-[[ok}};9;0.000000009 +ID 10/test# {{integer}};10;0.101 +ID 12/test# {{real}};123456789;0.123456789 +ID 13/string with# {{string with}};13;13 +ID 14/string with" quote# {{string with]] escape}};14;14 +ID 1/A-B# {{AB}};1;0.1 +ID 2/A-B-C# {{BCD}};2;0.02 +ID 3/A;B-C,D# {{A;B C,D}};3;0.003 +ID 4/A;B-C,D# {{A;B C D}};4;0.0004 +ID 6/# {{}};6;0.000006 +ID 8/ok# {{missing right-quote];8;0.00000008 +9;"ok"-[[ok}};9;0.000000009 +ID 10/test# {{integer}};10;0.101 +ID 12/test# {{real}};123456789;0.123456789 +ID 13/string with# {{string with}};13;13 +ID 14/string with" quote# {{string with]] escape}};14;14 diff --git a/lib/dsv/testdata/expected_skip.dat b/lib/dsv/testdata/expected_skip.dat new file mode 100644 index 00000000..10286f1a --- /dev/null +++ b/lib/dsv/testdata/expected_skip.dat @@ -0,0 +1,11 @@ +0.1;1;{{AB}};A-B# +0.02;2;{{BCD}};A-B-C# +0.003;3;{{A;B C,D}};A;B-C,D# +0.0004;4;{{A;B C D}};A;B-C,D# +0.000006;6;{{}};# +0.000000009;9;{{missing right-quote];8;0.00000008 +9;"ok"-[[ok}};ok# +0.101;10;{{integer}};test# +0.123456789;123456789;{{real}};test# +13;13;{{string with}};string with# +14;14;{{string with]] escape}};string with" quote# diff --git a/lib/dsv/testdata/input.dat b/lib/dsv/testdata/input.dat new file mode 100644 index 00000000..6774d376 --- /dev/null +++ b/lib/dsv/testdata/input.dat @@ -0,0 +1,15 @@ +"id","name","value","integer";"real" +1;"A-B"-[[AB]];1;0.1 
+2;"A-B-C"-[[BCD]];2;0.02 +3;"A;B-C,D"-[[A;B C,D]];3;0.003 +4;"A;B-C,D"-[[A;B C D]];4;0.0004 +5;"A;B-C,D-"[[A;B C D]];5;0.00005 +6;""-[[]];6;0.000006 +7;"ok"-[missing left-quote]];7;0.0000007 +8;"ok"-[[missing right-quote];8;0.00000008 +9;"ok"-[[ok]];9;0.000000009 +10;"test"-[[integer]];010;0.101 +11;"test"-[[integer]];1a;0.1001 +12;"test"-[[real]];123456789;00.123456789 +13;"string with" quote"-[[string with]];13;13.0 +14;"string with\" quote"-[[string with\]] escape]];14;14.0 diff --git a/lib/dsv/testdata/writeraw.exp b/lib/dsv/testdata/writeraw.exp new file mode 100644 index 00000000..05f968ee --- /dev/null +++ b/lib/dsv/testdata/writeraw.exp @@ -0,0 +1,10 @@ +0,1,A +1,1.1,B +2,1.2,A +3,1.3,B +4,1.4,C +5,1.5,D +6,1.6,C +7,1.7,D +8,1.8,E +9,1.9,F diff --git a/lib/dsv/writer.go b/lib/dsv/writer.go new file mode 100644 index 00000000..1801ad36 --- /dev/null +++ b/lib/dsv/writer.go @@ -0,0 +1,515 @@ +// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package dsv + +import ( + "bufio" + "encoding/json" + "log" + "os" + + libbytes "github.com/shuLhan/share/lib/bytes" + "github.com/shuLhan/share/lib/tabula" +) + +const ( + // DefSeparator default separator that will be used if its not given + // in config file. + DefSeparator = "," + // DefOutput file. + DefOutput = "output.dat" + // DefEscape default string to escape the right quote or separator. + DefEscape = "\\" +) + +// +// Writer write records from reader or slice using format configuration in +// metadata. +// +type Writer struct { + Config `json:"-"` + // Output file where the records will be written. + Output string `json:"Output"` + // OutputMetadata define format for each column. + OutputMetadata []Metadata `json:"OutputMetadata"` + // fWriter as write descriptor. + fWriter *os.File + // BufWriter for buffered writer. 
+ BufWriter *bufio.Writer +} + +// +// NewWriter create a writer object. +// User must call Open after that to populate the output and metadata. +// +func NewWriter(config string) (writer *Writer, e error) { + writer = &Writer{ + Output: "", + OutputMetadata: nil, + fWriter: nil, + BufWriter: nil, + } + + if config == "" { + return + } + + e = OpenWriter(writer, config) + if e != nil { + return nil, e + } + + return +} + +// +// GetOutput return output filename. +// +func (writer *Writer) GetOutput() string { + return writer.Output +} + +// +// SetOutput will set the output file to path. +// +func (writer *Writer) SetOutput(path string) { + writer.Output = path +} + +// +// AddMetadata will add new output metadata to writer. +// +func (writer *Writer) AddMetadata(md Metadata) { + writer.OutputMetadata = append(writer.OutputMetadata, md) +} + +// +// open a generic method to open output file with specific flag. +// +func (writer *Writer) open(file string, flag int) (e error) { + if file == "" { + if writer.Output == "" { + file = DefOutput + } else { + file = writer.Output + } + } + + writer.fWriter, e = os.OpenFile(file, flag, 0600) + if nil != e { + return e + } + + writer.BufWriter = bufio.NewWriter(writer.fWriter) + + return nil +} + +// +// OpenOutput file and buffered writer. +// File will be truncated if its exist. +// +func (writer *Writer) OpenOutput(file string) (e error) { + return writer.open(file, os.O_CREATE|os.O_TRUNC|os.O_WRONLY) +} + +// +// ReopenOutput will open the output file back without truncating the content. +// +func (writer *Writer) ReopenOutput(file string) (e error) { + if e = writer.Close(); e != nil { + return + } + return writer.open(file, os.O_CREATE|os.O_APPEND|os.O_WRONLY) +} + +// +// Flush output buffer to disk. +// +func (writer *Writer) Flush() error { + return writer.BufWriter.Flush() +} + +// +// Close all open descriptor. 
+// +func (writer *Writer) Close() (e error) { + if nil != writer.BufWriter { + e = writer.BufWriter.Flush() + if e != nil { + return + } + } + if nil != writer.fWriter { + e = writer.fWriter.Close() + } + return +} + +// +// WriteRow dump content of Row to file using format in metadata. +// +func (writer *Writer) WriteRow(row *tabula.Row, recordMd []MetadataInterface) ( + e error, +) { + nRecord := row.Len() + v := []byte{} + esc := []byte(DefEscape) + + for i := range writer.OutputMetadata { + md := writer.OutputMetadata[i] + + // find the input index based on name on record metadata. + rIdx, mdMatch := FindMetadata(&md, recordMd) + + // No input metadata matched? skip it too. + if rIdx >= nRecord { + continue + } + + // If input column is ignored, continue to next record. + if mdMatch != nil && mdMatch.GetSkip() { + continue + } + + recV := (*row)[rIdx].Bytes() + lq := md.GetLeftQuote() + + if "" != lq { + v = append(v, []byte(lq)...) + } + + rq := md.GetRightQuote() + sep := md.GetSeparator() + + // Escape the escape character itself. + if md.T == tabula.TString { + recV, _ = libbytes.EncloseToken(recV, esc, esc, nil) + } + + // Escape the right quote in field content before writing it. + if "" != rq && md.T == tabula.TString { + recV, _ = libbytes.EncloseToken(recV, []byte(rq), esc, nil) + } else { + // Escape the separator + if "" != sep && md.T == tabula.TString { + recV, _ = libbytes.EncloseToken(recV, []byte(sep), esc, nil) + } + } + + v = append(v, recV...) + + if "" != rq { + v = append(v, []byte(rq)...) + } + + if "" != sep { + v = append(v, []byte(sep)...) + } + } + + v = append(v, DefEOL) + + _, e = writer.BufWriter.Write(v) + + return e +} + +// +// WriteRows will loop each row in the list of rows and write their content to +// output file. +// Return n for number of row written, and e if error happened. 
+// +func (writer *Writer) WriteRows(rows tabula.Rows, recordMd []MetadataInterface) ( + n int, + e error, +) { + for n = range rows { + e = writer.WriteRow(rows[n], recordMd) + if nil != e { + break + } + } + + _ = writer.Flush() + return +} + +// +// WriteColumns will write content of columns to output file. +// Return n for number of row written, and e if error happened. +// +func (writer *Writer) WriteColumns(columns tabula.Columns, + colMd []MetadataInterface, +) ( + n int, + e error, +) { + nColumns := len(columns) + if nColumns <= 0 { + return + } + + emptyRec := tabula.NewRecordString("") + + // Get minimum and maximum length of all columns. + // In case one of the column have different length (shorter or longer), + // we will take the column with minimum length first and continue with + // the maximum length. + + minlen, maxlen := columns.GetMinMaxLength() + + // If metadata is nil, generate it from column name. + if colMd == nil { + for _, col := range columns { + md := &Metadata{ + Name: col.Name, + T: col.Type, + } + + colMd = append(colMd, md) + } + } + + // First loop, iterate until minimum column length. + row := make(tabula.Row, nColumns) + + for ; n < minlen; n++ { + // Convert columns to record. + for y, col := range columns { + row[y] = col.Records[n] + } + + e = writer.WriteRow(&row, colMd) + if e != nil { + goto err + } + } + + // Second loop, iterate until maximum column length. + for ; n < maxlen; n++ { + // Convert columns to record. + for y, col := range columns { + if col.Len() > n { + row[y] = col.Records[n] + } else { + row[y] = emptyRec + } + } + + e = writer.WriteRow(&row, colMd) + if e != nil { + goto err + } + } + +err: + _ = writer.Flush() + return n, e +} + +// +// WriteRawRow will write row data using separator `sep` for each record. 
+// +func (writer *Writer) WriteRawRow(row *tabula.Row, sep, esc []byte) (e error) { + if sep == nil { + sep = []byte(DefSeparator) + } + if esc == nil { + esc = []byte(DefEscape) + } + + v := []byte{} + for x, rec := range *row { + if x > 0 { + v = append(v, sep...) + } + + recV := rec.Bytes() + + if rec.Type() == tabula.TString { + recV, _ = libbytes.EncloseToken(recV, sep, esc, nil) + } + + v = append(v, recV...) + } + + v = append(v, DefEOL) + + _, e = writer.BufWriter.Write(v) + + _ = writer.Flush() + + return e +} + +// +// WriteRawRows write rows data using separator `sep` for each record. +// We use pointer in separator parameter, so we can use empty string as +// separator. +// +func (writer *Writer) WriteRawRows(rows *tabula.Rows, sep *string) ( + nrow int, + e error, +) { + nrow = len(*rows) + if nrow <= 0 { + return + } + + if sep == nil { + sep = new(string) + *sep = DefSeparator + } + + escbytes := []byte(DefEscape) + sepbytes := []byte(*sep) + x := 0 + + for ; x < nrow; x++ { + e = writer.WriteRawRow((*rows)[x], sepbytes, escbytes) + if nil != e { + break + } + } + + return x, e +} + +// +// WriteRawColumns write raw columns using separator `sep` for each record to +// file. +// +// We use pointer in separator parameter, so we can use empty string as +// separator. +// +func (writer *Writer) WriteRawColumns(cols *tabula.Columns, sep *string) ( + nrow int, + e error, +) { + ncol := len(*cols) + if ncol <= 0 { + return + } + + if sep == nil { + sep = new(string) + *sep = DefSeparator + } + + // Find minimum and maximum column length. + minlen, maxlen := cols.GetMinMaxLength() + + esc := []byte(DefEscape) + sepbytes := []byte(*sep) + x := 0 + + // First, write until minimum column length. + for ; x < minlen; x++ { + v := cols.Join(x, sepbytes, esc) + v = append(v, DefEOL) + + _, e = writer.BufWriter.Write(v) + + if nil != e { + return x, e + } + } + + // and then write column until max length. 
+ for ; x < maxlen; x++ { + v := cols.Join(x, sepbytes, esc) + v = append(v, DefEOL) + + _, e = writer.BufWriter.Write(v) + + if nil != e { + break + } + } + + _ = writer.Flush() + return x, e +} + +// +// WriteRawDataset will write content of dataset to file without metadata but +// using separator `sep` for each record. +// +// We use pointer in separator parameter, so we can use empty string as +// separator. +// +func (writer *Writer) WriteRawDataset(dataset tabula.DatasetInterface, + sep *string, +) ( + int, error, +) { + if nil == writer.fWriter { + return 0, ErrNotOpen + } + if nil == dataset { + return 0, nil + } + if sep == nil { + sep = new(string) + *sep = DefSeparator + } + + var rows *tabula.Rows + + switch dataset.GetMode() { + case tabula.DatasetModeColumns: + cols := dataset.GetDataAsColumns() + return writer.WriteRawColumns(cols, sep) + case tabula.DatasetModeRows, tabula.DatasetModeMatrix: + fallthrough + default: + rows = dataset.GetDataAsRows() + } + + return writer.WriteRawRows(rows, sep) +} + +// +// Write rows from Reader to file. +// Return n for number of row written, or e if error happened. +// +func (writer *Writer) Write(reader ReaderInterface) (int, error) { + if nil == reader { + return 0, ErrNilReader + } + if nil == writer.fWriter { + return 0, ErrNotOpen + } + + ds := reader.GetDataset().(tabula.DatasetInterface) + + var rows *tabula.Rows + + switch ds.GetMode() { + case tabula.DatasetModeColumns: + cols := ds.GetDataAsColumns() + return writer.WriteColumns(*cols, reader.GetInputMetadata()) + case tabula.DatasetModeRows, tabula.DatasetModeMatrix: + fallthrough + default: + rows = ds.GetDataAsRows() + } + + return writer.WriteRows(*rows, reader.GetInputMetadata()) +} + +// +// String yes, it will print it in JSON like format. 
//
func (writer *Writer) String() string {
	r, e := json.MarshalIndent(writer, "", "\t")

	if nil != e {
		// On marshal failure r is nil, so the empty string is
		// returned after logging the error.
		log.Print(e)
	}

	return string(r)
}
diff --git a/lib/dsv/writer_test.go b/lib/dsv/writer_test.go
new file mode 100644
index 00000000..f03a1842
--- /dev/null
+++ b/lib/dsv/writer_test.go
@@ -0,0 +1,126 @@
// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package dsv

import (
	"testing"

	"github.com/shuLhan/share/lib/tabula"
)

//
// TestWriter test reading and writing DSV.
//
func TestWriter(t *testing.T) {
	rw, e := New("testdata/config.dsv", nil)
	if e != nil {
		t.Fatal(e)
	}

	// Read rows from input and write them back using the output
	// metadata, then compare the produced file with the expectation.
	doReadWrite(t, &rw.Reader, &rw.Writer, expectation, true)

	e = rw.Close()
	if e != nil {
		t.Fatal(e)
	}

	assertFile(t, rw.GetOutput(), "testdata/expected.dat", true)
}

//
// TestWriterWithSkip test reading and writing DSV with some column in input being
// skipped.
//
func TestWriterWithSkip(t *testing.T) {
	rw, e := New("testdata/config_skip.dsv", nil)
	if e != nil {
		t.Fatal(e)
	}

	doReadWrite(t, &rw.Reader, &rw.Writer, expSkip, true)

	e = rw.Close()
	if e != nil {
		t.Fatal(e)
	}

	assertFile(t, rw.GetOutput(), "testdata/expected_skip.dat", true)
}

//
// TestWriterWithColumns test reading and writing DSV with where each row
// is saved in DatasetMode = 'columns'.
+// +func TestWriterWithColumns(t *testing.T) { + rw, e := New("testdata/config_skip.dsv", nil) + if e != nil { + t.Fatal(e) + } + + rw.SetDatasetMode(DatasetModeCOLUMNS) + + doReadWrite(t, &rw.Reader, &rw.Writer, expSkipColumns, true) + + e = rw.Close() + if e != nil { + t.Fatal(e) + } + + assertFile(t, "testdata/expected_skip.dat", rw.GetOutput(), true) +} + +func TestWriteRawRows(t *testing.T) { + dataset := tabula.NewDataset(tabula.DatasetModeRows, nil, nil) + + populateWithRows(t, dataset) + + writer, e := NewWriter("") + if e != nil { + t.Fatal(e) + } + + outfile := "testdata/writerawrows.out" + expfile := "testdata/writeraw.exp" + + e = writer.OpenOutput(outfile) + if e != nil { + t.Fatal(e) + } + + _, e = writer.WriteRawDataset(dataset, nil) + if e != nil { + t.Fatal(e) + } + + assertFile(t, outfile, expfile, true) +} + +func TestWriteRawColumns(t *testing.T) { + var e error + + dataset := tabula.NewDataset(tabula.DatasetModeColumns, nil, nil) + + populateWithColumns(t, dataset) + + writer, e := NewWriter("") + if e != nil { + t.Fatal(e) + } + + outfile := "testdata/writerawcolumns.out" + expfile := "testdata/writeraw.exp" + + e = writer.OpenOutput(outfile) + if e != nil { + t.Fatal(e) + } + + _, e = writer.WriteRawDataset(dataset, nil) + if e != nil { + t.Fatal(e) + } + + assertFile(t, outfile, expfile, true) +} diff --git a/lib/dsv/writerinterface.go b/lib/dsv/writerinterface.go new file mode 100644 index 00000000..e2b8856c --- /dev/null +++ b/lib/dsv/writerinterface.go @@ -0,0 +1,45 @@ +// Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package dsv + +// +// WriterInterface is an interface for writing DSV data to file. 
+// +type WriterInterface interface { + ConfigInterface + GetOutput() string + SetOutput(path string) + OpenOutput(file string) error + Flush() error + Close() error +} + +// +// OpenWriter configuration file and initialize the attributes. +// +func OpenWriter(writer WriterInterface, fcfg string) (e error) { + e = ConfigOpen(writer, fcfg) + if e != nil { + return + } + + return InitWriter(writer) +} + +// +// InitWriter initialize writer by opening output file. +// +func InitWriter(writer WriterInterface) error { + out := writer.GetOutput() + + // Exit immediately if no output file is defined in config. + if "" == out { + return ErrNoOutput + } + + writer.SetOutput(ConfigCheckPath(writer, out)) + + return writer.OpenOutput("") +} |
