aboutsummaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorShulhan <ms@kilabit.info>2018-09-17 01:21:27 +0700
committerShulhan <ms@kilabit.info>2018-09-18 01:50:21 +0700
commit44b26edf7f390db383fe025454be0c4e30cfbd9b (patch)
tree84d02953bc9095312182534936c1b60667957f07 /lib
parent4a820ec157501c957d2e30f1670656cceec5c044 (diff)
downloadpakakeh.go-44b26edf7f390db383fe025454be0c4e30cfbd9b.tar.xz
Merge package "github.com/shuLhan/tabula"
Diffstat (limited to 'lib')
-rw-r--r--lib/tabula/.gitignore5
-rw-r--r--lib/tabula/LICENSE39
-rw-r--r--lib/tabula/Makefile31
-rw-r--r--lib/tabula/README.md165
-rw-r--r--lib/tabula/claset.go303
-rw-r--r--lib/tabula/clasetinterface.go38
-rw-r--r--lib/tabula/column.go309
-rw-r--r--lib/tabula/column_test.go67
-rw-r--r--lib/tabula/columninterface.go20
-rw-r--r--lib/tabula/columns.go147
-rw-r--r--lib/tabula/columns_test.go56
-rw-r--r--lib/tabula/dataset.go747
-rw-r--r--lib/tabula/dataset_bench_test.go20
-rw-r--r--lib/tabula/dataset_test.go365
-rw-r--r--lib/tabula/datasetinterface.go442
-rw-r--r--lib/tabula/maprows.go65
-rw-r--r--lib/tabula/maprows_test.go54
-rw-r--r--lib/tabula/matrix.go13
-rw-r--r--lib/tabula/record.go292
-rw-r--r--lib/tabula/record_test.go35
-rw-r--r--lib/tabula/records.go54
-rw-r--r--lib/tabula/records_test.go29
-rw-r--r--lib/tabula/row.go123
-rw-r--r--lib/tabula/row_test.go33
-rw-r--r--lib/tabula/rows.go251
-rw-r--r--lib/tabula/rows_test.go181
-rw-r--r--lib/tabula/tabula.go76
-rw-r--r--lib/tabula/tabula_test.go81
28 files changed, 4041 insertions, 0 deletions
diff --git a/lib/tabula/.gitignore b/lib/tabula/.gitignore
new file mode 100644
index 00000000..f5ddbe1c
--- /dev/null
+++ b/lib/tabula/.gitignore
@@ -0,0 +1,5 @@
+cover.html
+cover.out
+*.bench
+*.prof
+*.test
diff --git a/lib/tabula/LICENSE b/lib/tabula/LICENSE
new file mode 100644
index 00000000..d3ff23a6
--- /dev/null
+++ b/lib/tabula/LICENSE
@@ -0,0 +1,39 @@
+Copyright 2017, Shulhan (ms@kilabit.info).
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+3. Neither the name of copyright holder nor the names of its contributors may be
+ used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ --- --- --- --- --- --- ---
+
+ TT TT II BB AAAA LLLLLL II KKKKKKKK
+ TT TT II BB AA AA LL LL II KK
+ TTTT II BB AA AA LL LL II KK
+ TT TT II BB AAAAAAAA LLLLLL II KK
+ TT TT II BB AA AA LL LL II KK
+ TT TT II BBBBBBBB AA AA LLLLLL II KK
+
+Website: http://kilabit.info
+Contact: ms@kilabit.info
diff --git a/lib/tabula/Makefile b/lib/tabula/Makefile
new file mode 100644
index 00000000..d77283bd
--- /dev/null
+++ b/lib/tabula/Makefile
@@ -0,0 +1,31 @@
+#!/bin/make
+
+## Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+## Use of this source code is governed by a BSD-style license that can be found
+## in the LICENSE file.
+
+## File lists and install target of the package in the current directory, as
+## reported by "go list".
+SRC_FILES :=$(shell go list -f '{{ join .GoFiles " " }}')
+TEST_FILES :=$(shell go list -f '{{ join .TestGoFiles " " }}')
+XTEST_FILES :=$(shell go list -f '{{ join .XTestGoFiles " " }}')
+COVER_OUT :=cover.out
+COVER_HTML :=cover.html
+TARGET :=$(shell go list -f '{{ .Target }}')
+
+.PHONY: all clean coverbrowse
+
+all: ${TARGET}
+
+## Installing depends on the HTML coverage report, so the tests run first.
+${TARGET}: ${COVER_HTML}
+ go install -a .
+
+## Render the coverage profile as HTML.
+${COVER_HTML}: ${COVER_OUT}
+ go tool cover -html=$< -o $@
+
+## Run the tests and write the coverage profile; re-run when any source or
+## test file changes.
+${COVER_OUT}: ${SRC_FILES} ${TEST_FILES} ${XTEST_FILES}
+ go test -v -coverprofile $@
+
+## Open the HTML coverage report with the desktop's default application.
+coverbrowse: ${COVER_HTML}
+ xdg-open $<
+
+## Remove generated coverage, benchmark, profile, and test binaries.
+clean:
+ rm -f ${COVER_HTML} ${COVER_OUT} *.bench *.prof *.test
diff --git a/lib/tabula/README.md b/lib/tabula/README.md
new file mode 100644
index 00000000..8fbd2a40
--- /dev/null
+++ b/lib/tabula/README.md
@@ -0,0 +1,165 @@
+[![GoDoc](https://godoc.org/github.com/shuLhan/share/lib/tabula?status.svg)](https://godoc.org/github.com/shuLhan/share/lib/tabula)
+[![Go Report Card](https://goreportcard.com/badge/github.com/shuLhan/share/lib/tabula)](https://goreportcard.com/report/github.com/shuLhan/share/lib/tabula)
+![cover.run go](https://cover.run/go/github.com/shuLhan/share/lib/tabula.svg)
+
+Package tabula is a Go library for working with rows, columns, or a matrix
+(table); in other words, for working with data sets.
+
+# Overview
+
+Go's slices give a flexible way to manage a sequence of data of one type, but
+what if you want to manage a sequence of values of different types?
+Or manage a bunch of values like a table?
+
+You can use this library to manage a sequence of values of different types
+and to manage data as a two-dimensional table.
+
+## Terminology
+
+Here are some terms that we use in developing this library, which may
+help the reader understand the internals and the API.
+
+Record is a single cell in a row or column; it is the smallest building block
+of a dataset.
+
+Row is a horizontal representation of records in dataset.
+
+Column is a vertical representation of records in dataset.
+Each column has a unique name, and all records in a column have the same type.
+
+Dataset is a collection of rows and columns.
+
+Given those definitions we can draw the representation of rows, columns, or
+matrix:
+
+ COL-0 COL-1 ... COL-x
+ ROW-0: record record ... record
+ ROW-1: record record ... record
+ ...
+ ROW-y: record record ... record
+
+## What makes this package different from other dataset packages?
+
+### Record Type
+
+There are only three valid types of record: int64, float64, and string.
+
+Each record is a pointer to an interface value, which means:
+
+- Switching from rows to columns mode, or vice versa, is only a matter of
+ pointer switching; the records themselves are not relocated in memory.
+- When using matrix mode, additional memory is used only to allocate the
+ slices; the records in the rows and columns are shared.
+
+### Dataset Mode
+
+Tabula has three modes for a dataset: rows, columns, or matrix.
+
+For example, given a table of data,
+
+ col1,col2,col3
+ a,b,c
+ 1,2,3
+
+- When in "rows" mode, each line is saved in its own slice, resulting in Rows:
+
+ ```
+ Rows[0]: [a b c]
+ Rows[1]: [1 2 3]
+ ```
+
+ Columns is used only to save record metadata: column name, type, flag and
+ value space.
+
+- When in "columns" mode, each line is saved in columns, resulting in Columns:
+
+ ```
+ Columns[0]: {col1 0 0 [] [a 1]}
+ Columns[1]: {col2 0 0 [] [b 2]}
+ Columns[2]: {col3 0 0 [] [c 3]}
+ ```
+
+ Each column will contain metadata including column name, type, flag, and
+ value space (all possible values that the column _may_ contain).
+
+ Rows in "columns" mode is empty.
+
+- When in "matrix" mode, each record is saved both in row and column using
+ shared pointer to record.
+
+ Matrix mode consumes more memory by allocating two slices, one for rows and
+ one for columns, but gives a flexible way to manage records.
+
+## Features
+
+- **Switching between rows and columns mode**.
+
+- [**Random pick rows with or without replacement**](https://godoc.org/github.com/shuLhan/share/lib/tabula#RandomPickRows).
+
+- [**Random pick columns with or without replacement**](https://godoc.org/github.com/shuLhan/share/lib/tabula#RandomPickColumns).
+
+- [**Select column from dataset by index**](https://godoc.org/github.com/shuLhan/share/lib/tabula#SelectColumnsByIdx).
+
+- [**Sort columns by index**](https://godoc.org/github.com/shuLhan/share/lib/tabula#SortColumnsByIndex),
+ or indirect sort.
+
+- [**Split rows value by numeric**](https://godoc.org/github.com/shuLhan/share/lib/tabula#SplitRowsByNumeric).
+ For example, given two numeric rows,
+
+ ```
+ A: {1,2,3,4}
+ B: {5,6,7,8}
+ ```
+
+ if we split the rows by value 7, the data will be split into the left set
+
+ ```
+ A': {1,2}
+ B': {5,6}
+ ```
+
+ and the right set would be
+
+ ```
+ A'': {3,4}
+ B'': {7,8}
+ ```
+
+- [**Split rows by string**](https://godoc.org/github.com/shuLhan/share/lib/tabula#SplitRowsByCategorical).
+ For example, given two rows,
+
+ ```
+ X: [A,B,A,B,C,D,C,D]
+ Y: [1,2,3,4,5,6,7,8]
+ ```
+
+ if we split the rows with value set `[A,C]`, the data will be split into the
+ left set, which contains all rows that have A or C,
+
+ ```
+ X': [A,A,C,C]
+ Y': [1,3,5,7]
+ ```
+
+ and the right set (the excluded set) will contain all rows that are neither A nor C,
+
+ ```
+ X'': [B,B,D,D]
+ Y'': [2,4,6,8]
+ ```
+
+- [**Select row where**](https://godoc.org/github.com/shuLhan/share/lib/tabula#SelectRowsWhere).
+ Select the rows whose value at column index x is equal to y (an analogy to
+ _select where_ in SQL).
+ For example, given the rows of a dataset,
+ ```
+ ROW-1: {1,A}
+ ROW-2: {2,B}
+ ROW-3: {3,A}
+ ROW-4: {4,C}
+ ```
+ we can select the rows where the second column contains 'A', which results in,
+ ```
+ ROW-1: {1,A}
+ ROW-3: {3,A}
+ ```
diff --git a/lib/tabula/claset.go b/lib/tabula/claset.go
new file mode 100644
index 00000000..5d7eea7e
--- /dev/null
+++ b/lib/tabula/claset.go
@@ -0,0 +1,303 @@
+// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+package tabula
+
+import (
+ "fmt"
+ "strconv"
+
+ libnumbers "github.com/shuLhan/share/lib/numbers"
+ libstrings "github.com/shuLhan/share/lib/strings"
+)
+
+//
+// Claset defines a dataset that has a class (target) attribute.
+//
+type Claset struct {
+ // Dataset is embedded to provide the dataset fields and methods.
+ Dataset
+ // ClassIndex is the index of the class column in Columns.
+ ClassIndex int `json:"ClassIndex"`
+
+ // vs caches the class value space, as set by CountValueSpaces.
+ vs []string
+ // counts holds the count for each value in vs; counts[i] is for vs[i].
+ counts []int
+
+ // major contains the name of the majority class in the dataset.
+ major string
+ // minor contains the name of the minority class in the dataset.
+ minor string
+}
+
+//
+// NewClaset allocates a Claset whose class index is unset (-1) and
+// initializes its embedded dataset with the given mode, column types, and
+// column names.
+//
+func NewClaset(mode int, types []int, names []string) (claset *Claset) {
+	claset = new(Claset)
+	claset.ClassIndex = -1
+	claset.Init(mode, types, names)
+
+	return claset
+}
+
+//
+// Clone returns a new *Claset with the same class index, majority and
+// minority class names, and a clone of the underlying dataset.
+// The cached value space and its counts are not copied.
+//
+func (claset *Claset) Clone() interface{} {
+	dup := &Claset{}
+
+	dup.ClassIndex = claset.GetClassIndex()
+	dup.major = claset.MajorityClass()
+	dup.minor = claset.MinorityClass()
+	dup.SetDataset(claset.GetDataset().Clone().(DatasetInterface))
+
+	return dup
+}
+
+//
+// GetDataset returns a pointer to the embedded dataset.
+//
+func (claset *Claset) GetDataset() DatasetInterface {
+	ds := &claset.Dataset
+	return ds
+}
+
+//
+// GetClassType returns the type of the class column, or TString if the
+// dataset has no columns.
+//
+func (claset *Claset) GetClassType() int {
+	if claset.Columns.Len() > 0 {
+		return claset.Columns[claset.ClassIndex].Type
+	}
+	return TString
+}
+
+//
+// GetClassValueSpace returns the value space of the class column, or nil if
+// the dataset has no columns.
+//
+func (claset *Claset) GetClassValueSpace() []string {
+	if claset.Columns.Len() > 0 {
+		return claset.Columns[claset.ClassIndex].ValueSpace
+	}
+	return nil
+}
+
+//
+// GetClassColumn returns a pointer to the class column, or nil if the
+// dataset has no columns. A dataset in rows mode is transposed to columns
+// first.
+//
+func (claset *Claset) GetClassColumn() *Column {
+	if claset.Mode == DatasetModeRows {
+		claset.TransposeToColumns()
+	}
+
+	cols := claset.Columns
+	if len(cols) == 0 {
+		return nil
+	}
+	return &cols[claset.ClassIndex]
+}
+
+//
+// GetClassRecords returns a pointer to the records of the class column, or
+// nil if the dataset has no columns. A dataset in rows mode is transposed to
+// columns first.
+//
+func (claset *Claset) GetClassRecords() *Records {
+	if claset.Mode == DatasetModeRows {
+		claset.TransposeToColumns()
+	}
+
+	cols := claset.Columns
+	if len(cols) == 0 {
+		return nil
+	}
+	return &cols[claset.ClassIndex].Records
+}
+
+//
+// GetClassAsStrings returns the class column values as a slice of string, or
+// nil if the dataset has no columns. A dataset in rows mode is transposed to
+// columns first.
+//
+func (claset *Claset) GetClassAsStrings() []string {
+	if claset.Mode == DatasetModeRows {
+		claset.TransposeToColumns()
+	}
+
+	cols := claset.Columns
+	if len(cols) == 0 {
+		return nil
+	}
+	return cols[claset.ClassIndex].ToStringSlice()
+}
+
+//
+// GetClassAsReals returns the class column values as a slice of float64, or
+// nil if the dataset has no columns. A dataset in rows mode is transposed to
+// columns first.
+//
+func (claset *Claset) GetClassAsReals() []float64 {
+	if claset.Mode == DatasetModeRows {
+		claset.TransposeToColumns()
+	}
+
+	cols := claset.Columns
+	if len(cols) == 0 {
+		return nil
+	}
+	return cols[claset.ClassIndex].ToFloatSlice()
+}
+
+//
+// GetClassAsInteger returns the class column values as a slice of int64, or
+// nil if the dataset has no columns. A dataset in rows mode is transposed to
+// columns first.
+//
+func (claset *Claset) GetClassAsInteger() []int64 {
+	if claset.Mode == DatasetModeRows {
+		claset.TransposeToColumns()
+	}
+
+	cols := claset.Columns
+	if len(cols) == 0 {
+		return nil
+	}
+	return cols[claset.ClassIndex].ToIntegers()
+}
+
+//
+// GetClassIndex returns the index of the class column in the dataset.
+//
+func (claset *Claset) GetClassIndex() int {
+	idx := claset.ClassIndex
+	return idx
+}
+
+//
+// MajorityClass returns the stored name of the majority class.
+//
+func (claset *Claset) MajorityClass() string {
+	name := claset.major
+	return name
+}
+
+//
+// MinorityClass returns the stored name of the minority class.
+//
+func (claset *Claset) MinorityClass() string {
+	name := claset.minor
+	return name
+}
+
+//
+// Counts returns the number of occurrences of each class in the value
+// space. The counts are computed on first use, when none are cached.
+//
+func (claset *Claset) Counts() []int {
+	if len(claset.counts) > 0 {
+		return claset.counts
+	}
+
+	claset.CountValueSpaces()
+
+	return claset.counts
+}
+
+//
+// SetDataset replaces the embedded dataset with a copy of the Dataset value
+// that `dataset` points to. The argument must hold a *Dataset; any other
+// dynamic type makes the type assertion panic.
+//
+func (claset *Claset) SetDataset(dataset DatasetInterface) {
+	ds := dataset.(*Dataset)
+	claset.Dataset = *ds
+}
+
+//
+// SetClassIndex sets the index of the class column to `v`.
+// The index is stored as is, without range checking.
+//
+func (claset *Claset) SetClassIndex(v int) {
+	claset.ClassIndex = v
+}
+
+//
+// SetMajorityClass stores `v` as the name of the majority class.
+//
+func (claset *Claset) SetMajorityClass(v string) {
+	claset.major = v
+}
+
+//
+// SetMinorityClass stores `v` as the name of the minority class.
+//
+func (claset *Claset) SetMinorityClass(v string) {
+	claset.minor = v
+}
+
+//
+// CountValueSpaces caches the class value space in the claset and counts how
+// many times each of its values occurs in the class column.
+//
+func (claset *Claset) CountValueSpaces() {
+	values := claset.GetClassAsStrings()
+
+	claset.vs = claset.GetClassValueSpace()
+	claset.counts = libstrings.CountTokens(values, claset.vs, false)
+}
+
+//
+// RecountMajorMinor recounts the value space and then sets the majority
+// class to the value with the highest count and the minority class to the
+// value with the lowest count. Each is left unchanged when the corresponding
+// search reports no result.
+//
+func (claset *Claset) RecountMajorMinor() {
+	claset.CountValueSpaces()
+
+	if _, idx, ok := libnumbers.IntsFindMax(claset.counts); ok {
+		claset.major = claset.vs[idx]
+	}
+	if _, idx, ok := libnumbers.IntsFindMin(claset.counts); ok {
+		claset.minor = claset.vs[idx]
+	}
+}
+
+//
+// IsInSingleClass checks whether every row has the same class value.
+// It returns true and that class value if so; otherwise, including when
+// there are no class values at all, it returns false and an empty string.
+//
+func (claset *Claset) IsInSingleClass() (single bool, class string) {
+	values := claset.GetClassAsStrings()
+	if len(values) == 0 {
+		return false, ""
+	}
+
+	first := values[0]
+	for _, v := range values[1:] {
+		if v != first {
+			return false, ""
+		}
+	}
+
+	return true, first
+}
+
+//
+// GetMinorityRows returns the rows whose class is the minority class, or nil
+// if the dataset is empty. The majority and minority classes are recounted
+// first when no value space has been cached yet.
+//
+func (claset *Claset) GetMinorityRows() *Rows {
+	if claset.Len() == 0 {
+		return nil
+	}
+	if claset.vs == nil {
+		claset.RecountMajorMinor()
+	}
+
+	rows := claset.GetRows()
+	selected := rows.SelectWhere(claset.ClassIndex, claset.minor)
+
+	return &selected
+}
+
+//
+// String returns the claset metadata (number of rows and columns, count of
+// each class value, majority and minority class) in a JSON-like format that
+// uses single quotes. The majority and minority classes are recounted first
+// when no value space has been cached yet.
+//
+func (claset *Claset) String() (s string) {
+	if claset.vs == nil {
+		claset.RecountMajorMinor()
+	}
+
+	s = fmt.Sprintf("'claset':{'rows': %d, 'columns': %d, ", claset.Len(),
+		claset.GetNColumn())
+	s += "'vs':{"
+
+	for x := 0; x < len(claset.vs); x++ {
+		if x != 0 {
+			s += ", "
+		}
+		count := strconv.Itoa(claset.counts[x])
+		s += "'" + claset.vs[x] + "':" + count
+	}
+
+	s += "}" + ", 'major': '" + claset.major + "'"
+	s += ", 'minor': '" + claset.minor + "'" + "}"
+
+	return s
+}
diff --git a/lib/tabula/clasetinterface.go b/lib/tabula/clasetinterface.go
new file mode 100644
index 00000000..ae8cdfcd
--- /dev/null
+++ b/lib/tabula/clasetinterface.go
@@ -0,0 +1,38 @@
+// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+package tabula
+
+//
+// ClasetInterface is the interface for working with a dataset that has a
+// class (target) attribute. It embeds DatasetInterface.
+//
+// The name is spelled "Claset", with a single `s`, not "Classset" with
+// triple `s`, to minimize typos.
+//
+type ClasetInterface interface {
+	DatasetInterface
+
+	// Accessors for the class column and its metadata.
+	GetClassIndex() int
+	GetClassType() int
+	GetClassValueSpace() []string
+	GetClassColumn() *Column
+	GetClassRecords() *Records
+	GetClassAsStrings() []string
+	GetClassAsReals() []float64
+	GetMinorityRows() *Rows
+
+	// Class statistics.
+	Counts() []int
+	MajorityClass() string
+	MinorityClass() string
+	IsInSingleClass() (bool, string)
+	CountValueSpaces()
+	RecountMajorMinor()
+
+	// Mutators.
+	SetDataset(DatasetInterface)
+	SetClassIndex(int)
+	SetMajorityClass(string)
+	SetMinorityClass(string)
+}
diff --git a/lib/tabula/column.go b/lib/tabula/column.go
new file mode 100644
index 00000000..f631fb30
--- /dev/null
+++ b/lib/tabula/column.go
@@ -0,0 +1,309 @@
+// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+package tabula
+
+import (
+ "strconv"
+)
+
+//
+// Column represents a slice of records; a vertical representation of data.
+//
+type Column struct {
+ // Name is the string identifier of the column.
+ Name string
+ // Type is the type of the column; all records in it share this type.
+ Type int
+ // Flag is an additional attribute that can be set to mark the column;
+ // it is cleared to zero by Reset.
+ Flag int
+ // ValueSpace contains the possible values of the records.
+ ValueSpace []string
+ // Records contains the column data.
+ Records Records
+}
+
+//
+// NewColumn returns a new column with the given type and name, a zero flag,
+// and an empty, non-nil record slice.
+//
+func NewColumn(colType int, colName string) (col *Column) {
+	return &Column{
+		Name:    colName,
+		Type:    colType,
+		Flag:    0,
+		Records: make([]*Record, 0),
+	}
+}
+
+//
+// NewColumnString creates a column with type `colType` and name `colName`,
+// and fills it with one string record for each item in `data`.
+// The returned error is always nil.
+//
+func NewColumnString(data []string, colType int, colName string) (
+	col *Column,
+	e error,
+) {
+	col = NewColumn(colType, colName)
+
+	if len(data) == 0 {
+		return col, nil
+	}
+
+	recs := make([]*Record, len(data))
+	for x, v := range data {
+		recs[x] = NewRecordString(v)
+	}
+	col.Records = recs
+
+	return col, nil
+}
+
+//
+// NewColumnInt creates a column of type TInteger named `colName`, and fills
+// it with one integer record for each item in `data`.
+//
+func NewColumnInt(data []int64, colName string) (col *Column) {
+	col = NewColumn(TInteger, colName)
+
+	if len(data) == 0 {
+		return col
+	}
+
+	recs := make([]*Record, len(data))
+	for x := 0; x < len(data); x++ {
+		recs[x] = NewRecordInt(data[x])
+	}
+	col.Records = recs
+
+	return col
+}
+
+//
+// NewColumnReal creates a column of type TReal named `colName`, and fills it
+// with one real record for each item in `data`.
+//
+func NewColumnReal(data []float64, colName string) (col *Column) {
+	col = NewColumn(TReal, colName)
+
+	if len(data) == 0 {
+		return col
+	}
+
+	recs := make([]*Record, len(data))
+	for x, v := range data {
+		recs[x] = NewRecordReal(v)
+	}
+	col.Records = recs
+
+	return col
+}
+
+//
+// SetType sets the column type to `tipe`. Existing records are not
+// converted.
+//
+func (col *Column) SetType(tipe int) {
+	col.Type = tipe
+}
+
+//
+// SetName sets the column name to `name`.
+//
+func (col *Column) SetName(name string) {
+	col.Name = name
+}
+
+//
+// GetType returns the column type.
+//
+func (col *Column) GetType() int {
+	tipe := col.Type
+	return tipe
+}
+
+//
+// GetName returns the column name.
+//
+func (col *Column) GetName() string {
+	name := col.Name
+	return name
+}
+
+//
+// SetRecords replaces the column records with the slice that `recs` points
+// to. The slice is not copied, and `recs` must not be nil.
+//
+func (col *Column) SetRecords(recs *Records) {
+	newRecs := *recs
+	col.Records = newRecs
+}
+
+//
+// Interface return the column object as an interface.
+//
+func (col *Column) Interface() interface{} {
+ return col
+}
+
+//
+// Reset clears the column flag and replaces the records with an empty
+// slice. The name, type, and value space are kept.
+//
+func (col *Column) Reset() {
+	col.Records = make([]*Record, 0)
+	col.Flag = 0
+}
+
+//
+// Len returns the number of records in the column.
+//
+func (col *Column) Len() int {
+	n := len(col.Records)
+	return n
+}
+
+//
+// PushBack appends record `r` to the end of the column.
+//
+func (col *Column) PushBack(r *Record) {
+	recs := append(col.Records, r)
+	col.Records = recs
+}
+
+//
+// PushRecords appends all records in `rs` to the end of the column.
+//
+func (col *Column) PushRecords(rs []*Record) {
+	recs := append(col.Records, rs...)
+	col.Records = recs
+}
+
+//
+// ToIntegers returns the value of every record in the column, converted
+// with Record.Integer, as a new slice of int64.
+//
+func (col *Column) ToIntegers() []int64 {
+	ints := make([]int64, col.Len())
+
+	for x, rec := range col.Records {
+		ints[x] = rec.Integer()
+	}
+
+	return ints
+}
+
+//
+// ToFloatSlice returns the value of every record in the column, converted
+// with Record.Float, as a new slice of float64.
+//
+func (col *Column) ToFloatSlice() (newcol []float64) {
+	newcol = make([]float64, col.Len())
+
+	for x, rec := range col.Records {
+		newcol[x] = rec.Float()
+	}
+
+	return newcol
+}
+
+//
+// ToStringSlice returns the value of every record in the column, converted
+// with Record.String, as a new slice of string.
+//
+func (col *Column) ToStringSlice() (newcol []string) {
+	newcol = make([]string, col.Len())
+
+	for x, rec := range col.Records {
+		newcol[x] = rec.String()
+	}
+
+	return newcol
+}
+
+//
+// ClearValues resets every record in the column by calling Record.Reset,
+// so that each value becomes an empty string, or zero if the column type is
+// numeric.
+//
+func (col *Column) ClearValues() {
+	for x := range col.Records {
+		col.Records[x].Reset()
+	}
+}
+
+//
+// SetValueAt sets the value of the record at index `idx` from string `v`,
+// converted according to the column type. It does nothing if the index is
+// out of range.
+//
+func (col *Column) SetValueAt(idx int, v string) {
+	if idx < 0 || idx >= col.Records.Len() {
+		return
+	}
+
+	// The conversion error is dropped; this method has no error return.
+	_ = col.Records[idx].SetValue(v, col.Type)
+}
+
+//
+// SetValueByNumericAt sets the value of the record at index `idx` from the
+// numeric value `v`, converted according to the column type: formatted as a
+// decimal string for TString, truncated to int64 for TInteger, or stored as
+// is for TReal. It does nothing if the index is out of range or the column
+// type is none of those.
+//
+func (col *Column) SetValueByNumericAt(idx int, v float64) {
+	if idx < 0 || idx >= col.Records.Len() {
+		return
+	}
+
+	rec := col.Records[idx]
+
+	switch col.Type {
+	case TString:
+		rec.SetString(strconv.FormatFloat(v, 'f', -1, 64))
+	case TInteger:
+		rec.SetInteger(int64(v))
+	case TReal:
+		rec.SetFloat(v)
+	}
+}
+
+//
+// SetValues sets the value of the column records from `values`, converting
+// each string according to the column type.
+//
+// If the column has no records, one new record is allocated for each value.
+// Otherwise only the first min(len(values), col.Len()) records are updated
+// and the number of records does not change.
+//
+// Conversion errors are dropped, since this method has no error return.
+//
+func (col *Column) SetValues(values []string) {
+	vallen := len(values)
+	reclen := col.Len()
+
+	// Initialize the column records if the column is empty. Every slot must
+	// point to an allocated Record, otherwise SetValue below would be
+	// called on a nil pointer.
+	if reclen <= 0 {
+		col.Records = make([]*Record, vallen)
+		for x := range col.Records {
+			col.Records[x] = new(Record)
+		}
+		reclen = vallen
+	}
+
+	// Pick the shorter of the two lengths.
+	minlen := reclen
+	if vallen < reclen {
+		minlen = vallen
+	}
+
+	for x := 0; x < minlen; x++ {
+		_ = col.Records[x].SetValue(values[x], col.Type)
+	}
+}
+
+//
+// DeleteRecordAt removes the record at index `i` from the column and
+// returns it, or returns nil if the index is out of range. The records
+// after `i` are shifted one position down.
+//
+func (col *Column) DeleteRecordAt(i int) *Record {
+	n := col.Len()
+	if i < 0 || i >= n {
+		return nil
+	}
+
+	deleted := col.Records[i]
+
+	// Shift the tail left, then clear the vacated last slot so that the
+	// backing array does not keep a stale pointer.
+	copy(col.Records[i:], col.Records[i+1:])
+	col.Records[n-1] = nil
+	col.Records = col.Records[:n-1]
+
+	return deleted
+}
diff --git a/lib/tabula/column_test.go b/lib/tabula/column_test.go
new file mode 100644
index 00000000..bf2434fc
--- /dev/null
+++ b/lib/tabula/column_test.go
@@ -0,0 +1,67 @@
+// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+package tabula
+
+import (
+ "testing"
+
+ "github.com/shuLhan/share/lib/test"
+)
+
+// Shared fixtures: the same values as strings and as float64.
+var (
+	data     = []string{"9.987654321", "8.8", "7.7", "6.6", "5.5", "4.4", "3.3"}
+	expFloat = []float64{9.987654321, 8.8, 7.7, 6.6, 5.5, 4.4, 3.3}
+)
+
+// initColReal returns a TReal column filled with one record for each item
+// in data; the test fails immediately if a record cannot be created.
+func initColReal(t *testing.T) (col *Column) {
+	col = NewColumn(TReal, "TREAL")
+
+	for _, v := range data {
+		rec, e := NewRecordBy(v, TReal)
+		if e != nil {
+			t.Fatal(e)
+		}
+		col.PushBack(rec)
+	}
+
+	return col
+}
+
+// TestToFloatSlice checks that a TReal column converts back to the expected
+// float64 values.
+func TestToFloatSlice(t *testing.T) {
+	got := initColReal(t).ToFloatSlice()
+
+	test.Assert(t, "", expFloat, got, true)
+}
+
+func TestToStringSlice(t *testing.T) {
+ var col Column
+
+ for x := range data {
+ rec, e := NewRecordBy(data[x], TString)
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ col.PushBack(rec)
+ }
+
+ got := col.ToStringSlice()
+
+ test.Assert(t, "", data, got, true)
+}
+
+// TestDeleteRecordAt checks that deleting one record removes exactly that
+// value and keeps the order of the others.
+func TestDeleteRecordAt(t *testing.T) {
+	const del = 2
+
+	exp := make([]float64, 0, len(expFloat)-1)
+	exp = append(exp, expFloat[:del]...)
+	exp = append(exp, expFloat[del+1:]...)
+
+	col := initColReal(t)
+	col.DeleteRecordAt(del)
+
+	test.Assert(t, "", exp, col.ToFloatSlice(), true)
+}
diff --git a/lib/tabula/columninterface.go b/lib/tabula/columninterface.go
new file mode 100644
index 00000000..8a961b8b
--- /dev/null
+++ b/lib/tabula/columninterface.go
@@ -0,0 +1,20 @@
+// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+package tabula
+
+//
+// ColumnInterface defines the interface for working with a Column: reading
+// and changing its type and name, replacing its records, and getting the
+// underlying object.
+//
+type ColumnInterface interface {
+	GetType() int
+	GetName() string
+
+	SetType(tipe int)
+	SetName(name string)
+	SetRecords(recs *Records)
+
+	Interface() interface{}
+}
diff --git a/lib/tabula/columns.go b/lib/tabula/columns.go
new file mode 100644
index 00000000..a5cd05d5
--- /dev/null
+++ b/lib/tabula/columns.go
@@ -0,0 +1,147 @@
+// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+package tabula
+
+import (
+ libbytes "github.com/shuLhan/share/lib/bytes"
+ libnumbers "github.com/shuLhan/share/lib/numbers"
+)
+
+//
+// Columns represents a slice of Column.
+//
+type Columns []Column
+
+//
+// Len returns the number of columns.
+//
+func (cols *Columns) Len() int {
+	n := len(*cols)
+	return n
+}
+
+//
+// Reset calls Column.Reset on every column, clearing its flag and records.
+//
+func (cols *Columns) Reset() {
+	list := *cols
+	for x := 0; x < len(list); x++ {
+		list[x].Reset()
+	}
+}
+
+//
+// SetTypes sets the type of each column from `types`, position by position.
+// If the number of types differs from the number of columns, only the first
+// min(len(types), number of columns) columns are updated.
+//
+func (cols *Columns) SetTypes(types []int) {
+	n := len(types)
+	if len(*cols) < n {
+		n = len(*cols)
+	}
+
+	for x, tipe := range types[:n] {
+		(*cols)[x].Type = tipe
+	}
+}
+
+//
+// RandomPick randomly picks `n` columns, skipping the indices listed in
+// `excludeIdx`. If `dup` is true a column can be picked more than once;
+// otherwise each column is picked at most once and `n` is capped to the
+// number of columns minus the number of excluded indices.
+//
+// It returns the picked columns in the order they were picked, the columns
+// that were never picked, and the indices of both.
+//
+func (cols *Columns) RandomPick(n int, dup bool, excludeIdx []int) (
+	picked Columns,
+	unpicked Columns,
+	pickedIdx []int,
+	unpickedIdx []int,
+) {
+	colsLen := len(*cols)
+
+	// Without duplication we cannot pick more columns than are allowed.
+	if allowedLen := colsLen - len(excludeIdx); !dup && n > allowedLen {
+		n = allowedLen
+	}
+
+	for i := 0; i < n; i++ {
+		idx := libnumbers.IntPickRandPositive(colsLen, dup, pickedIdx,
+			excludeIdx)
+
+		pickedIdx = append(pickedIdx, idx)
+		picked = append(picked, (*cols)[idx])
+	}
+
+	// Every column whose index is not in pickedIdx is unpicked.
+nextColumn:
+	for cid := range *cols {
+		for _, idx := range pickedIdx {
+			if cid == idx {
+				continue nextColumn
+			}
+		}
+		unpicked = append(unpicked, (*cols)[cid])
+		unpickedIdx = append(unpickedIdx, cid)
+	}
+
+	return picked, unpicked, pickedIdx, unpickedIdx
+}
+
+//
+// GetMinMaxLength returns the minimum and maximum number of records among
+// all columns. Both are zero if there are no columns.
+//
+func (cols *Columns) GetMinMaxLength() (min, max int) {
+	for x := range *cols {
+		collen := (*cols)[x].Len()
+
+		// Seed the minimum from the first column; starting from the zero
+		// value would make it impossible for any length to be smaller.
+		if x == 0 || collen < min {
+			min = collen
+		}
+		if collen > max {
+			max = collen
+		}
+	}
+	return min, max
+}
+
+//
+// Join concatenates the value of every column at record index `row`, with
+// separator `sep` between values. For records of type TString the value is
+// first passed to libbytes.EncloseToken with `sep` and `esc`, so that a
+// separator inside the value is escaped.
+//
+// Given a slice of columns, where row is 1, sep is `,`, and esc is `\`
+//
+//	  0 1 2
+//	0 A B C
+//	1 D , F <- row
+//	2 G H I
+//
+// this function will return "D,\,,F" in bytes.
+//
+// Every column must have a record at index `row`.
+//
+func (cols *Columns) Join(row int, sep, esc []byte) (v []byte) {
+	for y := range *cols {
+		if y > 0 {
+			v = append(v, sep...)
+		}
+
+		rec := (*cols)[y].Records[row]
+		val := rec.Bytes()
+
+		if rec.Type() == TString {
+			val, _ = libbytes.EncloseToken(val, sep, esc, nil)
+		}
+
+		v = append(v, val...)
+	}
+	return v
+}
diff --git a/lib/tabula/columns_test.go b/lib/tabula/columns_test.go
new file mode 100644
index 00000000..43b30028
--- /dev/null
+++ b/lib/tabula/columns_test.go
@@ -0,0 +1,56 @@
+// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+package tabula
+
+import (
+ "testing"
+
+ "github.com/shuLhan/share/lib/test"
+)
+
+// TestRandomPickColumns picks columns at random, first with and then
+// without duplicates, and checks each time that no unpicked column is equal
+// to a picked one.
+func TestRandomPickColumns(t *testing.T) {
+	var dataset Dataset
+	var e error
+
+	dataset.Init(DatasetModeRows, testColTypes, testColNames)
+
+	dataset.Rows, e = initRows()
+	if e != nil {
+		t.Fatal(e)
+	}
+
+	dataset.TransposeToColumns()
+
+	ncols := 6
+	excludeIdx := []int{3}
+
+	// Five rounds with duplicates allowed, then five rounds without.
+	for _, dup := range []bool{true, false} {
+		for i := 0; i < 5; i++ {
+			picked, unpicked, _, _ :=
+				dataset.Columns.RandomPick(ncols, dup, excludeIdx)
+
+			// An unpicked column must not be among the picked ones.
+			for _, un := range unpicked {
+				for _, pick := range picked {
+					test.Assert(t, "", un, pick, false)
+				}
+			}
+		}
+	}
+}
diff --git a/lib/tabula/dataset.go b/lib/tabula/dataset.go
new file mode 100644
index 00000000..703aca35
--- /dev/null
+++ b/lib/tabula/dataset.go
@@ -0,0 +1,747 @@
+// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+package tabula
+
+import (
+ "errors"
+ "math"
+)
+
+const (
+ // DatasetNoMode is the zero value of the mode; functions in this file
+ // handle it together with DatasetModeMatrix.
+ DatasetNoMode = 0
+ // DatasetModeRows for output mode in rows.
+ DatasetModeRows = 1
+ // DatasetModeColumns for output mode in columns.
+ DatasetModeColumns = 2
+ // DatasetModeMatrix will save data in both rows and columns.
+ DatasetModeMatrix = 4
+)
+
+var (
+ // ErrColIdxOutOfRange is returned when an operation refers to a column
+ // index that does not exist.
+ ErrColIdxOutOfRange = errors.New("tabula: Column index out of range")
+ // ErrInvalidColType is returned when an operation is applied to a
+ // column whose type it does not support.
+ ErrInvalidColType = errors.New("tabula: Invalid column type")
+ // ErrMisColLength returned when operation on columns does not match
+ // between parameter and their length.
+ ErrMisColLength = errors.New("tabula: mismatch on column length")
+)
+
+//
+// Dataset contain the data, mode of saved data, number of columns and rows in
+// data.
+//
+// Depending on Mode the records are kept in Rows, in Columns, or in both.
+//
+type Dataset struct {
+ // Mode define the numeric value of output mode, one of the
+ // DatasetMode* constants.
+ Mode int
+ // Columns is input data that has been parsed, stored column-wise.
+ Columns Columns
+ // Rows is input data that has been parsed, stored row-wise.
+ Rows Rows
+}
+
+//
+// NewDataset allocate a dataset and initialize it with the given mode,
+// column types, and column names (see Init).
+//
+func NewDataset(mode int, types []int, names []string) (
+	dataset *Dataset,
+) {
+	dataset = new(Dataset)
+	dataset.Init(mode, types, names)
+	return dataset
+}
+
+//
+// Init create one column per entry in `types` (none if `types` is nil),
+// assign the column names, and then set the dataset mode.
+//
+func (dataset *Dataset) Init(mode int, types []int, names []string) {
+	// len(nil) is zero, so a nil `types` yields an empty column list.
+	dataset.Columns = make(Columns, len(types))
+	if types != nil {
+		dataset.Columns.SetTypes(types)
+	}
+
+	dataset.SetColumnsName(names)
+	dataset.SetMode(mode)
+}
+
+//
+// Clone return a new dataset with the same mode and the same column
+// definitions (type, name, and value space) as the current dataset.
+//
+// Records and rows are not copied, and the ValueSpace slice is shared with
+// the original column.
+//
+func (dataset *Dataset) Clone() interface{} {
+	clone := NewDataset(dataset.GetMode(), nil, nil)
+
+	for x := range dataset.Columns {
+		src := &dataset.Columns[x]
+
+		clone.PushColumn(Column{
+			Type:       src.Type,
+			Name:       src.Name,
+			ValueSpace: src.ValueSpace,
+		})
+	}
+
+	return clone
+}
+
+//
+// Reset all data and attributes.
+//
+// Rows is replaced with an empty Rows and Reset is called on Columns.
+// The returned error is always nil.
+//
+func (dataset *Dataset) Reset() error {
+ dataset.Rows = Rows{}
+ dataset.Columns.Reset()
+ return nil
+}
+
+//
+// GetMode return the current mode of the dataset, as stored in the Mode
+// field.
+//
+func (dataset *Dataset) GetMode() int {
+ return dataset.Mode
+}
+
+//
+// SetMode change the mode of saved data to `mode`.
+//
+// Any value other than DatasetModeRows or DatasetModeColumns select
+// DatasetModeMatrix. Rows is replaced with an empty list unless the new mode
+// is columns, and Columns is reset unless the new mode is rows.
+//
+func (dataset *Dataset) SetMode(mode int) {
+	switch mode {
+	case DatasetModeRows, DatasetModeColumns:
+		dataset.Mode = mode
+	default:
+		dataset.Mode = DatasetModeMatrix
+	}
+
+	if dataset.Mode != DatasetModeColumns {
+		dataset.Rows = make(Rows, 0)
+	}
+	if dataset.Mode != DatasetModeRows {
+		dataset.Columns.Reset()
+	}
+}
+
+//
+// GetNColumn return the number of column in dataset.
+//
+// When no column is defined and the dataset is in rows mode, the length of
+// the first row is returned instead.
+//
+func (dataset *Dataset) GetNColumn() (ncol int) {
+	if ncol = len(dataset.Columns); ncol > 0 {
+		return ncol
+	}
+
+	if dataset.Mode == DatasetModeRows && len(dataset.Rows) > 0 {
+		return dataset.Rows[0].Len()
+	}
+
+	return 0
+}
+
+//
+// GetNRow return number of rows in dataset.
+//
+// In columns mode this is the number of records in the first column; in
+// rows and matrix mode it is the length of Rows.
+//
+func (dataset *Dataset) GetNRow() (nrow int) {
+	if dataset.Mode == DatasetModeColumns {
+		if len(dataset.Columns) > 0 {
+			// get length of record in the first column
+			return dataset.Columns[0].Len()
+		}
+		return 0
+	}
+
+	switch dataset.Mode {
+	case DatasetModeRows, DatasetModeMatrix, DatasetNoMode:
+		// matrix mode could have empty either in rows or column.
+		return len(dataset.Rows)
+	}
+
+	return 0
+}
+
+//
+// Len return number of row in dataset; it is the same value as GetNRow.
+//
+func (dataset *Dataset) Len() int {
+ return dataset.GetNRow()
+}
+
+//
+// GetColumnsType return the type of every column, in column order.
+// The result is nil when the dataset has no column.
+//
+func (dataset *Dataset) GetColumnsType() (types []int) {
+	for _, col := range dataset.Columns {
+		types = append(types, col.Type)
+	}
+	return types
+}
+
+//
+// SetColumnsType of data in all columns.
+//
+// The current columns are replaced by len(types) newly allocated columns,
+// so previously defined columns are discarded.
+//
+func (dataset *Dataset) SetColumnsType(types []int) {
+ dataset.Columns = make(Columns, len(types))
+ dataset.Columns.SetTypes(types)
+}
+
+//
+// GetColumnTypeAt return type of column at index `idx` in dataset.
+//
+// It return TUndefined and ErrColIdxOutOfRange if `idx` is negative or if no
+// column is defined at that index.
+//
+func (dataset *Dataset) GetColumnTypeAt(idx int) (int, error) {
+	// Check against the Columns slice itself. GetNColumn may report the
+	// width of the first row when no column is defined, in which case
+	// indexing Columns would panic.
+	if idx < 0 || idx >= len(dataset.Columns) {
+		return TUndefined, ErrColIdxOutOfRange
+	}
+
+	return dataset.Columns[idx].Type, nil
+}
+
+//
+// SetColumnTypeAt will set column type at index `idx` to `tipe`.
+//
+// It return ErrColIdxOutOfRange if `idx` is negative or if no column is
+// defined at that index.
+//
+func (dataset *Dataset) SetColumnTypeAt(idx, tipe int) error {
+	// Check against the Columns slice itself. GetNColumn may report the
+	// width of the first row when no column is defined, in which case
+	// indexing Columns would panic.
+	if idx < 0 || idx >= len(dataset.Columns) {
+		return ErrColIdxOutOfRange
+	}
+
+	dataset.Columns[idx].Type = tipe
+	return nil
+}
+
+//
+// GetColumnsName return the name of every column, in column order.
+// The result is nil when the dataset has no column.
+//
+func (dataset *Dataset) GetColumnsName() (names []string) {
+	for _, col := range dataset.Columns {
+		names = append(names, col.Name)
+	}
+	return names
+}
+
+//
+// SetColumnsName set the name of columns, in order, from `names`.
+//
+// If the dataset has no column yet, len(names) columns are created first.
+// Otherwise only the first min(len(names), number of columns) columns are
+// renamed. An empty `names` is a no-op.
+//
+func (dataset *Dataset) SetColumnsName(names []string) {
+	nameslen := len(names)
+	if nameslen <= 0 {
+		// empty names, return immediately.
+		return
+	}
+
+	// Use the real number of defined columns. GetNColumn may report the
+	// width of the first row while Columns is still empty, which would
+	// skip the allocation below and panic when assigning the names.
+	collen := len(dataset.Columns)
+	if collen <= 0 {
+		dataset.Columns = make(Columns, nameslen)
+		collen = nameslen
+	}
+
+	// find minimum length
+	minlen := collen
+	if nameslen < collen {
+		minlen = nameslen
+	}
+
+	for x := 0; x < minlen; x++ {
+		dataset.Columns[x].Name = names[x]
+	}
+}
+
+//
+// AddColumn build an empty column from the given type, name, and value
+// space, and hand it to PushColumn.
+//
+func (dataset *Dataset) AddColumn(tipe int, name string, vs []string) {
+	dataset.PushColumn(Column{
+		Type:       tipe,
+		Name:       name,
+		ValueSpace: vs,
+	})
+}
+
+//
+// GetColumn return pointer to column object at index `idx`. If `idx` is out
+// of range (negative, or not less than the number of columns) return nil.
+//
+// If the dataset is in rows mode it is transposed to columns first, which
+// changes the dataset as a side effect.
+//
+func (dataset *Dataset) GetColumn(idx int) (col *Column) {
+	// The upper bound is exclusive: idx == number of columns is already
+	// out of range.
+	if idx < 0 || idx >= dataset.GetNColumn() {
+		return nil
+	}
+
+	if dataset.Mode == DatasetModeRows {
+		dataset.TransposeToColumns()
+	}
+
+	// GetNColumn may have reported the width of the first row; make sure
+	// the column really exists before taking its address.
+	if idx >= len(dataset.Columns) {
+		return nil
+	}
+
+	return &dataset.Columns[idx]
+}
+
+//
+// GetColumnByName return pointer to the first column whose name is `name`,
+// or nil if there is none. A dataset in rows mode is transposed to columns
+// first.
+//
+func (dataset *Dataset) GetColumnByName(name string) (col *Column) {
+	if dataset.Mode == DatasetModeRows {
+		dataset.TransposeToColumns()
+	}
+
+	for x := range dataset.Columns {
+		if dataset.Columns[x].Name == name {
+			return &dataset.Columns[x]
+		}
+	}
+
+	return nil
+}
+
+//
+// GetColumns return pointer to the Columns field of dataset, without
+// transposing.
+//
+func (dataset *Dataset) GetColumns() *Columns {
+ return &dataset.Columns
+}
+
+//
+// SetColumns will replace current columns with new one from parameter.
+// `cols` is dereferenced, so it must not be nil.
+//
+func (dataset *Dataset) SetColumns(cols *Columns) {
+ dataset.Columns = *cols
+}
+
+//
+// GetRow return the row stored at index `idx` in Rows, or nil if `idx` is
+// negative or beyond the last row.
+//
+func (dataset *Dataset) GetRow(idx int) *Row {
+	if idx < 0 || idx >= dataset.Rows.Len() {
+		return nil
+	}
+	return dataset.Rows[idx]
+}
+
+//
+// GetRows return pointer to the Rows field of dataset, without transposing.
+//
+func (dataset *Dataset) GetRows() *Rows {
+ return &dataset.Rows
+}
+
+//
+// SetRows will replace current rows with new one from parameter.
+// `rows` is dereferenced, so it must not be nil.
+//
+func (dataset *Dataset) SetRows(rows *Rows) {
+ dataset.Rows = *rows
+}
+
+//
+// GetData return the data in the shape selected by the mode: *Rows in rows
+// mode, *Columns in columns mode, and *Matrix (pointing to both) in matrix
+// mode or when no mode is set. Any other mode return nil.
+//
+func (dataset *Dataset) GetData() interface{} {
+	mode := dataset.Mode
+
+	if mode == DatasetModeRows {
+		return &dataset.Rows
+	}
+	if mode == DatasetModeColumns {
+		return &dataset.Columns
+	}
+	if mode == DatasetModeMatrix || mode == DatasetNoMode {
+		return &Matrix{
+			Columns: &dataset.Columns,
+			Rows:    &dataset.Rows,
+		}
+	}
+
+	return nil
+}
+
+//
+// GetDataAsRows return data in rows mode.
+//
+// A dataset in columns mode is transposed to rows first, which changes the
+// dataset itself and not only the returned value.
+//
+func (dataset *Dataset) GetDataAsRows() *Rows {
+ if dataset.Mode == DatasetModeColumns {
+ dataset.TransposeToRows()
+ }
+ return &dataset.Rows
+}
+
+//
+// GetDataAsColumns return data in columns mode.
+//
+// A dataset in rows mode is transposed to columns first, which changes the
+// dataset itself and not only the returned value.
+//
+func (dataset *Dataset) GetDataAsColumns() (columns *Columns) {
+ if dataset.Mode == DatasetModeRows {
+ dataset.TransposeToColumns()
+ }
+ return &dataset.Columns
+}
+
+//
+// TransposeToColumns move all data from rows (horizontal) to columns
+// (vertical) mode.
+//
+// If no column is defined, the columns are created from the record types of
+// the first row. At most min(length of first row, number of columns) fields
+// are copied from each row; a row shorter than that is padded with empty
+// records so every column keep the same length.
+//
+// When the original mode is rows, the mode become columns and Rows is
+// emptied. In the other modes the rows are kept, and nothing is done if the
+// first column already contain records.
+//
+func (dataset *Dataset) TransposeToColumns() {
+	if dataset.GetNRow() <= 0 {
+		// nothing to transpose
+		return
+	}
+
+	// Count the defined columns directly. GetNColumn falls back to the
+	// width of the first row in rows mode, which would skip the
+	// initialization below and make indexing Columns panic.
+	ncol := len(dataset.Columns)
+	if ncol <= 0 {
+		// if no columns defined, initialize it using record type
+		// in the first row.
+		types := dataset.GetRow(0).Types()
+		dataset.SetColumnsType(types)
+		ncol = len(types)
+	}
+
+	orgmode := dataset.GetMode()
+
+	switch orgmode {
+	case DatasetModeRows:
+		// do nothing.
+	case DatasetModeColumns, DatasetModeMatrix, DatasetNoMode:
+		// return if column record is not empty, its already
+		// transposed
+		if len(dataset.Columns) > 0 && dataset.Columns[0].Len() > 0 {
+			return
+		}
+	}
+
+	// use the least length
+	minlen := len(*dataset.GetRow(0))
+	if minlen > ncol {
+		minlen = ncol
+	}
+
+	switch orgmode {
+	case DatasetModeRows, DatasetNoMode:
+		dataset.SetMode(DatasetModeColumns)
+	}
+
+	for _, row := range dataset.Rows {
+		rowlen := len(*row)
+
+		for y := 0; y < minlen; y++ {
+			if y < rowlen {
+				dataset.Columns[y].PushBack((*row)[y])
+			} else {
+				// Row is shorter than the first row; pad it
+				// the same way TransposeToRows pad short
+				// columns.
+				dataset.Columns[y].PushBack(NewRecord())
+			}
+		}
+	}
+
+	// reset the rows data only if original mode is rows
+	// this to prevent empty data when mode is matrix.
+	if orgmode == DatasetModeRows {
+		dataset.Rows = nil
+	}
+}
+
+//
+// TransposeToRows rebuild Rows from the records stored in Columns.
+//
+// Nothing is done in rows mode. The number of rows created is the length of
+// the longest column; shorter columns are padded with empty records. When
+// the original mode is columns, the mode become rows and the columns are
+// reset afterwards.
+//
+func (dataset *Dataset) TransposeToRows() {
+	orgmode := dataset.GetMode()
+
+	switch orgmode {
+	case DatasetModeRows:
+		// already transposed
+		return
+	case DatasetModeColumns:
+		// only set mode if transposing from columns to rows
+		dataset.SetMode(DatasetModeRows)
+	}
+
+	// Find the length of the longest column.
+	ncol := len(dataset.Columns)
+	rowlen := math.MinInt32
+
+	for x := range dataset.Columns {
+		if n := dataset.Columns[x].Len(); n > rowlen {
+			rowlen = n
+		}
+	}
+
+	// Build each row from the record at the same position in each column.
+	rows := make(Rows, 0)
+
+	for r := 0; r < rowlen; r++ {
+		row := make(Row, ncol)
+
+		for f := range row {
+			col := &dataset.Columns[f]
+
+			if r < col.Len() {
+				row[f] = col.Records[r]
+			} else {
+				row[f] = NewRecord()
+			}
+		}
+
+		rows = append(rows, &row)
+	}
+
+	dataset.Rows = rows
+
+	// Only reset the columns if original dataset mode is "columns".
+	// This to prevent empty data when mode is matrix.
+	if orgmode == DatasetModeColumns {
+		dataset.Columns.Reset()
+	}
+}
+
+//
+// PushRow add `row` to the dataset: appended to Rows in rows mode, spread
+// over Columns in columns mode, and both in matrix mode (or when no mode is
+// set).
+//
+func (dataset *Dataset) PushRow(row *Row) {
+	mode := dataset.GetMode()
+	both := mode == DatasetModeMatrix || mode == DatasetNoMode
+
+	if both || mode == DatasetModeRows {
+		dataset.Rows = append(dataset.Rows, row)
+	}
+	if both || mode == DatasetModeColumns {
+		dataset.PushRowToColumns(row)
+	}
+}
+
+//
+// PushRowToColumns append each record of `row` to the column at the same
+// index. Columns are created on demand when none exist, and records beyond
+// the number of columns are ignored. An empty row is a no-op.
+//
+func (dataset *Dataset) PushRowToColumns(row *Row) {
+	rowlen := row.Len()
+	if rowlen <= 0 {
+		// return immediately if no data in row.
+		return
+	}
+
+	// Create one column per record if no column exists yet.
+	if len(dataset.Columns) <= 0 {
+		dataset.Columns = make(Columns, rowlen)
+	}
+
+	// Copy only as many records as both sides can hold.
+	n := len(dataset.Columns)
+	if rowlen < n {
+		n = rowlen
+	}
+
+	for x := 0; x < n; x++ {
+		dataset.Columns[x].PushBack((*row)[x])
+	}
+}
+
+//
+// FillRowsWithColumn given a column, fill the dataset with row where the
+// record only set at index `colIdx`. It only works in rows mode; in any
+// other mode it does nothing.
+//
+// Example, content of dataset was (`-` mark a nil record),
+//
+//	index: 0 1 2
+//	       A B C
+//	       X - -    (step 1) first row that is empty at index 2
+//
+// If we filled column at index 2 with [Y Z], the dataset will become:
+//
+//	index: 0 1 2
+//	       A B C
+//	       X - Y    (step 2) fill the empty row
+//	       . . Z    (step 3) new row, `.` is an empty record
+//
+// If the column has fewer records than there are rows to fill, the
+// remaining rows are left untouched.
+//
+func (dataset *Dataset) FillRowsWithColumn(colIdx int, col Column) {
+	if dataset.GetMode() != DatasetModeRows {
+		// Only work if dataset mode is ROWS
+		return
+	}
+
+	nrow := dataset.GetNRow()
+	emptyAt := nrow
+
+	// (step 1) Find the row with empty records
+	for x, row := range dataset.Rows {
+		if row.IsNilAt(colIdx) {
+			emptyAt = x
+			break
+		}
+	}
+
+	// (step 2) Fill the empty rows using column records. Stop when the
+	// column runs out of records so indexing it can not go out of range.
+	nrec := col.Len()
+	y := 0
+	for x := emptyAt; x < nrow && y < nrec; x++ {
+		dataset.Rows[x].SetValueAt(colIdx, col.Records[y])
+		y++
+	}
+
+	// (step 3) Continue filling the column but using dummy row which
+	// contain only record at index `colIdx`.
+	ncol := dataset.GetNColumn()
+	for ; y < nrec; y++ {
+		row := make(Row, ncol)
+
+		for z := range row {
+			if z == colIdx {
+				row[z] = col.Records[y]
+			} else {
+				row[z] = NewRecord()
+			}
+		}
+
+		dataset.PushRow(&row)
+	}
+}
+
+//
+// PushColumn add `col` to the dataset. A column whose name is not used yet
+// is appended; otherwise its records are merged into the existing column
+// with that name.
+//
+// In rows mode the records end up in Rows and the stored column keep no
+// records. In matrix mode a new column is also pushed to the rows.
+//
+func (dataset *Dataset) PushColumn(col Column) {
+	// Look for an existing column with the same name.
+	colIdx := 0
+	exist := false
+	for x := range dataset.Columns {
+		if dataset.Columns[x].Name == col.Name {
+			colIdx, exist = x, true
+			break
+		}
+	}
+
+	mode := dataset.GetMode()
+
+	if mode == DatasetModeRows {
+		if exist {
+			dataset.FillRowsWithColumn(colIdx, col)
+			return
+		}
+		// append new column
+		dataset.Columns = append(dataset.Columns, col)
+		dataset.PushColumnToRows(col)
+		// Remove records in column
+		dataset.Columns[dataset.GetNColumn()-1].Reset()
+		return
+	}
+
+	switch mode {
+	case DatasetModeColumns, DatasetModeMatrix, DatasetNoMode:
+		// handled below.
+	default:
+		return
+	}
+
+	if exist {
+		dataset.Columns[colIdx].PushRecords(col.Records)
+		return
+	}
+
+	dataset.Columns = append(dataset.Columns, col)
+	if mode != DatasetModeColumns {
+		dataset.PushColumnToRows(col)
+	}
+}
+
+//
+// PushColumnToRows add each record in column to each rows, from top to bottom.
+//
+func (dataset *Dataset) PushColumnToRows(col Column) {
+ colsize := col.Len()
+ if colsize <= 0 {
+ // Do nothing if column is empty.
+ return
+ }
+
+ nrow := dataset.GetNRow()
+ if nrow <= 0 {
+ // If no existing rows in dataset, initialize the rows slice.
+ dataset.Rows = make(Rows, colsize)
+
+ for nrow = 0; nrow < colsize; nrow++ {
+ row := make(Row, 0)
+ dataset.Rows[nrow] = &row
+ }
+ }
+
+ // Pick the minimum length between column or current row length.
+ minrow := nrow
+
+ if colsize < nrow {
+ minrow = colsize
+ }
+
+ // Push each record in column to each rows
+ var row *Row
+ var rec *Record
+
+ for x := 0; x < minrow; x++ {
+ row = dataset.Rows[x]
+ rec = col.Records[x]
+
+ row.PushBack(rec)
+ }
+}
+
+//
+// MergeColumns push every column of `other` into the current dataset.
+//
+// If `other` is in rows mode it is transposed to columns for the duration
+// of the merge and transposed back to rows afterwards.
+//
+func (dataset *Dataset) MergeColumns(other DatasetInterface) {
+	fromRows := other.GetMode() == DatasetModeRows
+
+	if fromRows {
+		other.TransposeToColumns()
+	}
+
+	for _, col := range *other.GetDataAsColumns() {
+		dataset.PushColumn(col)
+	}
+
+	if fromRows {
+		other.TransposeToRows()
+	}
+}
+
+//
+// MergeRows append rows from other dataset into current dataset.
+//
+// The rows are taken from other.GetDataAsRows() and pushed one by one with
+// PushRow; the row pointers are shared, not copied.
+// NOTE(review): for a *Dataset in columns mode, GetDataAsRows transposes
+// `other` to rows and nothing here transposes it back -- confirm callers
+// expect that.
+//
+func (dataset *Dataset) MergeRows(other DatasetInterface) {
+ rows := other.GetDataAsRows()
+ for _, row := range *rows {
+ dataset.PushRow(row)
+ }
+}
+
+//
+// DeleteRow will detach row at index `i` from dataset and return it.
+// It return nil if `i` is negative or not less than the number of rows.
+//
+// In columns mode the returned row is assembled from the record at index
+// `i` of each column (an empty record for a column that is too short), and
+// that record is deleted from the column. In matrix mode the row is deleted
+// from Rows and the record at index `i` is deleted from every column.
+//
+func (dataset *Dataset) DeleteRow(i int) (row *Row) {
+	if i < 0 {
+		return nil
+	}
+
+	orgmode := dataset.GetMode()
+
+	if orgmode == DatasetModeColumns {
+		// Rows is empty in columns mode, so the bound must come from
+		// the columns.
+		if i >= dataset.GetNRow() {
+			return nil
+		}
+
+		// Work on the columns directly. Going through rows and back
+		// would require deleting the record only once, not once for
+		// the row and once more for each column.
+		r := make(Row, len(dataset.Columns))
+		for x := range dataset.Columns {
+			if i < dataset.Columns[x].Len() {
+				r[x] = dataset.Columns[x].Records[i]
+				dataset.Columns[x].DeleteRecordAt(i)
+			} else {
+				r[x] = NewRecord()
+			}
+		}
+		return &r
+	}
+
+	if i >= dataset.Rows.Len() {
+		return nil
+	}
+
+	row = dataset.Rows.Del(i)
+
+	if orgmode != DatasetModeRows {
+		// Delete record in each columns as the same index as deleted
+		// row.
+		for x := range dataset.Columns {
+			dataset.Columns[x].DeleteRecordAt(i)
+		}
+	}
+
+	return row
+}
diff --git a/lib/tabula/dataset_bench_test.go b/lib/tabula/dataset_bench_test.go
new file mode 100644
index 00000000..86e36cc9
--- /dev/null
+++ b/lib/tabula/dataset_bench_test.go
@@ -0,0 +1,20 @@
+// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+package tabula
+
+import (
+ "testing"
+)
+
+// BenchmarkPushRow measure pushing the rows of the datasetRows fixture into
+// a rows-mode dataset. The same dataset is reused for every iteration, so it
+// keeps growing while the benchmark runs.
+func BenchmarkPushRow(b *testing.B) {
+ dataset := NewDataset(DatasetModeRows, nil, nil)
+
+ for i := 0; i < b.N; i++ {
+ e := populateWithRows(dataset)
+ if e != nil {
+ b.Fatal(e)
+ }
+ }
+}
diff --git a/lib/tabula/dataset_test.go b/lib/tabula/dataset_test.go
new file mode 100644
index 00000000..0b43f71c
--- /dev/null
+++ b/lib/tabula/dataset_test.go
@@ -0,0 +1,365 @@
+// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+package tabula
+
+import (
+ "fmt"
+ "testing"
+
+ "github.com/shuLhan/share/lib/test"
+)
+
+// datasetRows is the test fixture in row layout: each entry is one row with
+// an integer, a real, and a string field.
+var datasetRows = [][]string{
+ {"0", "1", "A"},
+ {"1", "1.1", "B"},
+ {"2", "1.2", "A"},
+ {"3", "1.3", "B"},
+ {"4", "1.4", "C"},
+ {"5", "1.5", "D"},
+ {"6", "1.6", "C"},
+ {"7", "1.7", "D"},
+ {"8", "1.8", "E"},
+ {"9", "1.9", "F"},
+}
+
+// datasetCols is the same fixture as datasetRows in column layout.
+var datasetCols = [][]string{
+ {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"},
+ {"1", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "1.8", "1.9"},
+ {"A", "B", "A", "B", "C", "D", "C", "D", "E", "F"},
+}
+
+// datasetTypes is the record type of each fixture column, in column order.
+var datasetTypes = []int{
+ TInteger,
+ TReal,
+ TString,
+}
+
+// datasetNames is the name of each fixture column, in column order.
+var datasetNames = []string{"int", "real", "string"}
+
+// populateWithRows convert every entry of datasetRows into a Row, using the
+// types in datasetTypes, and push it to `dataset`. It stop at and return the
+// first conversion error.
+func populateWithRows(dataset *Dataset) error {
+	for y := range datasetRows {
+		fields := datasetRows[y]
+		row := make(Row, len(fields))
+
+		for x := range fields {
+			rec, e := NewRecordBy(fields[x], datasetTypes[x])
+			if e != nil {
+				return e
+			}
+			row[x] = rec
+		}
+
+		dataset.PushRow(&row)
+	}
+	return nil
+}
+
+// populateWithColumns convert every entry of datasetCols into a Column and
+// push it to `dataset`, failing the test on the first error.
+func populateWithColumns(t *testing.T, dataset *Dataset) {
+	for x, values := range datasetCols {
+		col, e := NewColumnString(values, datasetTypes[x], datasetNames[x])
+		if e != nil {
+			t.Fatal(e)
+		}
+
+		dataset.PushColumn(*col)
+	}
+}
+
+// createDataset return a rows-mode dataset with the fixture types and
+// names, filled with the rows of datasetRows.
+func createDataset(t *testing.T) (dataset *Dataset) {
+	dataset = NewDataset(DatasetModeRows, datasetTypes, datasetNames)
+
+	if e := populateWithRows(dataset); e != nil {
+		t.Fatal(e)
+	}
+
+	return dataset
+}
+
+// DatasetStringJoinByIndex return the concatenation of "&" followed by the
+// default formatting of dataset[i], for each index i in `indis`, in order.
+func DatasetStringJoinByIndex(t *testing.T, dataset [][]string, indis []int) (res string) {
+	for _, idx := range indis {
+		res += fmt.Sprint("&", dataset[idx])
+	}
+	return res
+}
+
+// DatasetRowsJoin return the concatenation of "&" followed by the default
+// formatting of each row in datasetRows.
+func DatasetRowsJoin(t *testing.T) (s string) {
+	for _, row := range datasetRows {
+		s += fmt.Sprint("&", row)
+	}
+	return s
+}
+
+// DatasetColumnsJoin return the concatenation of the default formatting of
+// each column in datasetCols.
+func DatasetColumnsJoin(t *testing.T) (s string) {
+	for _, col := range datasetCols {
+		s += fmt.Sprint(col)
+	}
+	return s
+}
+
+func TestSplitRowsByNumeric(t *testing.T) {
+	dataset := createDataset(t)
+
+	// assertRows check that `split` contains exactly the fixture rows at
+	// the indices in expIdx, in that order.
+	assertRows := func(split DatasetInterface, expIdx []int) {
+		exp := DatasetStringJoinByIndex(t, datasetRows, expIdx)
+		got := fmt.Sprint(split.GetDataAsRows())
+
+		test.Assert(t, "", exp, got, true)
+	}
+
+	// Split integer by float
+	less, greater, e := SplitRowsByNumeric(dataset, 0, 4.5)
+	if e != nil {
+		t.Fatal(e)
+	}
+
+	assertRows(less, []int{0, 1, 2, 3, 4})
+	assertRows(greater, []int{5, 6, 7, 8, 9})
+
+	// Split by float
+	less, greater, e = SplitRowsByNumeric(dataset, 1, 1.8)
+	if e != nil {
+		t.Fatal(e)
+	}
+
+	assertRows(less, []int{0, 1, 2, 3, 4, 5, 6, 7})
+	assertRows(greater, []int{8, 9})
+}
+
+func TestSplitRowsByCategorical(t *testing.T) {
+	dataset := createDataset(t)
+
+	in, ex, e := SplitRowsByCategorical(dataset, 2, []string{"A", "D"})
+	if e != nil {
+		t.Fatal(e)
+	}
+
+	// Rows whose third field is A or D.
+	exp := DatasetStringJoinByIndex(t, datasetRows, []int{0, 2, 5, 7})
+	got := fmt.Sprint(in.GetDataAsRows())
+
+	test.Assert(t, "", exp, got, true)
+
+	// Every other row.
+	exp = DatasetStringJoinByIndex(t, datasetRows, []int{1, 3, 4, 6, 8, 9})
+	got = fmt.Sprint(ex.GetDataAsRows())
+
+	test.Assert(t, "", exp, got, true)
+}
+
+// TestModeColumnsPushColumn push the fixture columns into a columns-mode
+// dataset and check that the columns hold the records while Rows stays
+// empty.
+func TestModeColumnsPushColumn(t *testing.T) {
+ dataset := NewDataset(DatasetModeColumns, nil, nil)
+
+ exp := ""
+ got := ""
+ for x := range datasetCols {
+ col, e := NewColumnString(datasetCols[x], datasetTypes[x],
+ datasetNames[x])
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ dataset.PushColumn(*col)
+
+ // Accumulate the expected and the stored column side by side.
+ exp += fmt.Sprint(datasetCols[x])
+ got += fmt.Sprint(dataset.Columns[x].Records)
+ }
+
+ test.Assert(t, "", exp, got, true)
+
+ // Check rows
+ exp = ""
+ got = fmt.Sprint(dataset.Rows)
+ test.Assert(t, "", exp, got, true)
+}
+
+// TestModeRowsPushColumn push the fixture columns into a rows-mode dataset
+// and check that the records end up in Rows while the columns keep only
+// their definition.
+func TestModeRowsPushColumn(t *testing.T) {
+	dataset := NewDataset(DatasetModeRows, nil, nil)
+
+	populateWithColumns(t, dataset)
+
+	// Check rows
+	expRows := DatasetRowsJoin(t)
+	gotRows := fmt.Sprint(dataset.Rows)
+
+	test.Assert(t, "", expRows, gotRows, true)
+
+	// Check columns
+	expCols := "[{int 1 0 [] []} {real 2 0 [] []} {string 0 0 [] []}]"
+	gotCols := fmt.Sprint(dataset.Columns)
+
+	test.Assert(t, "", expCols, gotCols, true)
+}
+
+// TestModeMatrixPushColumn push the fixture columns into a matrix-mode
+// dataset and check that both the columns and the rows hold the records.
+func TestModeMatrixPushColumn(t *testing.T) {
+ dataset := NewDataset(DatasetModeMatrix, nil, nil)
+
+ exp := ""
+ got := ""
+ for x := range datasetCols {
+ col, e := NewColumnString(datasetCols[x], datasetTypes[x],
+ datasetNames[x])
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ dataset.PushColumn(*col)
+
+ // Accumulate the expected and the stored column side by side.
+ exp += fmt.Sprint(datasetCols[x])
+ got += fmt.Sprint(dataset.Columns[x].Records)
+ }
+
+ test.Assert(t, "", exp, got, true)
+
+ // Check rows
+ exp = DatasetRowsJoin(t)
+ got = fmt.Sprint(dataset.Rows)
+
+ test.Assert(t, "", exp, got, true)
+}
+
+// TestModeRowsPushRows push the fixture rows into a rows-mode dataset and
+// check that Rows hold them in order.
+func TestModeRowsPushRows(t *testing.T) {
+	dataset := NewDataset(DatasetModeRows, nil, nil)
+
+	if e := populateWithRows(dataset); e != nil {
+		t.Fatal(e)
+	}
+
+	test.Assert(t, "", DatasetRowsJoin(t), fmt.Sprint(dataset.Rows), true)
+}
+
+func TestModeColumnsPushRows(t *testing.T) {
+ dataset := NewDataset(DatasetModeColumns, nil, nil)
+
+ e := populateWithRows(dataset)
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ // check rows
+ exp := ""
+ got := fmt.Sprint(dataset.Rows)
+
+ test.Assert(t, "", exp, got, true)
+
+ // check columns
+ exp = DatasetColumnsJoin(t)
+ got = ""
+ for x := range dataset.Columns {
+ got += fmt.Sprint(dataset.Columns[x].Records)
+ }
+
+ test.Assert(t, "", exp, got, true)
+}
+
+func TestModeMatrixPushRows(t *testing.T) {
+ dataset := NewDataset(DatasetModeMatrix, nil, nil)
+
+ e := populateWithRows(dataset)
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ exp := DatasetRowsJoin(t)
+ got := fmt.Sprint(dataset.Rows)
+
+ test.Assert(t, "", exp, got, true)
+
+ // check columns
+ exp = DatasetColumnsJoin(t)
+ got = ""
+ for x := range dataset.Columns {
+ got += fmt.Sprint(dataset.Columns[x].Records)
+ }
+
+ test.Assert(t, "", exp, got, true)
+}
+
+// TestSelectRowsWhere check that selecting on the first column with value
+// "9" yields the last fixture row.
+func TestSelectRowsWhere(t *testing.T) {
+	dataset := NewDataset(DatasetModeMatrix, nil, nil)
+
+	if e := populateWithRows(dataset); e != nil {
+		t.Fatal(e)
+	}
+
+	// select all rows where the first column value is 9.
+	selected := SelectRowsWhere(dataset, 0, "9")
+
+	test.Assert(t, "", dataset.GetRow(9), selected.GetRow(0), true)
+}
+
+// TestDeleteRow delete one row from a matrix-mode dataset and check that
+// both the rows and the columns lose exactly that entry.
+func TestDeleteRow(t *testing.T) {
+ dataset := NewDataset(DatasetModeMatrix, nil, nil)
+
+ e := populateWithRows(dataset)
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ delIdx := 2
+
+ // Check rows len.
+ exp := dataset.Len() - 1
+ dataset.DeleteRow(delIdx)
+ got := dataset.Len()
+
+ test.Assert(t, "", exp, got, true)
+
+ // Check columns len.
+ for _, col := range dataset.Columns {
+ got = col.Len()
+
+ test.Assert(t, "", exp, got, true)
+ }
+
+ // Check rows data.
+ // ridx walks the dataset rows; it is not advanced for the deleted
+ // fixture row.
+ ridx := 0
+ for x, row := range datasetRows {
+ if x == delIdx {
+ continue
+ }
+ exp := fmt.Sprint("&", row)
+ got := fmt.Sprint(dataset.GetRow(ridx))
+ ridx++
+
+ test.Assert(t, "", exp, got, true)
+ }
+
+ // Check columns data.
+ for x := range dataset.Columns {
+ col := datasetCols[x]
+
+ // Expected column: the fixture column without the deleted index.
+ coldel := []string{}
+ coldel = append(coldel, col[:delIdx]...)
+ coldel = append(coldel, col[delIdx+1:]...)
+
+ exp := fmt.Sprint(coldel)
+ got := fmt.Sprint(dataset.Columns[x].Records)
+ test.Assert(t, "", exp, got, true)
+ }
+}
diff --git a/lib/tabula/datasetinterface.go b/lib/tabula/datasetinterface.go
new file mode 100644
index 00000000..b68b5b12
--- /dev/null
+++ b/lib/tabula/datasetinterface.go
@@ -0,0 +1,442 @@
+// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+package tabula
+
+import (
+ "encoding/json"
+ "fmt"
+ "io/ioutil"
+
+ "github.com/shuLhan/share/lib/debug"
+)
+
+//
+// DatasetInterface is the interface for working with DSV data.
+//
+// The package-level functions below this interface take it as their dataset
+// parameter. Dataset provides methods with these names and signatures.
+//
+type DatasetInterface interface {
+ // Initialization, copy, and reset.
+ Init(mode int, types []int, names []string)
+ Clone() interface{}
+ Reset() error
+
+ // Storage mode, one of the DatasetMode* constants.
+ GetMode() int
+ SetMode(mode int)
+
+ // Dimensions.
+ GetNColumn() int
+ GetNRow() int
+ Len() int
+
+ // Column types, for all columns or a single one.
+ GetColumnsType() []int
+ SetColumnsType(types []int)
+
+ GetColumnTypeAt(idx int) (int, error)
+ SetColumnTypeAt(idx, tipe int) error
+
+ // Column names.
+ GetColumnsName() []string
+ SetColumnsName(names []string)
+
+ // Column access.
+ AddColumn(tipe int, name string, vs []string)
+ GetColumn(idx int) *Column
+ GetColumnByName(name string) *Column
+ GetColumns() *Columns
+ SetColumns(*Columns)
+
+ // Row access.
+ GetRow(idx int) *Row
+ GetRows() *Rows
+ SetRows(*Rows)
+ DeleteRow(idx int) *Row
+
+ // Data in the current, or in a requested, layout.
+ GetData() interface{}
+ GetDataAsRows() *Rows
+ GetDataAsColumns() *Columns
+
+ // Conversion between layouts.
+ TransposeToColumns()
+ TransposeToRows()
+
+ // Adding rows and columns.
+ PushRow(r *Row)
+ PushRowToColumns(r *Row)
+ FillRowsWithColumn(colidx int, col Column)
+ PushColumn(col Column)
+ PushColumnToRows(col Column)
+
+ // Merging another dataset.
+ MergeColumns(DatasetInterface)
+ MergeRows(DatasetInterface)
+}
+
+//
+// ReadDatasetConfig open dataset configuration file and initialize dataset
+// field from there.
+//
+func ReadDatasetConfig(ds interface{}, fcfg string) (e error) {
+ cfg, e := ioutil.ReadFile(fcfg)
+
+ if nil != e {
+ return e
+ }
+
+ return json.Unmarshal(cfg, ds)
+}
+
+//
+// SortColumnsByIndex will sort all columns using sorted index.
+//
+// A dataset in rows mode is transposed to columns first and is not
+// transposed back. Each column's records are replaced by the result of
+// Records.SortByIndex(sortedIdx).
+//
+func SortColumnsByIndex(di DatasetInterface, sortedIdx []int) {
+ if di.GetMode() == DatasetModeRows {
+ di.TransposeToColumns()
+ }
+
+ cols := di.GetColumns()
+ for x, col := range *cols {
+ // `col` is a copy of the column; the sorted records are stored
+ // back through the slice element.
+ colsorted := col.Records.SortByIndex(sortedIdx)
+ (*cols)[x].SetRecords(colsorted)
+ }
+}
+
+//
+// SplitRowsByNumeric split the rows of `di` in two, using the numeric value
+// of the record in column `colidx`: rows with a value less than `splitVal`
+// go to the first result, all other rows go to the second.
+//
+// For example, given two continuous attribute,
+//
+//	A: {1,2,3,4}
+//	B: {5,6,7,8}
+//
+// if colidx is (1) B and splitVal is 7, the data will be split into the
+// left set
+//
+//	A': {1,2}
+//	B': {5,6}
+//
+// and the right set
+//
+//	A'': {3,4}
+//	B'': {7,8}
+//
+// It return ErrInvalidColType if the column is neither integer nor real.
+//
+func SplitRowsByNumeric(di DatasetInterface, colidx int, splitVal float64) (
+	splitLess DatasetInterface,
+	splitGreater DatasetInterface,
+	e error,
+) {
+	// check type of column
+	coltype, e := di.GetColumnTypeAt(colidx)
+	if e != nil {
+		return
+	}
+
+	switch coltype {
+	case TInteger, TReal:
+		// numeric column, continue.
+	default:
+		return nil, nil, ErrInvalidColType
+	}
+
+	// Remember the mode so a columns dataset can be converted back.
+	orgmode := di.GetMode()
+	fromColumns := orgmode == DatasetModeColumns
+
+	if fromColumns {
+		di.TransposeToRows()
+	}
+
+	if debug.Value >= 2 {
+		fmt.Println("[tabula] dataset:", di)
+	}
+
+	splitLess = di.Clone().(DatasetInterface)
+	splitGreater = di.Clone().(DatasetInterface)
+
+	for _, row := range *di.GetRows() {
+		target := splitGreater
+		if (*row)[colidx].Float() < splitVal {
+			target = splitLess
+		}
+		target.PushRow(row)
+	}
+
+	if debug.Value >= 2 {
+		fmt.Println("[tabula] split less:", splitLess)
+		fmt.Println("[tabula] split greater:", splitGreater)
+	}
+
+	// In matrix mode nothing is needed, since the columns are already
+	// filled when pushing new row.
+	if fromColumns {
+		di.TransposeToColumns()
+		splitLess.TransposeToColumns()
+		splitGreater.TransposeToColumns()
+	}
+
+	return
+}
+
+//
+// SplitRowsByCategorical split the rows of `di` in two, using the string
+// value of the record in column `colidx`: rows whose value is one of
+// `splitVal` go to the first result, all other rows go to the second.
+//
+// For example, given two attributes,
+//
+//	X: [A,B,A,B,C,D,C,D]
+//	Y: [1,2,3,4,5,6,7,8]
+//
+// if colidx is (0) or X and split value is a set `[A,C]`, the data will be
+// split into the left set which contain all rows that have A or C,
+//
+//	X': [A,A,C,C]
+//	Y': [1,3,5,7]
+//
+// and the right set, excluded set, will contain all rows which is not A or
+// C,
+//
+//	X'': [B,B,D,D]
+//	Y'': [2,4,6,8]
+//
+// It return ErrInvalidColType if the column is not a string column.
+//
+func SplitRowsByCategorical(di DatasetInterface, colidx int,
+	splitVal []string) (
+	splitIn DatasetInterface,
+	splitEx DatasetInterface,
+	e error,
+) {
+	// check type of column
+	coltype, e := di.GetColumnTypeAt(colidx)
+	if e != nil {
+		return
+	}
+
+	if coltype != TString {
+		return nil, nil, ErrInvalidColType
+	}
+
+	// Remember the mode so the datasets can be converted back.
+	orgmode := di.GetMode()
+
+	if orgmode == DatasetModeColumns {
+		di.TransposeToRows()
+	}
+
+	splitIn = di.Clone().(DatasetInterface)
+	splitEx = di.Clone().(DatasetInterface)
+
+	for _, row := range *di.GetRows() {
+		// A row goes to the excluded set unless one value matches.
+		target := splitEx
+		for _, val := range splitVal {
+			if (*row)[colidx].String() == val {
+				target = splitIn
+				break
+			}
+		}
+		target.PushRow(row)
+	}
+
+	// convert all dataset based on original
+	switch orgmode {
+	case DatasetModeColumns:
+		di.TransposeToColumns()
+		fallthrough
+	case DatasetModeMatrix, DatasetNoMode:
+		splitIn.TransposeToColumns()
+		splitEx.TransposeToColumns()
+	}
+
+	return
+}
+
+//
+// SplitRowsByValue generic function to split data by value. This function
+// will split data using value in column `colidx`.
+//
+// If the column is a string column, `value` must be a []string (or a single
+// string) and the split is done by SplitRowsByCategorical. Otherwise
+// `value` must be one of the Go integer or float types; rows that have
+// column value less than `value` are returned in `splitL`, and rows with
+// column value greater or equal to `value` in `splitR`.
+//
+// On error, including a `value` whose type does not fit the column, both
+// results are nil.
+//
+func SplitRowsByValue(di DatasetInterface, colidx int, value interface{}) (
+	splitL DatasetInterface,
+	splitR DatasetInterface,
+	e error,
+) {
+	coltype, e := di.GetColumnTypeAt(colidx)
+	if e != nil {
+		return nil, nil, e
+	}
+
+	if coltype == TString {
+		var splitval []string
+
+		// Use a checked type switch; a plain type assertion would
+		// panic on a value of the wrong type.
+		switch v := value.(type) {
+		case []string:
+			splitval = v
+		case string:
+			splitval = []string{v}
+		default:
+			return nil, nil, fmt.Errorf(
+				"tabula: SplitRowsByValue: invalid categorical split value type %T",
+				value)
+		}
+
+		splitL, splitR, e = SplitRowsByCategorical(di, colidx,
+			splitval)
+	} else {
+		var splitval float64
+
+		switch v := value.(type) {
+		case int:
+			splitval = float64(v)
+		case int8:
+			splitval = float64(v)
+		case int16:
+			splitval = float64(v)
+		case int32:
+			splitval = float64(v)
+		case int64:
+			splitval = float64(v)
+		case uint:
+			splitval = float64(v)
+		case uint8:
+			splitval = float64(v)
+		case uint16:
+			splitval = float64(v)
+		case uint32:
+			splitval = float64(v)
+		case uint64:
+			splitval = float64(v)
+		case float32:
+			splitval = float64(v)
+		case float64:
+			splitval = v
+		default:
+			// Do not silently split at zero on an unknown type.
+			return nil, nil, fmt.Errorf(
+				"tabula: SplitRowsByValue: invalid numeric split value type %T",
+				value)
+		}
+
+		splitL, splitR, e = SplitRowsByNumeric(di, colidx,
+			splitval)
+	}
+
+	if e != nil {
+		return nil, nil, e
+	}
+
+	return splitL, splitR, nil
+}
+
+//
+// SelectRowsWhere return a new dataset that contain all rows of `dataset`
+// whose value in column `colidx` is equal to `colval`.
+//
+// A dataset in columns mode is transposed to rows for the selection and
+// transposed back afterwards; the result is transposed to columns unless
+// the original mode is rows.
+//
+func SelectRowsWhere(dataset DatasetInterface, colidx int, colval string) DatasetInterface {
+	orgmode := dataset.GetMode()
+
+	if orgmode == DatasetModeColumns {
+		dataset.TransposeToRows()
+	}
+
+	selected := NewDataset(dataset.GetMode(), nil, nil)
+	selected.Rows = dataset.GetRows().SelectWhere(colidx, colval)
+
+	switch orgmode {
+	case DatasetModeColumns:
+		dataset.TransposeToColumns()
+		fallthrough
+	case DatasetModeMatrix, DatasetNoMode:
+		selected.TransposeToColumns()
+	}
+
+	return selected
+}
+
+//
+// RandomPickRows return `n` rows that have been selected randomly from the
+// rows of dataset, together with the rows that were not picked. The index
+// of the picked rows is returned in `pickedIdx` and the index of the
+// remaining rows in `unpickedIdx`.
+//
+// If duplicate is true, the row that has been picked can be picked up
+// again, otherwise it only allow one pick. This is also called as random
+// selection with or without replacement in machine learning domain.
+//
+// A dataset in columns mode is transposed to rows for the pick and
+// transposed back afterwards; both results are transposed to columns unless
+// the original mode is rows.
+//
+func RandomPickRows(dataset DatasetInterface, n int, duplicate bool) (
+	picked DatasetInterface,
+	unpicked DatasetInterface,
+	pickedIdx []int,
+	unpickedIdx []int,
+) {
+	orgmode := dataset.GetMode()
+
+	if orgmode == DatasetModeColumns {
+		dataset.TransposeToRows()
+	}
+
+	picked = dataset.Clone().(DatasetInterface)
+	unpicked = dataset.Clone().(DatasetInterface)
+
+	pickedRows, unpickedRows, pickedIdx, unpickedIdx :=
+		dataset.GetRows().RandomPick(n, duplicate)
+
+	picked.SetRows(&pickedRows)
+	unpicked.SetRows(&unpickedRows)
+
+	// switch the dataset based on original mode
+	switch orgmode {
+	case DatasetModeColumns:
+		dataset.TransposeToColumns()
+		fallthrough
+	case DatasetModeMatrix, DatasetNoMode:
+		// transform the picked and unpicked set.
+		picked.TransposeToColumns()
+		unpicked.TransposeToColumns()
+	}
+
+	return
+}
+
+//
+// RandomPickColumns will select `n` column randomly from dataset and return
+// new dataset with picked and unpicked columns, and their column index.
+// `excludeIdx` is passed to the underlying Columns.RandomPick.
+//
+// If dup is true, column that has been pick up can be pick up again.
+//
+// A dataset in rows mode is transposed to columns for the pick and
+// transposed back afterwards; both results are transposed to rows unless
+// the original mode is columns.
+//
+func RandomPickColumns(dataset DatasetInterface, n int, dup bool,
+	excludeIdx []int) (
+	picked DatasetInterface,
+	unpicked DatasetInterface,
+	pickedIdx []int,
+	unpickedIdx []int,
+) {
+	orgmode := dataset.GetMode()
+
+	if orgmode == DatasetModeRows {
+		dataset.TransposeToColumns()
+	}
+
+	picked = dataset.Clone().(DatasetInterface)
+	unpicked = dataset.Clone().(DatasetInterface)
+
+	pickedColumns, unpickedColumns, pickedIdx, unpickedIdx :=
+		dataset.GetColumns().RandomPick(n, dup, excludeIdx)
+
+	picked.SetColumns(&pickedColumns)
+	unpicked.SetColumns(&unpickedColumns)
+
+	// transpose picked and unpicked dataset based on original mode
+	switch orgmode {
+	case DatasetModeRows:
+		dataset.TransposeToRows()
+		fallthrough
+	case DatasetModeMatrix, DatasetNoMode:
+		picked.TransposeToRows()
+		unpicked.TransposeToRows()
+	}
+
+	return
+}
+
+//
+// SelectColumnsByIdx return new dataset with selected column index.
+//
+func SelectColumnsByIdx(dataset DatasetInterface, colsIdx []int) (
+ newset DatasetInterface,
+) {
+ var col *Column
+
+ orgmode := dataset.GetMode()
+
+ if orgmode == DatasetModeRows {
+ dataset.TransposeToColumns()
+ }
+
+ newset = dataset.Clone().(DatasetInterface)
+
+ for _, idx := range colsIdx {
+ col = dataset.GetColumn(idx)
+ if col == nil {
+ continue
+ }
+
+ newset.PushColumn(*col)
+ }
+
+ // revert the mode back
+ switch orgmode {
+ case DatasetModeRows:
+ dataset.TransposeToRows()
+ newset.TransposeToRows()
+ case DatasetModeColumns:
+ // do nothing
+ case DatasetModeMatrix:
+ // do nothing
+ }
+
+ return
+}
diff --git a/lib/tabula/maprows.go b/lib/tabula/maprows.go
new file mode 100644
index 00000000..a93f0308
--- /dev/null
+++ b/lib/tabula/maprows.go
@@ -0,0 +1,65 @@
+// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+package tabula
+
+import (
+ "math"
+)
+
+// MapRowsElement represents a single mapping from a string key (Key) to the
+// rows that are grouped under that key (Value).
+//
+type MapRowsElement struct {
+ Key string
+ Value Rows
+}
+
+// MapRows represents a list of mappings between a string key and rows.
+// Elements are kept in the order in which their keys were first added.
+//
+type MapRows []MapRowsElement
+
+//
+// insertRow appends a new element with key `k` to the end of the map. The
+// rows of the new element contain only `v`; when `v` is nil the element is
+// created with empty rows, because Rows.PushBack ignores a nil row.
+//
+func (mapRows *MapRows) insertRow(k string, v *Row) {
+	group := Rows{}
+	group.PushBack(v)
+
+	*mapRows = append(*mapRows, MapRowsElement{Key: k, Value: group})
+}
+
+//
+// AddRow appends row `v` to the rows of the first element whose key is equal
+// to `k`. If no element has that key, a new element is inserted at the end
+// of the map.
+//
+func (mapRows *MapRows) AddRow(k string, v *Row) {
+	elements := *mapRows
+
+	for i := range elements {
+		el := &elements[i]
+		if el.Key != k {
+			continue
+		}
+
+		// The element is modified in place through its address.
+		el.Value.PushBack(v)
+		return
+	}
+
+	// No element with key `k` exists yet.
+	mapRows.insertRow(k, v)
+}
+
+//
+// GetMinority returns the key and the rows of the element that has the
+// fewest rows. When several elements share the smallest size the first one
+// wins. An empty map yields an empty key and nil rows.
+//
+func (mapRows *MapRows) GetMinority() (keyMin string, valMin Rows) {
+	smallest := math.MaxInt32
+
+	for _, el := range *mapRows {
+		if size := len(el.Value); size < smallest {
+			smallest = size
+			keyMin, valMin = el.Key, el.Value
+		}
+	}
+
+	return keyMin, valMin
+}
diff --git a/lib/tabula/maprows_test.go b/lib/tabula/maprows_test.go
new file mode 100644
index 00000000..19cd5ac8
--- /dev/null
+++ b/lib/tabula/maprows_test.go
@@ -0,0 +1,54 @@
+// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+package tabula
+
+import (
+ "fmt"
+ "testing"
+
+ "github.com/shuLhan/share/lib/test"
+)
+
+// TestAddRow groups the test rows by their class column using AddRow and
+// compares the printed map with groupByExpect.
+func TestAddRow(t *testing.T) {
+	rows, err := initRows()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	mapRows := MapRows{}
+
+	for i := range rows {
+		class := (*rows[i])[testClassIdx].Interface()
+		mapRows.AddRow(fmt.Sprint(class), rows[i])
+	}
+
+	test.Assert(t, "", groupByExpect, fmt.Sprint(mapRows), true)
+}
+
+// TestGetMinority groups the test rows by their class column, shrinks the
+// first group by one row, and checks that GetMinority returns that group.
+func TestGetMinority(t *testing.T) {
+	rows, err := initRows()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	mapRows := MapRows{}
+
+	for i := range rows {
+		class := (*rows[i])[testClassIdx].Interface()
+		mapRows.AddRow(fmt.Sprint(class), rows[i])
+	}
+
+	// Drop the first row of the first key so that it becomes the minority.
+	mapRows[0].Value.PopFront()
+
+	_, minority := mapRows.GetMinority()
+
+	test.Assert(t, "", rowsExpect[3], fmt.Sprint(minority), true)
+}
diff --git a/lib/tabula/matrix.go b/lib/tabula/matrix.go
new file mode 100644
index 00000000..62ab68ac
--- /dev/null
+++ b/lib/tabula/matrix.go
@@ -0,0 +1,13 @@
+// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+package tabula
+
+// Matrix is a combination of columns and rows: it holds a pointer to a
+// Columns value and a pointer to a Rows value.
+//
+type Matrix struct {
+ Columns *Columns
+ Rows *Rows
+}
diff --git a/lib/tabula/record.go b/lib/tabula/record.go
new file mode 100644
index 00000000..527ab430
--- /dev/null
+++ b/lib/tabula/record.go
@@ -0,0 +1,292 @@
+// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+package tabula
+
+import (
+ "math"
+ "reflect"
+ "strconv"
+)
+
+const (
+ // TUndefined marks a record type that is not defined.
+ TUndefined = -1
+ // TString marks a record whose value is a string.
+ TString = 0
+ // TInteger marks a record whose value is a 64 bit integer (int64).
+ TInteger = 1
+ // TReal marks a record whose value is a 64 bit float (float64).
+ TReal = 2
+)
+
+// Record represents the smallest building block of a data-set: a single
+// cell whose value is kept in an untyped interface field.
+//
+type Record struct {
+ v interface{}
+}
+
+//
+// NewRecord creates and returns a record whose value is nil.
+//
+func NewRecord() *Record {
+	return &Record{}
+}
+
+//
+// NewRecordBy creates a new record from string `v`, converted to type `t`
+// (one of TString, TInteger, or TReal).
+//
+// The record is returned even when the conversion fails; in that case its
+// value is left nil and the conversion error is returned with it.
+//
+func NewRecordBy(v string, t int) (r *Record, e error) {
+	rec := NewRecord()
+	err := rec.SetValue(v, t)
+
+	return rec, err
+}
+
+//
+// NewRecordString creates a new record whose value is the string `v`.
+//
+func NewRecordString(v string) (r *Record) {
+	r = new(Record)
+	r.v = v
+	return r
+}
+
+//
+// NewRecordInt creates a new record whose value is the 64 bit integer `v`.
+//
+func NewRecordInt(v int64) (r *Record) {
+	r = new(Record)
+	r.v = v
+	return r
+}
+
+//
+// NewRecordReal creates a new record whose value is the 64 bit float `v`.
+//
+func NewRecordReal(v float64) (r *Record) {
+	r = new(Record)
+	r.v = v
+	return r
+}
+
+//
+// Clone creates and returns a new record that holds the same value as this
+// record.
+//
+func (r *Record) Clone() *Record {
+	dup := *r
+	return &dup
+}
+
+//
+// IsNil returns true if the record value is nil, which is the case when no
+// value has been set yet.
+//
+func (r *Record) IsNil() bool {
+	if r.v != nil {
+		return false
+	}
+	return true
+}
+
+//
+// Type of record.
+//
+func (r *Record) Type() int {
+ switch r.v.(type) {
+ case int64:
+ return TInteger
+ case float64:
+ return TReal
+ }
+ return TString
+}
+
+//
+// SetValue sets the record value from string `v`, converted to type `t`.
+//
+// For TInteger the string is parsed as a base 10, 64 bit integer, and for
+// TReal as a 64 bit float. If parsing fails the error is returned and the
+// record value is left unchanged. Any other type than TString, TInteger, or
+// TReal leaves the record unchanged and returns nil.
+//
+func (r *Record) SetValue(v string, t int) error {
+	if t == TString {
+		r.v = v
+		return nil
+	}
+
+	if t == TInteger {
+		parsed, err := strconv.ParseInt(v, 10, 64)
+		if err != nil {
+			return err
+		}
+		r.v = parsed
+		return nil
+	}
+
+	if t == TReal {
+		parsed, err := strconv.ParseFloat(v, 64)
+		if err != nil {
+			return err
+		}
+		r.v = parsed
+		return nil
+	}
+
+	return nil
+}
+
+//
+// SetString replaces the record value with the string `v`.
+//
+func (r *Record) SetString(v string) {
+	*r = Record{v: v}
+}
+
+//
+// SetFloat replaces the record value with the 64 bit float `v`.
+//
+func (r *Record) SetFloat(v float64) {
+	*r = Record{v: v}
+}
+
+//
+// SetInteger replaces the record value with the 64 bit integer `v`.
+//
+func (r *Record) SetInteger(v int64) {
+	*r = Record{v: v}
+}
+
+//
+// IsMissingValue checks whether the record value marks a missing attribute.
+//
+// For a string, the missing value is the single character "?".
+//
+// For an integer, the missing value is the minimum 64 bit integer,
+// math.MinInt64.
+//
+// For a real, the missing value is negative infinity.
+//
+// Any other value, including nil, is not considered missing.
+//
+func (r *Record) IsMissingValue() bool {
+	switch val := r.v.(type) {
+	case string:
+		return val == "?"
+	case int64:
+		return val == math.MinInt64
+	case float64:
+		return math.IsInf(val, -1)
+	}
+
+	return false
+}
+
+//
+// Interface returns the record value as it is stored, without conversion.
+//
+func (r *Record) Interface() interface{} {
+	val := r.v
+	return val
+}
+
+//
+// Bytes returns the string representation of the record value as a slice of
+// bytes.
+//
+func (r *Record) Bytes() []byte {
+	str := r.String()
+	return []byte(str)
+}
+
+//
+// String converts the record value to a string. An integer is formatted in
+// base 10 and a float in the shortest 'f' format that represents it exactly.
+// Any other value, including nil, results in an empty string.
+//
+func (r Record) String() (s string) {
+	switch val := r.v.(type) {
+	case string:
+		return val
+	case int64:
+		return strconv.FormatInt(val, 10)
+	case float64:
+		return strconv.FormatFloat(val, 'f', -1, 64)
+	}
+	return ""
+}
+
+//
+// Float converts the record value to a 64 bit float. A string value that
+// can not be parsed as float results in negative infinity. A value that is
+// not a string, integer, or float (including nil) results in 0.
+//
+func (r *Record) Float() (f64 float64) {
+	switch val := r.v.(type) {
+	case string:
+		parsed, err := strconv.ParseFloat(val, 64)
+		if err != nil {
+			return math.Inf(-1)
+		}
+		return parsed
+
+	case int64:
+		return float64(val)
+
+	case float64:
+		return val
+	}
+
+	return 0
+}
+
+//
+// Integer converts the record value to a 64 bit integer. A string value
+// that can not be parsed as a base 10 integer results in math.MinInt64. A
+// float value is converted with a plain int64 conversion. A value that is
+// not a string, integer, or float (including nil) results in 0.
+//
+func (r *Record) Integer() (i64 int64) {
+	switch val := r.v.(type) {
+	case string:
+		parsed, err := strconv.ParseInt(val, 10, 64)
+		if err != nil {
+			return math.MinInt64
+		}
+		return parsed
+
+	case int64:
+		return val
+
+	case float64:
+		return int64(val)
+	}
+
+	return 0
+}
+
+//
+// IsEqual returns true if the value of this record is deeply equal, in both
+// type and value, to the value of record `o`; otherwise it returns false.
+//
+func (r *Record) IsEqual(o *Record) bool {
+	other := o.Interface()
+	return reflect.DeepEqual(r.v, other)
+}
+
+//
+// IsEqualToString returns true if the string representation of the record
+// value is equal to string `v`.
+//
+func (r *Record) IsEqualToString(v string) bool {
+	str := r.String()
+	return v == str
+}
+
+//
+// IsEqualToInterface returns true if both the type and the value of `v` are
+// deeply equal to the type and value of the record.
+//
+func (r *Record) IsEqualToInterface(v interface{}) bool {
+	same := reflect.DeepEqual(r.v, v)
+	return same
+}
+
+//
+// Reset sets the record value to the zero value of its current type: an
+// empty string for a string, and 0 for an integer or a float. A record with
+// any other value, including nil, is left unchanged.
+//
+func (r *Record) Reset() {
+	switch r.v.(type) {
+	case string:
+		r.SetString("")
+	case int64:
+		r.SetInteger(0)
+	case float64:
+		r.SetFloat(0)
+	}
+}
diff --git a/lib/tabula/record_test.go b/lib/tabula/record_test.go
new file mode 100644
index 00000000..223f9235
--- /dev/null
+++ b/lib/tabula/record_test.go
@@ -0,0 +1,35 @@
+// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+package tabula
+
+import (
+ "fmt"
+ "testing"
+
+ "github.com/shuLhan/share/lib/test"
+)
+
+//
+// TestRecord checks that records created from strings print the same way as
+// the strings they were created from.
+//
+func TestRecord(t *testing.T) {
+	cases := []struct {
+		value string
+		kind  int
+	}{
+		{"test", TString},
+		{"1", TInteger},
+		{"2", TInteger},
+	}
+
+	values := make([]string, 0, len(cases))
+	row := make(Row, 0)
+
+	for _, c := range cases {
+		rec, err := NewRecordBy(c.value, c.kind)
+		if err != nil {
+			t.Error(err)
+		}
+
+		values = append(values, c.value)
+		row = append(row, rec)
+	}
+
+	test.Assert(t, "", fmt.Sprint(values), fmt.Sprint(row), true)
+}
diff --git a/lib/tabula/records.go b/lib/tabula/records.go
new file mode 100644
index 00000000..e00c03b9
--- /dev/null
+++ b/lib/tabula/records.go
@@ -0,0 +1,54 @@
+// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+package tabula
+
+// Records defines a slice of pointers to Record, so a copy of the slice
+// shares its records with the original.
+//
+type Records []*Record
+
+//
+// Len returns the number of records.
+//
+func (recs *Records) Len() int {
+	list := *recs
+	return len(list)
+}
+
+//
+// SortByIndex returns new records, with the same length as the original,
+// where position x holds the record found at index `sortedIdx[x]` in the
+// original. The original records are not modified.
+//
+func (recs *Records) SortByIndex(sortedIdx []int) *Records {
+	src := *recs
+	out := make(Records, len(src))
+
+	for pos := 0; pos < len(sortedIdx); pos++ {
+		out[pos] = src[sortedIdx[pos]]
+	}
+
+	return &out
+}
+
+//
+// CountWhere returns the number of records whose type and value are equal to
+// the type and value of `v`.
+//
+func (recs *Records) CountWhere(v interface{}) (c int) {
+	list := *recs
+
+	for i := range list {
+		if !list[i].IsEqualToInterface(v) {
+			continue
+		}
+		c++
+	}
+
+	return c
+}
+
+//
+// CountsWhere returns, for each value in slice `vs`, the number of records
+// equal to that value, in the same order as `vs`.
+//
+func (recs *Records) CountsWhere(vs []interface{}) (counts []int) {
+	for i := range vs {
+		counts = append(counts, recs.CountWhere(vs[i]))
+	}
+
+	return counts
+}
diff --git a/lib/tabula/records_test.go b/lib/tabula/records_test.go
new file mode 100644
index 00000000..2be6f7b1
--- /dev/null
+++ b/lib/tabula/records_test.go
@@ -0,0 +1,29 @@
+// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+package tabula
+
+import (
+ "fmt"
+ "testing"
+
+ "github.com/shuLhan/share/lib/test"
+)
+
+// TestSortByIndex checks that SortByIndex reorders the records following the
+// given list of indices.
+func TestSortByIndex(t *testing.T) {
+	data := Records{
+		NewRecordInt(3),
+		NewRecordInt(2),
+		NewRecordInt(1),
+	}
+	expect := []int{1, 2, 3}
+
+	sorted := data.SortByIndex([]int{2, 1, 0})
+
+	test.Assert(t, "", fmt.Sprint(&expect), fmt.Sprint(sorted), true)
+}
diff --git a/lib/tabula/row.go b/lib/tabula/row.go
new file mode 100644
index 00000000..105577c5
--- /dev/null
+++ b/lib/tabula/row.go
@@ -0,0 +1,123 @@
+// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+package tabula
+
+// Row represents a single row of records, as a slice of pointers to Record.
+// A copy of the slice shares its records with the original.
+//
+type Row []*Record
+
+//
+// Len returns the number of records in the row.
+//
+func (row *Row) Len() int {
+	cells := *row
+	return len(cells)
+}
+
+//
+// PushBack adds record `r` to the end of the row. The record is added as
+// is, even when it is nil.
+//
+func (row *Row) PushBack(r *Record) {
+	cells := append(*row, r)
+	*row = cells
+}
+
+//
+// Types returns the type of each record in the row, in row order. An empty
+// row results in nil.
+//
+func (row *Row) Types() (types []int) {
+	cells := *row
+
+	for i := range cells {
+		types = append(types, cells[i].Type())
+	}
+
+	return types
+}
+
+//
+// Clone creates and returns a copy of the row in which every record is also
+// cloned, so changing a record in the clone does not change the original.
+//
+func (row *Row) Clone() *Row {
+	dup := make(Row, 0, len(*row))
+
+	for _, rec := range *row {
+		dup = append(dup, rec.Clone())
+	}
+
+	return &dup
+}
+
+//
+// IsNilAt returns true if there is no record value in the row at `idx`: the
+// index is out of range, the record at that index is nil, or the record
+// value is nil. Otherwise it returns false.
+//
+func (row *Row) IsNilAt(idx int) bool {
+	if idx < 0 || idx >= len(*row) {
+		return true
+	}
+
+	rec := (*row)[idx]
+
+	return rec == nil || rec.IsNil()
+}
+
+//
+// SetValueAt replaces the record of the row at index `idx` with record
+// `rec`. The index is not checked; the caller must make sure it is in
+// range.
+//
+func (row *Row) SetValueAt(idx int, rec *Record) {
+	cells := *row
+	cells[idx] = rec
+}
+
+//
+// GetRecord returns the pointer to the record at index `i`, or nil if the
+// index is out of range.
+//
+func (row *Row) GetRecord(i int) *Record {
+	if i < 0 || i >= row.Len() {
+		return nil
+	}
+
+	return (*row)[i]
+}
+
+//
+// GetValueAt returns the value of the row record at index `idx` and true.
+//
+// If the index is out of range (negative, or not less than the row length)
+// or the record at that index is nil, it returns nil and false instead of
+// panicking.
+//
+func (row *Row) GetValueAt(idx int) (interface{}, bool) {
+	// GetRecord checks both bounds of the index.
+	rec := row.GetRecord(idx)
+	if rec == nil {
+		return nil, false
+	}
+
+	return rec.Interface(), true
+}
+
+//
+// GetIntAt returns the integer value of the row record at index `idx` and
+// true.
+//
+// If the index is out of range (negative, or not less than the row length)
+// or the record at that index is nil, it returns 0 and false instead of
+// panicking.
+//
+func (row *Row) GetIntAt(idx int) (int64, bool) {
+	// GetRecord checks both bounds of the index.
+	rec := row.GetRecord(idx)
+	if rec == nil {
+		return 0, false
+	}
+
+	return rec.Integer(), true
+}
+
+//
+// IsEqual returns true if the row has the same number of records as the
+// `other` row and every record is equal to the record at the same position
+// in `other`; otherwise it returns false.
+//
+func (row *Row) IsEqual(other *Row) bool {
+	left, right := *row, *other
+
+	if len(left) != len(right) {
+		return false
+	}
+
+	for i := range left {
+		if left[i].IsEqual(right[i]) {
+			continue
+		}
+		return false
+	}
+
+	return true
+}
diff --git a/lib/tabula/row_test.go b/lib/tabula/row_test.go
new file mode 100644
index 00000000..5fa45775
--- /dev/null
+++ b/lib/tabula/row_test.go
@@ -0,0 +1,33 @@
+// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+package tabula
+
+import (
+ "testing"
+
+ "github.com/shuLhan/share/lib/test"
+)
+
+// dataFloat64 lists the values used to build the test row.
+var dataFloat64 = []float64{0.1, 0.2, 0.3, 0.4, 0.5}
+
+// createRow returns a row with one real record for each value in
+// dataFloat64, in the same order.
+func createRow() (row Row) {
+	for i := range dataFloat64 {
+		row = append(row, NewRecordReal(dataFloat64[i]))
+	}
+	return row
+}
+
+// TestClone checks that a cloned row is equal to its original and that
+// changing a record in a clone does not change the original row.
+func TestClone(t *testing.T) {
+	original := createRow()
+	untouched := original.Clone()
+	mutated := original.Clone()
+
+	test.Assert(t, "", &original, untouched, true)
+
+	// Changing a value in one clone must affect neither the original nor
+	// the other clone.
+	(*mutated)[0].SetFloat(0)
+
+	test.Assert(t, "", &original, untouched, true)
+	test.Assert(t, "", &original, mutated, false)
+}
diff --git a/lib/tabula/rows.go b/lib/tabula/rows.go
new file mode 100644
index 00000000..fcaed021
--- /dev/null
+++ b/lib/tabula/rows.go
@@ -0,0 +1,251 @@
+// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+package tabula
+
+import (
+ "fmt"
+ "math/rand"
+ "time"
+)
+
+// Rows represents a slice of pointers to Row, so a copy of the slice shares
+// its rows with the original.
+//
+type Rows []*Row
+
+//
+// Len returns the number of rows.
+//
+func (rows *Rows) Len() int {
+	list := *rows
+	return len(list)
+}
+
+//
+// PushBack appends row `r` to the end of rows. A nil row is ignored.
+//
+func (rows *Rows) PushBack(r *Row) {
+	if r == nil {
+		return
+	}
+
+	*rows = append(*rows, r)
+}
+
+//
+// PopFront removes the first row from rows and returns it. If rows is empty
+// it returns nil.
+//
+func (rows *Rows) PopFront() (row *Row) {
+	list := *rows
+
+	if len(list) == 0 {
+		return nil
+	}
+
+	*rows = list[1:]
+
+	return list[0]
+}
+
+//
+// PopFrontAsRows removes the first row from rows and returns it wrapped in
+// new rows. If there is no row to remove, the returned rows is nil.
+//
+func (rows *Rows) PopFrontAsRows() (newRows Rows) {
+	if head := rows.PopFront(); head != nil {
+		newRows = Rows{head}
+	}
+
+	return newRows
+}
+
+//
+// Del detaches the row at index `i` from rows and returns it. The rows after
+// it are shifted one position to the front. If the index is out of range,
+// rows is left unchanged and nil is returned.
+//
+func (rows *Rows) Del(i int) (row *Row) {
+	if i < 0 || i >= rows.Len() {
+		return nil
+	}
+
+	list := *rows
+	tail := len(list) - 1
+
+	row = list[i]
+
+	// Shift the rows after `i` to the front, then clear the slot that is no
+	// longer used before shrinking the slice.
+	copy(list[i:], list[i+1:])
+	list[tail] = nil
+	*rows = list[:tail]
+
+	return row
+}
+
+//
+// GroupByValue groups each row by the string representation of its record at
+// index `GroupIdx`, and returns the groups as a list of key and rows.
+//
+// WARNING: the rows are moved into the groups, one by one from the front, so
+// the rows on which this method is called will be empty afterwards!
+//
+// For example, given rows with the target group in column index 1,
+//
+//	[1 +]
+//	[2 -]
+//	[3 -]
+//	[4 +]
+//
+// this function will create a map where the key is the string of the target
+// and the value is the rows that have that target,
+//
+//	+ -> [1 +]
+//	     [4 +]
+//	- -> [2 -]
+//	     [3 -]
+//
+func (rows *Rows) GroupByValue(GroupIdx int) (mapRows MapRows) {
+	for row := rows.PopFront(); row != nil; row = rows.PopFront() {
+		key := fmt.Sprint((*row)[GroupIdx])
+
+		mapRows.AddRow(key, row)
+	}
+
+	return mapRows
+}
+
+//
+// RandomPick picks `n` rows at random and returns them in the order they
+// were picked, as if the rows had been shuffled.
+//
+// If duplicate is true, a row that has been picked can be picked again;
+// otherwise each row is picked at most once and `n` is limited to the number
+// of rows.
+//
+// It returns the picked rows, the rows that were never picked (in their
+// original order), and the index of each of them in the original rows.
+// If rows is empty or `n` is less than one, nothing is picked.
+//
+func (rows *Rows) RandomPick(n int, duplicate bool) (
+	picked Rows,
+	unpicked Rows,
+	pickedIdx []int,
+	unpickedIdx []int,
+) {
+	rowsLen := len(*rows)
+
+	// if duplication is not allowed, we can only select as many as rows
+	// that we have.
+	if n > rowsLen && !duplicate {
+		n = rowsLen
+	}
+
+	// NOTE(review): rand.Seed is deprecated since Go 1.20; it is kept so
+	// that the result still varies between calls on older Go versions.
+	rand.Seed(time.Now().UnixNano())
+
+	// Nothing to pick from, and rand.Intn panics if its argument is zero.
+	if rowsLen == 0 {
+		return
+	}
+
+	// isPicked marks each row index that has been picked at least once.
+	isPicked := make([]bool, rowsLen)
+
+	for ; n >= 1; n-- {
+		idx := rand.Intn(rowsLen)
+
+		if !duplicate {
+			// Draw again until we find an index that has not been
+			// picked. This ends because n never exceeds the number of
+			// rows when duplication is not allowed.
+			for isPicked[idx] {
+				idx = rand.Intn(rowsLen)
+			}
+		}
+
+		isPicked[idx] = true
+		pickedIdx = append(pickedIdx, idx)
+		picked.PushBack((*rows)[idx])
+	}
+
+	// select unpicked rows using picked index.
+	for rid, row := range *rows {
+		if isPicked[rid] {
+			continue
+		}
+		unpicked.PushBack(row)
+		unpickedIdx = append(unpickedIdx, rid)
+	}
+
+	return
+}
+
+//
+// Contain returns true and the index of the first row that is equal to
+// `xrow`. If there is no such row it returns false and -1 as the index.
+//
+func (rows *Rows) Contain(xrow *Row) (bool, int) {
+	list := *rows
+
+	for i := 0; i < len(list); i++ {
+		if xrow.IsEqual(list[i]) {
+			return true, i
+		}
+	}
+
+	return false, -1
+}
+
+//
+// Contains return true and indices of row, if rows has data that has the same
+// value with `rows`, otherwise return false and empty indices.
+//
+func (rows *Rows) Contains(xrows Rows) (isin bool, indices []int) {
+ // No data to compare.
+ if len(xrows) <= 0 {
+ return
+ }
+
+ for _, xrow := range xrows {
+ isin, idx := rows.Contain(xrow)
+
+ if isin {
+ indices = append(indices, idx)
+ }
+ }
+
+ // Check if indices length equal to searched rows
+ if len(indices) == len(xrows) {
+ return true, indices
+ }
+
+ return false, nil
+}
+
+//
+// SelectWhere returns all rows whose record at column index `colidx` has a
+// string representation equal to `colval`.
+//
+func (rows *Rows) SelectWhere(colidx int, colval string) (selected Rows) {
+	list := *rows
+
+	for i := range list {
+		rec := (*list[i])[colidx]
+		if !rec.IsEqualToString(colval) {
+			continue
+		}
+		selected.PushBack(list[i])
+	}
+
+	return selected
+}
+
+//
+// String returns the string representation of each row, joined without any
+// separator.
+//
+func (rows Rows) String() (s string) {
+	for _, row := range rows {
+		s += fmt.Sprint(row)
+	}
+
+	return s
+}
diff --git a/lib/tabula/rows_test.go b/lib/tabula/rows_test.go
new file mode 100644
index 00000000..174dd10f
--- /dev/null
+++ b/lib/tabula/rows_test.go
@@ -0,0 +1,181 @@
+// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+package tabula
+
+import (
+ "fmt"
+ "strings"
+ "testing"
+
+ "github.com/shuLhan/share/lib/test"
+)
+
+// TestPushBack checks that the rows built by initRows print as the
+// concatenation of rowsExpect.
+func TestPushBack(t *testing.T) {
+	rows, err := initRows()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	want := strings.Join(rowsExpect, "")
+
+	test.Assert(t, "", want, fmt.Sprint(rows), true)
+}
+
+// TestPopFront pops every row from the front, checking the popped row and
+// the remaining rows each time, then checks that popping from empty rows
+// returns nil.
+func TestPopFront(t *testing.T) {
+	rows, err := initRows()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	total := len(rows)
+
+	for i := 0; i < total; i++ {
+		head := rows.PopFront()
+
+		test.Assert(t, "", rowsExpect[i], fmt.Sprint(head), true)
+
+		remaining := ""
+		if i < total-1 {
+			remaining = strings.Join(rowsExpect[i+1:], "")
+		}
+
+		test.Assert(t, "", remaining, fmt.Sprint(rows), true)
+	}
+
+	// empty rows
+	head := rows.PopFront()
+
+	test.Assert(t, "", "<nil>", fmt.Sprint(head), true)
+}
+
+// TestPopFrontRow pops every row from the front as new rows, checking the
+// popped rows and the remaining rows each time, then checks that popping
+// from empty rows returns empty rows.
+func TestPopFrontRow(t *testing.T) {
+	rows, err := initRows()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	total := len(rows)
+
+	for i := 0; i < total; i++ {
+		head := rows.PopFrontAsRows()
+
+		test.Assert(t, "", rowsExpect[i], fmt.Sprint(head), true)
+
+		remaining := ""
+		if i < total-1 {
+			remaining = strings.Join(rowsExpect[i+1:], "")
+		}
+
+		test.Assert(t, "", remaining, fmt.Sprint(rows), true)
+	}
+
+	// empty rows
+	head := rows.PopFrontAsRows()
+
+	test.Assert(t, "", "", fmt.Sprint(head), true)
+}
+
+// TestGroupByValue groups the test rows by their class column and compares
+// the printed result with groupByExpect.
+func TestGroupByValue(t *testing.T) {
+	rows, err := initRows()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	groups := rows.GroupByValue(testClassIdx)
+
+	test.Assert(t, "", groupByExpect, fmt.Sprint(groups), true)
+}
+
+// TestRandomPick runs RandomPick several times, with and without duplicates,
+// and fails if all unpicked rows are also found in the picked rows.
+func TestRandomPick(t *testing.T) {
+	rows, err := initRows()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// failIfOverlap prints the result of a pick and stops the test when
+	// all unpicked rows exist in the picked rows.
+	failIfOverlap := func(title string, picked, unpicked Rows,
+		pickedIdx, unpickedIdx []int,
+	) {
+		overlap, _ := picked.Contains(unpicked)
+		if !overlap {
+			return
+		}
+
+		fmt.Println(title)
+		fmt.Println("==> picked rows :", picked)
+		fmt.Println("==> picked idx :", pickedIdx)
+		fmt.Println("==> unpicked rows :", unpicked)
+		fmt.Println("==> unpicked idx :", unpickedIdx)
+		t.Fatal("random pick: unpicked is false")
+	}
+
+	// random pick with duplicate
+	for round := 0; round < 5; round++ {
+		picked, unpicked, pickedIdx, unpickedIdx := rows.RandomPick(6,
+			true)
+
+		failIfOverlap("Random pick with duplicate rows", picked,
+			unpicked, pickedIdx, unpickedIdx)
+	}
+
+	// random pick without duplication
+	for round := 0; round < 5; round++ {
+		picked, unpicked, pickedIdx, unpickedIdx := rows.RandomPick(3,
+			false)
+
+		// the first two picked rows must not be the same row.
+		test.Assert(t, "", picked[0], picked[1], false)
+
+		failIfOverlap("Random pick with no duplicate rows", picked,
+			unpicked, pickedIdx, unpickedIdx)
+	}
+}
+
+// TestRowsDel checks that Del returns nil for an index out of range, and
+// that deleting an existing index removes and returns that row.
+func TestRowsDel(t *testing.T) {
+	rows, err := initRows()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Test deleting row index out of range.
+	if deleted := rows.Del(-1); deleted != nil {
+		t.Fatal("row should be nil!")
+	}
+
+	if deleted := rows.Del(rows.Len()); deleted != nil {
+		t.Fatal("row should be nil!")
+	}
+
+	// Test deleting index that is actually exist.
+	deleted := rows.Del(0)
+
+	remaining := strings.Join(rowsExpect[1:], "")
+
+	test.Assert(t, "", remaining, fmt.Sprint(rows), true)
+	test.Assert(t, "", rowsExpect[0], fmt.Sprint(deleted), true)
+}
diff --git a/lib/tabula/tabula.go b/lib/tabula/tabula.go
new file mode 100644
index 00000000..3d7f57df
--- /dev/null
+++ b/lib/tabula/tabula.go
@@ -0,0 +1,76 @@
+// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+//
+// Package tabula is a Go library for working with rows, columns, or matrix
+// (table); in other words, for working with data sets.
+//
+// Introduction
+//
+// Go's slice gives a flexible way to manage a sequence of data of one type,
+// but what if you want to manage a sequence of values with different types
+// of data? Or manage a bunch of values like a table?
+//
+// You can use this library to manage sequence of value with different type
+// and manage data in two dimensional tuple.
+//
+// Terminology
+//
+// Here are some terms that we used in developing this library, which may
+// help the reader understand the internals and the API.
+//
+// Record is a single cell in row or column, or the smallest building block of
+// dataset.
+//
+// Row is a horizontal representation of records in dataset.
+//
+// Column is a vertical representation of records in dataset.
+// Each column has a unique name and has the same type data.
+//
+// Dataset is a collection of rows and columns.
+//
+// Given those definitions we can draw the representation of rows, columns, or
+// matrix:
+//
+// COL-0 COL-1 ... COL-x
+// ROW-0: record record ... record
+// ROW-1: record record ... record
+// ...
+// ROW-y: record record ... record
+//
+// Record Type
+//
+// There are only three valid types in record: int64, float64, and string.
+//
+// Dataset Mode
+//
+// Tabula has three modes for dataset: rows, columns, or matrix.
+//
+// For example, given a table of data,
+//
+// col1,col2,col3
+// a,b,c
+// 1,2,3
+//
+// "rows" mode is where each line saved in its own slice, resulting in Rows:
+//
+// Rows[0]: [a b c]
+// Rows[1]: [1 2 3]
+//
+// "columns" mode is where each line saved by columns, resulting in Columns:
+//
+// Columns[0]: {col1 0 0 [] [a 1]}
+// Columns[1]: {col2 0 0 [] [b 2]}
+// Columns[2]: {col3 0 0 [] [c 3]}
+//
+// Unlike rows mode, each column contains metadata including column name,
+// type, flag, and value space (all possible values that _may_ be contained in
+// the column).
+//
+// "matrix" mode is where each record saved both in row and column.
+//
+// Matrix mode consume more memory but give a flexible way to manage records.
+//
+//
+package tabula
diff --git a/lib/tabula/tabula_test.go b/lib/tabula/tabula_test.go
new file mode 100644
index 00000000..6b13d60c
--- /dev/null
+++ b/lib/tabula/tabula_test.go
@@ -0,0 +1,81 @@
+// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+package tabula
+
+import (
+ "os"
+)
+
+var (
+	// traces is the buffer that printStackTrace reads from.
+	// NOTE(review): presumably it is filled with a stack trace elsewhere;
+	// the code that writes to it is not visible here.
+	traces = make([]byte, 1024)
+)
+
+// printStackTrace writes the fourth and fifth lines of traces to standard
+// error, that is the bytes from the third newline up to and including the
+// fifth newline. Nothing is written if traces has fewer than five newlines.
+func printStackTrace() {
+	var lines, start, end int
+
+	for x, b := range traces {
+		if b != '\n' {
+			continue
+		}
+
+		lines++
+
+		switch lines {
+		case 3:
+			start = x
+		case 5:
+			end = x + 1
+		}
+		if end > 0 {
+			break
+		}
+	}
+
+	// With only three or four newlines, start is set but end is still
+	// zero; slicing traces[start:end] would then panic.
+	if end <= start {
+		return
+	}
+
+	// Best effort only; there is nothing useful to do if writing to
+	// standard error fails.
+	_, _ = os.Stderr.Write(traces[start:end])
+}
+
+var testColTypes = []int{ // record type of each column in rowsData, by column index.
+ TInteger,
+ TInteger,
+ TInteger,
+ TString,
+}
+
+var testColNames = []string{"int01", "int02", "int03", "class"}
+
+// Test data for Rows and MapRows; each inner slice is one row, as strings.
+var rowsData = [][]string{
+ {"1", "5", "9", "+"},
+ {"2", "6", "0", "-"},
+ {"3", "7", "1", "-"},
+ {"4", "8", "2", "+"},
+}
+
+var testClassIdx = 3 // index of the column that tests use as the group key.
+
+var rowsExpect = []string{ // expected fmt.Sprint output of each row in rowsData.
+ "&[1 5 9 +]",
+ "&[2 6 0 -]",
+ "&[3 7 1 -]",
+ "&[4 8 2 +]",
+}
+
+var groupByExpect = "[{+ &[1 5 9 +]&[4 8 2 +]} {- &[2 6 0 -]&[3 7 1 -]}]" // expected fmt.Sprint output of rowsData grouped by class.
+
+// initRows builds the test rows from rowsData, converting each field to a
+// record with the type listed in testColTypes for its column. It returns
+// nil rows and the error when a field can not be converted.
+func initRows() (rows Rows, e error) {
+	for _, fields := range rowsData {
+		row := make(Row, 0, len(fields))
+
+		for col, field := range fields {
+			rec, err := NewRecordBy(field, testColTypes[col])
+			if err != nil {
+				return nil, err
+			}
+
+			row = append(row, rec)
+		}
+
+		rows.PushBack(&row)
+	}
+
+	return rows, nil
+}