diff options
| author | Shulhan <ms@kilabit.info> | 2018-09-17 01:21:27 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2018-09-18 01:50:21 +0700 |
| commit | 44b26edf7f390db383fe025454be0c4e30cfbd9b (patch) | |
| tree | 84d02953bc9095312182534936c1b60667957f07 /lib | |
| parent | 4a820ec157501c957d2e30f1670656cceec5c044 (diff) | |
| download | pakakeh.go-44b26edf7f390db383fe025454be0c4e30cfbd9b.tar.xz | |
Merge package "github.com/shuLhan/tabula"
Diffstat (limited to 'lib')
28 files changed, 4041 insertions, 0 deletions
diff --git a/lib/tabula/.gitignore b/lib/tabula/.gitignore new file mode 100644 index 00000000..f5ddbe1c --- /dev/null +++ b/lib/tabula/.gitignore @@ -0,0 +1,5 @@ +cover.html +cover.out +*.bench +*.prof +*.test diff --git a/lib/tabula/LICENSE b/lib/tabula/LICENSE new file mode 100644 index 00000000..d3ff23a6 --- /dev/null +++ b/lib/tabula/LICENSE @@ -0,0 +1,39 @@ +Copyright 2017, Shulhan (ms@kilabit.info). +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of copyright holder nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + --- --- --- --- --- --- --- + + TT TT II BB AAAA LLLLLL II KKKKKKKK + TT TT II BB AA AA LL LL II KK + TTTT II BB AA AA LL LL II KK + TT TT II BB AAAAAAAA LLLLLL II KK + TT TT II BB AA AA LL LL II KK + TT TT II BBBBBBBB AA AA LLLLLL II KK + +Website: http://kilabit.info +Contact: ms@kilabit.info diff --git a/lib/tabula/Makefile b/lib/tabula/Makefile new file mode 100644 index 00000000..d77283bd --- /dev/null +++ b/lib/tabula/Makefile @@ -0,0 +1,31 @@ +#!/bin/make + +## Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved. +## Use of this source code is governed by a BSD-style license that can be found +## in the LICENSE file. + +SRC_FILES :=$(shell go list -f '{{ join .GoFiles " " }}') +TEST_FILES :=$(shell go list -f '{{ join .TestGoFiles " " }}') +XTEST_FILES :=$(shell go list -f '{{ join .XTestGoFiles " " }}') +COVER_OUT :=cover.out +COVER_HTML :=cover.html +TARGET :=$(shell go list -f '{{ .Target }}') + +.PHONY: all clean coverbrowse + +all: ${TARGET} + +${TARGET}: ${COVER_HTML} + go install -a . + +${COVER_HTML}: ${COVER_OUT} + go tool cover -html=$< -o $@ + +${COVER_OUT}: ${SRC_FILES} ${TEST_FILES} ${XTEST_FILES} + go test -v -coverprofile $@ + +coverbrowse: ${COVER_HTML} + xdg-open $< + +clean: + rm -f ${COVER_HTML} ${COVER_OUT} *.bench *.prof *.test diff --git a/lib/tabula/README.md b/lib/tabula/README.md new file mode 100644 index 00000000..8fbd2a40 --- /dev/null +++ b/lib/tabula/README.md @@ -0,0 +1,165 @@ +[](https://godoc.org/github.com/shuLhan/share/lib/tabula) +[](https://goreportcard.com/report/github.com/shuLhan/share/lib/tabula) + + +Package tabula is a Go library for working with rows, columns, or matrix +(table), or in another terms working with data set. + +# Overview + +Go's slice gave a flexible way to manage sequence of data in one type, but what +if you want to manage a sequence of value but with different type of data? +Or manage a bunch of values like a table? 
+ +You can use this library to manage a sequence of values with different types +and manage data in a two-dimensional tuple. + +## Terminology + +Here are some terms that we use in developing this library, which may +help the reader understand the internals and the API. + +Record is a single cell in a row or column, or the smallest building block of +a dataset. + +Row is a horizontal representation of records in a dataset. + +Column is a vertical representation of records in a dataset. +Each column has a unique name, and all of its records have the same data type. + +Dataset is a collection of rows and columns. + +Given those definitions we can draw the representation of rows, columns, or +matrix: + + COL-0 COL-1 ... COL-x + ROW-0: record record ... record + ROW-1: record record ... record + ... + ROW-y: record record ... record + +## What makes this package different from other dataset packages? + +### Record Type + +There are only three valid types for a record: int64, float64, and string. + +Each record is a pointer to an interface value, which means, + +- Switching from rows to columns mode, or vice versa, is only a matter of + pointer switching, no memory relocations. +- When using matrix mode, additional memory is used only to allocate the slices; the + records in the rows and columns are shared. + +### Dataset Mode + +Tabula has three modes for a dataset: rows, columns, or matrix. + +For example, given a table of data, + + col1,col2,col3 + a,b,c + 1,2,3 + +- When in "rows" mode, each line is saved in its own slice, resulting in Rows: + + ``` + Rows[0]: [a b c] + Rows[1]: [1 2 3] + ``` + + Columns is used only to save record metadata: column name, type, flag and + value space. 
+ +- When in "columns" mode, each line is saved in columns, resulting in Columns: + + ``` + Columns[0]: {col1 0 0 [] [a 1]} + Columns[1]: {col2 0 0 [] [b 2]} + Columns[2]: {col3 0 0 [] [c 3]} + ``` + + Each column will contain metadata including column name, type, flag, and + value space (all possible values that the column _may_ contain). + + Rows in "columns" mode is empty. + +- When in "matrix" mode, each record is saved both in row and column using + a shared pointer to the record. + + Matrix mode consumes more memory by allocating two slices, one for rows and one for columns, + but gives a flexible way to manage records. + +## Features + +- **Switching between rows and columns mode**. + +- [**Random pick rows with or without replacement**](https://godoc.org/github.com/shuLhan/share/lib/tabula#RandomPickRows). + +- [**Random pick columns with or without replacement**](https://godoc.org/github.com/shuLhan/share/lib/tabula#RandomPickColumns). + +- [**Select column from dataset by index**](https://godoc.org/github.com/shuLhan/share/lib/tabula#SelectColumnsByIdx). + +- [**Sort columns by index**](https://godoc.org/github.com/shuLhan/share/lib/tabula#SortColumnsByIndex), + or indirect sort. + +- [**Split rows value by numeric**](https://godoc.org/github.com/shuLhan/share/lib/tabula#SplitRowsByNumeric). + For example, given two numeric rows, + + ``` + A: {1,2,3,4} + B: {5,6,7,8} + ``` + + if we split the rows by value 7, the data will be split into the left set + + ``` + A': {1,2} + B': {5,6} + ``` + + and the right set would be + + ``` + A'': {3,4} + B'': {7,8} + ``` + +- [**Split rows by string**](https://godoc.org/github.com/shuLhan/share/lib/tabula#SplitRowsByCategorical). 
+ For example, given two rows, + + ``` + X: [A,B,A,B,C,D,C,D] + Y: [1,2,3,4,5,6,7,8] + ``` + + if we split the rows with value set `[A,C]`, the data will splitted into left + set which contain all rows that have A or C, + + ``` + X': [A,A,C,C] + Y': [1,3,5,7] + ``` + + and the right set, excluded set, will contain all rows which is not A or C, + + ``` + X'': [B,B,D,D] + Y'': [2,4,6,8] + ``` + +- [**Select row where**](https://godoc.org/github.com/shuLhan/share/lib/tabula#SelectRowsWhere). + Select row at column index x where their value is equal to y (an analogy to + _select where_ in SQL). + For example, given a rows of dataset, + ``` + ROW-1: {1,A} + ROW-2: {2,B} + ROW-3: {3,A} + ROW-4: {4,C} + ``` + we can select row where the second column contain 'A', which result in, + ``` + ROW-1: {1,A} + ROW-3: {3,A} + ``` diff --git a/lib/tabula/claset.go b/lib/tabula/claset.go new file mode 100644 index 00000000..5d7eea7e --- /dev/null +++ b/lib/tabula/claset.go @@ -0,0 +1,303 @@ +// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be found +// in the LICENSE file. + +package tabula + +import ( + "fmt" + "strconv" + + libnumbers "github.com/shuLhan/share/lib/numbers" + libstrings "github.com/shuLhan/share/lib/strings" +) + +// +// Claset define a dataset with class attribute. +// +type Claset struct { + // Dataset embedded, for implementing the dataset interface. + Dataset + // ClassIndex contain index for target classification in columns. + ClassIndex int `json:"ClassIndex"` + + // vs contain a copy of value space. + vs []string + // counts number of value space in current set. + counts []int + + // major contain the name of majority class in dataset. + major string + // minor contain the name of minority class in dataset. + minor string +} + +// +// NewClaset create and return new Claset object. 
+// +func NewClaset(mode int, types []int, names []string) (claset *Claset) { + claset = &Claset{ + ClassIndex: -1, + } + + claset.Init(mode, types, names) + + return +} + +// +// Clone return a copy of current claset object. +// +func (claset *Claset) Clone() interface{} { + clone := Claset{ + ClassIndex: claset.GetClassIndex(), + major: claset.MajorityClass(), + minor: claset.MinorityClass(), + } + clone.SetDataset(claset.GetDataset().Clone().(DatasetInterface)) + return &clone +} + +// +// GetDataset return the dataset. +// +func (claset *Claset) GetDataset() DatasetInterface { + return &claset.Dataset +} + +// +// GetClassType return type of class in dataset. +// +func (claset *Claset) GetClassType() int { + if claset.Columns.Len() <= 0 { + return TString + } + return claset.Columns[claset.ClassIndex].Type +} + +// +// GetClassValueSpace return the class value space. +// +func (claset *Claset) GetClassValueSpace() []string { + if claset.Columns.Len() <= 0 { + return nil + } + return claset.Columns[claset.ClassIndex].ValueSpace +} + +// +// GetClassColumn return dataset class values in column. +// +func (claset *Claset) GetClassColumn() *Column { + if claset.Mode == DatasetModeRows { + claset.TransposeToColumns() + } + if claset.Columns.Len() <= 0 { + return nil + } + return &claset.Columns[claset.ClassIndex] +} + +// +// GetClassRecords return class values as records. +// +func (claset *Claset) GetClassRecords() *Records { + if claset.Mode == DatasetModeRows { + claset.TransposeToColumns() + } + if claset.Columns.Len() <= 0 { + return nil + } + return &claset.Columns[claset.ClassIndex].Records +} + +// +// GetClassAsStrings return all class values as slice of string. 
+// +func (claset *Claset) GetClassAsStrings() []string { + if claset.Mode == DatasetModeRows { + claset.TransposeToColumns() + } + if claset.Columns.Len() <= 0 { + return nil + } + return claset.Columns[claset.ClassIndex].ToStringSlice() +} + +// +// GetClassAsReals return class record value as slice of float64. +// +func (claset *Claset) GetClassAsReals() []float64 { + if claset.Mode == DatasetModeRows { + claset.TransposeToColumns() + } + if claset.Columns.Len() <= 0 { + return nil + } + return claset.Columns[claset.ClassIndex].ToFloatSlice() +} + +// +// GetClassAsInteger return class record value as slice of int64. +// +func (claset *Claset) GetClassAsInteger() []int64 { + if claset.Mode == DatasetModeRows { + claset.TransposeToColumns() + } + if claset.Columns.Len() <= 0 { + return nil + } + return claset.Columns[claset.ClassIndex].ToIntegers() +} + +// +// GetClassIndex return index of class attribute in dataset. +// +func (claset *Claset) GetClassIndex() int { + return claset.ClassIndex +} + +// +// MajorityClass return the majority class of data. +// +func (claset *Claset) MajorityClass() string { + return claset.major +} + +// +// MinorityClass return the minority class in dataset. +// +func (claset *Claset) MinorityClass() string { + return claset.minor +} + +// +// Counts return the number of each class in value-space. +// +func (claset *Claset) Counts() []int { + if len(claset.counts) <= 0 { + claset.CountValueSpaces() + } + return claset.counts +} + +// +// SetDataset in class set. +// +func (claset *Claset) SetDataset(dataset DatasetInterface) { + claset.Dataset = *(dataset.(*Dataset)) +} + +// +// SetClassIndex will set the class index to `v`. +// +func (claset *Claset) SetClassIndex(v int) { + claset.ClassIndex = v +} + +// +// SetMajorityClass will set the majority class to `v`. +// +func (claset *Claset) SetMajorityClass(v string) { + claset.major = v +} + +// +// SetMinorityClass will set the minority class to `v`. 
+// +func (claset *Claset) SetMinorityClass(v string) { + claset.minor = v +} + +// +// CountValueSpaces will count number of value space in current dataset. +// +func (claset *Claset) CountValueSpaces() { + classv := claset.GetClassAsStrings() + claset.vs = claset.GetClassValueSpace() + + claset.counts = libstrings.CountTokens(classv, claset.vs, false) +} + +// +// RecountMajorMinor recount major and minor class in claset. +// +func (claset *Claset) RecountMajorMinor() { + claset.CountValueSpaces() + + _, maxIdx, maxok := libnumbers.IntsFindMax(claset.counts) + _, minIdx, minok := libnumbers.IntsFindMin(claset.counts) + + if maxok { + claset.major = claset.vs[maxIdx] + } + if minok { + claset.minor = claset.vs[minIdx] + } +} + +// +// IsInSingleClass check whether all target class contain only single value. +// Return true and name of target if all rows is in the same class, +// false and empty string otherwise. +// +func (claset *Claset) IsInSingleClass() (single bool, class string) { + classv := claset.GetClassAsStrings() + + for i, t := range classv { + if i == 0 { + single = true + class = t + continue + } + if t != class { + return false, "" + } + } + return +} + +// +// GetMinorityRows return rows where their class is minority in dataset, or nil +// if dataset is empty. +// +func (claset *Claset) GetMinorityRows() *Rows { + if claset.Len() == 0 { + return nil + } + if claset.vs == nil { + claset.RecountMajorMinor() + } + + minRows := claset.GetRows().SelectWhere(claset.ClassIndex, + claset.minor) + + return &minRows +} + +// +// String, yes it will pretty print the meta-data in JSON format. 
+// +func (claset *Claset) String() (s string) { + if claset.vs == nil { + claset.RecountMajorMinor() + } + + s = fmt.Sprintf("'claset':{'rows': %d, 'columns': %d, ", claset.Len(), + claset.GetNColumn()) + + s += "'vs':{" + for x, v := range claset.vs { + if x > 0 { + s += ", " + } + s += "'" + v + "':" + strconv.Itoa(claset.counts[x]) + } + s += "}" + + s += ", 'major': '" + claset.major + "'" + s += ", 'minor': '" + claset.minor + "'" + s += "}" + + return +} diff --git a/lib/tabula/clasetinterface.go b/lib/tabula/clasetinterface.go new file mode 100644 index 00000000..ae8cdfcd --- /dev/null +++ b/lib/tabula/clasetinterface.go @@ -0,0 +1,38 @@ +// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be found +// in the LICENSE file. + +package tabula + +// +// ClasetInterface is the interface for working with dataset containing class +// or target attribute. It embed dataset interface. +// +// Yes, the name is Claset with single `s` not Classset with triple `s` to +// minimize typo. +// +type ClasetInterface interface { + DatasetInterface + + GetClassType() int + GetClassValueSpace() []string + GetClassColumn() *Column + GetClassRecords() *Records + GetClassAsStrings() []string + GetClassAsReals() []float64 + GetClassIndex() int + MajorityClass() string + MinorityClass() string + Counts() []int + + SetDataset(DatasetInterface) + SetClassIndex(int) + SetMajorityClass(string) + SetMinorityClass(string) + + CountValueSpaces() + RecountMajorMinor() + IsInSingleClass() (bool, string) + + GetMinorityRows() *Rows +} diff --git a/lib/tabula/column.go b/lib/tabula/column.go new file mode 100644 index 00000000..f631fb30 --- /dev/null +++ b/lib/tabula/column.go @@ -0,0 +1,309 @@ +// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be found +// in the LICENSE file. 
+ +package tabula + +import ( + "strconv" +) + +// +// Column represent slice of record. A vertical representation of data. +// +type Column struct { + // Name of column. String identifier for the column. + Name string + // Type of column. All record in column have the same type. + Type int + // Flag additional attribute that can be set to mark some value on this + // column + Flag int + // ValueSpace contain the possible value in records + ValueSpace []string + // Records contain column data. + Records Records +} + +// +// NewColumn return new column with type and name. +// +func NewColumn(colType int, colName string) (col *Column) { + col = &Column{ + Type: colType, + Name: colName, + Flag: 0, + } + + col.Records = make([]*Record, 0) + + return +} + +// +// NewColumnString initialize column with type anda data as string. +// +func NewColumnString(data []string, colType int, colName string) ( + col *Column, + e error, +) { + col = NewColumn(colType, colName) + + datalen := len(data) + + if datalen <= 0 { + return + } + + col.Records = make([]*Record, datalen) + + for x := 0; x < datalen; x++ { + col.Records[x] = NewRecordString(data[x]) + } + + return col, nil +} + +// +// NewColumnInt create new column with record type as integer, and fill it +// with `data`. +// +func NewColumnInt(data []int64, colName string) (col *Column) { + col = NewColumn(TInteger, colName) + + datalen := len(data) + if datalen <= 0 { + return + } + + col.Records = make([]*Record, datalen) + + for x, v := range data { + col.Records[x] = NewRecordInt(v) + } + return +} + +// +// NewColumnReal create new column with record type is real. 
+// +func NewColumnReal(data []float64, colName string) (col *Column) { + col = NewColumn(TReal, colName) + + datalen := len(data) + + if datalen <= 0 { + return + } + + col.Records = make([]*Record, datalen) + + for x := 0; x < datalen; x++ { + rec := NewRecordReal(data[x]) + col.Records[x] = rec + } + + return +} + +// +// SetType will set the type of column to `tipe`. +// +func (col *Column) SetType(tipe int) { + col.Type = tipe +} + +// +// SetName will set the name of column to `name`. +// +func (col *Column) SetName(name string) { + col.Name = name +} + +// +// GetType return the type of column. +// +func (col *Column) GetType() int { + return col.Type +} + +// +// GetName return the column name. +// +func (col *Column) GetName() string { + return col.Name +} + +// +// SetRecords will set records in column to `recs`. +// +func (col *Column) SetRecords(recs *Records) { + col.Records = *recs +} + +// +// Interface return the column object as an interface. +// +func (col *Column) Interface() interface{} { + return col +} + +// +// Reset column data and flag. +// +func (col *Column) Reset() { + col.Flag = 0 + col.Records = make([]*Record, 0) +} + +// +// Len return number of record. +// +func (col *Column) Len() int { + return len(col.Records) +} + +// +// PushBack push record the end of column. +// +func (col *Column) PushBack(r *Record) { + col.Records = append(col.Records, r) +} + +// +// PushRecords append slice of record to the end of column's records. +// +func (col *Column) PushRecords(rs []*Record) { + col.Records = append(col.Records, rs...) +} + +// +// ToIntegers convert slice of record to slice of int64. +// +func (col *Column) ToIntegers() []int64 { + newcol := make([]int64, col.Len()) + + for x := range col.Records { + newcol[x] = col.Records[x].Integer() + } + + return newcol +} + +// +// ToFloatSlice convert slice of record to slice of float64. 
+// +func (col *Column) ToFloatSlice() (newcol []float64) { + newcol = make([]float64, col.Len()) + + for i := range col.Records { + newcol[i] = col.Records[i].Float() + } + + return +} + +// +// ToStringSlice convert slice of record to slice of string. +// +func (col *Column) ToStringSlice() (newcol []string) { + newcol = make([]string, col.Len()) + + for i := range col.Records { + newcol[i] = col.Records[i].String() + } + + return +} + +// +// ClearValues set all value in column to empty string or zero if column type is +// numeric. +// +func (col *Column) ClearValues() { + for _, r := range col.Records { + r.Reset() + } +} + +// +// SetValueAt will set column value at cell `idx` with `v`, unless the index +// is out of range. +// +func (col *Column) SetValueAt(idx int, v string) { + if idx < 0 { + return + } + if col.Records.Len() <= idx { + return + } + _ = col.Records[idx].SetValue(v, col.Type) +} + +// +// SetValueByNumericAt will set column value at cell `idx` with numeric value +// `v`, unless the index is out of range. +// +func (col *Column) SetValueByNumericAt(idx int, v float64) { + if idx < 0 { + return + } + if col.Records.Len() <= idx { + return + } + switch col.Type { + case TString: + col.Records[idx].SetString(strconv.FormatFloat(v, 'f', -1, 64)) + case TInteger: + col.Records[idx].SetInteger(int64(v)) + case TReal: + col.Records[idx].SetFloat(v) + } +} + +// +// SetValues of all column record. +// +func (col *Column) SetValues(values []string) { + vallen := len(values) + reclen := col.Len() + + // initialize column record if its empty. + if reclen <= 0 { + col.Records = make([]*Record, vallen) + reclen = vallen + } + + // pick the least length + minlen := reclen + if vallen < reclen { + minlen = vallen + } + + for x := 0; x < minlen; x++ { + _ = col.Records[x].SetValue(values[x], col.Type) + } +} + +// +// DeleteRecordAt will delete record at index `i` and return it. 
+// +func (col *Column) DeleteRecordAt(i int) *Record { + if i < 0 { + return nil + } + + clen := col.Len() + if i >= clen { + return nil + } + + r := col.Records[i] + + last := clen - 1 + copy(col.Records[i:], col.Records[i+1:]) + col.Records[last] = nil + col.Records = col.Records[0:last] + + return r +} diff --git a/lib/tabula/column_test.go b/lib/tabula/column_test.go new file mode 100644 index 00000000..bf2434fc --- /dev/null +++ b/lib/tabula/column_test.go @@ -0,0 +1,67 @@ +// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be found +// in the LICENSE file. + +package tabula + +import ( + "testing" + + "github.com/shuLhan/share/lib/test" +) + +var data = []string{"9.987654321", "8.8", "7.7", "6.6", "5.5", "4.4", "3.3"} +var expFloat = []float64{9.987654321, 8.8, 7.7, 6.6, 5.5, 4.4, 3.3} + +func initColReal(t *testing.T) (col *Column) { + col = NewColumn(TReal, "TREAL") + + for x := range data { + rec, e := NewRecordBy(data[x], TReal) + if e != nil { + t.Fatal(e) + } + + col.PushBack(rec) + } + + return col +} + +func TestToFloatSlice(t *testing.T) { + col := initColReal(t) + got := col.ToFloatSlice() + + test.Assert(t, "", expFloat, got, true) +} + +func TestToStringSlice(t *testing.T) { + var col Column + + for x := range data { + rec, e := NewRecordBy(data[x], TString) + if e != nil { + t.Fatal(e) + } + + col.PushBack(rec) + } + + got := col.ToStringSlice() + + test.Assert(t, "", data, got, true) +} + +func TestDeleteRecordAt(t *testing.T) { + var exp []float64 + del := 2 + + exp = append(exp, expFloat[:del]...) + exp = append(exp, expFloat[del+1:]...) 
+ + col := initColReal(t) + col.DeleteRecordAt(del) + got := col.ToFloatSlice() + + test.Assert(t, "", exp, got, true) +} diff --git a/lib/tabula/columninterface.go b/lib/tabula/columninterface.go new file mode 100644 index 00000000..8a961b8b --- /dev/null +++ b/lib/tabula/columninterface.go @@ -0,0 +1,20 @@ +// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be found +// in the LICENSE file. + +package tabula + +// +// ColumnInterface define an interface for working with Column. +// +type ColumnInterface interface { + SetType(tipe int) + SetName(name string) + + GetType() int + GetName() string + + SetRecords(recs *Records) + + Interface() interface{} +} diff --git a/lib/tabula/columns.go b/lib/tabula/columns.go new file mode 100644 index 00000000..a5cd05d5 --- /dev/null +++ b/lib/tabula/columns.go @@ -0,0 +1,147 @@ +// Copyright 2017m Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be found +// in the LICENSE file. + +package tabula + +import ( + libbytes "github.com/shuLhan/share/lib/bytes" + libnumbers "github.com/shuLhan/share/lib/numbers" +) + +// +// Columns represent slice of Column. +// +type Columns []Column + +// +// Len return length of columns. +// +func (cols *Columns) Len() int { + return len(*cols) +} + +// +// Reset each data and attribute in all columns. +// +func (cols *Columns) Reset() { + for x := range *cols { + (*cols)[x].Reset() + } +} + +// +// SetTypes of each column. The length of type must be equal with the number of +// column, otherwise it will used the minimum length between types or columns. 
+// +func (cols *Columns) SetTypes(types []int) { + typeslen := len(types) + colslen := len(*cols) + minlen := typeslen + + if colslen < minlen { + minlen = colslen + } + + for x := 0; x < minlen; x++ { + (*cols)[x].Type = types[x] + } +} + +// +// RandomPick column in columns until n item and return it like its has been +// shuffled. If duplicate is true, column that has been picked can be picked up +// again, otherwise it will only picked up once. +// +// This function return picked and unpicked column and index of them. +// +func (cols *Columns) RandomPick(n int, dup bool, excludeIdx []int) ( + picked Columns, + unpicked Columns, + pickedIdx []int, + unpickedIdx []int, +) { + excLen := len(excludeIdx) + colsLen := len(*cols) + allowedLen := colsLen - excLen + + // if duplication is not allowed, limit the number of selected + // column. + if n > allowedLen && !dup { + n = allowedLen + } + + for ; n >= 1; n-- { + idx := libnumbers.IntPickRandPositive(colsLen, dup, pickedIdx, + excludeIdx) + + pickedIdx = append(pickedIdx, idx) + picked = append(picked, (*cols)[idx]) + } + + // select unpicked columns using picked index. + for cid := range *cols { + // check if column index has been picked up + isPicked := false + for _, idx := range pickedIdx { + if cid == idx { + isPicked = true + break + } + } + if !isPicked { + unpicked = append(unpicked, (*cols)[cid]) + unpickedIdx = append(unpickedIdx, cid) + } + } + + return +} + +// +// GetMinMaxLength given a slice of column, find the minimum and maximum column +// length among them. +// +func (cols *Columns) GetMinMaxLength() (min, max int) { + for _, col := range *cols { + collen := col.Len() + if collen < min { + min = collen + } else if collen > max { + max = collen + } + } + return +} + +// +// Join all column records value at index `row` using separator `sep` and make +// sure if there is a separator in value it will be escaped with `esc`. 
+// +// Given slice of columns, where row is 1 and sep is `,` and escape is `\` +// +// 0 1 2 +// 0 A B C +// 1 D , F <- row +// 2 G H I +// +// this function will return "D,\,,F" in bytes. +// +// +func (cols *Columns) Join(row int, sep, esc []byte) (v []byte) { + for y, col := range *cols { + if y > 0 { + v = append(v, sep...) + } + + rec := col.Records[row] + recV := rec.Bytes() + + if rec.Type() == TString { + recV, _ = libbytes.EncloseToken(recV, sep, esc, nil) + } + + v = append(v, recV...) + } + return +} diff --git a/lib/tabula/columns_test.go b/lib/tabula/columns_test.go new file mode 100644 index 00000000..43b30028 --- /dev/null +++ b/lib/tabula/columns_test.go @@ -0,0 +1,56 @@ +// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be found +// in the LICENSE file. + +package tabula + +import ( + "testing" + + "github.com/shuLhan/share/lib/test" +) + +func TestRandomPickColumns(t *testing.T) { + var dataset Dataset + var e error + + dataset.Init(DatasetModeRows, testColTypes, testColNames) + + dataset.Rows, e = initRows() + if e != nil { + t.Fatal(e) + } + + dataset.TransposeToColumns() + + // random pick with duplicate + ncols := 6 + dup := true + excludeIdx := []int{3} + + for i := 0; i < 5; i++ { + picked, unpicked, _, _ := + dataset.Columns.RandomPick(ncols, dup, excludeIdx) + + // check if unpicked item exist in picked items. + for _, un := range unpicked { + for _, pick := range picked { + test.Assert(t, "", un, pick, false) + } + } + } + + // random pick without duplicate + dup = false + for i := 0; i < 5; i++ { + picked, unpicked, _, _ := + dataset.Columns.RandomPick(ncols, dup, excludeIdx) + + // check if unpicked item exist in picked items. 
+ for _, un := range unpicked { + for _, pick := range picked { + test.Assert(t, "", un, pick, false) + } + } + } +} diff --git a/lib/tabula/dataset.go b/lib/tabula/dataset.go new file mode 100644 index 00000000..703aca35 --- /dev/null +++ b/lib/tabula/dataset.go @@ -0,0 +1,747 @@ +// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be found +// in the LICENSE file. + +package tabula + +import ( + "errors" + "math" +) + +const ( + // DatasetNoMode default to matrix. + DatasetNoMode = 0 + // DatasetModeRows for output mode in rows. + DatasetModeRows = 1 + // DatasetModeColumns for output mode in columns. + DatasetModeColumns = 2 + // DatasetModeMatrix will save data in rows and columns. + DatasetModeMatrix = 4 +) + +var ( + // ErrColIdxOutOfRange operation on column index is invalid + ErrColIdxOutOfRange = errors.New("tabula: Column index out of range") + // ErrInvalidColType operation on column with different type + ErrInvalidColType = errors.New("tabula: Invalid column type") + // ErrMisColLength returned when operation on columns does not match + // between parameter and their length + ErrMisColLength = errors.New("tabula: mismatch on column length") +) + +// +// Dataset contain the data, mode of saved data, number of columns and rows in +// data. +// +type Dataset struct { + // Mode define the numeric value of output mode. + Mode int + // Columns is input data that has been parsed. + Columns Columns + // Rows is input data that has been parsed. + Rows Rows +} + +// +// NewDataset create new dataset, use the mode to initialize the dataset. +// +func NewDataset(mode int, types []int, names []string) ( + dataset *Dataset, +) { + dataset = &Dataset{} + + dataset.Init(mode, types, names) + + return +} + +// +// Init will set the dataset using mode and types. 
+// +func (dataset *Dataset) Init(mode int, types []int, names []string) { + if types == nil { + dataset.Columns = make(Columns, 0) + } else { + dataset.Columns = make(Columns, len(types)) + dataset.Columns.SetTypes(types) + } + + dataset.SetColumnsName(names) + dataset.SetMode(mode) +} + +// +// Clone return a copy of current dataset. +// +func (dataset *Dataset) Clone() interface{} { + clone := NewDataset(dataset.GetMode(), nil, nil) + + for _, col := range dataset.Columns { + newcol := Column{ + Type: col.Type, + Name: col.Name, + ValueSpace: col.ValueSpace, + } + clone.PushColumn(newcol) + } + + return clone +} + +// +// Reset all data and attributes. +// +func (dataset *Dataset) Reset() error { + dataset.Rows = Rows{} + dataset.Columns.Reset() + return nil +} + +// +// GetMode return mode of data. +// +func (dataset *Dataset) GetMode() int { + return dataset.Mode +} + +// +// SetMode of saved data to `mode`. +// +func (dataset *Dataset) SetMode(mode int) { + switch mode { + case DatasetModeRows: + dataset.Mode = DatasetModeRows + dataset.Rows = make(Rows, 0) + case DatasetModeColumns: + dataset.Mode = DatasetModeColumns + dataset.Columns.Reset() + default: + dataset.Mode = DatasetModeMatrix + dataset.Rows = make(Rows, 0) + dataset.Columns.Reset() + } +} + +// +// GetNColumn return the number of column in dataset. +// +func (dataset *Dataset) GetNColumn() (ncol int) { + ncol = len(dataset.Columns) + + if ncol > 0 { + return + } + + switch dataset.Mode { + case DatasetModeRows: + if len(dataset.Rows) <= 0 { + return 0 + } + return dataset.Rows[0].Len() + } + + return +} + +// +// GetNRow return number of rows in dataset. 
+// +func (dataset *Dataset) GetNRow() (nrow int) { + switch dataset.Mode { + case DatasetModeRows: + nrow = len(dataset.Rows) + case DatasetModeColumns: + if len(dataset.Columns) <= 0 { + nrow = 0 + } else { + // get length of record in the first column + nrow = dataset.Columns[0].Len() + } + case DatasetModeMatrix, DatasetNoMode: + // matrix mode could have empty either in rows or column. + nrow = len(dataset.Rows) + } + return +} + +// +// Len return number of row in dataset. +// +func (dataset *Dataset) Len() int { + return dataset.GetNRow() +} + +// +// GetColumnsType return the type of all columns. +// +func (dataset *Dataset) GetColumnsType() (types []int) { + for x := range dataset.Columns { + types = append(types, dataset.Columns[x].Type) + } + + return +} + +// +// SetColumnsType of data in all columns. +// +func (dataset *Dataset) SetColumnsType(types []int) { + dataset.Columns = make(Columns, len(types)) + dataset.Columns.SetTypes(types) +} + +// +// GetColumnTypeAt return type of column in index `colidx` in dataset. +// +func (dataset *Dataset) GetColumnTypeAt(idx int) (int, error) { + if idx >= dataset.GetNColumn() { + return TUndefined, ErrColIdxOutOfRange + } + + return dataset.Columns[idx].Type, nil +} + +// +// SetColumnTypeAt will set column type at index `colidx` to `tipe`. +// +func (dataset *Dataset) SetColumnTypeAt(idx, tipe int) error { + if idx >= dataset.GetNColumn() { + return ErrColIdxOutOfRange + } + + dataset.Columns[idx].Type = tipe + return nil +} + +// +// GetColumnsName return name of all columns. +// +func (dataset *Dataset) GetColumnsName() (names []string) { + for x := range dataset.Columns { + names = append(names, dataset.Columns[x].Name) + } + + return +} + +// +// SetColumnsName set column name. +// +func (dataset *Dataset) SetColumnsName(names []string) { + nameslen := len(names) + + if nameslen <= 0 { + // empty names, return immediately. 
+ return + } + + collen := dataset.GetNColumn() + + if collen <= 0 { + dataset.Columns = make(Columns, nameslen) + collen = nameslen + } + + // find minimum length + minlen := collen + if nameslen < collen { + minlen = nameslen + } + + for x := 0; x < minlen; x++ { + dataset.Columns[x].Name = names[x] + } +} + +// +// AddColumn will create and add new empty column with specific type and name +// into dataset. +// +func (dataset *Dataset) AddColumn(tipe int, name string, vs []string) { + col := Column{ + Type: tipe, + Name: name, + ValueSpace: vs, + } + dataset.PushColumn(col) +} + +// +// GetColumn return pointer to column object at index `idx`. If `idx` is out of +// range return nil. +// +func (dataset *Dataset) GetColumn(idx int) (col *Column) { + if idx > dataset.GetNColumn() { + return + } + + switch dataset.Mode { + case DatasetModeRows: + dataset.TransposeToColumns() + case DatasetModeColumns: + // do nothing + case DatasetModeMatrix: + // do nothing + } + + return &dataset.Columns[idx] +} + +// +// GetColumnByName return column based on their `name`. +// +func (dataset *Dataset) GetColumnByName(name string) (col *Column) { + switch dataset.Mode { + case DatasetModeRows: + dataset.TransposeToColumns() + } + + for x, col := range dataset.Columns { + if col.Name == name { + return &dataset.Columns[x] + } + } + return +} + +// +// GetColumns return columns in dataset, without transposing. +// +func (dataset *Dataset) GetColumns() *Columns { + return &dataset.Columns +} + +// +// SetColumns will replace current columns with new one from parameter. +// +func (dataset *Dataset) SetColumns(cols *Columns) { + dataset.Columns = *cols +} + +// +// GetRow return pointer to row at index `idx` or nil if index is out of range. +// +func (dataset *Dataset) GetRow(idx int) *Row { + if idx < 0 { + return nil + } + if idx >= dataset.Rows.Len() { + return nil + } + return dataset.Rows[idx] +} + +// +// GetRows return rows in dataset, without transposing. 
+// +func (dataset *Dataset) GetRows() *Rows { + return &dataset.Rows +} + +// +// SetRows will replace current rows with new one from parameter. +// +func (dataset *Dataset) SetRows(rows *Rows) { + dataset.Rows = *rows +} + +// +// GetData return the data, based on mode (rows, columns, or matrix). +// +func (dataset *Dataset) GetData() interface{} { + switch dataset.Mode { + case DatasetModeRows: + return &dataset.Rows + case DatasetModeColumns: + return &dataset.Columns + case DatasetModeMatrix, DatasetNoMode: + return &Matrix{ + Columns: &dataset.Columns, + Rows: &dataset.Rows, + } + } + + return nil +} + +// +// GetDataAsRows return data in rows mode. +// +func (dataset *Dataset) GetDataAsRows() *Rows { + if dataset.Mode == DatasetModeColumns { + dataset.TransposeToRows() + } + return &dataset.Rows +} + +// +// GetDataAsColumns return data in columns mode. +// +func (dataset *Dataset) GetDataAsColumns() (columns *Columns) { + if dataset.Mode == DatasetModeRows { + dataset.TransposeToColumns() + } + return &dataset.Columns +} + +// +// TransposeToColumns move all data from rows (horizontal) to columns +// (vertical) mode. +// +func (dataset *Dataset) TransposeToColumns() { + if dataset.GetNRow() <= 0 { + // nothing to transpose + return + } + + ncol := dataset.GetNColumn() + if ncol <= 0 { + // if no columns defined, initialize it using record type + // in the first row. + types := dataset.GetRow(0).Types() + dataset.SetColumnsType(types) + ncol = len(types) + } + + orgmode := dataset.GetMode() + + switch orgmode { + case DatasetModeRows: + // do nothing. + case DatasetModeColumns, DatasetModeMatrix, DatasetNoMode: + // check if column records contain data. 
+ nrow := dataset.Columns[0].Len() + if nrow > 0 { + // return if column record is not empty, its already + // transposed + return + } + } + + // use the least length + minlen := len(*dataset.GetRow(0)) + + if minlen > ncol { + minlen = ncol + } + + switch orgmode { + case DatasetModeRows, DatasetNoMode: + dataset.SetMode(DatasetModeColumns) + } + + for _, row := range dataset.Rows { + for y := 0; y < minlen; y++ { + dataset.Columns[y].PushBack((*row)[y]) + } + } + + // reset the rows data only if original mode is rows + // this to prevent empty data when mode is matrix. + switch orgmode { + case DatasetModeRows: + dataset.Rows = nil + } +} + +// +// TransposeToRows will move all data from columns (vertical) to rows +// (horizontal) mode. +// +func (dataset *Dataset) TransposeToRows() { + orgmode := dataset.GetMode() + + if orgmode == DatasetModeRows { + // already transposed + return + } + + if orgmode == DatasetModeColumns { + // only set mode if transposing from columns to rows + dataset.SetMode(DatasetModeRows) + } + + // Get the max length of columns. + rowlen := math.MinInt32 + flen := len(dataset.Columns) + + for f := 0; f < flen; f++ { + l := dataset.Columns[f].Len() + + if l > rowlen { + rowlen = l + } + } + + dataset.Rows = make(Rows, 0) + + // Transpose record from column to row. + for r := 0; r < rowlen; r++ { + row := make(Row, flen) + + for f := 0; f < flen; f++ { + if dataset.Columns[f].Len() > r { + row[f] = dataset.Columns[f].Records[r] + } else { + row[f] = NewRecord() + } + } + + dataset.Rows = append(dataset.Rows, &row) + } + + // Only reset the columns if original dataset mode is "columns". + // This to prevent empty data when mode is matrix. + if orgmode == DatasetModeColumns { + dataset.Columns.Reset() + } +} + +// +// PushRow save the data, which is already in row object, to Rows. 
+// +func (dataset *Dataset) PushRow(row *Row) { + switch dataset.GetMode() { + case DatasetModeRows: + dataset.Rows = append(dataset.Rows, row) + case DatasetModeColumns: + dataset.PushRowToColumns(row) + case DatasetModeMatrix, DatasetNoMode: + dataset.Rows = append(dataset.Rows, row) + dataset.PushRowToColumns(row) + } +} + +// +// PushRowToColumns push each data in Row to Columns. +// +func (dataset *Dataset) PushRowToColumns(row *Row) { + rowlen := row.Len() + if rowlen <= 0 { + // return immediately if no data in row. + return + } + + // check if columns is initialize. + collen := len(dataset.Columns) + if collen <= 0 { + dataset.Columns = make(Columns, rowlen) + collen = rowlen + } + + // pick the minimum length. + min := rowlen + if collen < rowlen { + min = collen + } + + for x := 0; x < min; x++ { + dataset.Columns[x].PushBack((*row)[x]) + } +} + +// +// FillRowsWithColumn given a column, fill the dataset with row where the record +// only set at index `colIdx`. +// +// Example, content of dataset was, +// +// index: 0 1 2 +// A B C +// X (step 1) nrow = 2 +// +// If we filled column at index 2 with [Y Z], the dataset will become: +// +// index: 0 1 2 +// A B C +// X Y (step 2) fill the empty row +// Z (step 3) create dummy row which contain the rest of column data. +// +func (dataset *Dataset) FillRowsWithColumn(colIdx int, col Column) { + if dataset.GetMode() != DatasetModeRows { + // Only work if dataset mode is ROWS + return + } + + nrow := dataset.GetNRow() + emptyAt := nrow + + // (step 1) Find the row with empty records + for x, row := range dataset.Rows { + if row.IsNilAt(colIdx) { + emptyAt = x + break + } + } + + // (step 2) Fill the empty rows using column records. + y := 0 + for x := emptyAt; x < nrow; x++ { + dataset.Rows[x].SetValueAt(colIdx, col.Records[y]) + y++ + } + + // (step 3) Continue filling the column but using dummy row which + // contain only record at index `colIdx`. 
+ ncol := dataset.GetNColumn() + nrow = col.Len() + for ; y < nrow; y++ { + row := make(Row, ncol) + + for z := 0; z < ncol; z++ { + if z == colIdx { + row[colIdx] = col.Records[y] + } else { + row[z] = NewRecord() + } + } + + dataset.PushRow(&row) + } +} + +// +// PushColumn will append new column to the end of slice if no existing column +// with the same name. If it exist, the records will be merged. +// +func (dataset *Dataset) PushColumn(col Column) { + exist := false + colIdx := 0 + for x, c := range dataset.Columns { + if c.Name == col.Name { + exist = true + colIdx = x + break + } + } + + switch dataset.GetMode() { + case DatasetModeRows: + if exist { + dataset.FillRowsWithColumn(colIdx, col) + } else { + // append new column + dataset.Columns = append(dataset.Columns, col) + dataset.PushColumnToRows(col) + // Remove records in column + dataset.Columns[dataset.GetNColumn()-1].Reset() + } + case DatasetModeColumns: + if exist { + dataset.Columns[colIdx].PushRecords(col.Records) + } else { + dataset.Columns = append(dataset.Columns, col) + } + case DatasetModeMatrix, DatasetNoMode: + if exist { + dataset.Columns[colIdx].PushRecords(col.Records) + } else { + dataset.Columns = append(dataset.Columns, col) + dataset.PushColumnToRows(col) + } + } +} + +// +// PushColumnToRows add each record in column to each rows, from top to bottom. +// +func (dataset *Dataset) PushColumnToRows(col Column) { + colsize := col.Len() + if colsize <= 0 { + // Do nothing if column is empty. + return + } + + nrow := dataset.GetNRow() + if nrow <= 0 { + // If no existing rows in dataset, initialize the rows slice. + dataset.Rows = make(Rows, colsize) + + for nrow = 0; nrow < colsize; nrow++ { + row := make(Row, 0) + dataset.Rows[nrow] = &row + } + } + + // Pick the minimum length between column or current row length. 
+ minrow := nrow + + if colsize < nrow { + minrow = colsize + } + + // Push each record in column to each rows + var row *Row + var rec *Record + + for x := 0; x < minrow; x++ { + row = dataset.Rows[x] + rec = col.Records[x] + + row.PushBack(rec) + } +} + +// +// MergeColumns append columns from other dataset into current dataset. +// +func (dataset *Dataset) MergeColumns(other DatasetInterface) { + othermode := other.GetMode() + if othermode == DatasetModeRows { + other.TransposeToColumns() + } + + cols := other.GetDataAsColumns() + for _, col := range *cols { + dataset.PushColumn(col) + } + + switch othermode { + case DatasetModeRows: + other.TransposeToRows() + } +} + +// +// MergeRows append rows from other dataset into current dataset. +// +func (dataset *Dataset) MergeRows(other DatasetInterface) { + rows := other.GetDataAsRows() + for _, row := range *rows { + dataset.PushRow(row) + } +} + +// +// DeleteRow will detach row at index `i` from dataset and return it. +// +func (dataset *Dataset) DeleteRow(i int) (row *Row) { + if i < 0 { + return + } + if i >= dataset.Rows.Len() { + return + } + + orgmode := dataset.GetMode() + if orgmode == DatasetModeColumns { + dataset.TransposeToRows() + } + + row = dataset.Rows.Del(i) + + if orgmode == DatasetModeColumns { + dataset.TransposeToColumns() + } + + if orgmode != DatasetModeRows { + // Delete record in each columns as the same index as deleted + // row. + for x := range dataset.Columns { + dataset.Columns[x].DeleteRecordAt(i) + } + } + + return row +} diff --git a/lib/tabula/dataset_bench_test.go b/lib/tabula/dataset_bench_test.go new file mode 100644 index 00000000..86e36cc9 --- /dev/null +++ b/lib/tabula/dataset_bench_test.go @@ -0,0 +1,20 @@ +// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be found +// in the LICENSE file. 
+ +package tabula + +import ( + "testing" +) + +func BenchmarkPushRow(b *testing.B) { + dataset := NewDataset(DatasetModeRows, nil, nil) + + for i := 0; i < b.N; i++ { + e := populateWithRows(dataset) + if e != nil { + b.Fatal(e) + } + } +} diff --git a/lib/tabula/dataset_test.go b/lib/tabula/dataset_test.go new file mode 100644 index 00000000..0b43f71c --- /dev/null +++ b/lib/tabula/dataset_test.go @@ -0,0 +1,365 @@ +// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be found +// in the LICENSE file. + +package tabula + +import ( + "fmt" + "testing" + + "github.com/shuLhan/share/lib/test" +) + +var datasetRows = [][]string{ + {"0", "1", "A"}, + {"1", "1.1", "B"}, + {"2", "1.2", "A"}, + {"3", "1.3", "B"}, + {"4", "1.4", "C"}, + {"5", "1.5", "D"}, + {"6", "1.6", "C"}, + {"7", "1.7", "D"}, + {"8", "1.8", "E"}, + {"9", "1.9", "F"}, +} + +var datasetCols = [][]string{ + {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}, + {"1", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "1.8", "1.9"}, + {"A", "B", "A", "B", "C", "D", "C", "D", "E", "F"}, +} + +var datasetTypes = []int{ + TInteger, + TReal, + TString, +} + +var datasetNames = []string{"int", "real", "string"} + +func populateWithRows(dataset *Dataset) error { + for _, rowin := range datasetRows { + row := make(Row, len(rowin)) + + for x, recin := range rowin { + rec, e := NewRecordBy(recin, datasetTypes[x]) + if e != nil { + return e + } + + row[x] = rec + } + + dataset.PushRow(&row) + } + return nil +} + +func populateWithColumns(t *testing.T, dataset *Dataset) { + for x := range datasetCols { + col, e := NewColumnString(datasetCols[x], datasetTypes[x], + datasetNames[x]) + if e != nil { + t.Fatal(e) + } + + dataset.PushColumn(*col) + } +} + +func createDataset(t *testing.T) (dataset *Dataset) { + dataset = NewDataset(DatasetModeRows, datasetTypes, + datasetNames) + + e := populateWithRows(dataset) + if e != nil { + 
t.Fatal(e) + } + + return +} + +func DatasetStringJoinByIndex(t *testing.T, dataset [][]string, indis []int) (res string) { + for x := range indis { + res += fmt.Sprint("&", dataset[indis[x]]) + } + return res +} + +func DatasetRowsJoin(t *testing.T) (s string) { + for x := range datasetRows { + s += fmt.Sprint("&", datasetRows[x]) + } + return +} + +func DatasetColumnsJoin(t *testing.T) (s string) { + for x := range datasetCols { + s += fmt.Sprint(datasetCols[x]) + } + return +} + +func TestSplitRowsByNumeric(t *testing.T) { + dataset := createDataset(t) + + // Split integer by float + splitL, splitR, e := SplitRowsByNumeric(dataset, 0, 4.5) + if e != nil { + t.Fatal(e) + } + + expIdx := []int{0, 1, 2, 3, 4} + exp := DatasetStringJoinByIndex(t, datasetRows, expIdx) + rows := splitL.GetDataAsRows() + got := fmt.Sprint(rows) + + test.Assert(t, "", exp, got, true) + + expIdx = []int{5, 6, 7, 8, 9} + exp = DatasetStringJoinByIndex(t, datasetRows, expIdx) + got = fmt.Sprint(splitR.GetDataAsRows()) + + test.Assert(t, "", exp, got, true) + + // Split by float + splitL, splitR, e = SplitRowsByNumeric(dataset, 1, 1.8) + if e != nil { + t.Fatal(e) + } + + expIdx = []int{0, 1, 2, 3, 4, 5, 6, 7} + exp = DatasetStringJoinByIndex(t, datasetRows, expIdx) + got = fmt.Sprint(splitL.GetDataAsRows()) + + test.Assert(t, "", exp, got, true) + + expIdx = []int{8, 9} + exp = DatasetStringJoinByIndex(t, datasetRows, expIdx) + got = fmt.Sprint(splitR.GetDataAsRows()) + + test.Assert(t, "", exp, got, true) +} + +func TestSplitRowsByCategorical(t *testing.T) { + dataset := createDataset(t) + splitval := []string{"A", "D"} + + splitL, splitR, e := SplitRowsByCategorical(dataset, 2, + splitval) + if e != nil { + t.Fatal(e) + } + + expIdx := []int{0, 2, 5, 7} + exp := DatasetStringJoinByIndex(t, datasetRows, expIdx) + got := fmt.Sprint(splitL.GetDataAsRows()) + + test.Assert(t, "", exp, got, true) + + expIdx = []int{1, 3, 4, 6, 8, 9} + exp = DatasetStringJoinByIndex(t, datasetRows, expIdx) + 
got = fmt.Sprint(splitR.GetDataAsRows()) + + test.Assert(t, "", exp, got, true) +} + +func TestModeColumnsPushColumn(t *testing.T) { + dataset := NewDataset(DatasetModeColumns, nil, nil) + + exp := "" + got := "" + for x := range datasetCols { + col, e := NewColumnString(datasetCols[x], datasetTypes[x], + datasetNames[x]) + if e != nil { + t.Fatal(e) + } + + dataset.PushColumn(*col) + + exp += fmt.Sprint(datasetCols[x]) + got += fmt.Sprint(dataset.Columns[x].Records) + } + + test.Assert(t, "", exp, got, true) + + // Check rows + exp = "" + got = fmt.Sprint(dataset.Rows) + test.Assert(t, "", exp, got, true) +} + +func TestModeRowsPushColumn(t *testing.T) { + dataset := NewDataset(DatasetModeRows, nil, nil) + + populateWithColumns(t, dataset) + + // Check rows + exp := DatasetRowsJoin(t) + got := fmt.Sprint(dataset.Rows) + + test.Assert(t, "", exp, got, true) + + // Check columns + exp = "[{int 1 0 [] []} {real 2 0 [] []} {string 0 0 [] []}]" + got = fmt.Sprint(dataset.Columns) + + test.Assert(t, "", exp, got, true) +} + +func TestModeMatrixPushColumn(t *testing.T) { + dataset := NewDataset(DatasetModeMatrix, nil, nil) + + exp := "" + got := "" + for x := range datasetCols { + col, e := NewColumnString(datasetCols[x], datasetTypes[x], + datasetNames[x]) + if e != nil { + t.Fatal(e) + } + + dataset.PushColumn(*col) + + exp += fmt.Sprint(datasetCols[x]) + got += fmt.Sprint(dataset.Columns[x].Records) + } + + test.Assert(t, "", exp, got, true) + + // Check rows + exp = DatasetRowsJoin(t) + got = fmt.Sprint(dataset.Rows) + + test.Assert(t, "", exp, got, true) +} + +func TestModeRowsPushRows(t *testing.T) { + dataset := NewDataset(DatasetModeRows, nil, nil) + + e := populateWithRows(dataset) + if e != nil { + t.Fatal(e) + } + + exp := DatasetRowsJoin(t) + got := fmt.Sprint(dataset.Rows) + + test.Assert(t, "", exp, got, true) +} + +func TestModeColumnsPushRows(t *testing.T) { + dataset := NewDataset(DatasetModeColumns, nil, nil) + + e := populateWithRows(dataset) + if e != 
nil { + t.Fatal(e) + } + + // check rows + exp := "" + got := fmt.Sprint(dataset.Rows) + + test.Assert(t, "", exp, got, true) + + // check columns + exp = DatasetColumnsJoin(t) + got = "" + for x := range dataset.Columns { + got += fmt.Sprint(dataset.Columns[x].Records) + } + + test.Assert(t, "", exp, got, true) +} + +func TestModeMatrixPushRows(t *testing.T) { + dataset := NewDataset(DatasetModeMatrix, nil, nil) + + e := populateWithRows(dataset) + if e != nil { + t.Fatal(e) + } + + exp := DatasetRowsJoin(t) + got := fmt.Sprint(dataset.Rows) + + test.Assert(t, "", exp, got, true) + + // check columns + exp = DatasetColumnsJoin(t) + got = "" + for x := range dataset.Columns { + got += fmt.Sprint(dataset.Columns[x].Records) + } + + test.Assert(t, "", exp, got, true) +} + +func TestSelectRowsWhere(t *testing.T) { + dataset := NewDataset(DatasetModeMatrix, nil, nil) + + e := populateWithRows(dataset) + if e != nil { + t.Fatal(e) + } + + // select all rows where the first column value is 9. + selected := SelectRowsWhere(dataset, 0, "9") + exp := dataset.GetRow(9) + got := selected.GetRow(0) + + test.Assert(t, "", exp, got, true) +} + +func TestDeleteRow(t *testing.T) { + dataset := NewDataset(DatasetModeMatrix, nil, nil) + + e := populateWithRows(dataset) + if e != nil { + t.Fatal(e) + } + + delIdx := 2 + + // Check rows len. + exp := dataset.Len() - 1 + dataset.DeleteRow(delIdx) + got := dataset.Len() + + test.Assert(t, "", exp, got, true) + + // Check columns len. + for _, col := range dataset.Columns { + got = col.Len() + + test.Assert(t, "", exp, got, true) + } + + // Check rows data. + ridx := 0 + for x, row := range datasetRows { + if x == delIdx { + continue + } + exp := fmt.Sprint("&", row) + got := fmt.Sprint(dataset.GetRow(ridx)) + ridx++ + + test.Assert(t, "", exp, got, true) + } + + // Check columns data. + for x := range dataset.Columns { + col := datasetCols[x] + + coldel := []string{} + coldel = append(coldel, col[:delIdx]...) 
+ coldel = append(coldel, col[delIdx+1:]...) + + exp := fmt.Sprint(coldel) + got := fmt.Sprint(dataset.Columns[x].Records) + test.Assert(t, "", exp, got, true) + } +} diff --git a/lib/tabula/datasetinterface.go b/lib/tabula/datasetinterface.go new file mode 100644 index 00000000..b68b5b12 --- /dev/null +++ b/lib/tabula/datasetinterface.go @@ -0,0 +1,442 @@ +// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be found +// in the LICENSE file. + +package tabula + +import ( + "encoding/json" + "fmt" + "io/ioutil" + + "github.com/shuLhan/share/lib/debug" +) + +// +// DatasetInterface is the interface for working with DSV data. +// +type DatasetInterface interface { + Init(mode int, types []int, names []string) + Clone() interface{} + Reset() error + + GetMode() int + SetMode(mode int) + + GetNColumn() int + GetNRow() int + Len() int + + GetColumnsType() []int + SetColumnsType(types []int) + + GetColumnTypeAt(idx int) (int, error) + SetColumnTypeAt(idx, tipe int) error + + GetColumnsName() []string + SetColumnsName(names []string) + + AddColumn(tipe int, name string, vs []string) + GetColumn(idx int) *Column + GetColumnByName(name string) *Column + GetColumns() *Columns + SetColumns(*Columns) + + GetRow(idx int) *Row + GetRows() *Rows + SetRows(*Rows) + DeleteRow(idx int) *Row + + GetData() interface{} + GetDataAsRows() *Rows + GetDataAsColumns() *Columns + + TransposeToColumns() + TransposeToRows() + + PushRow(r *Row) + PushRowToColumns(r *Row) + FillRowsWithColumn(colidx int, col Column) + PushColumn(col Column) + PushColumnToRows(col Column) + + MergeColumns(DatasetInterface) + MergeRows(DatasetInterface) +} + +// +// ReadDatasetConfig open dataset configuration file and initialize dataset +// field from there. 
+// +func ReadDatasetConfig(ds interface{}, fcfg string) (e error) { + cfg, e := ioutil.ReadFile(fcfg) + + if nil != e { + return e + } + + return json.Unmarshal(cfg, ds) +} + +// +// SortColumnsByIndex will sort all columns using sorted index. +// +func SortColumnsByIndex(di DatasetInterface, sortedIdx []int) { + if di.GetMode() == DatasetModeRows { + di.TransposeToColumns() + } + + cols := di.GetColumns() + for x, col := range *cols { + colsorted := col.Records.SortByIndex(sortedIdx) + (*cols)[x].SetRecords(colsorted) + } +} + +// +// SplitRowsByNumeric will split the data using splitVal in column `colidx`. +// +// For example, given two continuous attribute, +// +// A: {1,2,3,4} +// B: {5,6,7,8} +// +// if colidx is (1) B and splitVal is 7, the data will splitted into left set +// +// A': {1,2} +// B': {5,6} +// +// and right set +// +// A'': {3,4} +// B'': {7,8} +// +func SplitRowsByNumeric(di DatasetInterface, colidx int, splitVal float64) ( + splitLess DatasetInterface, + splitGreater DatasetInterface, + e error, +) { + // check type of column + coltype, e := di.GetColumnTypeAt(colidx) + if e != nil { + return + } + + if !(coltype == TInteger || coltype == TReal) { + return splitLess, splitGreater, ErrInvalidColType + } + + // Should we convert the data mode back later. 
+ orgmode := di.GetMode() + + if orgmode == DatasetModeColumns { + di.TransposeToRows() + } + + if debug.Value >= 2 { + fmt.Println("[tabula] dataset:", di) + } + + splitLess = di.Clone().(DatasetInterface) + splitGreater = di.Clone().(DatasetInterface) + + rows := di.GetRows() + for _, row := range *rows { + if (*row)[colidx].Float() < splitVal { + splitLess.PushRow(row) + } else { + splitGreater.PushRow(row) + } + } + + if debug.Value >= 2 { + fmt.Println("[tabula] split less:", splitLess) + fmt.Println("[tabula] split greater:", splitGreater) + } + + switch orgmode { + case DatasetModeColumns: + di.TransposeToColumns() + splitLess.TransposeToColumns() + splitGreater.TransposeToColumns() + case DatasetModeMatrix: + // do nothing, since its already filled when pushing new row. + } + + return +} + +// +// SplitRowsByCategorical will split the data using a set of split value in +// column `colidx`. +// +// For example, given two attributes, +// +// X: [A,B,A,B,C,D,C,D] +// Y: [1,2,3,4,5,6,7,8] +// +// if colidx is (0) or A and split value is a set `[A,C]`, the data will +// splitted into left set which contain all rows that have A or C, +// +// X': [A,A,C,C] +// Y': [1,3,5,7] +// +// and the right set, excluded set, will contain all rows which is not A or C, +// +// X'': [B,B,D,D] +// Y'': [2,4,6,8] +// +func SplitRowsByCategorical(di DatasetInterface, colidx int, + splitVal []string) ( + splitIn DatasetInterface, + splitEx DatasetInterface, + e error, +) { + // check type of column + coltype, e := di.GetColumnTypeAt(colidx) + if e != nil { + return + } + + if coltype != TString { + return splitIn, splitEx, ErrInvalidColType + } + + // should we convert the data mode back? 
+ orgmode := di.GetMode() + + if orgmode == DatasetModeColumns { + di.TransposeToRows() + } + + splitIn = di.Clone().(DatasetInterface) + splitEx = di.Clone().(DatasetInterface) + + for _, row := range *di.GetRows() { + found := false + for _, val := range splitVal { + if (*row)[colidx].String() == val { + splitIn.PushRow(row) + found = true + break + } + } + if !found { + splitEx.PushRow(row) + } + } + + // convert all dataset based on original + switch orgmode { + case DatasetModeColumns: + di.TransposeToColumns() + splitIn.TransposeToColumns() + splitEx.TransposeToColumns() + case DatasetModeMatrix, DatasetNoMode: + splitIn.TransposeToColumns() + splitEx.TransposeToColumns() + } + + return +} + +// +// SplitRowsByValue generic function to split data by value. This function will +// split data using value in column `colidx`. If value is numeric it will return +// any rows that have column value less than `value` in `splitL`, and any column +// value greater or equal to `value` in `splitR`. +// +func SplitRowsByValue(di DatasetInterface, colidx int, value interface{}) ( + splitL DatasetInterface, + splitR DatasetInterface, + e error, +) { + coltype, e := di.GetColumnTypeAt(colidx) + if e != nil { + return + } + + if coltype == TString { + splitL, splitR, e = SplitRowsByCategorical(di, colidx, + value.([]string)) + } else { + var splitval float64 + + switch value.(type) { + case int: + splitval = float64(value.(int)) + case int64: + splitval = float64(value.(int64)) + case float32: + splitval = float64(value.(float32)) + case float64: + splitval = value.(float64) + } + + splitL, splitR, e = SplitRowsByNumeric(di, colidx, + splitval) + } + + if e != nil { + return nil, nil, e + } + + return +} + +// +// SelectRowsWhere return all rows which column value in `colidx` is equal to +// `colval`. 
+// +func SelectRowsWhere(dataset DatasetInterface, colidx int, colval string) DatasetInterface { + orgmode := dataset.GetMode() + + if orgmode == DatasetModeColumns { + dataset.TransposeToRows() + } + + selected := NewDataset(dataset.GetMode(), nil, nil) + + selected.Rows = dataset.GetRows().SelectWhere(colidx, colval) + + switch orgmode { + case DatasetModeColumns: + dataset.TransposeToColumns() + selected.TransposeToColumns() + case DatasetModeMatrix, DatasetNoMode: + selected.TransposeToColumns() + } + + return selected +} + +// +// RandomPickRows return `n` item of row that has been selected randomly from +// dataset.Rows. The ids of rows that has been picked is saved id `pickedIdx`. +// +// If duplicate is true, the row that has been picked can be picked up again, +// otherwise it only allow one pick. This is also called as random selection +// with or without replacement in machine learning domain. +// +// If output mode is columns, it will be transposed to rows. +// +func RandomPickRows(dataset DatasetInterface, n int, duplicate bool) ( + picked DatasetInterface, + unpicked DatasetInterface, + pickedIdx []int, + unpickedIdx []int, +) { + orgmode := dataset.GetMode() + + if orgmode == DatasetModeColumns { + dataset.TransposeToRows() + } + + picked = dataset.Clone().(DatasetInterface) + unpicked = dataset.Clone().(DatasetInterface) + + pickedRows, unpickedRows, pickedIdx, unpickedIdx := + dataset.GetRows().RandomPick(n, duplicate) + + picked.SetRows(&pickedRows) + unpicked.SetRows(&unpickedRows) + + // switch the dataset based on original mode + switch orgmode { + case DatasetModeColumns: + dataset.TransposeToColumns() + // transform the picked and unpicked set. + picked.TransposeToColumns() + unpicked.TransposeToColumns() + + case DatasetModeMatrix, DatasetNoMode: + // transform the picked and unpicked set. 
+ picked.TransposeToColumns() + unpicked.TransposeToColumns() + } + + return +} + +// +// RandomPickColumns will select `n` column randomly from dataset and return +// new dataset with picked and unpicked columns, and their column index. +// +// If duplicate is true, column that has been pick up can be pick up again. +// +// If dataset output mode is rows, it will transposed to columns. +// +func RandomPickColumns(dataset DatasetInterface, n int, dup bool, + excludeIdx []int) ( + picked DatasetInterface, + unpicked DatasetInterface, + pickedIdx []int, + unpickedIdx []int, +) { + orgmode := dataset.GetMode() + + if orgmode == DatasetModeRows { + dataset.TransposeToColumns() + } + + picked = dataset.Clone().(DatasetInterface) + unpicked = dataset.Clone().(DatasetInterface) + + pickedColumns, unpickedColumns, pickedIdx, unpickedIdx := + dataset.GetColumns().RandomPick(n, dup, excludeIdx) + + picked.SetColumns(&pickedColumns) + unpicked.SetColumns(&unpickedColumns) + + // transpose picked and unpicked dataset based on original mode + switch orgmode { + case DatasetModeRows: + dataset.TransposeToRows() + picked.TransposeToRows() + unpicked.TransposeToRows() + case DatasetModeMatrix, DatasetNoMode: + picked.TransposeToRows() + unpicked.TransposeToRows() + } + + return +} + +// +// SelectColumnsByIdx return new dataset with selected column index. 
+// +func SelectColumnsByIdx(dataset DatasetInterface, colsIdx []int) ( + newset DatasetInterface, +) { + var col *Column + + orgmode := dataset.GetMode() + + if orgmode == DatasetModeRows { + dataset.TransposeToColumns() + } + + newset = dataset.Clone().(DatasetInterface) + + for _, idx := range colsIdx { + col = dataset.GetColumn(idx) + if col == nil { + continue + } + + newset.PushColumn(*col) + } + + // revert the mode back + switch orgmode { + case DatasetModeRows: + dataset.TransposeToRows() + newset.TransposeToRows() + case DatasetModeColumns: + // do nothing + case DatasetModeMatrix: + // do nothing + } + + return +} diff --git a/lib/tabula/maprows.go b/lib/tabula/maprows.go new file mode 100644 index 00000000..a93f0308 --- /dev/null +++ b/lib/tabula/maprows.go @@ -0,0 +1,65 @@ +// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be found +// in the LICENSE file. + +package tabula + +import ( + "math" +) + +// +// MapRowsElement represent a single mapping of string key to rows. +// +type MapRowsElement struct { + Key string + Value Rows +} + +// +// MapRows represent a list of mapping between string key and rows. +// +type MapRows []MapRowsElement + +// +// insertRow will insert a row `v` into map using key `k`. +// +func (mapRows *MapRows) insertRow(k string, v *Row) { + rows := Rows{} + rows.PushBack(v) + el := MapRowsElement{k, rows} + (*mapRows) = append((*mapRows), el) +} + +// +// AddRow will append a row `v` into map value if they key `k` exist in map, +// otherwise it will insert a new map element. +// +func (mapRows *MapRows) AddRow(k string, v *Row) { + for x := range *mapRows { + if (*mapRows)[x].Key == k { + (*mapRows)[x].Value.PushBack(v) + return + } + } + // no key found on map + mapRows.insertRow(k, v) +} + +// +// GetMinority return map value which contain the minimum rows. 
+// +func (mapRows *MapRows) GetMinority() (keyMin string, valMin Rows) { + min := math.MaxInt32 + + for k := range *mapRows { + v := (*mapRows)[k].Value + l := len(v) + if l < min { + keyMin = (*mapRows)[k].Key + valMin = v + min = l + } + } + return +} diff --git a/lib/tabula/maprows_test.go b/lib/tabula/maprows_test.go new file mode 100644 index 00000000..19cd5ac8 --- /dev/null +++ b/lib/tabula/maprows_test.go @@ -0,0 +1,54 @@ +// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be found +// in the LICENSE file. + +package tabula + +import ( + "fmt" + "testing" + + "github.com/shuLhan/share/lib/test" +) + +func TestAddRow(t *testing.T) { + mapRows := MapRows{} + rows, e := initRows() + + if e != nil { + t.Fatal(e) + } + + for _, row := range rows { + key := fmt.Sprint((*row)[testClassIdx].Interface()) + mapRows.AddRow(key, row) + } + + got := fmt.Sprint(mapRows) + + test.Assert(t, "", groupByExpect, got, true) +} + +func TestGetMinority(t *testing.T) { + mapRows := MapRows{} + rows, e := initRows() + + if e != nil { + t.Fatal(e) + } + + for _, row := range rows { + key := fmt.Sprint((*row)[testClassIdx].Interface()) + mapRows.AddRow(key, row) + } + + // remove the first row in the first key, so we can make it minority. + mapRows[0].Value.PopFront() + + _, minRows := mapRows.GetMinority() + + exp := rowsExpect[3] + got := fmt.Sprint(minRows) + + test.Assert(t, "", exp, got, true) +} diff --git a/lib/tabula/matrix.go b/lib/tabula/matrix.go new file mode 100644 index 00000000..62ab68ac --- /dev/null +++ b/lib/tabula/matrix.go @@ -0,0 +1,13 @@ +// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be found +// in the LICENSE file. + +package tabula + +// +// Matrix is a combination of columns and rows. 
+// +type Matrix struct { + Columns *Columns + Rows *Rows +} diff --git a/lib/tabula/record.go b/lib/tabula/record.go new file mode 100644 index 00000000..527ab430 --- /dev/null +++ b/lib/tabula/record.go @@ -0,0 +1,292 @@ +// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be found +// in the LICENSE file. + +package tabula + +import ( + "math" + "reflect" + "strconv" +) + +const ( + // TUndefined for undefined type + TUndefined = -1 + // TString string type. + TString = 0 + // TInteger integer type (64 bit). + TInteger = 1 + // TReal float type (64 bit). + TReal = 2 +) + +// +// Record represent the smallest building block of data-set. +// +type Record struct { + v interface{} +} + +// +// NewRecord will create and return record with nil value. +// +func NewRecord() *Record { + return &Record{v: nil} +} + +// +// NewRecordBy create new record from string with type set to `t`. +// +func NewRecordBy(v string, t int) (r *Record, e error) { + r = NewRecord() + e = r.SetValue(v, t) + return +} + +// +// NewRecordString will create new record from string. +// +func NewRecordString(v string) (r *Record) { + return &Record{v: v} +} + +// +// NewRecordInt create new record from integer value. +// +func NewRecordInt(v int64) (r *Record) { + return &Record{v: v} +} + +// +// NewRecordReal create new record from float value. +// +func NewRecordReal(v float64) (r *Record) { + return &Record{v: v} +} + +// +// Clone will create and return a clone of record. +// +func (r *Record) Clone() *Record { + return &Record{v: r.v} +} + +// +// IsNil return true if record has not been set with value, or nil. +// +func (r *Record) IsNil() bool { + return r.v == nil +} + +// +// Type of record. +// +func (r *Record) Type() int { + switch r.v.(type) { + case int64: + return TInteger + case float64: + return TReal + } + return TString +} + +// +// SetValue set the record value from string using type `t`. 
If value can not +// be converted to type, it will return an error. +// +func (r *Record) SetValue(v string, t int) error { + switch t { + case TString: + r.v = v + + case TInteger: + i64, e := strconv.ParseInt(v, 10, 64) + if nil != e { + return e + } + + r.v = i64 + + case TReal: + f64, e := strconv.ParseFloat(v, 64) + if nil != e { + return e + } + + r.v = f64 + } + return nil +} + +// +// SetString will set the record value with string value. +// +func (r *Record) SetString(v string) { + r.v = v +} + +// +// SetFloat will set the record value with float 64bit. +// +func (r *Record) SetFloat(v float64) { + r.v = v +} + +// +// SetInteger will set the record value with integer 64bit. +// +func (r *Record) SetInteger(v int64) { + r.v = v +} + +// +// IsMissingValue check wether the value is a missing attribute. +// +// If its string the missing value is indicated by character '?'. +// +// If its integer the missing value is indicated by minimum negative integer, +// or math.MinInt64. +// +// If its real the missing value is indicated by -Inf. +// +func (r *Record) IsMissingValue() bool { + switch r.v.(type) { + case string: + str := r.v.(string) + if str == "?" { + return true + } + + case int64: + i64 := r.v.(int64) + if i64 == math.MinInt64 { + return true + } + + case float64: + f64 := r.v.(float64) + return math.IsInf(f64, -1) + } + + return false +} + +// +// Interface return record value as interface. +// +func (r *Record) Interface() interface{} { + return r.v +} + +// +// Bytes convert record value to slice of byte. +// +func (r *Record) Bytes() []byte { + return []byte(r.String()) +} + +// +// String convert record value to string. +// +func (r Record) String() (s string) { + switch r.v.(type) { + case string: + s = r.v.(string) + + case int64: + s = strconv.FormatInt(r.v.(int64), 10) + + case float64: + s = strconv.FormatFloat(r.v.(float64), 'f', -1, 64) + } + return +} + +// +// Float convert given record to float value. 
If its failed it will return +// the -Infinity value. +// +func (r *Record) Float() (f64 float64) { + var e error + + switch r.v.(type) { + case string: + f64, e = strconv.ParseFloat(r.v.(string), 64) + + if nil != e { + f64 = math.Inf(-1) + } + + case int64: + f64 = float64(r.v.(int64)) + + case float64: + f64 = r.v.(float64) + } + + return +} + +// +// Integer convert given record to integer value. If its failed, it will return +// the minimum integer in 64bit. +// +func (r *Record) Integer() (i64 int64) { + var e error + + switch r.v.(type) { + case string: + i64, e = strconv.ParseInt(r.v.(string), 10, 64) + + if nil != e { + i64 = math.MinInt64 + } + + case int64: + i64 = r.v.(int64) + + case float64: + i64 = int64(r.v.(float64)) + } + + return +} + +// +// IsEqual return true if record is equal with other, otherwise return false. +// +func (r *Record) IsEqual(o *Record) bool { + return reflect.DeepEqual(r.v, o.Interface()) +} + +// +// IsEqualToString return true if string representation of record value is +// equal to string `v`. +// +func (r *Record) IsEqualToString(v string) bool { + return r.String() == v +} + +// +// IsEqualToInterface return true if interface type and value equal to record +// type and value. +// +func (r *Record) IsEqualToInterface(v interface{}) bool { + return reflect.DeepEqual(r.v, v) +} + +// +// Reset will reset record value to empty string or zero, depend on type. +// +func (r *Record) Reset() { + switch r.v.(type) { + case string: + r.v = "" + case int64: + r.v = int64(0) + case float64: + r.v = float64(0) + } +} diff --git a/lib/tabula/record_test.go b/lib/tabula/record_test.go new file mode 100644 index 00000000..223f9235 --- /dev/null +++ b/lib/tabula/record_test.go @@ -0,0 +1,35 @@ +// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be found +// in the LICENSE file. 
+ +package tabula + +import ( + "fmt" + "testing" + + "github.com/shuLhan/share/lib/test" +) + +// +// TestRecord simply check how the stringer work. +// +func TestRecord(t *testing.T) { + expec := []string{"test", "1", "2"} + expType := []int{TString, TInteger, TInteger} + + row := make(Row, 0) + + for i := range expec { + r, e := NewRecordBy(expec[i], expType[i]) + if nil != e { + t.Error(e) + } + + row = append(row, r) + } + + exp := fmt.Sprint(expec) + got := fmt.Sprint(row) + test.Assert(t, "", exp, got, true) +} diff --git a/lib/tabula/records.go b/lib/tabula/records.go new file mode 100644 index 00000000..e00c03b9 --- /dev/null +++ b/lib/tabula/records.go @@ -0,0 +1,54 @@ +// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be found +// in the LICENSE file. + +package tabula + +// +// Records define slice of pointer to Record. +// +type Records []*Record + +// +// Len will return the length of records. +// +func (recs *Records) Len() int { + return len(*recs) +} + +// +// SortByIndex will sort the records using slice of index `sortedIDx` and +// return it. +// +func (recs *Records) SortByIndex(sortedIdx []int) *Records { + sorted := make(Records, len(*recs)) + + for x, v := range sortedIdx { + sorted[x] = (*recs)[v] + } + return &sorted +} + +// +// CountWhere return number of record where its value is equal to `v` type and +// value. +// +func (recs *Records) CountWhere(v interface{}) (c int) { + for _, r := range *recs { + if r.IsEqualToInterface(v) { + c++ + } + } + return +} + +// +// CountsWhere will return count of each value in slice `sv`. 
+// +func (recs *Records) CountsWhere(vs []interface{}) (counts []int) { + for _, v := range vs { + c := recs.CountWhere(v) + counts = append(counts, c) + } + return +} diff --git a/lib/tabula/records_test.go b/lib/tabula/records_test.go new file mode 100644 index 00000000..2be6f7b1 --- /dev/null +++ b/lib/tabula/records_test.go @@ -0,0 +1,29 @@ +// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be found +// in the LICENSE file. + +package tabula + +import ( + "fmt" + "testing" + + "github.com/shuLhan/share/lib/test" +) + +func TestSortByIndex(t *testing.T) { + data := make(Records, 3) + data[0] = NewRecordInt(3) + data[1] = NewRecordInt(2) + data[2] = NewRecordInt(1) + + sortedIdx := []int{2, 1, 0} + expect := []int{1, 2, 3} + + sorted := data.SortByIndex(sortedIdx) + + got := fmt.Sprint(sorted) + exp := fmt.Sprint(&expect) + + test.Assert(t, "", exp, got, true) +} diff --git a/lib/tabula/row.go b/lib/tabula/row.go new file mode 100644 index 00000000..105577c5 --- /dev/null +++ b/lib/tabula/row.go @@ -0,0 +1,123 @@ +// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be found +// in the LICENSE file. + +package tabula + +// +// Row represent slice of record. +// +type Row []*Record + +// +// Len return number of record in row. +// +func (row *Row) Len() int { + return len(*row) +} + +// +// PushBack will add new record to the end of row. +// +func (row *Row) PushBack(r *Record) { + *row = append(*row, r) +} + +// +// Types return type of all records. +// +func (row *Row) Types() (types []int) { + for _, r := range *row { + types = append(types, r.Type()) + } + return +} + +// +// Clone create and return a clone of row. 
+// +func (row *Row) Clone() *Row { + clone := make(Row, len(*row)) + + for x, rec := range *row { + clone[x] = rec.Clone() + } + return &clone +} + +// +// IsNilAt return true if there is no record value in row at `idx`, otherwise +// return false. +// +func (row *Row) IsNilAt(idx int) bool { + if idx < 0 { + return true + } + if idx >= len(*row) { + return true + } + if (*row)[idx] == nil { + return true + } + return (*row)[idx].IsNil() +} + +// +// SetValueAt will set the value of row at cell index `idx` with record `rec`. +// +func (row *Row) SetValueAt(idx int, rec *Record) { + (*row)[idx] = rec +} + +// +// GetRecord will return pointer to record at index `i`, or nil if index +// is out of range. +// +func (row *Row) GetRecord(i int) *Record { + if i < 0 { + return nil + } + if i >= row.Len() { + return nil + } + return (*row)[i] +} + +// +// GetValueAt return the value of row record at index `idx`. If the index is +// out of range it will return nil and false +// +func (row *Row) GetValueAt(idx int) (interface{}, bool) { + if row.Len() <= idx { + return nil, false + } + return (*row)[idx].Interface(), true +} + +// +// GetIntAt return the integer value of row record at index `idx`. +// If the index is out of range it will return 0 and false. +// +func (row *Row) GetIntAt(idx int) (int64, bool) { + if row.Len() <= idx { + return 0, false + } + + return (*row)[idx].Integer(), true +} + +// +// IsEqual return true if row content equal with `other` row, otherwise return +// false. +// +func (row *Row) IsEqual(other *Row) bool { + if len(*row) != len(*other) { + return false + } + for x, xrec := range *row { + if !xrec.IsEqual((*other)[x]) { + return false + } + } + return true +} diff --git a/lib/tabula/row_test.go b/lib/tabula/row_test.go new file mode 100644 index 00000000..5fa45775 --- /dev/null +++ b/lib/tabula/row_test.go @@ -0,0 +1,33 @@ +// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be found +// in the LICENSE file. + +package tabula + +import ( + "testing" + + "github.com/shuLhan/share/lib/test" +) + +var dataFloat64 = []float64{0.1, 0.2, 0.3, 0.4, 0.5} + +func createRow() (row Row) { + for _, v := range dataFloat64 { + row.PushBack(NewRecordReal(v)) + } + return +} + +func TestClone(t *testing.T) { + row := createRow() + rowClone := row.Clone() + rowClone2 := row.Clone() + + test.Assert(t, "", &row, rowClone, true) + + // changing the clone value should not change the original copy. + (*rowClone2)[0].SetFloat(0) + test.Assert(t, "", &row, rowClone, true) + test.Assert(t, "", &row, rowClone2, false) +} diff --git a/lib/tabula/rows.go b/lib/tabula/rows.go new file mode 100644 index 00000000..fcaed021 --- /dev/null +++ b/lib/tabula/rows.go @@ -0,0 +1,251 @@ +// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be found +// in the LICENSE file. + +package tabula + +import ( + "fmt" + "math/rand" + "time" +) + +// +// Rows represent slice of Row. +// +type Rows []*Row + +// +// Len return number of row. +// +func (rows *Rows) Len() int { + return len(*rows) +} + +// +// PushBack append record r to the end of rows. +// +func (rows *Rows) PushBack(r *Row) { + if r != nil { + (*rows) = append((*rows), r) + } +} + +// +// PopFront remove the head, return the record value. +// +func (rows *Rows) PopFront() (row *Row) { + l := len(*rows) + if l > 0 { + row = (*rows)[0] + (*rows) = (*rows)[1:] + } + return +} + +// +// PopFrontAsRows remove the head and return ex-head as new rows. +// +func (rows *Rows) PopFrontAsRows() (newRows Rows) { + row := rows.PopFront() + if nil == row { + return + } + newRows.PushBack(row) + return +} + +// +// Del will detach row at index `i` from slice and return it. 
+// +func (rows *Rows) Del(i int) (row *Row) { + if i < 0 { + return + } + if i >= rows.Len() { + return + } + + row = (*rows)[i] + + last := len(*rows) - 1 + copy((*rows)[i:], (*rows)[i+1:]) + (*rows)[last] = nil + (*rows) = (*rows)[0:last] + + return row +} + +// +// GroupByValue will group each row based on record value in index recGroupIdx +// into map of string -> *Row. +// +// WARNING: returned rows will be empty! +// +// For example, given rows with target group in column index 1, +// +// [1 +] +// [2 -] +// [3 -] +// [4 +] +// +// this function will create a map with key is string of target and value is +// pointer to sub-rows, +// +// + -> [1 +] +// [4 +] +// - -> [2 -] +// [3 -] +// +// +func (rows *Rows) GroupByValue(GroupIdx int) (mapRows MapRows) { + for { + row := rows.PopFront() + if nil == row { + break + } + + key := fmt.Sprint((*row)[GroupIdx]) + + mapRows.AddRow(key, row) + } + return +} + +// +// RandomPick row in rows until n item and return it like its has been shuffled. +// If duplicate is true, row that has been picked can be picked up again, +// otherwise it will only picked up once. +// +// This function return picked and unpicked rows and index of them. +// +func (rows *Rows) RandomPick(n int, duplicate bool) ( + picked Rows, + unpicked Rows, + pickedIdx []int, + unpickedIdx []int, +) { + rowsLen := len(*rows) + + // if duplication is not allowed, we can only select as many as rows + // that we have. + if n > rowsLen && !duplicate { + n = rowsLen + } + + rand.Seed(time.Now().UnixNano()) + + for ; n >= 1; n-- { + idx := 0 + for { + idx = rand.Intn(len(*rows)) + + if duplicate { + // allow duplicate idx + pickedIdx = append(pickedIdx, idx) + break + } + + // check if its already picked + isPicked := false + for _, pastIdx := range pickedIdx { + if idx == pastIdx { + isPicked = true + break + } + } + // get another random idx again + if isPicked { + continue + } + + // bingo, we found unique idx that has not been picked. 
+ pickedIdx = append(pickedIdx, idx) + break + } + + row := (*rows)[idx] + + picked.PushBack(row) + } + + // select unpicked rows using picked index. + for rid := range *rows { + // check if row index has been picked up + isPicked := false + for _, idx := range pickedIdx { + if rid == idx { + isPicked = true + break + } + } + if !isPicked { + unpicked.PushBack((*rows)[rid]) + unpickedIdx = append(unpickedIdx, rid) + } + } + return +} + +// +// Contain return true and index of row, if rows has data that has the same value +// with `row`, otherwise return false and -1 as index. +// +func (rows *Rows) Contain(xrow *Row) (bool, int) { + for x, row := range *rows { + if xrow.IsEqual(row) { + return true, x + } + } + return false, -1 +} + +// +// Contains return true and indices of row, if rows has data that has the same +// value with `rows`, otherwise return false and empty indices. +// +func (rows *Rows) Contains(xrows Rows) (isin bool, indices []int) { + // No data to compare. + if len(xrows) <= 0 { + return + } + + for _, xrow := range xrows { + isin, idx := rows.Contain(xrow) + + if isin { + indices = append(indices, idx) + } + } + + // Check if indices length equal to searched rows + if len(indices) == len(xrows) { + return true, indices + } + + return false, nil +} + +// +// SelectWhere return all rows which column value in `colidx` is equal +// to `colval`. +// +func (rows *Rows) SelectWhere(colidx int, colval string) (selected Rows) { + for _, row := range *rows { + col := (*row)[colidx] + if col.IsEqualToString(colval) { + selected.PushBack(row) + } + } + return +} + +// +// String return the string representation of each row. +// +func (rows Rows) String() (s string) { + for x := range rows { + s += fmt.Sprint(rows[x]) + } + return +} diff --git a/lib/tabula/rows_test.go b/lib/tabula/rows_test.go new file mode 100644 index 00000000..174dd10f --- /dev/null +++ b/lib/tabula/rows_test.go @@ -0,0 +1,181 @@ +// Copyright 2017, Shulhan <ms@kilabit.info>. 
All rights reserved. +// Use of this source code is governed by a BSD-style license that can be found +// in the LICENSE file. + +package tabula + +import ( + "fmt" + "strings" + "testing" + + "github.com/shuLhan/share/lib/test" +) + +func TestPushBack(t *testing.T) { + rows, e := initRows() + if e != nil { + t.Fatal(e) + } + + exp := strings.Join(rowsExpect, "") + got := fmt.Sprint(rows) + + test.Assert(t, "", exp, got, true) +} + +func TestPopFront(t *testing.T) { + rows, e := initRows() + if e != nil { + t.Fatal(e) + } + + l := len(rows) - 1 + for i := range rows { + row := rows.PopFront() + + exp := rowsExpect[i] + got := fmt.Sprint(row) + + test.Assert(t, "", exp, got, true) + + if i < l { + exp = strings.Join(rowsExpect[i+1:], "") + } else { + exp = "" + } + got = fmt.Sprint(rows) + + test.Assert(t, "", exp, got, true) + } + + // empty rows + row := rows.PopFront() + + exp := "<nil>" + got := fmt.Sprint(row) + + test.Assert(t, "", exp, got, true) +} + +func TestPopFrontRow(t *testing.T) { + rows, e := initRows() + if e != nil { + t.Fatal(e) + } + + l := len(rows) - 1 + for i := range rows { + newRows := rows.PopFrontAsRows() + + exp := rowsExpect[i] + got := fmt.Sprint(newRows) + + test.Assert(t, "", exp, got, true) + + if i < l { + exp = strings.Join(rowsExpect[i+1:], "") + } else { + exp = "" + } + got = fmt.Sprint(rows) + + test.Assert(t, "", exp, got, true) + } + + // empty rows + row := rows.PopFrontAsRows() + + exp := "" + got := fmt.Sprint(row) + + test.Assert(t, "", exp, got, true) +} + +func TestGroupByValue(t *testing.T) { + rows, e := initRows() + if e != nil { + t.Fatal(e) + } + + mapRows := rows.GroupByValue(testClassIdx) + + got := fmt.Sprint(mapRows) + + test.Assert(t, "", groupByExpect, got, true) +} + +func TestRandomPick(t *testing.T) { + rows, e := initRows() + if e != nil { + t.Fatal(e) + } + + // random pick with duplicate + for i := 0; i < 5; i++ { + picked, unpicked, pickedIdx, unpickedIdx := rows.RandomPick(6, + true) + + // check if 
unpicked item exist in picked items. + isin, _ := picked.Contains(unpicked) + + if isin { + fmt.Println("Random pick with duplicate rows") + fmt.Println("==> picked rows :", picked) + fmt.Println("==> picked idx :", pickedIdx) + fmt.Println("==> unpicked rows :", unpicked) + fmt.Println("==> unpicked idx :", unpickedIdx) + t.Fatal("random pick: unpicked is false") + } + } + + // random pick without duplication + for i := 0; i < 5; i++ { + picked, unpicked, pickedIdx, unpickedIdx := rows.RandomPick(3, + false) + + // check if picked rows is duplicate + test.Assert(t, "", picked[0], picked[1], false) + + // check if unpicked item exist in picked items. + isin, _ := picked.Contains(unpicked) + + if isin { + fmt.Println("Random pick with no duplicate rows") + fmt.Println("==> picked rows :", picked) + fmt.Println("==> picked idx :", pickedIdx) + fmt.Println("==> unpicked rows :", unpicked) + fmt.Println("==> unpicked idx :", unpickedIdx) + t.Fatal("random pick: unpicked is false") + } + } +} + +func TestRowsDel(t *testing.T) { + rows, e := initRows() + if e != nil { + t.Fatal(e) + } + + // Test deleting row index out of range. + row := rows.Del(-1) + if row != nil { + t.Fatal("row should be nil!") + } + + row = rows.Del(rows.Len()) + if row != nil { + t.Fatal("row should be nil!") + } + + // Test deleting index that is actually exist. + row = rows.Del(0) + + exp := strings.Join(rowsExpect[1:], "") + got := fmt.Sprint(rows) + + test.Assert(t, "", exp, got, true) + + got = fmt.Sprint(row) + test.Assert(t, "", rowsExpect[0], got, true) +} diff --git a/lib/tabula/tabula.go b/lib/tabula/tabula.go new file mode 100644 index 00000000..3d7f57df --- /dev/null +++ b/lib/tabula/tabula.go @@ -0,0 +1,76 @@ +// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be found +// in the LICENSE file. 
+ +// +// Package tabula is a Go library for working with rows, columns, or matrix +// (table), or in another terms working with data set. +// +// Introduction +// +// Go's slice gave a flexible way to manage sequence of data in one type, but +// what if you want to manage a sequence of value but with different type of +// data? Or manage a bunch of values like a table? +// +// You can use this library to manage sequence of value with different type +// and manage data in two dimensional tuple. +// +// Terminology +// +// Here are some terminologies that we used in developing this library, which +// may help reader understand the internal and API. +// +// Record is a single cell in row or column, or the smallest building block of +// dataset. +// +// Row is a horizontal representation of records in dataset. +// +// Column is a vertical representation of records in dataset. +// Each column has a unique name and has the same type data. +// +// Dataset is a collection of rows and columns. +// +// Given those definitions we can draw the representation of rows, columns, or +// matrix: +// +// COL-0 COL-1 ... COL-x +// ROW-0: record record ... record +// ROW-1: record record ... record +// ... +// ROW-y: record record ... record +// +// Record Type +// +// There are only three valid type in record: int64, float64, and string. +// +// Dataset Mode +// +// Tabula has three mode for dataset: rows, columns, or matrix. 
+// +// For example, given a table of data, +// +// col1,col2,col3 +// a,b,c +// 1,2,3 +// +// "rows" mode is where each line saved in its own slice, resulting in Rows: +// +// Rows[0]: [a b c] +// Rows[1]: [1 2 3] +// +// "columns" mode is where each line saved by columns, resulting in Columns: +// +// Columns[0]: {col1 0 0 [] [a 1]} +// Columns[1]: {col2 0 0 [] [b 2]} +// Columns[1]: {col3 0 0 [] [c 3]} +// +// Unlike rows mode, each column contain metadata including column name, type, +// flag, and value space (all possible value that _may_ contain in column +// value). +// +// "matrix" mode is where each record saved both in row and column. +// +// Matrix mode consume more memory but give a flexible way to manage records. +// +// +package tabula diff --git a/lib/tabula/tabula_test.go b/lib/tabula/tabula_test.go new file mode 100644 index 00000000..6b13d60c --- /dev/null +++ b/lib/tabula/tabula_test.go @@ -0,0 +1,81 @@ +// Copyright 2017, Shulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be found +// in the LICENSE file. 
+ +package tabula + +import ( + "os" +) + +var ( + traces = make([]byte, 1024) +) + +func printStackTrace() { + var lines, start, end int + + for x, b := range traces { + if b != '\n' { + continue + } + lines++ + if lines == 3 { + start = x + } else if lines == 5 { + end = x + 1 + break + } + } + + os.Stderr.Write(traces[start:end]) +} + +var testColTypes = []int{ + TInteger, + TInteger, + TInteger, + TString, +} + +var testColNames = []string{"int01", "int02", "int03", "class"} + +// Testing data and function for Rows and MapRows +var rowsData = [][]string{ + {"1", "5", "9", "+"}, + {"2", "6", "0", "-"}, + {"3", "7", "1", "-"}, + {"4", "8", "2", "+"}, +} + +var testClassIdx = 3 + +var rowsExpect = []string{ + "&[1 5 9 +]", + "&[2 6 0 -]", + "&[3 7 1 -]", + "&[4 8 2 +]", +} + +var groupByExpect = "[{+ &[1 5 9 +]&[4 8 2 +]} {- &[2 6 0 -]&[3 7 1 -]}]" + +func initRows() (rows Rows, e error) { + for i := range rowsData { + l := len(rowsData[i]) + row := make(Row, 0) + + for j := 0; j < l; j++ { + rec, e := NewRecordBy(rowsData[i][j], + testColTypes[j]) + + if nil != e { + return nil, e + } + + row = append(row, rec) + } + + rows.PushBack(&row) + } + return rows, nil +} |
